From 6e135ef2b2bb3169f2a5c0e2eba9341416f2c321 Mon Sep 17 00:00:00 2001 From: jbeemster Date: Mon, 21 Nov 2022 14:43:28 +1100 Subject: [PATCH] Add ability to load into BigQuery from GCP Quickstart Examples (closes #43) --- terraform/gcp/pipeline/default/README.md | 32 ++++-- .../default/bigquery.terraform.tfvars | 52 +++++++++ terraform/gcp/pipeline/default/main.tf | 101 ++++++++++++++--- terraform/gcp/pipeline/default/outputs.tf | 23 +++- ...aform.tfvars => postgres.terraform.tfvars} | 12 +- terraform/gcp/pipeline/default/variables.tf | 22 +++- terraform/gcp/pipeline/secure/README.md | 32 ++++-- .../pipeline/secure/bigquery.terraform.tfvars | 51 +++++++++ terraform/gcp/pipeline/secure/main.tf | 103 ++++++++++++++---- terraform/gcp/pipeline/secure/outputs.tf | 23 +++- ...aform.tfvars => postgres.terraform.tfvars} | 12 +- terraform/gcp/pipeline/secure/variables.tf | 22 +++- 12 files changed, 400 insertions(+), 85 deletions(-) create mode 100644 terraform/gcp/pipeline/default/bigquery.terraform.tfvars rename terraform/gcp/pipeline/default/{terraform.tfvars => postgres.terraform.tfvars} (92%) create mode 100644 terraform/gcp/pipeline/secure/bigquery.terraform.tfvars rename terraform/gcp/pipeline/secure/{terraform.tfvars => postgres.terraform.tfvars} (92%) diff --git a/terraform/gcp/pipeline/default/README.md b/terraform/gcp/pipeline/default/README.md index 976e600..d5f71a7 100644 --- a/terraform/gcp/pipeline/default/README.md +++ b/terraform/gcp/pipeline/default/README.md @@ -8,25 +8,32 @@ ## Providers -No providers. +| Name | Version | +|------|---------| +| [google](#provider\_google) | ~> 3.90.1 | ## Modules | Name | Source | Version | |------|--------|---------| | [bad\_1\_topic](#module\_bad\_1\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | +| [bad\_rows\_topic](#module\_bad\_rows\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | +| [bigquery\_loader](#module\_bigquery\_loader) | snowplow-devops/bigquery-loader-pubsub-ce/google | 0.1.0 | | [collector\_lb](#module\_collector\_lb) | snowplow-devops/lb/google | 0.1.0 | | [collector\_pubsub](#module\_collector\_pubsub) | snowplow-devops/collector-pubsub-ce/google | 0.2.2 | | [enrich\_pubsub](#module\_enrich\_pubsub) | snowplow-devops/enrich-pubsub-ce/google | 0.1.2 | | [enriched\_topic](#module\_enriched\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | -| [pipeline\_db](#module\_pipeline\_db) | snowplow-devops/cloud-sql/google | 0.1.1 | +| [postgres\_db](#module\_postgres\_db) | snowplow-devops/cloud-sql/google | 0.1.1 | | [postgres\_loader\_bad](#module\_postgres\_loader\_bad) | snowplow-devops/postgres-loader-pubsub-ce/google | 0.2.1 | | [postgres\_loader\_enriched](#module\_postgres\_loader\_enriched) | snowplow-devops/postgres-loader-pubsub-ce/google | 0.2.1 | | [raw\_topic](#module\_raw\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | ## Resources -No resources. +| Name | Type | +|------|------| +| [google_bigquery_dataset.bigquery_db](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/bigquery_dataset) | resource | +| [google_storage_bucket.bq_loader_dead_letter_bucket](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket) | resource | ## Inputs @@ -35,17 +42,19 @@ No resources. 
| [iglu\_server\_dns\_name](#input\_iglu\_server\_dns\_name) | The DNS name of your Iglu Server | `string` | n/a | yes | | [iglu\_super\_api\_key](#input\_iglu\_super\_api\_key) | A UUIDv4 string to use as the master API key for Iglu Server management | `string` | n/a | yes | | [network](#input\_network) | The name of the network to deploy within | `string` | n/a | yes | -| [pipeline\_db\_name](#input\_pipeline\_db\_name) | The name of the database to connect to | `string` | n/a | yes | -| [pipeline\_db\_password](#input\_pipeline\_db\_password) | The password to use to connect to the database | `string` | n/a | yes | -| [pipeline\_db\_username](#input\_pipeline\_db\_username) | The username to use to connect to the database | `string` | n/a | yes | +| [postgres\_db\_name](#input\_postgres\_db\_name) | The name of the database to connect to | `string` | n/a | yes | +| [postgres\_db\_password](#input\_postgres\_db\_password) | The password to use to connect to the database | `string` | n/a | yes | +| [postgres\_db\_username](#input\_postgres\_db\_username) | The username to use to connect to the database | `string` | n/a | yes | | [prefix](#input\_prefix) | Will be prefixed to all resource names. Use to easily identify the resources created | `string` | n/a | yes | | [project\_id](#input\_project\_id) | The project ID in which the stack is being deployed | `string` | n/a | yes | | [region](#input\_region) | The name of the region to deploy within | `string` | n/a | yes | | [ssh\_ip\_allowlist](#input\_ssh\_ip\_allowlist) | The list of CIDR ranges to allow SSH traffic from | `list(any)` | n/a | yes | | [subnetwork](#input\_subnetwork) | The name of the sub-network to deploy within | `string` | n/a | yes | +| [bigquery\_db\_enabled](#input\_bigquery\_db\_enabled) | Whether to enable loading into a BigQuery Dataset | `bool` | `false` | no | | [labels](#input\_labels) | The labels to append to the resources in this module | `map(string)` | `{}` | no | -| [pipeline\_db\_authorized\_networks](#input\_pipeline\_db\_authorized\_networks) | The list of CIDR ranges to allow access to the Pipeline Database over |
list(object({
    name = string
    value = string
  }))
 | `[]` | no |
-| [pipeline\_db\_tier](#input\_pipeline\_db\_tier) | The instance type to assign to the deployed Cloud SQL instance | `string` | `"db-g1-small"` | no |
+| [postgres\_db\_authorized\_networks](#input\_postgres\_db\_authorized\_networks) | The list of CIDR ranges to allow access to the Pipeline Database over | list(object({
    name = string
    value = string
  }))
 | `[]` | no |
+| [postgres\_db\_enabled](#input\_postgres\_db\_enabled) | Whether to enable loading into a Postgres Database | `bool` | `false` | no |
+| [postgres\_db\_tier](#input\_postgres\_db\_tier) | The instance type to assign to the deployed Cloud SQL instance | `string` | `"db-g1-small"` | no |
| [ssh\_key\_pairs](#input\_ssh\_key\_pairs) | The list of SSH key-pairs to add to the servers | list(object({
    user_name = string
    public_key = string
  }))
 | `[]` | no |
| [ssl\_information](#input\_ssl\_information) | The ID of a Google Managed certificate to bind to the load balancer |
object({
    enabled = bool
    certificate_id = string
  })
 | {
  "certificate_id": "",
  "enabled": false
}
| no | | [telemetry\_enabled](#input\_telemetry\_enabled) | Whether or not to send telemetry information back to Snowplow Analytics Ltd | `bool` | `true` | no | @@ -55,6 +64,9 @@ No resources. | Name | Description | |------|-------------| +| [bigquery\_db\_dataset\_id](#output\_bigquery\_db\_dataset\_id) | The ID of the BigQuery dataset where your data is being streamed | +| [bq\_loader\_bad\_rows\_topic\_name](#output\_bq\_loader\_bad\_rows\_topic\_name) | The name of the topic for bad rows emitted from the BigQuery loader | +| [bq\_loader\_dead\_letter\_bucket\_name](#output\_bq\_loader\_dead\_letter\_bucket\_name) | The name of the GCS bucket for dead letter events emitted from the BigQuery loader | | [collector\_ip\_address](#output\_collector\_ip\_address) | The IP address for the Pipeline Collector | -| [db\_ip\_address](#output\_db\_ip\_address) | The IP address of the database where your data is being streamed | -| [db\_port](#output\_db\_port) | The port of the database where your data is being streamed | \ No newline at end of file +| [postgres\_db\_ip\_address](#output\_postgres\_db\_ip\_address) | The IP address of the database where your data is being streamed | +| [postgres\_db\_port](#output\_postgres\_db\_port) | The port of the database where your data is being streamed | \ No newline at end of file diff --git a/terraform/gcp/pipeline/default/bigquery.terraform.tfvars b/terraform/gcp/pipeline/default/bigquery.terraform.tfvars new file mode 100644 index 0000000..7543883 --- /dev/null +++ b/terraform/gcp/pipeline/default/bigquery.terraform.tfvars @@ -0,0 +1,52 @@ +# Will be prefixed to all resource names +# Use this to easily identify the resources created and provide entropy for subsequent environments +prefix = "sp" + +# The project to deploy the infrastructure into +project_id = "PROJECT_ID_TO_DEPLOY_INTO" + +# Where to deploy the infrastructure +region = "REGION_TO_DEPLOY_INTO" + +# --- Default Network +# Update to the network you would like to deploy into +# +# Note: If you opt to use your own network then you will need to define a subnetwork to deploy into as well +network = "default" +subnetwork = "" + +# --- SSH +# Update this to your IP Address +ssh_ip_allowlist = ["999.999.999.999/32"] +# Generate a new SSH key locally with `ssh-keygen` +# ssh-keygen -t rsa -b 4096 +ssh_key_pairs = [ + { + user_name = "snowplow" + public_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQA0jSi9//bRsHW4M6czodTs6smCXsxZ0gijzth0aBmycE= snowplow@Snowplows-MacBook-Pro.local" + } +] + +# --- Iglu Server Configuration +# Iglu Server DNS output from the Iglu Server stack +iglu_server_dns_name = "http://CHANGE-TO-MY-IGLU-IP" +# Used for API actions on the Iglu Server +# Change this to the same UUID from when you created the Iglu Server +iglu_super_api_key = "00000000-0000-0000-0000-000000000000" + +# --- Snowplow BigQuery Loader +bigquery_db_enabled = true + +# See for more information: https://registry.terraform.io/modules/snowplow-devops/collector-pubsub-ce/google/latest#telemetry +# Telemetry principles: https://docs.snowplowanalytics.com/docs/open-source-quick-start/what-is-the-quick-start-for-open-source/telemetry-principles/ +user_provided_id = "" +telemetry_enabled = true + +# --- SSL Configuration (optional) +ssl_information = { + certificate_id = "" + enabled = false +} + +# --- Extra Labels to append to created resources (optional) +labels = {} diff --git a/terraform/gcp/pipeline/default/main.tf b/terraform/gcp/pipeline/default/main.tf index 33d9942..f1ae83b 100644 --- 
a/terraform/gcp/pipeline/default/main.tf +++ b/terraform/gcp/pipeline/default/main.tf @@ -109,20 +109,22 @@ module "enrich_pubsub" { } # 4. Deploy Postgres Loader -module "pipeline_db" { +module "postgres_db" { source = "snowplow-devops/cloud-sql/google" version = "0.1.1" - name = "${var.prefix}-pipeline-db" + count = var.postgres_db_enabled ? 1 : 0 + + name = "${var.prefix}-postgres-db" region = var.region - db_name = var.pipeline_db_name - db_username = var.pipeline_db_username - db_password = var.pipeline_db_password + db_name = var.postgres_db_name + db_username = var.postgres_db_username + db_password = var.postgres_db_password - authorized_networks = var.pipeline_db_authorized_networks + authorized_networks = var.postgres_db_authorized_networks - tier = var.pipeline_db_tier + tier = var.postgres_db_tier labels = var.labels } @@ -131,6 +133,8 @@ module "postgres_loader_enriched" { source = "snowplow-devops/postgres-loader-pubsub-ce/google" version = "0.2.1" + count = var.postgres_db_enabled ? 1 : 0 + name = "${var.prefix}-pg-loader-enriched-server" network = var.network @@ -145,11 +149,11 @@ module "postgres_loader_enriched" { purpose = "ENRICHED_EVENTS" schema_name = "atomic" - db_instance_name = module.pipeline_db.connection_name - db_port = module.pipeline_db.port - db_name = var.pipeline_db_name - db_username = var.pipeline_db_username - db_password = var.pipeline_db_password + db_instance_name = join("", module.postgres_db.*.connection_name) + db_port = join("", module.postgres_db.*.port) + db_name = var.postgres_db_name + db_username = var.postgres_db_username + db_password = var.postgres_db_password # Linking in the custom Iglu Server here custom_iglu_resolvers = local.custom_iglu_resolvers @@ -164,6 +168,8 @@ module "postgres_loader_bad" { source = "snowplow-devops/postgres-loader-pubsub-ce/google" version = "0.2.1" + count = var.postgres_db_enabled ? 1 : 0 + name = "${var.prefix}-pg-loader-bad-server" network = var.network @@ -178,11 +184,72 @@ module "postgres_loader_bad" { purpose = "JSON" schema_name = "atomic_bad" - db_instance_name = module.pipeline_db.connection_name - db_port = module.pipeline_db.port - db_name = var.pipeline_db_name - db_username = var.pipeline_db_username - db_password = var.pipeline_db_password + db_instance_name = join("", module.postgres_db.*.connection_name) + db_port = join("", module.postgres_db.*.port) + db_name = var.postgres_db_name + db_username = var.postgres_db_username + db_password = var.postgres_db_password + + # Linking in the custom Iglu Server here + custom_iglu_resolvers = local.custom_iglu_resolvers + + telemetry_enabled = var.telemetry_enabled + user_provided_id = var.user_provided_id + + labels = var.labels +} + +# 5. Deploy BigQuery Loader +module "bad_rows_topic" { + source = "snowplow-devops/pubsub-topic/google" + version = "0.1.0" + + count = var.bigquery_db_enabled ? 1 : 0 + + name = "${var.prefix}-bq-bad-rows-topic" + + labels = var.labels +} + +resource "google_bigquery_dataset" "bigquery_db" { + count = var.bigquery_db_enabled ? 1 : 0 + + dataset_id = replace("${var.prefix}_pipeline_db", "-", "_") + location = var.region + + labels = var.labels +} + +resource "google_storage_bucket" "bq_loader_dead_letter_bucket" { + count = var.bigquery_db_enabled ? 
1 : 0 + + name = "${var.prefix}-bq-loader-dead-letter" + location = var.region + force_destroy = true + + labels = var.labels +} + +module "bigquery_loader" { + source = "snowplow-devops/bigquery-loader-pubsub-ce/google" + version = "0.1.0" + + count = var.bigquery_db_enabled ? 1 : 0 + + name = "${var.prefix}-bq-loader-server" + + network = var.network + subnetwork = var.subnetwork + region = var.region + project_id = var.project_id + + ssh_ip_allowlist = var.ssh_ip_allowlist + ssh_key_pairs = var.ssh_key_pairs + + input_topic_name = module.enriched_topic.name + bad_rows_topic_name = join("", module.bad_rows_topic.*.name) + gcs_dead_letter_bucket_name = join("", google_storage_bucket.bq_loader_dead_letter_bucket.*.name) + bigquery_dataset_id = join("", google_bigquery_dataset.bigquery_db.*.dataset_id) # Linking in the custom Iglu Server here custom_iglu_resolvers = local.custom_iglu_resolvers diff --git a/terraform/gcp/pipeline/default/outputs.tf b/terraform/gcp/pipeline/default/outputs.tf index f5c05e7..034f7a9 100644 --- a/terraform/gcp/pipeline/default/outputs.tf +++ b/terraform/gcp/pipeline/default/outputs.tf @@ -3,12 +3,27 @@ output "collector_ip_address" { value = module.collector_lb.ip_address } -output "db_ip_address" { +output "postgres_db_ip_address" { description = "The IP address of the database where your data is being streamed" - value = module.pipeline_db.first_ip_address + value = join("", module.postgres_db.*.first_ip_address) } -output "db_port" { +output "postgres_db_port" { description = "The port of the database where your data is being streamed" - value = module.pipeline_db.port + value = join("", module.postgres_db.*.port) +} + +output "bigquery_db_dataset_id" { + description = "The ID of the BigQuery dataset where your data is being streamed" + value = join("", google_bigquery_dataset.bigquery_db.*.dataset_id) +} + +output "bq_loader_dead_letter_bucket_name" { + description = "The name of the GCS bucket for dead letter events emitted from the BigQuery loader" + value = join("", google_storage_bucket.bq_loader_dead_letter_bucket.*.name) +} + +output "bq_loader_bad_rows_topic_name" { + description = "The name of the topic for bad rows emitted from the BigQuery loader" + value = join("", module.bad_rows_topic.*.name) } diff --git a/terraform/gcp/pipeline/default/terraform.tfvars b/terraform/gcp/pipeline/default/postgres.terraform.tfvars similarity index 92% rename from terraform/gcp/pipeline/default/terraform.tfvars rename to terraform/gcp/pipeline/default/postgres.terraform.tfvars index 9bb9a3c..7ca15d7 100644 --- a/terraform/gcp/pipeline/default/terraform.tfvars +++ b/terraform/gcp/pipeline/default/postgres.terraform.tfvars @@ -35,17 +35,19 @@ iglu_server_dns_name = "http://CHANGE-TO-MY-IGLU-IP" iglu_super_api_key = "00000000-0000-0000-0000-000000000000" # --- Snowplow Postgres Loader -pipeline_db_name = "snowplow" -pipeline_db_username = "snowplow" +postgres_db_enabled = true + +postgres_db_name = "snowplow" +postgres_db_username = "snowplow" # Change and keep this secret! -pipeline_db_password = "Hell0W0rld!2" +postgres_db_password = "Hell0W0rld!2" # IP ranges that you want to query the Pipeline Postgres Cloud SQL instance from directly over the internet. 
An alternative access method is to leverage # the Cloud SQL Proxy service which creates an IAM authenticated tunnel to the instance # # Details: https://cloud.google.com/sql/docs/postgres/sql-proxy # # Note: this exposes your data to the internet - take care to ensure your allowlist is strict enough -pipeline_db_authorized_networks = [ +postgres_db_authorized_networks = [ { name = "foo" value = "999.999.999.999/32" @@ -57,7 +59,7 @@ pipeline_db_authorized_networks = [ ] # Note: the size of the database instance determines the number of concurrent connections - each Postgres Loader instance creates 10 open connections so having # a sufficiently powerful database tier is important to not running out of connection slots -pipeline_db_tier = "db-g1-small" +postgres_db_tier = "db-g1-small" # See for more information: https://registry.terraform.io/modules/snowplow-devops/collector-pubsub-ce/google/latest#telemetry # Telemetry principles: https://docs.snowplowanalytics.com/docs/open-source-quick-start/what-is-the-quick-start-for-open-source/telemetry-principles/ diff --git a/terraform/gcp/pipeline/default/variables.tf b/terraform/gcp/pipeline/default/variables.tf index 7dbac3c..794a92f 100644 --- a/terraform/gcp/pipeline/default/variables.tf +++ b/terraform/gcp/pipeline/default/variables.tf @@ -48,23 +48,29 @@ variable "iglu_super_api_key" { sensitive = true } -variable "pipeline_db_name" { +variable "postgres_db_enabled" { + description = "Whether to enable loading into a Postgres Database" + default = false + type = bool +} + +variable "postgres_db_name" { description = "The name of the database to connect to" type = string } -variable "pipeline_db_username" { +variable "postgres_db_username" { description = "The username to use to connect to the database" type = string } -variable "pipeline_db_password" { +variable "postgres_db_password" { description = "The password to use to connect to the database" type = string sensitive = true } -variable "pipeline_db_authorized_networks" { +variable "postgres_db_authorized_networks" { description = "The list of CIDR ranges to allow access to the Pipeline Database over" default = [] type = list(object({ @@ -73,12 +79,18 @@ variable "pipeline_db_authorized_networks" { })) } -variable "pipeline_db_tier" { +variable "postgres_db_tier" { description = "The instance type to assign to the deployed Cloud SQL instance" type = string default = "db-g1-small" } +variable "bigquery_db_enabled" { + description = "Whether to enable loading into a BigQuery Dataset" + default = false + type = bool +} + variable "telemetry_enabled" { description = "Whether or not to send telemetry information back to Snowplow Analytics Ltd" type = bool diff --git a/terraform/gcp/pipeline/secure/README.md b/terraform/gcp/pipeline/secure/README.md index 976e600..d5f71a7 100644 --- a/terraform/gcp/pipeline/secure/README.md +++ b/terraform/gcp/pipeline/secure/README.md @@ -8,25 +8,32 @@ ## Providers -No providers. 
+| Name | Version | +|------|---------| +| [google](#provider\_google) | ~> 3.90.1 | ## Modules | Name | Source | Version | |------|--------|---------| | [bad\_1\_topic](#module\_bad\_1\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | +| [bad\_rows\_topic](#module\_bad\_rows\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | +| [bigquery\_loader](#module\_bigquery\_loader) | snowplow-devops/bigquery-loader-pubsub-ce/google | 0.1.0 | | [collector\_lb](#module\_collector\_lb) | snowplow-devops/lb/google | 0.1.0 | | [collector\_pubsub](#module\_collector\_pubsub) | snowplow-devops/collector-pubsub-ce/google | 0.2.2 | | [enrich\_pubsub](#module\_enrich\_pubsub) | snowplow-devops/enrich-pubsub-ce/google | 0.1.2 | | [enriched\_topic](#module\_enriched\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | -| [pipeline\_db](#module\_pipeline\_db) | snowplow-devops/cloud-sql/google | 0.1.1 | +| [postgres\_db](#module\_postgres\_db) | snowplow-devops/cloud-sql/google | 0.1.1 | | [postgres\_loader\_bad](#module\_postgres\_loader\_bad) | snowplow-devops/postgres-loader-pubsub-ce/google | 0.2.1 | | [postgres\_loader\_enriched](#module\_postgres\_loader\_enriched) | snowplow-devops/postgres-loader-pubsub-ce/google | 0.2.1 | | [raw\_topic](#module\_raw\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | ## Resources -No resources. +| Name | Type | +|------|------| +| [google_bigquery_dataset.bigquery_db](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/bigquery_dataset) | resource | +| [google_storage_bucket.bq_loader_dead_letter_bucket](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket) | resource | ## Inputs @@ -35,17 +42,19 @@ No resources. | [iglu\_server\_dns\_name](#input\_iglu\_server\_dns\_name) | The DNS name of your Iglu Server | `string` | n/a | yes | | [iglu\_super\_api\_key](#input\_iglu\_super\_api\_key) | A UUIDv4 string to use as the master API key for Iglu Server management | `string` | n/a | yes | | [network](#input\_network) | The name of the network to deploy within | `string` | n/a | yes | -| [pipeline\_db\_name](#input\_pipeline\_db\_name) | The name of the database to connect to | `string` | n/a | yes | -| [pipeline\_db\_password](#input\_pipeline\_db\_password) | The password to use to connect to the database | `string` | n/a | yes | -| [pipeline\_db\_username](#input\_pipeline\_db\_username) | The username to use to connect to the database | `string` | n/a | yes | +| [postgres\_db\_name](#input\_postgres\_db\_name) | The name of the database to connect to | `string` | n/a | yes | +| [postgres\_db\_password](#input\_postgres\_db\_password) | The password to use to connect to the database | `string` | n/a | yes | +| [postgres\_db\_username](#input\_postgres\_db\_username) | The username to use to connect to the database | `string` | n/a | yes | | [prefix](#input\_prefix) | Will be prefixed to all resource names. 
Use to easily identify the resources created | `string` | n/a | yes | | [project\_id](#input\_project\_id) | The project ID in which the stack is being deployed | `string` | n/a | yes | | [region](#input\_region) | The name of the region to deploy within | `string` | n/a | yes | | [ssh\_ip\_allowlist](#input\_ssh\_ip\_allowlist) | The list of CIDR ranges to allow SSH traffic from | `list(any)` | n/a | yes | | [subnetwork](#input\_subnetwork) | The name of the sub-network to deploy within | `string` | n/a | yes | +| [bigquery\_db\_enabled](#input\_bigquery\_db\_enabled) | Whether to enable loading into a BigQuery Dataset | `bool` | `false` | no | | [labels](#input\_labels) | The labels to append to the resources in this module | `map(string)` | `{}` | no | -| [pipeline\_db\_authorized\_networks](#input\_pipeline\_db\_authorized\_networks) | The list of CIDR ranges to allow access to the Pipeline Database over |
list(object({
    name = string
    value = string
  }))
 | `[]` | no |
-| [pipeline\_db\_tier](#input\_pipeline\_db\_tier) | The instance type to assign to the deployed Cloud SQL instance | `string` | `"db-g1-small"` | no |
+| [postgres\_db\_authorized\_networks](#input\_postgres\_db\_authorized\_networks) | The list of CIDR ranges to allow access to the Pipeline Database over | list(object({
    name = string
    value = string
  }))
 | `[]` | no |
+| [postgres\_db\_enabled](#input\_postgres\_db\_enabled) | Whether to enable loading into a Postgres Database | `bool` | `false` | no |
+| [postgres\_db\_tier](#input\_postgres\_db\_tier) | The instance type to assign to the deployed Cloud SQL instance | `string` | `"db-g1-small"` | no |
| [ssh\_key\_pairs](#input\_ssh\_key\_pairs) | The list of SSH key-pairs to add to the servers | list(object({
    user_name = string
    public_key = string
  }))
 | `[]` | no |
| [ssl\_information](#input\_ssl\_information) | The ID of a Google Managed certificate to bind to the load balancer |
object({
    enabled = bool
    certificate_id = string
  })
 | {
  "certificate_id": "",
  "enabled": false
}
| no | | [telemetry\_enabled](#input\_telemetry\_enabled) | Whether or not to send telemetry information back to Snowplow Analytics Ltd | `bool` | `true` | no | @@ -55,6 +64,9 @@ No resources. | Name | Description | |------|-------------| +| [bigquery\_db\_dataset\_id](#output\_bigquery\_db\_dataset\_id) | The ID of the BigQuery dataset where your data is being streamed | +| [bq\_loader\_bad\_rows\_topic\_name](#output\_bq\_loader\_bad\_rows\_topic\_name) | The name of the topic for bad rows emitted from the BigQuery loader | +| [bq\_loader\_dead\_letter\_bucket\_name](#output\_bq\_loader\_dead\_letter\_bucket\_name) | The name of the GCS bucket for dead letter events emitted from the BigQuery loader | | [collector\_ip\_address](#output\_collector\_ip\_address) | The IP address for the Pipeline Collector | -| [db\_ip\_address](#output\_db\_ip\_address) | The IP address of the database where your data is being streamed | -| [db\_port](#output\_db\_port) | The port of the database where your data is being streamed | \ No newline at end of file +| [postgres\_db\_ip\_address](#output\_postgres\_db\_ip\_address) | The IP address of the database where your data is being streamed | +| [postgres\_db\_port](#output\_postgres\_db\_port) | The port of the database where your data is being streamed | \ No newline at end of file diff --git a/terraform/gcp/pipeline/secure/bigquery.terraform.tfvars b/terraform/gcp/pipeline/secure/bigquery.terraform.tfvars new file mode 100644 index 0000000..71f1daf --- /dev/null +++ b/terraform/gcp/pipeline/secure/bigquery.terraform.tfvars @@ -0,0 +1,51 @@ +# Will be prefixed to all resource names +# Use this to easily identify the resources created and provide entropy for subsequent environments +prefix = "sp" + +# The project to deploy the infrastructure into +project_id = "PROJECT_ID_TO_DEPLOY_INTO" + +# Where to deploy the infrastructure +region = "REGION_TO_DEPLOY_INTO" + +# --- Network +# NOTE: The network & sub-network configured must be configured with a Cloud NAT to allow the deployed Compute Engine instances to +# connect to the internet to download the required assets +network = "YOUR_NETWORK_HERE" +subnetwork = "YOUR_SUB_NETWORK_HERE" + +# --- SSH +# Update this to the internal IP of your Bastion Host +ssh_ip_allowlist = ["999.999.999.999/32"] +# Generate a new SSH key locally with `ssh-keygen` +# ssh-keygen -t rsa -b 4096 +ssh_key_pairs = [ + { + user_name = "snowplow" + public_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQA0jSi9//bRsHW4M6czodTs6smCXsxZ0gijzth0aBmycE= snowplow@Snowplows-MacBook-Pro.local" + } +] + +# --- Iglu Server Configuration +# Iglu Server DNS output from the Iglu Server stack +iglu_server_dns_name = "http://CHANGE-TO-MY-IGLU-IP" +# Used for API actions on the Iglu Server +# Change this to the same UUID from when you created the Iglu Server +iglu_super_api_key = "00000000-0000-0000-0000-000000000000" + +# --- Snowplow BigQuery Loader +bigquery_db_enabled = true + +# See for more information: https://registry.terraform.io/modules/snowplow-devops/collector-pubsub-ce/google/latest#telemetry +# Telemetry principles: https://docs.snowplowanalytics.com/docs/open-source-quick-start/what-is-the-quick-start-for-open-source/telemetry-principles/ +user_provided_id = "" +telemetry_enabled = true + +# --- SSL Configuration (optional) +ssl_information = { + certificate_id = "" + enabled = false +} + +# --- Extra Labels to append to created resources (optional) +labels = {} diff --git a/terraform/gcp/pipeline/secure/main.tf 
b/terraform/gcp/pipeline/secure/main.tf index a7a4b79..126608b 100644 --- a/terraform/gcp/pipeline/secure/main.tf +++ b/terraform/gcp/pipeline/secure/main.tf @@ -113,20 +113,22 @@ module "enrich_pubsub" { } # 4. Deploy Postgres Loader -module "pipeline_db" { +module "postgres_db" { source = "snowplow-devops/cloud-sql/google" version = "0.1.1" - name = "${var.prefix}-pipeline-db" + count = var.postgres_db_enabled ? 1 : 0 + + name = "${var.prefix}-postgres-db" region = var.region - db_name = var.pipeline_db_name - db_username = var.pipeline_db_username - db_password = var.pipeline_db_password + db_name = var.postgres_db_name + db_username = var.postgres_db_username + db_password = var.postgres_db_password - authorized_networks = var.pipeline_db_authorized_networks + authorized_networks = var.postgres_db_authorized_networks - tier = var.pipeline_db_tier + tier = var.postgres_db_tier labels = var.labels } @@ -135,6 +137,8 @@ module "postgres_loader_enriched" { source = "snowplow-devops/postgres-loader-pubsub-ce/google" version = "0.2.1" + count = var.postgres_db_enabled ? 1 : 0 + name = "${var.prefix}-pg-loader-enriched-server" network = var.network @@ -149,11 +153,11 @@ module "postgres_loader_enriched" { purpose = "ENRICHED_EVENTS" schema_name = "atomic" - db_instance_name = module.pipeline_db.connection_name - db_port = module.pipeline_db.port - db_name = var.pipeline_db_name - db_username = var.pipeline_db_username - db_password = var.pipeline_db_password + db_instance_name = join("", module.postgres_db.*.connection_name) + db_port = join("", module.postgres_db.*.port) + db_name = var.postgres_db_name + db_username = var.postgres_db_username + db_password = var.postgres_db_password # Linking in the custom Iglu Server here custom_iglu_resolvers = local.custom_iglu_resolvers @@ -161,8 +165,6 @@ module "postgres_loader_enriched" { telemetry_enabled = var.telemetry_enabled user_provided_id = var.user_provided_id - associate_public_ip_address = false - labels = var.labels } @@ -170,6 +172,8 @@ module "postgres_loader_bad" { source = "snowplow-devops/postgres-loader-pubsub-ce/google" version = "0.2.1" + count = var.postgres_db_enabled ? 1 : 0 + name = "${var.prefix}-pg-loader-bad-server" network = var.network @@ -184,11 +188,11 @@ module "postgres_loader_bad" { purpose = "JSON" schema_name = "atomic_bad" - db_instance_name = module.pipeline_db.connection_name - db_port = module.pipeline_db.port - db_name = var.pipeline_db_name - db_username = var.pipeline_db_username - db_password = var.pipeline_db_password + db_instance_name = join("", module.postgres_db.*.connection_name) + db_port = join("", module.postgres_db.*.port) + db_name = var.postgres_db_name + db_username = var.postgres_db_username + db_password = var.postgres_db_password # Linking in the custom Iglu Server here custom_iglu_resolvers = local.custom_iglu_resolvers @@ -196,7 +200,66 @@ module "postgres_loader_bad" { telemetry_enabled = var.telemetry_enabled user_provided_id = var.user_provided_id - associate_public_ip_address = false + labels = var.labels +} + +# 5. Deploy BigQuery Loader +module "bad_rows_topic" { + source = "snowplow-devops/pubsub-topic/google" + version = "0.1.0" + + count = var.bigquery_db_enabled ? 1 : 0 + + name = "${var.prefix}-bq-bad-rows-topic" + + labels = var.labels +} + +resource "google_bigquery_dataset" "bigquery_db" { + count = var.bigquery_db_enabled ? 
1 : 0 + + dataset_id = replace("${var.prefix}_snowplow_db", "-", "_") + location = var.region + + labels = var.labels +} + +resource "google_storage_bucket" "bq_loader_dead_letter_bucket" { + count = var.bigquery_db_enabled ? 1 : 0 + + name = "${var.prefix}-bq-loader-dead-letter" + location = var.region + force_destroy = true + + labels = var.labels +} + +module "bigquery_loader" { + source = "snowplow-devops/bigquery-loader-pubsub-ce/google" + version = "0.1.0" + + count = var.bigquery_db_enabled ? 1 : 0 + + name = "${var.prefix}-bq-loader-server" + + network = var.network + subnetwork = var.subnetwork + region = var.region + project_id = var.project_id + + ssh_ip_allowlist = var.ssh_ip_allowlist + ssh_key_pairs = var.ssh_key_pairs + + input_topic_name = module.enriched_topic.name + bad_rows_topic_name = join("", module.bad_rows_topic.*.name) + gcs_dead_letter_bucket_name = join("", google_storage_bucket.bq_loader_dead_letter_bucket.*.name) + bigquery_dataset_id = join("", google_bigquery_dataset.bigquery_db.*.dataset_id) + + # Linking in the custom Iglu Server here + custom_iglu_resolvers = local.custom_iglu_resolvers + + telemetry_enabled = var.telemetry_enabled + user_provided_id = var.user_provided_id labels = var.labels } diff --git a/terraform/gcp/pipeline/secure/outputs.tf b/terraform/gcp/pipeline/secure/outputs.tf index f5c05e7..034f7a9 100644 --- a/terraform/gcp/pipeline/secure/outputs.tf +++ b/terraform/gcp/pipeline/secure/outputs.tf @@ -3,12 +3,27 @@ output "collector_ip_address" { value = module.collector_lb.ip_address } -output "db_ip_address" { +output "postgres_db_ip_address" { description = "The IP address of the database where your data is being streamed" - value = module.pipeline_db.first_ip_address + value = join("", module.postgres_db.*.first_ip_address) } -output "db_port" { +output "postgres_db_port" { description = "The port of the database where your data is being streamed" - value = module.pipeline_db.port + value = join("", module.postgres_db.*.port) +} + +output "bigquery_db_dataset_id" { + description = "The ID of the BigQuery dataset where your data is being streamed" + value = join("", google_bigquery_dataset.bigquery_db.*.dataset_id) +} + +output "bq_loader_dead_letter_bucket_name" { + description = "The name of the GCS bucket for dead letter events emitted from the BigQuery loader" + value = join("", google_storage_bucket.bq_loader_dead_letter_bucket.*.name) +} + +output "bq_loader_bad_rows_topic_name" { + description = "The name of the topic for bad rows emitted from the BigQuery loader" + value = join("", module.bad_rows_topic.*.name) } diff --git a/terraform/gcp/pipeline/secure/terraform.tfvars b/terraform/gcp/pipeline/secure/postgres.terraform.tfvars similarity index 92% rename from terraform/gcp/pipeline/secure/terraform.tfvars rename to terraform/gcp/pipeline/secure/postgres.terraform.tfvars index e26fc5c..fc9a4a8 100644 --- a/terraform/gcp/pipeline/secure/terraform.tfvars +++ b/terraform/gcp/pipeline/secure/postgres.terraform.tfvars @@ -34,17 +34,19 @@ iglu_server_dns_name = "http://CHANGE-TO-MY-IGLU-IP" iglu_super_api_key = "00000000-0000-0000-0000-000000000000" # --- Snowplow Postgres Loader -pipeline_db_name = "snowplow" -pipeline_db_username = "snowplow" +postgres_db_enabled = true + +postgres_db_name = "snowplow" +postgres_db_username = "snowplow" # Change and keep this secret! 
-pipeline_db_password = "Hell0W0rld!2" +postgres_db_password = "Hell0W0rld!2" # IP ranges that you want to query the Pipeline Postgres Cloud SQL instance from directly over the internet. An alternative access method is to leverage # the Cloud SQL Proxy service which creates an IAM authenticated tunnel to the instance # # Details: https://cloud.google.com/sql/docs/postgres/sql-proxy # # Note: this exposes your data to the internet - take care to ensure your allowlist is strict enough -pipeline_db_authorized_networks = [ +postgres_db_authorized_networks = [ { name = "foo" value = "999.999.999.999/32" @@ -56,7 +58,7 @@ pipeline_db_authorized_networks = [ ] # Note: the size of the database instance determines the number of concurrent connections - each Postgres Loader instance creates 10 open connections so having # a sufficiently powerful database tier is important to not running out of connection slots -pipeline_db_tier = "db-g1-small" +postgres_db_tier = "db-g1-small" # See for more information: https://registry.terraform.io/modules/snowplow-devops/collector-pubsub-ce/google/latest#telemetry # Telemetry principles: https://docs.snowplowanalytics.com/docs/open-source-quick-start/what-is-the-quick-start-for-open-source/telemetry-principles/ diff --git a/terraform/gcp/pipeline/secure/variables.tf b/terraform/gcp/pipeline/secure/variables.tf index 7dbac3c..794a92f 100644 --- a/terraform/gcp/pipeline/secure/variables.tf +++ b/terraform/gcp/pipeline/secure/variables.tf @@ -48,23 +48,29 @@ variable "iglu_super_api_key" { sensitive = true } -variable "pipeline_db_name" { +variable "postgres_db_enabled" { + description = "Whether to enable loading into a Postgres Database" + default = false + type = bool +} + +variable "postgres_db_name" { description = "The name of the database to connect to" type = string } -variable "pipeline_db_username" { +variable "postgres_db_username" { description = "The username to use to connect to the database" type = string } -variable "pipeline_db_password" { +variable "postgres_db_password" { description = "The password to use to connect to the database" type = string sensitive = true } -variable "pipeline_db_authorized_networks" { +variable "postgres_db_authorized_networks" { description = "The list of CIDR ranges to allow access to the Pipeline Database over" default = [] type = list(object({ @@ -73,12 +79,18 @@ variable "pipeline_db_authorized_networks" { })) } -variable "pipeline_db_tier" { +variable "postgres_db_tier" { description = "The instance type to assign to the deployed Cloud SQL instance" type = string default = "db-g1-small" } +variable "bigquery_db_enabled" { + description = "Whether to enable loading into a BigQuery Dataset" + default = false + type = bool +} + variable "telemetry_enabled" { description = "Whether or not to send telemetry information back to Snowplow Analytics Ltd" type = bool
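
The loader modules and resources introduced in this patch are all guarded with count = var.<destination>_enabled ? 1 : 0, which turns each of them into a zero- or one-element list; downstream references therefore use the splat-plus-join idiom (for example join("", module.postgres_db.*.port)) to flatten the optional value back to a plain string. A minimal, self-contained sketch of that idiom, using the hashicorp/random provider purely for illustration — the names below are hypothetical and not part of the patch:

variable "db_enabled" {
  description = "Whether to create the optional resource"
  type        = bool
  default     = false
}

# With count set, the resource is a list with zero or one element.
resource "random_id" "db" {
  count       = var.db_enabled ? 1 : 0
  byte_length = 4
}

output "db_id" {
  # The splat yields [] when the resource is disabled and ["<hex>"] when
  # enabled; join("", ...) flattens either case to a plain string, so the
  # output never fails on a missing index.
  value = join("", random_id.db.*.hex)
}

On newer Terraform versions this flattening is often written with the one() function or an explicit [0] index, but join("", ...) matches the 0.12-era splat style used throughout these stacks.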
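
Note that renaming terraform.tfvars takes both variable files out of Terraform's auto-load set (only terraform.tfvars and *.auto.tfvars are loaded implicitly), so the file for the chosen destination has to be passed explicitly, e.g. terraform apply -var-file=bigquery.terraform.tfvars. The two enable flags are also independent, so one deployment can load both destinations from a single merged file; a sketch of the relevant lines, assuming the remaining required inputs (project, region, network, Iglu, SSH) are present as in the examples above:

# merged.terraform.tfvars (hypothetical file name)
# --- Snowplow Postgres Loader
postgres_db_enabled  = true
postgres_db_name     = "snowplow"
postgres_db_username = "snowplow"
postgres_db_password = "Hell0W0rld!2" # change and keep this secret!

# --- Snowplow BigQuery Loader
bigquery_db_enabled = true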