diff --git a/.github/workflows/pr-precommit.yml b/.github/workflows/pr-precommit.yml index 10117551d7..12a32c5900 100644 --- a/.github/workflows/pr-precommit.yml +++ b/.github/workflows/pr-precommit.yml @@ -49,11 +49,5 @@ jobs: # https://github.com/terraform-linters/tflint/blob/master/docs/user-guide/plugins.md#avoiding-rate-limiting GITHUB_TOKEN: ${{ github.token }} - uses: pre-commit/action@v3.0.1 - - uses: pre-commit-ci/lite-action@v1.0.2 - # this if statement looks funny but it ensures that this step runs - # only if: user has applied "pre-commit-autofix" label - # even if: job has failed - # not if: job is canceled - if: | - (success() || failure()) && - contains(github.event.pull_request.labels.*.name, 'pre-commit-autofix') + with: + extra_args: --show-diff-on-failure --all-files diff --git a/cmd/create.go b/cmd/create.go index a68aa92936..096754b9b0 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -81,6 +81,7 @@ func runCreateCmd(cmd *cobra.Command, args []string) { func doCreate(path string) string { bp := expandOrDie(path) deplDir := filepath.Join(createFlags.outputDir, bp.DeploymentName()) + logging.Info("Creating deployment folder %q ...", deplDir) checkErr(checkOverwriteAllowed(deplDir, bp, createFlags.overwriteDeployment, createFlags.forceOverwrite)) checkErr(modulewriter.WriteDeployment(bp, deplDir)) return deplDir diff --git a/cmd/root.go b/cmd/root.go index d082dcb821..90d7d5877f 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -52,7 +52,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.30.0", + Version: "v1.31.0", Annotations: annotation, } ) diff --git a/community/examples/hpc-slurm-gromacs.yaml b/community/examples/hpc-slurm-gromacs.yaml index abe7ee5fdf..5198dba03a 100644 --- a/community/examples/hpc-slurm-gromacs.yaml +++ b/community/examples/hpc-slurm-gromacs.yaml @@ -100,7 +100,7 @@ deployment_groups: - id: compute_partition source: 
community/modules/compute/schedmd-slurm-gcp-v6-partition - use: [compute_nodeset, homefs, appsfs] + use: [compute_nodeset] settings: partition_name: compute is_default: true diff --git a/community/examples/hpc-slurm-ramble-gromacs.yaml b/community/examples/hpc-slurm-ramble-gromacs.yaml index 15e6577c95..9a6c165e5a 100644 --- a/community/examples/hpc-slurm-ramble-gromacs.yaml +++ b/community/examples/hpc-slurm-ramble-gromacs.yaml @@ -29,9 +29,9 @@ vars: deployment_groups: - group: primary modules: - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc + # Source is an embedded module, denoted by "modules/*" without ./, ../, / + # as a prefix. To refer to a local module, prefix with ./, ../ or / + # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc diff --git a/community/examples/hpc-slurm-sharedvpc.yaml b/community/examples/hpc-slurm-sharedvpc.yaml index d44d333140..61f5c8173c 100644 --- a/community/examples/hpc-slurm-sharedvpc.yaml +++ b/community/examples/hpc-slurm-sharedvpc.yaml @@ -65,7 +65,7 @@ deployment_groups: - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: [debug_nodeset, homefs] + use: [debug_nodeset] settings: partition_name: debug exclusive: false # allows nodes to stay up after jobs are done @@ -80,7 +80,7 @@ deployment_groups: - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: [compute_nodeset, homefs] + use: [compute_nodeset] settings: partition_name: compute diff --git a/community/examples/hpc-slurm-ubuntu2004-v6.yaml b/community/examples/hpc-slurm-ubuntu2004-v6.yaml index 3c053059c6..06c2bb4a93 100644 --- a/community/examples/hpc-slurm-ubuntu2004-v6.yaml +++ b/community/examples/hpc-slurm-ubuntu2004-v6.yaml @@ -31,9 +31,9 @@ vars: deployment_groups: - group: primary modules: - # Source is an embedded 
resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc + # Source is an embedded module, denoted by "modules/*" without ./, ../, / + # as a prefix. To refer to a local module, prefix with ./, ../ or / + # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc @@ -43,7 +43,7 @@ deployment_groups: settings: local_mount: /home - - id: debug_node_group + - id: debug_nodeset source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset use: [network1] settings: @@ -54,15 +54,13 @@ deployment_groups: - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: - - homefs - - debug_node_group + use: [debug_nodeset] settings: partition_name: debug exclusive: false # allows nodes to stay up after jobs are done is_default: true - - id: compute_node_group + - id: compute_nodeset source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset use: [network1] settings: @@ -72,9 +70,7 @@ deployment_groups: - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: - - homefs - - compute_node_group + use: [compute_nodeset] settings: partition_name: compute diff --git a/community/examples/hpc-slurm-ubuntu2004.yaml b/community/examples/hpc-slurm-ubuntu2004.yaml index 637a167602..e57a04e51c 100644 --- a/community/examples/hpc-slurm-ubuntu2004.yaml +++ b/community/examples/hpc-slurm-ubuntu2004.yaml @@ -32,9 +32,9 @@ vars: deployment_groups: - group: primary modules: - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc + # Source is an embedded module, denoted by "modules/*" without ./, ../, / + # as a prefix. 
To refer to a local module, prefix with ./, ../ or / + # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc diff --git a/community/examples/hpc-slurm6-tpu-maxtext.yaml b/community/examples/hpc-slurm6-tpu-maxtext.yaml index 8a0b9b25d1..e33807c5cc 100644 --- a/community/examples/hpc-slurm6-tpu-maxtext.yaml +++ b/community/examples/hpc-slurm6-tpu-maxtext.yaml @@ -70,7 +70,6 @@ deployment_groups: source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu use: [network] settings: - name: v4x8 node_type: v4-8 tf_version: 2.14.0 # Preemptible TPUs cost much less than non-preemptible TPUs. diff --git a/community/examples/hpc-slurm6-tpu.yaml b/community/examples/hpc-slurm6-tpu.yaml index a2f899b943..6a6b2f59c3 100644 --- a/community/examples/hpc-slurm6-tpu.yaml +++ b/community/examples/hpc-slurm6-tpu.yaml @@ -32,7 +32,6 @@ deployment_groups: source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu use: [network] settings: - name: v3x8 node_type: v3-8 tf_version: 2.14.0 # Preemptible TPUs cost much less than non-preemptible TPUs. diff --git a/tools/cloud-build/daily-tests/blueprints/slurm-gcp-v5-high-io.yaml b/community/examples/htc-slurm-v6.yaml similarity index 63% rename from tools/cloud-build/daily-tests/blueprints/slurm-gcp-v5-high-io.yaml rename to community/examples/htc-slurm-v6.yaml index a00d89ba97..324f6de851 100644 --- a/tools/cloud-build/daily-tests/blueprints/slurm-gcp-v5-high-io.yaml +++ b/community/examples/htc-slurm-v6.yaml @@ -1,4 +1,5 @@ -# Copyright 2022 Google LLC +# Copyright 2024 Google LLC +# Copyright (C) SchedMD LLC. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,22 +15,21 @@ --- -blueprint_name: hpc-cluster-high-io-v5 +# This blueprint provisions a cluster using the Slurm scheduler configured to +# efficiently run many short duration, loosely-coupled (non-MPI) jobs. 
See also: +# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md +# https://slurm.schedmd.com/high_throughput.html + +blueprint_name: htc-slurm-v6 vars: project_id: ## Set GCP Project ID Here ## - deployment_name: high-io-slurm-gcp-v5 + deployment_name: htc-slurm-v6 region: us-west4 zone: us-west4-c # By default, public IPs are set in the login and controller to allow easier # SSH access. To turn this behavior off, set this to true. disable_public_ips: false - # Set to true for active cluster reconfiguration. - # Note that setting this option requires additional dependencies to be installed locally. - enable_reconfigure: true - # When set, active compute nodes will be cleaned up on destroy. - # Note that setting this option requires additional dependencies to be installed locally. - enable_cleanup_compute: true # Documentation for each of the modules used below can be found at # https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md @@ -40,18 +40,18 @@ deployment_groups: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / # Example - ./modules/network/pre-existing-vpc - - id: network1 + - id: network source: modules/network/vpc - id: homefs source: modules/file-system/filestore - use: [network1] + use: [network] settings: local_mount: /home - id: projectsfs source: modules/file-system/filestore - use: [network1] + use: [network] settings: filestore_tier: HIGH_SCALE_SSD size_gb: 10240 @@ -61,94 +61,98 @@ deployment_groups: # https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud - id: scratchfs source: community/modules/file-system/DDN-EXAScaler - use: [network1] + use: [network] settings: local_mount: /scratch + # The compute partition is designed for performance. + # Use: + # `srun -N 4 -p compute <>` for any node in the partition. 
+ # `srun -N 4 -p compute --mincpus 30 <>` for node group c2s60. + - id: compute_nodeset_c2s60 + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + name: c2s60 + node_count_dynamic_max: 200 + bandwidth_tier: gvnic_enabled + enable_placement: false + + - id: compute_nodeset_c2s30 + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + node_count_dynamic_max: 200 + machine_type: c2-standard-30 + bandwidth_tier: gvnic_enabled + enable_placement: false + + - id: compute_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - compute_nodeset_c2s60 + - compute_nodeset_c2s30 + settings: + partition_name: compute + exclusive: false + # The lowcost partition is designed to run at a lower cost and without additional quota # Use: # `srun -N 4 <>` for any node in the partition. # `srun -N 4 --mincpus 2` for node group n2s4. - - id: low_cost_node_group_n2s2 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + - id: low_cost_nodeset_n2s2 + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] settings: name: n2s2 machine_type: n2-standard-2 node_count_dynamic_max: 10 + bandwidth_tier: gvnic_enabled + enable_placement: false - - id: low_cost_node_group_n2s4 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + - id: low_cost_nodeset_n2s4 + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] settings: name: n2s4 machine_type: n2-standard-4 node_count_dynamic_max: 10 + bandwidth_tier: gvnic_enabled + enable_placement: false - id: low_cost_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - network1 - - homefs - - scratchfs - - projectsfs - - low_cost_node_group_n2s2 - - low_cost_node_group_n2s4 + - low_cost_nodeset_n2s2 + - low_cost_nodeset_n2s4 settings: is_default: true 
partition_name: lowcost - enable_placement: false exclusive: false - # The compute partition is designed for performance. - # Use: - # `srun -N 4 -p compute <>` for any node in the partition. - # `srun -N 4 -p compute --mincpus 30 <>` for node group c2s60. - - - id: compute_node_group_c2s60 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: c2s60 - node_count_dynamic_max: 200 - - - id: compute_node_group_c2s30 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: c2s30 - node_count_dynamic_max: 200 - machine_type: c2-standard-30 - - - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - scratchfs - - projectsfs - - compute_node_group_c2s60 - - compute_node_group_c2s30 + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network] settings: - partition_name: compute + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: $(vars.disable_public_ips) - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - - network1 + - network - homefs - scratchfs - projectsfs - low_cost_partition - compute_partition + - slurm_login settings: machine_type: c2-standard-8 disable_controller_public_ips: $(vars.disable_public_ips) - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - machine_type: n2-standard-4 - disable_login_public_ips: $(vars.disable_public_ips) + slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurm.conf.tpl + slurmdbd_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurmdbd.conf.tpl - id: hpc_dashboard source: modules/monitoring/dashboard diff --git 
a/community/examples/intel/hpc-slurm-daos.yaml b/community/examples/intel/hpc-slurm-daos.yaml index 9a2e89e44b..d6f4d7a11d 100644 --- a/community/examples/intel/hpc-slurm-daos.yaml +++ b/community/examples/intel/hpc-slurm-daos.yaml @@ -131,7 +131,7 @@ deployment_groups: - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: [debug_nodeset, homefs] + use: [debug_nodeset] settings: partition_name: debug exclusive: false # allows nodes to stay up after jobs are done @@ -154,7 +154,7 @@ deployment_groups: - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: [compute_nodeset, homefs] + use: [compute_nodeset] settings: partition_name: compute diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index db18855352..d9ad22d1a7 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -28,9 +28,9 @@ vars: deployment_groups: - group: primary modules: - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc + # Source is an embedded module, denoted by "modules/*" without ./, ../, / + # as a prefix. 
To refer to a local module, prefix with ./, ../ or / + # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index ad2c10f94c..0ad917aae8 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -19,7 +19,7 @@ dill==0.3.6 distlib==0.3.6 # django-revproxy==0.11.0 released but not yet in pypi git+https://github.com/jazzband/django-revproxy.git@d2234005135dc0771b7c4e0bb0465664ccfa5787 -Django==4.2.10 +Django==4.2.11 django-allauth==0.54.0 django-extensions==3.2.3 djangorestframework==3.14.0 diff --git a/community/modules/compute/gke-node-pool/versions.tf b/community/modules/compute/gke-node-pool/versions.tf index 3b5b7737bd..4ff7942505 100644 --- a/community/modules/compute/gke-node-pool/versions.tf +++ b/community/modules/compute/gke-node-pool/versions.tf @@ -26,6 +26,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.31.0" } } diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 7e9631f5a9..98e602e105 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -212,7 +212,7 @@ limitations under the License. 
|------|--------|---------| | [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 8.0 | | [mig](#module\_mig) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | ## Resources diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 3e24af0cc7..314377f95b 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -125,7 +125,7 @@ resource "google_storage_bucket_object" "execute_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" project_id = var.project_id region = var.region diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index 04f02e218d..825126ef26 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.31.0" } } diff --git a/community/modules/compute/pbspro-execution/README.md b/community/modules/compute/pbspro-execution/README.md index fa864aaf4a..e9305f9183 100644 --- 
a/community/modules/compute/pbspro-execution/README.md +++ b/community/modules/compute/pbspro-execution/README.md @@ -74,9 +74,9 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | -| [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.29.0&depth=1 | -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.29.0&depth=1 | +| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | +| [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.30.0&depth=1 | +| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.30.0&depth=1 | ## Resources diff --git a/community/modules/compute/pbspro-execution/main.tf b/community/modules/compute/pbspro-execution/main.tf index df4208234a..7bb8bc200b 100644 --- a/community/modules/compute/pbspro-execution/main.tf +++ b/community/modules/compute/pbspro-execution/main.tf @@ -42,7 +42,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.30.0&depth=1" pbs_exec = var.pbs_exec pbs_home = var.pbs_home @@ -53,7 +53,7 @@ module "pbs_install" { } module "execution_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = 
"github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id @@ -68,7 +68,7 @@ module "execution_startup_script" { } module "pbs_execution" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.30.0&depth=1" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index 0201db34dd..5579725d35 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.31.0" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index eaed8ffda2..1244ff3fb2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.31.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index 02887707ce..532f97ee6d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md 
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -20,7 +20,6 @@ be accessed as `tpu` partition. source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu use: [network] settings: - name: v2x8 node_type: v2-8 tf_version: 2.10.0 disable_public_ips: false @@ -61,7 +60,7 @@ No resources. | [data\_disks](#input\_data\_disks) | The data disks to include in the TPU node | `list(string)` | `[]` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | If set to false. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | | [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-4-tf- | `string` | `null` | no | -| [name](#input\_name) | Name of the nodeset tpu. | `string` | n/a | yes | +| [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set.
If setting manually, ensure a unique value across all nodesets. | `string` | n/a | yes | | [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of auto-scaling nodes allowed in this partition. | `number` | `5` | no | | [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no | | [node\_type](#input\_node\_type) | Specify a node type to base the vm configuration upon it. | `string` | n/a | yes | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf index ca70aa9bc9..3f5d195406 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf @@ -18,11 +18,12 @@ # } locals { + name = substr(replace(var.name, "/[^a-z0-9]/", ""), 0, 14) nodeset_tpu = { node_count_static = var.node_count_static node_count_dynamic_max = var.node_count_dynamic_max - nodeset_name = var.name + nodeset_name = local.name node_type = var.node_type accelerator_config = var.accelerator_config diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/metadata.yaml b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/metadata.yaml index 13ea127b3c..95b6d1c730 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/metadata.yaml +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/metadata.yaml @@ -17,4 +17,5 @@ spec: requirements: services: [] ghpc: + inject_module_id: name has_to_be_used: true diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf index 9b7315935d..3eb265a173 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf @@ -25,13 +25,11 @@ variable 
"node_count_dynamic_max" { } variable "name" { - description = "Name of the nodeset tpu." + description = <<-EOD + Name of the nodeset. Automatically populated by the module id if not set. + If setting manually, ensure a unique value across all nodesets. + EOD type = string - - validation { - condition = can(regex("^[a-z](?:[a-z0-9]{0,5})$", var.name)) - error_message = "Nodeset TPU name (var.name) must begin with a letter, be fully alphanumeric and be 6 characters or less. Regexp: '^[a-z](?:[a-z0-9]{0,5})$'." - } } variable "disable_public_ips" { diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index 340f1f026c..353f43a8e1 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.31.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 10e2bc3539..5046f93e2b 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -178,7 +178,7 @@ No modules. | [maintenance\_interval](#input\_maintenance\_interval) | Sets the maintenance interval for instances in this nodeset.
See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#maintenance_interval. | `string` | `null` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | -| [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set | `string` | n/a | yes | +| [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set.
If setting manually, ensure a unique value across all nodesets. | `string` | n/a | yes | | [node\_conf](#input\_node\_conf) | Map of Slurm node line configuration. | `map(any)` | `{}` | no | | [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of auto-scaling nodes allowed in this partition. | `number` | `10` | no | | [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 44cbfcc811..21f18490a5 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -18,7 +18,7 @@ locals { } locals { - name = substr(replace(var.name, "/[^a-z0-9]/", ""), 0, 6) + name = substr(replace(var.name, "/[^a-z0-9]/", ""), 0, 14) additional_disks = [ for ad in var.additional_disks : { diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 34b96044a6..5847bd4628 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -13,7 +13,10 @@ # limitations under the License. variable "name" { - description = "Name of the nodeset. Automatically populated by the module id if not set" + description = <<-EOD + Name of the nodeset. Automatically populated by the module id if not set. + If setting manually, ensure a unique value across all nodesets. 
+ EOD type = string } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 5d0a467a92..f3a55fcfd2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.31.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md index 973cb927af..c40179b44e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md @@ -84,8 +84,9 @@ No resources. |------|-------------|------|---------|:--------:| | [exclusive](#input\_exclusive) | Exclusive job access to nodes. | `bool` | `true` | no | | [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on the partition compute nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | +| [network\_storage](#input\_network\_storage) | DEPRECATED | `any` | `null` | no | | [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
subnetwork_self_link = string
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])
}))
| `[]` | no | +| [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | | [partition\_name](#input\_partition\_name) | The name of the slurm partition. | `string` | n/a | yes | @@ -95,6 +96,7 @@ No resources. | Name | Description | |------|-------------| | [nodeset](#output\_nodeset) | Details of a nodesets in this partition | -| [nodeset\_tpu](#output\_nodeset\_tpu) | Details of a nodesets tpu in this partition | +| [nodeset\_dyn](#output\_nodeset\_dyn) | Details of a dynamic nodesets in this partition | +| [nodeset\_tpu](#output\_nodeset\_tpu) | Details of a TPU nodesets in this partition | | [partitions](#output\_partitions) | Details of a slurm partition | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf index acc3a4f808..bffbd57fb3 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf @@ -19,10 +19,10 @@ locals { partition = { default = var.is_default enable_job_exclusive = var.exclusive - network_storage = var.network_storage partition_conf = var.partition_conf partition_name = var.partition_name partition_nodeset = [for ns in var.nodeset : ns.nodeset_name] partition_nodeset_tpu = [for ns in var.nodeset_tpu : ns.nodeset_name] + partition_nodeset_dyn = [for ns in var.nodeset_dyn : ns.nodeset_name] } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf index 94d4915fdf..e3115e3d39 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf @@ -38,7 +38,14 @@ output "nodeset" { } output "nodeset_tpu" { - description = "Details of a nodesets tpu in this partition" + description = "Details of a TPU nodesets in this partition" value = var.nodeset_tpu 
} + + +output "nodeset_dyn" { + description = "Details of a dynamic nodesets in this partition" + + value = var.nodeset_dyn +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf index ab8ebfd439..e08f3b1e74 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf @@ -46,20 +46,6 @@ variable "exclusive" { default = true } -variable "network_storage" { - description = "An array of network attached storage mounts to be configured on the partition compute nodes." - type = list(object({ - server_ip = string, - remote_mount = string, - local_mount = string, - fs_type = string, - mount_options = string, - client_install_runner = map(string) - mount_runner = map(string) - })) - default = [] -} - variable "nodeset" { description = "Define nodesets, as a list." type = list(object({ @@ -185,3 +171,28 @@ variable "nodeset_tpu" { error_message = "All TPU nodesets must have a unique name." } } + +variable "nodeset_dyn" { + description = "Defines dynamic nodesets, as a list." + type = list(object({ + nodeset_name = string + nodeset_feature = string + })) + default = [] + + validation { + condition = length(distinct([for x in var.nodeset_dyn : x.nodeset_name])) == length(var.nodeset_dyn) + error_message = "All dynamic nodesets must have a unique name." + } +} + +# tflint-ignore: terraform_unused_declarations +variable "network_storage" { + description = "DEPRECATED" + type = any + default = null + validation { + condition = var.network_storage == null + error_message = "network_storage in partition module is deprecated and should be removed." 
+ } +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index 5e0ca4feb5..fee440218c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.31.0" } } diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 6b337e71e5..c9446a946d 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -30,10 +30,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.31.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.31.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index 972f251be0..8fd9d7d23c 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.31.0" } required_version = ">= 0.14.0" } diff --git 
a/community/modules/file-system/gke-persistent-volume/versions.tf b/community/modules/file-system/gke-persistent-volume/versions.tf index c3e89fe11a..1f4e438dfa 100644 --- a/community/modules/file-system/gke-persistent-volume/versions.tf +++ b/community/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.31.0" } } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index 60c26360d8..464cc34296 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.31.0" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index d84b1aef6e..cbcce1cab9 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.31.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.31.0" } } diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index 18ab5fb5b1..fdcf9b5115 100644 --- a/community/modules/project/service-enablement/versions.tf 
+++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.31.0" } required_version = ">= 0.14.0" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index aadf3c22d6..e69ae052df 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.31.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.31.0" } required_version = ">= 1.0" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index dc3e1e6815..670bf9ef52 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.31.0" } } diff --git a/community/modules/remote-desktop/chrome-remote-desktop/README.md b/community/modules/remote-desktop/chrome-remote-desktop/README.md index 914daaa2fc..b6afbaeb8f 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/README.md +++ b/community/modules/remote-desktop/chrome-remote-desktop/README.md @@ -63,8 +63,8 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | -| [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.29.0&depth=1 | +| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | +| [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.30.0&depth=1 | ## Resources diff --git a/community/modules/remote-desktop/chrome-remote-desktop/main.tf b/community/modules/remote-desktop/chrome-remote-desktop/main.tf index 410912003b..cd5c95dde8 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/main.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/main.tf @@ -55,7 +55,7 @@ locals { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id @@ -71,7 +71,7 @@ module "client_startup_script" { } module "instances" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.30.0&depth=1" instance_count = var.instance_count name_prefix = var.name_prefix diff --git a/community/modules/scheduler/gke-cluster/versions.tf b/community/modules/scheduler/gke-cluster/versions.tf index 58c4f53156..e9bef425df 100644 --- a/community/modules/scheduler/gke-cluster/versions.tf +++ b/community/modules/scheduler/gke-cluster/versions.tf @@ -30,6 +30,6 @@ terraform { } } provider_meta "google" { - module_name 
= "blueprints/terraform/hpc-toolkit:gke-cluster/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.31.0" } } diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index a8d3770c39..e67315f406 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -122,7 +122,7 @@ limitations under the License. |------|--------|---------| | [access\_point\_instance\_template](#module\_access\_point\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 84d7959 | | [htcondor\_ap](#module\_htcondor\_ap) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 92682ca019..f1f165451f 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -143,7 +143,7 @@ resource "google_storage_bucket_object" "ap_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index 296d4e4aa2..23894365b6 100644 --- 
a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.31.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index b7e50bd755..98c8abf483 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -108,7 +108,7 @@ limitations under the License. |------|--------|---------| | [central\_manager\_instance\_template](#module\_central\_manager\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 84d7959 | | [htcondor\_cm](#module\_htcondor\_cm) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index 602bb942c2..68009ee46a 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -110,7 +110,7 @@ resource "google_storage_bucket_object" "cm_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = 
"github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index 02f7f07e8b..d8daaed9bc 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.31.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 931b71472b..72fb2e7de3 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.31.0" } required_version = ">= 1.3.0" diff --git a/community/modules/scheduler/htcondor-service-accounts/README.md b/community/modules/scheduler/htcondor-service-accounts/README.md index 8d098dcc1c..363026299a 100644 --- a/community/modules/scheduler/htcondor-service-accounts/README.md +++ b/community/modules/scheduler/htcondor-service-accounts/README.md @@ -100,9 +100,9 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [access\_point\_service\_account](#module\_access\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.29.0&depth=1 | -| [central\_manager\_service\_account](#module\_central\_manager\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.29.0&depth=1 | -| [execute\_point\_service\_account](#module\_execute\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.29.0&depth=1 | +| [access\_point\_service\_account](#module\_access\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.30.0&depth=1 | +| [central\_manager\_service\_account](#module\_central\_manager\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.30.0&depth=1 | +| [execute\_point\_service\_account](#module\_execute\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.30.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/htcondor-service-accounts/main.tf b/community/modules/scheduler/htcondor-service-accounts/main.tf index c8f7151050..91045349ba 100644 --- a/community/modules/scheduler/htcondor-service-accounts/main.tf +++ b/community/modules/scheduler/htcondor-service-accounts/main.tf @@ -21,7 +21,7 @@ # require them module "access_point_service_account" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.30.0&depth=1" project_id = var.project_id display_name = "HTCondor Access Point" @@ -31,7 +31,7 @@ module "access_point_service_account" { } module "execute_point_service_account" { - 
source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.30.0&depth=1" project_id = var.project_id display_name = "HTCondor Execute Point" @@ -41,7 +41,7 @@ module "execute_point_service_account" { } module "central_manager_service_account" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.30.0&depth=1" project_id = var.project_id display_name = "HTCondor Central Manager" diff --git a/community/modules/scheduler/pbspro-client/README.md b/community/modules/scheduler/pbspro-client/README.md index adbd76955f..a2d3442636 100644 --- a/community/modules/scheduler/pbspro-client/README.md +++ b/community/modules/scheduler/pbspro-client/README.md @@ -74,9 +74,9 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | -| [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.29.0&depth=1 | -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.29.0&depth=1 | +| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | +| [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.30.0&depth=1 | +| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.30.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/pbspro-client/main.tf b/community/modules/scheduler/pbspro-client/main.tf index 8c6e7876b9..2e965eda04 100644 --- a/community/modules/scheduler/pbspro-client/main.tf +++ b/community/modules/scheduler/pbspro-client/main.tf @@ -32,7 +32,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.30.0&depth=1" pbs_exec = var.pbs_exec pbs_home = var.pbs_home @@ -43,7 +43,7 @@ module "pbs_install" { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id @@ -57,7 +57,7 @@ module "client_startup_script" { } module "pbs_client" { - source = 
"github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.30.0&depth=1" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/scheduler/pbspro-server/README.md b/community/modules/scheduler/pbspro-server/README.md index 10811094b0..20abb28aea 100644 --- a/community/modules/scheduler/pbspro-server/README.md +++ b/community/modules/scheduler/pbspro-server/README.md @@ -69,10 +69,10 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.29.0&depth=1 | -| [pbs\_qmgr](#module\_pbs\_qmgr) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr | v1.29.0&depth=1 | -| [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.29.0&depth=1 | -| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | +| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.30.0&depth=1 | +| [pbs\_qmgr](#module\_pbs\_qmgr) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr | v1.30.0&depth=1 | +| [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.30.0&depth=1 | +| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/pbspro-server/main.tf b/community/modules/scheduler/pbspro-server/main.tf index c7c57b50a8..fcf62706b8 100644 --- a/community/modules/scheduler/pbspro-server/main.tf +++ 
b/community/modules/scheduler/pbspro-server/main.tf @@ -32,7 +32,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.30.0&depth=1" pbs_data_service_user = var.pbs_data_service_user pbs_exec = var.pbs_exec @@ -45,7 +45,7 @@ module "pbs_install" { } module "pbs_qmgr" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr?ref=v1.30.0&depth=1" client_host_count = var.client_host_count client_hostname_prefix = var.client_hostname_prefix @@ -55,7 +55,7 @@ module "pbs_qmgr" { } module "server_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id @@ -70,7 +70,7 @@ module "server_startup_script" { } module "pbs_server" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.30.0&depth=1" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index 8c82b91f4d..31919e0832 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.30.0" + 
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.31.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index c9c26a382a..c5b53bd798 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.31.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 80fff1b89f..38553b8488 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -142,6 +142,7 @@ limitations under the License. 
| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 6.4.3&depth=1 | | [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.4.3&depth=1 | | [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.4.3&depth=1 | +| [slurm\_nodeset\_dyn](#module\_slurm\_nodeset\_dyn) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_dyn | 6.2.0 | | [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.4.3&depth=1 | | [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.4.3&depth=1 | | [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 6.4.3&depth=1 | @@ -207,9 +208,10 @@ limitations under the License. | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string) # TODO: is it used? should remove it?
mount_runner = map(string)
}))
| `[]` | no | | [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])
}))
| `[]` | no | +| [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | -| [partitions](#input\_partitions) | Cluster partitions as a list. See module slurm\_partition. |
list(object({
default = optional(bool, false)
enable_job_exclusive = optional(bool, false)
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
partition_conf = optional(map(string), {})
partition_name = string
partition_nodeset = optional(list(string), [])
partition_nodeset_dyn = optional(list(string), [])
partition_nodeset_tpu = optional(list(string), [])
resume_timeout = optional(number)
suspend_time = optional(number, 300)
suspend_timeout = optional(number)
}))
| n/a | yes | +| [partitions](#input\_partitions) | Cluster partitions as a list. See module slurm\_partition. |
list(object({
default = optional(bool, false)
enable_job_exclusive = optional(bool, false)
partition_conf = optional(map(string), {})
partition_name = string
partition_nodeset = optional(list(string), [])
partition_nodeset_dyn = optional(list(string), [])
partition_nodeset_tpu = optional(list(string), [])
resume_timeout = optional(number)
suspend_time = optional(number, 300)
suspend_timeout = optional(number)
}))
| n/a | yes | | [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = string
}))
| `[]` | no | @@ -226,5 +228,8 @@ limitations under the License. ## Outputs -No outputs. +| Name | Description | +|------|-------------| +| [slurm\_bucket\_path](#output\_slurm\_bucket\_path) | Bucket path used by cluster. | +| [slurm\_cluster\_name](#output\_slurm\_cluster\_name) | Slurm cluster name. | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurm.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurm.conf.tpl new file mode 100644 index 0000000000..8fb3f695e0 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurm.conf.tpl @@ -0,0 +1,67 @@ +# slurm.conf +# https://slurm.schedmd.com/high_throughput.html + +ProctrackType=proctrack/cgroup +SlurmctldPidFile=/var/run/slurm/slurmctld.pid +SlurmdPidFile=/var/run/slurm/slurmd.pid +TaskPlugin=task/affinity,task/cgroup +MaxArraySize=10001 +MaxJobCount=500000 +MaxNodeCount=100000 +MinJobAge=60 + +# +# +# SCHEDULING +SchedulerType=sched/backfill +SelectType=select/cons_tres +SelectTypeParameters=CR_Core_Memory + +# +# +# LOGGING AND ACCOUNTING +SlurmctldDebug=error +SlurmdDebug=error + +# +# +# TIMERS +MessageTimeout=60 + +################################################################################ +# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv # +################################################################################ + +SlurmctldHost={control_host}({control_addr}) + +AuthType=auth/munge +AuthInfo=cred_expire=120 +AuthAltTypes=auth/jwt +CredType=cred/munge +MpiDefault={mpi_default} +ReturnToService=2 +SlurmctldPort={control_host_port} +SlurmdPort=6818 +SlurmdSpoolDir=/var/spool/slurmd +SlurmUser=slurm +StateSaveLocation={state_save} + +# +# +# LOGGING AND ACCOUNTING +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost={control_host} +ClusterName={name} +SlurmctldLogFile={slurmlog}/slurmctld.log +SlurmdLogFile={slurmlog}/slurmd-%n.log + +# +# +# GENERATED CLOUD CONFIGURATIONS 
+include cloud.conf + +################################################################################ +# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ # +################################################################################ + +SchedulerParameters=defer,salloc_wait_nodes,batch_sched_delay=20,bf_continue,bf_interval=300,bf_min_age_reserve=10800,bf_resolution=600,bf_yield_interval=1000000,partition_job_depth=500,sched_max_job_start=200,sched_min_interval=2000000 diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurmdbd.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurmdbd.conf.tpl new file mode 100644 index 0000000000..9dc4ed9c70 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurmdbd.conf.tpl @@ -0,0 +1,34 @@ +# slurmdbd.conf +# https://slurm.schedmd.com/slurmdbd.conf.html + +DebugLevel=info +PidFile=/var/run/slurm/slurmdbd.pid + +# https://slurm.schedmd.com/slurmdbd.conf.html#OPT_CommitDelay +CommitDelay=1 + +################################################################################ +# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv # +################################################################################ + +AuthType=auth/munge +AuthAltTypes=auth/jwt +AuthAltParameters=jwt_key={state_save}/jwt_hs256.key + +DbdHost={control_host} + +LogFile={slurmlog}/slurmdbd.log + +SlurmUser=slurm + +StorageLoc={db_name} + +StorageType=accounting_storage/mysql +StorageHost={db_host} +StoragePort={db_port} +StorageUser={db_user} +StoragePass={db_pass} + +################################################################################ +# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ # +################################################################################ diff --git a/tools/cloud-build/quota-check/bp.yaml b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf similarity index 50% rename from 
tools/cloud-build/quota-check/bp.yaml rename to community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf index e7b45aa646..d1346a51d7 100644 --- a/tools/cloud-build/quota-check/bp.yaml +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2024 "Google LLC" # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,23 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. ---- -blueprint_name: quota-check +output "slurm_cluster_name" { + description = "Slurm cluster name." + value = local.slurm_cluster_name +} -vars: - deployment_name: quota-check - project_id: # Provided by check.py - region: # Provided by check.py - zone: # Provided by check.py - -deployment_groups: # Need to have at least one module to have a valid blueprint -- group: noop - modules: [{id: noop, source: modules/network/pre-existing-vpc}] - -validators: -- validator: test_resource_requirements - inputs: - ignore_usage: true - requirements: - - metric: compute.googleapis.com/n2_cpus - required: 750 +output "slurm_bucket_path" { + description = "Bucket path used by cluster." + value = module.slurm_files.slurm_bucket_path +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index e05ecd6f77..c076111489 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -13,8 +13,14 @@ # limitations under the License. locals { - nodeset_map = { for x in var.nodeset : x.nodeset_name => x } - nodeset_tpu_map = { for x in var.nodeset_tpu : x.nodeset_name => x } + nodeset_map_ell = { for x in var.nodeset : x.nodeset_name => x... 
} + nodeset_map = { for k, vs in local.nodeset_map_ell : k => vs[0] } + + nodeset_tpu_map_ell = { for x in var.nodeset_tpu : x.nodeset_name => x... } + nodeset_tpu_map = { for k, vs in local.nodeset_tpu_map_ell : k => vs[0] } + + nodeset_dyn_map_ell = { for x in var.nodeset_dyn : x.nodeset_name => x... } + nodeset_dyn_map = { for k, vs in local.nodeset_dyn_map_ell : k => vs[0] } partition_map = { for x in var.partitions : x.partition_name => x } } @@ -100,6 +106,15 @@ module "slurm_nodeset_tpu" { subnetwork = each.value.subnetwork } +# NODESET DYNAMIC +module "slurm_nodeset_dyn" { + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_dyn?ref=6.2.0" + for_each = local.nodeset_dyn_map + + nodeset_name = each.value.nodeset_name + nodeset_feature = each.value.nodeset_feature +} + # PARTITION module "slurm_partition" { source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=6.4.3&depth=1" @@ -107,10 +122,10 @@ module "slurm_partition" { partition_nodeset = [for x in each.value.partition_nodeset : module.slurm_nodeset[x].nodeset_name if try(module.slurm_nodeset[x], null) != null] partition_nodeset_tpu = [for x in each.value.partition_nodeset_tpu : module.slurm_nodeset_tpu[x].nodeset_name if try(module.slurm_nodeset_tpu[x], null) != null] + partition_nodeset_dyn = [for x in each.value.partition_nodeset_dyn : module.slurm_nodeset_dyn[x].nodeset_name if try(module.slurm_nodeset_dyn[x], null) != null] default = each.value.default enable_job_exclusive = each.value.enable_job_exclusive - network_storage = each.value.network_storage partition_name = each.value.partition_name partition_conf = each.value.partition_conf resume_timeout = each.value.resume_timeout diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index 27e11f9064..7775720157 100644 --- 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -84,8 +84,7 @@ locals { filename = "ghpc_startup.sh" content = var.compute_startup_script }] - nodeset_startup_scripts = { - for ns in var.nodeset : ns.nodeset_name => ns.startup_script } + nodeset_startup_scripts = { for k, v in local.nodeset_map : k => v.startup_script } } module "slurm_files" { @@ -128,6 +127,7 @@ module "slurm_files" { partitions = values(module.slurm_partition)[*] nodeset = values(module.slurm_nodeset)[*] nodeset_tpu = values(module.slurm_nodeset_tpu)[*] + nodeset_dyn = values(module.slurm_nodeset_dyn)[*] depends_on = [module.bucket] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index c89e36f0ac..326fe9da92 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -158,7 +158,6 @@ variable "login_nodes" { ############ # NODESETS # ############ - variable "nodeset" { description = "Define nodesets, as a list." type = list(object({ @@ -241,14 +240,8 @@ variable "nodeset" { content = string })), []) })) default = [] - - validation { - condition = length(distinct([for x in var.nodeset : x.nodeset_name])) == length(var.nodeset) - error_message = "All nodesets must have a unique name." - } } -# REVIEWER_NOTE: copied from V6 cluster module as is variable "nodeset_tpu" { description = "Define TPU nodesets, as a list." type = list(object({ @@ -279,11 +272,16 @@ variable "nodeset_tpu" { reserved = optional(string, false) })) default = [] +} - validation { - condition = length(distinct([for x in var.nodeset_tpu : x.nodeset_name])) == length(var.nodeset_tpu) - error_message = "All TPU nodesets must have a unique name." 
- } + +variable "nodeset_dyn" { + description = "Defines dynamic nodesets, as a list." + type = list(object({ + nodeset_name = string + nodeset_feature = string + })) + default = [] } ############# @@ -295,15 +293,8 @@ variable "partitions" { Cluster partitions as a list. See module slurm_partition. EOD type = list(object({ - default = optional(bool, false) - enable_job_exclusive = optional(bool, false) - network_storage = optional(list(object({ - server_ip = string - remote_mount = string - local_mount = string - fs_type = string - mount_options = string - })), []) + default = optional(bool, false) + enable_job_exclusive = optional(bool, false) partition_conf = optional(map(string), {}) partition_name = string partition_nodeset = optional(list(string), []) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index 7733e74f2e..bd8faf49c1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.31.0" } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf index fceff60bbd..8eeeccb527 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.31.0" } } diff --git a/community/modules/scripts/htcondor-install/README.md 
b/community/modules/scripts/htcondor-install/README.md index 962c03a314..ecf4fd7595 100644 --- a/community/modules/scripts/htcondor-install/README.md +++ b/community/modules/scripts/htcondor-install/README.md @@ -136,6 +136,7 @@ No resources. |------|-------------|------|---------|:--------:| | [condor\_version](#input\_condor\_version) | Yum/DNF-compatible version string; leave unset to use latest 23.0 LTS release (examples: "23.0.0","23.*")) | `string` | `"23.*"` | no | | [enable\_docker](#input\_enable\_docker) | Install and enable docker daemon alongside HTCondor | `bool` | `true` | no | +| [http\_proxy](#input\_http\_proxy) | Set system default web (http and https) proxy for Windows HTCondor installation | `string` | `""` | no | ## Outputs diff --git a/community/modules/scripts/htcondor-install/main.tf b/community/modules/scripts/htcondor-install/main.tf index 6c2a864494..35e6e3978c 100644 --- a/community/modules/scripts/htcondor-install/main.tf +++ b/community/modules/scripts/htcondor-install/main.tf @@ -39,7 +39,8 @@ locals { install_htcondor_ps1 = templatefile( "${path.module}/templates/install-htcondor.ps1.tftpl", { - condor_version = var.condor_version + condor_version = var.condor_version, + http_proxy = var.http_proxy, }) required_apis = [ diff --git a/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl index 79941524dd..aae04862ed 100644 --- a/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl +++ b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl @@ -1,8 +1,15 @@ +#Requires -RunAsAdministrator + +# Windows 2016 needs forced upgrade to TLS 1.2 +[Net.ServicePointManager]::SecurityProtocol = 'Tls12' + +# important for catching exception in Invoke-WebRequest Set-StrictMode -Version latest $ErrorActionPreference = 'Stop' -# Windows 2016 defaults to old TLS protocols, override it 
-[Net.ServicePointManager]::SecurityProtocol = 'Tls12' +%{ if http_proxy != "" } +[System.Net.WebRequest]::DefaultWebProxy = New-Object System.Net.WebProxy("${http_proxy}") +%{ endif } # do not show progress bar when running Invoke-WebRequest $ProgressPreference = 'SilentlyContinue' diff --git a/community/modules/scripts/htcondor-install/variables.tf b/community/modules/scripts/htcondor-install/variables.tf index f54bb4c474..49da233e4b 100644 --- a/community/modules/scripts/htcondor-install/variables.tf +++ b/community/modules/scripts/htcondor-install/variables.tf @@ -35,3 +35,10 @@ variable "condor_version" { ) } } + +variable "http_proxy" { + description = "Set system default web (http and https) proxy for Windows HTCondor installation" + type = string + default = "" + nullable = false +} diff --git a/community/modules/scripts/ramble-execute/README.md b/community/modules/scripts/ramble-execute/README.md index 9f2b2a9304..58cdef3f9d 100644 --- a/community/modules/scripts/ramble-execute/README.md +++ b/community/modules/scripts/ramble-execute/README.md @@ -77,7 +77,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | ## Resources diff --git a/community/modules/scripts/ramble-execute/main.tf b/community/modules/scripts/ramble-execute/main.tf index 45428e0b89..ba65d16abd 100644 --- a/community/modules/scripts/ramble-execute/main.tf +++ b/community/modules/scripts/ramble-execute/main.tf @@ -55,7 +55,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/ramble-setup/README.md b/community/modules/scripts/ramble-setup/README.md index 1b03ac937b..54978a7b63 100644 --- a/community/modules/scripts/ramble-setup/README.md +++ b/community/modules/scripts/ramble-setup/README.md @@ -86,7 +86,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | ## Resources diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index feb5073b94..9b0693a998 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -94,7 +94,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-execute/README.md b/community/modules/scripts/spack-execute/README.md index c00ac40c2b..69a7356646 100644 --- a/community/modules/scripts/spack-execute/README.md +++ b/community/modules/scripts/spack-execute/README.md @@ -104,7 +104,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | ## Resources diff --git a/community/modules/scripts/spack-execute/main.tf b/community/modules/scripts/spack-execute/main.tf index 146efb4065..ed4510a1e8 100644 --- a/community/modules/scripts/spack-execute/main.tf +++ b/community/modules/scripts/spack-execute/main.tf @@ -54,7 +54,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md index dcc90bb3d7..f0f6ad81af 100644 --- a/community/modules/scripts/spack-setup/README.md +++ b/community/modules/scripts/spack-setup/README.md @@ -340,7 +340,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | ## Resources diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf index fe31519311..c9e223a80a 100644 --- a/community/modules/scripts/spack-setup/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -100,7 +100,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index 2355ad7f21..4de81d6d60 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.31.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/templates/install_gpu_driver.ps1.tftpl b/community/modules/scripts/windows-startup-script/templates/install_gpu_driver.ps1.tftpl index ad0170e4cf..55c4a2a3cd 100644 --- a/community/modules/scripts/windows-startup-script/templates/install_gpu_driver.ps1.tftpl +++ b/community/modules/scripts/windows-startup-script/templates/install_gpu_driver.ps1.tftpl @@ -5,7 +5,7 @@ # important for catching exception in Invoke-WebRequest Set-StrictMode -Version 
latest -$ErrorActionPreference = 'stop' +$ErrorActionPreference = 'Stop' %{ if http_proxy != "" } [System.Net.WebRequest]::DefaultWebProxy = New-Object System.Net.WebProxy("${http_proxy}") diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index 29305d7cfe..adfc164d1c 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.31.0" } required_version = ">= 0.14.0" diff --git a/docs/tutorials/hpc-slurm-qwiklabs.yaml b/docs/tutorials/hpc-slurm-qwiklabs.yaml index 6034345636..29d3ba3bb0 100644 --- a/docs/tutorials/hpc-slurm-qwiklabs.yaml +++ b/docs/tutorials/hpc-slurm-qwiklabs.yaml @@ -28,9 +28,9 @@ vars: deployment_groups: - group: primary modules: - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc + # Source is an embedded module, denoted by "modules/*" without ./, ../, / + # as a prefix. 
To refer to a local module, prefix with ./, ../ or / + # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc diff --git a/docs/tutorials/sc23-tutorial/hcls-blueprint.yaml b/docs/tutorials/sc23-tutorial/hcls-blueprint.yaml index 02192432e3..23adb44f23 100644 --- a/docs/tutorials/sc23-tutorial/hcls-blueprint.yaml +++ b/docs/tutorials/sc23-tutorial/hcls-blueprint.yaml @@ -174,7 +174,6 @@ deployment_groups: source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - compute_nodeset - - nfs settings: partition_name: compute exclusive: false diff --git a/examples/README.md b/examples/README.md index 5764ea0cb3..da42833df9 100644 --- a/examples/README.md +++ b/examples/README.md @@ -43,6 +43,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [ml-gke](#ml-gkeyaml--) ![community-badge] ![experimental-badge] * [storage-gke](#storage-gkeyaml--) ![community-badge] ![experimental-badge] * [htc-slurm.yaml](#htc-slurmyaml--) ![community-badge] ![experimental-badge] + * [htc-slurm-v6.yaml](#htc-slurm-v6yaml--) ![community-badge] ![experimental-badge] * [htc-htcondor.yaml](#htc-htcondoryaml--) ![community-badge] ![experimental-badge] * [fsi-montecarlo-on-batch.yaml](#fsi-montecarlo-on-batchyaml-) ![community-badge] ![experimental-badge] * [tutorial-starccm-slurm.yaml](#tutorial-starccm-slurmyaml--) ![community-badge] ![experimental-badge] @@ -1232,6 +1233,18 @@ For more information see: [htc-slurm.yaml]: ../community/examples/htc-slurm.yaml +### [htc-slurm-v6.yaml] ![community-badge] ![experimental-badge] + +This blueprint provisions a cluster using the Slurm scheduler in a configuration +tuned for the execution of many short-duration, loosely-coupled (non-MPI) jobs. 
+ +For more information see: + +* [Slurm on Google Cloud High Throughput documentation](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md) +* [General Slurm High Throughput documentation](https://slurm.schedmd.com/high_throughput.html) + +[htc-slurm-v6.yaml]: ../community/examples/htc-slurm-v6.yaml + ### [fsi-montecarlo-on-batch.yaml](../community/examples/fsi-montecarlo-on-batch.yaml) ![community-badge] ![experimental-badge] ## Monte Carlo Simulations for Value at Risk diff --git a/examples/hpc-slurm-v6.yaml b/examples/hpc-slurm-v6.yaml index 59edc0586b..de7cfe5614 100644 --- a/examples/hpc-slurm-v6.yaml +++ b/examples/hpc-slurm-v6.yaml @@ -51,7 +51,6 @@ deployment_groups: - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - homefs - debug_nodeset settings: partition_name: debug @@ -68,7 +67,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - homefs - compute_nodeset settings: partition_name: compute @@ -87,7 +85,6 @@ deployment_groups: - id: h3_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - homefs - h3_nodeset settings: partition_name: h3 diff --git a/examples/image-builder-v6.yaml b/examples/image-builder-v6.yaml index 1da2052155..b68b853504 100644 --- a/examples/image-builder-v6.yaml +++ b/examples/image-builder-v6.yaml @@ -48,16 +48,6 @@ deployment_groups: #!/bin/sh echo "Hello World" > /home/hello.txt - - id: builder_sa - source: community/modules/project/service-account - settings: - name: pkr - project_roles: - - compute.instanceAdmin.v1 - - logging.logWriter - - monitoring.metricWriter - - storage.objectViewer - - group: packer modules: - id: custom-image @@ -66,7 +56,6 @@ deployment_groups: use: - network - scripts_for_image - - builder_sa settings: source_image_project_id: [schedmd-slurm-public] # see latest in 
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family diff --git a/examples/image-builder.yaml b/examples/image-builder.yaml index a676837dbc..3a11e001b3 100644 --- a/examples/image-builder.yaml +++ b/examples/image-builder.yaml @@ -49,16 +49,6 @@ deployment_groups: #!/bin/sh echo "Hello World" > /home/hello.txt - - id: builder_sa - source: community/modules/project/service-account - settings: - name: pkr - project_roles: - - compute.instanceAdmin.v1 - - logging.logWriter - - monitoring.metricWriter - - storage.objectViewer - - group: packer modules: - id: custom-image @@ -67,7 +57,6 @@ deployment_groups: use: - network1 - scripts_for_image - - builder_sa settings: source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family diff --git a/go.mod b/go.mod index 5b71756731..ec3d768175 100644 --- a/go.mod +++ b/go.mod @@ -28,7 +28,7 @@ require ( github.com/hashicorp/terraform-exec v0.20.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b - google.golang.org/api v0.169.0 + google.golang.org/api v0.170.0 ) require ( @@ -54,7 +54,7 @@ require ( golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.15.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240205150955-31a09d347014 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240304161311-37d4d3c04a78 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240311132316-a219d84964c2 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect ) @@ -95,13 +95,13 @@ require ( github.com/ulikunitz/xz v0.5.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.19.0 // indirect - golang.org/x/net v0.21.0 // indirect - golang.org/x/oauth2 v0.17.0 // indirect + golang.org/x/crypto v0.21.0 // indirect + golang.org/x/net v0.22.0 // indirect 
+ golang.org/x/oauth2 v0.18.0 // indirect golang.org/x/sys v0.18.0 golang.org/x/text v0.14.0 // indirect google.golang.org/appengine v1.6.8 // indirect - google.golang.org/grpc v1.62.0 // indirect + google.golang.org/grpc v1.62.1 // indirect google.golang.org/protobuf v1.33.0 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect ) diff --git a/go.sum b/go.sum index 2b382931e0..ef95a19bd0 100644 --- a/go.sum +++ b/go.sum @@ -538,8 +538,8 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.3.1-0.20221117191849-2c476679df9a/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= -golang.org/x/crypto v0.19.0 h1:ENy+Az/9Y1vSrlrvBSyna3PITt4tiZLf7sgCjZBX7Wo= -golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.21.0 h1:X31++rzVUdKhX5sWmSOFZxx8UW/ldWx55cbf08iNAMA= +golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -633,8 +633,8 @@ golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= -golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= -golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.22.0 
h1:9sGLhx7iRIHEiX0oAJ3MRZMUCElJgy7Br1nO+AMN3Tc= +golang.org/x/net v0.22.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -660,8 +660,8 @@ golang.org/x/oauth2 v0.0.0-20220822191816-0ebed06d0094/go.mod h1:h4gKUeWbJ4rQPri golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.1.0/go.mod h1:G9FE4dLTsbXUu90h/Pf85g4w1D+SSAgR+q46nJZ8M4A= -golang.org/x/oauth2 v0.17.0 h1:6m3ZPmLEFdVxKKWnKq4VqZ60gutO35zm+zrAHVmHyDQ= -golang.org/x/oauth2 v0.17.0/go.mod h1:OzPDGQiuQMguemayvdylqddI7qcD9lnSDb+1FiwQ5HA= +golang.org/x/oauth2 v0.18.0 h1:09qnuIAgzdx1XplqJvW6CQqMCtGZykZWcXzPMPUusvI= +golang.org/x/oauth2 v0.18.0/go.mod h1:Wf7knwG0MPoWIMMBgFlEaSUDaKskp0dCfrlJRJXbBi8= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -758,8 +758,8 @@ golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= -golang.org/x/term v0.17.0 h1:mkTF7LCd6WGJNL3K1Ad7kwxNfYAW6a8a8QqtMblp/4U= -golang.org/x/term v0.17.0/go.mod 
h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.18.0 h1:FcHjZXDMxI8mM3nwhX9HlKop4C0YQvCVCdwYl2wOtE8= +golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -892,8 +892,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.169.0 h1:QwWPy71FgMWqJN/l6jVlFHUa29a7dcUy02I8o799nPY= -google.golang.org/api v0.169.0/go.mod h1:gpNOiMA2tZ4mf5R9Iwf4rK/Dcz0fbdIgWYWVoxmsyLg= +google.golang.org/api v0.170.0 h1:zMaruDePM88zxZBG+NG8+reALO2rfLhe/JShitLyT48= +google.golang.org/api v0.170.0/go.mod h1:/xql9M2btF85xac/VAm4PsLMTLVGUOpq4BE9R8jyNy8= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -1008,8 +1008,8 @@ google.golang.org/genproto v0.0.0-20240205150955-31a09d347014 h1:g/4bk7P6TPMkAUb google.golang.org/genproto v0.0.0-20240205150955-31a09d347014/go.mod h1:xEgQu1e4stdSSsxPDK8Azkrk/ECl5HvdPf6nbZrTS5M= google.golang.org/genproto/googleapis/api v0.0.0-20240205150955-31a09d347014 h1:x9PwdEgd11LgK+orcck69WVRo7DezSO4VUMPI4xpc8A= google.golang.org/genproto/googleapis/api v0.0.0-20240205150955-31a09d347014/go.mod h1:rbHMSEDyoYX62nRVLOCc4Qt1HbsdytAYoVwgjiOhF3I= -google.golang.org/genproto/googleapis/rpc 
v0.0.0-20240304161311-37d4d3c04a78 h1:Xs9lu+tLXxLIfuci70nG4cpwaRC+mRQPUL7LoIeDJC4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240304161311-37d4d3c04a78/go.mod h1:UCOku4NytXMJuLQE5VuqA5lX3PcHCBo8pxNyvkf4xBs= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240311132316-a219d84964c2 h1:9IZDv+/GcI6u+a4jRFRLxQs0RUCfavGfoOgEW6jpkI0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240311132316-a219d84964c2/go.mod h1:UCOku4NytXMJuLQE5VuqA5lX3PcHCBo8pxNyvkf4xBs= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1045,8 +1045,8 @@ google.golang.org/grpc v1.48.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACu google.golang.org/grpc v1.49.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.1/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= -google.golang.org/grpc v1.62.0 h1:HQKZ/fa1bXkX1oFOvSjmZEUL8wLSaZTjCcLAlmZRtdk= -google.golang.org/grpc v1.62.0/go.mod h1:IWTG0VlJLCh1SkC58F7np9ka9mx/WNkjl4PGJaiq+QE= +google.golang.org/grpc v1.62.1 h1:B4n+nfKzOICUXMgyrNd19h/I9oH0L1pizfk1d4zSgTk= +google.golang.org/grpc v1.62.1/go.mod h1:IWTG0VlJLCh1SkC58F7np9ka9mx/WNkjl4PGJaiq+QE= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 8ef2d09418..7c66803b85 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -185,7 
+185,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | ## Resources diff --git a/modules/compute/vm-instance/startup_from_network_storage.tf b/modules/compute/vm-instance/startup_from_network_storage.tf index 1bbc83620a..ee236da77a 100644 --- a/modules/compute/vm-instance/startup_from_network_storage.tf +++ b/modules/compute/vm-instance/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 016c71eafb..f3d42853fd 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.31.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.31.0" } required_version = ">= 1.3.0" diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index cc0585a322..a58f128fb7 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta 
"google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.31.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.31.0" } required_version = ">= 0.14.0" diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index 168b2637cf..06cd66e668 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.31.0" } required_version = ">= 0.14.0" diff --git a/modules/network/firewall-rules/versions.tf b/modules/network/firewall-rules/versions.tf index eb64bebfa8..8058682598 100644 --- a/modules/network/firewall-rules/versions.tf +++ b/modules/network/firewall-rules/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.31.0" } required_version = ">= 1.3" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index f8722c5cda..9b7200513c 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.31.0" } required_version = ">= 0.14.0" diff --git a/modules/packer/custom-image/README.md b/modules/packer/custom-image/README.md index 69f3823a56..430b337657 100644 --- a/modules/packer/custom-image/README.md +++ b/modules/packer/custom-image/README.md @@ -75,7 +75,8 
@@ buckets. Recommended roles are: - `roles/monitoring.metricWriter` - `roles/storage.objectViewer` -These roles are demonstrated in the [image builder example][examples readme]. +It is recommended to create this service account as a separate step outside a +blueprint due to a known delay in [IAM bindings propagation][iamprop]. ## Example blueprints @@ -286,8 +287,9 @@ No resources. | [omit\_external\_ip](#input\_omit\_external\_ip) | Provision the image building VM without a public IP address | `bool` | `true` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Describes maintenance behavior for the instance. If left blank this will default to `MIGRATE` except the use of GPUs requires it to be `TERMINATE` | `string` | `null` | no | | [project\_id](#input\_project\_id) | Project in which to create VM and image | `string` | n/a | yes | -| [scopes](#input\_scopes) | Service account scopes to attach to the instance. See
https://cloud.google.com/compute/docs/access/service-accounts. | `list(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [scopes](#input\_scopes) | DEPRECATED: use var.service\_account\_scopes | `set(string)` | `null` | no | | [service\_account\_email](#input\_service\_account\_email) | The service account email to use. If null or 'default', then the default Compute Engine service account will be used. | `string` | `null` | no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Service account scopes to attach to the instance. See
https://cloud.google.com/compute/docs/access/service-accounts. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [shell\_scripts](#input\_shell\_scripts) | A list of paths to local shell scripts which will be uploaded to customize the VM image | `list(string)` | `[]` | no | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance (must set var.enabled\_shielded\_vm) |
object({
enable_secure_boot = bool
enable_vtpm = bool
enable_integrity_monitoring = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [source\_image](#input\_source\_image) | Source OS image to build from | `string` | `null` | no | @@ -314,6 +316,7 @@ No outputs. [cloudnat]: https://cloud.google.com/nat/docs/overview [examples readme]: ../../../examples/README.md#image-builderyaml- [hpcimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm +[iamprop]: https://cloud.google.com/iam/docs/access-change-propagation [iaptunnel]: https://cloud.google.com/iap/docs/using-tcp-forwarding [image builder]: ../../../examples/image-builder.yaml [logging-console]: https://console.cloud.google.com/logs/ diff --git a/modules/packer/custom-image/image.pkr.hcl b/modules/packer/custom-image/image.pkr.hcl index e243ca5665..d742fed982 100644 --- a/modules/packer/custom-image/image.pkr.hcl +++ b/modules/packer/custom-image/image.pkr.hcl @@ -107,7 +107,8 @@ source "googlecompute" "toolkit_image" { use_internal_ip = var.omit_external_ip subnetwork = var.subnetwork_name network_project_id = var.network_project_id - scopes = var.scopes + service_account_email = var.service_account_email + scopes = var.service_account_scopes source_image = var.source_image source_image_family = var.source_image_family source_image_project_id = var.source_image_project_id diff --git a/modules/packer/custom-image/variables.pkr.hcl b/modules/packer/custom-image/variables.pkr.hcl index 6e455ef46a..ce2b51b3e1 100644 --- a/modules/packer/custom-image/variables.pkr.hcl +++ b/modules/packer/custom-image/variables.pkr.hcl @@ -109,11 +109,22 @@ variable "service_account_email" { } variable "scopes" { + description = "DEPRECATED: use var.service_account_scopes" + type = set(string) + default = null + + validation { + condition = var.scopes == null + error_message = "DEPRECATED: var.scopes was renamed to var.service_account_scopes with identical format." 
+ } +} + +variable "service_account_scopes" { description = < [instance\_template](#module\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 8.0 | -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | ## Resources diff --git a/modules/scheduler/batch-job-template/startup_from_network_storage.tf b/modules/scheduler/batch-job-template/startup_from_network_storage.tf index 1bbc83620a..ee236da77a 100644 --- a/modules/scheduler/batch-job-template/startup_from_network_storage.tf +++ b/modules/scheduler/batch-job-template/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/modules/scheduler/batch-login-node/README.md b/modules/scheduler/batch-login-node/README.md index 90270f3ef5..f15dbcbce7 100644 --- a/modules/scheduler/batch-login-node/README.md +++ b/modules/scheduler/batch-login-node/README.md @@ -89,7 +89,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.29.0&depth=1 | +| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.30.0&depth=1 | ## Resources diff --git a/modules/scheduler/batch-login-node/main.tf b/modules/scheduler/batch-login-node/main.tf index 6f5f0c0784..66d81dad6a 100644 --- a/modules/scheduler/batch-login-node/main.tf +++ b/modules/scheduler/batch-login-node/main.tf @@ -94,7 +94,7 @@ locals { } module "login_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.29.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.30.0&depth=1" labels = local.labels project_id = var.project_id deployment_name = var.deployment_name diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index b36133478d..d9c19c92e4 100644 --- a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.31.0" } required_version = ">= 0.14.0" diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index 20ea248e17..e90fdb8455 100644 --- a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.30.0" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.31.0" } required_version = ">= 0.14.0" diff --git a/pkg/config/config.go 
b/pkg/config/config.go index f4b183b29a..3ce2f03a79 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -19,6 +19,7 @@ import ( "bytes" "fmt" "os" + "path/filepath" "regexp" "sort" "strings" @@ -132,7 +133,7 @@ func (bp Blueprint) ModuleGroup(mod ModuleID) (Group, error) { func (bp Blueprint) ModuleGroupOrDie(mod ModuleID) Group { g, err := bp.ModuleGroup(mod) if err != nil { - panic(fmt.Errorf("module %s not found in blueprint: %s", mod, err)) + panic(err) } return g } @@ -242,6 +243,13 @@ type Blueprint struct { Vars Dict Groups []Group `yaml:"deployment_groups"` TerraformBackendDefaults TerraformBackend `yaml:"terraform_backend_defaults,omitempty"` + + // internal & non-serializable fields + + // absolute path to the blueprint file + path string + // records of intentions to stage file (populated by ghpc_stage function) + stagedFiles map[string]string } // DeploymentSettings are deployment-specific override settings @@ -263,6 +271,9 @@ func (bp *Blueprint) Expand() error { if err := bp.expandVars(); err != nil { return err } + if err := bp.validateNoGhpcStageFuncs(); err != nil { + return err + } return bp.expandGroups() } @@ -342,7 +353,16 @@ func checkMovedModule(source string) error { // NewBlueprint is a constructor for Blueprint func NewBlueprint(path string) (Blueprint, YamlCtx, error) { - return parseYamlFile[Blueprint](path) + absPath, err := filepath.Abs(path) + if err != nil { + return Blueprint{}, YamlCtx{}, err + } + bp, ctx, err := parseYamlFile[Blueprint](absPath) + if err != nil { + return Blueprint{}, ctx, err + } + bp.path = absPath + return bp, ctx, nil } func NewDeploymentSettings(deploymentFilename string) (DeploymentSettings, YamlCtx, error) { @@ -361,14 +381,12 @@ func (bp Blueprint) Export(outputFilename string) error { d := buf.Bytes() if err != nil { - return fmt.Errorf("%s: %w", errMsgYamlMarshalError, err) + return fmt.Errorf("failed to export the configuration to a blueprint yaml file: %w", err) } err = 
os.WriteFile(outputFilename, d, 0644) if err != nil { - // hitting this error writing yaml - return fmt.Errorf("%s, Filename: %s: %w", - errMsgYamlSaveError, outputFilename, err) + return fmt.Errorf("failed to write the expanded yaml %s: %w", outputFilename, err) } return nil } @@ -392,7 +410,7 @@ func checkModulesAndGroups(bp Blueprint) error { errs.At(pg.Name, grp.Name.Validate()) if seenGrp[grp.Name] { - errs.At(pg.Name, fmt.Errorf("%s: %s used more than once", errMsgDuplicateGroup, grp.Name)) + errs.At(pg.Name, fmt.Errorf("group names must be unique, %q used more than once", grp.Name)) } seenGrp[grp.Name] = true @@ -409,7 +427,7 @@ func checkModulesAndGroups(bp Blueprint) error { for im, mod := range grp.Modules { pm := pg.Modules.At(im) if seenMod[mod.ID] { - errs.At(pm.ID, fmt.Errorf("%s: %s used more than once", errMsgDuplicateID, mod.ID)) + errs.At(pm.ID, fmt.Errorf("module IDs must be unique, %q used more than once", mod.ID)) } seenMod[mod.ID] = true errs.Add(validateModule(pm, mod, bp)) @@ -494,7 +512,7 @@ func validateDeploymentName(bp Blueprint) error { if !bp.Vars.Has("deployment_name") { return BpError{path, InputValueError{ inputKey: "deployment_name", - cause: errMsgVarNotFound, + cause: "could not find source of variable", }} } @@ -527,23 +545,6 @@ func validateDeploymentName(bp Blueprint) error { return nil } -// ProjectID returns the project_id -func (bp Blueprint) ProjectID() (string, error) { - pid := "project_id" - if !bp.Vars.Has(pid) { - return "", BpError{Root.Vars, fmt.Errorf("%q variable is not specified", pid)} - } - - v, err := bp.Eval(GlobalRef(pid).AsValue()) - if err != nil { - return "", err - } - if v.Type() != cty.String { - return "", BpError{Root.Vars.Dot(pid), fmt.Errorf("%q variable is not a string", pid)} - } - return v.AsString(), nil -} - // checkBlueprintName returns an error if blueprint_name does not comply with // requirements for correct GCP label values. 
func (bp *Blueprint) checkBlueprintName() error { @@ -681,7 +682,7 @@ func (bp *Blueprint) evalVars() (Dict, error) { res := map[string]cty.Value{} ctx := hcl.EvalContext{ Variables: map[string]cty.Value{}, - Functions: functions()} + Functions: bp.functions()} for _, n := range order { ctx.Variables["var"] = cty.ObjectVal(res) ev, err := eval(bp.Vars.Get(n), &ctx) diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index a82b04ba24..9ad7ce039f 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -135,19 +135,19 @@ func (s *zeroSuite) TestCheckModulesAndGroups(c *C) { { // Duplicate module id same group g := Group{Name: "ice", Modules: []Module{pony, pony}} err := checkModulesAndGroups(Blueprint{Groups: []Group{g}}) - c.Check(err, ErrorMatches, ".*pony used more than once") + c.Check(err, ErrorMatches, ".*pony.* used more than once") } { // Duplicate module id different groups ice := Group{Name: "ice", Modules: []Module{pony}} fire := Group{Name: "fire", Modules: []Module{pony}} err := checkModulesAndGroups(Blueprint{Groups: []Group{ice, fire}}) - c.Check(err, ErrorMatches, ".*pony used more than once") + c.Check(err, ErrorMatches, ".*pony.* used more than once") } { // Duplicate group name ice := Group{Name: "ice", Modules: []Module{pony}} ice9 := Group{Name: "ice", Modules: []Module{zebra}} err := checkModulesAndGroups(Blueprint{Groups: []Group{ice, ice9}}) - c.Check(err, ErrorMatches, ".*ice used more than once") + c.Check(err, ErrorMatches, ".*ice.* used more than once") } { // Mixing module kinds g := Group{Name: "ice", Modules: []Module{pony, zebra}} @@ -328,6 +328,8 @@ func (s *zeroSuite) TestNewBlueprint(c *C) { c.Assert(bp.Export(outFile), IsNil) newBp, _, err := NewBlueprint(outFile) c.Assert(err, IsNil) + + bp.path = outFile // set expected path c.Assert(bp, DeepEquals, newBp) } diff --git a/pkg/config/errors.go b/pkg/config/errors.go index d5c1239db1..d415602f08 100644 --- a/pkg/config/errors.go +++ 
b/pkg/config/errors.go @@ -82,7 +82,7 @@ type UnknownModuleError struct { } func (e UnknownModuleError) Error() string { - return fmt.Sprintf("invalid module id: \"%s\"", e.ID) + return fmt.Sprintf("unknown module id: %q", e.ID) } // Errors is an error wrapper to combine multiple errors @@ -160,17 +160,9 @@ var EmptyGroupName = errors.New("group name must be set for each deployment grou // Error messages const ( - errMsgYamlMarshalError = string("failed to export the configuration to a blueprint yaml file") - errMsgYamlSaveError = string("failed to write the expanded yaml") - errMsgInvalidVar = string("invalid variable definition in") - errMsgVarNotFound = string("could not find source of variable") - errMsgIntergroupOrder = string("references to outputs from other groups must be to earlier groups") - errMsgCannotUsePacker = string("Packer modules cannot be used by other modules") - errMsgDuplicateGroup = string("group names must be unique") - errMsgDuplicateID = string("module IDs must be unique") - errMsgInvalidOutput = string("requested output was not found in the module") + errMsgIntergroupOrder = string("references to outputs from other groups must be to earlier groups") + errMsgValueNotString = string("value was not of type string") errMsgValueEmptyString = string("value is an empty string") - errMsgLabelNameReqs = string("name must begin with a lowercase letter, can only contain lowercase letters, numeric characters, underscores and dashes, and must be between 1 and 63 characters long") errMsgLabelValueReqs = string("value can only contain lowercase letters, numeric characters, underscores and dashes, and must be between 0 and 63 characters long") ) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 6b4187adb1..6ee7d6becd 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -324,7 +324,7 @@ func validateModuleReference(bp Blueprint, from Module, toID ModuleID) error { } if to.Kind == PackerKind { - return fmt.Errorf("%s: %s", 
errMsgCannotUsePacker, to.ID) + return fmt.Errorf("packer modules cannot be used by other modules: %s", to.ID) } fg := bp.ModuleGroupOrDie(from.ID) @@ -345,7 +345,7 @@ func validateModuleSettingReference(bp Blueprint, mod Module, r Reference) error // simplest case to evaluate is a deployment variable's existence if r.GlobalVar { if !bp.Vars.Has(r.Name) { - err := fmt.Errorf("module %#v references unknown global variable %#v", mod.ID, r.Name) + err := fmt.Errorf("module %q references unknown global variable %q", mod.ID, r.Name) return hintSpelling(r.Name, bp.Vars.Keys(), err) } return nil diff --git a/pkg/config/expression.go b/pkg/config/expression.go index cacd4a879c..5f651d3070 100644 --- a/pkg/config/expression.go +++ b/pkg/config/expression.go @@ -219,7 +219,7 @@ func handleEvalErr(diag hcl.Diagnostics) error { } err := diag.Errs()[0] if match := regexp.MustCompile(`There is no function named "(\w+)"`).FindStringSubmatch(err.Error()); match != nil { - sf := strings.Join(maps.Keys(functions()), ", ") + sf := strings.Join(maps.Keys(availableFunctions), ", ") return HintError{ Err: fmt.Errorf("unsupported function %q", match[1]), Hint: fmt.Sprintf("this context only supports following functions: %v", sf)} @@ -359,7 +359,7 @@ func TokensForValue(val cty.Value) hclwrite.Tokens { // FunctionCallExpression is a helper to build function call expression. 
func FunctionCallExpression(n string, args ...cty.Value) Expression { - if _, ok := functions()[n]; !ok { + if _, ok := availableFunctions[n]; !ok { panic("unknown function " + n) } ta := make([]hclwrite.Tokens, len(args)) @@ -370,10 +370,16 @@ func FunctionCallExpression(n string, args ...cty.Value) Expression { return MustParseExpression(string(toks.Bytes())) } -func functions() map[string]function.Function { +var availableFunctions = map[string]struct{}{ + "flatten": {}, + "merge": {}, + "ghpc_stage": {}} + +func (bp *Blueprint) functions() map[string]function.Function { return map[string]function.Function{ - "flatten": stdlib.FlattenFunc, - "merge": stdlib.MergeFunc, + "flatten": stdlib.FlattenFunc, + "merge": stdlib.MergeFunc, + "ghpc_stage": bp.makeGhpcStageFunc(), } } @@ -397,7 +403,7 @@ func (bp *Blueprint) Eval(v cty.Value) (cty.Value, error) { } ctx := hcl.EvalContext{ Variables: map[string]cty.Value{"var": vars.AsObject()}, - Functions: functions()} + Functions: bp.functions()} return eval(v, &ctx) } diff --git a/pkg/config/staging.go b/pkg/config/staging.go new file mode 100644 index 0000000000..d75c06fdb9 --- /dev/null +++ b/pkg/config/staging.go @@ -0,0 +1,125 @@ +// Copyright 2024 "Google LLC" +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package config + +import ( + "crypto/md5" + "fmt" + "path/filepath" + + "github.com/hashicorp/hcl/v2/hclsyntax" + "github.com/pkg/errors" + "github.com/zclconf/go-cty/cty" + "github.com/zclconf/go-cty/cty/function" +) + +// Relative path from deployment group to the staging directory +const StagingDir = "../.ghpc/staged" + +type StagedFile struct { + AbsSrc string // absolute path + RelDst string // relative (to deployment group folder) path +} + +func (bp Blueprint) StagedFiles() []StagedFile { + if len(bp.stagedFiles) == 0 { + return nil + } + + if bp.path == "" { + panic("blueprint doesn't have known path, can't resolve staged files to absolute paths") + } + res := []StagedFile{} + for src, dst := range bp.stagedFiles { + if !filepath.IsAbs(src) { // make it absolute + src = filepath.Join(filepath.Dir(bp.path), src) + } + res = append(res, StagedFile{AbsSrc: src, RelDst: dst}) + } + return res +} + +func (bp *Blueprint) makeGhpcStageImpl() func(src string) string { + // Move implementation instantiation to a separate function for easier testing + return func(src string) string { + // NOTE: we can't perform file validation here, because evaluation can be performed + // on expanded blueprints, and relative `src` will not be valid at that point. + // NOTE: this function needs to be deterministic, regardless of the invocation context. + hash := fmt.Sprintf("%x", md5.Sum([]byte(src)))[:10] + name := filepath.Base(src) + if name == "." || name == ".." 
|| filepath.ToSlash(name) == "/" { + name = "file" // shouldn't use this as a human readable name, replace with innocuous "file" + } + dst := filepath.Join(StagingDir, fmt.Sprintf("%s_%s", name, hash)) + + if bp.stagedFiles == nil { + bp.stagedFiles = map[string]string{} + } + bp.stagedFiles[src] = dst + return dst + } +} + +// Makes a `ghpc_stage` function while capturing Blueprint +// in its closure to update Blueprint state (stagedFiles) +func (bp *Blueprint) makeGhpcStageFunc() function.Function { + impl := bp.makeGhpcStageImpl() + return function.New(&function.Spec{ + Description: `Copy file into the deployment directory to make it available for deployment`, + Params: []function.Parameter{{Name: "path", Type: cty.String}}, + Type: function.StaticReturnType(cty.String), + Impl: func(args []cty.Value, retType cty.Type) (cty.Value, error) { + src := args[0].AsString() + dst := impl(src) + return cty.StringVal(dst), nil + }, + }) +} + +// Validate that the `ghpc_stage` is only used in `vars` declarations +func (bp Blueprint) validateNoGhpcStageFuncs() error { + errs := Errors{} + // check modules + bp.WalkModules(func(mp ModulePath, m *Module) error { + for k, v := range m.Settings.Items() { + errs.Add(validateNoGhpcStageFuncsInValue(mp.Settings.Dot(k), v)) + } + return nil + }) + // TODO: check terraform backends and validators inputs + return errs.OrNil() +} + +func validateNoGhpcStageFuncsInValue(vp ctyPath, val cty.Value) error { + err := HintError{ + Err: errors.New("ghpc_stage function can only be used in deployment Vars declarations"), + Hint: "declare dedicated deployment variable and reference it here"} + + errs := Errors{} + cty.Walk(val, func(p cty.Path, v cty.Value) (bool, error) { + exp, is := IsExpressionValue(v) + if !is { // not an expression + return true, nil + } + // naive check for `ghpc_stage` identifier tokens + for _, tok := range exp.Tokenize() { + if tok.Type == hclsyntax.TokenIdent && string(tok.Bytes) == "ghpc_stage" { +
errs.At(vp.Cty(p), err) + } + } + return true, nil + }) + return errs.OrNil() +} diff --git a/pkg/config/staging_test.go b/pkg/config/staging_test.go new file mode 100644 index 0000000000..b3fe735dd3 --- /dev/null +++ b/pkg/config/staging_test.go @@ -0,0 +1,99 @@ +// Copyright 2024 "Google LLC" +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package config + +import ( + "path/filepath" + "strings" + + "github.com/google/go-cmp/cmp" + "golang.org/x/exp/slices" + . "gopkg.in/check.v1" +) + +func (s *zeroSuite) TestValidateNoGhpcStageFuncs(c *C) { + bp := Blueprint{ + Groups: []Group{{ + Modules: []Module{ + { + Settings: Dict{}. 
+ With("tree", MustParseExpression("ghpc_stage(\"bush\")").AsValue()), + }}}}} + c.Check(bp.validateNoGhpcStageFuncs(), NotNil) +} + +func (s *zeroSuite) TestGhpcStageImpl(c *C) { + h := func(path, want string) { + bp := Blueprint{path: "/zebra/greendoodle.yaml"} + c.Check(bp.makeGhpcStageImpl()(path), Equals, want) + c.Check(bp.StagedFiles(), DeepEquals, []StagedFile{ + {AbsSrc: filepath.Join("/zebra/", path), RelDst: want}, + }) + } + + h("zero", "../.ghpc/staged/zero_d02c4c4cde") + h("zero/one.txt", "../.ghpc/staged/one.txt_f8669c6c22") + h("./../../two.gif", "../.ghpc/staged/two.gif_711b257c4f") + h(".", "../.ghpc/staged/file_5058f1af83") + h("..", "../.ghpc/staged/file_58b9e70b65") + + { + bp := Blueprint{path: "/zebra/greendoodle.yaml"} + + c.Check(bp.makeGhpcStageImpl()("one.txt"), Equals, "../.ghpc/staged/one.txt_08bc3de154") + c.Check(bp.makeGhpcStageImpl()("zero/one.txt"), Equals, "../.ghpc/staged/one.txt_f8669c6c22") + c.Check(bp.makeGhpcStageImpl()("/root/abs.txt"), Equals, "../.ghpc/staged/abs.txt_ffac5d1d6b") + + got := bp.StagedFiles() + slices.SortFunc(got, func(a, b StagedFile) int { + return strings.Compare(a.AbsSrc, b.AbsSrc) + }) + + if diff := cmp.Diff(got, []StagedFile{ + {"/root/abs.txt", "../.ghpc/staged/abs.txt_ffac5d1d6b"}, + {"/zebra/one.txt", "../.ghpc/staged/one.txt_08bc3de154"}, + {"/zebra/zero/one.txt", "../.ghpc/staged/one.txt_f8669c6c22"}, + }); diff != "" { + c.Errorf("diff (-want +got):\n%s", diff) + } + } +} + +func (s *zeroSuite) TestGhpcStageFunc(c *C) { + bp := Blueprint{path: "/zebra/greendoodle.yaml"} + + h := func(p string) string { + g, err := bp.Eval(MustParseExpression("ghpc_stage(\"" + p + "\")").AsValue()) + if err != nil { + c.Fatal(err) + } + return g.AsString() + } + + c.Check(h("bush"), Equals, "../.ghpc/staged/bush_dbbc546e35") + c.Check(h("push"), Equals, "../.ghpc/staged/push_21a361d96e") + + got := bp.StagedFiles() + slices.SortFunc(got, func(a, b StagedFile) int { + return strings.Compare(a.AbsSrc, b.AbsSrc) + 
}) + + if diff := cmp.Diff(got, []StagedFile{ + {"/zebra/bush", "../.ghpc/staged/bush_dbbc546e35"}, + {"/zebra/push", "../.ghpc/staged/push_21a361d96e"}, + }); diff != "" { + c.Errorf("diff (-want +got):\n%s", diff) + } +} diff --git a/pkg/config/validate.go b/pkg/config/validate.go index 553c583ead..5fa290186f 100644 --- a/pkg/config/validate.go +++ b/pkg/config/validate.go @@ -60,7 +60,7 @@ func validateGlobalLabels(bp Blueprint) error { if !isValidLabelName(k) { errs.At(vp, HintError{ Err: fmt.Errorf("invalid label name %q", k), - Hint: errMsgLabelNameReqs}) + Hint: "name must begin with a lowercase letter, can only contain lowercase letters, numeric characters, underscores and dashes, and must be between 1 and 63 characters long"}) } if _, is := IsExpressionValue(v); is { @@ -138,7 +138,7 @@ func validateOutputs(p ModulePath, mod Module, info modulereader.ModuleInfo) err // Ensure output exists in the underlying modules for io, output := range mod.Outputs { if _, ok := outputs[output.Name]; !ok { - err := fmt.Errorf("%s, module: %s output: %s", errMsgInvalidOutput, mod.ID, output.Name) + err := fmt.Errorf("requested output %q was not found in the module %q", output.Name, mod.ID) errs.At(p.Outputs.At(io), err) } } diff --git a/pkg/inspect/modules_test.go b/pkg/inspect/modules_test.go index e0558c1de6..a0e3671659 100644 --- a/pkg/inspect/modules_test.go +++ b/pkg/inspect/modules_test.go @@ -104,6 +104,15 @@ func hasInput(name string) predicate { } } +func hasInputNotDeprecated(name string) predicate { + return func(mod modInfo) bool { + if vi, ok := mod.Input(name); ok { + return !strings.HasPrefix(vi.Description, "DEPRECATED") + } + return false + } +} + // Fails test if slice is empty, returns not empty slice as is. 
func notEmpty[E any](l []E, t *testing.T) []E { if len(l) == 0 { @@ -148,7 +157,7 @@ func TestNetworkStorage(t *testing.T) { })`) lst := modulereader.NormalizeType(fmt.Sprintf("list(%s)", obj)) - for _, mod := range notEmpty(query(hasInput("network_storage")), t) { + for _, mod := range notEmpty(query(hasInputNotDeprecated("network_storage")), t) { i, _ := mod.Input("network_storage") got := typeexpr.TypeString(i.Type) if got != obj && got != lst { diff --git a/pkg/modulewriter/modulewriter.go b/pkg/modulewriter/modulewriter.go index bc000794f7..e2aebe48c0 100644 --- a/pkg/modulewriter/modulewriter.go +++ b/pkg/modulewriter/modulewriter.go @@ -25,6 +25,7 @@ import ( "fmt" "hpc-toolkit/pkg/config" "hpc-toolkit/pkg/deploymentio" + "hpc-toolkit/pkg/logging" "hpc-toolkit/pkg/sourcereader" "io" "os" @@ -32,6 +33,7 @@ import ( "path/filepath" "github.com/hashicorp/go-getter" + "github.com/otiai10/copy" ) // strings that get re-used throughout this package and others @@ -78,6 +80,10 @@ func WriteDeployment(bp config.Blueprint, deploymentDir string) error { return err } + if err := stageFiles(bp, deploymentDir); err != nil { + return err + } + instructions, err := os.Create(InstructionsPath(deploymentDir)) if err != nil { return err @@ -106,6 +112,62 @@ func WriteDeployment(bp config.Blueprint, deploymentDir string) error { return nil } +func stageFiles(bp config.Blueprint, deplPath string) error { + staged := bp.StagedFiles() + if len(staged) == 0 { + return nil + } + + // create staging directory + if err := os.MkdirAll(filepath.Join(deplPath, config.StagingDir), 0700); err != nil { + return err + } + + errs := config.Errors{} + for _, f := range staged { + // TODO: attribute error to the position in the blueprint + errs.Add(stageFile(deplPath, f)) + } + return errs.OrNil() +} + +func doesExists(path string) (bool, error) { + _, err := os.Lstat(path) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return false, nil + } + return false, err + } + return true, nil 
+} + +func stageFile(deplPath string, f config.StagedFile) error { + // RelDst is relative to group folders ("../.ghpc/staged"), + // prepend any_group_dir to be "eaten" by ".." + dst := filepath.Join(deplPath, "any_group_dir", f.RelDst) + dstExists, err := doesExists(dst) + if err != nil { + return err + } + + srcExists, err := doesExists(f.AbsSrc) + if err != nil { + return err + } + + if !srcExists && !dstExists { + return fmt.Errorf("file for staging %s does not exists", f.AbsSrc) + } + if !srcExists && dstExists { + // We implement this relaxation for cases where user does not have access to the original blueprint, + // and does re-creation using expanded blueprint. + logging.Error("WARNING: file %s does not exists, proceeding by using previously staged copy", f.AbsSrc) + return nil + } + return copy.Copy(f.AbsSrc, dst) +} + func writeGroup(deplPath string, bp config.Blueprint, gIdx int, instructions io.Writer) error { g := bp.Groups[gIdx] gPath, err := createGroupDir(deplPath, g) diff --git a/pkg/modulewriter/modulewriter_test.go b/pkg/modulewriter/modulewriter_test.go index 6b9881c89c..9f0972ce73 100644 --- a/pkg/modulewriter/modulewriter_test.go +++ b/pkg/modulewriter/modulewriter_test.go @@ -565,3 +565,78 @@ func (s *zeroSuite) TestWritePackerDestroyInstructions(c *C) { c.Check(got, Matches, ".*Aldebaran.*Betelgeuse.*") } } + +func (s *zeroSuite) TestStagingDirConsistency(c *C) { + // pkg/config can't use modulewriter, ensure consistency by testing. 
+ want := filepath.Join("..", HiddenGhpcDirName, "staged") + c.Check(config.StagingDir, Equals, want) +} + +func (s *zeroSuite) TestStageFile(c *C) { + srcDir := c.MkDir() + + deplDir := c.MkDir() + stagedDir := filepath.Join(deplDir, "staged") + if err := os.Mkdir(stagedDir, 0755); err != nil { + c.Fatal(err) + } + + { // src doesn't exist + f := config.StagedFile{ + AbsSrc: filepath.Join(srcDir, "bush"), + RelDst: "../staged/bush_44"} + c.Assert(stageFile(deplDir, f), ErrorMatches, ".*bush does not exists.*") + } + + { // src exists, dst doesn't + f := config.StagedFile{ + AbsSrc: filepath.Join(srcDir, "ugg"), + RelDst: "../staged/ugg_44"} + + if err := os.WriteFile(f.AbsSrc, []byte("riddle"), 0644); err != nil { + c.Fatal(err) + } + + c.Assert(stageFile(deplDir, f), IsNil) + dat, err := os.ReadFile(filepath.Join(deplDir, "any_group", f.RelDst)) + c.Assert(err, IsNil) + c.Assert(string(dat), Equals, "riddle") + } + + { // src exists, dst exists and get overwritten + f := config.StagedFile{ + AbsSrc: filepath.Join(srcDir, "clement"), + RelDst: "../staged/clement_44"} + + if err := os.WriteFile(f.AbsSrc, []byte("barrel"), 0644); err != nil { + c.Fatal(err) + } + + absDst := filepath.Join(deplDir, "any_group", f.RelDst) + if err := os.WriteFile(absDst, []byte("not_barrel"), 0644); err != nil { + c.Fatal(err) + } + + c.Assert(stageFile(deplDir, f), IsNil) + dat, err := os.ReadFile(absDst) + c.Assert(err, IsNil) + c.Assert(string(dat), Equals, "barrel") + } + + { // src doesn't exists, but dst exists + f := config.StagedFile{ + AbsSrc: filepath.Join(srcDir, "orange"), + RelDst: "../staged/orange_44"} + + absDst := filepath.Join(deplDir, "any_group", f.RelDst) + if err := os.WriteFile(absDst, []byte("pulp"), 0644); err != nil { + c.Fatal(err) + } + + c.Assert(stageFile(deplDir, f), IsNil) + dat, err := os.ReadFile(absDst) + c.Assert(err, IsNil) + c.Assert(string(dat), Equals, "pulp") + } + +} diff --git a/pkg/validators/cloud.go b/pkg/validators/cloud.go index 
78a0585f2b..3f3faf686d 100644 --- a/pkg/validators/cloud.go +++ b/pkg/validators/cloud.go @@ -180,10 +180,10 @@ func TestZoneInRegion(projectID string, zone string, region string) error { } func testApisEnabled(bp config.Blueprint, inputs config.Dict) error { - if err := checkInputs(inputs, []string{}); err != nil { + if err := checkInputs(inputs, []string{"project_id"}); err != nil { return err } - p, err := bp.ProjectID() + m, err := inputsAsStrings(inputs) if err != nil { return err } @@ -194,7 +194,7 @@ func testApisEnabled(bp config.Blueprint, inputs config.Dict) error { apis[api] = true } }) - return TestApisEnabled(p, maps.Keys(apis)) + return TestApisEnabled(m["project_id"], maps.Keys(apis)) } func testProjectExists(bp config.Blueprint, inputs config.Dict) error { diff --git a/pkg/validators/quota.go b/pkg/validators/quota.go deleted file mode 100644 index a004e3078d..0000000000 --- a/pkg/validators/quota.go +++ /dev/null @@ -1,428 +0,0 @@ -// Copyright 2023 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package validators - -import ( - "context" - "fmt" - "hpc-toolkit/pkg/config" - "strings" - "time" - - "golang.org/x/exp/maps" - - "github.com/zclconf/go-cty/cty" - "github.com/zclconf/go-cty/cty/convert" - "github.com/zclconf/go-cty/cty/gocty" - cm "google.golang.org/api/monitoring/v3" - sub "google.golang.org/api/serviceusage/v1beta1" -) - -// ResourceRequirement represents an amount of desired resource. 
-type ResourceRequirement struct { - Consumer string `cty:"consumer"` // e.g. "projects/myprojectid"" - Service string `cty:"service"` // e.g. "compute.googleapis.com" - Metric string `cty:"metric"` // e.g. "compute.googleapis.com/disks_total_storage" - Required int64 `cty:"required"` - Dimensions map[string]string `cty:"dimensions"` // e.g. {"region": "us-central1"} -} - -// InBucket returns true if all dimensions specified in the bucket match dimensions the requirement. -func (q ResourceRequirement) InBucket(b *sub.QuotaBucket) bool { - for d, v := range b.Dimensions { - if q.Dimensions[d] != v { - return false - } - } - return true -} - -// QuotaError represents an event of not having enough quota. -type QuotaError struct { - Consumer string - Service string - Metric string - DisplayName string - Unit string - Dimensions map[string]string - EffectiveLimit int64 - Usage int64 - Requested int64 -} - -func (e QuotaError) Error() string { - loc := "" - if len(e.Dimensions) > 0 { - prettyMap := fmt.Sprintf("%v", e.Dimensions)[3:] - loc = fmt.Sprintf(" in %s", prettyMap) - } - rhs := fmt.Sprintf("requested=%d", e.Requested) - if e.Usage > 0 { - rhs = fmt.Sprintf("requested=%d + usage=%d", e.Requested, e.Usage) - } - return fmt.Sprintf("not enough quota %q as %q%s, limit=%d < %s", e.DisplayName, e.Unit, loc, e.EffectiveLimit, rhs) -} - -func validateResourceRequirements(rs []ResourceRequirement, up *usageProvider) ([]QuotaError, error) { - // Group by Consumer and Service - type gk struct { - Consumer string - Service string - } - - groups := map[gk][]ResourceRequirement{} - for _, r := range rs { - k := gk{r.Consumer, r.Service} - groups[k] = append(groups[k], r) - } - - // Process all groups in parallel - type chs struct { - qe []QuotaError - err error - } - ch := make(chan chs) - for k, g := range groups { // Spawn - go func(k gk, g []ResourceRequirement) { - qe, err := validateServiceRequirements(k.Consumer, k.Service, g, up) - ch <- chs{qe, err} - }(k, g) - } - 
errs := config.Errors{} - qerrs := []QuotaError{} - for range groups { // Gather - s := <-ch - qerrs = append(qerrs, s.qe...) - errs.Add(s.err) - } - return qerrs, errs.OrNil() -} - -// Validate requirements for a single consumer & service pair. -// The `ServiceUsage.ConsumerQuotaMetrics` API call returns following structure: -// -// list[ConsumerQuotaMetric] - one per metric/quota, e.g. compute.googleapis.com/n2_cpus for `N2 CPUs`. -// ├Metric -// └list[ConsumerQuotaLimit] - one per quota scope (e.g. regional or zonal) -// ....|.......................for N2 CPUs there are two: `N2-CPUS-per-project-region` & `N2-CPUS-per-project-zone` -// ....├Unit - e.g. '1/{project}/{region}' for regional "scope" -// ....└list[QuotaBucket] - represents the "slice" of the "scope" with specified limit -// ........|................e.g. for `N2-CPUS-per-project-region` there will be buckets with specific region: `{'region': 'asia-east1'}`, -// ........|................and one bucket, named `N2 CPUs (default)` in the UI, without `dimensions={}` to act as a wildcard and provide default limit. -// ........├EffectiveLimit -// ........└Dimensions - e.g. {"region": "us-central1"} -func validateServiceRequirements(consumer string, service string, rs []ResourceRequirement, up *usageProvider) ([]QuotaError, error) { - qms, err := queryMetrics(consumer, service) - if err != nil { - return nil, err - } - - errs := config.Errors{} - reqToBuckets, err := gatherBucketsRequirements(rs, qms) - errs.Add(err) - qerrs := []QuotaError{} - for _, br := range reqToBuckets { - qerrs = append(qerrs, validateBucket(br, up)...) - } - return qerrs, errs.OrNil() -} - -// Find a bucket in the ConsumerQuotaLimit that matches the ResourceRequirement. 
-func findBucket(r ResourceRequirement, ql *sub.ConsumerQuotaLimit) (*sub.QuotaBucket, error) { - // Iterate buckets in order from most to less specific, see: - // https://cloud.google.com/service-usage/docs/reference/rest/v1beta1/services.consumerQuotaMetrics.limits - // | QuotaBuckets: Summary of the enforced quota buckets, organized by - // | quota dimension, ordered from least specific to most specific (for - // | example, the global default bucket, with no quota dimensions, will - // | always appear first). - for i := len(ql.QuotaBuckets) - 1; i >= 0; i-- { - if r.InBucket(ql.QuotaBuckets[i]) { - return ql.QuotaBuckets[i], nil - } - } - // We should never end up here, do not panic, return fake "unlimited" bucket and report error. - return &sub.QuotaBucket{Dimensions: map[string]string{}, EffectiveLimit: -1}, - fmt.Errorf("unexpected default-less ConsumerQuotaLimit: %q", ql.Name) -} - -// Set of requirements that fall into the specific bucket with context attached -type bucketRequirements struct { - QuotaMetric *sub.ConsumerQuotaMetric - QuotaLimit *sub.ConsumerQuotaLimit - Bucket *sub.QuotaBucket - Requirements []ResourceRequirement -} - -// Attribute each requirement to one or more buckets that this requirement should be checked against. -// Organize result by buckets. -func gatherBucketsRequirements(rs []ResourceRequirement, qms map[string]*sub.ConsumerQuotaMetric) ([]bucketRequirements, error) { - res := map[string]bucketRequirements{} - errs := config.Errors{} - - for _, r := range rs { // Iterate requirements - qm, ok := qms[r.Metric] - if !ok { - // TODO: add path to ResourceRequirement for better error reporting - errs.Add(fmt.Errorf("can't find quota for metric %q", r.Metric)) - continue - } - - // Each ConsumerQuotaMetric can contain multiple ConsumerQuotaLimits, - // e.g. ConsumerQuotaMetric for "N2 CPUs" has two ConsumerQuotaLimits: regional and zonal. 
- for _, ql := range qm.ConsumerQuotaLimits { - b, err := findBucket(r, ql) - errs.Add(err) - - k := fmt.Sprintf("%s|%v", ql.Name, b.Dimensions) // unique key to identify bucket across all ConsumerQuotaMetric(s) - br, ok := res[k] // update stored bucket requirements - if !ok { - br = bucketRequirements{qm, ql, b, []ResourceRequirement{}} - } - br.Requirements = append(br.Requirements, r) - res[k] = br - } - - } - return maps.Values(res), errs.OrNil() -} - -// validateBucket aggregates (sum) all requirements and usage for the given bucket -// and returns QuotaError if the bucket quota limit is not sufficient. -func validateBucket(br bucketRequirements, up *usageProvider) []QuotaError { - if len(br.Requirements) == 0 { - return nil - } - - required := int64(0) - for _, r := range br.Requirements { - required += r.Required - } - usage := up.Usage(br.QuotaMetric.Metric, br.Bucket.Dimensions["region"], br.Bucket.Dimensions["zone"]) - - if !satisfied(required+usage, br.Bucket.EffectiveLimit) { - r0 := br.Requirements[0] // Take any, they all should have the same metric, service, and consumer - return []QuotaError{{ - Consumer: r0.Consumer, - Service: r0.Service, - Metric: r0.Metric, - DisplayName: br.QuotaMetric.DisplayName, - Unit: br.QuotaLimit.Unit, - Dimensions: br.Bucket.Dimensions, - EffectiveLimit: br.Bucket.EffectiveLimit, - Usage: usage, - Requested: required, - }} - } - return nil -} - -func satisfied(requested int64, limit int64) bool { - if limit == -1 { - return true - } - return requested <= limit -} - -func queryMetrics(consumer string, service string) (map[string]*sub.ConsumerQuotaMetric, error) { - ctx := context.Background() - s, err := sub.NewService(ctx) - if err != nil { - return nil, err - } - res := map[string]*sub.ConsumerQuotaMetric{} - parent := fmt.Sprintf("%s/services/%s", consumer, service) - err = s.Services.ConsumerQuotaMetrics. - List(parent). - View("BASIC"). 
// BASIC reduces the response size & latency - Pages(ctx, func(page *sub.ListConsumerQuotaMetricsResponse) error { - for _, m := range page.Metrics { - res[m.Metric] = m - } - return nil - }) - return res, err -} - -type usageKey struct { - Metric string - Location string // either "global", region, or zone -} - -// usageProvider provides usage for a given metric and location. -type usageProvider struct { - u map[usageKey]int64 -} - -func (up *usageProvider) Usage(metric string, region string, zone string) int64 { - if up.u == nil { - return 0 - } - k := usageKey{metric, "global"} - if region != "" { - k.Location = region - } - if zone != "" { - k.Location = zone - } - return up.u[k] // 0 if not found -} - -func newUsageProvider(projectID string) (usageProvider, error) { - s, err := cm.NewService(context.Background()) - if err != nil { - return usageProvider{}, err - } - - u := map[usageKey]int64{} - err = s.Projects.TimeSeries.List("projects/"+projectID). - Filter(`metric.type="serviceruntime.googleapis.com/quota/allocation/usage"`). - IntervalEndTime(time.Now().Format(time.RFC3339)). - // Quota usage metrics get duplicated once a day - IntervalStartTime(time.Now().Add(-24*time.Hour).Format(time.RFC3339)). 
- Pages(context.Background(), func(page *cm.ListTimeSeriesResponse) error { - for _, ts := range page.TimeSeries { - usage := ts.Points[0].Value.Int64Value // Points[0] is latest - if *usage == 0 { - continue - } - metric := ts.Metric.Labels["quota_metric"] - location := ts.Resource.Labels["location"] - u[usageKey{metric, location}] = *usage - } - return nil - }) - if err != nil { - return usageProvider{}, err - } - return usageProvider{u}, nil -} - -type rrInputs struct { - Requirements []ResourceRequirement `cty:"requirements"` - IgnoreUsage bool `cty:"ignore_usage"` -} - -func ifNull(v cty.Value, d cty.Value) cty.Value { - if v.IsNull() { - return d - } - return v -} - -func extractServiceName(metric string) (string, error) { - // metric is in the form of "service.googleapis.com/metric" - // we want to extract the "service.googleapis.com" part - parts := strings.Split(metric, "/") - if len(parts) < 2 { - return "", fmt.Errorf("can not deduce service from metric %q", metric) - } - return parts[0], nil -} - -func parseResourceRequirementsInputs(bp config.Blueprint, inputs config.Dict) (rrInputs, error) { - // sanitize inputs dict by matching with type - rty := cty.ObjectWithOptionalAttrs(map[string]cty.Type{ - "metric": cty.String, - "service": cty.String, - "consumer": cty.String, - "required": cty.Number, - "dimensions": cty.Map(cty.String), - }, - /*optional=*/ []string{"service", "consumer", "dimensions"}) - ity := cty.ObjectWithOptionalAttrs(map[string]cty.Type{ - "requirements": cty.List(rty), - "ignore_usage": cty.Bool, - }, - /*optional=*/ []string{"ignore_usage"}) - clean, err := convert.Convert(inputs.AsObject(), ity) - if err != nil { - return rrInputs{}, err - } - - vars, _ := bp.Vars.Eval(bp) - - // fill in default values - ignoreUsage := ifNull(clean.GetAttr("ignore_usage"), cty.False) - projectID, err := bp.ProjectID() - if err != nil { - return rrInputs{}, err - } - reqs := []cty.Value{} - rit := clean.GetAttr("requirements").ElementIterator() - 
for rit.Next() { - _, r := rit.Element() - defConsumer := fmt.Sprintf("projects/%s", projectID) - defService, err := extractServiceName(r.GetAttr("metric").AsString()) - if err != nil { - return rrInputs{}, err - } - defDims := map[string]cty.Value{} - if vars.Has("region") { - defDims["region"] = vars.Get("region") - } - if vars.Has("zone") { - defDims["zone"] = vars.Get("zone") - } - defDimsVal := cty.MapValEmpty(cty.String) - if len(defDims) > 0 { - defDimsVal = cty.MapVal(defDims) - } - - reqs = append(reqs, cty.ObjectVal(map[string]cty.Value{ - "metric": r.GetAttr("metric"), - "service": ifNull(r.GetAttr("service"), cty.StringVal(defService)), - "consumer": ifNull(r.GetAttr("consumer"), cty.StringVal(defConsumer)), - "required": r.GetAttr("required"), - "dimensions": ifNull(r.GetAttr("dimensions"), defDimsVal), - })) - } - - reqsVal := cty.ListValEmpty(rty) - if len(reqs) > 0 { - reqsVal = cty.ListVal(reqs) - } - - full := cty.ObjectVal(map[string]cty.Value{ - "requirements": reqsVal, - "ignore_usage": ignoreUsage, - }) - - var s rrInputs - return s, gocty.FromCtyValue(full, &s) -} - -func testResourceRequirements(bp config.Blueprint, inputs config.Dict) error { - in, err := parseResourceRequirementsInputs(bp, inputs) - if err != nil { - return err - } - errs := config.Errors{} - up := usageProvider{} - if !in.IgnoreUsage { - p, err := bp.ProjectID() - errs.Add(err) - if p != "" { - up, err = newUsageProvider(p) - errs.Add(err) // don't terminate fallback to ignore usage - } - } - - qerrs, err := validateResourceRequirements(in.Requirements, &up) - for _, qe := range qerrs { - errs.Add(qe) - } - errs.Add(err) - return errs.OrNil() -} diff --git a/pkg/validators/quota_test.go b/pkg/validators/quota_test.go deleted file mode 100644 index 7fbd073f28..0000000000 --- a/pkg/validators/quota_test.go +++ /dev/null @@ -1,302 +0,0 @@ -// Copyright 2023 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except 
in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package validators - -import ( - "fmt" - "hpc-toolkit/pkg/config" - "testing" - - "github.com/google/go-cmp/cmp" - "github.com/zclconf/go-cty/cty" - sub "google.golang.org/api/serviceusage/v1beta1" - "gopkg.in/yaml.v3" -) - -func TestSatisfied(t *testing.T) { - type test struct { - requested int64 - limit int64 - want bool - } - tests := []test{ - {1, 1, true}, - {1, 2, true}, - {2, 1, false}, - {1, -1, true}, - } - - for _, tc := range tests { - t.Run(fmt.Sprintf("%d::%d", tc.requested, tc.limit), func(t *testing.T) { - got := satisfied(tc.requested, tc.limit) - if diff := cmp.Diff(tc.want, got); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } - }) - } -} - -func TestInBucket(t *testing.T) { - type test struct { - qDimensions map[string]string - bDimensions map[string]string - want bool - } - tests := []test{ - {map[string]string{"a": "1", "b": "2"}, map[string]string{"a": "1", "b": "2"}, true}, - {map[string]string{"a": "1", "b": "2"}, map[string]string{"a": "1", "b": "3"}, false}, - {map[string]string{"a": "1", "b": "2"}, map[string]string{"a": "1"}, true}, - {map[string]string{"a": "1", "b": "2"}, map[string]string{"a": "1", "b": "2", "c": "3"}, false}, - {map[string]string{}, map[string]string{}, true}, - } - for _, tc := range tests { - t.Run(fmt.Sprintf("%#v::%#v", tc.qDimensions, tc.bDimensions), func(t *testing.T) { - q := ResourceRequirement{Dimensions: tc.qDimensions} - b := sub.QuotaBucket{Dimensions: tc.bDimensions} - - got := q.InBucket(&b) - if diff := cmp.Diff(tc.want, got); diff != "" 
{ - t.Errorf("diff (-want +got):\n%s", diff) - } - }) - } -} - -func TestValidateBucket(t *testing.T) { - qm := sub.ConsumerQuotaMetric{Metric: "pony.api/friendship", DisplayName: "apple"} - ql := sub.ConsumerQuotaLimit{Unit: "1/{road}"} - b := sub.QuotaBucket{ - EffectiveLimit: 10, - Dimensions: map[string]string{"zone": "ponyland"}, - } - br := bucketRequirements{ - QuotaMetric: &qm, - QuotaLimit: &ql, - Bucket: &b, - Requirements: []ResourceRequirement{ - { - Consumer: "redhat", - Service: "pony.api", - Metric: "pony.api/friendship", - Required: 5, - }, - { - Consumer: "redhat", - Service: "pony.api", - Metric: "pony.api/friendship", - Required: 4, - }, - }, - } - up := usageProvider{u: map[usageKey]int64{ - {Metric: "pony.api/friendship", Location: "ponyland"}: 3, - }} - - errs := validateBucket(br, &up) - if len(errs) != 1 { - t.Errorf("got %d errors, want 1", len(errs)) - } else { - want := QuotaError{ - Metric: "pony.api/friendship", - Consumer: "redhat", - Service: "pony.api", - DisplayName: "apple", - Unit: "1/{road}", - Dimensions: map[string]string{"zone": "ponyland"}, - Requested: 5 + 4, - Usage: 3, - EffectiveLimit: 10, - } - if diff := cmp.Diff(want, errs[0]); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } - } -} - -func TestUsageProviderGet(t *testing.T) { - up := usageProvider{u: map[usageKey]int64{ - {Metric: "pony", Location: "global"}: 17, - {Metric: "pony", Location: "us-west1"}: 13, - {Metric: "pony", Location: "us-west1-c"}: 11, - {Metric: "zebra", Location: "us-east1"}: 7, - }} - - type test struct { - metric string - region string - zone string - want int64 - } - tests := []test{ - {"pony", "", "", 17}, - {"zebra", "", "", 0}, - {"pony", "us-west1", "", 13}, - {"zebra", "us-east2", "", 0}, - {"pony", "us-west1", "us-west1-c", 11}, - {"zebra", "us-east1", "us-east1-b", 0}, - } - for _, tc := range tests { - t.Run(fmt.Sprintf("%#v", tc), func(t *testing.T) { - got := up.Usage(tc.metric, tc.region, tc.zone) - if diff := 
cmp.Diff(tc.want, got); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } - }) - } -} - -func TestParseResourceRequirementsInputs(t *testing.T) { - type test struct { - yml string - want rrInputs - err bool - } - tests := []test{ - {`# empty -requirements: []`, rrInputs{Requirements: []ResourceRequirement{}}, false}, - {`# complete -ignore_usage: true -requirements: -- metric: pony.api/friendship - consumer: redhat - service: zebra.api - required: 22 - dimensions: {"x": "y", "left": "right"}`, rrInputs{ - IgnoreUsage: true, - Requirements: []ResourceRequirement{ - { - Metric: "pony.api/friendship", - Consumer: "redhat", - Service: "zebra.api", - Required: 22, - Dimensions: map[string]string{ - "x": "y", - "left": "right", - }, - }, - }, - }, false}, - {`# fill in -requirements: -- metric: pony.api/friendship - required: 33`, rrInputs{ - IgnoreUsage: false, - Requirements: []ResourceRequirement{ - { - Metric: "pony.api/friendship", - Service: "pony.api", - Consumer: "projects/apple", - Required: 33, - Dimensions: map[string]string{ - "region": "narnia", - "zone": "narnia-51", - }, - }, - }, - }, false}, - } - for _, tc := range tests { - t.Run(tc.yml, func(t *testing.T) { - var in config.Dict - bp := config.Blueprint{Vars: config.Dict{}. - With("project_id", cty.StringVal("apple")). - With("region", cty.StringVal("narnia")). 
- With("zone", cty.StringVal("narnia-51"))} - if err := yaml.Unmarshal([]byte(tc.yml), &in); err != nil { - t.Fatal("failed to unmarshal yaml") - } - rr, err := parseResourceRequirementsInputs(bp, in) - if (err == nil) == tc.err { - t.Fatalf("unexpected error: %v", err) - } - if diff := cmp.Diff(tc.want, rr); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } - }) - } -} - -func TestQuotaError(t *testing.T) { - type test struct { - err QuotaError - want string - } - tests := []test{ - {QuotaError{ - DisplayName: "zebra", - Unit: "1/{road}", - Dimensions: map[string]string{"zone": "zoo"}, - Requested: 10, - Usage: 5, - EffectiveLimit: 13, - }, `not enough quota "zebra" as "1/{road}" in [zone:zoo], limit=13 < requested=10 + usage=5`}, - {QuotaError{ - DisplayName: "zebra", - Unit: "1/{road}", - Requested: 10, - Usage: 5, - EffectiveLimit: 13, - }, `not enough quota "zebra" as "1/{road}", limit=13 < requested=10 + usage=5`}, - {QuotaError{ - DisplayName: "zebra", - Unit: "1/{road}", - Requested: 10, - EffectiveLimit: 13, - }, `not enough quota "zebra" as "1/{road}", limit=13 < requested=10`}, - } - for _, tc := range tests { - t.Run(tc.want, func(t *testing.T) { - if diff := cmp.Diff(tc.want, tc.err.Error()); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } - }) - } -} - -func TestGatherBucketsRequirements(t *testing.T) { - b0 := sub.QuotaBucket{ - EffectiveLimit: 10, - Dimensions: map[string]string{"zone": "ponyland"}, - } - ql := sub.ConsumerQuotaLimit{ - Unit: "1/{road}", - QuotaBuckets: []*sub.QuotaBucket{&b0}, - } - qm := sub.ConsumerQuotaMetric{ConsumerQuotaLimits: []*sub.ConsumerQuotaLimit{&ql}} - qms := map[string]*sub.ConsumerQuotaMetric{"pony.api/friendship": &qm} - r0 := ResourceRequirement{Metric: "not_gonna_find_me"} - r1 := ResourceRequirement{ - Metric: "pony.api/friendship", - Dimensions: map[string]string{"zone": "ponyland"}, - } - rs := []ResourceRequirement{r0, r1} - br, err := gatherBucketsRequirements(rs, qms) - - brWant := 
[]bucketRequirements{ - { - QuotaMetric: &qm, - QuotaLimit: &ql, - Bucket: &b0, - Requirements: []ResourceRequirement{r1}, - }, - } - if diff := cmp.Diff(brWant, br); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } - wantErr := `can't find quota for metric "not_gonna_find_me"` - if diff := cmp.Diff(wantErr, err.Error()); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } -} diff --git a/pkg/validators/validators.go b/pkg/validators/validators.go index 0b56f76660..daa7040371 100644 --- a/pkg/validators/validators.go +++ b/pkg/validators/validators.go @@ -47,7 +47,6 @@ const ( testZoneInRegionName = "test_zone_in_region" testModuleNotUsedName = "test_module_not_used" testDeploymentVariableNotUsedName = "test_deployment_variable_not_used" - testResourceRequirementsName = "test_resource_requirements" ) func implementations() map[string]func(config.Blueprint, config.Dict) error { @@ -59,7 +58,6 @@ func implementations() map[string]func(config.Blueprint, config.Dict) error { testZoneInRegionName: testZoneInRegion, testModuleNotUsedName: testModuleNotUsed, testDeploymentVariableNotUsedName: testDeploymentVariableNotUsed, - testResourceRequirementsName: testResourceRequirements, } } @@ -166,17 +164,17 @@ func defaults(bp config.Blueprint) []config.Validator { // only succeed if credentials can access the project. If the project ID // validator fails, all remaining validators are not executed. 
if projectIDExists { + inputs := config.Dict{}.With("project_id", projectRef) defaults = append(defaults, config.Validator{ Validator: testProjectExistsName, - Inputs: config.NewDict(map[string]cty.Value{"project_id": projectRef}), - }) + Inputs: inputs, + }, config.Validator{ + Validator: testApisEnabledName, + Inputs: inputs, + }, + ) } - // it is safe to run this validator even if vars.project_id is undefined; - // it will likely fail but will do so helpfully to the user - defaults = append(defaults, - config.Validator{Validator: testApisEnabledName}) - if projectIDExists && regionExists { defaults = append(defaults, config.Validator{ Validator: testRegionExistsName, diff --git a/pkg/validators/validators_test.go b/pkg/validators/validators_test.go index 175f90b76f..3f690e5821 100644 --- a/pkg/validators/validators_test.go +++ b/pkg/validators/validators_test.go @@ -73,36 +73,27 @@ func (s *MySuite) TestCheckInputs(c *C) { func (s *MySuite) TestDefaultValidators(c *C) { unusedMods := config.Validator{Validator: "test_module_not_used"} unusedVars := config.Validator{Validator: "test_deployment_variable_not_used"} - apisEnabled := config.Validator{Validator: "test_apis_enabled"} - projectRef := config.GlobalRef("project_id").AsValue() - regionRef := config.GlobalRef("region").AsValue() - zoneRef := config.GlobalRef("zone").AsValue() + prjInp := config.Dict{}.With("project_id", config.GlobalRef("project_id").AsValue()) + regInp := prjInp.With("region", config.GlobalRef("region").AsValue()) + zoneInp := prjInp.With("zone", config.GlobalRef("zone").AsValue()) + regZoneInp := regInp.With("zone", config.GlobalRef("zone").AsValue()) projectExists := config.Validator{ - Validator: testProjectExistsName, - Inputs: config.NewDict(map[string]cty.Value{"project_id": projectRef})} + Validator: "test_project_exists", Inputs: prjInp} + apisEnabled := config.Validator{ + Validator: "test_apis_enabled", Inputs: prjInp} regionExists := config.Validator{ - Validator: 
testRegionExistsName, - Inputs: config.NewDict(map[string]cty.Value{ - "project_id": projectRef, - "region": regionRef})} + Validator: testRegionExistsName, Inputs: regInp} zoneExists := config.Validator{ - Validator: testZoneExistsName, - Inputs: config.NewDict(map[string]cty.Value{ - "project_id": projectRef, - "zone": zoneRef})} + Validator: testZoneExistsName, Inputs: zoneInp} zoneInRegion := config.Validator{ - Validator: testZoneInRegionName, - Inputs: config.NewDict(map[string]cty.Value{ - "project_id": projectRef, - "region": regionRef, - "zone": zoneRef})} + Validator: testZoneInRegionName, Inputs: regZoneInp} { bp := config.Blueprint{} c.Check(defaults(bp), DeepEquals, []config.Validator{ - unusedMods, unusedVars, apisEnabled}) + unusedMods, unusedVars}) } { diff --git a/tools/capture_serial.sh b/tools/capture_serial.sh new file mode 100755 index 0000000000..2595489365 --- /dev/null +++ b/tools/capture_serial.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Checks for serial output of a VM and saves it on a file until the VM is +# deleted. 
+# + +# Check for parameters +if [ $# -lt 3 ]; then + echo "Usage: $0 " + exit 1 +fi + +VM_NAME=$1 +PROJECT=$2 +ZONE=$3 + +OUTPUT_FILE="vm_serial_$(date +%Y%m%d_%H%M%S).log" # Dynamic filename with timestamp + +# Main loop +while true; do + output=$(gcloud compute instances get-serial-port-output "$VM_NAME" --zone "$ZONE" --project "$PROJECT" --port 1 2>/dev/null) + + # Check if output is not empty + if [ -n "$output" ]; then + echo "$output" >"$OUTPUT_FILE" + else + exit 0 + fi + + sleep 1 # Wait for 1 second +done diff --git a/tools/cloud-build/babysit_tests.py b/tools/cloud-build/babysit_tests.py index 8888b7ea04..ecd181f23e 100755 --- a/tools/cloud-build/babysit_tests.py +++ b/tools/cloud-build/babysit_tests.py @@ -229,7 +229,7 @@ def get_pr(pr_num: int) -> dict: def get_changed_files_tags(base: str, head: str) -> set[str]: res = subprocess.run(["git", "log", f"{base}..{head}", "--name-only", "--format="], stdout=subprocess.PIPE) - assert res.returncode == 0 + assert res.returncode == 0, "Is your local repo up to date?" 
changed_files = res.stdout.decode('ascii').strip().split("\n") tags = set() for f in changed_files: @@ -249,17 +249,17 @@ def get_changed_files_tags(base: str, head: str) -> set[str]: parser.add_argument("--tags", nargs="*", type=str, help="Filter tests by tags") parser.add_argument("--auto", action="store_true", help="If true, will inspect changed files and run tests for them") parser.add_argument("--all", action="store_true", help="Run all tests") - + parser.add_argument("--project", type=str, help="GCP ProjectID, if not set will use default one (`gcloud config get-value project`)") parser.add_argument("-c", type=int, default=1, help="Number of tests to run concurrently, default is 1") parser.add_argument("-r", type=int, default=1, help="Number of retries, to disable retries set to 0, default is 1") - + parser.add_argument("--base", type=str, help="Revision to inspect diff from") parser.add_argument("--head", type=str, help="Revision to inspect diff to, may be different in case of merged PRs") - + args = parser.parse_args() assert (args.sha is None) ^ (args.pr is None), "either --pr or --sha are required" @@ -272,8 +272,8 @@ def get_changed_files_tags(base: str, head: str) -> set[str]: print("PR is already merged") if args.head is None: # use merge commit as head, since original PR sha may not be available in Git history. 
- args.head = pr["merge_commit_sha"] - + args.head = pr["merge_commit_sha"] + if args.base is None: args.base = pr["base"]["sha"] else: @@ -297,7 +297,7 @@ def get_changed_files_tags(base: str, head: str) -> set[str]: assert args.base is not None, "--base & [--head] or --pr are required for auto mode" auto_tags = get_changed_files_tags(args.base, args.head) selectors += [selector_by_tag(t) for t in auto_tags] - + ui = UI() cb = cloudbuild_v1.services.cloud_build.CloudBuildClient() Babysitter(ui, cb, project, sha, selectors, args.c, args.r).do() diff --git a/tools/cloud-build/daily-tests/builds/hpc-high-io-v5.yaml b/tools/cloud-build/daily-tests/builds/htc-slurm-v6.yaml similarity index 82% rename from tools/cloud-build/daily-tests/builds/hpc-high-io-v5.yaml rename to tools/cloud-build/daily-tests/builds/htc-slurm-v6.yaml index 226da8ac45..8575def8c1 100644 --- a/tools/cloud-build/daily-tests/builds/hpc-high-io-v5.yaml +++ b/tools/cloud-build/daily-tests/builds/htc-slurm-v6.yaml @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,15 +14,15 @@ --- tags: -- m.DDN-EXAScaler -- m.dashboard - m.filestore -- m.schedmd-slurm-gcp-v5-controller -- m.schedmd-slurm-gcp-v5-login -- m.schedmd-slurm-gcp-v5-node-group -- m.schedmd-slurm-gcp-v5-partition +- m.DDN-EXAScaler +- m.schedmd-slurm-gcp-v6-controller +- m.schedmd-slurm-gcp-v6-login +- m.schedmd-slurm-gcp-v6-nodeset +- m.schedmd-slurm-gcp-v6-partition - m.vpc -- slurm5 +- m.dashboard +- slurm6 timeout: 14400s # 4hr steps: @@ -44,8 +44,9 @@ steps: args: - -c - echo "done fetching builder" -## Test Slurm High IO v5 Example (Slurm on GCP v5) -- id: hpc-high-io-v5 + +# Test htc-slurm deployment. 
+- id: htc-slurm-v6 waitFor: ["fetch_builder", "build_ghpc"] name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder entrypoint: /bin/bash @@ -61,4 +62,4 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/high-io-slurm-gcp-v5.yml" + --extra-vars="@tools/cloud-build/daily-tests/tests/htc-slurm-v6.yml" diff --git a/tools/cloud-build/daily-tests/builds/packer-v6.yaml b/tools/cloud-build/daily-tests/builds/packer-v6.yaml index 191295a1f2..1134afcd8f 100644 --- a/tools/cloud-build/daily-tests/builds/packer-v6.yaml +++ b/tools/cloud-build/daily-tests/builds/packer-v6.yaml @@ -19,7 +19,6 @@ tags: - m.schedmd-slurm-gcp-v6-login - m.schedmd-slurm-gcp-v6-nodeset - m.schedmd-slurm-gcp-v6-partition -- m.service-account - m.startup-script - m.vpc - packer diff --git a/tools/cloud-build/daily-tests/builds/packer.yaml b/tools/cloud-build/daily-tests/builds/packer.yaml index bcb4756b79..d45a705fb8 100644 --- a/tools/cloud-build/daily-tests/builds/packer.yaml +++ b/tools/cloud-build/daily-tests/builds/packer.yaml @@ -19,7 +19,6 @@ tags: - m.schedmd-slurm-gcp-v5-login - m.schedmd-slurm-gcp-v5-node-group - m.schedmd-slurm-gcp-v5-partition -- m.service-account - m.startup-script - m.vpc - packer diff --git a/tools/cloud-build/daily-tests/tests/high-io-slurm-gcp-v5.yml b/tools/cloud-build/daily-tests/tests/htc-slurm-v6.yml similarity index 65% rename from tools/cloud-build/daily-tests/tests/high-io-slurm-gcp-v5.yml rename to tools/cloud-build/daily-tests/tests/htc-slurm-v6.yml index 2160b4635f..a6d6e87c90 100644 --- a/tools/cloud-build/daily-tests/tests/high-io-slurm-gcp-v5.yml +++ b/tools/cloud-build/daily-tests/tests/htc-slurm-v6.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 
(the "License"); # you may not use this file except in compliance with the License. @@ -14,24 +14,29 @@ --- -test_name: hpc-cluster-high-io-v5 -deployment_name: "io-v5-{{ build }}" -slurm_cluster_name: "iov5{{ build[0:6] }}" +test_name: htc-slurm-v6 +deployment_name: htcv6{{ build }} +slurm_cluster_name: "htcv6{{ build[0:5] }}" zone: us-west4-c + +cli_deployment_vars: + region: us-west4 + zone: us-west4-c + workspace: /workspace -blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/slurm-gcp-v5-high-io.yaml" +blueprint_yaml: "{{ workspace }}/community/examples/htc-slurm-v6.yaml" network: "{{ deployment_name }}-net" -max_nodes: 5 +# Note: Pattern matching in gcloud only supports 1 wildcard, a*-login-* won't work. login_node: "{{ slurm_cluster_name }}-login-*" controller_node: "{{ slurm_cluster_name }}-controller" post_deploy_tests: - test-validation/test-mounts.yml - test-validation/test-partitions.yml custom_vars: - partitions: - - compute - - lowcost - mounts: - - /home - - /scratch - - /projects + partitions: + - compute + - lowcost + mounts: + - /home + - /projects + - /scratch diff --git a/tools/cloud-build/quota-check/README.md b/tools/cloud-build/quota-check/README.md deleted file mode 100644 index 0571fd8bec..0000000000 --- a/tools/cloud-build/quota-check/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# `quota-check` tool - -`quota-check` is a tool to verify that GCP project has enough quota across multiple regions and zones. 
- -## Usage - -* Configure desired amount of resource quotas in `bp.yaml`; -* Configure set of regions and zones in `check.py`; -* Run the tool: - -```sh -tools/cloud-build/quota-check/check.py --project= -``` diff --git a/tools/cloud-build/quota-check/check.py b/tools/cloud-build/quota-check/check.py deleted file mode 100755 index 3bcc0d8e97..0000000000 --- a/tools/cloud-build/quota-check/check.py +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import subprocess -from subprocess import CalledProcessError -from typing import List - -DESCRIPTION = """ -quota-check is a tool to verify that GCP project has enough quota across multiple regions and zones. 
-Usage: -tools/cloud-build/quota-check/check.py --project= -""" - -LOCATIONS = { - "us-central1": ["a", "c"], - "us-west4": ["c"] -} - - -def _run_ghpc(args: List[str]) -> None: - subprocess.run(["./ghpc " + " ".join(args)], shell=True, check=True, capture_output=True) - -def _process_ghpc_output(serr: str) -> None: - for l in serr.splitlines(): - if l.startswith("not enough quota"): - print(l) - -def _check_zone(project: str, region: str, zone: str) -> None: - print(f"Checking {region=} {zone=}", end=" ") - try: - _run_ghpc([ - "expand", "tools/cloud-build/quota-check/bp.yaml", - "-l ERROR", # so validation will cause failure - "--skip-validators='test_deployment_variable_not_used'", # this validator is false-positive and irrelevant - f"--vars='project_id={project},{region=},{zone=}'", - ]) - except CalledProcessError as e: - print("FAIL") - _process_ghpc_output(e.stderr.decode("utf-8")) - else: - print("OK") - - -def main() -> None: - parser = argparse.ArgumentParser(description=DESCRIPTION) - parser.add_argument("--project", help="The project ID.") - - args = parser.parse_args() - assert args.project, DESCRIPTION - - - try: - _run_ghpc(["--version"]) # Smoke test - except CalledProcessError as e: - print(e.stderr.decode("utf-8")) - exit(e.returncode) - - for region, suffixes in LOCATIONS.items(): - for suffix in suffixes: - zone = f"{region}-{suffix}" - _check_zone(args.project, region, zone) - -if __name__ == '__main__': - main() diff --git a/tools/enforce_coverage.pl b/tools/enforce_coverage.pl index 77e59c2f5b..d1ea19783a 100755 --- a/tools/enforce_coverage.pl +++ b/tools/enforce_coverage.pl @@ -24,7 +24,7 @@ cmd 40 pkg/shell 0 pkg/logging 0 - pkg/validators 25 + pkg/validators 13 pkg/inspect 60 pkg 80 ); diff --git a/tools/validate_configs/golden_copies/configs/files/connect_mode.txt b/tools/validate_configs/golden_copies/configs/files/connect_mode.txt new file mode 100644 index 0000000000..1a4533f873 --- /dev/null +++ 
b/tools/validate_configs/golden_copies/configs/files/connect_mode.txt @@ -0,0 +1 @@ +PRIVATE_SERVICE_ACCESS \ No newline at end of file diff --git a/tools/validate_configs/golden_copies/configs/igc_tf.yaml b/tools/validate_configs/golden_copies/configs/igc_tf.yaml index fea38bde23..4d25db158b 100644 --- a/tools/validate_configs/golden_copies/configs/igc_tf.yaml +++ b/tools/validate_configs/golden_copies/configs/igc_tf.yaml @@ -20,6 +20,8 @@ vars: deployment_name: igc-tf-test region: us-east4 zone: $(vars.region)-c + connect_mode_file_path: $(ghpc_stage("files/connect_mode.txt")) + terraform_backend_defaults: type: gcs @@ -41,3 +43,4 @@ deployment_groups: settings: local_mount: /home name: $(network0.subnetwork_name) + connect_mode: $(file(vars.connect_mode_file_path)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl index e243ca5665..d742fed982 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl @@ -107,7 +107,8 @@ source "googlecompute" "toolkit_image" { use_internal_ip = var.omit_external_ip subnetwork = var.subnetwork_name network_project_id = var.network_project_id - scopes = var.scopes + service_account_email = var.service_account_email + scopes = var.service_account_scopes source_image = var.source_image source_image_family = var.source_image_family source_image_project_id = var.source_image_project_id diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl index 6e455ef46a..ce2b51b3e1 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl 
@@ -109,11 +109,22 @@ variable "service_account_email" { } variable "scopes" { + description = "DEPRECATED: use var.service_account_scopes" + type = set(string) + default = null + + validation { + condition = var.scopes == null + error_message = "DEPRECATED: var.scopes was renamed to var.service_account_scopes with identical format." + } +} + +variable "service_account_scopes" { description = </dev/null || { echo "*** ERROR: error creating deployment with ghpc for ${bpFile}" + printf '%s' "$debugInfo" exit 1 } if grep -q "${LOCAL_SOURCE_PATTERN}" "${cwd}/${bp}"; then @@ -59,6 +62,7 @@ run_test() { fi cd "${tmpdir}"/"${DEPLOYMENT}" || { echo "*** ERROR: can't cd into the deployment folder ${DEPLOYMENT}" + echo "$debugInfo" exit 1 } @@ -74,11 +78,13 @@ run_test() { diff --recursive --exclude="previous_deployment_groups" \ "$(pwd)" "${cwd}/${gc}" || { echo "*** ERROR: ${tmpdir}/${DEPLOYMENT} does not match ${gc}" + echo "$debugInfo" exit 1 } rm -rf "${DEPLOYMENT}" || { echo "*** ERROR: could not remove deployment folder from $(pwd)" + echo "$debugInfo" exit 1 } cd "${cwd}" diff --git a/tools/validate_configs/os_compatibility_tests/batch-filestore.yaml b/tools/validate_configs/os_compatibility_tests/batch-filestore.yaml index 9197c613ed..e4676b2dba 100644 --- a/tools/validate_configs/os_compatibility_tests/batch-filestore.yaml +++ b/tools/validate_configs/os_compatibility_tests/batch-filestore.yaml @@ -42,9 +42,6 @@ deployment_groups: # Network # ########### - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. 
To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc diff --git a/tools/validate_configs/os_compatibility_tests/batch-lustre.yaml b/tools/validate_configs/os_compatibility_tests/batch-lustre.yaml index dacd24efc3..fa74f995d5 100644 --- a/tools/validate_configs/os_compatibility_tests/batch-lustre.yaml +++ b/tools/validate_configs/os_compatibility_tests/batch-lustre.yaml @@ -36,9 +36,6 @@ deployment_groups: # Network # ########### - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc diff --git a/tools/validate_configs/os_compatibility_tests/batch-startup.yaml b/tools/validate_configs/os_compatibility_tests/batch-startup.yaml index 35764de57d..97122dc28f 100644 --- a/tools/validate_configs/os_compatibility_tests/batch-startup.yaml +++ b/tools/validate_configs/os_compatibility_tests/batch-startup.yaml @@ -42,9 +42,6 @@ deployment_groups: # Network # ########### - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc diff --git a/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml b/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml index 06eafcb4bd..7b940a58bb 100644 --- a/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml +++ b/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml @@ -40,9 +40,6 @@ deployment_groups: # Network # ########### - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. 
To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc diff --git a/tools/validate_configs/os_compatibility_tests/vm-crd.yaml b/tools/validate_configs/os_compatibility_tests/vm-crd.yaml index 37d6f9b805..54042dd690 100644 --- a/tools/validate_configs/os_compatibility_tests/vm-crd.yaml +++ b/tools/validate_configs/os_compatibility_tests/vm-crd.yaml @@ -36,9 +36,6 @@ deployment_groups: # Network # ########### - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc diff --git a/tools/validate_configs/os_compatibility_tests/vm-filestore.yaml b/tools/validate_configs/os_compatibility_tests/vm-filestore.yaml index 7a57b8cb48..34557f50ad 100644 --- a/tools/validate_configs/os_compatibility_tests/vm-filestore.yaml +++ b/tools/validate_configs/os_compatibility_tests/vm-filestore.yaml @@ -31,9 +31,6 @@ deployment_groups: # Network # ########### - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc diff --git a/tools/validate_configs/os_compatibility_tests/vm-lustre.yaml b/tools/validate_configs/os_compatibility_tests/vm-lustre.yaml index 5e250f42eb..4ad6c51b77 100644 --- a/tools/validate_configs/os_compatibility_tests/vm-lustre.yaml +++ b/tools/validate_configs/os_compatibility_tests/vm-lustre.yaml @@ -32,9 +32,6 @@ deployment_groups: # Network # ########### - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. 
To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - id: network1 source: modules/network/pre-existing-vpc diff --git a/tools/validate_configs/os_compatibility_tests/vm-startup.yaml b/tools/validate_configs/os_compatibility_tests/vm-startup.yaml index bb1b2ddc62..9228be5e8d 100644 --- a/tools/validate_configs/os_compatibility_tests/vm-startup.yaml +++ b/tools/validate_configs/os_compatibility_tests/vm-startup.yaml @@ -43,9 +43,6 @@ deployment_groups: # Network # ########### - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc diff --git a/tools/validate_configs/test_configs/config-ssh.yaml b/tools/validate_configs/test_configs/config-ssh.yaml index 8a343c1b29..bacc50902e 100644 --- a/tools/validate_configs/test_configs/config-ssh.yaml +++ b/tools/validate_configs/test_configs/config-ssh.yaml @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,15 +28,12 @@ vars: deployment_groups: - group: primary modules: - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. 
To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - - id: network1 + - id: network source: modules/network/vpc - id: homefs source: modules/file-system/filestore - use: [network1] + use: [network] settings: local_mount: /home @@ -46,54 +43,51 @@ deployment_groups: configure_ssh_host_patterns: ['10.182.0.*', 'hpcsm*'] - id: debug_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] settings: node_count_dynamic_max: 4 machine_type: n2-standard-2 + enable_placement: false # the default is: true - id: debug_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - network1 - - homefs - debug_node_group settings: partition_name: debug exclusive: false # allows nodes to stay up after jobs are done - enable_placement: false # the default is: true is_default: true - id: compute_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] settings: node_count_dynamic_max: 20 - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - network1 - - homefs - compute_node_group settings: partition_name: compute + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network] + settings: + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: false + - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - - network1 + - network + - slurm_login - debug_partition - compute_partition - homefs - script settings: disable_controller_public_ips: false - - - id: 
slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - - script - settings: - machine_type: n2-standard-4 - disable_login_public_ips: false diff --git a/tools/validate_configs/test_configs/gpu-v6.yaml b/tools/validate_configs/test_configs/gpu-v6.yaml new file mode 100644 index 0000000000..8d92e62cad --- /dev/null +++ b/tools/validate_configs/test_configs/gpu-v6.yaml @@ -0,0 +1,195 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- + +blueprint_name: gpu-vm-v6 + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: gpu-vm-v6 + region: us-central1 + zone: us-central1-c + instance_image_vm: + family: common-dl-gpu-debian-10 + project: ml-images + +# Broken into 3 groups to better manage GPU quotas +deployment_groups: +- group: high-count-auto + modules: + - id: network-hca + source: modules/network/pre-existing-vpc + + - id: auto-megagpu + source: modules/compute/vm-instance + use: + - network-hca + settings: + name_prefix: auto-megagpu + machine_type: a2-megagpu-16g + instance_image: $(vars.instance_image_vm) + +- group: high-count-manual + modules: + - id: network-hcm + source: modules/network/pre-existing-vpc + + - id: manual-megagpu + source: modules/compute/vm-instance + use: + - network-hcm + settings: + name_prefix: manual-megagpu + machine_type: a2-megagpu-16g + instance_image: $(vars.instance_image_vm) + guest_accelerator: + - type: nvidia-tesla-a100 + count: 16 + +- group: low-count + modules: + # Source is an embedded module, denoted by "modules/*" without ./, ../, / + # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / + # Example - ./modules/network/vpc + - id: network + source: modules/network/pre-existing-vpc + + - id: nogpu-n1 + source: ./modules/compute/vm-instance + use: + - network + settings: + name_prefix: nogpu-n1 + machine_type: n1-standard-8 + instance_image: $(vars.instance_image_vm) + + - id: manual-n1 + source: ./modules/compute/vm-instance + use: + - network + settings: + name_prefix: manual-n1 + machine_type: n1-standard-32 + on_host_maintenance: TERMINATE + instance_image: $(vars.instance_image_vm) + guest_accelerator: + - type: nvidia-tesla-t4 + count: 1 + + - id: auto-highgpu + source: modules/compute/vm-instance + use: + - network + settings: + name_prefix: auto-highgpu + machine_type: a2-highgpu-1g + instance_image: $(vars.instance_image_vm) + + - id: manual-highgpu + source: modules/compute/vm-instance + use: + - network + settings: + name_prefix: manual-highgpu + machine_type: a2-highgpu-2g + instance_image: $(vars.instance_image_vm) + guest_accelerator: + - type: nvidia-tesla-a100 + count: 2 + + - id: auto-ultragpu + source: modules/compute/vm-instance + use: + - network + settings: + name_prefix: auto-ultragpu + machine_type: a2-ultragpu-2g + instance_image: $(vars.instance_image_vm) + + - id: manual-ultragpu + source: modules/compute/vm-instance + use: + - network + settings: + name_prefix: manual-ultragpu + machine_type: a2-ultragpu-2g + instance_image: $(vars.instance_image_vm) + guest_accelerator: + - type: nvidia-a100-80gb + count: 2 + +- group: slurm-gcp-v6 + modules: + - id: network_slurm + source: modules/network/pre-existing-vpc + + - id: nogpu_nodegroup + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network_slurm] + settings: + name: nogpu + enable_placement: false + node_count_dynamic_max: 4 + machine_type: n2-standard-2 + + - id: manual_nodegroup + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network_slurm] + settings: + name: 
man + enable_placement: false + node_count_dynamic_max: 4 + machine_type: a2-ultragpu-2g + guest_accelerator: + - type: nvidia-a100-80gb + count: 2 + + - id: auto_nodegroup + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network_slurm] + settings: + name: auto + enable_placement: false + node_count_dynamic_max: 4 + machine_type: a2-ultragpu-2g + + - id: partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - nogpu_nodegroup + - manual_nodegroup + - auto_nodegroup + settings: + partition_name: debug + is_default: true + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: + - network_slurm + settings: + name_prefix: login + disable_login_public_ips: false + machine_type: a2-highgpu-1g + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + use: + - network_slurm + - slurm_login + - partition + settings: + disable_controller_public_ips: false + machine_type: a2-highgpu-2g diff --git a/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml b/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml index a6d7001fd5..6f094cc007 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml @@ -52,7 +52,6 @@ deployment_groups: - id: compute-partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - homefs - compute-nodeset settings: partition_name: compute diff --git a/tools/validate_configs/test_configs/node-groups-v6.yaml b/tools/validate_configs/test_configs/node-groups-v6.yaml new file mode 100644 index 0000000000..29ea3f5f46 --- /dev/null +++ b/tools/validate_configs/test_configs/node-groups-v6.yaml @@ -0,0 +1,173 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: node-group-test-v6 + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: slurm-gcp-v6 + region: us-central1 + zone: us-central1-c + +# Documentation for each of the modules used below can be found at +# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md + +deployment_groups: +- group: primary + modules: + # Source is an embedded resource, denoted by "resources/*" without ./, ../, / + # as a prefix. To refer to a local resource, prefix with ./, ../ or / + # Example - ./resources/network/vpc + - id: network + source: modules/network/vpc + + - id: homefs + source: community/modules/file-system/nfs-server + use: [network] + settings: + local_mounts: [/home] + auto_delete_disk: true + + ## Single node group, use defaults where appropriate + - id: default_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + name: simple + machine_type: c2-standard-30 + + - id: single_nodeset_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - default_nodeset + settings: + partition_name: simple + + ## Complex partition using node groups + - id: nodeset_1 + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + name: c30 + machine_type: c2-standard-30 + instance_image: + family: slurm-gcp-6-4-debian-11 + project: schedmd-slurm-public + instance_image_custom: true + + - id: nodeset_2 + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + name: c60 
+ machine_type: c2-standard-60 + instance_image: + family: slurm-gcp-6-4-hpc-centos-7 + project: schedmd-slurm-public + + - id: nodeset_3 + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + name: cd112 + machine_type: c2d-standard-112 + instance_image: + family: slurm-gcp-6-4-hpc-centos-7 + project: schedmd-slurm-public + instance_image_custom: true + enable_smt: true + + - id: nodeset_4 + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + name: cd56 + machine_type: c2d-standard-56 + + - id: multiple_nodesets + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - nodeset_1 + - nodeset_2 + - nodeset_3 + - nodeset_4 + settings: + partition_name: multns + + ## Explicitly set node partition with one nodeset + - id: single_nodeset_explicit_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + settings: + partition_name: explns + is_default: true + nodeset: + - nodeset_name: expl + node_count_static: 0 + node_count_dynamic_max: 4 + enable_placement: false + node_conf: {} + additional_disks: [] + additional_networks: [] + bandwidth_tier: null + can_ip_forward: false + disable_smt: false + disk_auto_delete: true + disk_labels: {} + disk_size_gb: 50 + disk_type: pd-standard + enable_confidential_vm: false + enable_oslogin: true + enable_shielded_vm: false + enable_spot_vm: false + gpu: null + instance_template: null + labels: $(vars.labels) + machine_type: n2-standard-16 + maintenance_interval: "" + metadata: {} + min_cpu_platform: null + on_host_maintenance: TERMINATE + preemptible: false + reservation_name: null # will be replaced by default value empty string + service_account: null + shielded_instance_config: null + subnetwork_self_link: $(network.subnetwork_self_link) + spot_instance_config: null + source_image_family: null + source_image_project: null + source_image: null + tags: [] + access_config: [] + + - id: slurm_login + source: 
community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network] + settings: + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: false + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + use: + - network + - single_nodeset_partition + - multiple_nodesets + - single_nodeset_explicit_partition + - homefs + - slurm_login + settings: + disable_controller_public_ips: false diff --git a/tools/validate_configs/test_configs/node-groups.yaml b/tools/validate_configs/test_configs/node-groups.yaml index 026457c949..2ead7732a8 100644 --- a/tools/validate_configs/test_configs/node-groups.yaml +++ b/tools/validate_configs/test_configs/node-groups.yaml @@ -28,9 +28,6 @@ vars: deployment_groups: - group: primary modules: - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc diff --git a/tools/validate_configs/test_configs/slurm-gcp-v5-startup-scripts.yaml b/tools/validate_configs/test_configs/slurm-gcp-v5-startup-scripts.yaml index de561e39af..f15605b90d 100644 --- a/tools/validate_configs/test_configs/slurm-gcp-v5-startup-scripts.yaml +++ b/tools/validate_configs/test_configs/slurm-gcp-v5-startup-scripts.yaml @@ -25,9 +25,6 @@ vars: deployment_groups: - group: primary modules: - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. 
To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc diff --git a/tools/validate_configs/test_configs/slurm-gcp-v6-startup-scripts.yaml b/tools/validate_configs/test_configs/slurm-gcp-v6-startup-scripts.yaml index 8cd900f29c..d1bbe8baf1 100644 --- a/tools/validate_configs/test_configs/slurm-gcp-v6-startup-scripts.yaml +++ b/tools/validate_configs/test_configs/slurm-gcp-v6-startup-scripts.yaml @@ -25,9 +25,6 @@ vars: deployment_groups: - group: primary modules: - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - id: network source: modules/network/vpc @@ -69,8 +66,6 @@ deployment_groups: - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - homefs - - bucket - debug_nodeset settings: partition_name: debug @@ -87,8 +82,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - homefs - - bucket - compute_nodeset settings: partition_name: compute diff --git a/tools/validate_configs/test_configs/slurm-static-test.yaml b/tools/validate_configs/test_configs/slurm-static-test.yaml index 7e3adcbb9a..81b95e2b02 100644 --- a/tools/validate_configs/test_configs/slurm-static-test.yaml +++ b/tools/validate_configs/test_configs/slurm-static-test.yaml @@ -44,10 +44,6 @@ deployment_groups: ########### # Network # ########### - - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. 
To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - id: network1 source: modules/network/pre-existing-vpc diff --git a/tools/validate_configs/test_configs/zone-policies-slurm-v5.yaml b/tools/validate_configs/test_configs/zone-policies-slurm-v5.yaml index 9a4b917ad2..0403fb7fdd 100644 --- a/tools/validate_configs/test_configs/zone-policies-slurm-v5.yaml +++ b/tools/validate_configs/test_configs/zone-policies-slurm-v5.yaml @@ -31,9 +31,6 @@ vars: deployment_groups: - group: primary modules: - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc