Skip to content

Commit

Permalink
Merge pull request #2394 from GoogleCloudPlatform/release-candidate
Browse files Browse the repository at this point in the history
Release v1.31.0
  • Loading branch information
tpdownes authored Mar 28, 2024
2 parents 08ae77e + ebb2fad commit fe6b653
Show file tree
Hide file tree
Showing 164 changed files with 1,421 additions and 1,335 deletions.
10 changes: 2 additions & 8 deletions .github/workflows/pr-precommit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,5 @@ jobs:
# https://github.com/terraform-linters/tflint/blob/master/docs/user-guide/plugins.md#avoiding-rate-limiting
GITHUB_TOKEN: ${{ github.token }}
- uses: pre-commit/[email protected]
- uses: pre-commit-ci/[email protected]
# this if statement looks funny but it ensures that this step runs
# only if: user has applied "pre-commit-autofix" label
# even if: job has failed
# not if: job is canceled
if: |
(success() || failure()) &&
contains(github.event.pull_request.labels.*.name, 'pre-commit-autofix')
with:
extra_args: --show-diff-on-failure --all-files
1 change: 1 addition & 0 deletions cmd/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ func runCreateCmd(cmd *cobra.Command, args []string) {
func doCreate(path string) string {
bp := expandOrDie(path)
deplDir := filepath.Join(createFlags.outputDir, bp.DeploymentName())
logging.Info("Creating deployment folder %q ...", deplDir)
checkErr(checkOverwriteAllowed(deplDir, bp, createFlags.overwriteDeployment, createFlags.forceOverwrite))
checkErr(modulewriter.WriteDeployment(bp, deplDir))
return deplDir
Expand Down
2 changes: 1 addition & 1 deletion cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ HPC deployments on the Google Cloud Platform.`,
logging.Fatal("cmd.Help function failed: %s", err)
}
},
Version: "v1.30.0",
Version: "v1.31.0",
Annotations: annotation,
}
)
Expand Down
2 changes: 1 addition & 1 deletion community/examples/hpc-slurm-gromacs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ deployment_groups:

- id: compute_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use: [compute_nodeset, homefs, appsfs]
use: [compute_nodeset]
settings:
partition_name: compute
is_default: true
Expand Down
6 changes: 3 additions & 3 deletions community/examples/hpc-slurm-ramble-gromacs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ vars:
deployment_groups:
- group: primary
modules:
# Source is an embedded resource, denoted by "resources/*" without ./, ../, /
# as a prefix. To refer to a local resource, prefix with ./, ../ or /
# Example - ./resources/network/vpc
# Source is an embedded module, denoted by "modules/*" without ./, ../, /
# as a prefix. To refer to a local module, prefix with ./, ../ or /
# Example - ./modules/network/vpc
- id: network1
source: modules/network/vpc

Expand Down
4 changes: 2 additions & 2 deletions community/examples/hpc-slurm-sharedvpc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ deployment_groups:

- id: debug_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use: [debug_nodeset, homefs]
use: [debug_nodeset]
settings:
partition_name: debug
exclusive: false # allows nodes to stay up after jobs are done
Expand All @@ -80,7 +80,7 @@ deployment_groups:

- id: compute_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use: [compute_nodeset, homefs]
use: [compute_nodeset]
settings:
partition_name: compute

Expand Down
18 changes: 7 additions & 11 deletions community/examples/hpc-slurm-ubuntu2004-v6.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ vars:
deployment_groups:
- group: primary
modules:
# Source is an embedded resource, denoted by "resources/*" without ./, ../, /
# as a prefix. To refer to a local resource, prefix with ./, ../ or /
# Example - ./resources/network/vpc
# Source is an embedded module, denoted by "modules/*" without ./, ../, /
# as a prefix. To refer to a local module, prefix with ./, ../ or /
# Example - ./modules/network/vpc
- id: network1
source: modules/network/vpc

Expand All @@ -43,7 +43,7 @@ deployment_groups:
settings:
local_mount: /home

- id: debug_node_group
- id: debug_nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network1]
settings:
Expand All @@ -54,15 +54,13 @@ deployment_groups:

- id: debug_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use:
- homefs
- debug_node_group
use: [debug_nodeset]
settings:
partition_name: debug
exclusive: false # allows nodes to stay up after jobs are done
is_default: true

- id: compute_node_group
- id: compute_nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network1]
settings:
Expand All @@ -72,9 +70,7 @@ deployment_groups:

- id: compute_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use:
- homefs
- compute_node_group
use: [compute_nodeset]
settings:
partition_name: compute

Expand Down
6 changes: 3 additions & 3 deletions community/examples/hpc-slurm-ubuntu2004.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ vars:
deployment_groups:
- group: primary
modules:
# Source is an embedded resource, denoted by "resources/*" without ./, ../, /
# as a prefix. To refer to a local resource, prefix with ./, ../ or /
# Example - ./resources/network/vpc
# Source is an embedded module, denoted by "modules/*" without ./, ../, /
# as a prefix. To refer to a local module, prefix with ./, ../ or /
# Example - ./modules/network/vpc
- id: network1
source: modules/network/vpc

Expand Down
1 change: 0 additions & 1 deletion community/examples/hpc-slurm6-tpu-maxtext.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ deployment_groups:
source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu
use: [network]
settings:
name: v4x8
node_type: v4-8
tf_version: 2.14.0
# Preemptible TPUs cost much less than non-preemptible TPUs.
Expand Down
1 change: 0 additions & 1 deletion community/examples/hpc-slurm6-tpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ deployment_groups:
source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu
use: [network]
settings:
name: v3x8
node_type: v3-8
tf_version: 2.14.0
# Preemptible TPUs cost much less than non-preemptible TPUs.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright 2022 Google LLC
# Copyright 2024 Google LLC
# Copyright (C) SchedMD LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -14,22 +15,21 @@

---

blueprint_name: hpc-cluster-high-io-v5
# This blueprint provisions a cluster using the Slurm scheduler configured to
# efficiently run many short duration, loosely-coupled (non-MPI) jobs. See also:
# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md
# https://slurm.schedmd.com/high_throughput.html

blueprint_name: htc-slurm-v6

vars:
project_id: ## Set GCP Project ID Here ##
deployment_name: high-io-slurm-gcp-v5
deployment_name: htc-slurm-v6
region: us-west4
zone: us-west4-c
# By default, public IPs are set in the login and controller to allow easier
# SSH access. To turn this behavior off, set this to true.
disable_public_ips: false
# Set to true for active cluster reconfiguration.
# Note that setting this option requires additional dependencies to be installed locally.
enable_reconfigure: true
# When set, active compute nodes will be cleaned up on destroy.
# Note that setting this option requires additional dependencies to be installed locally.
enable_cleanup_compute: true

# Documentation for each of the modules used below can be found at
# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md
Expand All @@ -40,18 +40,18 @@ deployment_groups:
# Source is an embedded module, denoted by "modules/*" without ./, ../, /
# as a prefix. To refer to a local or community module, prefix with ./, ../ or /
# Example - ./modules/network/pre-existing-vpc
- id: network1
- id: network
source: modules/network/vpc

- id: homefs
source: modules/file-system/filestore
use: [network1]
use: [network]
settings:
local_mount: /home

- id: projectsfs
source: modules/file-system/filestore
use: [network1]
use: [network]
settings:
filestore_tier: HIGH_SCALE_SSD
size_gb: 10240
Expand All @@ -61,94 +61,98 @@ deployment_groups:
# https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud
- id: scratchfs
source: community/modules/file-system/DDN-EXAScaler
use: [network1]
use: [network]
settings:
local_mount: /scratch

# The compute partition is designed for performance.
# Use:
# `srun -N 4 -p compute <<Command>>` for any node in the partition.
# `srun -N 4 -p compute --mincpus 30 <<Command>>` for node group c2s60.
- id: compute_nodeset_c2s60
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network]
settings:
name: c2s60
node_count_dynamic_max: 200
bandwidth_tier: gvnic_enabled
enable_placement: false

- id: compute_nodeset_c2s30
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network]
settings:
node_count_dynamic_max: 200
machine_type: c2-standard-30
bandwidth_tier: gvnic_enabled
enable_placement: false

- id: compute_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use:
- compute_nodeset_c2s60
- compute_nodeset_c2s30
settings:
partition_name: compute
exclusive: false

# The lowcost partition is designed to run at a lower cost and without requiring additional quota.
# Use:
# `srun -N 4 <<Command>>` for any node in the partition.
# `srun -N 4 --mincpus 2 <<Command>>` for node group n2s4.
- id: low_cost_node_group_n2s2
source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
- id: low_cost_nodeset_n2s2
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network]
settings:
name: n2s2
machine_type: n2-standard-2
node_count_dynamic_max: 10
bandwidth_tier: gvnic_enabled
enable_placement: false

- id: low_cost_node_group_n2s4
source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
- id: low_cost_nodeset_n2s4
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network]
settings:
name: n2s4
machine_type: n2-standard-4
node_count_dynamic_max: 10
bandwidth_tier: gvnic_enabled
enable_placement: false

- id: low_cost_partition
source: community/modules/compute/schedmd-slurm-gcp-v5-partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use:
- network1
- homefs
- scratchfs
- projectsfs
- low_cost_node_group_n2s2
- low_cost_node_group_n2s4
- low_cost_nodeset_n2s2
- low_cost_nodeset_n2s4
settings:
is_default: true
partition_name: lowcost
enable_placement: false
exclusive: false

# The compute partition is designed for performance.
# Use:
# `srun -N 4 -p compute <<Command>>` for any node in the partition.
# `srun -N 4 -p compute --mincpus 30 <<Command>>` for node group c2s60.

- id: compute_node_group_c2s60
source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
settings:
name: c2s60
node_count_dynamic_max: 200

- id: compute_node_group_c2s30
source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
settings:
name: c2s30
node_count_dynamic_max: 200
machine_type: c2-standard-30

- id: compute_partition
source: community/modules/compute/schedmd-slurm-gcp-v5-partition
use:
- network1
- homefs
- scratchfs
- projectsfs
- compute_node_group_c2s60
- compute_node_group_c2s30
- id: slurm_login
source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
use: [network]
settings:
partition_name: compute
name_prefix: login
machine_type: n2-standard-4
disable_login_public_ips: $(vars.disable_public_ips)

- id: slurm_controller
source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller
source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
use:
- network1
- network
- homefs
- scratchfs
- projectsfs
- low_cost_partition
- compute_partition
- slurm_login
settings:
machine_type: c2-standard-8
disable_controller_public_ips: $(vars.disable_public_ips)

- id: slurm_login
source: community/modules/scheduler/schedmd-slurm-gcp-v5-login
use:
- network1
- slurm_controller
settings:
machine_type: n2-standard-4
disable_login_public_ips: $(vars.disable_public_ips)
slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurm.conf.tpl
slurmdbd_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurmdbd.conf.tpl

- id: hpc_dashboard
source: modules/monitoring/dashboard
Expand Down
4 changes: 2 additions & 2 deletions community/examples/intel/hpc-slurm-daos.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ deployment_groups:

- id: debug_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use: [debug_nodeset, homefs]
use: [debug_nodeset]
settings:
partition_name: debug
exclusive: false # allows nodes to stay up after jobs are done
Expand All @@ -154,7 +154,7 @@ deployment_groups:

- id: compute_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use: [compute_nodeset, homefs]
use: [compute_nodeset]
settings:
partition_name: compute

Expand Down
6 changes: 3 additions & 3 deletions community/examples/tutorial-starccm-slurm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ vars:
deployment_groups:
- group: primary
modules:
# Source is an embedded resource, denoted by "resources/*" without ./, ../, /
# as a prefix. To refer to a local resource, prefix with ./, ../ or /
# Example - ./resources/network/vpc
# Source is an embedded module, denoted by "modules/*" without ./, ../, /
# as a prefix. To refer to a local module, prefix with ./, ../ or /
# Example - ./modules/network/vpc
- id: network1
source: modules/network/vpc

Expand Down
Loading

0 comments on commit fe6b653

Please sign in to comment.