From 4114403de1dbad0dcb49695059f191b16727c849 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A9l=C3=A8ne=20Polv=C3=A8che=20=28=20CECS/I-Stem=20=29?= <99477756+helenepolveche@users.noreply.github.com> Date: Thu, 31 Oct 2024 15:17:01 +0100 Subject: [PATCH 01/33] psmn.config : delete Lake-Flix --- conf/psmn.config | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/conf/psmn.config b/conf/psmn.config index 47cd5d148..72942e375 100644 --- a/conf/psmn.config +++ b/conf/psmn.config @@ -14,32 +14,32 @@ charliecloud { process { executor = 'slurm' - clusterOptions = "--partition=E5,Lake,Lake-flix" + clusterOptions = "--partition=E5,Lake" cpus = 1 memory = 32.GB time = 24.h withLabel: 'process_single|process_single_thread|sc_tiny|sc_small|sc_medium' { - clusterOptions = "--partition=E5,Lake,Lake-flix" + clusterOptions = "--partition=E5,Lake" cpus = 1 memory = 114.GB time = 24.h } withLabel:'process_low|mc_small|process_very_low' { - clusterOptions = "--partition=E5,Lake,Lake-flix" + clusterOptions = "--partition=E5,Lake" cpus = 16 memory = 90.GB time = 24.h } withLabel:'process_medium|mc_medium' { - clusterOptions = "--partition=Lake,Lake-flix" + clusterOptions = "--partition=Lake" cpus = 32 memory = 180.GB time = 48.h } withLabel:'process_high|mc_large|mc_huge|process_high_cpus|cpus_max' { - clusterOptions = "--partition=Lake,Lake-flix" + clusterOptions = "--partition=Lake" cpus = 32 memory = 370.GB time = 48.h @@ -49,7 +49,7 @@ process { time = 96.h } withLabel: 'process_high_memory|memory_max' { - clusterOptions = "--partition=Lake,Lake-flix" + clusterOptions = "--partition=Lake" memory = 370.GB } withLabel: gpu { From a9e03c0c942e79db352e808a3d4c278cf6f6b633 Mon Sep 17 00:00:00 2001 From: baldikacti Date: Fri, 1 Nov 2024 16:51:14 +0000 Subject: [PATCH 02/33] Initial commit of unity.config --- .github/workflows/main.yml | 1 + README.md | 1 + conf/unity.config | 47 ++++++++++++++++++++++++++++++++++++++ docs/unity.md | 0 nfcore_custom.config | 3 +++ 5 files changed, 52 insertions(+) create mode 100644 conf/unity.config create mode 100644 docs/unity.md diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 19195ac35..8d265002c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -151,6 +151,7 @@ jobs: - "ucd_sonic" - "uge" - "unibe_ibu" + - "unity" - "unc_lccc" - "unc_longleaf" - "uod_hpc" diff --git a/README.md b/README.md index ee46ec70f..f7b356295 100644 --- a/README.md +++ b/README.md @@ -203,6 +203,7 @@ Currently documentation is available for the following systems: - [UNC_LONGLEAF](docs/unc_longleaf.md) - [UGE](docs/uge.md) - [UNIBE_IBU](docs/unibe_ibu.md) +- [Unity](docs/unity.md) - [UOD_HPC](docs/uod_hpc.md) - [UPPMAX](docs/uppmax.md) - [UTD_GANYMEDE](docs/utd_ganymede.md) diff --git a/conf/unity.config b/conf/unity.config new file mode 100644 index 000000000..f5d353f42 --- /dev/null +++ b/conf/unity.config @@ -0,0 +1,47 @@ +/* + * ------------------------------------------------- + * Unity HPC cluster config file for nf-core + * ------------------------------------------------- + * https://unity.rc.umass.edu/ + */ + +params { + config_profile_description = 'Unity cluster profile provided by nf-core/configs.' 
+ config_profile_contact = 'Berent Aldikacti (baldikacti)' + config_profile_url = 'https://unity.rc.umass.edu/' + igenomes_base = '///igenomes/' + max_memory = 2.TB + max_cpus = 192 + max_time = 14.d +} + +process { + resourceLimits = [ + cpus: params.max_cpus, + memory: params.max_memory, + time: params.max_time + ] + executor = 'slurm' + queue = { task.time <= 2.h ? 'cpu-preempt' : 'cpu' } + maxRetries = 2 + clusterOptions = { "${task.time >= 48.h ?: '-q long'}" } + beforeScript = 'module load apptainer/latest' +} + +executor { + queueSize = 1000 + submitRateLimit = '6/1min' +} + +apptainer { + enabled = true + autoMounts = true +} + +cleanup = true + +profiles { + debug { + cleanup = false + } +} \ No newline at end of file diff --git a/docs/unity.md b/docs/unity.md new file mode 100644 index 000000000..e69de29bb diff --git a/nfcore_custom.config b/nfcore_custom.config index b4239abff..69629df93 100644 --- a/nfcore_custom.config +++ b/nfcore_custom.config @@ -370,6 +370,9 @@ profiles { unibe_ibu { includeConfig "${params.custom_config_base}/conf/unibe_ibu.config" } + unity { + includeConfig "${params.custom_config_base}/conf/unity.config" + } uod_hpc { includeConfig "${params.custom_config_base}/conf/uod_hpc.config" } From cdcfc1d2fe81d41457800c2c0c8ae77bbc8ddb53 Mon Sep 17 00:00:00 2001 From: Laurent Modolo Date: Tue, 5 Nov 2024 10:01:41 +0100 Subject: [PATCH 03/33] psmn.config: update slurm partition information --- conf/psmn.config | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/conf/psmn.config b/conf/psmn.config index 72942e375..d6743f9ee 100644 --- a/conf/psmn.config +++ b/conf/psmn.config @@ -21,15 +21,15 @@ process { time = 24.h withLabel: 'process_single|process_single_thread|sc_tiny|sc_small|sc_medium' { - clusterOptions = "--partition=E5,Lake" + clusterOptions = "--partition=Lake" cpus = 1 - memory = 114.GB + memory = 96.GB time = 24.h } withLabel:'process_low|mc_small|process_very_low' { - clusterOptions = "--partition=E5,Lake" + clusterOptions = "--partition=Lake" cpus = 16 - memory = 90.GB + memory = 80.GB time = 24.h } withLabel:'process_medium|mc_medium' { @@ -49,8 +49,8 @@ process { time = 96.h } withLabel: 'process_high_memory|memory_max' { - clusterOptions = "--partition=Lake" - memory = 370.GB + clusterOptions = "--partition=Epyc" + memory = 500.GB } withLabel: gpu { clusterOptions = "--partition=E5-GPU" From fddf2571ab1955c49428cf691e539c055a89aef9 Mon Sep 17 00:00:00 2001 From: Laurent Modolo Date: Tue, 5 Nov 2024 10:09:07 +0100 Subject: [PATCH 04/33] psmn.config: update number of space for linter --- conf/psmn.config | 80 ++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/conf/psmn.config b/conf/psmn.config index 5fb6c5f4c..9fc044653 100644 --- a/conf/psmn.config +++ b/conf/psmn.config @@ -13,48 +13,48 @@ charliecloud { } process { - executor = 'slurm' - clusterOptions = "--partition=E5,Lake" + executor = 'slurm' + clusterOptions = "--partition=E5,Lake" - cpus = 1 - memory = 32.GB - time = 24.h + cpus = 1 + memory = 32.GB + time = 24.h - withLabel: 'process_single|process_single_thread|sc_tiny|sc_small|sc_medium' { - clusterOptions = "--partition=Lake" - cpus = 1 - memory = 96.GB - time = 24.h - } - withLabel:'process_low|mc_small|process_very_low' { - clusterOptions = "--partition=Lake" - cpus = 16 - memory = 80.GB - time = 24.h - } - withLabel:'process_medium|mc_medium' { - clusterOptions = "--partition=Lake" - cpus = 32 - memory = 180.GB - time = 48.h - } - 
withLabel:'process_high|mc_large|mc_huge|process_high_cpus|cpus_max' { - clusterOptions = "--partition=Lake" - cpus = 32 - memory = 370.GB - time = 48.h - } - withLabel: 'process_long|process_maximum_time|process_long_parallelized' { - clusterOptions = "--partition=Lake" - time = 96.h - } - withLabel: 'process_high_memory|memory_max' { - clusterOptions = "--partition=Epyc" - memory = 500.GB - } - withLabel: gpu { - clusterOptions = "--partition=E5-GPU" - } + withLabel: 'process_single|process_single_thread|sc_tiny|sc_small|sc_medium' { + clusterOptions = "--partition=Lake" + cpus = 1 + memory = 96.GB + time = 24.h + } + withLabel:'process_low|mc_small|process_very_low' { + clusterOptions = "--partition=Lake" + cpus = 16 + memory = 80.GB + time = 24.h + } + withLabel:'process_medium|mc_medium' { + clusterOptions = "--partition=Lake" + cpus = 32 + memory = 180.GB + time = 48.h + } + withLabel:'process_high|mc_large|mc_huge|process_high_cpus|cpus_max' { + clusterOptions = "--partition=Lake" + cpus = 32 + memory = 370.GB + time = 48.h + } + withLabel: 'process_long|process_maximum_time|process_long_parallelized' { + clusterOptions = "--partition=Lake" + time = 96.h + } + withLabel: 'process_high_memory|memory_max' { + clusterOptions = "--partition=Epyc" + memory = 500.GB + } + withLabel: gpu { + clusterOptions = "--partition=E5-GPU" + } } params { From bdfc3e081a8206435e31ab4cd3f26f27ba2e29fc Mon Sep 17 00:00:00 2001 From: Laurent Modolo Date: Wed, 6 Nov 2024 08:47:04 +0100 Subject: [PATCH 05/33] psmn.config: add back resourceLimits --- conf/psmn.config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/conf/psmn.config b/conf/psmn.config index 9fc044653..d0972f87b 100644 --- a/conf/psmn.config +++ b/conf/psmn.config @@ -13,6 +13,11 @@ charliecloud { } process { + resourceLimits = [ + memory: 370.GB, + cpus: 32, + time: 96.h + ] executor = 'slurm' clusterOptions = "--partition=E5,Lake" From df37bd98968ad0bb9accadae59c44bee01a38ad9 Mon Sep 17 00:00:00 2001 From: Laurent Modolo Date: Wed, 6 Nov 2024 09:05:11 +0100 Subject: [PATCH 06/33] psmn.config: change clusterOption --parition to queue --- conf/psmn.config | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/conf/psmn.config b/conf/psmn.config index d0972f87b..725fff206 100644 --- a/conf/psmn.config +++ b/conf/psmn.config @@ -19,46 +19,46 @@ process { time: 96.h ] executor = 'slurm' - clusterOptions = "--partition=E5,Lake" + queue = "E5,Lake" cpus = 1 memory = 32.GB time = 24.h withLabel: 'process_single|process_single_thread|sc_tiny|sc_small|sc_medium' { - clusterOptions = "--partition=Lake" + queue = "Lake" cpus = 1 memory = 96.GB time = 24.h } - withLabel:'process_low|mc_small|process_very_low' { - clusterOptions = "--partition=Lake" + withLabel: 'process_low|mc_small|process_very_low' { + queue = "Lake" cpus = 16 memory = 80.GB time = 24.h } - withLabel:'process_medium|mc_medium' { - clusterOptions = "--partition=Lake" + withLabel: 'process_medium|mc_medium' { + queue = "Lake" cpus = 32 memory = 180.GB time = 48.h } - withLabel:'process_high|mc_large|mc_huge|process_high_cpus|cpus_max' { - clusterOptions = "--partition=Lake" + withLabel: 'process_high|mc_large|mc_huge|process_high_cpus|cpus_max' { + queue = "Lake" cpus = 32 memory = 370.GB time = 48.h } withLabel: 'process_long|process_maximum_time|process_long_parallelized' { - clusterOptions = "--partition=Lake" + queue = "Lake" time = 96.h } withLabel: 'process_high_memory|memory_max' { - clusterOptions = "--partition=Epyc" + queue = "Epyc" 
memory = 500.GB } withLabel: gpu { - clusterOptions = "--partition=E5-GPU" + queue = "E5-GPU" } } From ddf06441e8c98b2a7e8e73d34e61da382f0af923 Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Fri, 22 Nov 2024 11:15:58 +0100 Subject: [PATCH 07/33] refactor code & add new gpu_profiles --- conf/vsc_kul_uhasselt.config | 159 ++++++++++++++++++++++++----------- 1 file changed, 109 insertions(+), 50 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 51fffc5c8..f761c8bfb 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -1,9 +1,7 @@ // Default to /tmp directory if $VSC_SCRATCH scratch env is not available, // see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config -def scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp" - -// Specify the work directory -workDir = "$scratch_dir/work" +def scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp" +def tier1_project = System.getenv("SLURM_ACCOUNT") ?: null // Perform work directory cleanup when the run has succesfully completed // cleanup = true @@ -16,15 +14,11 @@ try { System.err.println("WARNING: Could not run sinfo to determine current cluster, defaulting to genius") } -def tier1_project = System.getenv("SLURM_ACCOUNT") ?: null - if (! tier1_project && (hostname.contains("genius") || hostname.contains("wice"))) { - // Hard-code that Tier 1 cluster dodrio requires a project account System.err.println("Please specify your VSC project account with environment variable SLURM_ACCOUNT.") System.exit(1) } - // Reduce the job submit rate to about 50 per minute, this way the server won't be bombarded with jobs // Limit queueSize to keep job rate under control and avoid timeouts executor { @@ -35,11 +29,13 @@ executor { // Add backoff strategy to catch cluster timeouts and proper symlinks of files in scratch to the work directory process { - stageInMode = "symlink" - stageOutMode = "rsync" + executor = 'slurm' + scratch = "$scratch_dir" + stageInMode = "symlink" + stageOutMode = "rsync" errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } - maxRetries = 5 - // array = 50 + maxRetries = 3 + array = 50 } // Specify that singularity should be used and where the cache dir will be for the images @@ -49,6 +45,11 @@ singularity { cacheDir = "$scratch_dir/.singularity" } +params { + config_profile_contact = 'GitHub: @Joon-Klaps - Email: joon.klaps@kuleuven.be' + config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html' +} + env { APPTAINER_TMPDIR="$scratch_dir/.apptainer/tmp" APPTAINER_CACHEDIR="$scratch_dir/.apptainer/cache" @@ -56,28 +57,22 @@ env { // AWS maximum retries for errors (This way the pipeline doesn't fail if the download fails one time) aws { - maxErrorRetry = 3 + maxErrorRetry = 3 } // Define profiles for each cluster profiles { genius { - params { - config_profile_description = 'HPC_GENIUS profile for use on the genius cluster of the VSC HPC.' - config_profile_contact = 'GitHub: @Joon-Klaps - Email: joon.klaps@kuleuven.be' - config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html' - max_memory = 703.GB // 768 - 65 so 65GB for overhead, max is 720000MB - max_time = 168.h - max_cpus = 36 - } + params.config_profile_description = 'genius profile for use on the genius cluster of the VSC HPC.' 
process { + clusterOptions = { "--clusters=genius --account=$tier1_project" } resourceLimits = [ - memory: 703.GB, - cpus: 136, + memory: 703.GB, // 768 - 65 so 65GB for overhead, max is 720000MB + cpus: 36, time: 168.h ] - executor = 'slurm' + queue = { switch (task.memory) { case { it >= 175.GB }: // max is 180000 @@ -96,29 +91,61 @@ profiles { } } } - clusterOptions = { "--clusters=genius --account=$tier1_project" } - scratch = "$scratch_dir" + } } - wice { + genius_gpu { + + params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.' + + docker.runOptions = '-u $(id -u):$(id -g) --gpus all' + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' + + process { + beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + clusterOptions = { "--clusters=genius --account=$tier1_project" } + + resourceLimits = [ + memory: 703.GB, // 768 - 65 so 65GB for overhead, max is 720000MB + cpus: 36, + time: 168.h, + ] + + queue = { + switch (task.memory) { + case { it >= 175.GB }: // max is 180000 + switch (task.time) { + case { it >= 72.h }: + return 'gpu_v100_long' + default: + return 'gpu_v100' + } + default: + switch (task.time) { + case { it >= 72.h }: + return 'gpu_p100_long,amd_long' + default: + return 'gpu_p100,gpu_p100_debug,amd' + } + } + } - params { - config_profile_description = 'HPC_WICE profile for use on the Wice cluster of the VSC HPC.' - config_profile_contact = 'GitHub: @Joon-Klaps - Email: joon.klaps@kuleuven.be' - config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html' - max_memory = 1968.GB // max is 2016000 - max_cpus = 72 - max_time = 168.h } + } + + wice { + params.config_profile_description = 'wice profile for use on the Wice cluster of the VSC HPC.' process { + clusterOptions = { "--clusters=wice --account=$tier1_project"} resourceLimits = [ - memory: 1968.GB, + memory: 1968.GB, // max is 2016000 cpus: 72, time: 168.h ] - executor = 'slurm' + queue = { switch (task.memory) { case { it >= 239.GB }: // max is 244800 @@ -137,31 +164,63 @@ profiles { } } } - clusterOptions = { "--clusters=wice --account=$tier1_project"} - scratch = "$scratch_dir" + } } - superdome { - params { - config_profile_description = 'HPC_SUPERDOME profile for use on the genius cluster of the VSC HPC.' - config_profile_contact = 'GitHub: @Joon-Klaps - Email: joon.klaps@kuleuven.be' - config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html' - max_memory = 5772.GB // 6000 - 228 so 228GB for overhead, max is 5910888MB - max_cpus = 14 - max_time = 168.h + wice_gpu { + + params.config_profile_description = 'wice_gpu profile for use on the genius cluster of the VSC HPC.' 
+ + docker.runOptions = '-u $(id -u):$(id -g) --gpus all' + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' + + process { + beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + clusterOptions = { "--clusters=genius --account=$tier1_project" } + resourceLimits = [ + memory: 703.GB, // 768 - 65 so 65GB for overhead, max is 720000MB + cpus: 60, + time: 168.h + ] + + queue = { + switch (task.memory) { + case { it >= 478.GB }: // max is 489600 + switch (task.time) { + case { it >= 72.h }: + return 'dedicated_big_gpu_h100,dedicated_big_gpu' + default: + return 'gpu,gpu_h100' + } + default: + switch (task.time) { + case { it >= 72.h }: + return 'gpu_a100' + default: + return 'gpu_a100_debug' + } + } + } + } + } + + superdome { + params.config_profile_description = 'superdome profile for use on the genius cluster of the VSC HPC.' process { + clusterOptions = {"--clusters=genius --account=$tier1_project"} resourceLimits = [ - memory: 5772.GB, + memory: 5772.GB, // 6000 - 228 so 228GB for overhead, max is 5910888MB cpus: 14, time: 168.h ] - executor = 'slurm' + queue = { task.time <= 72.h ? 'superdome' : 'superdome_long' } - clusterOptions = {"--clusters=genius --account=$tier1_project"} - scratch = "$scratch_dir" } } } + + From a6ba9846ab37b407f2b895e74aecd861495356e3 Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Fri, 22 Nov 2024 15:29:09 +0100 Subject: [PATCH 08/33] update logic & readability vsc_kul_uhasselt GPU --- conf/vsc_kul_uhasselt.config | 150 ++++++++++++----------------------- 1 file changed, 52 insertions(+), 98 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index f761c8bfb..fede1181f 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -1,24 +1,11 @@ // Default to /tmp directory if $VSC_SCRATCH scratch env is not available, // see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config -def scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp" -def tier1_project = System.getenv("SLURM_ACCOUNT") ?: null +scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp" +tier1_project = System.getenv("SLURM_ACCOUNT") ?: null // Perform work directory cleanup when the run has succesfully completed // cleanup = true -// Get the hostname and check some values for tier1 -def hostname = "genius" -try { - hostname = ['/bin/bash', '-c', 'sinfo --clusters=genius,wice -s | head -n 1'].execute().text.replace('CLUSTER: ','') -} catch (java.io.IOException e) { - System.err.println("WARNING: Could not run sinfo to determine current cluster, defaulting to genius") -} - -if (! 
tier1_project && (hostname.contains("genius") || hostname.contains("wice"))) { - System.err.println("Please specify your VSC project account with environment variable SLURM_ACCOUNT.") - System.exit(1) -} - // Reduce the job submit rate to about 50 per minute, this way the server won't be bombarded with jobs // Limit queueSize to keep job rate under control and avoid timeouts executor { @@ -29,8 +16,6 @@ executor { // Add backoff strategy to catch cluster timeouts and proper symlinks of files in scratch to the work directory process { - executor = 'slurm' - scratch = "$scratch_dir" stageInMode = "symlink" stageOutMode = "rsync" errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } @@ -67,34 +52,33 @@ profiles { process { clusterOptions = { "--clusters=genius --account=$tier1_project" } - resourceLimits = [ - memory: 703.GB, // 768 - 65 so 65GB for overhead, max is 720000MB - cpus: 36, - time: 168.h - ] + // 768 - 65 so 65GB for overhead, max is 720000MB + resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ] + + withLabel: '.*gpu.*'{ + beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + queue = { + switch (task.memory) { + case { it >= 175.GB }: // max is 180000 + return task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100' + default: + return task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd' + } + } + } queue = { switch (task.memory) { - case { it >= 175.GB }: // max is 180000 - switch (task.time) { - case { it >= 72.h }: - return 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' - default: - return 'bigmem' - } - default: - switch (task.time) { - case { it >= 72.h }: - return 'batch_long' + case { it >= 175.GB }: // max is 180000 + return task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem' default: - return 'batch' - } + return task.time >= 72.h ? 'batch_long' : 'batch' } } - } } + genius_gpu { params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.' @@ -107,31 +91,17 @@ profiles { beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' clusterOptions = { "--clusters=genius --account=$tier1_project" } - resourceLimits = [ - memory: 703.GB, // 768 - 65 so 65GB for overhead, max is 720000MB - cpus: 36, - time: 168.h, - ] + // 768 - 65 so 65GB for overhead, max is 720000MB + resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] queue = { switch (task.memory) { - case { it >= 175.GB }: // max is 180000 - switch (task.time) { - case { it >= 72.h }: - return 'gpu_v100_long' + case { it >= 175.GB }: // max is 180000 + return task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100' default: - return 'gpu_v100' - } - default: - switch (task.time) { - case { it >= 72.h }: - return 'gpu_p100_long,amd_long' - default: - return 'gpu_p100,gpu_p100_debug,amd' - } + return task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd' } } - } } @@ -140,34 +110,35 @@ profiles { process { clusterOptions = { "--clusters=wice --account=$tier1_project"} - resourceLimits = [ - memory: 1968.GB, // max is 2016000 - cpus: 72, - time: 168.h - ] + // max is 2016000 + resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] queue = { - switch (task.memory) { - case { it >= 239.GB }: // max is 244800 - switch (task.time) { - case { it >= 72.h }: - return 'dedicated_big_bigmem' - default: - return 'bigmem,hugemem' + switch (task.memory) { + case { it >= 239.GB }: // max is 244800 + return task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem' + default: + return task.time >= 72.h ? 
'batch_long,batch_icelake_long,batch_sapphirerapids_long': 'batch,batch_sapphirerapids,batch_icelake' } - default: - switch (task.time) { - case { it >= 72.h }: - return 'batch_long,batch_icelake_long,batch_sapphirerapids_long' - default: - return 'batch,batch_sapphirerapids,batch_icelake' + } + + withLabel: '.*gpu.*'{ + beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + + queue = { + switch (task.memory) { + case { it >= 239.GB }: // max is 244800 + return task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100' + default: + return task.time >= 72.h ? 'dedicated_big_gpu': 'gpu_a100,gpu' } } } - } } + wice_gpu { params.config_profile_description = 'wice_gpu profile for use on the genius cluster of the VSC HPC.' @@ -179,31 +150,17 @@ profiles { process { beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' clusterOptions = { "--clusters=genius --account=$tier1_project" } - resourceLimits = [ - memory: 703.GB, // 768 - 65 so 65GB for overhead, max is 720000MB - cpus: 60, - time: 168.h - ] + // 768 - 65 so 65GB for overhead, max is 720000MB + resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] queue = { switch (task.memory) { - case { it >= 478.GB }: // max is 489600 - switch (task.time) { - case { it >= 72.h }: - return 'dedicated_big_gpu_h100,dedicated_big_gpu' - default: - return 'gpu,gpu_h100' - } - default: - switch (task.time) { - case { it >= 72.h }: - return 'gpu_a100' + case { it >= 478.GB }: // max is 489600 + return task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100' default: - return 'gpu_a100_debug' - } + return task.time >= 72.h ? 'dedicated_big_gpu': 'gpu_a100,gpu' } } - } } @@ -212,11 +169,8 @@ profiles { process { clusterOptions = {"--clusters=genius --account=$tier1_project"} - resourceLimits = [ - memory: 5772.GB, // 6000 - 228 so 228GB for overhead, max is 5910888MB - cpus: 14, - time: 168.h - ] + // 6000 - 228 so 228GB for overhead, max is 5910888MB + resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h] queue = { task.time <= 72.h ? 'superdome' : 'superdome_long' } } From 9292d5a1d70719618640c14cd0a9eddc2dc010ee Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Mon, 25 Nov 2024 13:05:13 +0100 Subject: [PATCH 09/33] Getting somewhere finally --- conf/vsc_kul_uhasselt.config | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index fede1181f..a74b1952a 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -16,6 +16,7 @@ executor { // Add backoff strategy to catch cluster timeouts and proper symlinks of files in scratch to the work directory process { + executor = 'slurm' stageInMode = "symlink" stageOutMode = "rsync" errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } @@ -51,12 +52,14 @@ profiles { params.config_profile_description = 'genius profile for use on the genius cluster of the VSC HPC.' process { - clusterOptions = { "--clusters=genius --account=$tier1_project" } // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ] + clusterOptions = { "--clusters=genius --account=$tier1_project" } withLabel: '.*gpu.*'{ beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + clusterOptions = { "--gpus-per-node=${task.cpus < 18 ? 
1 : 2} --clusters=genius --account=$tier1_project" } + queue = { switch (task.memory) { case { it >= 175.GB }: // max is 180000 @@ -88,11 +91,10 @@ profiles { singularity.runOptions = '--containall --cleanenv --nv' process { - beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' - clusterOptions = { "--clusters=genius --account=$tier1_project" } - // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] + beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + clusterOptions = { "--gpus-per-node=${task.cpus < 18 ? 1 : 2} --clusters=genius --account=$tier1_project" } queue = { switch (task.memory) { @@ -125,6 +127,7 @@ profiles { withLabel: '.*gpu.*'{ beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + clusterOptions = { "--gpus-per-node=${task.cpus < 32 ? 1 : 2} --clusters=genius --account=$tier1_project" } queue = { switch (task.memory) { @@ -148,10 +151,10 @@ profiles { singularity.runOptions = '--containall --cleanenv --nv' process { - beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' - clusterOptions = { "--clusters=genius --account=$tier1_project" } // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + clusterOptions = { "--gpus-per-node=${task.cpus < 32 ? 1 : 2} --clusters=genius --account=$tier1_project" } queue = { switch (task.memory) { From d14a293e9f0e542c20668cae2aa1658918055496 Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Mon, 25 Nov 2024 16:22:36 +0100 Subject: [PATCH 10/33] Local tests passing! --- conf/vsc_kul_uhasselt.config | 44 +++++++++++++++++------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index a74b1952a..279e93ad3 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -56,9 +56,20 @@ profiles { resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ] clusterOptions = { "--clusters=genius --account=$tier1_project" } + queue = { + switch (task.memory) { + case { it >= 175.GB }: // max is 180000 + return task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem' + default: + return task.time >= 72.h ? 'batch_long' : 'batch' + } + } + withLabel: '.*gpu.*'{ - beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' - clusterOptions = { "--gpus-per-node=${task.cpus < 18 ? 1 : 2} --clusters=genius --account=$tier1_project" } + resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ] + clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/9) as int} --clusters=genius --account=$tier1_project" } + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' queue = { switch (task.memory) { @@ -69,15 +80,6 @@ profiles { } } } - - queue = { - switch (task.memory) { - case { it >= 175.GB }: // max is 180000 - return task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem' - default: - return task.time >= 72.h ? 'batch_long' : 'batch' - } - } } } @@ -85,16 +87,13 @@ profiles { genius_gpu { params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.' 
- - docker.runOptions = '-u $(id -u):$(id -g) --gpus all' - apptainer.runOptions = '--containall --cleanenv --nv' - singularity.runOptions = '--containall --cleanenv --nv' + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] - beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' - clusterOptions = { "--gpus-per-node=${task.cpus < 18 ? 1 : 2} --clusters=genius --account=$tier1_project" } + clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/9) as int} --clusters=genius --account=$tier1_project" } queue = { switch (task.memory) { @@ -111,9 +110,9 @@ profiles { params.config_profile_description = 'wice profile for use on the Wice cluster of the VSC HPC.' process { - clusterOptions = { "--clusters=wice --account=$tier1_project"} // max is 2016000 resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] + clusterOptions = { "--clusters=wice --account=$tier1_project"} queue = { switch (task.memory) { @@ -125,9 +124,8 @@ profiles { } withLabel: '.*gpu.*'{ - beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - clusterOptions = { "--gpus-per-node=${task.cpus < 32 ? 1 : 2} --clusters=genius --account=$tier1_project" } + clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/16) as int} --clusters=wice --account=$tier1_project" } queue = { switch (task.memory) { @@ -144,7 +142,7 @@ profiles { wice_gpu { - params.config_profile_description = 'wice_gpu profile for use on the genius cluster of the VSC HPC.' + params.config_profile_description = 'wice_gpu profile for use on the Wice cluster of the VSC HPC.' docker.runOptions = '-u $(id -u):$(id -g) --gpus all' apptainer.runOptions = '--containall --cleanenv --nv' @@ -153,8 +151,8 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' - clusterOptions = { "--gpus-per-node=${task.cpus < 32 ? 1 : 2} --clusters=genius --account=$tier1_project" } + + clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/16) as int} --clusters=wice --account=$tier1_project" } queue = { switch (task.memory) { From bf3d46d0c286888c316f196563e13ee21ec2cb69 Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Mon, 25 Nov 2024 16:24:45 +0100 Subject: [PATCH 11/33] remove dockerrunOpts on HPC --- conf/vsc_kul_uhasselt.config | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 279e93ad3..34a1414f6 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -85,7 +85,6 @@ profiles { genius_gpu { - params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.' 
apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' @@ -124,8 +123,10 @@ profiles { } withLabel: '.*gpu.*'{ - resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/16) as int} --clusters=wice --account=$tier1_project" } + resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/16) as int} --clusters=wice --account=$tier1_project" } + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' queue = { switch (task.memory) { @@ -141,12 +142,9 @@ profiles { wice_gpu { - params.config_profile_description = 'wice_gpu profile for use on the Wice cluster of the VSC HPC.' - - docker.runOptions = '-u $(id -u):$(id -g) --gpus all' - apptainer.runOptions = '--containall --cleanenv --nv' - singularity.runOptions = '--containall --cleanenv --nv' + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' process { // 768 - 65 so 65GB for overhead, max is 720000MB From e3e948e1b105ce6626556268989d597ff0b3e9ae Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Mon, 25 Nov 2024 16:49:18 +0100 Subject: [PATCH 12/33] remove switch statement, use floor instead of ceil --- conf/vsc_kul_uhasselt.config | 65 +++++++++++++----------------------- 1 file changed, 24 insertions(+), 41 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 34a1414f6..0a9f16d0e 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -57,27 +57,22 @@ profiles { clusterOptions = { "--clusters=genius --account=$tier1_project" } queue = { - switch (task.memory) { - case { it >= 175.GB }: // max is 180000 - return task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem' - default: - return task.time >= 72.h ? 'batch_long' : 'batch' - } + task.memory >= 175.GB ? + (task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem') : + (task.time >= 72.h ? 'batch_long' : 'batch') } withLabel: '.*gpu.*'{ resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ] - clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/9) as int} --clusters=genius --account=$tier1_project" } + // suggested to request 9 cpus per gpu + clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/9)) as int} --clusters=genius --account=$tier1_project" } apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' queue = { - switch (task.memory) { - case { it >= 175.GB }: // max is 180000 - return task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100' - default: - return task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd' - } + task.memory >= 175.GB ? + (task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') : + (task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd') } } } @@ -92,15 +87,12 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] - clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/9) as int} --clusters=genius --account=$tier1_project" } + clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/9)) as int} --clusters=genius --account=$tier1_project" } queue = { - switch (task.memory) { - case { it >= 175.GB }: // max is 180000 - return task.time >= 72.h ? 
'gpu_v100_long' : 'gpu_v100' - default: - return task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd' - } + task.memory >= 175.GB ? + (task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') : + (task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd') } } } @@ -114,27 +106,21 @@ profiles { clusterOptions = { "--clusters=wice --account=$tier1_project"} queue = { - switch (task.memory) { - case { it >= 239.GB }: // max is 244800 - return task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem' - default: - return task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long': 'batch,batch_sapphirerapids,batch_icelake' - } + task.memory >= 239.GB ? + (task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : + (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } withLabel: '.*gpu.*'{ resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/16) as int} --clusters=wice --account=$tier1_project" } + clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/16)) as int} --clusters=wice --account=$tier1_project" } apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' queue = { - switch (task.memory) { - case { it >= 239.GB }: // max is 244800 - return task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100' - default: - return task.time >= 72.h ? 'dedicated_big_gpu': 'gpu_a100,gpu' - } + task.memory >= 239.GB ? + (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : + (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') } } } @@ -149,16 +135,13 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - - clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/16) as int} --clusters=wice --account=$tier1_project" } + // suggested to request 16-18 cpus per gpu + clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/16)) as int} --clusters=wice --account=$tier1_project" } queue = { - switch (task.memory) { - case { it >= 478.GB }: // max is 489600 - return task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100' - default: - return task.time >= 72.h ? 'dedicated_big_gpu': 'gpu_a100,gpu' - } + task.memory >= 239.GB ? + (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : + (task.time >= 72.h ? 
'dedicated_big_gpu' : 'gpu_a100,gpu') } } } From 6974904caddb9b64f4f2344dd80b598db23bb87e Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Mon, 25 Nov 2024 17:15:19 +0100 Subject: [PATCH 13/33] refactor cluster options for GPU resource allocation and update documentation for Nextflow version requirements --- conf/vsc_kul_uhasselt.config | 36 ++++++++++++++++++++++++------------ docs/vsc_kul_uhasselt.md | 11 +++++++---- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 0a9f16d0e..0ff37ba4e 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -63,11 +63,14 @@ profiles { } withLabel: '.*gpu.*'{ - resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ] - // suggested to request 9 cpus per gpu - clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/9)) as int} --clusters=genius --account=$tier1_project" } - apptainer.runOptions = '--containall --cleanenv --nv' - singularity.runOptions = '--containall --cleanenv --nv' + resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ] + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' + clusterOptions = { + // suggested to use 9 cpus per gpu + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/9) as int) + "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project" + } queue = { task.memory >= 175.GB ? @@ -87,7 +90,10 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] - clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/9)) as int} --clusters=genius --account=$tier1_project" } + clusterOptions = { + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/9) as int) + "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project" + } queue = { task.memory >= 175.GB ? @@ -112,10 +118,14 @@ profiles { } withLabel: '.*gpu.*'{ - resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/16)) as int} --clusters=wice --account=$tier1_project" } - apptainer.runOptions = '--containall --cleanenv --nv' - singularity.runOptions = '--containall --cleanenv --nv' + resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' + clusterOptions = { + // suggested to use 16 cpus per gpu + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/16) as int) + "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project" + } queue = { task.memory >= 239.GB ? @@ -135,8 +145,10 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - // suggested to request 16-18 cpus per gpu - clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/16)) as int} --clusters=wice --account=$tier1_project" } + clusterOptions = { + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/16) as int) + "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project" + } queue = { task.memory >= 239.GB ? 
diff --git a/docs/vsc_kul_uhasselt.md b/docs/vsc_kul_uhasselt.md index a9caec11b..2f31304ca 100644 --- a/docs/vsc_kul_uhasselt.md +++ b/docs/vsc_kul_uhasselt.md @@ -28,14 +28,14 @@ export NXF_CONDA_CACHEDIR="$VSC_SCRATCH/miniconda3/envs" # Optional tower key # export TOWER_ACCESS_TOKEN="" -# export NXF_VER="" # make sure it's larger then 24.04.0 +# export NXF_VER="" # make sure it's larger then 24.10.1 ``` :::warning -The current config is setup with array jobs. Make sure nextflow version >= 24.04.0, read [array jobs in nextflow](https://www.nextflow.io/docs/latest/process.html#array) you can do this in +The current config is setup with array jobs. Make sure nextflow version >= 24.10.1, read [array jobs in nextflow](https://www.nextflow.io/docs/latest/process.html#array) you can do this in ```bash -export NXF_VER=24.04.0 +export NXF_VER=24.10.1 ``` ::: @@ -64,10 +64,13 @@ nextflow run -profile vsc_kul_uhasselt, **NB:** The vsc_kul_uhasselt profile is based on a selected amount of SLURM partitions. Should you require resources outside of these limits (e.g.gpus) you will need to provide a custom config specifying an appropriate SLURM partition (e.g. 'gpu\*'). +> **NB:** The vsc_kul_uhasselt profile is based on a selected amount of SLURM partitions. The profile will select to its best ability the most appropriate partition for the job. Including modules with a label containing `gpu`will be allocated to a gpu partition when the 'normal' `genius` profile is selected. Select the `genius_gpu` or `wice_gpu` profile to force the job to be allocated to a gpu partition. +> **NB:** If the module does not have `accelerator` set, it will determine the number of GPUs based on the requested resources. Use the `--cluster` option to specify the cluster you intend to use when submitting the job: From dcd9741dd9fb58d64e8a2e37ec99ca9518964cea Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Tue, 26 Nov 2024 16:54:39 +0100 Subject: [PATCH 14/33] reduce job submission limits & add apptainer timeout --- conf/vsc_kul_uhasselt.config | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 0ff37ba4e..92e9780d3 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -10,7 +10,7 @@ tier1_project = System.getenv("SLURM_ACCOUNT") ?: null // Limit queueSize to keep job rate under control and avoid timeouts executor { submitRateLimit = '50/1min' - queueSize = 30 + queueSize = 50 exitReadTimeout = "10min" } @@ -19,16 +19,17 @@ process { executor = 'slurm' stageInMode = "symlink" stageOutMode = "rsync" - errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } + errorStrategy = { sleep(Math.pow(2, task.attempt ?: 1) * 200 as long); return 'retry' } maxRetries = 3 - array = 50 + array = 30 } // Specify that singularity should be used and where the cache dir will be for the images singularity { - enabled = true - autoMounts = true - cacheDir = "$scratch_dir/.singularity" + enabled = true + autoMounts = true + cacheDir = "$scratch_dir/.singularity" + pullTimeout = "30 min" } params { From 595da58ed72c88f7fda65394ef0cbd4a37b66256 Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Wed, 27 Nov 2024 10:13:02 +0100 Subject: [PATCH 15/33] add default task.cpus --- conf/vsc_kul_uhasselt.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 92e9780d3..69e938e83 100644 --- a/conf/vsc_kul_uhasselt.config +++ 
b/conf/vsc_kul_uhasselt.config @@ -69,7 +69,7 @@ profiles { singularity.runOptions = '--containall --cleanenv --nv' clusterOptions = { // suggested to use 9 cpus per gpu - def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/9) as int) + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int) "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project" } @@ -92,7 +92,7 @@ profiles { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] clusterOptions = { - def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/9) as int) + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int) "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project" } @@ -124,7 +124,7 @@ profiles { singularity.runOptions = '--containall --cleanenv --nv' clusterOptions = { // suggested to use 16 cpus per gpu - def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/16) as int) + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project" } @@ -147,7 +147,7 @@ profiles { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] clusterOptions = { - def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/16) as int) + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project" } From 01446f0c62b92542f1803888eaae61751a6f563b Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sat, 16 Nov 2024 19:44:23 -0600 Subject: [PATCH 16/33] chore: Update memory to 28.GB for medium tasks --- conf/utd_ganymede.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/utd_ganymede.config b/conf/utd_ganymede.config index 4b3a07fa7..76e40225e 100644 --- a/conf/utd_ganymede.config +++ b/conf/utd_ganymede.config @@ -57,7 +57,7 @@ process { withLabel:process_medium { cpus = { 16 * task.attempt } - memory = { 30.GB * task.attempt } + memory = { 28.GB * task.attempt } } } From a03e5a6f95cb0e501008f28311282d45df23aeed Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sat, 16 Nov 2024 19:57:19 -0600 Subject: [PATCH 17/33] fix: Set Genomics queue memory variable --- conf/utd_ganymede.config | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/conf/utd_ganymede.config b/conf/utd_ganymede.config index 76e40225e..a5f9c2aa8 100644 --- a/conf/utd_ganymede.config +++ b/conf/utd_ganymede.config @@ -19,15 +19,16 @@ singularity { } def membership = "groups".execute().text +def genomics_queue_memory = 28.GB def select_queue = { memory, cpu -> - if (memory <= 28.GB && cpu <= 16 && membership.contains('genomics')) { + if (memory <= genomics_queue_memory && cpu <= 16 && membership.contains('genomics')) { return 'genomics,normal' } - if (memory > 28.GB && memory <= 125.GB && cpu <= 12 && membership.contains('kim')) { + if (memory > genomics_queue_memory && memory <= 125.GB && cpu <= 12 && membership.contains('kim')) { return 'Kim,128s' } - if (memory > 28.GB && memory <= 125.GB && cpu <= 16) { + if (memory > genomics_queue_memory && memory <= 125.GB && cpu <= 16) { return '128s' } if (memory <= 250.GB && cpu <= 28) { @@ -57,7 +58,7 @@ process { withLabel:process_medium { cpus = { 16 * task.attempt } - memory = { 28.GB * task.attempt } + memory = { genomics_queue_memory * task.attempt } } } 
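The two utd_ganymede patches above revolve around the `select_queue` closure, which maps a task's requested memory and CPU count onto a SLURM partition, and they hoist the 28 GB cut-off into a shared `genomics_queue_memory` variable so the `process_medium` memory request and the queue threshold cannot drift apart. Below is a minimal Groovy sketch of that selection logic, assuming Nextflow's `MemoryUnit` class is available on the classpath; the thresholds, group names, and partition strings are taken from the diff above, while the `membership` value and the example call at the end are illustrative assumptions rather than part of the patch.

```groovy
import nextflow.util.MemoryUnit

// Threshold below which jobs may land on the 'genomics' partition (28.GB in the patch)
def genomics_queue_memory = new MemoryUnit('28 GB')

// Output of `groups` on the login node; hypothetical value for this sketch
def membership = 'users genomics'

// Same branching as conf/utd_ganymede.config, written as a plain Groovy closure
def select_queue = { MemoryUnit memory, int cpu ->
    if (memory <= genomics_queue_memory && cpu <= 16 && membership.contains('genomics')) {
        return 'genomics,normal'
    }
    if (memory > genomics_queue_memory && memory <= new MemoryUnit('125 GB') && cpu <= 12 && membership.contains('kim')) {
        return 'Kim,128s'
    }
    if (memory > genomics_queue_memory && memory <= new MemoryUnit('125 GB') && cpu <= 16) {
        return '128s'
    }
    if (memory <= new MemoryUnit('250 GB') && cpu <= 28) {
        return 'normal'
    }
    // behaviour beyond these branches is not shown in the diff above
}

// A process_medium task (16 CPUs, genomics_queue_memory) from a genomics group member:
assert select_queue(genomics_queue_memory, 16) == 'genomics,normal'
```
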
From dd7ee04aa4e2de8d01b81fa904a04c898351777b Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Wed, 27 Nov 2024 20:40:30 -0600 Subject: [PATCH 18/33] feat: Use new previousTrace --- conf/utd_ganymede.config | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/conf/utd_ganymede.config b/conf/utd_ganymede.config index a5f9c2aa8..0aa273bdf 100644 --- a/conf/utd_ganymede.config +++ b/conf/utd_ganymede.config @@ -57,11 +57,15 @@ process { queue = { select_queue(task.memory, task.cpu) } withLabel:process_medium { - cpus = { 16 * task.attempt } - memory = { genomics_queue_memory * task.attempt } + cpus = { task.attempt > 2 ? task.previousTrace.cpus * 1.5 : (16) } + memory = { task.attempt > 1 ? task.previousTrace.memory * 2 : (genomics_queue_memory) } } } +manifest { + nextflowVersion = '!>=24.10.0' +} + params { max_memory = 250.GB max_cpus = 28 From d23a8955398b50d46386cb95769652a54f78e18e Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Wed, 27 Nov 2024 20:43:42 -0600 Subject: [PATCH 19/33] chore: Remove old params --- conf/utd_ganymede.config | 6 ------ 1 file changed, 6 deletions(-) diff --git a/conf/utd_ganymede.config b/conf/utd_ganymede.config index 0aa273bdf..676f1975e 100644 --- a/conf/utd_ganymede.config +++ b/conf/utd_ganymede.config @@ -65,9 +65,3 @@ process { manifest { nextflowVersion = '!>=24.10.0' } - -params { - max_memory = 250.GB - max_cpus = 28 - max_time = 96.h -} From 597efab1e6a7a31dc681a80e2c9ae10406c06281 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Wed, 27 Nov 2024 21:08:45 -0600 Subject: [PATCH 20/33] fix: Give up and revert --- conf/utd_ganymede.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/utd_ganymede.config b/conf/utd_ganymede.config index 676f1975e..11073cd8a 100644 --- a/conf/utd_ganymede.config +++ b/conf/utd_ganymede.config @@ -57,8 +57,8 @@ process { queue = { select_queue(task.memory, task.cpu) } withLabel:process_medium { - cpus = { task.attempt > 2 ? task.previousTrace.cpus * 1.5 : (16) } - memory = { task.attempt > 1 ? task.previousTrace.memory * 2 : (genomics_queue_memory) } + cpus = 16 + memory = { 27.GB * task.attempt } } } From 363fd2c2ff814dfdfdb86ef02d69471dd68417ed Mon Sep 17 00:00:00 2001 From: JohannesKersting Date: Mon, 2 Dec 2024 16:18:00 +0100 Subject: [PATCH 21/33] use group of launch dir for work dir --- conf/daisybio.config | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/conf/daisybio.config b/conf/daisybio.config index fe73eb61d..d2b3ac715 100644 --- a/conf/daisybio.config +++ b/conf/daisybio.config @@ -11,7 +11,17 @@ params { // define workDir in /nfs/scratch/nf-core_work/ named after the launch dir def work_dir = "/nfs/scratch/nf-core_work/" if(new File(work_dir).exists() && System.getenv("PWD")) { - workDir = work_dir+System.getenv("PWD").tokenize('/').join('.') + work_dir = work_dir+System.getenv("PWD").tokenize('/').join('.') + workDir = work_dir + + // if directory does not exist, create it and set the group to the group launch dir + if(!new File(work_dir).exists()) { + "mkdir -p ${work_dir}".execute() + def pwd = System.getenv("PWD") + def group = "stat -c %g ${pwd}".execute().text.trim() + "chgrp -R ${group} ${work_dir}".execute() + "chmod -R g+s ${work_dir}".execute() + } } process { From eb0fa5b70fbeb6bd3b62f99253dd2998a659bb73 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Tue, 3 Dec 2024 10:15:48 +0100 Subject: [PATCH 22/33] Comment out broken line of internal imported config --- conf/kaust.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/kaust.config b/conf/kaust.config index cd0e3d34f..fdd3739f6 100755 --- a/conf/kaust.config +++ b/conf/kaust.config @@ -9,7 +9,7 @@ params { } // Load genome resources and assets hosted by the Bioinformatics team on IBEX cluster -includeConfig '/biocorelab/BIX/resources/configs/genomes.yaml' +// includeConfig '/biocorelab/BIX/resources/configs/genomes.yaml' singularity { enabled = true From bf238465635f23411fb8fa019038f288c4d9f162 Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Tue, 3 Dec 2024 12:47:35 +0100 Subject: [PATCH 23/33] load default module --- conf/vsc_kul_uhasselt.config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 69e938e83..4e4634601 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -55,6 +55,7 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ] + beforeScript = 'module load cluster/genius' clusterOptions = { "--clusters=genius --account=$tier1_project" } queue = { @@ -91,6 +92,7 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] + beforeScript = 'module load cluster/genius' clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int) "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project" @@ -111,6 +113,7 @@ profiles { // max is 2016000 resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] clusterOptions = { "--clusters=wice --account=$tier1_project"} + beforeScript = 'module load cluster/wice' queue = { task.memory >= 239.GB ? @@ -146,6 +149,7 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + beforeScript = 'module load cluster/wice' clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project" @@ -164,6 +168,7 @@ profiles { process { clusterOptions = {"--clusters=genius --account=$tier1_project"} + beforeScript = 'module load cluster/genius/superdome' // 6000 - 228 so 228GB for overhead, max is 5910888MB resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h] From 7d6e4bb06937165f9b9eb0b90025386e102ed719 Mon Sep 17 00:00:00 2001 From: Maxime U Garcia Date: Wed, 4 Dec 2024 09:23:39 +0100 Subject: [PATCH 24/33] Update main.yml --- .github/workflows/main.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 717994c07..e6cdd4ac4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -172,7 +172,11 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install Nextflow + - uses: actions/setup-java@8df1039502a15bceb9433410b1a100fbe190c53b # v4 + with: + distribution: "temurin" + java-version: "17" + - name: Set up Nextflow uses: nf-core/setup-nextflow@v2 with: version: "latest-everything" From cc97fe3ff824eb56fe5ca1bcaa9cbf4f78ef7b02 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Thu, 5 Dec 2024 13:04:59 +0100 Subject: [PATCH 25/33] Update vsc_ugent.md @nvnieuwk small update on the setup details. 
--- docs/vsc_ugent.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/vsc_ugent.md b/docs/vsc_ugent.md index 5dd38b39d..cea1f8a9b 100644 --- a/docs/vsc_ugent.md +++ b/docs/vsc_ugent.md @@ -5,7 +5,8 @@ > [!IMPORTANT] > You will need an [account](https://www.ugent.be/hpc/en/access/faq/access) to use the HPC cluster to run the pipeline. -Regarding environment variables in `~/.bashrc`, make sure you have a setup similar to the one below. If you're already part of a VO, ask for one or use `VSC_DATA_USER` instead of `VSC_DATA_VO_USER`. +Make sure you have an environment variable setup similar to the one below in `~/.bashrc`. If you're not already part of a VO, ask your admin to add you or use `VSC_DATA_USER` instead of `VSC_DATA_VO_USER`. +For more installation help, read the documentation of a Nextflow workshop on VSC infrastructure like [this one](https://vibbits-nextflow-workshop.readthedocs.io/en/latest/installations.html). ```bash # Needed for Tier1 accounts, not for Tier2 @@ -15,8 +16,8 @@ export SBATCH_ACCOUNT=$SLURM_ACCOUNT # Needed for running Nextflow jobs export NXF_HOME=$VSC_DATA_VO_USER/.nextflow # Needed for running Apptainer containers -export APPTAINER_CACHEDIR=$VSC_DATA_VO_USER/.apptainer/cache -export APPTAINER_TMPDIR=$VSC_DATA_VO_USER/.apptainer/tmp +export APPTAINER_CACHEDIR=$VSC_SCRATCH_VO_USER/.apptainer/cache +export APPTAINER_TMPDIR=$VSC_SCRATCH_VO_USER/.apptainer/tmp ``` First you should go to the cluster you want to run the pipeline on. You can check what clusters have the most free space on this [link](https://shieldon.ugent.be:8083/pbsmon-web-users/). Use the following commands to easily switch between clusters: @@ -55,8 +56,11 @@ qsub