Commit
refactored code for dedicated queues and added max task limit if dedicated queue is not available
ljwharbers committed Jan 7, 2025
1 parent c1b3583 commit 2c391ba
Showing 1 changed file with 61 additions and 21 deletions.
82 changes: 61 additions & 21 deletions conf/vsc_kul_uhasselt.config
@@ -2,6 +2,8 @@
// see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config
scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"
tier1_project = System.getenv("SLURM_ACCOUNT") ?: null
avail_queues = System.getenv("VSC_DEDICATED_QUEUES") ?: null
// default to an empty list so the availQueues.contains() checks below are null-safe
def availQueues = avail_queues?.toString()?.split(',') ?: []

// Perform work directory cleanup when the run has successfully completed
// cleanup = true
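For illustration, the two added lines above turn the comma-separated VSC_DEDICATED_QUEUES variable into a list of queue names that the profiles below can test with contains(). A minimal Groovy sketch, with a hypothetical value standing in for the real environment variable:

def avail_queues = "dedicated_big_bigmem,dedicated_big_gpu"  // hypothetical value of VSC_DEDICATED_QUEUES
def availQueues = avail_queues?.toString()?.split(',') ?: []
assert availQueues.contains('dedicated_big_bigmem')     // this dedicated queue may be used
assert !availQueues.contains('dedicated_big_gpu_h100')  // not listed, so profiles fall back to shared queues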
@@ -47,6 +49,11 @@ aws {
maxErrorRetry = 3
}

// Function to limit task time when dedicated queues are not available
def limitTaskTime(time, maxTime) {
return time > maxTime ? maxTime : time
}

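A quick illustration of the helper, assuming Nextflow's Duration literals (100.h, 72.h) as used throughout this config:

def capped = limitTaskTime(100.h, 72.h)  // request exceeds the cap -> returns 72.h
def kept   = limitTaskTime(48.h, 72.h)   // request within the cap  -> returns 48.h unchanged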
// Define profiles for each cluster
profiles {
genius {
@@ -68,6 +75,8 @@
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'

// Set clusterOptions
clusterOptions = {
// suggested to use 9 cpus per gpu
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
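When no explicit accelerator count is requested, the closure above infers GPUs from the CPU request at the suggested 9 CPUs per GPU, never dropping below one. The arithmetic in plain Groovy, with hypothetical CPU counts:

assert Math.max(1, Math.floor(36 / 9) as int) == 4  // 36 CPUs -> 4 GPUs
assert Math.max(1, Math.floor(4 / 9) as int)  == 1  // a tiny request still gets 1 GPU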
@@ -104,27 +113,34 @@ profiles {
}
}
}

wice {
params.config_profile_description = 'wice profile for use on the Wice cluster of the VSC HPC.'

process {
// max is 2016000MB
resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ]
beforeScript = 'module load cluster/wice'

// Set queue
// Task time is limited to 72 hours if memory is 239 GB or more
// and the dedicated queue is not available
queue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') :
(task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake')
def maxTime = 72.h
if (task.memory >= 239.GB) {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_bigmem') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem'
} else {
return task.time >= maxTime ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake'
}
}

// Set clusterOptions, changing account based on queue
clusterOptions = {
def queueValue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') :
task.memory >= 239.GB ?
(task.time >= 72.h && availQueues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem') :
(task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake')
}
queueValue() =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project"
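Restated as plain Groovy (pickQueue is an illustrative helper, not part of the config, and bare numbers stand in for Nextflow's memory and duration units), the new selector prefers the dedicated queue whenever it is available and only clamps the time when it is not:

def pickQueue = { memGB, hours, queues ->
    if (memGB >= 239)
        return queues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem'
    return hours >= 72 ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake'
}
assert pickQueue(300, 100, ['dedicated_big_bigmem']) == 'dedicated_big_bigmem'  // time left untouched
assert pickQueue(300, 100, []) == 'bigmem,hugemem'                              // task.time also clamped to 72.h
assert pickQueue(100, 100, []) == 'batch_long,batch_icelake_long,batch_sapphirerapids_long'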
@@ -134,19 +150,31 @@ profiles {
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'

// Set queue
// Task time is limited to 72 hours when the matching dedicated queue
// is not available
queue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
def maxTime = 72.h
if (task.memory >= 239.GB) {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100'
} else {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
}
}

clusterOptions = {
// suggested to use 16 cpus per gpu
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
// Do same queue evaluation as above
def queueValue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
task.memory >= 239.GB ?
(task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h && availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
}

// Set clusterOptions, changing account based on queue
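The account choice hinges on a regex test against the selected queue name; only dedicated queues switch away from the $tier1_project account (illustrative values only; the GPU account line itself is collapsed in this view):

assert 'dedicated_big_gpu_h100' =~ /dedicated/  // dedicated queue -> dedicated account
assert !('gpu_a100,gpu' =~ /dedicated/)         // shared queues -> $tier1_project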
@@ -167,19 +195,31 @@ profiles {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
beforeScript = 'module load cluster/wice'
// Set queue
// Task time is limited to 72 hours when the matching dedicated queue
// is not available
queue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
def maxTime = 72.h
if (task.memory >= 239.GB) {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100'
} else {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
}
}

// Set clusterOptions
clusterOptions = {
// suggested to use 16 cpus per gpu
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
// Do same queue evaluation as above
// Do same queue evaluation as above, without adjusting task.time
def queueValue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
task.memory >= 239.GB ?
(task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h && availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
}

// Set clusterOptions, changing account based on queue
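Taken together, the queue closures in this commit clamp task.time only when the request is at or above the 72-hour cap and no matching dedicated queue is listed. A compact restatement with an illustrative helper, using bare hours in place of Durations:

def effectiveHours = { requested, dedicatedAvailable ->
    def maxHours = 72
    (requested >= maxHours && !dedicatedAvailable) ? Math.min(requested, maxHours) : requested
}
assert effectiveHours(100, false) == 72   // no dedicated queue: clamped to the cap
assert effectiveHours(100, true)  == 100  // dedicated queue available: untouched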