Skip to content

Slurm Cheat Sheet

This page contains assorted Slurm commands and Bash snippets that should be helpful.

man pages!

$ man sinfo
$ man scontrol
$ man squeue
# etc...

interactive sessions

hpc-login-1:~$ srun --pty bash
med0740:~$ echo "Hello World"
med0740:~$ exit

batch submission

hpc-login-1:~$ sbatch script.sh
Submitted batch job 2
hpc-login-1:~$ squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
                27     debug script.s holtgrem  R       0:06      1 med0703

listing nodes

$ sinfo -N
NODELIST   NODES PARTITION STATE
med0740        1    debug* idle
med0741        1    debug* down*
med0742        1    debug* down*

$ scontrol show nodes
NodeName=med0740 Arch=x86_64 CoresPerSocket=8
   CPUAlloc=0 CPUTot=32 CPULoad=0.06
   AvailableFeatures=(null)
[...]

$ scontrol show nodes med0740
NodeName=med0740 Arch=x86_64 CoresPerSocket=8
   CPUAlloc=0 CPUTot=32 CPULoad=0.06
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=(null)
   NodeAddr=med0740 NodeHostName=med0740 Version=20.02.0
   OS=Linux 3.10.0-1062.12.1.el7.x86_64 #1 SMP Tue Feb 4 23:02:59 UTC 2020
   RealMemory=1 AllocMem=0 FreeMem=174388 Sockets=2 Boards=1
   State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=debug
   BootTime=2020-03-05T00:54:15 SlurmdStartTime=2020-03-05T16:23:25
   CfgTRES=cpu=32,mem=1M,billing=32
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

queue states

$ squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
$ squeue -u holtgrem_c
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)

node resources

$ sinfo -o "%20N  %10c  %10m  %25f  %10G "

additional resources such as GPUs

$ sinfo -o "%N %G"

listing job details

$ scontrol show job 225
JobId=225 JobName=bash
   UserId=XXX(135001) GroupId=XXX(30069) MCS_label=N/A
   Priority=4294901580 Nice=0 Account=(null) QOS=normal
   JobState=FAILED Reason=NonZeroExitCode Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=0 Reboot=0 ExitCode=130:0
   RunTime=00:16:27 TimeLimit=14-00:00:00 TimeMin=N/A
   SubmitTime=2020-03-23T11:34:26 EligibleTime=2020-03-23T11:34:26
   AccrueTime=Unknown
   StartTime=2020-03-23T11:34:26 EndTime=2020-03-23T11:50:53 Deadline=N/A
   SuspendTime=None SecsPreSuspend=0 LastSchedEval=2020-03-23T11:34:26
   Partition=gpu AllocNode:Sid=hpc-login-1:1918
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=med0301
   BatchHost=med0301
   NumNodes=1 NumCPUs=2 NumTasks=0 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
   TRES=cpu=2,node=1,billing=2
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
   MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
   Features=(null) DelayBoot=00:00:00
   OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=bash
   WorkDir=XXX
   Power=
   TresPerNode=gpu:tesla:4
   MailUser=(null) MailType=NONE
host:~$ squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON) 
              1177    medium     bash jweiner_  R 4-21:52:24      1 med0127 
              1192    medium     bash jweiner_  R 4-07:08:40      1 med0127 
              1209   highmem     bash mkuhrin_  R 2-01:07:17      1 med0402 
              1210       gpu     bash hilberta  R 1-10:30:34      1 med0304 
              1213      long     bash schubacm  R 1-09:42:27      1 med0127 
              2401       gpu     bash ramkem_c  R 1-05:14:53      1 med0303 
              2431    medium ngs_mapp holtgrem  R 1-05:01:41      1 med0127 
              2437  critical snakejob holtgrem  R 1-05:01:34      1 med0135 
              2733     debug     bash schubacm  R    7:36:42      1 med0127 
              3029  critical ngs_mapp holtgrem  R    5:59:07      1 med0127 
              3030  critical snakejob holtgrem  R    5:56:23      1 med0134 
              3031  critical snakejob holtgrem  R    5:56:23      1 med0137 
              3032  critical snakejob holtgrem  R    5:56:23      1 med0137 
              3033  critical snakejob holtgrem  R    5:56:23      1 med0138 
              3034  critical snakejob holtgrem  R    5:56:23      1 med0138 
              3035  critical snakejob holtgrem  R    5:56:20      1 med0139 
              3036  critical snakejob holtgrem  R    5:56:20      1 med0139 
              3037  critical snakejob holtgrem  R    5:56:20      1 med0140 
              3038  critical snakejob holtgrem  R    5:56:20      1 med0140 
              3039  critical snakejob holtgrem  R    5:56:20      1 med0141 
              3040  critical snakejob holtgrem  R    5:56:20      1 med0141 
              3041  critical snakejob holtgrem  R    5:56:20      1 med0142 
              3042  critical snakejob holtgrem  R    5:56:20      1 med0142 
              3043  critical snakejob holtgrem  R    5:56:20      1 med0143 
              3044  critical snakejob holtgrem  R    5:56:20      1 med0143 
              3063      long     bash schubacm  R    4:12:37      1 med0127 
              3066      long     bash schubacm  R    4:11:47      1 med0127 
              3113    medium ngs_mapp holtgrem  R    1:52:33      1 med0708 
              3118    medium snakejob holtgrem  R    1:50:38      1 med0133 
              3119    medium snakejob holtgrem  R    1:50:38      1 med0703 
              3126    medium snakejob holtgrem  R    1:50:38      1 med0706 
              3127    medium snakejob holtgrem  R    1:50:38      1 med0144 
              3128    medium snakejob holtgrem  R    1:50:38      1 med0144 
              3133    medium snakejob holtgrem  R    1:50:35      1 med0147 
              3134    medium snakejob holtgrem  R    1:50:35      1 med0147 
              3135    medium snakejob holtgrem  R    1:50:35      1 med0148 
              3136    medium snakejob holtgrem  R    1:50:35      1 med0148 
              3138    medium snakejob holtgrem  R    1:50:35      1 med0104 
host:~$ squeue -o "%.10i %9P %20j %10u %.2t %.10M %.6D %10R %b"
     JOBID PARTITION NAME                 USER       ST       TIME  NODES NODELIST(R TRES_PER_NODE
      1177 medium    bash                 jweiner_m   R 4-21:52:22      1 med0127    N/A
      1192 medium    bash                 jweiner_m   R 4-07:08:38      1 med0127    N/A
      1209 highmem   bash                 mkuhrin_m   R 2-01:07:15      1 med0402    N/A
      1210 gpu       bash                 hilberta_c  R 1-10:30:32      1 med0304    gpu:tesla:4
      1213 long      bash                 schubacm_c  R 1-09:42:25      1 med0127    N/A
      2401 gpu       bash                 ramkem_c    R 1-05:14:51      1 med0303    gpu:tesla:1
      2431 medium    ngs_mapping          holtgrem_c  R 1-05:01:39      1 med0127    N/A
      2437 critical  snakejob.ngs_mapping holtgrem_c  R 1-05:01:32      1 med0135    N/A
      2733 debug     bash                 schubacm_c  R    7:36:40      1 med0127    N/A
      3029 critical  ngs_mapping          holtgrem_c  R    5:59:05      1 med0127    N/A
      3030 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:21      1 med0134    N/A
      3031 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:21      1 med0137    N/A
      3032 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:21      1 med0137    N/A
      3033 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:21      1 med0138    N/A
      3034 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:21      1 med0138    N/A
      3035 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:18      1 med0139    N/A
      3036 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:18      1 med0139    N/A
      3037 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:18      1 med0140    N/A
      3038 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:18      1 med0140    N/A
      3039 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:18      1 med0141    N/A
      3040 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:18      1 med0141    N/A
      3041 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:18      1 med0142    N/A
      3042 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:18      1 med0142    N/A
      3043 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:18      1 med0143    N/A
      3044 critical  snakejob.ngs_mapping holtgrem_c  R    5:56:18      1 med0143    N/A
      3063 long      bash                 schubacm_c  R    4:12:35      1 med0127    N/A
      3066 long      bash                 schubacm_c  R    4:11:45      1 med0127    N/A
      3113 medium    ngs_mapping          holtgrem_c  R    1:52:31      1 med0708    N/A
      3118 medium    snakejob.ngs_mapping holtgrem_c  R    1:50:36      1 med0133    N/A
      3119 medium    snakejob.ngs_mapping holtgrem_c  R    1:50:36      1 med0703    N/A
      3126 medium    snakejob.ngs_mapping holtgrem_c  R    1:50:36      1 med0706    N/A
      3127 medium    snakejob.ngs_mapping holtgrem_c  R    1:50:36      1 med0144    N/A
      3128 medium    snakejob.ngs_mapping holtgrem_c  R    1:50:36      1 med0144    N/A
      3133 medium    snakejob.ngs_mapping holtgrem_c  R    1:50:33      1 med0147    N/A
      3134 medium    snakejob.ngs_mapping holtgrem_c  R    1:50:33      1 med0147    N/A
      3135 medium    snakejob.ngs_mapping holtgrem_c  R    1:50:33      1 med0148    N/A
      3136 medium    snakejob.ngs_mapping holtgrem_c  R    1:50:33      1 med0148    N/A
      3138 medium    snakejob.ngs_mapping holtgrem_c  R    1:50:33      1 med0104    N/A
host:~$ sinfo
PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST 
debug*       up    8:00:00     11  drain med[0707,0709-0710,0740-0742,0744-0745,0749,0752,0755] 
debug*       up    8:00:00      8    mix med[0104,0127,0133-0135,0703,0706,0708] 
debug*       up    8:00:00     10  alloc med[0137-0144,0147-0148] 
debug*       up    8:00:00    103   idle med[0105-0124,0136,0145-0146,0151-0164,0201-0264,0704-0705] 
medium       up 7-00:00:00     11  drain med[0707,0709-0710,0740-0742,0744-0745,0749,0752,0755] 
medium       up 7-00:00:00      8    mix med[0104,0127,0133-0135,0703,0706,0708] 
medium       up 7-00:00:00     10  alloc med[0137-0144,0147-0148] 
medium       up 7-00:00:00    103   idle med[0105-0124,0136,0145-0146,0151-0164,0201-0264,0704-0705] 
long         up 28-00:00:0     11  drain med[0707,0709-0710,0740-0742,0744-0745,0749,0752,0755] 
long         up 28-00:00:0      8    mix med[0104,0127,0133-0135,0703,0706,0708] 
long         up 28-00:00:0     10  alloc med[0137-0144,0147-0148] 
long         up 28-00:00:0    103   idle med[0105-0124,0136,0145-0146,0151-0164,0201-0264,0704-0705] 
critical     up 7-00:00:00     11  drain med[0707,0709-0710,0740-0742,0744-0745,0749,0752,0755] 
critical     up 7-00:00:00      8    mix med[0104,0127,0133-0135,0703,0706,0708] 
critical     up 7-00:00:00     10  alloc med[0137-0144,0147-0148] 
critical     up 7-00:00:00    103   idle med[0105-0124,0136,0145-0146,0151-0164,0201-0264,0704-0705] 
highmem      up 14-00:00:0      1    mix med0402 
highmem      up 14-00:00:0      3   idle med[0401,0403-0404] 
gpu          up 14-00:00:0      2    mix med[0303-0304] 
gpu          up 14-00:00:0      2   idle med[0301-0302]