Commit 2a315829 authored by HOURDIN Christophe's avatar HOURDIN Christophe
Browse files

Print CPU info for JEAN-ZAY at the beginning & end of a job: JEAN-ZAY.cpu_info.sh

parent 03a16503
#!/bin/sh
# Remove every artifact of a given job: its job directory, executable
# directory, output directory and the matching restart directory.
# Argument: $1 - job name (the restart name is the job name with its
#                first 9 characters stripped — TODO confirm naming scheme)
# Required env vars: EXPDIR, EXPER, EXEDIR, OUTDIR, RSTDIR
#
# Every rm -rf target is quoted and guarded with ${var:?} so an unset or
# empty variable aborts the script instead of deleting the wrong path.
printf "\n\n remove %s \n\n" "$1"
echo "rm -rf ${EXPDIR}/${EXPER}/jobs/$1"
rm -rf "${EXPDIR:?}/${EXPER:?}/jobs/${1:?}"
echo "rm -rf ${EXEDIR}/$1"
rm -rf "${EXEDIR:?}/${1:?}"
echo "rm -rf ${OUTDIR}/$1"
rm -rf "${OUTDIR:?}/${1:?}"
# restart name: drop the first 9 characters of the job name
rst1=$(echo "$1" | cut -c 10-)
echo "rm -rf ${RSTDIR}/${rst1}"
rm -rf "${RSTDIR:?}/${rst1:?}"
echo ""
echo ""
#!/bin/sh
# Rename every artifact of a job: its job directory, executable directory,
# output directory and the matching restart directory.
# Arguments: $1 - old job name, $2 - new job name
# Required env vars: EXPDIR, EXPER, EXEDIR, OUTDIR, RSTDIR
printf "\n\n================================================================================\n"
printf "===== rename %s %s =====\n" "$1" "$2"
printf "================================================================================\n"
# BUG FIX: each mv was previously issued twice (a duplicated block); the
# second call always failed because the source had already been moved.
# Every rename is now performed exactly once, echoed before execution.
printf "mv %s \n %s\n\n" "${EXPDIR}/${EXPER}/jobs/$1" "${EXPDIR}/${EXPER}/jobs/$2"
mv "${EXPDIR:?}/${EXPER:?}/jobs/${1:?}" "${EXPDIR}/${EXPER}/jobs/${2:?}"
printf "mv %s \n %s\n\n" "${EXEDIR}/$1" "${EXEDIR}/$2"
mv "${EXEDIR:?}/$1" "${EXEDIR}/$2"
printf "mv %s \n %s\n\n" "${OUTDIR}/$1" "${OUTDIR}/$2"
mv "${OUTDIR:?}/$1" "${OUTDIR}/$2"
# restart names: drop the first 9 characters of each job name
rst1=$(echo "$1" | cut -c 10-)
rst2=$(echo "$2" | cut -c 10-)
printf "mv %s \n %s\n\n" "${RSTDIR}/${rst1}" "${RSTDIR}/${rst2}"
mv "${RSTDIR:?}/${rst1:?}" "${RSTDIR}/${rst2:?}"
#!/bin/sh -x
# Remove every artifact of a given job: its job directory, executable
# directory, output directory and the matching restart directory.
# Argument: $1 - job name (restart name = job name minus first 9 chars)
# Required env vars: EXPDIR, EXPER, EXEDIR, OUTDIR, RSTDIR
printf "\n\n================================================================================\n"
printf "===== remove %s =====\n" "$1"
printf "================================================================================\n"
# rm -rf targets are quoted and guarded with ${var:?}: an unset or empty
# variable aborts instead of expanding to a dangerous partial path.
rm -rf "${EXPDIR:?}/${EXPER:?}/jobs/${1:?}"
rm -rf "${EXEDIR:?}/${1:?}"
rm -rf "${OUTDIR:?}/${1:?}"
# restart name: drop the first 9 characters of the job name
rst1=$(echo "$1" | cut -c 10-)
rm -rf "${RSTDIR:?}/${rst1:?}"
......@@ -12,7 +12,13 @@
#MSUB -j oe
#======= IreneSKL (skylake) : 24 cores per node
# Nodes: 1 656
# Total cores: 79 488
#======= IreneSKL (skylake)
### CPUs: 2x24-cores Intel Skylake@2.7GHz (AVX512) Cores/Node: 48
### RAM/Core: 3.75GB => RAM/Node: 180GB
#MSUB -q skylake
#MSUB -c 1 # number of cores per mpi task
#MSUB -n 96 # Total number of mpi task to use
......@@ -28,7 +34,7 @@
# #MSUB -Q test # 1800 s max
# #MSUB -T 1800
#======= Production Queue
#MSUB -T 14400 # 4 hours for 144 cores & 6 nodes (can11sen2)
#MSUB -T 14400 # 4 hours for 144 cores & 6 nodes (can11sen2cp)
#======= Various
......
#!/bin/bash
# Print SLURM accounting / status information for a given job.
# Argument: $1 - jobid to inspect.
# When this script itself runs inside a SLURM allocation (SLURM_JOB_ID is
# set), live-job queries (scontrol, and sstat when verbose) are issued too.
verbose=false   # set to true for the exhaustive sacct/sstat field dump

# Detect whether we are currently running inside a SLURM job:
# ${SLURM_JOB_ID+x} is non-empty iff the variable is set (even if empty).
if [ -z "${SLURM_JOB_ID+x}" ]; then
  running=false
else
  running=true
fi
printf "\n\nrunning = %s\n" "${running}"
printf "verbose = %s\n" "${verbose}"

printf "\n\n pour avoir des infos sur un job en cours d'execution : \n"
printf " sacct - displays accounting data for all jobs and job steps in the Slurm job accounting log or Slurm database\n"
printf " avec 'sacct -e' et 'man sacct' pour la liste des infos disponibles\n"
printf " scontrol show job JOBID\n"
printf "\n\n pour avoir des infos sur un job pendant ou après : \n"
printf " sstat - Display various status information of a running job/step\n"
printf " avec 'sstat -e' et 'man sstat' pour la liste des infos disponibles\n"

if [ "$#" -eq 1 ]; then
  JOBID2=$1
  printf "\n\nJOBID = %s\n\n" "${JOBID2}"
  echo ""
  get_Jobname=$(sacct -j "${JOBID2}" -X -n -P --format="Jobname")
  echo "Jobname : ${get_Jobname}"
  echo ""
  get_WorkDir=$(sacct -j "${JOBID2}" -X -n -P --format="WorkDir")
  echo "WorkDir : ${get_WorkDir}"

  printf "\n\n> sacct -j %s\n\n" "${JOBID2}"
  sacct -j "${JOBID2}"

  printf "\n\n> sacct -j %s -X --format='Cluster,JobID,JobName,State,Partition,QOS%%14,Account,User,ReqCPUS,AllocCPUS,AllocNodes,NodeList'\n\n" "${JOBID2}"
  sacct -j "${JOBID2}" -X --format="Cluster,JobID,JobName,State,Partition,QOS%14,Account,User,ReqCPUS,AllocCPUS,AllocNodes,NodeList%25"

  printf "\n\n> sacct -j %s -X --format='CPUTime,CPUTimeRAW,Elapsed,Timelimit,TimelimitRaw,Submit%%22,Start%%22,End%%22'\n\n" "${JOBID2}"
  sacct -j "${JOBID2}" -X --format="CPUTime,CPUTimeRAW,Elapsed,Timelimit,TimelimitRaw,Submit%22,Start%22,End%22"

  printf "\n\n> sacct -j %s -X -n -P --format='AllocTRES'\n\n" "${JOBID2}"
  sacct -j "${JOBID2}" -X -n -P --format="AllocTRES"
  echo ""
  echo ""

  if ${verbose}; then
    printf "\n\n\nListe exhaustive des infos sur un jobid avec la commande sacct\n"
    # Every sacct field listed by 'sacct -e'; dumped one per line below.
    sacct_fields=( Account AdminComment AllocCPUS AllocGRES AllocNodes \
      AllocTRES AssocID AveCPU AveCPUFreq AveDiskRead AveDiskWrite AvePages \
      AveRSS AveVMSize BlockID Cluster Comment ConsumedEnergy \
      ConsumedEnergyRaw CPUTime CPUTimeRAW DerivedExitCode Elapsed \
      ElapsedRaw Eligible End ExitCode GID Group JobID JobIDRaw JobName \
      Layout MaxDiskRead MaxDiskReadNode MaxDiskReadTask MaxDiskWrite \
      MaxDiskWriteNode MaxDiskWriteTask MaxPages MaxPagesNode MaxPagesTask \
      MaxRSS MaxRSSNode MaxRSSTask MaxVMSize MaxVMSizeNode MaxVMSizeTask \
      McsLabel MinCPU MinCPUNode MinCPUTask NCPUS NNodes NodeList NTasks \
      Priority Partition QOS QOSRAW ReqCPUFreq ReqCPUFreqMin ReqCPUFreqMax \
      ReqCPUFreqGov ReqCPUS ReqGRES ReqMem ReqNodes ReqTRES Reservation \
      ReservationId Reserved ResvCPU ResvCPURAW Start State Submit \
      Suspended SystemCPU SystemComment Timelimit TimelimitRaw TotalCPU \
      TRESUsageInAve TRESUsageInMax TRESUsageInMaxNode TRESUsageInMaxTask \
      TRESUsageInMin TRESUsageInMinNode TRESUsageInMinTask TRESUsageInTot \
      TRESUsageOutAve TRESUsageOutMax TRESUsageOutMaxNode \
      TRESUsageOutMaxTask TRESUsageOutMin TRESUsageOutMinNode \
      TRESUsageOutMinTask TRESUsageOutTot UID User UserCPU WCKey WCKeyID \
      WorkDir )
    for option in "${sacct_fields[@]}"; do
      get1=$(sacct -j "${JOBID2}" -X -n -P --format="${option}")
      echo "${option} : ${get1}"
    done
  fi

  # Queries that only make sense while the job is still executing.
  if ${running}; then
    printf "\n\n> scontrol show job %s\n\n" "${JOBID2}"
    scontrol show job "${JOBID2}"
    # printf "\n\n> sstat -j ${JOBID2}\n\n"
    # sstat -j ${JOBID2}
    if ${verbose}; then
      printf "\n\n\nListe exhaustive des infos sur un jobid en cours d'execution avec la commande sstat\n"
      # Every sstat field listed by 'sstat -e'; dumped one per line below.
      sstat_fields=( AveCPU AveCPUFreq AveDiskRead AveDiskWrite AvePages \
        AveRSS AveVMSize ConsumedEnergy ConsumedEnergyRaw JobID \
        MaxDiskRead MaxDiskReadNode MaxDiskReadTask MaxDiskWrite \
        MaxDiskWriteNode MaxDiskWriteTask MaxPages MaxPagesNode \
        MaxPagesTask MaxRSS MaxRSSNode MaxRSSTask MaxVMSize MaxVMSizeNode \
        MaxVMSizeTask MinCPU MinCPUNode MinCPUTask Nodelist NTasks Pids \
        ReqCPUFreq ReqCPUFreqMin ReqCPUFreqMax ReqCPUFreqGov \
        TRESUsageInAve TRESUsageInMax TRESUsageInMaxNode \
        TRESUsageInMaxTask TRESUsageInMin TRESUsageInMinNode \
        TRESUsageInMinTask TRESUsageInTot TRESUsageOutAve TRESUsageOutMax \
        TRESUsageOutMaxNode TRESUsageOutMaxTask TRESUsageOutMin \
        TRESUsageOutMinNode TRESUsageOutMinTask TRESUsageOutTot )
      for option in "${sstat_fields[@]}"; do
        get1=$(sstat -j "${JOBID2}" -n -P --format="${option}")
        echo "${option} : ${get1}"
      done
    fi
  fi
else
  printf "\n\n\n Passez le jobid en argument \n\n\n\n"
fi
......@@ -10,7 +10,7 @@ export STOREDIR=${STORE}
#-------------------------------------------------------------------------------
# croco
export CROCO_SOURCE="/gpfswork/rech/eee/reee084/locean_croco_pisces/croco_locean_v1.00/OCEAN"
export CROCO_SCRATCH="${SCRATCHDIR}/croco_scratch_${EXP_NAME}"
export CROCO_SCRATCH="${SCRATCHDIR}/croco_scratch/${EXP_NAME}${TEST_NAME}"
export MPIF90="mpiifort"
export FC="ifort"
export NETCDFLIB="-lnetcdff -lnetcdf"
......@@ -29,8 +29,8 @@ if [ "${MODEL_OCE}" = "nemo" ] ; then
export BDY="bdy/tr12_quik/from_obc"
export DATA="/ccc/work/cont005/ra0542/hourdinc/data"
elif [ "${MODEL_OCE}" = "croco" ] ; then
# export INDIR="${WORKDIR}/${CONFIG}/inputs"
export INDIR="/ccc/work/cont005/gen1140/chabertp/can11sen2_croco/inputs"
export INDIR="${WORKDIR}/${CONFIG}/inputs"
# export INDIR="/ccc/work/cont005/gen1140/chabertp/can11sen2_croco/inputs"
fi
#-------------------------------------------------------------------------------
......
......@@ -14,25 +14,31 @@
#-------------------------------------------------------------------------------
# Processors (scalar or CPU partition) : 40 cores per node
# Processors (scalar or CPU partition) : 1 node = 40 cores & 40x4 = 160 Go
#-------------------------------------------------------------------------------
# 1528 nodes
# 1 node = 2 processors Intel Cascade Lake 6248 (20 cores à 2,5 GHz)
# => 1 node = 40 cores per node
# 1 node = 192 Go
# http://www.idris.fr/jean-zay/cpu/jean-zay-cpu-hw.html
#
# if more than 1 node is reserved, the job will be executed in exclusive mode
# for example: 41 cores asked => 2 nodes (80 cores) reserved
# 1 noeud contient 2 processeurs : processors Intel Cascade Lake 6248
# 1 processeur contient 20 coeurs à 2.5 GHz
# 1 coeur peut faire tourner 2 processus (cpu ou thread) en hyperthreading
# 1 coeur dispose de 4Go de mémoire
#
# => 1 noeud = 40 coeurs = 80 threads ou cpu = 160 Go
#
# "scontrol show job $JOBID" give all informations about the job (memory, core, node, time... )
#-------------------------------------------------------------------------------
#SBATCH --ntasks=96 # Total number of mpi task to use
#SBATCH --ntasks-per-node=40 # number of mpi task per node
# for Slurm, "multithread" = hyperthreading.
#SBATCH --hint=nomultithread # 1 mpi task per core
#SBATCH --ntasks=120 # Total number of mpi task to use
#SBATCH --ntasks-per-node=40 # number of mpi task per node
#SBATCH --cpus-per-task=1 # --cpus-per-task=1 (default)
#SBATCH --hint=nomultithread # 1 mpi task per core # for Slurm, "multithread" = hyperthreading.
# pour avoir plus de mémoire que la mémoire réservée par le nombre de coeur (4Go / coeur)
# on peut réserver N noeuds (N * 160 Go) avec :
# ntasks-per-node = ntasks / N
#-------------------------------------------------------------------------------
# Partition
# partitions Slurm CPU
#-------------------------------------------------------------------------------
# http://www.idris.fr/jean-zay/cpu/jean-zay-cpu-exec_partition_slurm.html
......@@ -40,15 +46,15 @@
###SBATCH --partition=visu # visualization node : no time cpu used / time default=00:10:00 / time limit < 01:00:00
###SBATCH --partition=archive # no time cpu used / time default=02:00:00 / time limit < 20:00:00
###SBATCH --partition=cpu_p1 # default (time limit = HH:MM:SS ≤ 100:00:00)
#SBATCH --partition=cpu_p1 # time limit = HH:MM:SS ≤ 100:00:00 (default)
# time limit | ressources limit
# | per job | per user | per QoS (Quality of Service)
###SBATCH --qos=qos_cpu-t3 (défaut) # 20h | 512
###SBATCH --qos=qos_cpu-t4 # 100h | 1 | 32 | 28
#SBATCH --qos=qos_cpu-dev # 2h | 128 | 128 | 1000
# if cpu_p1 partition : QoS (Quality of Service) choice
# time limit | ressources limit per job
###SBATCH --qos=qos_cpu-t3 # 20h | 512 nodes = 20480 cores (default)
###SBATCH --qos=qos_cpu-t4 # 100h | 4 nodes = 160 cores
#SBATCH --qos=qos_cpu-dev # 2h | 128 nodes = 5120 cores
#SBATCH --time=00:10:00 # max cpu time
#SBATCH --time=02:00:00 # max cpu time
#-------------------------------------------------------------------------------
......@@ -57,12 +63,26 @@
#SBATCH --account eee@cpu
#===============================================================================
umask 022
set -u
#===============================================================================
#if ! [ -z ${SLURM_JOB_ID+x} ]; then
# echo ""
# echo "date_chris : `date "+%Y%m%d-%H:%M:%S"`"
# echo "scontrol show job ${SLURM_JOB_ID}"
# scontrol show job ${SLURM_JOB_ID}
# printf "\n\n\n\n"
#
#fi
#===============================================================================
module unload intel-compilers intel-mpi
module load intel-compilers/19.0.4
module load intel-mpi/19.0.4
......
......@@ -16,7 +16,8 @@ do
if [ ${AGRIFZ} -eq 0 ]; then # AGRIF=false
lnfile ${INDIR}/ini/can11_rst_bio_fromcan11bio1.nc${suff} croco_ini.nc${suff}
else # AGRIF=true
lnfile ${INDIR}/ini/can11sen2_rst_bio_rstphy8year.nc${suff} croco_ini.nc${suff}
# lnfile ${INDIR}/ini/can11sen2_rst_bio_rstphy8year.nc${suff} croco_ini.nc${suff}
lnfile ${INDIR}/ini/can11sen2_rst_bio.nc${suff} croco_ini.nc${suff}
fi
else
lnfile ${RSTDIR_IN}/restart_oce_${EXP_NAME}_*.nc${suff} croco_ini.nc${suff}
......
......@@ -31,6 +31,18 @@ ${MACHINE_STOCKAGE} mkdir -p ${RSTDIR_OUT}
#===============================================================================
# Step0 : jobid info at start time
#===============================================================================
echo ""
echo "================================================================================"
echo " Step0 : jobid info at start time..."
echo "================================================================================"
${SCRIPTDIR}/${COMPUTER}.cpu_info.sh ${SLURM_JOB_ID}
#===============================================================================
# Step 1 : get_file step
#===============================================================================
......@@ -39,7 +51,7 @@ if [ ${LOADL_STEP_NAME} == "get_file" ] || [ ${LOADL_STEP_NAME} == "XXX" ]; then
echo ""
echo ""
echo "================================================================================"
echo " Step0 : common printing..."
echo " common printing..."
echo "================================================================================"
### some printings
......@@ -163,26 +175,52 @@ if [ ${LOADL_STEP_NAME} == "put_file" ] || [ ${LOADL_STEP_NAME} == "XXX" ]; then
${USE_OCE} && { for file in ${PUT_FILES_OCE}; do cpfile2 ${file} ${JOBDIR}; done; echo ""; }
${USE_ATM} && { for file in ${PUT_FILES_ATM}; do cpfile2 ${file} ${JOBDIR}; done; echo ""; }
FILES_JOB="${jobname} ${listing_root_name}*"
cd ${JOBDIR_ROOT}; for file in ${FILES_JOB}; do mvfile2 ${file} ${JOBDIR}; done; cd -; echo "";
# FILES_JOB="${jobname} ${listing_root_name}*"
# cd ${JOBDIR_ROOT}; for file in ${FILES_JOB}; do mvfile2 ${file} ${JOBDIR}; done; cd -; echo "";
cd ${JOBDIR_ROOT}
mvfile2 ${jobname} ${JOBDIR}/submitjob.sh
mvfile2 ${listing_root_name}* ${JOBDIR}/listing.txt
cd -
echo ""
echo ""
fi # Step3
#===============================================================================
# Step 4 : jobid info at end time
#===============================================================================
echo ""
echo "================================================================================"
echo " Step4 : jobid info at end time..."
echo "================================================================================"
${SCRIPTDIR}/${COMPUTER}.cpu_info.sh ${SLURM_JOB_ID}
echo ""
echo ""
echo "date_chris : `date "+%Y%m%d-%H:%M:%S"`"
echo ""
echo ""
#===============================================================================
# Step 4 : NEXT job!
# Step 5 : NEXT job!
#===============================================================================
if [ ${LOADL_STEP_NAME} == "put_file" ] || [ ${LOADL_STEP_NAME} == "XXX" ]; then
echo ""
echo "================================================================================"
echo " Step4 : Next job..."
echo " Step5 : Next job..."
echo "================================================================================"
if ${MODE_TEST} ; then # Test Mode
echo ""
echo ""
echo" MODE_TEST=${MODE_TEST} Test Mode => Just one job => STOP ."
echo "MODE_TEST=${MODE_TEST} => Just one job => END "
echo ""
echo ""
else # Production Mode
......
......@@ -25,12 +25,19 @@ export MODE_TEST=true # for running different tests in the same exp with diff
# export TEST_NAME="_test_sans_AGRIF_6x24_procs_pdt_400_rst_sans_AGRIF"
# export TEST_NAME="_test_sans_AGRIF_6x24_procs_pdt_600_rst_sans_AGRIF"
# export TEST_NAME="_test_12x12"
export TEST_NAME="_test_sans_AGRIF_4x24_procs_pdt_600_rst_sans_AGRIF"
# export TEST_NAME="_test_scripts1"
# export TEST_NAME="_test_sans_AGRIF_4x24_procs_pdt_400_rst_sans_AGRIF"
# export TEST_NAME="_test_sans_AGRIF_3x40_procs_pdt_400_rst_sans_AGRIF"
# export TEST_NAME="_test_sans_AGRIF_5x24_procs_pdt_400_rst_sans_AGRIF"
# export TEST_NAME="_test_sans_AGRIF_3x40_procs_pdt_400_rst_sans_AGRIF_sans_PISCES"
# export TEST_NAME="_test_sans_AGRIF_4x30_procs_pdt_400_rst_sans_AGRIF_sans_PISCES"
# export TEST_NAME="_test_sans_AGRIF_4x30_procs_pdt_400_rst_sans_AGRIF"
export TEST_NAME="_test_avec_AGRIF_4x30_procs_pdt_600_rst"
# export TEST_NAME="_test_sans_AGRIF_2x12_procs_pdt_400_rst_sans_AGRIF"
# export TEST_NAME="_test_2x12_avec_2_noeuds"
export TEST_SCRIPT=false # the job is not submited
export TEST_FEW_NTIMES=false # replace the job timestep number by DNTIMES
export TEST_FEW_NTIMES=true # replace the job timestep number by DNTIMES
export DNTIMES=1 # number of timsteps for the parent grid ( wich corresponds to 1 month in the script)
......@@ -47,13 +54,13 @@ export USE_OCE=true
# WARNING : if AGRIFZ=0 et USE_AGRIF_2WAY=true... compilation error?? ... wetdry_.f(282): error #6404:
# BIOLOGY
export USE_BIOLOGY=true
export USE_BIOLOGY=false
export USE_PISCES=true
# MPI
export USE_MPI=true
export NPROC_X=4
export NPROC_Y=24
export NPROC_Y=30
# OpenMP
export USE_OPENMP=false
......@@ -75,7 +82,7 @@ export USE_OCE=true
fi
# files to save in ascii job dir for archive after running
export PUT_FILES_OCE="croco.in* out_run.txt"
export PUT_FILES_OCE="croco.in* out_run.txt out_cpp.txt"
${USE_BIOLOGY} && { export PUT_FILES_OCE="${PUT_FILES_OCE} kRGB61.txt* namelist_pisces_cfg* namelist_pisces_ref* output.namelist.pis*"; }
[ ${AGRIFZ} -ge 1 ] && { export PUT_FILES_OCE="${PUT_FILES_OCE} AGRIF_FixedGrids.in"; }
......
......@@ -9,15 +9,15 @@ export YEAR_BEGIN_EXP=1001
export MONTH_BEGIN_EXP=1
export DAY_BEGIN_EXP=1
# Duration of the Experiment
export EXP_DUR_MTH=$(( 1 * 1 ))
# export EXP_DUR_MTH=0
export EXP_DUR_DAY=0
#export EXP_DUR_MTH=$(( 1 * 1 ))
export EXP_DUR_MTH=0
export EXP_DUR_DAY=5
# Period of Job
export YEAR_BEGIN_JOB=1001
export MONTH_BEGIN_JOB=1
export DAY_BEGIN_JOB=1
# Duration of the Job
export JOB_DUR_MTH=1
export JOB_DUR_DAY=0
export JOB_DUR_MTH=0
export JOB_DUR_DAY=5
#
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment