# Sisyphus settings file
#
# Per-setup configuration for the Sisyphus workflow manager: container
# runtime, cluster submission engine, and worker environment.

import getpass
import os.path
import socket  # NOTE(review): not used in the visible code — confirm before removing

# import sys
# sys.path.append("/u/beck/dev/cachemanager/")

#############################
# Local Settings File Options
#############################

# Container runtime wrapped around every worker call:
# can be "apptainer", "docker", "singularity" or None
CONTAINER_MODE = "apptainer"

# local path or e.g. docker registry image path
# CONTAINER_IMAGE = "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/apptainer/u22_pytorch2.1_onnx_flashlight_0224_jaist_project.sif"
# diff
# CONTAINER_IMAGE = "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/apptainer/u22cuda12.1_pytorch2.2_onnxrt_1.17.1_flashlight_warprnnt_0624.sif"

# runable one - 128baseline + lm
# CONTAINER_IMAGE = "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/apptainer/u22cuda12.1_pytorch2.2_onnxrt_1.17.1_flashlight_warprnnt_0624v3.sif"

# torch 2.7 for wav2vec2
CONTAINER_IMAGE = "/work/asr4/zyang/images/u22cuda12.1_pytorch2.7_onnxrt_1.17.1_flashlight_warprnnt_0624v3_huggingface.sif"

# simon
# CONTAINER_IMAGE = "/work/asr4/berger/apptainer/images/torch-2.8_onnx-1.22.sif"

# file systems to bind in a "<source_path>:<target_path>" format
# (a bare path is bound to the same path inside the container)
CONTAINER_BINDS = ["/work/asr4", "/work/asr3", "/work/common", "/work/tools22", "/u/corpora", "/work/smt4",
                   "/work/tools", "/u/jxu", "/u/rossenbach", "/u/joerg.barkoczi", "/u/berger", "/u/zyang", "/u/zeyer", "/u/zeineldeen", "/u/zhan.shu","/run", "/u/enrique.leon.lozano"]

# can be "sge", "slurm" or "pbs" (pbs is experimental)
SUBMIT_ENGINE = "slurm"

# hostname or ip of machine to use for cluster access
SUBMIT_GATEWAY = "cn-04"

# the username
USER = "joerg.barkoczi"

# List of extra env vars to set "before" sisyphus execution.
# Can for example be used to set the PYTHONPATH to a custom sisyphus
# different to the one installed in the container for debugging / development purposes
EXTRA_ENV = [
    "PYTHONNOUSERSITE=1",
]
|
|
|
|
#########################
# Setup Specific Settings
#########################

# Unfortunately needed when using g2p from image:
# diff: but never used
G2P_PATH = "/usr/local/bin/g2p.py"

##########################
# Sisyphus Global Settings
##########################

# Local account name used as the notification mail target.
# getpass.getuser() can raise KeyError (failed pwd lookup on older Pythons)
# and raises OSError on Python 3.13+; catch both and disable mail on failure.
try:
    MAIL_ADDRESS = getpass.getuser()
except (KeyError, OSError):
    MAIL_ADDRESS = None

JOB_USE_TAGS_IN_PATH = False
JOB_AUTO_CLEANUP = False
SHOW_JOB_TARGETS = False
PRINT_ERROR = False
DELAYED_CHECK_FOR_WORKER = False

WARNING_ABSPATH = False

SHORT_JOB_NAMES = True

# For debugging to 1
# GRAPH_WORKER = 1

# Environment variables passed through from the submit environment to workers.
DEFAULT_ENVIRONMENT_KEEP = {'CUDA_VISIBLE_DEVICES', 'HOME', 'PWD', 'SGE_STDERR_PATH', 'SGE_TASK_ID', 'TMP', 'TMPDIR',
                            'USER', "LD_LIBRARY_PATH"}

# Environment variables set explicitly for every worker.
DEFAULT_ENVIRONMENT_SET = {
    'LANG': 'en_US.UTF-8',
    'MKL_NUM_THREADS': 2,
    'OMP_NUM_THREADS': 2,
    'PATH': ':'.join(['/usr/local/sbin', '/usr/local/bin',
                      '/usr/sbin', '/usr/bin',
                      '/sbin', '/bin']),
    'SHELL': '/bin/bash',
    "NUMBA_CACHE_DIR": f"/var/tmp/numba_cache_{USER}",  # used for librosa
    "PYTORCH_KERNEL_CACHE_PATH": f"/var/tmp/",  # used for cuda pytorch
    # diff
    # NOTE(review): credentials committed in plain text — consider a secret store.
    "MQTT_USERNAME": "i6",
    "MQTT_PASSWORD": "1801",
    "PYTHONNOUSERSITE": "1",  # used for huggingface image
}
|
|
|
|
|
|
###########################
|
|
# Sisyphus Code Definitions
|
|
###########################
|
|
|
|
def engine():
    """
    Build the Sisyphus engine used for job submission.

    Returns an EngineSelector holding two SLURM engines ("short" and
    "long", currently configured identically) that submit via
    SUBMIT_GATEWAY and exclude a list of known-bad cluster nodes.
    """
    from sisyphus.engine import EngineSelector
    from sisyphus.simple_linux_utility_for_resource_management_engine import (
        SimpleLinuxUtilityForResourceManagementEngine,
    )

    # Nodes temporarily excluded from scheduling due to observed problems.
    temp_exclude = [
        257,
        231,  # works but very slow?
        240,  # random mem illegal access
        222,
        282,
        283,
        803,  # doesn't produce output
        601,  # doesn't produce output
        602,  # doesn't produce output
    ]

    default_rqmt = {"cpu": 1, "mem": 2, "gpu": 0, "time": 1}

    if temp_exclude:
        # "-x" tells sbatch to exclude the listed hosts.
        default_rqmt["sbatch_args"] = ["-x", ",".join(f"cn-{node}" for node in temp_exclude)]

    # diff: only slurm
    return EngineSelector(
        engines={
            'short': SimpleLinuxUtilityForResourceManagementEngine(
                default_rqmt=default_rqmt,
                gateway=SUBMIT_GATEWAY,
            ),
            'long': SimpleLinuxUtilityForResourceManagementEngine(
                default_rqmt=default_rqmt,
                gateway=SUBMIT_GATEWAY,
            )},
        default_engine='long'
    )
|
|
|
|
# def engine():
|
|
# from sisyphus.engine import EngineSelector
|
|
# from sisyphus.localengine import LocalEngine
|
|
# from sisyphus.simple_linux_utility_for_resource_management_engine import (
|
|
# SimpleLinuxUtilityForResourceManagementEngine,
|
|
# )
|
|
|
|
# temp_exclude = [
|
|
# 257,
|
|
# 231, # works but very slow?
|
|
# 240, # random mem illegal access
|
|
# 222,
|
|
# 283,
|
|
# 803, # doesnt produce output
|
|
# 601, # doesnt produce output
|
|
# 602, # doesnt produce output
|
|
# ]
|
|
|
|
# default_rqmt = {"cpu": 1, "mem": 4, "time": 1}
|
|
# if temp_exclude:
|
|
# default_rqmt["sbatch_args"] = ["-x", ",".join([f"cn-{node}" for node in temp_exclude])]
|
|
|
|
# return EngineSelector(
|
|
# engines={
|
|
# "short": LocalEngine(cpus=4),
|
|
# "long": SimpleLinuxUtilityForResourceManagementEngine(default_rqmt=default_rqmt),
|
|
# },
|
|
# default_engine="long",
|
|
# )
|
|
|
|
# def check_engine_limits(current_rqmt, task):
|
|
# """
|
|
# i6 support for gpu_mem
|
|
# """
|
|
# current_rqmt['time'] = min(168, current_rqmt.get('time', 2))
|
|
# if current_rqmt.get('gpu', 0) > 0 and '-p' not in current_rqmt.get('sbatch_args', []):
|
|
# if current_rqmt.get("gpu_mem", 0) > 24:
|
|
# current_rqmt['sbatch_args'] = ['-p', 'gpu_48gb']
|
|
# elif current_rqmt.get("gpu_mem", 0) > 11:
|
|
# current_rqmt['sbatch_args'] = ['-p', 'gpu_24gb', "--exclude=cn-233"]
|
|
# else:
|
|
# current_rqmt['sbatch_args'] = ['-p', 'gpu_11gb']
|
|
|
|
# #############################################
|
|
# # Example to set settings based on alias name
|
|
# #############################################
|
|
|
|
# #aliases = []
|
|
# #for prefix in list(task._job._sis_alias_prefixes) + [""]:
|
|
# # for alias in task._job.get_aliases() or [""]:
|
|
# # aliases.append(prefix + alias)
|
|
|
|
# #if "gmm_align" in "\t".join(aliases or ""):
|
|
# # current_rqmt['sbatch_args'] = ['-p', 'cpu_slow']
|
|
|
|
# return current_rqmt
|
|
|
|
# Job classes that are cheap enough for the dedicated short CPU partition.
CPU_SHORT_JOBLIST = ["AverageTFCheckpointsJob", "GetBestTFCheckpointJob"]


def check_engine_limits(current_rqmt, task):
    """
    Clamp job requirements and route jobs to the matching SLURM partition.

    :param dict current_rqmt: requirement dict (may contain "time", "gpu",
        "gpu_mem", "sbatch_args"); modified in place and returned.
    :param task: the Sisyphus task; only its job's class name is inspected,
        and only on the CPU path.
    :return: the (possibly updated) requirement dict.
    """
    # Cluster-wide maximum runtime is one week (168 h).
    current_rqmt["time"] = min(168, current_rqmt.get("time", 2))
    curr_sbatch_args = current_rqmt.get("sbatch_args", [])
    # Only choose a partition when none was requested explicitly via "-p".
    if "-p" not in curr_sbatch_args:
        if current_rqmt.get("gpu", 0) > 0:
            gpu_mem = current_rqmt.get("gpu_mem", 0)
            if current_rqmt["time"] <= 1 and gpu_mem <= 24:
                # Short small-GPU jobs go to the test partition.
                current_rqmt["sbatch_args"] = ["-p", "gpu_test_24gb"] + curr_sbatch_args
            elif gpu_mem > 24:
                current_rqmt["sbatch_args"] = ["-p", "gpu_48gb"] + curr_sbatch_args  # ["-p", "gpu_80gb"] + curr_sbatch_args
            elif gpu_mem > 11:
                current_rqmt["sbatch_args"] = ["-p", "gpu_24gb"] + curr_sbatch_args
            else:
                current_rqmt["sbatch_args"] = ["-p", "gpu_11gb"] + curr_sbatch_args
        else:
            # cpu with SSE4 and AVX
            if task._job.__class__.__name__ in CPU_SHORT_JOBLIST:
                current_rqmt["sbatch_args"] = ["-p", "cpu_short"] + curr_sbatch_args
    return current_rqmt
|
|
|
|
|
|
def build_apptainer_command(call):
    """
    Apptainer specific launch code

    Prefixes the worker call with an "apptainer exec --nv" invocation of
    CONTAINER_IMAGE, adding one "--bind" argument per configured bind and
    an optional "env" prefix carrying EXTRA_ENV.
    """
    env_prefix = ["env"] + EXTRA_ENV if EXTRA_ENV else []
    bind_args = [arg for mount in CONTAINER_BINDS for arg in ("--bind", mount)]
    launcher = env_prefix + ["apptainer", "exec", "--nv"] + bind_args + [CONTAINER_IMAGE]
    return launcher + ["sis"] + call[2:]
|
|
|
|
|
|
def build_singularity_command(call):
    """
    Singularity specific launch code

    Prefixes the worker call with a "singularity exec --nv" invocation of
    CONTAINER_IMAGE, adding one "--bind" argument per configured bind and
    an optional "env" prefix carrying EXTRA_ENV.
    """
    env_prefix = ["env"] + EXTRA_ENV if EXTRA_ENV else []
    bind_args = [arg for mount in CONTAINER_BINDS for arg in ("--bind", mount)]
    launcher = env_prefix + ["singularity", "exec", "--nv"] + bind_args + [CONTAINER_IMAGE]
    return launcher + ["sis"] + call[2:]
|
|
|
|
|
|
def build_docker_command(call):
    """
    Docker specific launch code

    Builds a "docker run" prefix that executes the Sisyphus worker call
    inside CONTAINER_IMAGE with the configured binds, env vars, GPU access
    and resource limits.
    """
    from pwd import getpwnam

    # diff: USER, 's2322008'
    # NOTE(review): uid/gid is resolved from a hard-coded account instead of
    # the USER setting above — confirm this is intentional.
    userid, groupid = getpwnam('s2322008')[2:4]
    exp_dir = os.path.dirname(__file__)
    # FIX: "gpu" was previously an undefined name (NameError on every call).
    # Derive the device from CUDA_VISIBLE_DEVICES, defaulting to device 0 —
    # TODO confirm this matches how the scheduler assigns GPUs here.
    gpu = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
    command = [
        "docker", "run", "-t",
        "--rm",  # delete container after execution
        "-u", "%i:%i" % (userid, groupid),  # passing the username directly does not work with LDAP users
        "--runtime=nvidia",
        "--gpus", "device=%s" % gpu,
        "-m", "32g",
        "--shm-size", "16g",
        "-w", exp_dir,
    ]
    for env in EXTRA_ENV:
        command += ["-e", env]
    for bind in CONTAINER_BINDS:
        command += ["-v", bind]
    command += [CONTAINER_IMAGE]
    command += ["sh", "-e", "-c"]
    return command + ["sis"] + call[2:]
|
|
|
|
|
|
def worker_wrapper(job, task_name, call):
    """
    All worker calls are passed through this function.
    Is used to wrap the execution call with the correct container command.
    Usually it is not necessary to alter things here,
    but any worker call can be fully customized here.

    :param job: the Sisyphus job the task belongs to
    :param str task_name: name of the task being launched
    :param list call: the original worker command line
    :return: the (possibly containerized) command line to execute
    """
    from sisyphus.engine import EngineSelector
    from sisyphus.localengine import LocalEngine

    # Jobs that must run outside the container.
    app_blacklist = ["FairseqHydraTrainWav2VecUJob"]
    if type(job).__name__ in app_blacklist:
        return call

    if CONTAINER_MODE == "apptainer":
        command = build_apptainer_command(call)
    elif CONTAINER_MODE == "docker":
        command = build_docker_command(call)
    elif CONTAINER_MODE == "singularity":
        command = build_singularity_command(call)
    else:
        raise ValueError("Invalid CONTAINER_MODE %s" % CONTAINER_MODE)

    ts = {t.name(): t for t in job.tasks()}
    t = ts[task_name]
    e = engine()  # Usually EngineSelector, but can be LocalEngine if no settings file is present
    if isinstance(e, EngineSelector):
        # Reuse the selector instead of constructing a second engine()
        # (the original called engine() twice).
        e = e.get_used_engine_by_rqmt(t.rqmt())
    if isinstance(e, LocalEngine):
        # Local execution runs uncontainerized.
        return call
    else:
        return command
|