Files
i6_setups/gen/settings.py
Jörg Barkoczi 9115f50248 add gen
2026-04-21 15:19:51 +02:00

302 lines
10 KiB
Python

# Sisyphus settings file
import getpass
import os.path
import socket
# import sys
# sys.path.append("/u/beck/dev/cachemanager/")

#############################
# Local Settings File Options
#############################

# Container runtime used to wrap worker calls;
# can be "apptainer", "docker", "singularity" or None
CONTAINER_MODE = "apptainer"

# local path or e.g. docker registry image path
# CONTAINER_IMAGE = "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/apptainer/u22_pytorch2.1_onnx_flashlight_0224_jaist_project.sif"
# diff
# CONTAINER_IMAGE = "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/apptainer/u22cuda12.1_pytorch2.2_onnxrt_1.17.1_flashlight_warprnnt_0624.sif"
# runable one - 128baseline + lm
# CONTAINER_IMAGE = "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/apptainer/u22cuda12.1_pytorch2.2_onnxrt_1.17.1_flashlight_warprnnt_0624v3.sif"
# torch 2.7 for wav2vec2
CONTAINER_IMAGE = "/work/asr4/zyang/images/u22cuda12.1_pytorch2.7_onnxrt_1.17.1_flashlight_warprnnt_0624v3_huggingface.sif"
# simon
# CONTAINER_IMAGE = "/work/asr4/berger/apptainer/images/torch-2.8_onnx-1.22.sif"

# File systems to bind-mount into the container,
# in a "<source_path>:<target_path>" format (target defaults to source)
CONTAINER_BINDS = ["/work/asr4", "/work/asr3", "/work/common", "/work/tools22", "/u/corpora", "/work/smt4",
                   "/work/tools", "/u/jxu", "/u/rossenbach", "/u/joerg.barkoczi", "/u/berger", "/u/zyang", "/u/zeyer", "/u/zeineldeen", "/u/zhan.shu", "/run", "/u/enrique.leon.lozano"]

# can be "sge", "slurm" or "pbs" (pbs is experimental)
SUBMIT_ENGINE = "slurm"
# hostname or ip of machine to use for cluster access
SUBMIT_GATEWAY = "cn-04"
# the username (also used below for per-user cache paths)
USER = "joerg.barkoczi"

# List of extra env vars to set "before" sisyphus execution.
# Can for example be used to set the PYTHONPATH to a custom sisyphus
# different to the one installed in the container for debugging / development purposes.
EXTRA_ENV = [
    "PYTHONNOUSERSITE=1",
]

#########################
# Setup Specific Settings
#########################
# Unfortunately needed when using g2p from image:
# NOTE(review): reportedly never used ("diff: but never used") — candidate for removal.
G2P_PATH = "/usr/local/bin/g2p.py"

##########################
# Sisyphus Global Settings
##########################
# Mail address used by Sisyphus notifications; here simply the local user
# name (the mail system resolves it).
try:
    MAIL_ADDRESS = getpass.getuser()
except (KeyError, OSError):
    # getpass.getuser() raises KeyError (via pwd) on older Pythons and
    # OSError on Python 3.13+ when no login name can be determined.
    MAIL_ADDRESS = None
JOB_USE_TAGS_IN_PATH = False
JOB_AUTO_CLEANUP = False
SHOW_JOB_TARGETS = False
PRINT_ERROR = False
DELAYED_CHECK_FOR_WORKER = False
WARNING_ABSPATH = False
SHORT_JOB_NAMES = True
# For debugging set to 1 (serializes graph workers)
# GRAPH_WORKER = 1

# Environment variables preserved from the submitting shell inside workers.
DEFAULT_ENVIRONMENT_KEEP = {'CUDA_VISIBLE_DEVICES', 'HOME', 'PWD', 'SGE_STDERR_PATH', 'SGE_TASK_ID', 'TMP', 'TMPDIR',
                            'USER', "LD_LIBRARY_PATH"}

# Environment variables forced to fixed values inside every worker.
DEFAULT_ENVIRONMENT_SET = {
    'LANG': 'en_US.UTF-8',
    'MKL_NUM_THREADS': 2,
    'OMP_NUM_THREADS': 2,
    'PATH': ':'.join(['/usr/local/sbin', '/usr/local/bin',
                      '/usr/sbin', '/usr/bin',
                      '/sbin', '/bin']),
    'SHELL': '/bin/bash',
    "NUMBA_CACHE_DIR": f"/var/tmp/numba_cache_{USER}",  # used for librosa
    "PYTORCH_KERNEL_CACHE_PATH": f"/var/tmp/",  # used for cuda pytorch
    # NOTE(review): credentials hard-coded in a tracked settings file —
    # consider moving them to an untracked secrets file or env var.
    "MQTT_USERNAME": "i6",
    "MQTT_PASSWORD": "1801",
    "PYTHONNOUSERSITE": "1",  # used for huggingface image
}
###########################
# Sisyphus Code Definitions
###########################
def engine():
    """
    Return the Sisyphus engine selector (SLURM only).

    Both the ``short`` and ``long`` queues map to the same SLURM engine;
    temporarily broken cluster nodes are excluded via ``sbatch -x``.
    """
    from sisyphus.engine import EngineSelector
    from sisyphus.localengine import LocalEngine
    from sisyphus.simple_linux_utility_for_resource_management_engine import SimpleLinuxUtilityForResourceManagementEngine

    # Nodes currently taken out of scheduling (reasons noted inline).
    excluded_nodes = [
        257,
        231,  # works but very slow?
        240,  # random mem illegal access
        222,
        282,
        283,
        803,  # doesnt produce output
        601,  # doesnt produce output
        602,  # doesnt produce output
    ]
    base_rqmt = {'cpu': 1, 'mem': 2, 'gpu': 0, 'time': 1}
    if excluded_nodes:
        node_list = ",".join(f"cn-{node}" for node in excluded_nodes)
        base_rqmt["sbatch_args"] = ["-x", node_list]

    # Only SLURM is used here; 'short' and 'long' are configured identically.
    engines = {
        name: SimpleLinuxUtilityForResourceManagementEngine(
            default_rqmt=base_rqmt,
            gateway=SUBMIT_GATEWAY,
        )
        for name in ('short', 'long')
    }
    return EngineSelector(engines=engines, default_engine='long')
# def engine():
# from sisyphus.engine import EngineSelector
# from sisyphus.localengine import LocalEngine
# from sisyphus.simple_linux_utility_for_resource_management_engine import (
# SimpleLinuxUtilityForResourceManagementEngine,
# )
# temp_exclude = [
# 257,
# 231, # works but very slow?
# 240, # random mem illegal access
# 222,
# 283,
# 803, # doesnt produce output
# 601, # doesnt produce output
# 602, # doesnt produce output
# ]
# default_rqmt = {"cpu": 1, "mem": 4, "time": 1}
# if temp_exclude:
# default_rqmt["sbatch_args"] = ["-x", ",".join([f"cn-{node}" for node in temp_exclude])]
# return EngineSelector(
# engines={
# "short": LocalEngine(cpus=4),
# "long": SimpleLinuxUtilityForResourceManagementEngine(default_rqmt=default_rqmt),
# },
# default_engine="long",
# )
# def check_engine_limits(current_rqmt, task):
# """
# i6 support for gpu_mem
# """
# current_rqmt['time'] = min(168, current_rqmt.get('time', 2))
# if current_rqmt.get('gpu', 0) > 0 and '-p' not in current_rqmt.get('sbatch_args', []):
# if current_rqmt.get("gpu_mem", 0) > 24:
# current_rqmt['sbatch_args'] = ['-p', 'gpu_48gb']
# elif current_rqmt.get("gpu_mem", 0) > 11:
# current_rqmt['sbatch_args'] = ['-p', 'gpu_24gb', "--exclude=cn-233"]
# else:
# current_rqmt['sbatch_args'] = ['-p', 'gpu_11gb']
# #############################################
# # Example to set settings based on alias name
# #############################################
# #aliases = []
# #for prefix in list(task._job._sis_alias_prefixes) + [""]:
# # for alias in task._job.get_aliases() or [""]:
# # aliases.append(prefix + alias)
# #if "gmm_align" in "\t".join(aliases or ""):
# # current_rqmt['sbatch_args'] = ['-p', 'cpu_slow']
# return current_rqmt
# Job classes that are safe to run on the short CPU partition.
CPU_SHORT_JOBLIST = ["AverageTFCheckpointsJob", "GetBestTFCheckpointJob"]


def check_engine_limits(current_rqmt, task):
    """
    Clamp the time requirement and pick a SLURM partition for *task*.

    :param dict current_rqmt: requirement dict (mutated and returned).
    :param task: sisyphus task; its job class name selects the CPU partition.
    :return: the (mutated) requirement dict.
    """
    # SLURM jobs may run at most one week (168 h).
    current_rqmt["time"] = min(168, current_rqmt.get("time", 2))
    extra_args = current_rqmt.get("sbatch_args", [])
    if "-p" in extra_args:
        # A partition was already chosen explicitly; leave it untouched.
        return current_rqmt

    if current_rqmt.get("gpu", 0) > 0:
        gpu_mem = current_rqmt.get("gpu_mem", 0)
        if current_rqmt["time"] <= 1 and gpu_mem <= 24:
            partition = "gpu_test_24gb"
        elif gpu_mem > 24:
            partition = "gpu_48gb"  # alternative: "gpu_80gb"
        elif gpu_mem > 11:
            partition = "gpu_24gb"
        else:
            partition = "gpu_11gb"
        current_rqmt["sbatch_args"] = ["-p", partition] + extra_args
    elif task._job.__class__.__name__ in CPU_SHORT_JOBLIST:
        # CPU job with SSE4 and AVX, known to be short-running.
        current_rqmt["sbatch_args"] = ["-p", "cpu_short"] + extra_args
    return current_rqmt
def build_apptainer_command(call):
    """
    Build the ``apptainer exec`` command line that wraps a worker call.

    :param call: original sisyphus worker argv; everything after the first
        two elements is forwarded into the container.
    :return: full argv list to execute instead of ``call``.
    """
    wrapped = []
    if EXTRA_ENV:
        # Prefix with env(1) so the extra variables exist before sisyphus starts.
        wrapped += ["env", *EXTRA_ENV]
    wrapped += ["apptainer", "exec", "--nv"]
    for mount in CONTAINER_BINDS:
        wrapped += ["--bind", mount]
    wrapped.append(CONTAINER_IMAGE)
    return wrapped + ["sis"] + call[2:]
def build_singularity_command(call):
    """
    Build the ``singularity exec`` command line that wraps a worker call.

    Mirrors ``build_apptainer_command`` but uses the ``singularity`` binary.

    :param call: original sisyphus worker argv; everything after the first
        two elements is forwarded into the container.
    :return: full argv list to execute instead of ``call``.
    """
    env_prefix = ["env"] + EXTRA_ENV if EXTRA_ENV else []
    bind_flags = [token for bind in CONTAINER_BINDS for token in ("--bind", bind)]
    runner = ["singularity", "exec", "--nv"] + bind_flags + [CONTAINER_IMAGE]
    return env_prefix + runner + ["sis"] + call[2:]
def build_docker_command(call):
    """
    Build the ``docker run`` command line that wraps a worker call.

    :param call: original sisyphus worker argv; everything after the first
        two elements is forwarded into the container.
    :return: full argv list to execute instead of ``call``.
    """
    from pwd import getpwnam

    # Fix: resolve uid/gid of the configured USER instead of the hard-coded
    # account 's2322008' (see the USER setting above).
    userid, groupid = getpwnam(USER)[2:4]
    exp_dir = os.path.dirname(__file__)
    # Fix: the original referenced an undefined name ``gpu`` (NameError).
    # Derive the device index from CUDA_VISIBLE_DEVICES, defaulting to
    # device 0. NOTE(review): confirm this matches the scheduler's GPU
    # assignment convention.
    gpu = (os.environ.get("CUDA_VISIBLE_DEVICES") or "0").split(",")[0]
    command = [
        "docker", "run", "-t",
        "--rm",  # delete container after execution
        "-u", "%i:%i" % (userid, groupid),  # numeric ids; usernames fail with LDAP users
        "--runtime=nvidia",
        "--gpus", "device=%s" % gpu,
        "-m", "32g",
        "--shm-size", "16g",
        "-w", exp_dir,
    ]
    for env in EXTRA_ENV:
        command += ["-e", env]
    for bind in CONTAINER_BINDS:
        command += ["-v", bind]
    command += [CONTAINER_IMAGE]
    # NOTE(review): ``sh -c`` expects a single command string; appending the
    # worker argv as separate list elements likely does not behave as
    # intended — verify before relying on docker mode.
    command += ["sh", "-e", "-c"]
    return command + ["sis"] + call[2:]
def worker_wrapper(job, task_name, call):
    """
    All worker calls are passed through this function.
    Is used to wrap the execution call with the correct container command.
    Usually it is not necessary to alter things here,
    but any worker call can be fully customized here.

    :param job: the sisyphus Job owning the task.
    :param str task_name: name of the task about to run.
    :param call: original worker argv.
    :return: the (possibly container-wrapped) argv to execute.
    """
    from sisyphus.engine import EngineSelector
    from sisyphus.localengine import LocalEngine

    # Jobs that must run outside the container image.
    app_blacklist = ["FairseqHydraTrainWav2VecUJob"]
    if type(job).__name__ in app_blacklist:
        return call

    if CONTAINER_MODE == "apptainer":
        command = build_apptainer_command(call)
    elif CONTAINER_MODE == "docker":
        command = build_docker_command(call)
    elif CONTAINER_MODE == "singularity":
        command = build_singularity_command(call)
    else:
        raise ValueError("Invalid CONTAINER_MODE %s" % CONTAINER_MODE)

    tasks = {t.name(): t for t in job.tasks()}
    task = tasks[task_name]
    e = engine()  # Usually EngineSelector, but can be LocalEngine if no settings file is present
    if isinstance(e, EngineSelector):
        # Fix: reuse the selector built above instead of constructing a
        # second one via another engine() call.
        e = e.get_used_engine_by_rqmt(task.rqmt())
    if isinstance(e, LocalEngine):
        # Local tasks run directly, outside the container.
        return call
    return command