# Sisyphus settings file
import getpass
import os.path
import socket

# import sys
# sys.path.append("/u/beck/dev/cachemanager/")

#############################
# Local Settings File Options
#############################

# can be "apptainer", "docker", "singularity" or None
CONTAINER_MODE = "apptainer"

# local path or e.g. docker registry image path
# CONTAINER_IMAGE = "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/apptainer/u22_pytorch2.1_onnx_flashlight_0224_jaist_project.sif"
# diff
# CONTAINER_IMAGE = "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/apptainer/u22cuda12.1_pytorch2.2_onnxrt_1.17.1_flashlight_warprnnt_0624.sif"
# runnable one - 128baseline + lm
# CONTAINER_IMAGE = "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/apptainer/u22cuda12.1_pytorch2.2_onnxrt_1.17.1_flashlight_warprnnt_0624v3.sif"
# torch 2.7 for wav2vec2
CONTAINER_IMAGE = "/work/asr4/zyang/images/u22cuda12.1_pytorch2.7_onnxrt_1.17.1_flashlight_warprnnt_0624v3_huggingface.sif"
# simon
# CONTAINER_IMAGE = "/work/asr4/berger/apptainer/images/torch-2.8_onnx-1.22.sif"

# file systems to bind in a ":" format
CONTAINER_BINDS = [
    "/work/asr4",
    "/work/asr3",
    "/work/common",
    "/work/tools22",
    "/u/corpora",
    "/work/smt4",
    "/work/tools",
    "/u/jxu",
    "/u/rossenbach",
    "/u/joerg.barkoczi",
    "/u/berger",
    "/u/zyang",
    "/u/zeyer",
    "/u/zeineldeen",
    "/u/zhan.shu",
    "/run",
    "/u/enrique.leon.lozano",
]

# can be "sge", "slurm" or "pbs" (pbs is experimental)
SUBMIT_ENGINE = "slurm"

# hostname or ip of machine to use for cluster access
SUBMIT_GATEWAY = "cn-04"

# the username
USER = "joerg.barkoczi"

# List of extra env vars to set "before" sisyphus execution
# Can for example be used to set the PYTHONPATH to a custom sisyphus
# different to the one installed in the container for debugging / development purposes
EXTRA_ENV = [
    "PYTHONNOUSERSITE=1",
]

#########################
# Setup Specific Settings
#########################

# Unfortunately needed when using g2p from image:
# diff: but never used
G2P_PATH = "/usr/local/bin/g2p.py"

##########################
# Sisyphus Global Settings
##########################

try:
    MAIL_ADDRESS = getpass.getuser()
except (KeyError, OSError):
    # getpass.getuser() raises KeyError on older Python versions and OSError
    # since Python 3.13 when the user name cannot be determined.
    MAIL_ADDRESS = None

JOB_USE_TAGS_IN_PATH = False
JOB_AUTO_CLEANUP = False
SHOW_JOB_TARGETS = False
PRINT_ERROR = False
DELAYED_CHECK_FOR_WORKER = False
WARNING_ABSPATH = False
SHORT_JOB_NAMES = True

# For debugging to 1
# GRAPH_WORKER = 1

DEFAULT_ENVIRONMENT_KEEP = {
    "CUDA_VISIBLE_DEVICES",
    "HOME",
    "PWD",
    "SGE_STDERR_PATH",
    "SGE_TASK_ID",
    "TMP",
    "TMPDIR",
    "USER",
    "LD_LIBRARY_PATH",
}

DEFAULT_ENVIRONMENT_SET = {
    "LANG": "en_US.UTF-8",
    "MKL_NUM_THREADS": 2,
    "OMP_NUM_THREADS": 2,
    "PATH": ":".join(["/usr/local/sbin", "/usr/local/bin", "/usr/sbin", "/usr/bin", "/sbin", "/bin"]),
    "SHELL": "/bin/bash",
    "NUMBA_CACHE_DIR": f"/var/tmp/numba_cache_{USER}",  # used for librosa
    "PYTORCH_KERNEL_CACHE_PATH": "/var/tmp/",  # used for cuda pytorch
    # diff
    # NOTE(review): credentials are committed in plain text here; consider
    # reading them from the environment or a private file instead.
    "MQTT_USERNAME": "i6",
    "MQTT_PASSWORD": "1801",
    "PYTHONNOUSERSITE": "1",  # used for huggingface image
}

###########################
# Sisyphus Code Definitions
###########################


def engine():
    """
    Create the engine selector used to run jobs.

    Both the "short" and the "long" engine are Slurm engines that submit via
    ``SUBMIT_GATEWAY`` and share the same default requirements. The nodes
    listed in ``temp_exclude`` are passed to sbatch via ``-x`` as ``cn-<number>``.

    :return: an ``EngineSelector`` with "long" as the default engine
    """
    from sisyphus.engine import EngineSelector
    from sisyphus.simple_linux_utility_for_resource_management_engine import (
        SimpleLinuxUtilityForResourceManagementEngine,
    )

    # node numbers that are currently excluded from scheduling
    temp_exclude = [
        257,
        231,  # works but very slow?
        240,  # random mem illegal access
        222,
        282,
        283,
        803,  # doesnt produce output
        601,  # doesnt produce output
        602,  # doesnt produce output
    ]
    default_rqmt = {"cpu": 1, "mem": 2, "gpu": 0, "time": 1}
    if temp_exclude:
        # diff: only slurm
        default_rqmt["sbatch_args"] = ["-x", ",".join(f"cn-{node}" for node in temp_exclude)]

    return EngineSelector(
        engines={
            "short": SimpleLinuxUtilityForResourceManagementEngine(
                default_rqmt=default_rqmt,
                gateway=SUBMIT_GATEWAY,
            ),
            "long": SimpleLinuxUtilityForResourceManagementEngine(
                default_rqmt=default_rqmt,
                gateway=SUBMIT_GATEWAY,
            ),
        },
        default_engine="long",
    )


# def engine():
#     from sisyphus.engine import EngineSelector
#     from sisyphus.localengine import LocalEngine
#     from sisyphus.simple_linux_utility_for_resource_management_engine import (
#         SimpleLinuxUtilityForResourceManagementEngine,
#     )
#
#     temp_exclude = [
#         257,
#         231,  # works but very slow?
#         240,  # random mem illegal access
#         222,
#         283,
#         803,  # doesnt produce output
#         601,  # doesnt produce output
#         602,  # doesnt produce output
#     ]
#     default_rqmt = {"cpu": 1, "mem": 4, "time": 1}
#     if temp_exclude:
#         default_rqmt["sbatch_args"] = ["-x", ",".join([f"cn-{node}" for node in temp_exclude])]
#
#     return EngineSelector(
#         engines={
#             "short": LocalEngine(cpus=4),
#             "long": SimpleLinuxUtilityForResourceManagementEngine(default_rqmt=default_rqmt),
#         },
#         default_engine="long",
#     )


# def check_engine_limits(current_rqmt, task):
#     """
#     i6 support for gpu_mem
#     """
#     current_rqmt['time'] = min(168, current_rqmt.get('time', 2))
#     if current_rqmt.get('gpu', 0) > 0 and '-p' not in current_rqmt.get('sbatch_args', []):
#         if current_rqmt.get("gpu_mem", 0) > 24:
#             current_rqmt['sbatch_args'] = ['-p', 'gpu_48gb']
#         elif current_rqmt.get("gpu_mem", 0) > 11:
#             current_rqmt['sbatch_args'] = ['-p', 'gpu_24gb', "--exclude=cn-233"]
#         else:
#             current_rqmt['sbatch_args'] = ['-p', 'gpu_11gb']
#     #############################################
#     # Example to set settings based on alias name
#     #############################################
#     #aliases = []
#     #for prefix in list(task._job._sis_alias_prefixes) + [""]:
#     #    for alias in task._job.get_aliases() or [""]:
#     #        aliases.append(prefix + alias)
#     #if "gmm_align" in "\t".join(aliases or ""):
#     #    current_rqmt['sbatch_args'] = ['-p', 'cpu_slow']
#     return current_rqmt


# Job class names that are sent to the "cpu_short" partition when they run without a GPU
CPU_SHORT_JOBLIST = ["AverageTFCheckpointsJob", "GetBestTFCheckpointJob"]


def check_engine_limits(current_rqmt, task):
    """
    Clamp the requested time and pick a partition if none was requested.

    The time is limited to 168 (default 2 if unset). If "sbatch_args" does not
    already contain "-p", a partition is prepended to the existing sbatch args:
    for GPU jobs based on "time" and "gpu_mem", for CPU jobs only if the job
    class name is listed in ``CPU_SHORT_JOBLIST``.

    :param dict current_rqmt: requirements of the task, modified in place
    :param task: task object; only ``task._job`` is accessed (its class name)
    :return: the updated ``current_rqmt``
    :rtype: dict
    """
    current_rqmt["time"] = min(168, current_rqmt.get("time", 2))
    curr_sbatch_args = current_rqmt.get("sbatch_args", [])

    # an explicitly requested partition is always respected
    if "-p" in curr_sbatch_args:
        return current_rqmt

    partition = None
    if current_rqmt.get("gpu", 0) > 0:
        # gpu
        gpu_mem = current_rqmt.get("gpu_mem", 0)
        if current_rqmt["time"] <= 1 and gpu_mem <= 24:
            partition = "gpu_test_24gb"
        elif gpu_mem > 24:
            partition = "gpu_48gb"  # alternatively "gpu_80gb"
        elif gpu_mem > 11:
            partition = "gpu_24gb"
        else:
            partition = "gpu_11gb"
    elif task._job.__class__.__name__ in CPU_SHORT_JOBLIST:
        # cpu with SSE4 and AVX
        partition = "cpu_short"

    if partition is not None:
        current_rqmt["sbatch_args"] = ["-p", partition] + curr_sbatch_args
    return current_rqmt


def _build_exec_container_command(runtime, call):
    """
    Build the ``<runtime> exec --nv`` command line shared by Apptainer and Singularity.

    :param str runtime: name of the container executable ("apptainer" or "singularity")
    :param list[str] call: original worker call; its first two entries are
        replaced by "sis" (presumably interpreter and sisyphus entry point - verify)
    :return: the wrapped call
    :rtype: list[str]
    """
    command = []
    if EXTRA_ENV:
        command += ["env"] + EXTRA_ENV
    command += [runtime, "exec", "--nv"]
    for bind in CONTAINER_BINDS:
        command += ["--bind", bind]
    command += [CONTAINER_IMAGE]
    return command + ["sis"] + call[2:]


def build_apptainer_command(call):
    """
    Apptainer specific launch code

    :param list[str] call: original worker call
    :return: the call wrapped into ``apptainer exec``
    :rtype: list[str]
    """
    return _build_exec_container_command("apptainer", call)


def build_singularity_command(call):
    """
    Singularity specific launch code

    :param list[str] call: original worker call
    :return: the call wrapped into ``singularity exec``
    :rtype: list[str]
    """
    return _build_exec_container_command("singularity", call)


def build_docker_command(call):
    """
    Docker specific launch code

    :param list[str] call: original worker call
    :return: the call wrapped into ``docker run``
    :rtype: list[str]
    """
    from pwd import getpwnam

    # diff: USER, 's2322008'
    # NOTE(review): the account is hard-coded instead of using USER - confirm this is intended.
    userid, groupid = getpwnam("s2322008")[2:4]
    exp_dir = os.path.dirname(__file__)

    # The GPU selection previously referenced an undefined name `gpu`, which
    # raised a NameError. Use the devices made visible to this process and
    # fall back to all GPUs if nothing is set.
    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
    if not visible_devices:
        gpu_spec = "all"
    elif "," in visible_devices:
        # docker parses --gpus as CSV, a device list has to be wrapped in literal quotes
        gpu_spec = '"device=%s"' % visible_devices
    else:
        gpu_spec = "device=%s" % visible_devices

    command = [
        "docker",
        "run",
        "-t",
        "--rm",  # delete container after execution
        "-u",
        "%i:%i" % (userid, groupid),  # passing the username directly does not work with LDAP users
        "--runtime=nvidia",
        "--gpus",
        gpu_spec,
        "-m",
        "32g",
        "--shm-size",
        "16g",
        "-w",
        exp_dir,
    ]
    for env in EXTRA_ENV:
        command += ["-e", env]
    # NOTE(review): docker "-v" usually expects "source:target"; the entries in
    # CONTAINER_BINDS are single paths - confirm this mounts what is intended.
    for bind in CONTAINER_BINDS:
        command += ["-v", bind]
    command += [CONTAINER_IMAGE]
    # NOTE(review): with "sh -c" only the first following argument is the command
    # string, the remaining ones become positional parameters - confirm that the
    # worker arguments actually reach "sis".
    command += ["sh", "-e", "-c"]
    return command + ["sis"] + call[2:]


def worker_wrapper(job, task_name, call):
    """
    All worker calls are passed through this function. Is used to wrap the execution call
    with the correct container command. Usually it is not necessary to alter things here,
    but any worker call can be fully customized here.

    The call is returned unchanged if the job class is blacklisted, if
    ``CONTAINER_MODE`` is None, or if the task runs on a ``LocalEngine``.

    :param job: the job the task belongs to
    :param str task_name: name of the task to run
    :param list[str] call: original worker call
    :return: the (possibly wrapped) call
    :rtype: list[str]
    :raises ValueError: if ``CONTAINER_MODE`` is not a supported value
    """
    from sisyphus.engine import EngineSelector
    from sisyphus.localengine import LocalEngine

    # jobs that must not be executed inside the container
    app_blacklist = ["FairseqHydraTrainWav2VecUJob"]
    if type(job).__name__ in app_blacklist:
        return call

    # None is a documented CONTAINER_MODE and means "run without container"
    if CONTAINER_MODE is None:
        return call

    command_builders = {
        "apptainer": build_apptainer_command,
        "docker": build_docker_command,
        "singularity": build_singularity_command,
    }
    if CONTAINER_MODE not in command_builders:
        raise ValueError("Invalid CONTAINER_MODE %s" % CONTAINER_MODE)
    command = command_builders[CONTAINER_MODE](call)

    tasks_by_name = {t.name(): t for t in job.tasks()}
    task = tasks_by_name[task_name]

    selected_engine = engine()
    # Usually EngineSelector, but can be LocalEngine if no settings file is present
    if isinstance(selected_engine, EngineSelector):
        selected_engine = selected_engine.get_used_engine_by_rqmt(task.rqmt())

    if isinstance(selected_engine, LocalEngine):
        return call
    return command