# Sisyphus settings file
#
# Per-setup configuration for the Sisyphus workflow manager: container
# runtime, cluster submission engine, and worker environment.

import getpass
import os.path
import socket  # NOTE(review): not used in the visible code — confirm before removing

# import sys
# sys.path.append("/u/beck/dev/cachemanager/")

#############################
# Local Settings File Options
#############################

# Container runtime wrapped around every worker call:
# can be "apptainer", "docker", "singularity" or None
CONTAINER_MODE = "apptainer"

# local path or e.g. docker registry image path
# CONTAINER_IMAGE = "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/apptainer/u22_pytorch2.1_onnx_flashlight_0224_jaist_project.sif"
# diff
# CONTAINER_IMAGE = "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/apptainer/u22cuda12.1_pytorch2.2_onnxrt_1.17.1_flashlight_warprnnt_0624.sif"

# runable one - 128baseline + lm
# CONTAINER_IMAGE = "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/apptainer/u22cuda12.1_pytorch2.2_onnxrt_1.17.1_flashlight_warprnnt_0624v3.sif"

# torch 2.7 for wav2vec2
CONTAINER_IMAGE = "/work/asr4/zyang/images/u22cuda12.1_pytorch2.7_onnxrt_1.17.1_flashlight_warprnnt_0624v3_huggingface.sif"

# simon
# CONTAINER_IMAGE = "/work/asr4/berger/apptainer/images/torch-2.8_onnx-1.22.sif"

# file systems to bind in a "<source_path>:<target_path>" format
# (a bare path is bound to the same path inside the container)
CONTAINER_BINDS = ["/work/asr4", "/work/asr3", "/work/common", "/work/tools22", "/u/corpora", "/work/smt4",
                   "/work/tools", "/u/jxu", "/u/rossenbach", "/u/joerg.barkoczi", "/u/berger", "/u/zyang", "/u/zeyer", "/u/zeineldeen", "/u/zhan.shu","/run", "/u/enrique.leon.lozano"]

# can be "sge", "slurm" or "pbs" (pbs is experimental)
SUBMIT_ENGINE = "slurm"

# hostname or ip of machine to use for cluster access
SUBMIT_GATEWAY = "cn-04"

# the username
USER = "joerg.barkoczi"

# List of extra env vars to set "before" sisyphus execution.
# Can for example be used to set the PYTHONPATH to a custom sisyphus
# different to the one installed in the container for debugging / development purposes
EXTRA_ENV = [
    "PYTHONNOUSERSITE=1",
]
|
|
|
|
#########################
# Setup Specific Settings
#########################

# Unfortunately needed when using g2p from image:
# diff: but never used
G2P_PATH = "/usr/local/bin/g2p.py"

##########################
# Sisyphus Global Settings
##########################

# Local account name used as the notification mail target.
# getpass.getuser() can raise KeyError (failed pwd lookup on older Pythons)
# and raises OSError on Python 3.13+; catch both and disable mail on failure.
try:
    MAIL_ADDRESS = getpass.getuser()
except (KeyError, OSError):
    MAIL_ADDRESS = None

JOB_USE_TAGS_IN_PATH = False
JOB_AUTO_CLEANUP = False
SHOW_JOB_TARGETS = False
PRINT_ERROR = False
DELAYED_CHECK_FOR_WORKER = False

WARNING_ABSPATH = False

SHORT_JOB_NAMES = True

# For debugging to 1
# GRAPH_WORKER = 1

# Environment variables passed through from the submit environment to workers.
DEFAULT_ENVIRONMENT_KEEP = {'CUDA_VISIBLE_DEVICES', 'HOME', 'PWD', 'SGE_STDERR_PATH', 'SGE_TASK_ID', 'TMP', 'TMPDIR',
                            'USER', "LD_LIBRARY_PATH"}

# Environment variables set explicitly for every worker.
DEFAULT_ENVIRONMENT_SET = {
    'LANG': 'en_US.UTF-8',
    'MKL_NUM_THREADS': 2,
    'OMP_NUM_THREADS': 2,
    'PATH': ':'.join(['/usr/local/sbin', '/usr/local/bin',
                      '/usr/sbin', '/usr/bin',
                      '/sbin', '/bin']),
    'SHELL': '/bin/bash',
    "NUMBA_CACHE_DIR": f"/var/tmp/numba_cache_{USER}",  # used for librosa
    "PYTORCH_KERNEL_CACHE_PATH": f"/var/tmp/",  # used for cuda pytorch
    # diff
    # NOTE(review): credentials committed in plain text — consider a secret store.
    "MQTT_USERNAME": "i6",
    "MQTT_PASSWORD": "1801",
    "PYTHONNOUSERSITE": "1",  # used for huggingface image
}
|
|
|
|
|
|
###########################
|
|
# Sisyphus Code Definitions
|
|
###########################
|
|
|
|
def engine():
    """
    Build the Sisyphus engine used for job submission.

    Returns an EngineSelector holding two SLURM engines ("short" and
    "long", currently configured identically) that submit via
    SUBMIT_GATEWAY and exclude a list of known-bad cluster nodes.
    """
    from sisyphus.engine import EngineSelector
    from sisyphus.simple_linux_utility_for_resource_management_engine import (
        SimpleLinuxUtilityForResourceManagementEngine,
    )

    # Nodes temporarily excluded from scheduling due to observed problems.
    temp_exclude = [
        257,
        231,  # works but very slow?
        240,  # random mem illegal access
        222,
        282,
        283,
        803,  # doesn't produce output
        601,  # doesn't produce output
        602,  # doesn't produce output
    ]

    default_rqmt = {"cpu": 1, "mem": 2, "gpu": 0, "time": 1}

    if temp_exclude:
        # "-x" tells sbatch to exclude the listed hosts.
        default_rqmt["sbatch_args"] = ["-x", ",".join(f"cn-{node}" for node in temp_exclude)]

    # diff: only slurm
    return EngineSelector(
        engines={
            'short': SimpleLinuxUtilityForResourceManagementEngine(
                default_rqmt=default_rqmt,
                gateway=SUBMIT_GATEWAY,
            ),
            'long': SimpleLinuxUtilityForResourceManagementEngine(
                default_rqmt=default_rqmt,
                gateway=SUBMIT_GATEWAY,
            )},
        default_engine='long'
    )
|
|
|
|
# def engine():
|
|
# from sisyphus.engine import EngineSelector
|
|
# from sisyphus.localengine import LocalEngine
|
|
# from sisyphus.simple_linux_utility_for_resource_management_engine import (
|
|
# SimpleLinuxUtilityForResourceManagementEngine,
|
|
# )
|
|
|
|
# temp_exclude = [
|
|
# 257,
|
|
# 231, # works but very slow?
|
|
# 240, # random mem illegal access
|
|
# 222,
|
|
# 283,
|
|
# 803, # doesnt produce output
|
|
# 601, # doesnt produce output
|
|
# 602, # doesnt produce output
|
|
# ]
|
|
|
|
# default_rqmt = {"cpu": 1, "mem": 4, "time": 1}
|
|
# if temp_exclude:
|
|
# default_rqmt["sbatch_args"] = ["-x", ",".join([f"cn-{node}" for node in temp_exclude])]
|
|
|
|
# return EngineSelector(
|
|
# engines={
|
|
# "short": LocalEngine(cpus=4),
|
|
# "long": SimpleLinuxUtilityForResourceManagementEngine(default_rqmt=default_rqmt),
|
|
# },
|
|
# default_engine="long",
|
|
# )
|
|
|
|
# def check_engine_limits(current_rqmt, task):
|
|
# """
|
|
# i6 support for gpu_mem
|
|
# """
|
|
# current_rqmt['time'] = min(168, current_rqmt.get('time', 2))
|
|
# if current_rqmt.get('gpu', 0) > 0 and '-p' not in current_rqmt.get('sbatch_args', []):
|
|
# if current_rqmt.get("gpu_mem", 0) > 24:
|
|
# current_rqmt['sbatch_args'] = ['-p', 'gpu_48gb']
|
|
# elif current_rqmt.get("gpu_mem", 0) > 11:
|
|
# current_rqmt['sbatch_args'] = ['-p', 'gpu_24gb', "--exclude=cn-233"]
|
|
# else:
|
|
# current_rqmt['sbatch_args'] = ['-p', 'gpu_11gb']
|
|
|
|
# #############################################
|
|
# # Example to set settings based on alias name
|
|
# #############################################
|
|
|
|
# #aliases = []
|
|
# #for prefix in list(task._job._sis_alias_prefixes) + [""]:
|
|
# # for alias in task._job.get_aliases() or [""]:
|
|
# # aliases.append(prefix + alias)
|
|
|
|
# #if "gmm_align" in "\t".join(aliases or ""):
|
|
# # current_rqmt['sbatch_args'] = ['-p', 'cpu_slow']
|
|
|
|
# return current_rqmt
|
|
|
|
# Job classes that are cheap enough for the dedicated short CPU partition.
CPU_SHORT_JOBLIST = ["AverageTFCheckpointsJob", "GetBestTFCheckpointJob"]


def check_engine_limits(current_rqmt, task):
    """
    Clamp job requirements and route jobs to the matching SLURM partition.

    :param dict current_rqmt: requirement dict (may contain "time", "gpu",
        "gpu_mem", "sbatch_args"); modified in place and returned.
    :param task: the Sisyphus task; only its job's class name is inspected,
        and only on the CPU path.
    :return: the (possibly updated) requirement dict.
    """
    # Cluster-wide maximum runtime is one week (168 h).
    current_rqmt["time"] = min(168, current_rqmt.get("time", 2))
    curr_sbatch_args = current_rqmt.get("sbatch_args", [])
    # Only choose a partition when none was requested explicitly via "-p".
    if "-p" not in curr_sbatch_args:
        if current_rqmt.get("gpu", 0) > 0:
            gpu_mem = current_rqmt.get("gpu_mem", 0)
            if current_rqmt["time"] <= 1 and gpu_mem <= 24:
                # Short small-GPU jobs go to the test partition.
                current_rqmt["sbatch_args"] = ["-p", "gpu_test_24gb"] + curr_sbatch_args
            elif gpu_mem > 24:
                current_rqmt["sbatch_args"] = ["-p", "gpu_48gb"] + curr_sbatch_args  # ["-p", "gpu_80gb"] + curr_sbatch_args
            elif gpu_mem > 11:
                current_rqmt["sbatch_args"] = ["-p", "gpu_24gb"] + curr_sbatch_args
            else:
                current_rqmt["sbatch_args"] = ["-p", "gpu_11gb"] + curr_sbatch_args
        else:
            # cpu with SSE4 and AVX
            if task._job.__class__.__name__ in CPU_SHORT_JOBLIST:
                current_rqmt["sbatch_args"] = ["-p", "cpu_short"] + curr_sbatch_args
    return current_rqmt
|
|
|
|
|
|
def build_apptainer_command(call):
    """
    Apptainer specific launch code

    Prefixes the worker call with an "apptainer exec --nv" invocation of
    CONTAINER_IMAGE, adding one "--bind" argument per configured bind and
    an optional "env" prefix carrying EXTRA_ENV.
    """
    env_prefix = ["env"] + EXTRA_ENV if EXTRA_ENV else []
    bind_args = [arg for mount in CONTAINER_BINDS for arg in ("--bind", mount)]
    launcher = env_prefix + ["apptainer", "exec", "--nv"] + bind_args + [CONTAINER_IMAGE]
    return launcher + ["sis"] + call[2:]
|
|
|
|
|
|
def build_singularity_command(call):
    """
    Singularity specific launch code

    Prefixes the worker call with a "singularity exec --nv" invocation of
    CONTAINER_IMAGE, adding one "--bind" argument per configured bind and
    an optional "env" prefix carrying EXTRA_ENV.
    """
    env_prefix = ["env"] + EXTRA_ENV if EXTRA_ENV else []
    bind_args = [arg for mount in CONTAINER_BINDS for arg in ("--bind", mount)]
    launcher = env_prefix + ["singularity", "exec", "--nv"] + bind_args + [CONTAINER_IMAGE]
    return launcher + ["sis"] + call[2:]
|
|
|
|
|
|
def build_docker_command(call):
    """
    Docker specific launch code

    Builds a "docker run" prefix that executes the Sisyphus worker call
    inside CONTAINER_IMAGE with the configured binds, env vars, GPU access
    and resource limits.
    """
    from pwd import getpwnam

    # diff: USER, 's2322008'
    # NOTE(review): uid/gid is resolved from a hard-coded account instead of
    # the USER setting above — confirm this is intentional.
    userid, groupid = getpwnam('s2322008')[2:4]
    exp_dir = os.path.dirname(__file__)
    # FIX: "gpu" was previously an undefined name (NameError on every call).
    # Derive the device from CUDA_VISIBLE_DEVICES, defaulting to device 0 —
    # TODO confirm this matches how the scheduler assigns GPUs here.
    gpu = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
    command = [
        "docker", "run", "-t",
        "--rm",  # delete container after execution
        "-u", "%i:%i" % (userid, groupid),  # passing the username directly does not work with LDAP users
        "--runtime=nvidia",
        "--gpus", "device=%s" % gpu,
        "-m", "32g",
        "--shm-size", "16g",
        "-w", exp_dir,
    ]
    for env in EXTRA_ENV:
        command += ["-e", env]
    for bind in CONTAINER_BINDS:
        command += ["-v", bind]
    command += [CONTAINER_IMAGE]
    command += ["sh", "-e", "-c"]
    return command + ["sis"] + call[2:]
|
|
|
|
|
|
def worker_wrapper(job, task_name, call):
    """
    All worker calls are passed through this function.
    Is used to wrap the execution call with the correct container command.
    Usually it is not necessary to alter things here,
    but any worker call can be fully customized here.

    :param job: the Sisyphus job the task belongs to
    :param str task_name: name of the task being launched
    :param list call: the original worker command line
    :return: the (possibly containerized) command line to execute
    """
    from sisyphus.engine import EngineSelector
    from sisyphus.localengine import LocalEngine

    # Jobs that must run outside the container.
    app_blacklist = ["FairseqHydraTrainWav2VecUJob"]
    if type(job).__name__ in app_blacklist:
        return call

    if CONTAINER_MODE == "apptainer":
        command = build_apptainer_command(call)
    elif CONTAINER_MODE == "docker":
        command = build_docker_command(call)
    elif CONTAINER_MODE == "singularity":
        command = build_singularity_command(call)
    else:
        raise ValueError("Invalid CONTAINER_MODE %s" % CONTAINER_MODE)

    ts = {t.name(): t for t in job.tasks()}
    t = ts[task_name]
    e = engine()  # Usually EngineSelector, but can be LocalEngine if no settings file is present
    if isinstance(e, EngineSelector):
        # Reuse the selector instead of constructing a second engine()
        # (the original called engine() twice).
        e = e.get_used_engine_by_rqmt(t.rqmt())
    if isinstance(e, LocalEngine):
        # Local execution runs uncontainerized.
        return call
    else:
        return command
|