Source code for caelus.run.hpc_queue

# -*- coding: utf-8 -*-

"""\
Job Scheduler Interface
-----------------------

This module provides a unified interface for submitting serial jobs, local
MPI-parallel jobs, and parallel jobs on high-performance computing (HPC) queues.
"""

import abc
import logging
import os
import re
import shlex
import subprocess
import sys
import textwrap
from collections import OrderedDict

try:
    from collections.abc import Mapping
except ImportError:
    from collections import Mapping

import six

from ..config import cmlenv, config
from ..config.jinja2wrappers import CaelusTemplates
from ..utils import osutils

_lgr = logging.getLogger(__name__)


def caelus_execute(cmd, env=None, stdout=sys.stdout, stderr=sys.stderr):
    """Execute a CML command with the right environment setup

    A wrapper around subprocess.Popen to set up the correct environment
    before invoking the CML executable. The command can either be a string
    or a list of arguments as appropriate for Caelus executables.

    Examples:
        caelus_execute("blockMesh -help")

    Args:
        cmd (str or list): The command to be executed
        env (CMLEnv): An instance representing the CML installation
            (default: latest)
        stdout: A file handle where standard output is redirected
        stderr: A file handle where standard error is redirected

    Returns:
        subprocess.Popen : The task instance
    """
    renv = env or cmlenv.cml_get_latest_version()
    posix = not (osutils.ostype() == "windows")
    cmd_popen = cmd if isinstance(cmd, list) else shlex.split(cmd, posix=posix)
    _lgr.debug("Executing shell command: %s", ' '.join(cmd_popen))
    task = subprocess.Popen(
        cmd_popen, stdout=stdout, stderr=stderr, env=renv.environ
    )
    return task

def python_execute(
    pyscript, script_args="", env=None, log_file=None, log_to_file=True
):
    """Execute a python script with the right environment

    This function will set up the correct CPL and CML environment and execute
    the python script within this environment. The user should only provide
    the name of the script and not ``python script``, as it is this function's
    job to detect the correct python executable and execute within that
    environment.

    If ``log_file`` isn't provided, it automatically creates a "py_*.log" file
    to redirect output messages from the script, where ``*`` is replaced with
    the basename of the python script.

    Args:
        pyscript (path): Filename of the python script
        script_args (str): Extra arguments to be passed to the python script
        env (CMLEnv): CML environment used for execution
        log_file (filename): Filename to redirect output to
        log_to_file (bool): Should outputs be redirected to log file

    Returns:
        status (int): The status of the execution
    """
    spath = osutils.abspath(pyscript)
    if not log_file and log_to_file:
        _, sbase, _ = osutils.split_path(spath)
        log_file = "py_%s.log" % sbase
    pycmd = "%s %s %s" % (sys.executable, spath, script_args)
    fh = open(log_file, 'w') if log_file else sys.stdout
    task = caelus_execute(pycmd, env, fh, stderr=subprocess.STDOUT)
    status = task.wait()
    if status != 0:
        _lgr.error("Python script %s failed; status = %d", spath, status)
    if log_file is not None:
        fh.close()
    return status
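
# Illustrative call of ``python_execute`` (a minimal sketch, not part of the
# module; the script name and arguments below are hypothetical):
#
#     status = python_execute("post_process.py", script_args="--latest-time")
#     if status != 0:
#         raise RuntimeError("post-processing script failed")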

@six.add_metaclass(abc.ABCMeta)
class HPCQueue:
    """Abstract base class for job submission interface

    Attributes:
        name (str): Job name
        queue (str): Queue/partition where job is submitted
        account (str): Account the job is charged to
        num_nodes (int): Number of nodes requested
        num_ranks (int): Number of MPI ranks
        stdout (path): Filename where standard out is redirected
        stderr (path): Filename where standard error is redirected
        join_outputs (bool): Merge stdout/stderr to same file
        mail_opts (str): Mail options (see specific queue implementation)
        email_address (str): Email address for notifications
        qos (str): Quality of service
        time_limit (str): Wall clock time limit
        shell (str): shell to use for scripts
        mpi_extra_args (str): additional arguments for MPI
    """

    #: Variables to parse from configuration file
    _cpl_config_vars = [
        'name', 'queue', 'account', 'num_nodes', 'num_ranks', 'stdout',
        'stderr', 'join_outputs', 'mail_opts', 'email_address', 'qos',
        'time_limit', 'shell', 'exclusive', 'mem_per_node', 'mem_per_rank',
    ]

    #: Identifier used for queue
    queue_name = "_ERROR_"

    #: Attribute to job scheduler option mapping
    _queue_var_map = {}

    #: Default values for attributes
    _queue_default_values = {}

    @classmethod
    @abc.abstractmethod
    def submit(
        cls, script_file, job_dependencies=None, extra_args=None, dep_type=None
    ):
        """Submit the job to the queue"""

    @staticmethod
    @abc.abstractmethod
    def delete(job_id):
        """Delete a job from the queue"""

    @staticmethod
    def is_parallel():
        """Flag indicating whether the queue type can support parallel runs"""
        return True

    @staticmethod
    def is_job_scheduler():
        """Flag indicating whether this is a job scheduler"""
        return True

    def __init__(self, name, cml_env=None, **kwargs):
        """
        Args:
            name (str): Name of the job
            cml_env (CMLEnv): Environment used for execution
        """
        self.name = name
        self.cml_env = cml_env
        self.shell = "/bin/bash"
        self.num_ranks = 1
        self.env_config = ""
        self._has_script_body = False
        self._script_body = None
        cfg = config.get_config()
        opts = cfg.caelus.system.scheduler_defaults
        for key, val in self._queue_default_values.items():
            setattr(self, key, val)
        for key, val in opts.items():
            setattr(self, key, val)
        for key, val in kwargs.items():
            setattr(self, key, val)

    def __repr__(self):
        return """<%s (%s)>""" % (self.__class__.__name__, self.name)

    def write_script(self, script_name=None):
        """Write a submission script using the arguments provided

        Args:
            script_name (path): Name of the script file
        """
        if not self._has_script_body:
            raise RuntimeError("Contents of script have not been initialized")
        fname = script_name or "%s_%s.job" % (self.name, self.queue_name)
        qconf = self.get_queue_settings()
        tmpl = CaelusTemplates()
        tmpl.write_template(
            fname,
            "run/hpc_queue/hpc_queue.job.tmpl",
            job=self,
            queue_config=qconf,
        )
        return fname

    def update(self, settings):
        """Update queue settings from the given dictionary"""
        for key in self._cpl_config_vars:
            if key in settings:
                setattr(self, key, settings[key])

    def process_cml_run_env(self):
        """Populate the run variables for script"""
        env_cfg = """
        # CAELUS environment updates
        export PROJECT_DIR=%s
        export CAELUS_PROJECT_DIR=${PROJECT_DIR}
        export PATH=%s:${PATH}
        export LD_LIBRARY_PATH=%s:${LD_LIBRARY_PATH}
        export MPI_BUFFER_SIZE=20000000
        """
        renv = self.cml_env or cmlenv.cml_get_latest_version()
        path_var = (
            renv.bin_dir
            + os.pathsep
            + renv.user_bindir
            + os.pathsep
            + renv.mpi_bindir
        )
        lib_var = (
            renv.lib_dir
            + os.pathsep
            + renv.user_libdir
            + os.pathsep
            + renv.mpi_libdir
        )
        self.env_config = textwrap.dedent(env_cfg) % (
            renv.project_dir,
            path_var,
            lib_var,
        )

    def process_foam_run_env(self):
        """Populate the run variables for OpenFOAM execution"""
        env_cfg = """
        # Modules
        %s
        # OpenFOAM configuration
        source %s
        export LD_LIBRARY_PATH=%s:${LD_LIBRARY_PATH}
        """
        renv = self.cml_env
        bashrc_path = self.cml_env.foam_bashrc
        libs = "lib_dir user_libdir site_libdir mpi_libdir".split()
        libvar = os.pathsep.join(
            getattr(renv, vv) for vv in libs if getattr(renv, vv)
        )
        modules = "# no modules loaded"
        if renv.module_list:
            modules = '\n'.join(
                "module load %s" % mm for mm in renv.module_list
            )
        self.env_config = textwrap.dedent(env_cfg) % (
            modules,
            bashrc_path,
            libvar,
        )

    def process_run_env(self):
        """Process runtime environment for scripts"""
        if self.cml_env and isinstance(self.cml_env, cmlenv.FOAMEnv):
            self.process_foam_run_env()
        else:
            self.process_cml_run_env()

    @abc.abstractmethod
    def get_queue_settings(self):
        """Return a string with all the necessary queue options"""

    def prepare_mpi_cmd(self):
        """Prepare the MPI invocation"""
        num_mpi_ranks = getattr(self, "num_ranks", 1)
        cmd_tmpl = (
            "mpiexec -localonly %d "
            if osutils.ostype() == "windows"
            else "mpiexec -np %d "
        )
        mpi_cmd = cmd_tmpl % num_mpi_ranks
        return mpi_cmd + getattr(self, "mpi_extra_args", "")

    @abc.abstractmethod
    def __call__(self, **kwargs):
        """Submit job to scheduler"""

    @property
    def script_body(self):
        """The contents of the script submitted to scheduler"""
        return self._script_body

    @script_body.setter
    def script_body(self, value):
        self._script_body = value
        self._has_script_body = True


class SerialJob(HPCQueue):
    """Interface to a serial job"""

    queue_name = "serial_job"

    @classmethod
    def submit(cls, script_file, job_dependencies=None, extra_args=None):
        """Submit the job to the queue"""
        task = subprocess.Popen(script_file)
        status = task.wait()
        if status != 0:
            _lgr.error("Error executing script %s", script_file)
        return status

    @staticmethod
    def delete(job_id):
        """Delete a job from the queue"""
        pass

    @staticmethod
    def is_parallel():
        """Flag indicating whether the queue type can support parallel runs"""
        return False

    @staticmethod
    def is_job_scheduler():
        """Flag indicating whether this is a job scheduler"""
        return False

    def get_queue_settings(self):
        """Return queue settings"""
        return ""

    def prepare_mpi_cmd(self):
        """Prepare the MPI invocation"""
        return ""

    def __call__(self, **kwargs):
        wait = kwargs.get("wait", True)
        if not self._has_script_body:
            raise RuntimeError("Invalid command for execution")
        cmdline = self.script_body
        outfile = getattr(self, "stdout", "%s.log" % self.name)
        with open(outfile, 'w') as fh:
            task = caelus_execute(
                cmdline, env=self.cml_env, stdout=fh, stderr=subprocess.STDOUT
            )
            self.task = task  # pylint: disable=attribute-defined-outside-init
            if wait:
                status = task.wait()
                if status != 0:
                    _lgr.error("Error running command: %s", cmdline)
                return status


class ParallelJob(SerialJob):
    """Interface to a parallel job"""

    queue_name = "parallel_job"

    @staticmethod
    def is_parallel():
        """Flag indicating whether the queue type can support parallel runs"""
        return True

    def prepare_mpi_cmd(self):
        """Prepare the MPI invocation"""
        num_mpi_ranks = getattr(self, "num_ranks", 1)
        machinefile = getattr(self, "machinefile", None)
        cmd_tmpl = (
            "mpiexec -localonly %d "
            if osutils.ostype() == "windows"
            else "mpiexec -np %d "
        )
        mpi_cmd = cmd_tmpl % num_mpi_ranks
        if machinefile:
            mpi_cmd += " -machinefile %s " % machinefile
        return mpi_cmd + getattr(self, "mpi_extra_args", "")


class SlurmQueue(HPCQueue):
    """Interface to SLURM queue manager"""

    queue_name = "slurm"

    _queue_var_map = OrderedDict(
        name="job-name",
        queue="partition",
        account="account",
        num_nodes="nodes",
        num_ranks="ntasks",
        stdout="output",
        stderr="error",
        mail_opts="mail-type",
        email_address="mail-user",
        qos="qos",
        time_limit="time",
        dependencies="depend",
        licenses="licenses",
        features="constraint",
        mem_per_node="mem",
        mem_per_rank="mem-per-cpu",
        exclusive="exclusive",
        kill_invalid="kill-on-invalid-dep",
    )

    _queue_default_values = dict(
        stdout="job-%x-%J.out", mail_opts="NONE", shell="/bin/bash"
    )

    _batch_job_regex = re.compile(r"Submitted batch job (\d+)")

    @classmethod
    def submit(
        cls,
        script_file,
        job_dependencies=None,
        extra_args=None,
        dep_type="afterok",
    ):
        """Submit to SLURM using sbatch command

        ``job_dependencies`` is a list of SLURM job IDs. The submitted job
        will not run until after all the jobs provided in this list have been
        completed successfully.

        ``extra_args`` is a dictionary of extra arguments to be passed to the
        ``sbatch`` command. Note that this can override options provided in
        the script file as well as introduce additional options during
        submission.

        ``dep_type`` can be one of: after, afterok, afternotok, afterany

        The job ID returned by this method can be used as an argument to the
        delete method or as an entry in ``job_dependencies`` for a subsequent
        job submission.

        Args:
            script_file (path): Script provided to the sbatch command
            job_dependencies (list): List of jobs to wait for
            extra_args (dict): Extra SLURM arguments
            dep_type (str): Dependency type

        Returns:
            str: Job ID as a string
        """
        depends_arg = ""
        if job_dependencies:
            depends_arg = "--depend %s:" % dep_type + ":".join(
                "%s" % i for i in job_dependencies
            )
        slurm_args = ""
        if isinstance(extra_args, Mapping):
            slurm_args = " ".join(
                "--%s %s" % (cls._queue_var_map.get(key, key), val)
                for key, val in extra_args.items()
            )
        elif extra_args is not None:
            slurm_args = extra_args
        sbatch_cmd = "sbatch %s %s %s" % (depends_arg, slurm_args, script_file)
        cmd_line = shlex.split(sbatch_cmd)
        _lgr.debug("Executing SLURM sbatch command: %s", sbatch_cmd)
        pp = subprocess.Popen(
            cmd_line, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        out, err = pp.communicate()
        job_id_match = cls._batch_job_regex.search(out.decode('utf-8'))
        if err or not job_id_match:
            raise RuntimeError("Error submitting job: '%s'" % sbatch_cmd)
        job_id = job_id_match.group(1)
        return job_id

    @staticmethod
    def delete(job_id):
        """Delete the SLURM batch job using job ID"""
        scancel_cmd = "scancel %s" % job_id
        cmd_line = shlex.split(scancel_cmd)
        _lgr.debug("Executing SLURM scancel command: %s", scancel_cmd)
        pp = subprocess.Popen(
            cmd_line, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        out, err = pp.communicate()
        if out:
            _lgr.debug("scancel output: %s", out)
        if err:
            _lgr.debug("Error executing scancel: %s", err)

    def __init__(self, name, cml_env=None, **kwargs):
        """
        Args:
            name (str): Name of the job
            cml_env (CMLEnv): Environment used for execution
        """
        super(SlurmQueue, self).__init__(name, cml_env, **kwargs)
        cfg = config.get_config()
        opts = cfg.caelus.system
        use_mpiexec = opts.get("slurm_use_mpiexec", True)
        if not use_mpiexec:
            self.prepare_mpi_cmd = self.prepare_srun_cmd

    def get_queue_settings(self):
        """Return all SBATCH options suitable for embedding in script"""

        def _opts_helper():
            """Helper function to convert options"""
            for key, skey in self._queue_var_map.items():
                if not hasattr(self, key):
                    continue
                val = getattr(self, key)
                if isinstance(val, bool):
                    val = " "
                yield "#SBATCH --%s %s" % (skey, val)

        qopts = "\n".join(_opts_helper())
        header = "\n# SLURM options\n"
        return header + qopts + "\n"

    def prepare_srun_cmd(self):
        """Prepare the call to SLURM srun command"""
        return "srun --ntasks ${SLURM_NTASKS} " + getattr(
            self, "mpi_extra_args", ""
        )

    def __call__(self, **kwargs):
        """Submit the job"""
        script_file = kwargs.get("script_file", None)
        job_deps = kwargs.get("job_dependencies", None)
        extra_args = kwargs.get("extra_args", None)
        if not self._has_script_body:
            raise RuntimeError(
                "Script contents have not been set before submit"
            )
        self.process_run_env()
        script_file = self.write_script(script_file)
        return self.submit(script_file, job_deps, extra_args)
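
# Illustrative dependency chaining with ``SlurmQueue.submit`` (a sketch, not
# part of the module; the script names and time limit are hypothetical).
# ``extra_args`` keys are translated through ``_queue_var_map``, so
# ``time_limit`` becomes the ``--time`` option to sbatch:
#
#     mesh_id = SlurmQueue.submit("mesh.job")
#     SlurmQueue.submit(
#         "solve.job",
#         job_dependencies=[mesh_id],
#         extra_args={"time_limit": "04:00:00"},
#     )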


class PBSQueue(HPCQueue):
    """PBS Queue Interface"""

    queue_name = "pbs"

    _queue_var_map = OrderedDict(
        name="-N ",
        queue="-q ",
        account="-A ",
        num_nodes="-l nodes=",
        stdout="-o ",
        stderr="-e ",
        join_outputs="-j ",
        mail_opts="-m ",
        email_address="-M ",
        time_limit="-l walltime=",
    )

    _queue_default_values = dict(
        stdout="job-$PBS_JOBNAME-$PBS_JOBID.out",
        join_outputs="oe",
        shell="/bin/bash",
    )

    _batch_job_regex = re.compile(r"(\d+)")

    @classmethod
    def submit(
        cls,
        script_file,
        job_dependencies=None,
        extra_args=None,
        dep_type="afterok",
    ):
        """Submit a PBS job using qsub command

        ``job_dependencies`` is a list of PBS job IDs. The submitted job will
        run depending on the status of the dependencies.

        ``extra_args`` is a dictionary of arguments passed to the ``qsub``
        command.

        The job ID returned by this method can be used as an argument to the
        delete method or as an entry in ``job_dependencies`` for a subsequent
        job submission.

        Args:
            script_file (path): Script provided to the qsub command
            job_dependencies (list): List of jobs to wait for
            extra_args (dict): Extra PBS arguments

        Returns:
            str: Job ID as a string
        """
        depends_arg = ""
        if job_dependencies:
            depends_arg = "-W depend=%s:" % dep_type + ":".join(
                "%s" % i for i in job_dependencies
            )
        qsub_args = extra_args or ""
        qsub_cmd = "qsub %s %s %s" % (depends_arg, qsub_args, script_file)
        cmd_line = shlex.split(qsub_cmd)
        _lgr.debug("Executing PBS qsub command: %s", qsub_cmd)
        pp = subprocess.Popen(
            cmd_line, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        out, err = pp.communicate()
        job_id_match = cls._batch_job_regex.search(out.decode('utf-8'))
        if err or not job_id_match:
            raise RuntimeError("Error submitting job: '%s'" % qsub_cmd)
        job_id = job_id_match.group(1)
        return job_id

    @staticmethod
    def delete(job_id):
        """Delete the PBS batch job using job ID"""
        qdel_cmd = "qdel %s" % job_id
        cmd_line = shlex.split(qdel_cmd)
        _lgr.debug("Executing PBS qdel command: %s", qdel_cmd)
        pp = subprocess.Popen(
            cmd_line, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        out, err = pp.communicate()
        if out:
            _lgr.debug("qdel output: %s", out)
        if err:
            _lgr.debug("Error executing qdel: %s", err)

    def get_queue_settings(self):
        """Return all PBS options suitable for embedding in script"""
        qopts = "\n".join(
            "#PBS %s%s" % (val, getattr(self, key))
            for key, val in self._queue_var_map.items()
            if hasattr(self, key)
        )
        header = "\n# PBS Queue options\n"
        return header + qopts + "\n"

    def __call__(self, **kwargs):
        """Submit the job"""
        script_file = kwargs.get("script_file", None)
        job_deps = kwargs.get("job_dependencies", None)
        extra_args = kwargs.get("extra_args", None)
        if not self._has_script_body:
            raise RuntimeError(
                "Script contents have not been set before submit"
            )
        self.process_run_env()
        script_file = self.write_script(script_file)
        return self.submit(script_file, job_deps, extra_args)


_hpc_queue_map = dict(
    no_mpi=SerialJob,
    local_mpi=ParallelJob,
    slurm=SlurmQueue,
    pbs=PBSQueue,
)

def get_job_scheduler(queue_type=None):
    """Return the job scheduler class for the given queue type"""
    cfg = config.get_config()
    cfg_queue_type = cfg.caelus.system.get("job_scheduler", 'local_mpi')
    qtype = queue_type or cfg_queue_type
    return _hpc_queue_map.get(qtype.lower(), ParallelJob)
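
# Illustrative end-to-end usage (a minimal sketch, not part of the module; the
# job name, rank count, and solver command line are hypothetical).
# ``get_job_scheduler`` returns a queue class, which is instantiated with a
# job name, given a ``script_body``, and then called to run or submit the job:
#
#     queue_cls = get_job_scheduler("local_mpi")
#     job = queue_cls("run_case", num_ranks=4)
#     job.script_body = job.prepare_mpi_cmd() + "pisoSolver -parallel"
#     status = job(wait=True)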