Source code for cloudmesh_client.shell.plugins.HpcCommand

from cloudmesh_client.cloud.experiment import Experiment
from cloudmesh_client.cloud.hpc.BatchProvider import BatchProvider
from cloudmesh_client.common.Printer import Printer
from cloudmesh_client.common.util import yn_choice
from cloudmesh_client.default import Default
from cloudmesh_client.shell.command import PluginCommand, HPCPluginCommand, \
    CometPluginCommand
from cloudmesh_client.shell.command import command
from cloudmesh_client.shell.console import Console


# noinspection PyBroadException
class HpcCommand(PluginCommand, HPCPluginCommand, CometPluginCommand):

    topics = {"hpc": "system"}

    def __init__(self, context):
        self.context = context
        if self.context.debug:
            print("init hpc command")

    # noinspection PyUnusedLocal
    @command
    def do_hpc(self, args, arguments):
        # noinspection PyPep8
        """
        ::

            Usage:
                hpc queue [--job=NAME] [--cluster=CLUSTER] [--format=FORMAT]
                hpc info [--cluster=CLUSTER] [--format=FORMAT]
                hpc run list [ID] [--cluster=CLUSTER]
                hpc run output [ID] [--cluster=CLUSTER]
                hpc run rm [ID] [--cluster=CLUSTER]
                hpc run SCRIPT [--queue=QUEUE] [--t=TIME] [--N=nodes] [--name=NAME] [--cluster=CLUSTER] [--dir=DIR] [--group=GROUP] [--format=FORMAT]
                hpc delete --job=NAME [--cluster=CLUSTER] [--group=GROUP]
                hpc delete all [--cluster=CLUSTER] [--group=GROUP] [--format=FORMAT]
                hpc status [--job=NAME] [--cluster=CLUSTER] [--group=GROUP]
                hpc test --cluster=CLUSTER [--time=SECONDS]

            Options:
                --format=FORMAT  the output format [default: table]

            Special notes:
                If the group is specified, only jobs from that group are
                considered. Otherwise the default group is used. If the
                group is set to None, all groups are used.

            Description:
                cm hpc queue
                    lists the details of the queues of the hpc cluster

                cm hpc queue --job=NAME
                    lists the details of the job in the queue of the hpc cluster

                cm hpc info
                    lists the details of the hpc cluster

                cm hpc run SCRIPT
                    submits the script to the cluster. The script is copied
                    into the home directory on the remote machine prior to
                    execution. If a DIR is specified, it is copied into that
                    directory instead. The name of the job is either specified
                    in the script itself or, if not, the default naming scheme
                    of cloudmesh is used, with the same incremented index as
                    for cloud vms: cloudmeshusername-index

                cm hpc delete all
                    kills all jobs in the default hpc group

                cm hpc delete --job=NAME
                    kills the job with the given name or job id

                cm default cluster=NAME
                    sets the default hpc cluster

                cm hpc status
                    returns the status of all jobs

                cm hpc status --job=ID
                    returns the status of the named job

                cm hpc test --cluster=CLUSTER --time=SECONDS
                    submits a simple test job to the named cluster and reports
                    whether the job could be executed successfully. This is a
                    blocking call and may take a long time to complete,
                    depending on how busy the queuing system of that cluster
                    is. It uses only one node/core and prints the message

                        #CLOUDMESH: Test ok

                    which is looked for to decide whether the test succeeded.
                    If --time is given, the job is terminated after that time
                    has elapsed.

            Examples:
                cm hpc queue
                cm hpc queue --job=xxx
                cm hpc info
                cm hpc delete --job=6
                cm hpc delete all
                cm hpc status
                cm hpc status --job=6
                cm hpc run uname
                cm hpc run ~/test.sh --cluster=india
        """
        format = arguments['--format']
        cluster = arguments['--cluster'] or Default.cluster
        arguments["CLUSTER"] = cluster

        if cluster is None:
            Console.error("Default cluster doesn't exist")
            return

        batch = BatchProvider(cluster)

        if arguments["queue"]:
            name = arguments['--job']
            result = batch.queue(cluster, format=format, job=name)
            Console.msg(result)

        elif arguments["info"]:
            Console.msg(batch.info(cluster, format))

        elif arguments['delete'] and arguments['all']:
            group = arguments['--group'] or Default.get(name='group')
            if group is None:
                Console.error('set default group using: default group=<value> --cloud=general')
                return
            Console.ok(batch.delete(cluster, None, group))

        elif arguments["delete"]:
            job = arguments['--job']
            Console.ok(batch.delete(cluster, job))

        elif arguments["status"]:
            name = arguments['--job']
            result = batch.queue(cluster, format=format, job=name)
            Console.msg(result)

        elif arguments["test"]:
            time_secs = arguments['--time']
            if time_secs:
                time = '00:00:' + time_secs
            else:
                time = '00:00:10'  # give a default time of 10 secs
            print(batch.test(cluster, time))

        elif arguments["run"] and arguments["list"]:
            # hpc run list [ID] [--cluster=CLUSTER]
            if arguments["ID"]:
                print("# List of experiment {ID} on Cluster {CLUSTER}".format(**arguments))
                result = Experiment.list(cluster, id=arguments["ID"], format="list")
                if result is not None:
                    print("\n".join(result))
                else:
                    Console.error("Could not find experiment {ID} on {CLUSTER}".format(**arguments))
            else:
                print("# List of experiments on Cluster {CLUSTER}".format(**arguments))
                ids = Experiment.list(cluster, id=None, format="list")
                if ids is not None:
                    print(", ".join([str(i) for i in ids]))
                else:
                    Console.error("Could not find any experiment on {CLUSTER}".format(**arguments))

        elif arguments["run"] and arguments["rm"]:
            # hpc run rm [ID] [--cluster=CLUSTER]
            if arguments["ID"]:
                force = yn_choice("Would you like to delete experiment {ID} on Cluster {CLUSTER}".format(**arguments))
                if force:
                    try:
                        result = Experiment.rm(cluster, id=arguments["ID"])
                        Console.ok("Experiment {ID} on Cluster {CLUSTER} deleted".format(**arguments))
                    except:
                        Console.error("Could not delete experiment {ID} on {CLUSTER}".format(**arguments))
            else:
                result = Experiment.list(cluster, id=None, format="list")
                if result is not None:
                    arguments['experiments'] = ", ".join([str(i) for i in result])
                else:
                    Console.error("Could not find any experiment on {CLUSTER}".format(**arguments))
                    return ""
                force = yn_choice("Would you like to delete the experiments {experiments} on Cluster {CLUSTER}".format(**arguments))
                if force:
                    try:
                        result = Experiment.rm(cluster, id=None)
                        Console.ok("Experiments {experiments} on Cluster {CLUSTER} deleted".format(**arguments))
                    except:
                        Console.error("Could not delete the experiments on {CLUSTER}".format(**arguments))
            return ""

        elif arguments["run"] and arguments["output"]:
            # hpc run output [ID] [--cluster=CLUSTER]
            if arguments["ID"]:
                print("# Output of experiment {ID} on Cluster {CLUSTER}".format(**arguments))
                result = Experiment.output(cluster, id=arguments["ID"], format="list")
                if result is not None:
                    print("\n".join(result))
                else:
                    Console.error("Could not find experiment {ID} on {CLUSTER}".format(**arguments))
            else:
                print("# Output of experiments on Cluster {CLUSTER}".format(**arguments))
                ids = Experiment.output(cluster, id=None, format="list")
                if ids is not None:
                    print(", ".join([str(i) for i in ids]))
                else:
                    Console.error("Could not find any experiment on {CLUSTER}".format(**arguments))

        elif arguments["run"]:
            queue = arguments['--queue'] or Default.get(name='queue')
            # if not queue:
            #     Console.error('set default queue using: default queue=<value>')
            #     return
            group = arguments['--group'] or Default.get(name='group')
            if group is None:
                Console.error('set default group using: default group=<value> --cloud=general')
                return
            script = arguments['SCRIPT']

            arg_dict = {
                '-name': arguments['--name'],
                '-p': queue,
                '-t': arguments['--t'],
                '-N': arguments['--N']
            }
            result = batch.run(cluster, group, script, **arg_dict)
            if isinstance(result, dict):
                print(Printer.attribute(result))
                Console.ok("Experiment {count}: Started batch job {job_id} on {cluster}".format(**result))
            else:
                Console.error(result)

        return ""
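For orientation, the sketch below mirrors how the `queue` and `run SCRIPT` branches of do_hpc drive the provider layer. It is a minimal, hypothetical usage sketch rather than part of the module: the group name "default" and the script path "~/test.sh" are placeholders taken from the docstring examples, and a default cluster is assumed to have been set with `cm default cluster=NAME`.

# Usage sketch (not part of the original module). It reuses the module-level
# imports above and only issues the same calls that do_hpc makes.
if __name__ == "__main__":
    cluster = Default.cluster                      # resolved as in do_hpc
    if cluster is None:
        Console.error("Default cluster doesn't exist")
    else:
        batch = BatchProvider(cluster)

        # Equivalent of `cm hpc queue`: show the cluster's queue as a table.
        Console.msg(batch.queue(cluster, format="table", job=None))

        # Equivalent of `cm hpc run ~/test.sh`: the option dictionary mirrors
        # arg_dict in the run branch above; None leaves each option unset.
        arg_dict = {
            '-name': None,   # --name  job name
            '-p': None,      # --queue queue / partition
            '-t': None,      # --t     time limit
            '-N': None,      # --N     number of nodes
        }
        result = batch.run(cluster, "default", "~/test.sh", **arg_dict)
        print(result)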