24 """
25 gridmap provides a high level front-end to DRMAA-python.
26
27 This module provides wrappers that simplify submission and collection of jobs,
28 in a more 'pythonic' fashion.
29
30 @author: Christian Widmer
31 @author: Cheng Soon Ong
32 @author: Dan Blanchard (dblanchard@ets.org)
33 """

from __future__ import print_function, unicode_literals

import argparse
import bz2
try:
    import cPickle as pickle
except ImportError:
    import pickle
import inspect
import os
import re
import subprocess
import sys
import traceback
import uuid
from socket import gethostname
from time import sleep

import drmaa
from redis import StrictRedis
from redis.exceptions import ConnectionError as RedisConnectionError

# Python 2.x backward compatibility
if sys.version_info < (3, 0):
    range = xrange

# Redis settings
REDIS_DB = 2
REDIS_PORT = 7272

# How many times to try to retrieve a result from Redis before giving up, and
# how long to sleep between attempts (in seconds)
MAX_TRIES = 50
SLEEP_TIME = 3

# Is mem_free configured properly on the cluster?
USE_MEM_FREE = False

# Which queue should jobs be scheduled on by default?
DEFAULT_QUEUE = 'all.q'


class Job(object):
    """
    Central entity that wraps a function and its data. Basically, a job
    consists of a function, its argument list, its keyword list, and a field
    "ret", which is filled when the execute method gets called.

    @note: This can only be used to wrap picklable functions (i.e., those that
           are defined at the module or class level).
    """

    __slots__ = ('_f', 'args', 'jobid', 'kwlist', 'cleanup', 'ret',
                 'exception', 'environment', 'replace_env', 'working_dir',
                 'num_slots', 'mem_free', 'white_list', 'path', 'uniq_id',
                 'name', 'queue')
    def __init__(self, f, args, kwlist=None, cleanup=True, mem_free="1G",
                 name='gridmap_job', num_slots=1, queue=DEFAULT_QUEUE):
        """
        Initializes a new Job.

        @param f: a function, which should be executed.
        @type f: function
        @param args: argument list of function f
        @type args: list
        @param kwlist: dictionary of keyword arguments for f
        @type kwlist: dict
        @param cleanup: flag that determines the cleanup of input and log file
        @type cleanup: boolean
        @param mem_free: Estimate of how much memory this job will need (for
                         scheduling)
        @type mem_free: C{basestring}
        @param name: Name to give this job
        @type name: C{basestring}
        @param num_slots: Number of slots this job should use.
        @type num_slots: C{int}
        @param queue: SGE queue to schedule job on.
        @type queue: C{basestring}
        """
        self.path = None
        self._f = None
        self.function = f
        self.args = args
        self.jobid = -1
        self.kwlist = kwlist if kwlist is not None else {}
        self.cleanup = cleanup
        self.ret = None
        self.environment = None
        self.replace_env = False
        self.working_dir = os.getcwd()
        self.num_slots = num_slots
        self.mem_free = mem_free
        self.white_list = []
        self.uniq_id = None
        self.name = name
        self.queue = queue

    @property
    def function(self):
        ''' Function this job will execute. '''
        return self._f

    @function.setter
    def function(self, f):
        """
        Setter for function that carefully takes care of
        namespaces, avoiding __main__ as a module.
        """
        m = inspect.getmodule(f)
        try:
            self.path = _clean_path(os.path.dirname(os.path.abspath(
                inspect.getsourcefile(f))))
        except TypeError:
            self.path = ''

        # If the function is defined in a proper module, just store it
        if m.__name__ != "__main__":
            self._f = f
        else:
            # Otherwise, determine the real module name from the file name...
            mn = os.path.splitext(os.path.basename(m.__file__))[0]

            # ...import that module...
            __import__(mn)
            mod = sys.modules[mn]

            # ...and grab the function from it, so it pickles correctly
            self._f = getattr(mod, f.__name__)
168
170 """
171 Executes function f with given arguments
172 and writes return value to field ret.
173 If an exception is encountered during execution, ret will
174 contain a pickled version of it.
175 Input data is removed after execution to save space.
176 """
177 try:
178 self.ret = self.function(*self.args, **self.kwlist)
179 except Exception as exception:
180 self.ret = exception
181 traceback.print_exc()
182 del self.args
183 del self.kwlist
184
    @property
    def native_specification(self):
        """
        Native specification string (qsub-style scheduler options) built from
        this job's settings.
        """
        # NOTE: this body is a reconstruction (assumed) from the attributes
        # this class defines; the original lines were collapsed in the source
        # listing.
        ret = "-shell yes -b yes"

        if self.name:
            ret += " -N {0}".format(self.name)
        if self.mem_free and USE_MEM_FREE:
            ret += " -l mem_free={0}".format(self.mem_free)
        if self.num_slots and self.num_slots > 1:
            ret += " -pe smp {0}".format(self.num_slots)
        if self.white_list:
            ret += " -l h={0}".format('|'.join(self.white_list))
        if self.queue:
            ret += " -q {0}".format(self.queue)

        return ret

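# A minimal usage sketch (hypothetical function/module names): wrap a
# picklable, module-level function in Jobs and run them on the cluster.
# Assumes a DRMAA-capable grid engine and a reachable Redis server:
#
#     from my_module import compute
#     jobs = [Job(compute, [x], name='compute{0}'.format(i))
#             for i, x in enumerate([1, 2, 3])]
#     results = process_jobs(jobs, temp_dir='/scratch/', quiet=False)
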
def _submit_jobs(jobs, uniq_id, temp_dir='/scratch', white_list=None,
                 quiet=True):
    """
    Method used to send a list of jobs onto the cluster.

    @param jobs: list of jobs to be executed
    @type jobs: C{list} of L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: List of acceptable nodes to use for scheduling job. If
                       None, all are used.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that
                  have been submitted.
    @type quiet: C{bool}
    """
    session = drmaa.Session()
    session.initialize()
    jobids = []

    for job_num, job in enumerate(jobs):
        # Set the white list of nodes this job may run on
        job.white_list = white_list

        # Append the job to the session
        jobid = _append_job_to_session(session, job, uniq_id, job_num,
                                       temp_dir=temp_dir, quiet=quiet)
        jobids.append(jobid)

    sid = session.contact
    session.exit()

    return (sid, jobids)

def _append_job_to_session(session, job, uniq_id, job_num,
                           temp_dir='/scratch', quiet=True):
    """
    For an active session, append new job based on information stored in job
    object. Also sets job.jobid to the ID of the job on the grid.

    @param session: The current DRMAA session with the grid engine.
    @type session: C{drmaa.Session}
    @param job: The Job to add to the queue.
    @type job: L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param job_num: The row in the table to store/retrieve data on. This is
                    only non-zero for jobs created via grid_map.
    @type job_num: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param quiet: When true, do not output information about the jobs that
                  have been submitted.
    @type quiet: C{bool}
    """
    jt = session.createJobTemplate()

    # Figure out the environment the job should run with
    shell_env = os.environ
    if job.environment and job.replace_env:
        # Only use the explicitly supplied environment
        jt.jobEnvironment = job.environment
    elif job.environment and not job.replace_env:
        # Merge the job's environment into a copy of the shell environment,
        # so that we don't mutate os.environ
        env = dict(shell_env)
        env.update(job.environment)
        jt.jobEnvironment = env
    else:
        # Just inherit the shell environment
        jt.jobEnvironment = shell_env

    # The remote command is this very script, pointed at the job's data
    jt.remoteCommand = re.sub(r'\.pyc$', '.py',
                              _clean_path(os.path.abspath(__file__)))
    jt.args = ['{0}'.format(uniq_id), '{0}'.format(job_num), job.path,
               temp_dir, gethostname()]
    jt.nativeSpecification = job.native_specification
    jt.outputPath = ":" + temp_dir
    jt.errorPath = ":" + temp_dir

    jobid = session.runJob(jt)

    # Set job fields that depend on the job having been submitted
    job.jobid = jobid

    if not quiet:
        print('Your job {0} has been submitted with id {1}'.format(job.name,
                                                                   jobid),
              file=sys.stderr)

    session.deleteJobTemplate(jt)

    return jobid

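# For reference, a job named 'gridmap_job0' on queue 'all.q' with one slot
# yields a native specification like the following (assuming the
# reconstruction of native_specification above):
#
#     -shell yes -b yes -N gridmap_job0 -q all.q
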
def _collect_jobs(sid, jobids, joblist, redis_server, uniq_id,
                  temp_dir='/scratch/', wait=True):
    """
    Collect the results from the jobids, returns a list of Jobs

    @param sid: session identifier
    @type sid: string returned by cluster
    @param jobids: list of job identifiers returned by the cluster
    @type jobids: list of strings
    @param joblist: list of jobs that were submitted, in the same order as
                    jobids
    @type joblist: C{list} of L{Job}
    @param redis_server: Open connection to the database where the results
                         will be stored.
    @type redis_server: L{StrictRedis}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Wait for jobs to finish?
    @type wait: C{bool}, defaults to True
    """
    for ix in range(len(jobids)):
        assert(jobids[ix] == joblist[ix].jobid)

    s = drmaa.Session()
    s.initialize(sid)

    if wait:
        drmaa_wait = drmaa.Session.TIMEOUT_WAIT_FOREVER
    else:
        drmaa_wait = drmaa.Session.TIMEOUT_NO_WAIT

    s.synchronize(jobids, drmaa_wait, True)
    s.exit()

    # Attempt to collect the results
    job_output_list = []
    for ix, job in enumerate(joblist):

        log_stdout_fn = os.path.join(temp_dir, job.name + '.o' + jobids[ix])
        log_stderr_fn = os.path.join(temp_dir, job.name + '.e' + jobids[ix])

        try:
            job_output = _zload_db(redis_server, 'output{0}'.format(uniq_id),
                                   ix)
        except Exception as detail:
            print(("Error while unpickling output for gridmap job {1} " +
                   "stored with key output{0}_{1}").format(uniq_id, ix),
                  file=sys.stderr)
            print("This could be caused by a problem with the cluster " +
                  "environment, imports or environment variables.",
                  file=sys.stderr)
            print(("Try running `gridmap.py {0} {1} {2} {3} {4}` to see " +
                   "if your job crashed before writing its " +
                   "output.").format(uniq_id,
                                     ix,
                                     job.path,
                                     temp_dir,
                                     gethostname()),
                  file=sys.stderr)
            print("Check log files for more information:", file=sys.stderr)
            print("stdout:", log_stdout_fn, file=sys.stderr)
            print("stderr:", log_stderr_fn, file=sys.stderr)
            print("Exception: {0}".format(detail), file=sys.stderr)
            sys.exit(2)

        # If the job raised an exception, warn but keep it as the job's output
        if isinstance(job_output, Exception):
            print("Exception encountered in job with log file:",
                  file=sys.stderr)
            print(log_stdout_fn, file=sys.stderr)
            print(job_output, file=sys.stderr)
            print(file=sys.stderr)

        job_output_list.append(job_output)

    return job_output_list

def process_jobs(jobs, temp_dir='/scratch/', wait=True, white_list=None,
                 quiet=True):
    """
    Take a list of jobs and process them on the cluster.

    @param jobs: Jobs to run.
    @type jobs: C{list} of L{Job}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Should we wait for jobs to finish? (Should only be false if
                 the function you're running doesn't return anything)
    @type wait: C{bool}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that
                  have been submitted.
    @type quiet: C{bool}
    """
    redis_server = StrictRedis(host=gethostname(), db=REDIS_DB,
                               port=REDIS_PORT)

    # Check if a Redis server is running, and spawn one if it isn't
    try:
        redis_server.set('connection_test', True)
    except RedisConnectionError:
        with open(os.devnull, 'w') as null_file:
            redis_process = subprocess.Popen(['redis-server', '-'],
                                             stdout=null_file,
                                             stdin=subprocess.PIPE,
                                             stderr=null_file)
            # Feed a minimal config via stdin (encoded so this also works
            # with Python 3's binary pipes)
            redis_config = 'daemonize yes\npidfile {0}\nport {1}\n'.format(
                os.path.join(temp_dir, 'redis{0}.pid'.format(REDIS_PORT)),
                REDIS_PORT)
            redis_process.stdin.write(redis_config.encode('utf-8'))
            redis_process.stdin.close()
        # Wait for things to get started
        sleep(5)

    # Create a unique suffix for this batch's keys in the database
    uniq_id = uuid.uuid4()

    # Save jobs to the database
    for job_id, job in enumerate(jobs):
        _zsave_db(job, redis_server, 'job{0}'.format(uniq_id), job_id)

    # Submit jobs to the cluster
    sids, jobids = _submit_jobs(jobs, uniq_id, white_list=white_list,
                                temp_dir=temp_dir, quiet=quiet)

    # Retrieve outputs
    job_outputs = _collect_jobs(sids, jobids, jobs, redis_server, uniq_id,
                                temp_dir=temp_dir, wait=wait)

    # Make sure we have enough output
    assert(len(jobs) == len(job_outputs))

    # Remove this batch's temporary keys from the database
    redis_server.delete(*redis_server.keys('job{0}_*'.format(uniq_id)))
    redis_server.delete(*redis_server.keys('output{0}_*'.format(uniq_id)))
    return job_outputs

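# Redis key layout for a batch, as used by _zsave_db/_zload_db below:
#
#     job<uniq_id>_<job_num>    -> bz2-compressed pickle of the Job object
#     output<uniq_id>_<job_num> -> bz2-compressed pickle of job.ret
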

def grid_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
             num_slots=1, temp_dir='/scratch/', white_list=None,
             queue=DEFAULT_QUEUE, quiet=True):
    """
    Maps a function onto the cluster.

    @note: This can only be used with picklable functions (i.e., those that
           are defined at the module or class level).

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f
    @type args_list: C{list}
    @param cleanup: Should we remove the stdout and stderr temporary files for
                    each job when we're done? (They are left in place if
                    there's an error.)
    @type cleanup: C{bool}
    @param mem_free: Estimate of how much memory each job will need (for
                     scheduling). (Not currently used, because our cluster
                     does not have that setting enabled.)
    @type mem_free: C{basestring}
    @param name: Base name to give each job (will have a number added to the
                 end)
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that
                  have been submitted.
    @type quiet: C{bool}
    """
    # Build a Job for each argument (wrapping scalars in single-element lists)
    jobs = [Job(f, [args] if not isinstance(args, list) else args,
                cleanup=cleanup, mem_free=mem_free,
                name='{0}{1}'.format(name, job_num), num_slots=num_slots,
                queue=queue)
            for job_num, args in enumerate(args_list)]

    # Process jobs on the cluster
    job_results = process_jobs(jobs, temp_dir=temp_dir, white_list=white_list,
                               quiet=quiet)

    return job_results

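# A minimal sketch (hypothetical function/module names). Note that the mapped
# function must live in an importable module, not __main__:
#
#     from my_math import square
#     results = grid_map(square, [1, 2, 3], name='square', quiet=False)
#     # results == [1, 4, 9], in the same order as args_list
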

def pg_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
           num_slots=1, temp_dir='/scratch/', white_list=None,
           queue=DEFAULT_QUEUE, quiet=True):
    """
    @deprecated: This function has been renamed grid_map.

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f
    @type args_list: C{list}
    @param cleanup: Should we remove the stdout and stderr temporary files for
                    each job when we're done? (They are left in place if
                    there's an error.)
    @type cleanup: C{bool}
    @param mem_free: Estimate of how much memory each job will need (for
                     scheduling). (Not currently used, because our cluster
                     does not have that setting enabled.)
    @type mem_free: C{basestring}
    @param name: Base name to give each job (will have a number added to the
                 end)
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that
                  have been submitted.
    @type quiet: C{bool}
    """
    return grid_map(f, args_list, cleanup=cleanup, mem_free=mem_free,
                    name=name, num_slots=num_slots, temp_dir=temp_dir,
                    white_list=white_list, queue=queue, quiet=quiet)


def _clean_path(path):
    ''' Replace all weird SAN paths with normal paths '''
    path = re.sub(r'/\.automount/\w+/SAN/NLP/(\w+)-(dynamic|static)',
                  r'/home/nlp-\1/\2', path)
    path = re.sub(r'/\.automount/[^/]+/SAN/Research/HomeResearch',
                  '/home/research', path)
    return path

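# For example, given the rewrites above (hypothetical path):
#
#     _clean_path('/.automount/server/SAN/NLP/speech-dynamic/data')
#     # -> '/home/nlp-speech/dynamic/data'
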

def _zsave_db(obj, redis_server, prefix, job_num):
    """
    Saves an object/function as bz2-compressed pickled data in a Redis
    database.

    @param obj: The object/function to store.
    @type obj: C{object} or C{function}
    @param redis_server: An open connection to the database
    @type redis_server: L{StrictRedis}
    @param prefix: The prefix to use for the key for this data.
    @type prefix: C{basestring}
    @param job_num: The ID of the job this data is for.
    @type job_num: C{int}
    """
    # Pickle the object and compress it with bz2 at the highest level
    pickled_data = bz2.compress(pickle.dumps(obj, pickle.HIGHEST_PROTOCOL), 9)

    # Insert it into the database
    redis_server.set('{0}_{1}'.format(prefix, job_num), pickled_data)

def _zload_db(redis_server, prefix, job_num):
    """
    Loads bz2-compressed pickled object from a Redis database.

    @param redis_server: An open connection to the database
    @type redis_server: L{StrictRedis}
    @param prefix: The prefix to use for the key for this data.
    @type prefix: C{basestring}
    @param job_num: The ID of the job this data is for.
    @type job_num: C{int}
    """
    # The key may not exist yet if the writer is still running, so retry up to
    # MAX_TRIES times, sleeping SLEEP_TIME seconds between attempts.
    attempt = 0
    pickled_data = None
    while pickled_data is None and attempt < MAX_TRIES:
        pickled_data = redis_server.get('{0}_{1}'.format(prefix, job_num))
        attempt += 1
        if pickled_data is None:
            sleep(SLEEP_TIME)
    return pickle.loads(bz2.decompress(pickled_data))

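# Round-trip sketch: _zsave_db(42, redis_server, 'output{0}'.format(uniq_id),
# 0) stores the compressed pickle under key 'output<uniq_id>_0', and
# _zload_db(redis_server, 'output{0}'.format(uniq_id), 0) returns 42 once the
# key exists (retrying up to MAX_TRIES times).
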

def _run_job(uniq_id, job_num, temp_dir, redis_host):
    """
    Execute the pickled job and produce pickled output.

    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param job_num: The index for this job's content in the job and output
                    tables.
    @type job_num: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param redis_host: Hostname of the database to connect to get the job
                       data.
    @type redis_host: C{basestring}
    """
    # Connect to the database
    redis_server = StrictRedis(host=redis_host, port=REDIS_PORT, db=REDIS_DB)

    print("Loading job...", end="", file=sys.stderr)
    sys.stderr.flush()
    job = _zload_db(redis_server, 'job{0}'.format(uniq_id), job_num)
    print("done", file=sys.stderr)

    print("Running job...", end="", file=sys.stderr)
    sys.stderr.flush()
    job.execute()
    print("done", file=sys.stderr)

    print("Writing output to database for job {0}...".format(job_num), end="",
          file=sys.stderr)
    sys.stderr.flush()
    _zsave_db(job.ret, redis_server, 'output{0}'.format(uniq_id), job_num)
    print("done", file=sys.stderr)

    # Remove the job's log files if cleanup was requested
    if job.cleanup:
        log_stdout_fn = os.path.join(temp_dir, '{0}.o{1}'.format(job.name,
                                                                 job.jobid))
        log_stderr_fn = os.path.join(temp_dir, '{0}.e{1}'.format(job.name,
                                                                 job.jobid))
        try:
            os.remove(log_stdout_fn)
            os.remove(log_stderr_fn)
        except OSError:
            pass

652 """
653 Parse the command line inputs and call _run_job
654 """
655
656
657 parser = argparse.ArgumentParser(description="This wrapper script will run \
658 a pickled Python function on \
659 some pickled data in a Redis\
660 database, " + "and write the\
661 results back to the database.\
662 You almost never want to run\
663 this yourself.",
664 formatter_class=argparse.ArgumentDefaultsHelpFormatter,
665 conflict_handler='resolve')
666 parser.add_argument('uniq_id',
667 help='The unique suffix for the tables corresponding to\
668 this job in the database.')
669 parser.add_argument('job_number',
670 help='Which job number should be run. Dictates which \
671 input data is read from database and where output\
672 data is stored.',
673 type=int)
674 parser.add_argument('module_dir',
675 help='Directory that contains module containing pickled\
676 function. This will get added to PYTHONPATH \
677 temporarily.')
678 parser.add_argument('temp_dir',
679 help='Directory that temporary output will be stored\
680 in.')
681 parser.add_argument('redis_host',
682 help='The hostname of the server that where the Redis\
683 database is.')
684 args = parser.parse_args()
685
686 print("Appended {0} to PYTHONPATH".format(args.module_dir), file=sys.stderr)
687 sys.path.append(_clean_path(args.module_dir))
688
689
690 _run_job(args.uniq_id, args.job_number, _clean_path(args.temp_dir),
691 args.redis_host)
692
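# Invocation sketch: this is the command line that _append_job_to_session
# arranges to run on each node (placeholder values):
#
#     python gridmap.py <uniq_id> <job_number> <module_dir> <temp_dir> <redis_host>
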

if __name__ == "__main__":
    _main()