1 | #!/usr/bin/env python |
---|
2 | # |
---|
3 | # Author: |
---|
4 | # Willem Vermin, SARA, April 2012 |
---|
5 | # |
---|
6 | # SVN Info: |
---|
7 | # $Id# |
---|
8 | # $URL: trunk/examples/pbs_jobmonitor $ |
---|
9 | # |
---|
10 | # pbs_jobmonitor, pbs_joblogin <jobnr> [nodenr] |
---|
11 | # jobnr: the number of the job |
---|
12 | # nodenr: the rank of the node in the job |
---|
13 | # |
---|
14 | # depending on the name with which this script is called, it performs the |
---|
15 | # following: |
---|
16 | |
---|
17 | # called as pbs_jobmonitor: |
---|
18 | # shows the output of top -u user on the node |
---|
19 | # - one cycle of top |
---|
20 | # - user: the user the job belongs to |
---|
21 | # |
---|
22 | # called as pbs_joblogin: |
---|
23 | # logs in to the node as the user who invokes this script |
---|
24 | # (os.getenv('USER')) |
---|
25 | # |
---|
26 | from PBSQuery import PBSQuery |
---|
27 | import sys,os |
---|
def uniq(seq, idfun=None):
    """Return the items of seq with duplicates removed, keeping first-seen order.

    seq:   any iterable of items.
    idfun: optional function mapping an item to the (hashable) key used to
           decide whether two items are duplicates; defaults to the item
           itself.

    Returns a new list; the first item seen for each key is the one kept.
    """
    # Based on http://www.peterbe.com/plog/uniqifiers-benchmark
    # (order-preserving variant).
    if idfun is None:
        idfun = lambda x: x
    known_keys = set()
    unique_items = []
    for element in seq:
        key = idfun(element)
        if key not in known_keys:
            known_keys.add(key)
            unique_items.append(element)
    return unique_items
---|
41 | |
---|
def usage(a):
    """Print a short help text for this script to standard output.

    a: the name the script was invoked under. When it is 'pbs_jobmonitor'
       or 'pbs_joblogin', a one-line description of that mode is printed
       first; for any other name only the generic usage lines are printed.

    Every print call takes exactly one pre-formatted string so that the
    output is the same whether the file is run by Python 2 or Python 3.
    """
    if a == 'pbs_jobmonitor':
        print('%s shows the system usage of a node where a job is running' % (a,))
    if a == 'pbs_joblogin':
        print('%s logs you in to a node where a job is running' % (a,))

    print('Usage:')
    print('%s <jobnumber> [nodenumber]' % (a,))
    print('where <jobnumber> is the number of the job')
    print(' nodenumber is the rank number of the node allocated to the job')
    print(' (default 0)')
---|
53 | |
---|
# Main script body.
#
# Looks up the job given on the command line, lists the distinct nodes it
# runs on, and then, depending on the name this script was invoked under,
# either shows one cycle of top for the job owner on the selected node
# (pbs_jobmonitor) or opens an ssh session to that node (pbs_joblogin).
#
# All print calls take a single pre-formatted string so the output is the
# same under Python 2 and Python 3.

# The basename of the invocation path selects the mode of operation.
me = sys.argv[0].split('/')[-1]
print('[' + me + ']')
p = PBSQuery()

# The job number is mandatory.
if len(sys.argv) < 2:
    usage(me)
    sys.exit(1)
j = sys.argv[1]

# The node rank is optional and defaults to 0; it must be an integer.
num = 0
if len(sys.argv) > 2:
    try:
        num = int(sys.argv[2])
    except ValueError:
        usage(me)
        sys.exit(1)

job = p.getjob(j)

# Any failure to read the job's exec_host is reported as "No such job".
# 'except Exception' (rather than a bare except) keeps that behaviour
# without also swallowing KeyboardInterrupt or SystemExit.
try:
    exec_host = job['exec_host'][0]
except Exception:
    print('No such job: %s' % (j,))
    sys.exit(1)

# exec_host is split on '+' into entries; the part of each entry before the
# first '/' is taken as the node name. Duplicates are dropped, order is kept.
nodes = uniq([entry.split('/')[0] for entry in exec_host.split('+')])

print('Job %s is running on %d nodes:' % (j, len(nodes)))
# List the node names, eight per output line.
for start in range(0, len(nodes), 8):
    print(' '.join(nodes[start:start + 8]))

# Reject ranks outside 0..len(nodes)-1. A negative rank must be refused
# explicitly, otherwise Python's negative indexing would silently select a
# node counted from the end of the list.
if num < 0 or num >= len(nodes):
    print('No node number %d' % (num,))
    sys.exit(1)

if me == 'pbs_jobmonitor':
    # Run top for the job's owner (the part of Job_Owner before the '@').
    user = job['Job_Owner'][0].split('@')[0]
    print('top for node # %d : %s user: %s' % (num, nodes[num], user))
    # Flush our own output before ssh starts writing to the same terminal.
    sys.stdout.flush()
    os.system('ssh ' + nodes[num] + ' top -n1 -b -u ' + user)

if me == 'pbs_joblogin':
    # Log in as the invoking user; USER is only used for the message below.
    user = os.getenv('USER')
    print('logging in to node # %d : %s user: %s' % (num, nodes[num], user))
    sys.stdout.flush()
    os.system('ssh -X ' + nodes[num])
---|
113 | |
---|