[104] | 1 | #! /usr/bin/env python |
---|
| 2 | # |
---|
[248] | 3 | # This version of pbsmon is based on new_rack_pbsmon.py |
---|
| 4 | # |
---|
| 5 | # Authors: |
---|
| 6 | # Bas van der Vlies |
---|
| 7 | # Dennis Stam |
---|
[104] | 8 | # |
---|
[248] | 9 | # SVN Info: |
---|
| 10 | # $Id: new_rack_pbsmon.py 288 2013-02-19 15:55:59Z dennis $ |
---|
| 11 | # $URL$ |
---|
[109] | 12 | # |
---|
[248] | 13 | |
---|
| 14 | """ |
---|
| 15 | Usage: pbsmon [hosts].... |
---|
| 16 | |
---|
| 17 | Specifying hostnames: |
---|
| 18 | To specify a range use the [] to indicate a range, a couple of examples: |
---|
| 19 | |
---|
| 20 | The first five nodes of rack 16 |
---|
| 21 | - gb-r16n[1-5] |
---|
| 22 | |
---|
| 23 | The first five nodes and node 12 and 18 of rack 16 to 20 |
---|
| 24 | - gb-r[16-20]n[1-5,12,18] |
---|
| 25 | |
---|
| 26 | The first five nodes de in rack 16 with padding enabled |
---|
| 27 | - gb-r[16]n[01-5] |
---|
| 28 | |
---|
| 29 | The ranges ([]) are not only limited to numbers, letters can also be used. |
---|
| 30 | """ |
---|
| 31 | |
---|
[104] | 32 | import sys |
---|
[201] | 33 | import re |
---|
[248] | 34 | import re |
---|
| 35 | import types |
---|
| 36 | from optparse import OptionParser |
---|
[104] | 37 | |
---|
| 38 | import pbs |
---|
[271] | 39 | from PBSAdvancedParser import AdvancedParser |
---|
[104] | 40 | from PBSQuery import PBSQuery |
---|
[109] | 41 | from PBSQuery import PBSError |
---|
[104] | 42 | |
---|
# Default output mode. Remark: when both are True the extended view is printed.
PRINT_TABLE = True
PRINT_EXTENDED = False

# Node names that are never displayed
EXCLUDE_NODES = ['login']

# Global options, the command line handling can override them
OPT_SKIP_EMPTY_RACKS = True
OPT_SERVERNAME = None

## Begin: TABLE view opts

# A node name looks like gb-r10n10:
#   r10 is the rack name -> skip one char -> rack number = 10
#   n10 is the node name -> skip one char -> node number = 10
# The expression below must provide the named groups 'racknr' and 'nodenr',
# they are used to determine the number of racks and nodes automatically.
NODE_EXPR = r'r(?P<racknr>[0-9]+)n(?P<nodenr>[0-9]+)'

# Number of the first rack column in the table view
START_RACK = 1

## End: TABLE view opts

## Begin: EXTENDED view opts

# Longest node name and state seen so far, updated by get_nodes()
LENGTH_NODE = 0
LENGTH_STATE = 0

# Format strings of the extended view
EXTENDED_PATTERNS = dict(
    header=' %-*s | %-*s | %s',
    row=' %-*s | %-*s | %s',
    line=' %s',
    line_char='-',
)

## End: EXTENDED view opts

# Extra states that only exist in this script, next to the pbs.ND_* states
pbs_ND_single = 'job (single)'
pbs_ND_total = 'total'
pbs_ND_free_serial = 'free serial'
pbs_ND_free_parallel = 'free parallel'
---|
| 86 | |
---|
# Maps a node state to the single character that represents it in the
# table view and in the legend of the summary.
PBS_STATES = {
    # states as defined by the pbs module
    pbs.ND_free: '_',
    pbs.ND_down: 'X',
    pbs.ND_offline: '.',
    pbs.ND_reserve: 'R',
    pbs.ND_job_exclusive: 'J',
    pbs.ND_job_sharing: 'S',
    pbs.ND_busy: '*',
    pbs.ND_state_unknown: '?',
    pbs.ND_timeshared: 'T',
    pbs.ND_cluster: 'C',
    # states that are only known to this script
    pbs_ND_single: 'j',
    pbs_ND_free_serial: '_',
    pbs_ND_free_parallel: '_',
    pbs_ND_total: ' ',
}
---|
| 103 | |
---|
def sanitize_jobs( jobs ):
    """
    Reduce the job entries of a node to a list of unique job numbers.

    Every entry is searched for '<digits>/<job number>' pairs and only the
    job number is kept (the digits in front of the slash are presumably the
    cpu slot -- TODO confirm against the PBS server output).

    jobs    -- iterable of job strings, e.g. '0/1234.server'

    returns -- list of job numbers as strings, without duplicates and
               sorted numerically
    """
    job_ids = set()

    for job in jobs:
        # The job number is the digit run directly behind the slash. Do not
        # demand more text after it: that makes the regex backtrack on an
        # entry that ends with the number and chops off its last digit.
        job_ids.update( re.findall( r'[0-9]+/([0-9]+)', job ) )

    # A set has no defined order, sort to get a stable result
    return sorted( job_ids, key=int )
---|
[104] | 112 | |
---|
def parse_nodename( nodename, expr=None ):
    """
    Extract the rack and the node number from a node name.

    nodename -- name of the node, e.g. 'gb-r10n10'
    expr     -- regular expression with the named groups 'racknr' and
                'nodenr'. When omitted the module level NODE_EXPR is used.

    returns  -- tuple ( racknr, nodenr ) of integers. A number that can
                not be determined is returned as 0.
    """
    if expr is None:
        expr = NODE_EXPR

    parts = re.search( expr, nodename, re.VERBOSE )

    def _number( group ):
        # No match at all: nothing to extract
        if parts is None:
            return 0

        try:
            value = parts.group( group )
        except IndexError:
            # The expression does not define this group
            return 0

        # The group exists but did not take part in the match
        if value is None:
            return 0

        return int( value )

    return _number( 'racknr' ), _number( 'nodenr' )
---|
[104] | 129 | |
---|
def get_nodes( racknode=False, hosts=None ):
    """
    Query the PBS server and collect state, state character and jobs of
    the nodes.

    Nodes listed in EXCLUDE_NODES are skipped. As a side effect the globals
    LENGTH_NODE and LENGTH_STATE are raised to the longest node name and
    state that were seen. When the server can not be queried the error is
    printed and the script exits with -1.

    racknode -- when True the result is indexed by rack and node number,
                else by node name
    hosts    -- optional collection of node names, when given (and not
                empty) only these nodes are returned

    returns  -- racknode=False: ( nodes_dict, hosts_list ), nodes_dict is
                indexed as nodes_dict[ nodename ]
                racknode=True: ( nodes_dict, number_of_racks, nodes_per_rack ),
                nodes_dict is indexed as nodes_dict[ racknr ][ nodenr ] and
                the two numbers are the highest rack and node number seen.
                Every entry has the keys 'state_char', 'state' and 'jobs'.
    """
    global LENGTH_NODE
    global LENGTH_STATE

    try:
        if OPT_SERVERNAME:
            p = PBSQuery( OPT_SERVERNAME )
        else:
            p = PBSQuery()
    except PBSError as reason:
        print( 'Error: %s' % reason )
        sys.exit( -1 )

    p.new_data_structure()

    try:
        nodes = p.getnodes( [ 'state', 'jobs', 'properties' ] )
    except PBSError as reason:
        print( 'Error: %s' % reason )
        sys.exit( -1 )

    nodes_dict = dict()
    hosts_list = list()
    number_of_racks = 0
    nodes_per_rack = 0

    for node, attr in nodes.items():
        if node in EXCLUDE_NODES:
            continue

        if hosts and node not in hosts:
            continue

        # 'down' wins over every other state of the node
        if pbs.ND_down in attr.state:
            state = pbs.ND_down
        else:
            state = attr.state[ 0 ]

        state_char = PBS_STATES[ state ]

        # A node that is free but runs a job is shown as busy, with the
        # character of the script's own 'job (single)' state
        if attr.is_free() and attr.has_job():
            state = pbs.ND_busy
            state_char = PBS_STATES[ pbs_ND_single ]

        # Keep track of the longest length
        LENGTH_NODE = max( LENGTH_NODE, len( node ) )
        LENGTH_STATE = max( LENGTH_STATE, len( state ) )

        if attr.has_key( 'jobs' ):
            jobs = sanitize_jobs( attr.jobs )
        else:
            jobs = []

        info = { 'state_char': state_char, 'state': state, 'jobs': jobs }

        if racknode:
            racknr, nodenr = parse_nodename( node )

            number_of_racks = max( number_of_racks, racknr )
            nodes_per_rack = max( nodes_per_rack, nodenr )

            # Only rack numbers are used as keys here, no entry is made
            # under the node name
            rack = nodes_dict.setdefault( racknr, dict() )
            rack.setdefault( nodenr, dict() ).update( info )
        else:
            hosts_list.append( node )
            nodes_dict.setdefault( node, dict() ).update( info )

    if not racknode:
        return nodes_dict, hosts_list

    return nodes_dict, number_of_racks, nodes_per_rack
---|
[109] | 224 | |
---|
[248] | 225 | def _generate_index( str ): |
---|
| 226 | index = [] |
---|
[109] | 227 | |
---|
[248] | 228 | def _append( fragment, alist=index ): |
---|
| 229 | if fragment.isdigit(): |
---|
| 230 | fragment = int( fragment ) |
---|
| 231 | alist.append( fragment ) |
---|
[109] | 232 | |
---|
[248] | 233 | prev_isdigit = str[0].isdigit() |
---|
| 234 | current_fragment = '' |
---|
[109] | 235 | |
---|
[248] | 236 | for char in str: |
---|
| 237 | curr_isdigit = char.isdigit() |
---|
[109] | 238 | |
---|
[248] | 239 | if curr_isdigit == prev_isdigit: |
---|
| 240 | current_fragment += char |
---|
| 241 | else: |
---|
| 242 | _append( current_fragment ) |
---|
| 243 | current_fragment = char |
---|
| 244 | prev_isdigit = curr_isdigit |
---|
[109] | 245 | |
---|
[248] | 246 | _append( current_fragment ) |
---|
[109] | 247 | |
---|
[248] | 248 | return tuple( index ) |
---|
[109] | 249 | |
---|
def real_sort( inlist ):
    """
    Return a new list with the items of inlist sorted on the index that
    _generate_index() builds for them.

    inlist  -- list of strings

    returns -- sorted list, inlist itself is not changed
    """
    def _sort_key( item ):
        # The item itself decides between items with an equal index
        return ( _generate_index( item ), item )

    return sorted( inlist, key=_sort_key )
---|
[109] | 256 | |
---|
def print_table():
    """
    Print the table view: a column for every rack and a row for every node
    number, each cell holds the state character of that node.

    Racks without nodes get no column when OPT_SKIP_EMPTY_RACKS is set.
    """
    nodes, racknr, nodenr = get_nodes( True )

    racks = range( START_RACK, racknr + 1 )
    skip_empty = OPT_SKIP_EMPTY_RACKS

    ## Header line 1: the tens of the rack number, only for multiples of ten
    tens = [ ' ' ]
    pending = None

    for rack in racks:
        if rack % 10:
            label = ' '
        else:
            label = '%d' % ( rack // 10 )
            pending = label

        if skip_empty:
            if rack not in nodes:
                continue

            # The rack that should have shown the tens had no column,
            # show them above the first rack that does
            if pending:
                label = pending
                pending = None

        tens.append( label )

    ## Racks that get a column
    shown = [ rack for rack in racks if not skip_empty or rack in nodes ]

    ## Header line 2: the last digit of the rack number
    units = [ ' ' ] + [ str( rack % 10 ) for rack in shown ]

    print( '' )
    print( ' '.join( tens ) )
    print( ' '.join( units ) )

    ## One line for every node number
    for node in range( 1, nodenr + 1 ):
        cells = [ '%2d' % node ]

        for rack in shown:
            # A rack or node that does not exist gives an empty cell
            cells.append( nodes.get( rack, {} ).get( node, {} ).get( 'state_char', ' ' ) )

        print( ' '.join( cells ) )

    print( '' )
---|
| 310 | |
---|
def print_table_summary():
    """
    Print a legend of all states in PBS_STATES with the number of nodes
    in each state, two states per line.

    Nodes whose name starts with 'login' are not counted. When the server
    can not be queried the error is printed and the script exits with -1.
    """
    try:
        if OPT_SERVERNAME:
            p = PBSQuery( OPT_SERVERNAME )
        else:
            p = PBSQuery()
    except PBSError as reason:
        print( 'error: %s' % reason )
        sys.exit( -1 )

    # get the state of the nodes
    try:
        nodes = p.getnodes( [ 'state', 'jobs', 'properties' ] )
    except PBSError as reason:
        print( 'error: %s' % reason )
        sys.exit( -1 )

    count_states = dict.fromkeys( PBS_STATES, 0 )

    for nodename, node in nodes.items():

        # Skip login nodes in status display
        if nodename.startswith( 'login' ):
            continue

        state = node[ 'state' ][ 0 ]

        count_states[ state ] += 1
        count_states[ pbs_ND_total ] += 1

        if not node.is_free():
            continue

        if node.has_job():
            # can happen for single CPU jobs: count the node as
            # 'job (single)' instead of free
            count_states[ pbs.ND_free ] -= 1
            count_states[ pbs_ND_single ] += 1
            continue

        # A node without properties must not abort the summary
        # (assumes the node object raises KeyError like a dict -- TODO confirm)
        try:
            properties = node[ 'properties' ]
        except KeyError:
            properties = []

        if 'infiniband' in properties:
            count_states[ pbs_ND_free_parallel ] += 1
        # NOTE(review): 'ifiniband' looks like a misspelling of 'infiniband',
        # kept as is because it may be a real property name -- confirm
        elif 'ifiniband' in properties:
            count_states[ pbs_ND_free_serial ] += 1

    cells = [
        ' %s %-13s : %-5d' % ( PBS_STATES[ state ], state, count_states[ state ] )
        for state in sorted( PBS_STATES )
    ]

    # Two states on every line
    for start in range( 0, len( cells ), 2 ):
        print( ' '.join( cells[ start:start + 2 ] ) )
---|
| 382 | |
---|
def print_extended( hosts=None ):
    """
    Print the extended view: a row for every node with its name, state and
    job numbers, sorted on node name with real_sort().

    hosts -- optional list of node names, passed on to get_nodes()
    """
    nodes, ihosts = get_nodes( hosts=hosts )

    # Column widths, get_nodes() has just updated the LENGTH_* globals
    width_node = LENGTH_NODE + 2
    width_state = LENGTH_STATE + 2

    row_header = EXTENDED_PATTERNS[ 'header' ] % ( width_node, 'Node', width_state, 'State', 'Jobs' )

    rows_str = [
        EXTENDED_PATTERNS[ 'row' ] % (
            width_node, node,
            width_state, nodes[ node ][ 'state' ],
            ','.join( nodes[ node ][ 'jobs' ] )
        )
        for node in real_sort( ihosts )
    ]

    # The separator line is as long as the longest line that is printed
    length_row = max( [ len( row_header ) ] + [ len( row_str ) for row_str in rows_str ] )

    print( '' )
    print( row_header )
    print( EXTENDED_PATTERNS[ 'line' ] % ( EXTENDED_PATTERNS[ 'line_char' ] * length_row ) )
    print( '\n'.join( rows_str ) )
    print( '' )
---|
| 409 | |
---|
if __name__ == '__main__':

    parser = AdvancedParser( usage=__doc__ )

    # ( short flag, long flag, destination, help text ) of the on/off options.
    # Note that -s and -a set the same destination.
    switches = (
        ( "-t", "--table", "table", "Show an table" ),
        ( "-l", "--list", "extended", "Show node rows with state and jobinfo" ),
        ( "-s", "--summary", "summary", "Display a short summary" ),
        ( "-a", "--all", "summary", "Display a short summary" ),
        ( "-w", "--wide", "wide", "Wide display for node status ( only when -t is used )" ),
    )

    for short_flag, long_flag, dest, help_text in switches:
        parser.add_option( short_flag, long_flag, dest=dest, action="store_true", help=help_text )

    parser.add_option( "-S", "--servername", dest="servername", help="Change the default servername" )

    parser.set_defaults(
        table=PRINT_TABLE,
        summary=False,
        extended=PRINT_EXTENDED,
        servername=None
    )

    ( options, args ) = parser.parse_args()

    # Copy the options to the globals that the print functions read
    if options.servername:
        OPT_SERVERNAME = options.servername

    if options.wide:
        OPT_SKIP_EMPTY_RACKS = False

    # Hostnames on the command line imply the extended view
    if args:
        options.extended = True

    # Make sure that only one of the two views is left enabled
    if options.extended and PRINT_TABLE:
        options.table = False

    if options.table and PRINT_EXTENDED:
        options.extended = False

    if options.extended:
        print_extended( args )
    elif options.table:
        print_table()
    else:
        print( 'Something is wrong, bye!' )
        sys.exit( -1 )

    if options.summary:
        print_table_summary()
---|