Index: /trunk/jobmond/jobmond.py =================================================================== --- /trunk/jobmond/jobmond.py (revision 511) +++ /trunk/jobmond/jobmond.py (revision 512) @@ -462,4 +462,6 @@ """Submit job info list""" + global BATCH_API + self.dp.multicastGmetric( 'MONARCH-HEARTBEAT', str( int( int( self.cur_time ) + int( self.timeoffset ) ) ) ) @@ -483,4 +485,26 @@ self.dp.multicastGmetric( 'MONARCH-RJ', str( running_jobs ), 'uint32', 'jobs' ) self.dp.multicastGmetric( 'MONARCH-QJ', str( queued_jobs ), 'uint32', 'jobs' ) + + # Report down/offline nodes in batch (PBS only ATM) + # + if BATCH_API == 'pbs': + + downed_nodes = list() + offline_nodes = list() + + l = ['state'] + + for name, node in self.pq.getnodes().items(): + + if ( node[ 'state' ].find( "down" ) != -1 ): + + downed_nodes.append( name ) + + if ( node[ 'state' ].find( "offline" ) != -1 ): + + offline_nodes.append( name ) + + self.dp.multicastGmetric( 'MONARCH-DOWN' , str( downed_nodes ), 'uint32', 'jobs' ) + self.dp.multicastGmetric( 'MONARCH-OFFLINE', str( offline_nodes ), 'uint32', 'jobs' ) # Now let's spread the knowledge Index: /trunk/web/addons/job_monarch/overview.php =================================================================== --- /trunk/web/addons/job_monarch/overview.php (revision 511) +++ /trunk/web/addons/job_monarch/overview.php (revision 512) @@ -527,4 +527,7 @@ $queued_cpus = 0; + $na_nodes = 0; + $na_cpus = 0; + $total_nodes = 0; $total_cpus = 0; @@ -578,4 +581,11 @@ $rjqj_host = null; + $nodes_down = null; + $nodes_offline = null; + + $replacestr = array(); + $replacestr[] = "'"; + $replacestr[] = " "; + foreach( $metrics as $bhost => $bmetric ) { @@ -586,7 +596,42 @@ $rjqj_host = $bhost; } + if( ( $mname == 'MONARCH-DOWN' ) ) + { + $nodes_down = str_replace($replacestr,NULL,split(',',substr($mval['VAL'],1,-1))); + } + if( ( $mname == 'MONARCH-OFFLINE' ) ) + { + $nodes_offline = str_replace($replacestr,NULL,split(',',substr($mval['VAL'],1,-1))); + } } } + $nodes_counted = array(); + + if($nodes_down != NULL || $nodes_offline!=NULL ) + { + foreach( $metrics as $bh => $bm ) + { + if (in_array($bh,$nodes_offline) && $gnodes[$bh]) + { + $nodes_counted[] = $bh; + if(! $gnodes[$bh]->getJobs()) + { + $na_cpus += ($bm['cpu_num'][VAL]); + $na_nodes += 1; + } + } + if (in_array($bh,$nodes_down) && !in_array($bh,$nodes_counted) && $gnodes[$bh]) + { + $nodes_counted[] = $bh; + if(! $gnodes[$bh]->getJobs()) + { + $na_cpus += ($bm['cpu_num'][VAL]); + $na_nodes += 1; + } + } + } + } + // Running / queued amount jobs graph // @@ -875,7 +920,7 @@ $total_jobs = $queued_jobs + $running_jobs; - $free_nodes = $avail_nodes - $running_nodes; + $free_nodes = $avail_nodes - $running_nodes - $na_nodes; $free_nodes = ( $free_nodes >= 0 ) ? $free_nodes : 0; - $free_cpus = $avail_cpus - $running_cpus; + $free_cpus = $avail_cpus - $running_cpus - $na_cpus; $free_cpus = ( $free_cpus >= 0 ) ? $free_cpus : 0; @@ -886,4 +931,7 @@ $tpl->assignGlobal( "queued_jobs", $queued_jobs ); $tpl->assignGlobal( "queued_cpus", $queued_cpus ); + + $tpl->assignGlobal( "na_nodes", $na_nodes ); + $tpl->assignGlobal( "na_cpus", $na_cpus ); $tpl->assignGlobal( "total_nodes", $total_nodes ); Index: /trunk/web/addons/job_monarch/templates/overview.tpl =================================================================== --- /trunk/web/addons/job_monarch/templates/overview.tpl (revision 511) +++ /trunk/web/addons/job_monarch/templates/overview.tpl (revision 512) @@ -81,4 +81,19 @@