Changeset 622


Ignore:
Timestamp:
06/03/09 14:23:11 (15 years ago)
Author:
davel
Message:

Fixes for some SGE 6.0 breakage, plus an abortive (incomplete) start on SGE 6.2 support.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/jobmond/jobmond.py

    r579 r622  
    44#
    55# Copyright (C) 2006-2007  Ramon Bastiaans
    6 # Copyright (C) 2007  Dave Love  (SGE code)
     6# Copyright (C) 2007, 2009  Dave Love  (SGE code)
    77#
    88# Jobmonarch is free software; you can redistribute it and/or modify
     
    2626import xdrlib, socket, syslog, xml, xml.sax
    2727from xml.sax.handler import feature_namespaces
     28from collections import deque
    2829
    2930VERSION='0.3.1'
     
    9192        return loadConfig( config_filename )
    9293
     94# Fixme:  This doesn't DTRT with commented-out bits of the file.  E.g.
     95# it picked up a commented-out `mcast_join' and tried to use a
     96# multicast channel when it shouldn't have done.
    9397class GangliaConfigParser:
    9498
     
    722726                                if my_val_str:
    723727
    724                                         my_val_str = my_val_str + ' ' + val_name + '=' + val_value
     728                                        try:
     729                                                # fixme: It's getting
     730                                                # ('nodes', None) items
     731                                                my_val_str = my_val_str + ' ' + val_name + '=' + val_value
     732                                        except:
     733                                                pass
     734
    725735                                else:
    726736                                        my_val_str = val_name + '=' + val_value
     
    777787                        time.sleep( BATCH_POLL_INTERVAL )       
    778788
    779 # SGE code by Dave Love <fx@gnu.org>.  Tested with SGE 6.0u8 and 6.0u11.
    780 # Probably needs modification for SGE 6.1.  See also the fixmes.
     789# SGE code by Dave Love <fx@gnu.org>.  Tested with SGE 6.0u8 and 6.0u11.  May
     790# work with SGE 6.1 (else should be easily fixable), but definitely doesn't
     791# with 6.2.  See also the fixmes.
    781792
    782793class NoJobs (Exception):
     
    794805                self.in_joblist = False
    795806                self.lrequest = False
     807                self.eltq = deque()
    796808                xml.sax.handler.ContentHandler.__init__(self)
    797809
    798         # The structure of the output is as follows.  Unfortunately
    799         # it's voluminous, and probably doesn't scale to large
    800         # clusters/queues.
     810        # The structure of the output is as follows (for SGE 6.0).  It's
     811        # similar for 6.1, but radically different for 6.2, and is
     812        # undocumented generally.  Unfortunately it's voluminous, and probably
     813        # doesn't scale to large clusters/queues.
    801814
    802815        # <detailed_job_info  xmlns:xsd="http://www.w3.org/2001/XMLSchema">
     
    838851                if name == "djob_info": # job list
    839852                        self.in_joblist = True
    840                 elif name == "qmaster_response" and self.in_joblist: # job
     853                # The job container is "qmaster_response" in SGE 6.0
     854                # and 6.1, but "element" in 6.2.  This is only the very
     855                # start of what's necessary for 6.2, though (sigh).
     856                elif (name == "qmaster_response" or name == "element") \
     857                            and self.eltq[-1] == "djob_info": # job
    841858                        self.job = {"job_state": "U", "slots": 0,
    842859                                    "nodes": [], "queued_timestamp": "",
     
    851868                elif name == "unknown_jobs":
    852869                        raise NoJobs
     870                self.eltq.append (name)
    853871
    854872        def characters(self, ch):
     
    866884                  }
    867885                value = self.value
     886                self.eltq.pop ()
    868887
    869888                if name == "djob_info":
     
    9881007                # Output with args `-xml -ext -f -r' is easier to parse
    9891008                # in some ways, harder in others, but it doesn't provide
    990                 # the submission time, at least.
    991                 piping = popen2.Popen3("qstat -u '*' -j '*' -xml" + queues,
    992                                        True)
     1009                # the submission time (at least SGE 6.0).  The pipeline
     1010                # into sed corrects bogus XML observed with a configuration
     1011                # of SGE 6.0u8, which otherwise causes the parsing to hang.
     1012                piping = popen2.Popen3("qstat -u '*' -j '*' -xml | \
     1013sed -e 's/reported usage>/reported_usage>/g' -e 's;<\/*JATASK:.*>;;'" \
     1014                                               + queues, True)
    9931015                qstatparser = SgeQstatXMLParser()
    9941016                parse_err = 0
     
    10171039                        if job["status"] == "R":
    10181040                                job["nodes"] = do_nodelist (job["nodes"])
    1019                                 # Fixme: Is this right?
    1020                                 job["ppn"] = float(job["slots"]) / \
    1021                                     len(job["nodes"])
     1041                                # Fixme: why is job["nodes"] sometimes null?
     1042                                try:
     1043                                        # Fixme: Is this sensible?  The
     1044                                        # PBS-type PPN isn't something you use
     1045                                        # with SGE.
     1046                                        job["ppn"] = float(job["slots"]) / \
     1047                                            len(job["nodes"])
     1048                                except:
     1049                                        job["ppn"] = 0
    10221050                                if DETECT_TIME_DIFFS:
    10231051                                        # If a job start is later than our
Note: See TracChangeset for help on using the changeset viewer.