diff --git a/cgi/cloud_mode_active_cgi.py b/cgi/cloud_mode_active_cgi.py new file mode 100755 index 0000000..c6d4e80 --- /dev/null +++ b/cgi/cloud_mode_active_cgi.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python2.6 +import cgi +import os +print "Content-Type: text/html" # HTML is following +print + +try: + cloud = os.listdir('/etc/appliance/resources/cloud') + print len(cloud) +except Exception as ex: + print ex diff --git a/cgi/exclude_cgi.py b/cgi/exclude_cgi.py new file mode 100755 index 0000000..5fc9a86 --- /dev/null +++ b/cgi/exclude_cgi.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python2.6 +import cgi +import os +form = cgi.FieldStorage() +print "Content-Type: text/html" # HTML is following +print +print "CGI script exclude" + +try: + os.unlink('exclude') +except: + pass +fp = open('exclude','w+') +fp.close() + diff --git a/cgi/include_cgi.py b/cgi/include_cgi.py new file mode 100755 index 0000000..34d37e5 --- /dev/null +++ b/cgi/include_cgi.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python2.6 +import cgi +import os +form = cgi.FieldStorage() +print "Content-Type: text/html" # HTML is following +print +print "CGI script include" + +try: + os.unlink('include') +except: + pass +fp = open('include','w+') +fp.close() + diff --git a/cgi/suspend_cgi.py b/cgi/suspend_cgi.py index 6a2f97c..ad8e9b6 100755 --- a/cgi/suspend_cgi.py +++ b/cgi/suspend_cgi.py @@ -5,10 +5,15 @@ print "Content-Type: text/html" # HTML is following print print "CGI script suspend" + +portsuffix="" +if "port" in form: + portsuffix=form["port"].value + try: - os.unlink('suspend') + os.unlink('suspend'+portsuffix) except: pass -fp = open('suspend','w+') +fp = open('suspend'+portsuffix,'w+') fp.close() diff --git a/esplugins/head-master.zip b/esplugins/head-master.zip new file mode 100644 index 0000000..4d16a1e Binary files /dev/null and b/esplugins/head-master.zip differ diff --git a/esplugins/hq-master.zip b/esplugins/hq-master.zip new file mode 100644 index 0000000..6f50d38 Binary files /dev/null and b/esplugins/hq-master.zip differ diff --git a/esplugins/install.sh b/esplugins/install.sh index 7bd2e8f..1e63fda 100644 --- a/esplugins/install.sh +++ b/esplugins/install.sh @@ -1,4 +1,4 @@ cd $1 -echo installing elasticsearch plugins... -bin/plugin --url file:///opt/fff/esplugins/$2 --install $3 +echo installing elasticsearch plugin $3 ... +bin/plugin -s --url file:///opt/fff/esplugins/$2 --install $3 diff --git a/esplugins/paramedic-master.zip b/esplugins/paramedic-master.zip new file mode 100644 index 0000000..b0fc5e5 Binary files /dev/null and b/esplugins/paramedic-master.zip differ diff --git a/esplugins/uninstall.sh b/esplugins/uninstall.sh index c22303c..301411a 100644 --- a/esplugins/uninstall.sh +++ b/esplugins/uninstall.sh @@ -1,4 +1,5 @@ #!/bin/bash cd $1 -bin/plugin --remove $2 +echo uninstalling elastic plugin $2 ...
+bin/plugin -s --remove $2 diff --git a/etc/hltd.conf b/etc/hltd.conf index a91699b..c1643a4 100644 --- a/etc/hltd.conf +++ b/etc/hltd.conf @@ -1,5 +1,6 @@ [General] enabled = False +instance = main exec_directory = /opt/hltd user = daqlocal watch_directory = /fff/data @@ -9,19 +10,19 @@ mount_command = mount mount_type = nfs4 mount_options_ramdisk = rw,noatime,vers=4,rsize=65536,wsize=65536,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,noac mount_options_output = rw,vers=4,rsize=65536,wsize=65536,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys -micromerge_output = /fff/BU0/output delete_run_dir = True output_adler32 = True [Monitoring] use_elasticsearch = True -close_es_index = False +close_es_index = True es_cmssw_log_level = DISABLED es_hltd_log_level = ERROR es_local = localhost [Web] cgi_port = 9000 +cgi_instance_port_offset = 0 soap2file_port = 8010 [Resources] diff --git a/etc/instances.input b/etc/instances.input new file mode 100644 index 0000000..f826192 --- /dev/null +++ b/etc/instances.input @@ -0,0 +1,27 @@ +{ + "DISABLED-dvbu-c2f34-30-01": + { + "names":["main","testing"], + "sizes":[20,30] + }, + "DISABLED-dvrubu-c2f34-17-03": + { + "names":["testing"], + "sizes":[0] + }, + "DISABLED-dvrubu-c2f34-17-04": + { + "names":["testing"], + "sizes":[0] + }, + "bu-vm-01-01.cern.ch": + { + "names":["main","testing"], + "sizes":[1000,500] + }, + "fu-vm-02-02.cern.ch": + { + "names":["testing"], + "sizes":[0] + } +} diff --git a/json/runapplianceTemplate.json b/json/runapplianceTemplate.json index d410952..8578066 100644 --- a/json/runapplianceTemplate.json +++ b/json/runapplianceTemplate.json @@ -274,50 +274,35 @@ } } }, - "hltrates-legend": { + "qstatus": { "properties": { - "path-names": { - "type": "string", - "index": "not_analyzed" - }, - "dataset-names": { - "type": "string", - "index": "not_analyzed" - } - } - }, - "hltrates": { - "properties": { - "ls": { - "type": "integer" - }, - "pid": { - "type": "integer" - }, - "processed": { - "type": "integer" - }, - "path-wasrun": { + "numQueuedLS": { "type": "integer" }, - "path-afterl1seed": { + "maxQueuedLS": { "type": "integer" }, - "path-afterprescale": { + "numReadFromQueueLS": { "type": "integer" }, - "path-accepted": { + "maxClosedLS": { "type": "integer" }, - "path-rejected": { + "numReadOpenLS": { "type": "integer" }, - "path-errors": { - "type": "integer" + "fm_date": { + "type": "date" }, - "dataset-accepted": { - "type": "integer" + "host": { + "type": "string", + "index":"not_analyzed" } + }, + "_timestamp": { + "enabled": true, + "store": "yes", + "path": "fm_date" } }, "cmsswlog": { diff --git a/lib/python-procname/procnamemodule.c b/lib/python-procname/procnamemodule.c new file mode 100644 index 0000000..e447032 --- /dev/null +++ b/lib/python-procname/procnamemodule.c @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2008 Eugene A. Lisitsky + * + * The procname library for Python. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * */ + +#include +#include + +void Py_GetArgcArgv(int*, char***); + +PyDoc_STRVAR(procname__doc__, "Module for setting/getting process name"); + +static PyObject * +procname_check(PyObject *self, PyObject *args) { + return Py_BuildValue("i", 1); +}; + + +static PyObject * +procname_getprocname(PyObject *self, PyObject *args) { + int argc; + char **argv; + Py_GetArgcArgv(&argc, &argv); + return Py_BuildValue("s", argv[0]); +}; + + +static PyObject * +procname_setprocname(PyObject *self, PyObject *args) { + int argc; + char **argv; + char *name; + if (!PyArg_ParseTuple(args, "s", &name)) + return NULL; + Py_GetArgcArgv(&argc, &argv); + strncpy(argv[0], name , strlen(name)); + memset(&argv[0][strlen(name)], '\0', strlen(&argv[0][strlen(name)])); + prctl (15 /* PR_SET_NAME */, name, 0, 0, 0); + Py_INCREF(Py_None); + return Py_None; +}; + + +static PyMethodDef procname_methods[] = { + {"check", procname_check, METH_VARARGS, "Test func"}, + {"getprocname", procname_getprocname, METH_VARARGS, + "Get procname.\nReturns name (string)"}, + {"setprocname", procname_setprocname, METH_VARARGS, + "Set procname.\n name (string) -> new process name.\nReturns None."}, + {NULL, NULL, 0, NULL} +}; + +PyMODINIT_FUNC +initprocname(void) { + (void) Py_InitModule3("procname", procname_methods, procname__doc__); +} + diff --git a/lib/python-procname/setup.py b/lib/python-procname/setup.py new file mode 100755 index 0000000..cf97d9e --- /dev/null +++ b/lib/python-procname/setup.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python + +import distutils.core +import distutils.util + +platform = distutils.util.get_platform() + + +distutils.core.setup( + name='procname', + version='0.1', + description='Process name renaming', + author="Eugene A Lisitsky", + license='LGPL', + platforms='Linux', + ext_modules=[distutils.core.Extension('procname', sources=['procnamemodule.c'])], + ) diff --git a/python/aUtils.py b/python/aUtils.py index df226e3..76c9499 100644 --- a/python/aUtils.py +++ b/python/aUtils.py @@ -2,17 +2,19 @@ import os,stat import time,datetime import shutil -import json +import simplejson as json import logging import zlib import subprocess +import threading +#import fcntl from inotifywrapper import InotifyWrapper import _inotify as inotify ES_DIR_NAME = "TEMP_ES_DIRECTORY" -UNKNOWN,OUTPUTJSD,JSD,STREAM,INDEX,FAST,SLOW,OUTPUT,STREAMERR,STREAMDQMHISTOUTPUT,INI,EOLS,EOR,COMPLETE,DAT,PDAT,PJSNDATA,PIDPB,PB,CRASH,MODULELEGEND,PATHLEGEND,BOX,BOLS,HLTRATES,HLTRATESLEGEND = range(26) #file types +UNKNOWN,OUTPUTJSD,DEFINITION,STREAM,INDEX,FAST,SLOW,OUTPUT,STREAMERR,STREAMDQMHISTOUTPUT,INI,EOLS,EOR,COMPLETE,DAT,PDAT,PJSNDATA,PIDPB,PB,CRASH,MODULELEGEND,PATHLEGEND,BOX,BOLS,QSTATUS = range(25) #file types TO_ELASTICIZE = [STREAM,INDEX,OUTPUT,STREAMERR,STREAMDQMHISTOUTPUT,EOLS,EOR,COMPLETE] TEMPEXT = ".recv" ZEROLS = 'ls0000' @@ -40,6 +42,16 @@ def __init__(self,recursiveMode=False): self.logger = logging.getLogger(self.__class__.__name__) self.eventQueue = False self.inotifyWrapper = InotifyWrapper(self,recursiveMode) + self.queueStatusPath = None + self.queueStatusPathMon = None + self.queueStatusPathDir = None + self.queuedLumiList = [] + self.maxQueuedLumi=-1 + #max seen/closed by anelastic thread + self.maxReceivedEoLS=-1 + self.maxClosedLumi=-1 + self.numOpenLumis=-1 + self.lock = threading.Lock() def 
register_inotify_path(self,path,mask): self.inotifyWrapper.registerPath(path,mask) @@ -48,20 +60,101 @@ def start_inotify(self): self.inotifyWrapper.start() def stop_inotify(self): - logging.info("MonitorRanger: Stop inotify wrapper") + self.logger.info("MonitorRanger: Stop inotify wrapper") self.inotifyWrapper.stop() - logging.info("MonitorRanger: Join inotify wrapper") + self.logger.info("MonitorRanger: Join inotify wrapper") self.inotifyWrapper.join() - logging.info("MonitorRanger: Inotify wrapper returned") + self.logger.info("MonitorRanger: Inotify wrapper returned") def process_default(self, event): self.logger.debug("event: %s on: %s" %(str(event.mask),event.fullpath)) if self.eventQueue: - self.eventQueue.put(event) + + if self.queueStatusPath!=None: + if self.checkNewLumi(event): + self.eventQueue.put(event) + else: + self.eventQueue.put(event) def setEventQueue(self,queue): self.eventQueue = queue + def checkNewLumi(self,event): + if event.fullpath.endswith("_EoLS.jsn"): + try: + queuedLumi = int(os.path.basename(event.fullpath).split('_')[1][2:]) + self.lock.acquire() + if queuedLumi not in self.queuedLumiList: + if queuedLumi>self.maxQueuedLumi: + self.maxQueuedLumi=queuedLumi + self.queuedLumiList.append(queuedLumi) + self.lock.release() + self.updateQueueStatusFile() + else: + self.lock.release() + #skip if EoL for LS in queue has already been written once (e.g. double file create race) + return False + except Exception as ex: + self.logger.warning("Problem checking new EoLS filename: "+str(os.path.basename(event.fullpath)) + " error:"+str(ex)) + try:self.lock.release() + except:pass + return True + + def notifyLumi(self,ls,maxReceivedEoLS,maxClosedLumi,numOpenLumis): + if self.queueStatusPath==None:return + self.lock.acquire() + if ls!=None and ls in self.queuedLumiList: + self.queuedLumiList.remove(ls) + self.maxReceivedEoLS=maxReceivedEoLS + self.maxClosedLumi=maxClosedLumi + self.numOpenLumis=numOpenLumis + self.lock.release() + self.updateQueueStatusFile() + + def setQueueStatusPath(self,path,monpath): + self.queueStatusPath = path + self.queueStatusPathMon = monpath + self.queueStatusPathDir = path[:path.rfind('/')] + + def updateQueueStatusFile(self): + if self.queueStatusPath==None:return + num_queued_lumis = len(self.queuedLumiList) + if not os.path.exists(self.queueStatusPathDir): + self.logger.error("No directory to write queueStatusFile: "+str(self.queueStatusPathDir)) + else: + self.logger.info("Update status file - queued lumis:"+str(num_queued_lumis)+ " EoLS:: max queued:"+str(self.maxQueuedLumi) \ +" un-queued:"+str(self.maxReceivedEoLS)+" Lumis:: last closed:"+str(self.maxClosedLumi)+ " num open:"+str(self.numOpenLumis)) + #write json + doc = {"numQueuedLS":num_queued_lumis, + "maxQueuedLS":self.maxQueuedLumi, + "numReadFromQueueLS":self.maxReceivedEoLS, + "maxClosedLS":self.maxClosedLumi, + "numReadOpenLS":self.numOpenLumis + } + try: + if self.queueStatusPath!=None: + attempts=3 + while attempts>0: + try: + with open(self.queueStatusPath+TEMPEXT,"w") as fp: + #fcntl.flock(fp, fcntl.LOCK_EX) + json.dump(doc,fp) + os.rename(self.queueStatusPath+TEMPEXT,self.queueStatusPath) + break + except Exception as ex: + attempts-=1 + if attempts==0: + raise ex + self.logger.warning("Unable to write status file, with error:" + str(ex)+".retrying...") + time.sleep(0.05) + try: + shutil.copyfile(self.queueStatusPath,self.queueStatusPathMon) + except: + pass + except Exception as ex: + self.logger.error("Unable to open/write " + self.queueStatusPath) + self.logger.exception(ex) + class 
fileHandler(object): def __eq__(self,other): @@ -106,6 +199,7 @@ def getFiletype(self,filepath = None): if not filepath: filepath = self.filepath filename = self.basename name,ext = self.name,self.ext + if ext==TEMPEXT:return UNKNOWN name = name.upper() if "mon" not in filepath: if ext == ".dat" and "_PID" not in name: return DAT @@ -113,26 +207,26 @@ def getFiletype(self,filepath = None): if ext == ".jsndata" and "_PID" in name: return PJSNDATA if ext == ".ini" and "_PID" in name: return INI if ext == ".jsd" and "OUTPUT_" in name: return OUTPUTJSD - if ext == ".jsd" : return JSD + if ext == ".jsd" : return DEFINITION if ext == ".jsn": if STREAMERRORNAME.upper() in name: return STREAMERR - elif "BOLS" in name : return BOLS - elif "STREAM" in name and "_PID" in name: return STREAM - elif "INDEX" in name and "_PID" in name: return INDEX - elif "CRASH" in name and "_PID" in name: return CRASH - elif "EOLS" in name: return EOLS - elif "EOR" in name: return EOR + elif "_BOLS" in name : return BOLS + elif "_STREAM" in name and "_PID" in name: return STREAM + elif "_INDEX" in name and "_PID" in name: return INDEX + elif "_CRASH" in name and "_PID" in name: return CRASH + elif "_EOLS" in name: return EOLS + elif "_EOR" in name: return EOR + elif "_TRANSFER" in name: return DEFINITION if ext==".jsn": if STREAMDQMHISTNAME.upper() in name and "_PID" not in name: return STREAMDQMHISTOUTPUT - if "STREAM" in name and "_PID" not in name: return OUTPUT - if "_HLTRATESLEGEND" in name: return HLTRATESLEGEND - elif "_HLTRATES" in name: return HLTRATES + if "_STREAM" in name and "_PID" not in name: return OUTPUT + if name.startswith("QUEUE_STATUS"): return QSTATUS if ext==".pb": if "_PID" not in name: return PB else: return PIDPB if name.endswith("COMPLETE"): return COMPLETE - if ".fast" in filename: return FAST - if "slow" in filename: return SLOW + if ext == ".fast" in filename: return FAST + if ext == ".slow" in filename: return SLOW if ext == ".leg" and "MICROSTATELEGEND" in name: return MODULELEGEND if ext == ".leg" and "PATHLEGEND" in name: return PATHLEGEND if "boxes" in filepath : return BOX @@ -149,7 +243,6 @@ def getFileHeaders(self): elif filetype in [DAT,PB,OUTPUT,STREAMERR,STREAMDQMHISTOUTPUT]: self.run,self.ls,self.stream,self.host = splitname elif filetype == INDEX: self.run,self.ls,self.index,self.pid = splitname elif filetype == EOLS: self.run,self.ls,self.eols = splitname - elif filetype == HLTRATES:self.run,self.ls,self.ftype,self.pid = splitname else: self.logger.warning("Bad filetype: %s" %self.filepath) self.run,self.ls,self.stream = [None]*3 @@ -167,11 +260,12 @@ def getBoxData(self,filepath = None): data = fi.read() data = data.strip(sep).split(sep) data = dict([d.split('=') for d in data]) + except IOError,e: + data = {} except StandardError,e: self.logger.exception(e) data = {} - return data #get data from json file @@ -247,7 +341,12 @@ def setFieldByName(self,field,value,warning=True): #get definitions from jsd file def getDefinitions(self): if self.filetype in [STREAM]: + #try: self.jsdfile = self.data["definition"] + #except: + # self.logger.error("no definition field in "+str(self.filepath)) + # self.definitions = {} + # return False elif not self.jsdfile: self.logger.warning("jsd file not set") self.definitions = {} @@ -256,10 +355,11 @@ def getDefinitions(self): return True - def deleteFile(self): + def deleteFile(self,silent=False): #return True filepath = self.filepath - self.logger.info(filepath) + if silent==False: + self.logger.info(filepath) if os.path.isfile(filepath): 
try: os.remove(filepath) @@ -389,21 +489,37 @@ def writeout(self,empty=False): return False return True + #TODO:make sure that the file is copied only once def esCopy(self): if not self.exists(): return if self.filetype in TO_ELASTICIZE: esDir = os.path.join(self.dir,ES_DIR_NAME) if os.path.isdir(esDir): + newpathTemp = os.path.join(esDir,self.basename+TEMPEXT) newpath = os.path.join(esDir,self.basename) retries = 5 while True: try: - shutil.copy(self.filepath,newpath) + shutil.copy(self.filepath,newpathTemp) + break + except (OSError,IOError),e: + retries-=1 + if retries == 0: + self.logger.exception(e) + return + #raise e #non-critical exception + else: + time.sleep(0.5) + retries = 5 + while True: + try: + os.rename(newpathTemp,newpath) break except (OSError,IOError),e: retries-=1 if retries == 0: self.logger.exception(e) + return #raise e #non-critical exception else: time.sleep(0.5) diff --git a/python/anelastic.py b/python/anelastic.py index 99428ef..63db0b2 100755 --- a/python/anelastic.py +++ b/python/anelastic.py @@ -11,7 +11,7 @@ import _inotify as inotify import threading import Queue -import json +import simplejson as json import logging @@ -21,8 +21,9 @@ class LumiSectionRanger(): host = os.uname()[1] - def __init__(self,tempdir,outdir,run_number): + def __init__(self,mr,tempdir,outdir,run_number): self.logger = logging.getLogger(self.__class__.__name__) + self.mr = mr self.stoprequest = threading.Event() self.emptyQueue = threading.Event() self.firstStream = threading.Event() @@ -41,7 +42,10 @@ def __init__(self,tempdir,outdir,run_number): self.jsdfile = None self.buffer = [] # file list before the first stream file self.emptyOutTemplate = None - + self.useTimeout=60 + self.maxQueuedLumi=0 + self.maxReceivedEoLS=0 + self.maxClosedLumi=0 def join(self, stop=False, timeout=None): @@ -52,7 +56,8 @@ def join(self, stop=False, timeout=None): def start(self): self.run() - def stop(self): + def stop(self,timeout=60): + self.useTimeout=timeout self.stoprequest.set() def setSource(self,source): @@ -71,11 +76,15 @@ def run(self): self.process() except (KeyboardInterrupt,Queue.Empty) as e: self.emptyQueue.set() + except Exception as ex: + self.logger.exception(ex) + self.logger.fatal("Exiting on unhandled exception") + os._exit(1) else: time.sleep(0.5) #allow timeout in case 'complete' file is received and lumi is not closed if self.stoprequest.isSet() and self.emptyQueue.isSet() and self.checkClosure()==False: - if endTimeout<=-1: endTimeout=100 + if endTimeout<=-1: endTimeout=self.useTimeout*2 if endTimeout==0: break endTimeout-=1 @@ -105,8 +114,8 @@ def process(self): eventtype = self.eventtype if eventtype:# & inotify.IN_CLOSE_WRITE: - if filetype == JSD: - self.processJsdFile() + if filetype == DEFINITION: + self.processDefinitionFile() if filetype == OUTPUTJSD and not self.jsdfile: self.jsdfile=self.infile.filepath self.createEmptyOutputTemplate() @@ -119,18 +128,28 @@ def process(self): elif filetype in [STREAM,STREAMDQMHISTOUTPUT,INDEX,EOLS,DAT,PB]: run,ls = (self.infile.run,self.infile.ls) key = (run,ls) + ls_num=int(ls[2:]) if filetype == EOLS : + if self.maxReceivedEoLS=0: - if numFiles == 1: - #fastHadd crashes trying to merge only one file - os.rename(command_args[4],command_args[3]) - else: - p = subprocess.Popen(command_args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT) - p.wait() - if p.returncode!=0: - self.logger.error('fastHadd returned with exit code '+str(p.returncode)+' and response: ' + str(p.communicate()) + '. 
Merging parameters given:'+str(command_args) +' ,file sizes(B):'+str(inFileSizes)) - #DQM more verbose debugging - try: - filesize = os.stat(fullOutputPath).st_size - self.logger.error('fastHadd reported to fail at merging, while output pb file exists! '+ fullOutputPath + ' with size(B): '+str(filesize)) - except: - pass - outfile.setFieldByName('ReturnCodeMask', str(p.returncode)) - hasError=True - if True: - if numFiles==1: - try: - filesize = os.stat(fullOutputPath).st_size - except: - self.logger.error('Error checking fastHadd output file size: '+ fullOutputPath) - hasError=True - try: - os.chmod(fullOutputPath,0666) - except: - self.logger.error('Error fixing permissions of fastHadd output file: '+ fullOutputPath) - if numFiles>1: - for f in command_args[4:]: - try: - if hasError==False:os.remove(f) - except OSError as ex: - self.logger.warning('exception removing file '+f+' : '+str(ex)) + p = subprocess.Popen(command_args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT) + p.wait() + if p.returncode!=0: + self.logger.error('fastHadd returned with exit code '+str(p.returncode)+' and response: ' + str(p.communicate()) + '. Merging parameters given:'+str(command_args) +' ,file sizes(B):'+str(inFileSizes)) + #DQM more verbose debugging + try: + filesize = os.stat(fullOutputPath).st_size + self.logger.error('fastHadd reported to fail at merging, while output pb file exists! '+ fullOutputPath + ' with size(B): '+str(filesize)) + except: + pass + outfile.setFieldByName('ReturnCodeMask', str(p.returncode)) + hasError=True + + for f in command_args[4:]: + try: + if hasError==False:os.remove(f) + except OSError as ex: + self.logger.warning('exception removing file '+f+' : '+str(ex)) else: hasError=True @@ -830,8 +869,14 @@ def abortMerging(self): if __name__ == "__main__": + + import procname + procname.setprocname('anelastic') + + conf=initConf() + logging.basicConfig(filename=os.path.join(conf.log_dir,"anelastic.log"), - level=logging.INFO, + level=conf.service_log_level, format='%(levelname)s:%(asctime)s - %(funcName)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger(os.path.basename(__file__)) @@ -848,7 +893,7 @@ def abortMerging(self): rawinputdir = sys.argv[3] dirname = os.path.basename(os.path.normpath(dirname)) watchDir = os.path.join(conf.watch_directory,dirname) - outputDir = conf.micromerge_output + outputDir = sys.argv[4] dqmHandler = None @@ -868,11 +913,12 @@ def abortMerging(self): #starting inotify thread mr = MonitorRanger() mr.setEventQueue(eventQueue) + mr.setQueueStatusPath(os.path.join(watchDir,"open","queue_status.jsn"),os.path.join(watchDir,"mon","queue_status.jsn")) mr.register_inotify_path(watchDir,mask) mr.start_inotify() #starting lsRanger thread - ls = LumiSectionRanger(watchDir,outputDir,run_number) + ls = LumiSectionRanger(mr,watchDir,outputDir,run_number) ls.setSource(eventQueue) ls.start() diff --git a/python/applianceumount.py b/python/applianceumount.py index 523e259..ee3c13f 100644 --- a/python/applianceumount.py +++ b/python/applianceumount.py @@ -36,8 +36,8 @@ def run(self): os.symlink('/opt/hltd/cgi',self.watch_directory+'/cgi-bin') handler.cgi_directories = ['/cgi-bin'] - print("starting http server on port "+str(self.cgi_port+5)) - self.httpd = BaseHTTPServer.HTTPServer(("", self.cgi_port+5), handler) + print("starting http server on port "+str(self.cgi_port+20)) + self.httpd = BaseHTTPServer.HTTPServer(("", self.cgi_port+20), handler) self.httpd.serve_forever() self.finished=True @@ -51,9 +51,10 @@ def run(self): def stop(self): 
self.httpd.shutdown() -def checkMode(): +def checkMode(instance): try: hltdconf='/etc/hltd.conf' + if instance != "main": hltdconf='/etc/hltd-'+instance+'.conf' with open(hltdconf,'r') as f: for l in f.readlines(): ls=l.strip(' \n') @@ -63,31 +64,37 @@ def checkMode(): pass return "unknown" -def stopFUs(): +def stopFUs(instance): hltdconf='/etc/hltd.conf' watch_directory='/fff/ramdisk' + if instance != "main": hltdconf='/etc/hltd-'+instance+'.conf' machine_is_bu=False machine_is_fu=False cgi_port=9000 + cgi_offset=0 try: f=open(hltdconf,'r') for l in f.readlines(): ls=l.strip(' \n') - if not ls.startswith('#') and ls.startswith('watch_directory'): + if ls.startswith('watch_directory'): watch_directory=ls.split('=')[1].strip(' ') - if not ls.startswith('#') and ls.startswith('role'): + elif ls.startswith('role'): if 'bu' in ls.split('=')[1].strip(' '): machine_is_bu=True if 'fu' in ls.split('=')[1].strip(' ')=='fu': machine_is_fu=True - if not ls.startswith('#') and ls.startswith('cgi_port'): + elif ls.startswith('cgi_instance_port_offset'): + cgi_offset=int(ls.split('=')[1].strip(' ')) + elif ls.startswith('cgi_port'): cgi_port=int(ls.split('=')[1].strip(' ')) f.close() except Exception as ex: - print "Unable to read parameters",str(ex),"using defaults" + if instance!="main": raise ex + else: + print "Unable to read parameters",str(ex),"using defaults" if machine_is_bu==False:return True - syslog.syslog("hltd:Initiating FU unmount procedure") + syslog.syslog("hltd-"+str(instance)+": initiating FU unmount procedure") #continue with notifying FUs boxinfodir=os.path.join(watch_directory,'appliance/boxes') @@ -106,15 +113,16 @@ def stopFUs(): current_time = time.time() age = current_time - os.path.getmtime(os.path.join(boxinfodir,machine)) print "found machine",machine," which is ",str(age)," seconds old" - syslog.syslog("hltd: found machine "+str(machine) + " which is "+ str(age)+" seconds old") + syslog.syslog("hltd-"+str(instance)+": found machine "+str(machine) + " which is "+ str(age)+" seconds old") if age < 30: if receiver==None: receiver = UmountResponseReceiver(watch_directory,cgi_port) receiver.start() time.sleep(1) try: - connection = httplib.HTTPConnection(machine, cgi_port,timeout=5) - connection.request("GET",'cgi-bin/suspend_cgi.py') + #subtract cgi offset when connecting machine + connection = httplib.HTTPConnection(machine, cgi_port-cgi_offset,timeout=5) + connection.request("GET",'cgi-bin/suspend_cgi.py?port='+str(cgi_port)) response = connection.getresponse() machinelist.append(machine) except: @@ -133,7 +141,7 @@ def stopFUs(): machinePending=True activeMachines.append(machine) - syslog.syslog("hltd: waiting for machines to respond:"+str(activeMachines)) + syslog.syslog("hltd-"+str(instance)+": waiting for machines to respond:"+str(activeMachines)) if machinePending: usedTimeout+=2 time.sleep(2) @@ -142,12 +150,12 @@ def stopFUs(): except: #handle interrupt print "Interrupted!" 
- syslog.syslog("hltd: FU suspend was interrupted") + syslog.syslog("hltd-"+str(instance)+": FU suspend was interrupted") count=0 if receiver!=None: while receiver.finished==False: count+=1 - if count%100==0:syslog.syslog("hltd stop: trying to stop suspend receiver HTTP server thread (script interrupted)") + if count%100==0:syslog.syslog("hltd-"+str(instance)+": stop: trying to stop suspend receiver HTTP server thread (script interrupted)") try: receiver.stop() time.sleep(.1) @@ -161,7 +169,7 @@ def stopFUs(): if receiver!=None: while receiver.finished==False: count+=1 - if count%100==0:syslog.syslog("hltd stop: trying to stop suspend receiver HTTP server thread") + if count%100==0:syslog.syslog("hltd-"+str(instance)+": stop: trying to stop suspend receiver HTTP server thread") try: receiver.stop() time.sleep(.1) @@ -172,10 +180,10 @@ def stopFUs(): print "Finished FU suspend for:",str(machinelist) print "Not successful:",str(activeMachines) - syslog.syslog("hltd: unmount script completed. remaining machines :"+str(activeMachines)) + syslog.syslog("hltd-"+str(instance)+": unmount script completed. remaining machines :"+str(activeMachines)) if usedTimeout==maxTimeout: print "FU suspend failed for hosts:",activeMachines - syslog.syslog("hltd: FU suspend failed for hosts"+str(activeMachines)) + syslog.syslog("hltd-"+str(instance)+": FU suspend failed for hosts"+str(activeMachines)) return False return True diff --git a/python/daemon2.py b/python/daemon2.py index 97e51f6..a5c78c6 100644 --- a/python/daemon2.py +++ b/python/daemon2.py @@ -17,12 +17,25 @@ class Daemon2: attn: May change in the near future to use PEP daemon """ - def __init__(self, pidfile, processname, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'): + def __init__(self, processname, instance, confname=None, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'): self.stdin = stdin self.stdout = stdout self.stderr = stderr - self.pidfile = pidfile self.processname = processname + self.instance = instance + if confname==None:confname=processname + if instance=="main": + instsuffix="" + self.instancemsg="" + else: + instsuffix="-"+instance + self.instancemsg=" instance"+instance + + self.pidfile = "/var/run/" + processname + instsuffix + ".pid" + self.conffile = "/etc/" + confname + instsuffix + ".conf" + self.lockfile = '/var/lock/subsys/'+processname + instsuffix + + def daemonize(self): @@ -35,7 +48,7 @@ def daemonize(self): pid = os.fork() if pid > 0: # exit first parent - sys.exit(0) + return -1 except OSError, e: sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror)) sys.exit(1) @@ -71,14 +84,21 @@ def daemonize(self): atexit.register(self.delpid) pid = str(os.getpid()) file(self.pidfile,'w+').write("%s\n" % pid) + return 0 def delpid(self): - os.remove(self.pidfile) + if os.path.exists(self.pidfile): + os.remove(self.pidfile) def start(self): """ Start the daemon """ + if not os.path.exists(self.conffile): + print "Missing "+self.conffile+" - can not start instance" + #raise Exception("Missing "+self.conffile) + sys.exit(4) # Check for a pidfile to see if the daemon already runs + try: pf = file(self.pidfile,'r') pid = int(pf.read().strip()) @@ -89,10 +109,13 @@ def start(self): if pid: message = "pidfile %s already exists. 
Daemon already running?\n" sys.stderr.write(message % self.pidfile) - sys.exit(1) + sys.exit(3) # Start the daemon - self.daemonize() - self.run() + ret = self.daemonize() + if ret == 0: + self.run() + ret = 0 + return ret def status(self): """ @@ -107,16 +130,22 @@ def status(self): except IOError: pid = None if not pid: - message = self.processname+" not running, no pidfile %s\n" + message = self.processname + self.instancemsg +" not running, no pidfile %s\n" else: try: os.kill(pid,0) - message = self.processname+" is running with pidfile %s\n" + message = self.processname + self.instancemsg + " is running with pidfile %s\n" retval = True + except OSError as ex: + if ex.errno==1: + message = self.processname + self.instancemsg + " is running with pidfile %s\n" + else: + message = self.processname + self.instancemsg + " pid exist in %s but process is not running\n" except: - message = self.processname+" pid exist in %s but process is not running\n" + message = self.processname + self.instancemsg + " pid exist in %s but process is not running\n" + #should return true for puppet to detect service crash (also when stopped) - sys.stderr.write(message % self.pidfile) + sys.stdout.write(message % self.pidfile) return retval def silentStatus(self): @@ -132,7 +161,7 @@ def silentStatus(self): except IOError: pid = None if not pid: - message = self.processname+" not running, no pidfile %s\n" + message = self.processname + self.instancemsg +" not running, no pidfile %s\n" else: try: os.kill(pid,0) @@ -155,12 +184,18 @@ def stop(self): pid = None if not pid: - message = "pidfile %s does not exist. Daemon not running?\n" - sys.stderr.write(message % self.pidfile) + message = " not running, no pidfile %s\n" + sys.stdout.write(message % self.pidfile) + sys.stdout.flush() return # not an error in a restart # Try killing the daemon process + processPresent=False try: + #check is process is alive + os.kill(pid,0) + processPresent=True + sys.stdout.flush() # signal the daemon to stop timeout = 5.0 #kill timeout os.kill(pid, SIGINT) @@ -183,25 +218,37 @@ def stop(self): time.sleep(0.5) timeout-=0.5 except OSError, err: + time.sleep(.1) err = str(err) if err.find("No such process") > 0: #this handles the successful stopping of the daemon... if os.path.exists(self.pidfile): - print 'removing pidfile' - os.remove(self.pidfile) - sys.stdout.write('[OK]\n') - sys.stdout.flush() + if processPresent==False: + sys.stdout.write(" process "+str(pid)+" is dead. Removing pidfile" + self.pidfile+ " pid:" + str(pid)) + try: + os.remove(self.pidfile) + except Exception as ex: + sys.stdout.write(' [ \033[1;31mFAILED\033[0;39m ]\n') + sys.stderr.write(str(ex)+'\n') + sys.exit(1) + elif not os.path.exists(self.pidfile): + if processPresent==False: + sys.stdout.write(' service is not running') else: - print str(err) + sys.stdout.write(' [ \033[1;31mFAILED\033[0;39m ]\n') + sys.stderr.write(str(err)+'\n') sys.exit(1) - sys.stdout.write('[OK]\n') + + if (self.processname!="hltd"):sys.stdout.write("\t\t") + sys.stdout.write('\t\t\t [ \033[1;32mOK\033[0;39m ]\n') + sys.stdout.flush() def restart(self): """ Restart the daemon """ self.stop() - self.start() + return self.start() def run(self): """ @@ -212,7 +259,7 @@ def run(self): def emergencyUmount(self): cfg = ConfigParser.SafeConfigParser() - cfg.read('/etc/hltd.conf') + cfg.read(self.conffile) bu_base_dir=None#/fff/BU0? 
ramdisk_subdirectory = 'ramdisk' @@ -229,7 +276,7 @@ def emergencyUmount(self): process = subprocess.Popen(['mount'],stdout=subprocess.PIPE) out = process.communicate()[0] mounts = re.findall('/'+bu_base_dir+'[0-9]+',out) - if len(mounts)>1 and mounts[0]==mounts[1]: mounts=[mounts[0]] + mounts = sorted(list(set(mounts))) for point in mounts: sys.stdout.write("trying emergency umount of "+point+"\n") try: @@ -237,7 +284,8 @@ def emergencyUmount(self): except subprocess.CalledProcessError, err1: pass except Exception as ex: - sys.stdout.write(ex.args[0]+"\n") + #ok(legacy mountpoint) + pass try: subprocess.check_call(['umount',os.path.join('/'+point,ramdisk_subdirectory)]) except subprocess.CalledProcessError, err1: @@ -252,4 +300,20 @@ def emergencyUmount(self): sys.stdout.write(str(err1.returncode)+"\n") except Exception as ex: sys.stdout.write(ex.args[0]+"\n") + + + def touchLockFile(self): + try: + with open(self.lockfile,"w+") as fi: + pass + except: + pass + + def removeLockFile(self): + try: + os.unlink(self.lockfile) + except: + pass + + diff --git a/python/elastic.py b/python/elastic.py index 28fccdd..f5c6048 100755 --- a/python/elastic.py +++ b/python/elastic.py @@ -25,7 +25,6 @@ def __init__(self, esDir, inMonDir): self.inputMonDir = inMonDir self.movedModuleLegend = False self.movedPathLegend = False - self.processedHLTRatesLegend = False def start(self): self.run() @@ -44,7 +43,11 @@ def run(self): self.emptyQueue.clear() self.process() except (KeyboardInterrupt,Queue.Empty) as e: - self.emptyQueue.set() + self.emptyQueue.set() + except Exception as ex: + self.logger.exception(ex) + self.logger.fatal("Exiting on unhandled exception") + os._exit(1) else: time.sleep(0.5) @@ -60,8 +63,8 @@ def process(self): infile = self.infile filetype = infile.filetype eventtype = self.eventtype - if eventtype & inotify.IN_CLOSE_WRITE: - if filetype in [FAST,SLOW]: + if eventtype & (inotify.IN_CLOSE_WRITE | inotify.IN_MOVED_TO) : + if filetype in [FAST,SLOW,QSTATUS]: self.elasticize() elif self.esDirName in infile.dir: if filetype in [INDEX,STREAM,OUTPUT,STREAMDQMHISTOUTPUT]:self.elasticize() @@ -85,13 +88,6 @@ def process(self): logger.error(ex) pass self.movedPathLegend = True - elif filetype == HLTRATES: - self.logger.debug('received json HLT rates') - self.elasticize() - elif filetype == HLTRATESLEGEND and self.processedHLTRatesLegend==False: - self.logger.debug('received json HLT legend rates') - self.elasticize() - @@ -106,47 +102,46 @@ def elasticize(self): elif filetype == SLOW: es.elasticize_prc_sstate(infile) self.logger.debug(name+" going into prc-sstate") - self.infile.deleteFile() + self.infile.deleteFile(silent=True) elif filetype == INDEX: self.logger.info(name+" going into prc-in") es.elasticize_prc_in(infile) - self.infile.deleteFile() + self.infile.deleteFile(silent=True) elif filetype == STREAM: self.logger.info(name+" going into prc-out") es.elasticize_prc_out(infile) - self.infile.deleteFile() + self.infile.deleteFile(silent=True) elif filetype in [OUTPUT,STREAMDQMHISTOUTPUT]: self.logger.info(name+" going into fu-out") es.elasticize_fu_out(infile) - self.infile.deleteFile() + self.infile.deleteFile(silent=True) + elif filetype == QSTATUS: + self.logger.debug(name+" going into qstatus") + es.elasticize_queue_status(infile) elif filetype == COMPLETE: self.logger.info(name+" going into fu-complete") dt=os.path.getctime(infile.filepath) completed = datetime.datetime.utcfromtimestamp(dt).isoformat() es.elasticize_fu_complete(completed) - self.infile.deleteFile() + 
self.infile.deleteFile(silent=True) self.stop() - elif filetype == HLTRATESLEGEND: - if self.processedHLTRatesLegend==False: - es.elasticize_hltrateslegend(infile) - self.processedHLTRatesLegend=True - self.infile.deleteFile() - elif filetype == HLTRATES: - self.logger.info(name+" going into hlt-rates") - es.elasticize_hltrates(infile) - self.infile.deleteFile() def elasticizeLS(self): ls = self.infile.ls es.flushLS(ls) - self.infile.deleteFile() + self.infile.deleteFile(silent=True) if __name__ == "__main__": + + import procname + procname.setprocname('elastic') + + conf=initConf() logging.basicConfig(filename=os.path.join(conf.log_dir,"elastic.log"), - level=logging.INFO, + level=conf.service_log_level, format='%(levelname)s:%(asctime)s - %(funcName)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger(os.path.basename(__file__)) @@ -165,17 +160,14 @@ def elasticizeLS(self): expected_processes = int(sys.argv[3]) indexSuffix = conf.elastic_cluster update_modulo=conf.fastmon_insert_modulo - dirname = os.path.basename(os.path.normpath(dirname)) - watchDir = os.path.join(conf.watch_directory,dirname)#??? - outputDir = conf.micromerge_output - monDir = os.path.join(watchDir,"mon") - tempDir = os.path.join(watchDir,ES_DIR_NAME) + rundirname = os.path.basename(os.path.normpath(dirname)) + monDir = os.path.join(dirname,"mon") + tempDir = os.path.join(dirname,ES_DIR_NAME) - mask = inotify.IN_CLOSE_WRITE | inotify.IN_MOVED_TO - monMask = inotify.IN_CLOSE_WRITE - tempMask = inotify.IN_CLOSE_WRITE + monMask = inotify.IN_CLOSE_WRITE | inotify.IN_MOVED_TO + tempMask = inotify.IN_CLOSE_WRITE | inotify.IN_MOVED_TO - logger.info("starting elastic for "+dirname) + logger.info("starting elastic for "+rundirname[:3]+' '+rundirname[3:]) try: os.makedirs(monDir) @@ -191,12 +183,11 @@ def elasticizeLS(self): #starting inotify thread mr = MonitorRanger() mr.setEventQueue(eventQueue) - #mr.register_inotify_path(watchDir,mask) mr.register_inotify_path(monDir,monMask) mr.register_inotify_path(tempDir,tempMask) mr.start_inotify() - es = elasticBand.elasticBand('http://'+conf.es_local+':9200',dirname,indexSuffix,expected_processes,update_modulo) + es = elasticBand.elasticBand('http://'+conf.es_local+':9200',rundirname,indexSuffix,expected_processes,update_modulo) #starting elasticCollector thread ec = elasticCollector(ES_DIR_NAME,inmondir) diff --git a/python/elasticBand.py b/python/elasticBand.py index 20c5093..978feed 100644 --- a/python/elasticBand.py +++ b/python/elasticBand.py @@ -5,15 +5,13 @@ from pyelasticsearch.client import ElasticHttpError from pyelasticsearch.client import ConnectionError from pyelasticsearch.client import Timeout -import json +import simplejson as json import csv import math import logging from aUtils import * -#MONBUFFERSIZE = 50 -es_server_url = 'http://localhost:9200' class elasticBand(): @@ -24,7 +22,7 @@ def __init__(self,es_server_url,runstring,indexSuffix,monBufferSize,fastUpdateMo self.prcinBuffer = {} self.prcoutBuffer = {} self.fuoutBuffer = {} - self.es = ElasticSearch(es_server_url,timeout=20) + self.es = ElasticSearch(es_server_url,timeout=20,revival_delay=60) self.hostname = os.uname()[1] self.hostip = socket.gethostbyname_ex(self.hostname)[2][0] #self.number_of_data_nodes = self.es.health()['number_of_data_nodes'] @@ -36,12 +34,13 @@ def __init__(self,es_server_url,runstring,indexSuffix,monBufferSize,fastUpdateMo aliasName = runstring + "_" + indexSuffix self.indexName = aliasName# + "_" + self.hostname - def imbue_jsn(self,infile): + def 
imbue_jsn(self,infile,silent=False): with open(infile.filepath,'r') as fp: try: document = json.load(fp) except json.scanner.JSONDecodeError,ex: - logger.exception(ex) + if silent==False: + self.logger.exception(ex) return None,-1 return document,0 @@ -155,58 +154,24 @@ def elasticize_prc_in(self,infile): document['data']=datadict document['ls']=int(ls[2:]) document['index']=int(index[5:]) - document['dest']=os.uname()[1] + document['dest']=self.hostname document['process']=int(prc[3:]) try:document.pop('definition') except:pass self.prcinBuffer.setdefault(ls,[]).append(document) #self.es.index(self.indexName,'prc-in',document) - - def elasticize_hltrateslegend(self,infile): - document,ret = self.imbue_jsn(infile) + def elasticize_queue_status(self,infile): + document,ret = self.imbue_jsn(infile,silent=True) if ret<0:return False - datadict={} - #datadict['pid'] = int(infile.pid[3:]) - try: - paths=document['data'][0].strip('[]') - datasets=document['data'][1].strip('[]') - datadict['dataset-names']=datasets.split(',') if len(datasets)>0 else [] - datadict['path-names']=paths.split(',') if len(paths)>0 else [] - except: - pass - self.tryIndex('hltrates-legend',datadict) + document['fm_date']=str(infile.mtime) + document['host']=self.hostname + self.tryIndex('qstatus',document) return True - - def elasticize_hltrates(self,infile): - document,ret = self.imbue_jsn(infile) - if ret<0:return False - datadict={} - try: - datadict['ls'] = int(infile.ls[2:]) - datadict['pid'] = int(infile.pid[3:]) - try: - if json.loads(document['data'][0])[0]==0:return True - except: - pass - datadict['processed']=json.loads(document['data'][0])[0] - datadict['path-wasrun']=json.loads(document['data'][1]) - datadict['path-afterl1seed']=json.loads(document['data'][2]) - datadict['path-afterprescale']=json.loads(document['data'][3]) - datadict['path-accepted']=json.loads(document['data'][4]) - datadict['path-rejected']=json.loads(document['data'][5]) - datadict['path-errors']=json.loads(document['data'][6]) - datadict['dataset-accepted']=json.loads(document['data'][7]) - except: - return False - self.tryIndex('hltrates',datadict) - return True - - def elasticize_fu_complete(self,timestamp): document = {} - document['host']=os.uname()[1] + document['host']=self.hostname document['fm_date']=timestamp self.tryIndex('fu-complete',document) @@ -264,7 +229,7 @@ def tryBulkIndex(self,docname,documents,attempts=1): if attempts==0: self.indexFailures+=1 if self.indexFailures<2: - self.logger.error("Elasticsearch connection error.") + self.logger.warning("Elasticsearch connection error.") time.sleep(5) except ElasticHttpError as ex: if attempts==0: diff --git a/python/elasticbu.py b/python/elasticbu.py index 8565615..2fad372 100755 --- a/python/elasticbu.py +++ b/python/elasticbu.py @@ -23,10 +23,12 @@ import requests import simplejson as json - import socket -def getURLwithIP(url): +#silence HTTP connection info from requests package +logging.getLogger("urllib3").setLevel(logging.WARNING) + +def getURLwithIP(url,nsslock=None): try: prefix = '' if url.startswith('http://'): @@ -41,7 +43,17 @@ def getURLwithIP(url): logging.error('could not parse URL ' +url) raise(ex) if url!='localhost': - ip = socket.gethostbyname(url) + if nsslock is not None: + try: + nsslock.acquire() + ip = socket.gethostbyname(url) + nsslock.release() + except Exception as ex: + try:nsslock.release() + except:pass + raise ex + else: + ip = socket.gethostbyname(url) else: ip='127.0.0.1' return prefix+str(ip)+suffix @@ -49,8 +61,9 @@ def 
getURLwithIP(url): class elasticBandBU: - def __init__(self,runnumber,startTime,runMode=True): + def __init__(self,conf,runnumber,startTime,runMode=True,nsslock=None): self.logger = logging.getLogger(self.__class__.__name__) + self.conf=conf self.es_server_url=conf.elastic_runindex_url self.runindex_write="runindex_"+conf.elastic_runindex_name+"_write" self.runindex_read="runindex_"+conf.elastic_runindex_name+"_read" @@ -66,8 +79,14 @@ def __init__(self,runnumber,startTime,runMode=True): self.runMode=runMode self.boxinfoFUMap = {} self.ip_url=None + self.nsslock=nsslock self.updateIndexMaybe(self.runindex_name,self.runindex_write,self.runindex_read,mappings.central_es_settings,mappings.central_runindex_mapping) self.updateIndexMaybe(self.boxinfo_name,self.boxinfo_write,self.boxinfo_read,mappings.central_es_settings,mappings.central_boxinfo_mapping) + self.black_list=None + if self.conf.instance=='main': + self.hostinst = self.host + else: + self.hostinst = self.host+'_'+self.conf.instance #write run number document if runMode == True and self.stopping==False: @@ -89,14 +108,19 @@ def updateIndexMaybe(self,index_name,alias_write,alias_read,settings,mapping): connectionAttempts+=1 try: if retry or self.ip_url==None: - self.ip_url=getURLwithIP(self.es_server_url) - self.es = ElasticSearch(self.es_server_url) + self.ip_url=getURLwithIP(self.es_server_url,self.nsslock) + self.es = ElasticSearch(self.ip_url,timeout=20,revival_delay=60) #check if runindex alias exists - self.logger.info('writing to elastic index '+alias_write) if requests.get(self.es_server_url+'/_alias/'+alias_write).status_code == 200: + self.logger.info('writing to elastic index '+alias_write + ' on '+self.es_server_url+' - '+self.ip_url ) self.createDocMappingsMaybe(alias_write,mapping) - break + break + else: + time.sleep(.5) + if (connectionAttempts%10)==0: + self.logger.error('unable to access to elasticsearch alias ' + alias_write + ' on '+self.es_server_url+' / '+self.ip_url) + continue except ElasticHttpError as ex: #es error, retry self.logger.error(ex) @@ -110,7 +134,7 @@ def updateIndexMaybe(self,index_name,alias_write,alias_read,settings,mapping): retry=True continue - except (ConnectionError,Timeout) as ex: + except (socket.gaierror,ConnectionError,Timeout) as ex: #try to reconnect with different IP from DNS load balancing if self.runMode and connectionAttempts>100: self.logger.error('elastic (BU): exiting after 100 connection attempts to '+ self.es_server_url) @@ -128,12 +152,19 @@ def createDocMappingsMaybe(self,index_name,mapping): doc = {key:mapping[key]} res = requests.get(self.ip_url+'/'+index_name+'/'+key+'/_mapping') #only update if mapping is empty - if res.status_code==200 and res.content.strip()=='{}': - requests.post(self.ip_url+'/'+index_name+'/'+key+'/_mapping',json.dumps(doc)) - - def resetURL(url): - self.es = None - self.es = ElasticSearch(url) + if res.status_code==200: + if res.content.strip()=='{}': + requests.post(self.ip_url+'/'+index_name+'/'+key+'/_mapping',json.dumps(doc)) + else: + #still check if number of properties is identical in each type + inmapping = json.loads(res.content) + for indexname in inmapping: + properties = inmapping[indexname]['mappings'][key]['properties'] + #should be size 1 + for pdoc in mapping[key]['properties']: + if pdoc not in properties: + requests.post(self.ip_url+'/'+index_name+'/'+key+'/_mapping',json.dumps(doc)) + break def read_line(self,fullpath): with open(fullpath,'r') as fp: @@ -177,7 +208,31 @@ def elasticize_box(self,infile): basename = 
infile.basename self.logger.debug(basename) current_time = time.time() - if basename.startswith('fu'): + + if infile.data=={}:return + + bu_doc=False + if basename.startswith('bu') or basename.startswith('dvbu'): + bu_doc=True + + #check box file against blacklist + if bu_doc or self.black_list==None: + self.black_list=[] + + try: + with open(os.path.join(self.conf.watch_directory,'appliance','blacklist'),"r") as fi: + try: + self.black_list = json.load(fi) + except ValueError: + #file is being written or corrupted + return + except: + #blacklist file is not present, do not filter + pass + + if basename in self.black_list:return + + if bu_doc==False: try: self.boxinfoFUMap[basename] = [infile.data,current_time] except Exception as ex: @@ -185,34 +240,56 @@ def elasticize_box(self,infile): return try: document = infile.data - document['id']=basename + #unique id for separate instances + if bu_doc: + document['id']=self.hostinst + else: + document['id']=basename + + #both here and in "boxinfo_appliance" + document['appliance']=self.host + document['instance']=self.conf.instance + #only here + document['host']=basename + self.index_documents('boxinfo',[document]) except Exception as ex: self.logger.warning('box info not injected: '+str(ex)) return - if basename.startswith('bu') or basename.startswith('dvbu'): + if bu_doc: try: document = infile.data + try: + document.pop('id') + except:pass + try: + document.pop('host') + except:pass #aggregation from FUs document['idles']=0 document['used']=0 document['broken']=0 document['quarantined']=0 + document['cloud']=0 document['usedDataDir']=0 document['totalDataDir']=0 document['hosts']=[basename] + document['blacklistedHosts']=[] for key in self.boxinfoFUMap: - dpair = self.boxinfoFUMap[key] - d = dpair[0] - #check if entry is not older than 10 seconds - if current_time - dpair[1] > 10:continue - document['idles']+=int(d['idles']) - document['used']+=int(d['used']) - document['broken']+=int(d['broken']) - document['quarantined']+=int(d['quarantined']) - document['usedDataDir']+=int(d['usedDataDir']) - document['totalDataDir']+=int(d['totalDataDir']) - document['hosts'].append(key) + dpair = self.boxinfoFUMap[key] + d = dpair[0] + #check if entry is not older than 10 seconds + if current_time - dpair[1] > 10:continue + document['idles']+=int(d['idles']) + document['used']+=int(d['used']) + document['broken']+=int(d['broken']) + document['quarantined']+=int(d['quarantined']) + document['cloud']+=int(d['cloud']) + document['usedDataDir']+=int(d['usedDataDir']) + document['totalDataDir']+=int(d['totalDataDir']) + document['hosts'].append(key) + for blacklistedHost in self.black_list: + document['blacklistedHosts'].append(blacklistedHost) self.index_documents('boxinfo_appliance',[document],bulk=False) except Exception as ex: #in case of malformed box info @@ -238,8 +315,10 @@ def elasticize_eols(self,infile): def index_documents(self,name,documents,bulk=True): attempts=0 destination_index = "" + is_box=False if name.startswith("boxinfo"): destination_index = self.boxinfo_write + is_box=True else: destination_index = self.runindex_write while True: @@ -253,16 +332,18 @@ def index_documents(self,name,documents,bulk=True): except ElasticHttpError as ex: if attempts<=1:continue self.logger.error('elasticsearch HTTP error. 
skipping document '+name) + if is_box==True:break #self.logger.exception(ex) return False - except (ConnectionError,Timeout) as ex: + except (socket.gaierror,ConnectionError,Timeout) as ex: if attempts>100 and self.runMode: raise(ex) self.logger.error('elasticsearch connection error. retry.') + if is_box==True:break if self.stopping:return False time.sleep(0.1) - ip_url=getURLwithIP(self.es_server_url) - self.es = ElasticSearch(ip_url) + ip_url=getURLwithIP(self.es_server_url,self.nsslock) + self.es = ElasticSearch(ip_url,timeout=20,revival_delay=60) return False @@ -290,7 +371,7 @@ def stop(self): self.stoprequest.set() def run(self): - self.logger.info("Start main loop") + self.logger.info("elasticCollectorBU: start main loop (monitoring:"+self.inRunDir+")") count = 0 while not (self.stoprequest.isSet() and self.emptyQueue.isSet()) : if self.source: @@ -300,16 +381,16 @@ def run(self): self.infile = fileHandler(event.fullpath) self.emptyQueue.clear() if self.infile.filetype==EOR: - if self.es: - try: - dt=os.path.getctime(event.fullpath) - endtime = datetime.datetime.utcfromtimestamp(dt).isoformat() - self.es.elasticize_runend_time(endtime) - except Exception as ex: - self.logger.warning(str(ex)) - endtime = datetime.datetime.utcnow().isoformat() - self.es.elasticize_runend_time(endtime) - break + if self.es: + try: + dt=os.path.getctime(event.fullpath) + endtime = datetime.datetime.utcfromtimestamp(dt).isoformat() + self.es.elasticize_runend_time(endtime) + except Exception as ex: + self.logger.warning(str(ex)) + endtime = datetime.datetime.utcnow().isoformat() + self.es.elasticize_runend_time(endtime) + break self.process() except (KeyboardInterrupt,Queue.Empty) as e: self.emptyQueue.set() @@ -325,9 +406,9 @@ def run(self): #if run dir deleted if os.path.exists(self.inRunDir)==False: self.logger.info("Exiting because run directory in has disappeared") - #nevertheless put run end time if self.es: - endtime = datetime.datetime.utcnow().isoformat() + #write end timestamp in case EoR file was not seen + endtime = datetime.datetime.utcnow().isoformat() self.es.elasticize_runend_time(endtime) break self.logger.info("Stop main loop (watching directory " + str(self.inRunDir) + ")") @@ -374,7 +455,7 @@ def stop(self): self.stoprequest.set() def run(self): - self.logger.info("Start main loop") + self.logger.info("elasticBoxCollectorBU: start main loop") while not (self.stoprequest.isSet() and self.emptyQueue.isSet()) : if self.source: try: @@ -391,7 +472,7 @@ def run(self): self.logger.warning("IOError on reading "+event.fullpath) else: time.sleep(1.0) - self.logger.info("Stop main loop") + self.logger.info("elasticBoxCollectorBU: stop main loop") def setSource(self,source): self.source = source @@ -408,9 +489,12 @@ def process(self): class BoxInfoUpdater(threading.Thread): - def __init__(self,ramdisk): + def __init__(self,ramdisk,conf,nsslock): self.logger = logging.getLogger(self.__class__.__name__) self.stopping = False + self.es=None + self.conf=conf + self.nsslock=nsslock try: threading.Thread.__init__(self) @@ -435,7 +519,7 @@ def __init__(self,ramdisk): def run(self): try: - self.es = elasticBandBU(0,'',False) + self.es = elasticBandBU(self.conf,0,'',False,self.nsslock) if self.stopping:return self.ec = elasticBoxCollectorBU(self.es) @@ -450,7 +534,7 @@ def stop(self): try: self.stopping=True self.threadEvent.set() - if self.es: + if self.es is not None: self.es.stopping=True self.es.threadEvent.set() if self.mr is not None: @@ -465,16 +549,19 @@ def stop(self): class 
RunCompletedChecker(threading.Thread): - def __init__(self,mode,nr,nresources,run_dir,active_runs,elastic_process): + def __init__(self,conf,mode,nr,nresources,run_dir,active_runs,active_runs_errors,elastic_process): self.logger = logging.getLogger(self.__class__.__name__) + self.conf=conf self.mode = mode self.nr = nr self.nresources = nresources - self.rundirCheckPath = conf.watch_directory +'/run'+ str(nr).zfill(conf.run_number_padding) + rundir = 'run'+ str(nr).zfill(conf.run_number_padding) + self.rundirCheckPath = os.path.join(conf.watch_directory, rundir) self.eorCheckPath = os.path.join(self.rundirCheckPath,'run' + str(nr).zfill(conf.run_number_padding) + '_ls0000_EoR.jsn') - self.url = 'http://localhost:9200/run'+str(nr).zfill(conf.run_number_padding)+'*/fu-complete/_count' - self.urlclose = 'http://localhost:9200/run'+str(nr).zfill(conf.run_number_padding)+'*/_close' - self.urlsearch = 'http://localhost:9200/run'+str(nr).zfill(conf.run_number_padding)+'*/fu-complete/_search?size=1' + self.indexPrefix = 'run'+str(nr).zfill(conf.run_number_padding) + '_' + conf.elastic_cluster + self.url = 'http://'+conf.es_local+':9200/' + self.indexPrefix + '*/fu-complete/_count' + self.urlclose = 'http://'+conf.es_local+':9200/' + self.indexPrefix + '*/_close' + self.urlsearch = 'http://'+conf.es_local+':9200/' + self.indexPrefix + '*/fu-complete/_search?size=1' self.url_query = '{ "query": { "filtered": {"query": {"match_all": {}}}}, "sort": { "fm_date": { "order": "desc" }}}' @@ -482,6 +569,7 @@ def __init__(self,mode,nr,nresources,run_dir,active_runs,elastic_process): self.threadEvent = threading.Event() self.run_dir = run_dir self.active_runs = active_runs + self.active_runs_errors = active_runs_errors self.elastic_process=elastic_process try: threading.Thread.__init__(self) @@ -492,7 +580,6 @@ def __init__(self,mode,nr,nresources,run_dir,active_runs,elastic_process): def checkBoxes(self,dir): - files = os.listdir(dir) endAllowed=True runFound=False @@ -553,7 +640,7 @@ def run(self): if os.path.exists(self.eorCheckPath) or os.path.exists(self.rundirCheckPath)==False: break - dir = conf.resource_base+'/boxes/' + dir = self.conf.resource_base+'/boxes/' check_boxes=True check_es_complete=True total_es_elapsed=0 @@ -563,9 +650,14 @@ def run(self): check_boxes = self.checkBoxes(dir) if check_boxes==False: + try: + self.active_runs_errors.pop(self.active_runs.index(int(self.nr))) + except: + pass try: self.active_runs.remove(int(self.nr)) - except:pass + except: + pass if check_es_complete: try: @@ -578,29 +670,21 @@ def run(self): fm_time = str(dataq['hits']['hits'][0]['_source']['fm_date']) #fill in central index completition time postq = "{runNumber\":\"" + str(self.nr) + "\",\"completedTime\" : \"" + fm_time + "\"}" - requests.post(conf.elastic_runindex_url+'/'+"runindex_"+conf.elastic_runindex_name+'_write/run',postq,timeout=5) - self.logger.info("filled in completition time for run"+str(self.nr)) + requests.post(self.conf.elastic_runindex_url+'/'+"runindex_"+self.conf.elastic_runindex_name+'_write/run',postq,timeout=5) + self.logger.info("filled in completition time for run "+str(self.nr)) except IndexError: # 0 FU resources present in this run, skip writing completition time pass except Exception as ex: self.logger.exception(ex) - try: - if conf.close_es_index==True: - #wait a bit for central ES queries to complete - time.sleep(10) - resp = requests.post(self.urlclose,timeout=5) - self.logger.info('closed appliance ES index for run '+str(self.nr)) - except Exception as exc: - 
self.logger.error('Error in run completition check') - self.logger.exception(exc) check_es_complete=False continue else: + #TODO:do this only using active runs time.sleep(5) total_es_elapsed+=5 if total_es_elapsed>600: - self.logger.error('run index complete flag was not written by all FUs, giving up after 10 minutes.') + self.logger.warning('run index complete flag was not written by all FUs, giving up checks after 10 minutes.') check_es_complete=False continue except Exception,ex: @@ -609,7 +693,17 @@ def run(self): check_es_complete=False #exit if both checks are complete - if check_boxes==False and check_es_complete==False:break + if check_boxes==False and check_es_complete==False: + try: + if self.conf.close_es_index==True: + #wait a bit for queries to complete + time.sleep(10) + resp = requests.post(self.urlclose,timeout=5) + self.logger.info('closed appliance ES index for run '+str(self.nr)) + except Exception as exc: + self.logger.error('Error in closing run index') + self.logger.exception(exc) + break #check every 10 seconds self.threadEvent.wait(10) @@ -622,10 +716,15 @@ def stop(self): self.threadEvent.set() - if __name__ == "__main__": + + import procname + procname.setprocname('elasticbu') + + conf=initConf(sys.argv[1]) + logging.basicConfig(filename=os.path.join(conf.log_dir,"elasticbu.log"), - level=logging.INFO, + level=conf.service_log_level, format='%(levelname)s:%(asctime)s - %(funcName)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger(os.path.basename(__file__)) @@ -636,9 +735,8 @@ def stop(self): eventQueue = Queue.Queue() - runnumber = sys.argv[1] + runnumber = sys.argv[2] watchdir = conf.watch_directory - mainDir = os.path.join(watchdir,'run'+ runnumber.zfill(conf.run_number_padding)) dt=os.path.getctime(mainDir) startTime = datetime.datetime.utcfromtimestamp(dt).isoformat() @@ -668,7 +766,7 @@ def stop(self): mr.start_inotify() - es = elasticBandBU(runnumber,startTime) + es = elasticBandBU(conf,runnumber,startTime) #starting elasticCollector thread ec = elasticCollectorBU(es,mainDir) diff --git a/python/fillresources.py b/python/fillresources.py index 902c548..cc3c7d1 100755 --- a/python/fillresources.py +++ b/python/fillresources.py @@ -3,6 +3,18 @@ import os import shutil import hltdconf +import time + +def clearDir(dir): + try: + files = os.listdir(dir) + for file in files: + try: + os.unlink(os.path.join(dir,file)) + except: + pass + except: + pass conf=hltdconf.hltdConf('/etc/hltd.conf') @@ -13,26 +25,14 @@ elif 'fu' in os.uname()[1]: role='fu' else: role = conf.role -if role=='fu' and conf.dqm_machine=="False": - - try: - shutil.rmtree('/etc/appliance/online/*') - except: - pass - try: - shutil.rmtree('/etc/appliance/offline/*') - except: - pass - try: - shutil.rmtree('/etc/appliance/except/*') - except: - pass - try: - shutil.rmtree('/etc/appliance/quarantined/*') - except: - pass - +if role=='fu' and not conf.dqm_machine: + clearDir(conf.resource_base+'/idle') + clearDir(conf.resource_base+'/online') + clearDir(conf.resource_base+'/except') + clearDir(conf.resource_base+'/quarantined') + clearDir(conf.resource_base+'/cloud') + fp=open('/proc/cpuinfo','r') resource_count = 0 for line in fp: diff --git a/python/genTestFakeBu_cfg.py b/python/genTestFakeBu_cfg.py index 39424da..f1963f5 100644 --- a/python/genTestFakeBu_cfg.py +++ b/python/genTestFakeBu_cfg.py @@ -50,7 +50,7 @@ process.source = cms.Source("EmptySource", firstRun= cms.untracked.uint32(options.runNumber), - numberEventsInLuminosityBlock = cms.untracked.uint32(500), + 
numberEventsInLuminosityBlock = cms.untracked.uint32(200), numberEventsInRun = cms.untracked.uint32(0) ) @@ -79,7 +79,7 @@ process.out = cms.OutputModule("RawStreamFileWriterForBU", ProductLabel = cms.untracked.string("s"), - numEventsPerFile = cms.untracked.uint32(100), + numEventsPerFile = cms.untracked.uint32(50), jsonDefLocation = cms.untracked.string(cmsswbase+"/src/EventFilter/Utilities/plugins/budef.jsd"), debug = cms.untracked.bool(True) ) diff --git a/python/hltd b/python/hltd index edaedd4..6b125e5 100755 --- a/python/hltd +++ b/python/hltd @@ -13,35 +13,47 @@ from applianceumount import checkMode import time import syslog -def touchLockFile(): - try: - with open('/var/lock/subsys/hltd',"w+") as fi: - pass - except: - pass - -def removeLockFile(): - try: - os.unlink('/var/lock/subsys/hltd') - except: - pass + +def startService(daemon,srvInstance): + daemon.touchLockFile() + proc = Popen(["/opt/hltd/python/hltd.py",srvInstance], stdout=PIPE) + output = proc.communicate()[0] + time.sleep(.1) + if daemon.silentStatus() and proc.returncode==0: + print 'Starting hltd instance',srvInstance,':\t\t\t\t [ \033[1;32mOK\033[0;39m ]' + + daemon.touchLockFile() + else: + if proc.returncode==3:sys.exit(0) + print 'Starting hltd instance',srvInstance,':\t\t\t\t [ \033[1;31mFAILED\033[0;39m ]' + print output + sys.exit(1) if __name__ == "__main__": - daemon = hltd('/var/run/hltd.pid') - if len(sys.argv) == 2: + + if len(sys.argv) <=2 or sys.argv[2]=="all": + try: + instances=[] + with open('/etc/hltd.instances','r') as fi: + for line in fi.readlines(): + lnstrip = line.strip(' \n') + if len(lnstrip)>0 and lnstrip.startswith("#")==False: + instances.append(lnstrip) + except: + instances = ["main"] + else: + instances = [sys.argv[2]] + + for instance in instances: + daemon = hltd(instance) + + if len(sys.argv) >= 2: if 'start' == sys.argv[1]: - touchLockFile() - output = Popen(["/opt/hltd/python/hltd.py"], stdout=PIPE).communicate()[0] - if daemon.silentStatus(): - print '[OK]' - else: - print '[Failed]' - print output + startService(daemon,instance) + elif 'stop' == sys.argv[1]: - if daemon.status(): - daemon.stop() - elif os.path.exists('/var/run/hltd.pid'): - daemon.delpid() + sys.stdout.write('Stopping hltd instance '+instance+':') + daemon.stop() #determine runlevel std_out="" @@ -52,58 +64,55 @@ if __name__ == "__main__": from_level = std_out.split('\t')[0].rstrip('\n').strip().split(' ')[0] to_level = std_out.split('\t')[0].rstrip('\n').strip().split(' ')[1] if to_level.isdigit() and int(to_level) in [0,1,6] and str(from_level)!="1": - - if stopFUs()==False: + + if stopFUs(instance)==False: msg = "Shutdown or reboot is cancelled by hltd - FU umount failed! Switching to runlevel 3..." - syslog.syslog(msg) + syslog.syslog("hltd-"+str(instance)+":"+msg) time.sleep(2) p = Popen("init 3", shell=True, stdout=PIPE) p.wait() else: - removeLockFile() + daemon.removeLockFile() else: - if checkMode()=="fu": - removeLockFile() + if checkMode(instance)=="fu": + daemon.removeLockFile() else: print "Lock file remains. Run stop-appliance to unmount FUs." except: print "Runlevel:",std_out - syslog.syslog("Exception when determining runlevel:"+str(std_out)) - + syslog.syslog("hltd-"+str(instance)+":Exception when determining runlevel:"+str(std_out)) + elif 'stop-appliance' == sys.argv[1]: - if daemon.status(): - daemon.stop() - elif os.path.exists('/var/run/hltd.pid'): - daemon.delpid() - - if checkMode()=="fu": - print "This command is not supported on FU." 
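The init-script rewrite above loops over service instances read from /etc/hltd.instances. A minimal sketch of that lookup, assuming the same one-name-per-line format with '#' comments and the fallback to a single 'main' instance; the sample file contents shown in the comment are hypothetical:

def read_instances(path='/etc/hltd.instances'):
    # one service instance name per line; blank lines and '#' comments are skipped
    instances = []
    try:
        with open(path, 'r') as fi:
            for line in fi.readlines():
                name = line.strip(' \n')
                if len(name) > 0 and not name.startswith('#'):
                    instances.append(name)
    except IOError:
        # no instances file: run only the default instance
        return ['main']
    return instances

# a hypothetical /etc/hltd.instances could contain e.g.
#   main
#   testing
print(read_instances())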
- - elif stopFUs()==False: - print "FU umount failed, lock file remains. FU umount failed." + sys.stdout.write('Stopping hltd instance '+instance+':') + daemon.stop() + + if checkMode(instance)=="fu": + print "This command is not supported on FU. Performed only service stop." + + elif stopFUs(instance)==False: + print "FU umount failed, lock file remains." else: - removeLockFile() + daemon.removeLockFile() elif 'stop-light' == sys.argv[1]: - if daemon.status(): - daemon.stop() - elif os.path.exists('/var/run/hltd.pid'): - daemon.delpid() - removeLockFile() - + sys.stdout.write('Stopping hltd instance '+instance+':') + daemon.stop() + daemon.removeLockFile() + elif 'restart' == sys.argv[1]: - daemon.restart() - touchLockFile() + sys.stdout.write('Stopping hltd instance '+instance+':') + daemon.stop() + startService(daemon,instance) + elif 'status' == sys.argv[1]: daemon.status() else: print "Unknown command" sys.exit(2) -# print "hltd "+sys.argv[1]+"ed" -# logging.debug("executed "+sys.argv[1]) - sys.exit(0) else: - print "usage: %s start|stop|stop-light|restart|status" % sys.argv[0] + print "usage: %s start|stop|stop-light|restart|status |all|main|instance" % sys.argv[0] sys.exit(2) + +sys.exit(0) diff --git a/python/hltd.py b/python/hltd.py index 3a1ea6f..5adbb70 100755 --- a/python/hltd.py +++ b/python/hltd.py @@ -9,10 +9,9 @@ import subprocess from signal import SIGKILL from signal import SIGINT -import json +import simplejson as json #import SOAPpy import threading -import fcntl import CGIHTTPServer import BaseHTTPServer import cgitb @@ -21,6 +20,8 @@ import re import shutil import socket +#import fcntl +#import random #modules distributed with hltd import prctl @@ -34,26 +35,63 @@ from elasticbu import BoxInfoUpdater from elasticbu import RunCompletedChecker -idles = conf.resource_base+'/idle/' -used = conf.resource_base+'/online/' -broken = conf.resource_base+'/except/' -quarantined = conf.resource_base+'/quarantined/' +from aUtils import fileHandler + nthreads = None nstreams = None expected_processes = None run_list=[] +runs_pending_shutdown=[] bu_disk_list_ramdisk=[] bu_disk_list_output=[] +bu_disk_list_ramdisk_instance=[] +bu_disk_list_output_instance=[] active_runs=[] +active_runs_errors=[] resource_lock = threading.Lock() +nsslock = threading.Lock() suspended=False +entering_cloud_mode=False +cloud_mode=False + +ramdisk_submount_size=0 +machine_blacklist=[] +boxinfoFUMap = {} + +logCollector = None + +def setFromConf(myinstance): + + global conf + global logger + global idles + global used + global broken + global quarantined + global cloud + + conf=initConf(myinstance) -logging.basicConfig(filename=os.path.join(conf.log_dir,"hltd.log"), + idles = conf.resource_base+'/idle/' + used = conf.resource_base+'/online/' + broken = conf.resource_base+'/except/' + quarantined = conf.resource_base+'/quarantined/' + cloud = conf.resource_base+'/cloud/' + + #prepare log directory + if myinstance!='main': + if not os.path.exists(conf.log_dir): os.makedirs(conf.log_dir) + if not os.path.exists(os.path.join(conf.log_dir,'pid')): os.makedirs(os.path.join(conf.log_dir,'pid')) + os.chmod(conf.log_dir,0777) + os.chmod(os.path.join(conf.log_dir,'pid'),0777) + + logging.basicConfig(filename=os.path.join(conf.log_dir,"hltd.log"), level=conf.service_log_level, format='%(levelname)s:%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S') + logger = logging.getLogger(os.path.basename(__file__)) + conf.dump() -conf.dump() def preexec_function(): dem = demote.demote(conf.user) @@ -62,35 +100,72 @@ def 
preexec_function(): # os.setpgrp() def cleanup_resources(): + try: + dirlist = os.listdir(cloud) + for cpu in dirlist: + os.rename(cloud+cpu,idles+cpu) + dirlist = os.listdir(broken) + for cpu in dirlist: + os.rename(broken+cpu,idles+cpu) + dirlist = os.listdir(used) + for cpu in dirlist: + os.rename(used+cpu,idles+cpu) + dirlist = os.listdir(quarantined) + for cpu in dirlist: + os.rename(quarantined+cpu,idles+cpu) + dirlist = os.listdir(idles) + #quarantine files beyond use fraction limit (rounded to closest integer) + num_excluded = round(len(dirlist)*(1.-conf.resource_use_fraction)) + for i in range(0,int(num_excluded)): + os.rename(idles+dirlist[i],quarantined+dirlist[i]) + return True + except Exception as ex: + logger.warning(str(ex)) + return False +def move_resources_to_cloud(): dirlist = os.listdir(broken) for cpu in dirlist: - os.rename(broken+cpu,idles+cpu) + os.rename(broken+cpu,cloud+cpu) dirlist = os.listdir(used) for cpu in dirlist: - os.rename(used+cpu,idles+cpu) + os.rename(used+cpu,cloud+cpu) dirlist = os.listdir(quarantined) for cpu in dirlist: - os.rename(quarantined+cpu,idles+cpu) + os.rename(quarantined+cpu,cloud+cpu) dirlist = os.listdir(idles) - #quarantine files beyond use fraction limit (rounded to closest integer) - num_excluded = round(len(dirlist)*(1.-conf.resource_use_fraction)) - for i in range(0,int(num_excluded)): - os.rename(idles+dirlist[i],quarantined+dirlist[i]) + for cpu in dirlist: + os.rename(idles+cpu,cloud+cpu) + dirlist = os.listdir(idles) + for cpu in dirlist: + os.rename(idles+cpu,cloud+cpu) + def cleanup_mountpoints(remount=True): - bu_disk_list_ramdisk[:] = [] - bu_disk_list_output[:] = [] + + global bu_disk_list_ramdisk + global bu_disk_list_ramdisk_instance + global bu_disk_list_output + global bu_disk_list_output_instance + + bu_disk_list_ramdisk = [] + bu_disk_list_output = [] + bu_disk_list_ramdisk_instance = [] + bu_disk_list_output_instance = [] + if conf.bu_base_dir[0] == '/': - bu_disk_list_ramdisk[:] = [os.path.join(conf.bu_base_dir,conf.ramdisk_subdirectory)] - bu_disk_list_output[:] = [os.path.join(conf.bu_base_dir,conf.output_subdirectory)] + bu_disk_list_ramdisk = [os.path.join(conf.bu_base_dir,conf.ramdisk_subdirectory)] + bu_disk_list_output = [os.path.join(conf.bu_base_dir,conf.output_subdirectory)] + if conf.instance=="main": + bu_disk_list_ramdisk_instance = bu_disk_list_ramdisk + bu_disk_list_output_instance = bu_disk_list_output + else: + bu_disk_list_ramdisk_instance = [os.path.join(bu_disk_list_ramdisk[0],conf.instance)] + bu_disk_list_output_instance = [os.path.join(bu_disk_list_output[0],conf.instance)] + #make subdirectories if necessary and return if remount==True: - try: - os.makedirs(conf.bu_base_dir) - except OSError: - pass try: os.makedirs(os.path.join(conf.bu_base_dir,conf.ramdisk_subdirectory)) except OSError: @@ -104,59 +179,63 @@ def cleanup_mountpoints(remount=True): process = subprocess.Popen(['mount'],stdout=subprocess.PIPE) out = process.communicate()[0] mounts = re.findall('/'+conf.bu_base_dir+'[0-9]+',out) - mounts = list(set(mounts)) - #if len(mounts)>1 and mounts[0]==mounts[1]: mounts=[mounts[0]] - logging.info("cleanup_mountpoints: found following mount points ") - logging.info(mounts) + mounts = sorted(list(set(mounts))) + logger.info("cleanup_mountpoints: found following mount points: ") + logger.info(mounts) umount_failure=False for point in mounts: - logging.info("trying umount of "+point) + try: + #try to unmount old style mountpoint(ok if fails) subprocess.check_call(['umount','/'+point]) - 
except subprocess.CalledProcessError, err1: - pass - except Exception as ex: - logging.exception(ex) + except:pass try: subprocess.check_call(['umount',os.path.join('/'+point,conf.ramdisk_subdirectory)]) except subprocess.CalledProcessError, err1: - logging.error("Error calling umount in cleanup_mountpoints") - logging.error(str(err1.returncode)) - umount_failure=True + logger.info("trying to kill users of ramdisk") + try: + subprocess.check_call(['fuser','-km',os.path.join('/'+point,conf.ramdisk_subdirectory)]) + except subprocess.CalledProcessError, err2: + logger.error("Error calling umount in cleanup_mountpoints (ramdisk), return code:"+str(err2.returncode)) + try: + subprocess.check_call(['umount',os.path.join('/'+point,conf.ramdisk_subdirectory)]) + except subprocess.CalledProcessError, err2: + logger.error("Error calling umount in cleanup_mountpoints (ramdisk), return code:"+str(err2.returncode)) + umount_failure=True try: subprocess.check_call(['umount',os.path.join('/'+point,conf.output_subdirectory)]) except subprocess.CalledProcessError, err1: - logging.error("Error calling umount in cleanup_mountpoints") - logging.error(str(err1.returncode)) - umount_failure=True - #this will remove directories only if they are empty (as unomunted mount point should be) + logger.info("trying to kill users of output") + try: + subprocess.check_call(['fuser','-km',os.path.join('/'+point,conf.output_subdirectory)]) + except subprocess.CalledProcessError, err2: + logger.error("Error calling umount in cleanup_mountpoints (output), return code:"+str(err2.returncode)) + try: + subprocess.check_call(['umount',os.path.join('/'+point,conf.output_subdirectory)]) + except subprocess.CalledProcessError, err2: + logger.error("Error calling umount in cleanup_mountpoints (output), return code:"+str(err2.returncode)) + umount_failure=True + + #this will remove directories only if they are empty (as unmounted mount point should be) try: if os.path.join('/'+point,conf.ramdisk_subdirectory)!='/': os.rmdir(os.path.join('/'+point,conf.ramdisk_subdirectory)) except Exception as ex: - logging.exception(ex) + logger.exception(ex) try: if os.path.join('/'+point,conf.output_subdirectory)!='/': os.rmdir(os.path.join('/'+point,conf.output_subdirectory)) except Exception as ex: - logging.exception(ex) - try: - if os.path.join('/',point)!='/': - os.rmdir('/'+point) - except Exception as ex: - logging.exception(ex) + logger.exception(ex) if remount==False: if umount_failure:return False return True i = 0 bus_config = os.path.join(os.path.dirname(conf.resource_base.rstrip(os.path.sep)),'bus.config') if os.path.exists(bus_config): + busconfig_age = os.path.getmtime(bus_config) for line in open(bus_config): - logging.info("found BU to mount at "+line.strip()) - try: - os.makedirs('/'+conf.bu_base_dir+str(i)) - except OSError: - pass + logger.info("found BU to mount at "+line.strip()) try: os.makedirs(os.path.join('/'+conf.bu_base_dir+str(i),conf.ramdisk_subdirectory)) except OSError: @@ -174,16 +253,20 @@ def cleanup_mountpoints(remount=True): break else: p_end = datetime.datetime.now() - logging.warn('unable to ping '+line.strip()) + logger.warn('unable to ping '+line.strip()) dt = p_end - p_begin if dt.seconds < 10: time.sleep(10-dt.seconds) attemptsLeft-=1 - if attemptsLeft==0: - logging.fatal('hltd was unable to ping BU '+line.strip()) - sys.exit(1) - else: - logging.info("trying to mount "+line.strip()+':/fff/'+conf.ramdisk_subdirectory+' '+os.path.join('/'+conf.bu_base_dir+str(i),conf.ramdisk_subdirectory)) + if 
attemptsLeft==0: + logger.fatal('hltd was unable to ping BU '+line.strip()) + #check if bus.config has been updated + if (os.path.getmtime(bus_config) - busconfig_age)>1: + return cleanup_mountpoints(remount) + attemptsLeft=8 + #sys.exit(1) + if True: + logger.info("trying to mount "+line.strip()+':/fff/'+conf.ramdisk_subdirectory+' '+os.path.join('/'+conf.bu_base_dir+str(i),conf.ramdisk_subdirectory)) try: subprocess.check_call( [conf.mount_command, @@ -194,13 +277,18 @@ def cleanup_mountpoints(remount=True): line.strip()+':/fff/'+conf.ramdisk_subdirectory, os.path.join('/'+conf.bu_base_dir+str(i),conf.ramdisk_subdirectory)] ) - bu_disk_list_ramdisk.append(os.path.join('/'+conf.bu_base_dir+str(i),conf.ramdisk_subdirectory)) + toappend = os.path.join('/'+conf.bu_base_dir+str(i),conf.ramdisk_subdirectory) + bu_disk_list_ramdisk.append(toappend) + if conf.instance=="main": + bu_disk_list_ramdisk_instance.append(toappend) + else: + bu_disk_list_ramdisk_instance.append(os.path.join(toappend,conf.instance)) except subprocess.CalledProcessError, err2: - logging.exception(err2) - logging.fatal("Unable to mount ramdisk - exiting.") + logger.exception(err2) + logger.fatal("Unable to mount ramdisk - exiting.") sys.exit(1) - logging.info("trying to mount "+line.strip()+':/fff/'+conf.output_subdirectory+' '+os.path.join('/'+conf.bu_base_dir+str(i),conf.output_subdirectory)) + logger.info("trying to mount "+line.strip()+':/fff/'+conf.output_subdirectory+' '+os.path.join('/'+conf.bu_base_dir+str(i),conf.output_subdirectory)) try: subprocess.check_call( [conf.mount_command, @@ -211,26 +299,49 @@ def cleanup_mountpoints(remount=True): line.strip()+':/fff/'+conf.output_subdirectory, os.path.join('/'+conf.bu_base_dir+str(i),conf.output_subdirectory)] ) - bu_disk_list_output.append(os.path.join('/'+conf.bu_base_dir+str(i),conf.output_subdirectory)) + toappend = os.path.join('/'+conf.bu_base_dir+str(i),conf.output_subdirectory) + bu_disk_list_output.append(toappend) + if conf.instance=="main" or conf.instance_same_destination==True: + bu_disk_list_output_instance.append(toappend) + else: + bu_disk_list_output_instance.append(os.path.join(toappend,conf.instance)) except subprocess.CalledProcessError, err2: - logging.exception(err2) - logging.fatal("Unable to mount output - exiting.") + logger.exception(err2) + logger.fatal("Unable to mount output - exiting.") sys.exit(1) - i+=1 #clean up suspended state try: - if remount==True:os.unlink(conf.watch_directory+'/suspend') + if remount==True:os.popen('rm -rf '+conf.watch_directory+'/suspend*') except:pass except Exception as ex: - logging.error("Exception in cleanup_mountpoints") - logging.exception(ex) + logger.error("Exception in cleanup_mountpoints") + logger.exception(ex) if remount==True: - logging.fatal("Unable to handle (un)mounting") + logger.fatal("Unable to handle (un)mounting") return False else:return False +def submount_size(basedir): + loop_size=0 + try: + p = subprocess.Popen("mount", shell=False, stdout=subprocess.PIPE) + p.wait() + std_out=p.stdout.read().split("\n") + for l in std_out: + try: + ls = l.strip() + toks = l.split() + if toks[0].startswith(basedir) and toks[2].startswith(basedir) and 'loop' in toks[5]: + imgstat = os.stat(toks[0]) + imgsize = imgstat.st_size + loop_size+=imgsize + except:pass + except:pass + return loop_size + + def calculate_threadnumber(): global nthreads global nstreams @@ -240,12 +351,58 @@ def calculate_threadnumber(): nthreads = idlecount/conf.cmssw_threads_autosplit nstreams = 
idlecount/conf.cmssw_threads_autosplit if nthreads*conf.cmssw_threads_autosplit != nthreads: - logging.error("idle cores can not be evenly split to cmssw threads") + logger.error("idle cores can not be evenly split to cmssw threads") else: nthreads = conf.cmssw_threads - nstreams = conf.cmssw_threads + nstreams = conf.cmssw_streams expected_processes = idlecount/nstreams + +def updateBlacklist(): + black_list=[] + active_black_list=[] + #TODO:this will be updated to read blacklist from database + if conf.role=='bu': + try: + if os.stat('/etc/appliance/blacklist').st_size>0: + with open('/etc/appliance/blacklist','r') as fi: + try: + static_black_list = json.load(fi) + for item in static_black_list: + black_list.append(item) + logger.info("found these resources in /etc/appliance/blacklist: "+str(black_list)) + except ValueError: + logger.error("error parsing /etc/appliance/blacklist") + except: + #no blacklist file, this is ok + pass + black_list=list(set(black_list)) + try: + forceUpdate=False + with open(os.path.join(conf.watch_directory,'appliance','blacklist'),'r') as fi: + active_black_list = json.load(fi) + except: + forceUpdate=True + if forceUpdate==True or active_black_list != black_list: + try: + with open(os.path.join(conf.watch_directory,'appliance','blacklist'),'w') as fi: + json.dump(black_list,fi) + except: + return False,black_list + #TODO:check on FU if blacklisted + return True,black_list + +def restartLogCollector(instanceParam): + global logCollector + if logCollector!=None: + logger.info("terminating logCollector") + logCollector.terminate() + logCollector = None + logger.info("starting logcollector.py") + logcollector_args = ['/opt/hltd/python/logcollector.py'] + logcollector_args.append(instanceParam) + logCollector = subprocess.Popen(logcollector_args,preexec_fn=preexec_function,close_fds=True) + class system_monitor(threading.Thread): def __init__(self): @@ -259,65 +416,152 @@ def __init__(self): def rehash(self): if conf.role == 'fu': - self.directory = ['/'+x+'/appliance/boxes/' for x in bu_disk_list_ramdisk] + self.directory = [os.path.join(bu_disk_list_ramdisk_instance[0],'appliance','boxes')] + #self.directory = ['/'+x+'/appliance/boxes/' for x in bu_disk_list_ramdisk_instance] + #write only in one location else: - self.directory = [conf.watch_directory+'/appliance/boxes/'] - self.file = [x+self.hostname for x in self.directory] - for dir in self.directory: + self.directory = [os.path.join(conf.watch_directory,'appliance/boxes/')] try: - os.makedirs(dir) + #if directory does not exist: check if it is renamed to specific name (non-main instance) + if not os.path.exists(self.directory[0]) and conf.instance=="main": + os.makedirs(self.directory[0]) except OSError: pass - logging.info("system_monitor: rehash found the following BU disks") + + self.file = [os.path.join(x,self.hostname) for x in self.directory] + + logger.info("system_monitor: rehash found the following BU disk(s):"+str(self.file)) for disk in self.file: - logging.info(disk) + logger.info(disk) def run(self): try: - logging.debug('entered system monitor thread ') + logger.debug('entered system monitor thread ') global suspended + global ramdisk_submount_size + res_path_temp = os.path.join(conf.watch_directory,'appliance','resource_summary_temp') + res_path = os.path.join(conf.watch_directory,'appliance','resource_summary') + selfhost = os.uname()[1] + counter=0 while self.running: -# logging.info('system monitor - running '+str(self.running)) - self.threadEvent.wait(5) + self.threadEvent.wait(5 if 
counter>0 else 1) + counter+=1 + counter=counter%5 if suspended:continue tstring = datetime.datetime.utcfromtimestamp(time.time()).isoformat() - fp = None + ramdisk = None + if conf.role == 'bu': + ramdisk = os.statvfs(conf.watch_directory) + ramdisk_occ=1 + try:ramdisk_occ = float((ramdisk.f_blocks - ramdisk.f_bavail)*ramdisk.f_bsize - ramdisk_submount_size)/float(ramdisk.f_blocks*ramdisk.f_bsize - ramdisk_submount_size) + except:pass + if ramdisk_occ<0: + ramdisk_occ=0 + logger.info('incorrect ramdisk occupancy',ramdisk_occ) + if ramdisk_occ>1: + ramdisk_occ=1 + logger.info('incorrect ramdisk occupancy',ramdisk_occ) + + resource_count_idle = 0 + resource_count_used = 0 + resource_count_broken = 0 + cloud_count = 0 + lastFURuns = [] + lastFURun=-1 + activeRunQueuedLumisNum = -1 + current_time = time.time() + for key in boxinfoFUMap: + if key==selfhost:continue + entry = boxinfoFUMap[key] + if current_time - entry[1] > 10:continue + resource_count_idle+=int(entry[0]['idles']) + resource_count_used+=int(entry[0]['used']) + resource_count_broken+=int(entry[0]['broken']) + cloud_count+=int(entry[0]['cloud']) + try: + lastFURuns.append(int(entry[0]['activeRuns'].strip('[]').split(',')[-1])) + except:pass + fuRuns = sorted(list(set(lastFURuns))) + if len(fuRuns)>0: + lastFURun = fuRuns[-1] + #second pass + for key in boxinfoFUMap: + if key==selfhost:continue + entry = boxinfoFUMap[key] + if current_time - entry[1] > 10:continue + try: + lastrun = int(entry[0]['activeRuns'].strip('[]').split(',')[-1]) + if lastrun==lastFURun: + qlumis = int(entry[0]['activeRunNumQueuedLS']) + if qlumis>activeRunQueuedLumisNum:activeRunQueuedLumisNum=qlumis + except:pass + res_doc = { + "active_resources":resource_count_idle+resource_count_used, + "idle":resource_count_idle, + "used":resource_count_used, + "broken":resource_count_broken, + "cloud":cloud_count, + "activeFURun":lastFURun, + "activeRunNumQueuedLS":activeRunQueuedLumisNum, + "ramdisk_occupancy":ramdisk_occ + } + with open(res_path_temp,'w') as fp: + json.dump(res_doc,fp) + os.rename(res_path_temp,res_path) + for mfile in self.file: if conf.role == 'fu': dirstat = os.statvfs(conf.watch_directory) - fp=open(mfile,'w+') - fp.write('fm_date='+tstring+'\n') - fp.write('idles='+str(len(os.listdir(idles)))+'\n') - fp.write('used='+str(len(os.listdir(used)))+'\n') - fp.write('broken='+str(len(os.listdir(broken)))+'\n') - fp.write('quarantined='+str(len(os.listdir(quarantined)))+'\n') - fp.write('usedDataDir='+str(((dirstat.f_blocks - dirstat.f_bavail)*dirstat.f_bsize)>>20)+'\n') - fp.write('totalDataDir='+str((dirstat.f_blocks*dirstat.f_bsize)>>20)+'\n') - #two lines with active runs (used to check file consistency) - fp.write('activeRuns='+str(active_runs).strip('[]')+'\n') - fp.write('activeRuns='+str(active_runs).strip('[]')+'\n') - fp.write('entriesComplete=True') - fp.close() + try: + with open(mfile,'w+') as fp: + fp.write('fm_date='+tstring+'\n') + if cloud_mode==True and entering_cloud_mode==True: + #lie about cores in cloud if cloud mode enabled, even if still processing + fp.write('idles=0\n') + fp.write('used=0\n') + fp.write('broken=0\n') + fp.write('cloud='+str(len(os.listdir(cloud))+len(os.listdir(idles))+len(os.listdir(used))+len(os.listdir(broken)))+'\n') + else: + fp.write('idles='+str(len(os.listdir(idles)))+'\n') + fp.write('used='+str(len(os.listdir(used)))+'\n') + fp.write('broken='+str(len(os.listdir(broken)))+'\n') + fp.write('cloud='+str(len(os.listdir(cloud)))+'\n') + + fp.write('quarantined='+str(len(os.listdir(quarantined)))+'\n') 
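The BU branch above publishes the appliance resource summary by writing resource_summary_temp and then renaming it, so readers never observe a half-written JSON document. A minimal sketch of that write-then-rename pattern, using a hypothetical target directory and an example document shaped like res_doc:

import json
import os

def publish_summary(res_doc, appliance_dir):
    # write the complete document to a temporary file first ...
    tmp_path = os.path.join(appliance_dir, 'resource_summary_temp')
    final_path = os.path.join(appliance_dir, 'resource_summary')
    with open(tmp_path, 'w') as fp:
        json.dump(res_doc, fp)
    # ... then atomically replace the published file (rename within one filesystem)
    os.rename(tmp_path, final_path)

doc = {'active_resources': 8, 'idle': 4, 'used': 4, 'broken': 0,
       'cloud': 0, 'activeFURun': -1, 'activeRunNumQueuedLS': -1,
       'ramdisk_occupancy': 0.0}
target = '/tmp/appliance-example'          # hypothetical directory, not the real watch_directory
if not os.path.isdir(target):
    os.makedirs(target)
publish_summary(doc, target)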
+ fp.write('usedDataDir='+str(((dirstat.f_blocks - dirstat.f_bavail)*dirstat.f_bsize)>>20)+'\n') + fp.write('totalDataDir='+str((dirstat.f_blocks*dirstat.f_bsize)>>20)+'\n') + #two lines with active runs (used to check file consistency) + fp.write('activeRuns='+str(active_runs).strip('[]')+'\n') + fp.write('activeRuns='+str(active_runs).strip('[]')+'\n') + fp.write('activeRunsErrors='+str(active_runs_errors).strip('[]')+'\n') + fp.write('activeRunNumQueuedLS='+self.getLumiQueueStat()+'\n') + fp.write('entriesComplete=True') + except Exception as ex: + logger.warning('boxinfo file write failed +'+str(ex)) + if counter==0: + #in case something happened with the BU server, try remount + cleanup_mountpoints() + if conf.role == 'bu': - ramdisk = os.statvfs(conf.watch_directory) + #ramdisk = os.statvfs(conf.watch_directory) outdir = os.statvfs('/fff/output') - fp=open(mfile,'w+') - - fp.write('fm_date='+tstring+'\n') - fp.write('idles=0\n') - fp.write('used=0\n') - fp.write('broken=0\n') - fp.write('quarantined=0\n') - fp.write('usedRamdisk='+str(((ramdisk.f_blocks - ramdisk.f_bavail)*ramdisk.f_bsize)>>20)+'\n') - fp.write('totalRamdisk='+str((ramdisk.f_blocks*ramdisk.f_bsize)>>20)+'\n') - fp.write('usedOutput='+str(((outdir.f_blocks - outdir.f_bavail)*outdir.f_bsize)>>20)+'\n') - fp.write('totalOutput='+str((outdir.f_blocks*outdir.f_bsize)>>20)+'\n') - fp.write('activeRuns='+str(active_runs).strip('[]')+'\n') - fp.write('activeRuns='+str(active_runs).strip('[]')+'\n') - fp.write('entriesComplete=True') - fp.close() - + with open(mfile,'w+') as fp: + fp.write('fm_date='+tstring+'\n') + fp.write('idles=0\n') + fp.write('used=0\n') + fp.write('broken=0\n') + fp.write('quarantined=0\n') + fp.write('cloud=0\n') + fp.write('usedRamdisk='+str(((ramdisk.f_blocks - ramdisk.f_bavail)*ramdisk.f_bsize - ramdisk_submount_size)>>20)+'\n') + fp.write('totalRamdisk='+str((ramdisk.f_blocks*ramdisk.f_bsize - ramdisk_submount_size)>>20)+'\n') + fp.write('usedOutput='+str(((outdir.f_blocks - outdir.f_bavail)*outdir.f_bsize)>>20)+'\n') + fp.write('totalOutput='+str((outdir.f_blocks*outdir.f_bsize)>>20)+'\n') + fp.write('activeRuns='+str(active_runs).strip('[]')+'\n') + fp.write('activeRuns='+str(active_runs).strip('[]')+'\n') + fp.write('entriesComplete=True') + + #deprecated if conf.role == 'bu': mfile = conf.resource_base+'/disk.jsn' stat=[] @@ -336,7 +580,7 @@ def run(self): json.dump(stat,fp) fp.close() except Exception as ex: - logging.error(ex) + logger.error(ex) for mfile in self.file: try: @@ -344,10 +588,20 @@ def run(self): except OSError: pass - logging.debug('exiting system monitor thread ') + logger.debug('exiting system monitor thread ') + + def getLumiQueueStat(self): + try: + with open(os.path.join(conf.watch_directory,'run'+str(active_runs[-1]).zfill(conf.run_number_padding), + 'open','queue_status.jsn'),'r') as fp: + #fcntl.flock(fp, fcntl.LOCK_EX) + statusDoc = json.load(fp) + return str(statusDoc["numQueuedLS"]) + except: + return "-1" def stop(self): - logging.debug("system_monitor: request to stop") + logger.debug("system_monitor: request to stop") self.running = False self.threadEvent.set() @@ -358,13 +612,13 @@ def __init__(self): def startNewRun(self,nr): if self.runnumber: - logging.error("Another BU emulator run "+str(self.runnumber)+" is already ongoing") + logger.error("Another BU emulator run "+str(self.runnumber)+" is already ongoing") return self.runnumber = nr configtouse = conf.test_bu_config destination_base = None if role == 'fu': - destination_base = 
bu_disk_list_ramdisk[startindex%len(bu_disk_list_ramdisk)] + destination_base = bu_disk_list_ramdisk_instance[startindex%len(bu_disk_list_ramdisk_instance)] else: destination_base = conf.watch_directory @@ -393,8 +647,8 @@ def startNewRun(self,nr): close_fds=True ) except Exception as ex: - logging.error("Error in forking BU emulator process") - logging.error(ex) + logger.error("Error in forking BU emulator process") + logger.error(ex) def stop(self): os.kill(self.process.pid,SIGINT) @@ -424,22 +678,22 @@ def ping(self): def NotifyNewRun(self,runnumber): self.runnumber = runnumber - logging.info("calling start of run on "+self.cpu[0]); + logger.info("calling start of run on "+self.cpu[0]); try: - connection = httplib.HTTPConnection(self.cpu[0], conf.cgi_port) + connection = httplib.HTTPConnection(self.cpu[0], conf.cgi_port - conf.cgi_instance_port_offset) connection.request("GET",'cgi-bin/start_cgi.py?run='+str(runnumber)) response = connection.getresponse() #do something intelligent with the response code - logging.error("response was "+str(response.status)) + logger.error("response was "+str(response.status)) if response.status > 300: self.hoststate = 1 else: - logging.info(response.read()) + logger.info(response.read()) except Exception as ex: - logging.exception(ex) + logger.exception(ex) def NotifyShutdown(self): try: - connection = httplib.HTTPConnection(self.cpu[0], conf.cgi_port) + connection = httplib.HTTPConnection(self.cpu[0], conf.cgi_port - self.cgi_instance_port_offset) connection.request("GET",'cgi-bin/stop_cgi.py?run='+str(self.runnumber)) time.sleep(0.05) response = connection.getresponse() @@ -447,10 +701,10 @@ def NotifyShutdown(self): #do something intelligent with the response code if response.status > 300: self.hoststate = 0 except Exception as ex: - logging.exception(ex) + logger.exception(ex) def StartNewProcess(self ,runnumber, startindex, arch, version, menu,num_threads,num_streams): - logging.debug("OnlineResource: StartNewProcess called") + logger.debug("OnlineResource: StartNewProcess called") self.runnumber = runnumber """ @@ -458,10 +712,10 @@ def StartNewProcess(self ,runnumber, startindex, arch, version, menu,num_threads independent mounts of the BU - it should not be necessary in due course IFF it is necessary, it should address "any" number of mounts, not just 2 """ - input_disk = bu_disk_list_ramdisk[startindex%len(bu_disk_list_ramdisk)] + input_disk = bu_disk_list_ramdisk_instance[startindex%len(bu_disk_list_ramdisk_instance)] #run_dir = input_disk + '/run' + str(self.runnumber).zfill(conf.run_number_padding) - logging.info("starting process with "+version+" and run number "+str(runnumber)) + logger.info("starting process with "+version+" and run number "+str(runnumber)) if "_patch" in version: full_release="cmssw-patch" @@ -492,7 +746,7 @@ def StartNewProcess(self ,runnumber, startindex, arch, version, menu,num_threads if self.watchdog: new_run_args.append("skipFirstLumis=True") - logging.info("arg array "+str(new_run_args).translate(None, "'")) + logger.info("arg array "+str(new_run_args).translate(None, "'")) try: # dem = demote.demote(conf.user) self.process = subprocess.Popen(new_run_args, @@ -500,29 +754,29 @@ def StartNewProcess(self ,runnumber, startindex, arch, version, menu,num_threads close_fds=True ) self.processstate = 100 - logging.info("started process "+str(self.process.pid)) + logger.info("started process "+str(self.process.pid)) # time.sleep(1.) 
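NotifyNewRun above asks a FU to start a run with a plain HTTP GET against its hltd CGI server, on the configured cgi_port minus the per-instance offset. A rough sketch of such a call; hostname, port, offset and run number here are placeholders, not values from this patch:

import httplib   # Python 2; http.client in Python 3

def notify_new_run(host, cgi_port, instance_port_offset, runnumber):
    # the effective CGI port is the configured port minus the per-instance offset
    connection = httplib.HTTPConnection(host, cgi_port - instance_port_offset, timeout=5)
    connection.request('GET', 'cgi-bin/start_cgi.py?run=' + str(runnumber))
    response = connection.getresponse()
    body = response.read()
    connection.close()
    # a status above 300 is treated as "resource could not be started"
    return response.status, body

# hypothetical call: FU host 'fu-example-01', port 9000, offset 0, run 230000
# status, body = notify_new_run('fu-example-01', 9000, 0, 230000)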
if self.watchdog==None: self.watchdog = ProcessWatchdog(self,self.lock) self.watchdog.start() - logging.debug("watchdog thread for "+str(self.process.pid)+" is alive " + logger.debug("watchdog thread for "+str(self.process.pid)+" is alive " + str(self.watchdog.is_alive())) else: self.watchdog.join() self.watchdog = ProcessWatchdog(self,self.lock) self.watchdog.start() - logging.debug("watchdog thread restarted for "+str(self.process.pid)+" is alive " + logger.debug("watchdog thread restarted for "+str(self.process.pid)+" is alive " + str(self.watchdog.is_alive())) except Exception as ex: - logging.info("OnlineResource: exception encountered in forking hlt slave") - logging.info(ex) + logger.info("OnlineResource: exception encountered in forking hlt slave") + logger.info(ex) def join(self): - logging.debug('calling join on thread ' +self.watchdog.name) + logger.debug('calling join on thread ' +self.watchdog.name) self.watchdog.join() def disableRestart(self): - logging.debug("OnlineResource "+str(self.cpu)+" restart is now disabled") + logger.debug("OnlineResource "+str(self.cpu)+" restart is now disabled") if self.watchdog: self.watchdog.disableRestart() @@ -530,11 +784,11 @@ def clearQuarantined(self): resource_lock.acquire() try: for cpu in self.quarantined: - logging.info('Clearing quarantined resource '+cpu) + logger.info('Clearing quarantined resource '+cpu) os.rename(quarantined+cpu,idles+cpu) self.quarantined = [] except Exception as ex: - logging.exception(ex) + logger.exception(ex) resource_lock.release() class ProcessWatchdog(threading.Thread): @@ -549,16 +803,16 @@ def __init__(self,resource,lock): def run(self): try: monfile = self.resource.associateddir+'/hltd.jsn' - logging.info('watchdog for process '+str(self.resource.process.pid)) + logger.info('watchdog for process '+str(self.resource.process.pid)) self.resource.process.wait() returncode = self.resource.process.returncode pid = self.resource.process.pid #update json process monitoring file self.resource.processstate=returncode - logging.debug('ProcessWatchdog: acquire lock thread '+str(pid)) + logger.debug('ProcessWatchdog: acquire lock thread '+str(pid)) self.lock.acquire() - logging.debug('ProcessWatchdog: acquired lock thread '+str(pid)) + logger.debug('ProcessWatchdog: acquired lock thread '+str(pid)) try: with open(monfile,"r+") as fp: @@ -573,13 +827,13 @@ def run(self): fp.flush() except IOError,ex: - logging.exception(ex) + logger.exception(ex) except ValueError: pass - logging.debug('ProcessWatchdog: release lock thread '+str(pid)) + logger.debug('ProcessWatchdog: release lock thread '+str(pid)) self.lock.release() - logging.debug('ProcessWatchdog: released lock thread '+str(pid)) + logger.debug('ProcessWatchdog: released lock thread '+str(pid)) abortedmarker = self.resource.statefiledir+'/'+Run.ABORTED @@ -591,20 +845,24 @@ def run(self): try: os.rename(used+cpu,idles+cpu) except Exception as ex: - logging.exception(ex) + logger.exception(ex) except:pass resource_lock.release() return - #quit codes (configuration errors): - quit_codes = [127,90,65,73] + #bump error count in active_runs_errors which is logged in the box file + if returncode!=0: + try: + global active_runs + global active_runs_errors + active_runs_errors[active_runs.index(self.resource.runnumber)]+=1 + except: + pass - #cleanup actions- remove process from list and - # attempt restart on same resource - #dqm mode will treat configuration error as a crash and eventually move to quarantined - if returncode != 0 and ( returncode not in quit_codes or 
conf.dqm_machine==True): + #cleanup actions- remove process from list and attempt restart on same resource + if returncode != 0: if returncode < 0: - logging.error("process "+str(pid) + logger.error("process "+str(pid) +" for run "+str(self.resource.runnumber) +" on resource(s) " + str(self.resource.cpu) +" exited with signal " @@ -613,7 +871,7 @@ def run(self): +str(self.retry_enabled) ) else: - logging.error("process "+str(pid) + logger.error("process "+str(pid) +" for run "+str(self.resource.runnumber) +" on resource(s) " + str(self.resource.cpu) +" exited with code " @@ -621,8 +879,23 @@ def run(self): +" restart is enabled ? " +str(self.retry_enabled) ) - - + #quit codes (configuration errors): + quit_codes = [127,90,73] + + #removed 65 because it is not only configuration error + #quit_codes = [127,90,65,73] + + #dqm mode will treat configuration error as a crash and eventually move to quarantined + if conf.dqm_machine==False and returncode in quit_codes: + if self.resource.retry_attempts < self.retry_limit: + logger.warning('for this type of error, restarting this process is disabled') + self.resource.retry_attempts=self.retry_limit + if returncode==127: + logger.fatal('Exit code indicates that CMSSW environment might not be available (cmsRun executable not in path).') + elif returncode==90: + logger.fatal('Exit code indicates that there might be a python error in the CMSSW configuration.') + else: + logger.fatal('Exit code indicates that there might be a C/C++ error in the CMSSW configuration.') #generate crashed pid json file like: run000001_ls0000_crash_pid12345.jsn oldpid = "pid"+str(pid).zfill(5) @@ -635,8 +908,8 @@ def run(self): try: with open(filepath,"w+") as fi: json.dump(document,fi) - except: logging.exception("unable to create %r" %filename) - logging.info("pid crash file: %r" %filename) + except: logger.exception("unable to create %r" %filename) + logger.info("pid crash file: %r" %filename) if self.resource.retry_attempts < self.retry_limit: @@ -649,7 +922,7 @@ def run(self): self.resource.process = None self.resource.retry_attempts += 1 - logging.info("try to restart process for resource(s) " + logger.info("try to restart process for resource(s) " +str(self.resource.cpu) +" attempt " + str(self.resource.retry_attempts)) @@ -657,10 +930,10 @@ def run(self): for cpu in self.resource.cpu: os.rename(used+cpu,broken+cpu) resource_lock.release() - logging.debug("resource(s) " +str(self.resource.cpu)+ + logger.debug("resource(s) " +str(self.resource.cpu)+ " successfully moved to except") elif self.resource.retry_attempts >= self.retry_limit: - logging.error("process for run " + logger.error("process for run " +str(self.resource.runnumber) +" on resources " + str(self.resource.cpu) +" reached max retry limit " @@ -680,20 +953,11 @@ def run(self): fp = open(conf.watch_directory+'/quarantined'+str(self.resource.runnumber).zfill(conf.run_number_padding),'w+') fp.close() except Exception as ex: - logging.exception(ex) + logger.exception(ex) #successful end= release resource (TODO:maybe should mark aborted for non-0 error codes) - elif returncode == 0 or returncode in quit_codes: - if returncode==0: - logging.info('releasing resource, exit 0 meaning end of run '+str(self.resource.cpu)) - elif returncode==127: - logging.fatal('error executing start script. 
Maybe CMSSW environment is not available (cmsRun executable not in path).') - elif returncode==90: - logging.fatal('error executing start script: python error.') - elif returncode in quit_codes: - logging.fatal('error executing start script: CMSSW configuration error.') - else: - logging.fatal('error executing start script: unspecified error.') + elif returncode == 0: + logger.info('releasing resource, exit 0 meaning end of run '+str(self.resource.cpu)) # generate an end-of-run marker if it isn't already there - it will be picked up by the RunRanger endmarker = conf.watch_directory+'/end'+str(self.resource.runnumber).zfill(conf.run_number_padding) @@ -714,12 +978,12 @@ def run(self): #self.resource.process=None - # logging.info('exiting thread '+str(self.resource.process.pid)) + # logger.info('exiting thread '+str(self.resource.process.pid)) except Exception as ex: resource_lock.release() - logging.info("OnlineResource watchdog: exception") - logging.exception(ex) + logger.info("OnlineResource watchdog: exception") + logger.exception(ex) return def disableRestart(self): @@ -736,7 +1000,8 @@ class Run: VALID_MARKERS = [STARTING,ACTIVE,STOPPING,COMPLETE,ABORTED] - def __init__(self,nr,dirname,bu_dir): + def __init__(self,nr,dirname,bu_dir,instance): + self.instance = instance self.runnumber = nr self.dirname = dirname self.online_resource_list = [] @@ -754,22 +1019,23 @@ def __init__(self,nr,dirname,bu_dir): self.anelasticWatchdog = None self.threadEvent = threading.Event() global active_runs + global active_runs_errors if conf.role == 'fu': self.changeMarkerMaybe(Run.STARTING) if int(self.runnumber) in active_runs: raise Exception("Run "+str(self.runnumber)+ "already active") active_runs.append(int(self.runnumber)) + active_runs_errors.append(0) else: - #currently unused on BU active_runs.append(int(self.runnumber)) + active_runs_errors.append(0) self.menu_directory = bu_dir+'/'+conf.menu_directory readMenuAttempts=0 #polling for HLT menu directory while os.path.exists(self.menu_directory)==False and conf.dqm_machine==False and conf.role=='fu': - time.sleep(.2) readMenuAttempts+=1 #10 seconds allowed before defaulting to local configuration if readMenuAttempts>50: break @@ -780,19 +1046,17 @@ def __init__(self,nr,dirname,bu_dir): while True: self.menu = self.menu_directory+'/'+conf.menu_name if os.path.exists(self.menu_directory+'/'+conf.arch_file): - fp = open(self.menu_directory+'/'+conf.arch_file,'r') - self.arch = fp.readline().strip() - fp.close() + with open(self.menu_directory+'/'+conf.arch_file,'r') as fp: + self.arch = fp.readline().strip() if os.path.exists(self.menu_directory+'/'+conf.version_file): - fp = open(self.menu_directory+'/'+conf.version_file,'r') - self.version = fp.readline().strip() - fp.close() + with open(self.menu_directory+'/'+conf.version_file,'r') as fp: + self.version = fp.readline().strip() try: - logging.info("Run "+str(self.runnumber)+" uses "+ self.version+" ("+self.arch+") with "+self.menu) + logger.info("Run "+str(self.runnumber)+" uses "+ self.version+" ("+self.arch+") with "+self.menu) break except Exception as ex: - logging.exception(ex) - logging.error("Run parameters obtained for run "+str(self.runnumber)+": "+ str(self.version)+" ("+str(self.arch)+") with "+str(self.menu)) + logger.exception(ex) + logger.error("Run parameters obtained for run "+str(self.runnumber)+": "+ str(self.version)+" ("+str(self.arch)+") with "+str(self.menu)) time.sleep(.5) readMenuAttempts+=1 if readMenuAttempts==3: raise Exception("Unable to parse HLT parameters") @@ 
-802,73 +1066,83 @@ def __init__(self,nr,dirname,bu_dir): self.version = conf.cmssw_default_version self.menu = conf.test_hlt_config1 if conf.role=='fu': - logging.warn("Using default values for run "+str(self.runnumber)+": "+self.version+" ("+self.arch+") with "+self.menu) + logger.warn("Using default values for run "+str(self.runnumber)+": "+self.version+" ("+self.arch+") with "+self.menu) self.rawinputdir = None + # if conf.role == "bu": try: self.rawinputdir = conf.watch_directory+'/run'+str(self.runnumber).zfill(conf.run_number_padding) - self.buoutputdir = conf.micromerge_output+'/run'+str(self.runnumber).zfill(conf.run_number_padding) + #if conf.instance!="main" and conf.instance_same_destination==False: + # try:os.mkdir(os.path.join(conf.micromerge_output,conf.instance)) + # except:pass + # self.buoutputdir = os.path.join(conf.micromerge_output,instance,'run'+str(self.runnumber).zfill(conf.run_number_padding)) + #else: + # self.buoutputdir = os.path.join(conf.micromerge_output,'run'+str(self.runnumber).zfill(conf.run_number_padding)) os.mkdir(self.rawinputdir+'/mon') except Exception, ex: - logging.error("could not create mon dir inside the run input directory") + logger.error("could not create mon dir inside the run input directory") else: - self.rawinputdir= bu_disk_list_ramdisk[0]+'/run' + str(self.runnumber).zfill(conf.run_number_padding) + #self.rawinputdir= os.path.join(random.choice(bu_disk_list_ramdisk_instance),'run' + str(self.runnumber).zfill(conf.run_number_padding)) + self.rawinputdir= os.path.join(bu_disk_list_ramdisk_instance[0],'run' + str(self.runnumber).zfill(conf.run_number_padding)) self.lock = threading.Lock() - #conf.use_elasticsearch = False - #note: start elastic.py first! + if conf.use_elasticsearch == True: + global nsslock try: if conf.role == "bu": - logging.info("starting elasticbu.py with arguments:"+self.dirname) - elastic_args = ['/opt/hltd/python/elasticbu.py',str(self.runnumber)] + nsslock.acquire() + logger.info("starting elasticbu.py with arguments:"+self.dirname) + elastic_args = ['/opt/hltd/python/elasticbu.py',self.instance,str(self.runnumber)] else: - logging.info("starting elastic.py with arguments:"+self.dirname) - elastic_args = ['/opt/hltd/python/elastic.py',self.dirname,self.rawinputdir+'/mon',str(expected_processes),str(conf.elastic_cluster)] + logger.info("starting elastic.py with arguments:"+self.dirname) + elastic_args = ['/opt/hltd/python/elastic.py',self.dirname,self.rawinputdir+'/mon',str(expected_processes)] self.elastic_monitor = subprocess.Popen(elastic_args, preexec_fn=preexec_function, close_fds=True ) - except OSError as ex: - logging.error("failed to start elasticsearch client") - logging.error(ex) + logger.error("failed to start elasticsearch client") + logger.error(ex) + try:nsslock.release() + except:pass if conf.role == "fu" and conf.dqm_machine==False: try: - logging.info("starting anelastic.py with arguments:"+self.dirname) - elastic_args = ['/opt/hltd/python/anelastic.py',self.dirname,str(self.runnumber), self.rawinputdir] + logger.info("starting anelastic.py with arguments:"+self.dirname) + #elastic_args = ['/opt/hltd/python/anelastic.py',self.dirname,str(self.runnumber), self.rawinputdir,random.choice(bu_disk_list_output_instance)] + elastic_args = ['/opt/hltd/python/anelastic.py',self.dirname,str(self.runnumber), self.rawinputdir,bu_disk_list_output_instance[0]] self.anelastic_monitor = subprocess.Popen(elastic_args, preexec_fn=preexec_function, close_fds=True ) except OSError as ex: - logging.fatal("failed to 
start anelastic.py client:") - logging.exception(ex) + logger.fatal("failed to start anelastic.py client:") + logger.exception(ex) sys.exit(1) def AcquireResource(self,resourcenames,fromstate): idles = conf.resource_base+'/'+fromstate+'/' try: - logging.debug("Trying to acquire resource " + logger.debug("Trying to acquire resource " +str(resourcenames) +" from "+fromstate) for resourcename in resourcenames: os.rename(idles+resourcename,used+resourcename) if not filter(lambda x: x.cpu==resourcenames,self.online_resource_list): - logging.debug("resource(s) "+str(resourcenames) + logger.debug("resource(s) "+str(resourcenames) +" not found in online_resource_list, creating new") self.online_resource_list.append(OnlineResource(resourcenames,self.lock)) return self.online_resource_list[-1] - logging.debug("resource(s) "+str(resourcenames) + logger.debug("resource(s) "+str(resourcenames) +" found in online_resource_list") return filter(lambda x: x.cpu==resourcenames,self.online_resource_list)[0] except Exception as ex: - logging.info("exception encountered in looking for resources") - logging.info(ex) + logger.info("exception encountered in looking for resources") + logger.info(ex) def ContactResource(self,resourcename): self.online_resource_list.append(OnlineResource(resourcename,self.lock)) @@ -878,28 +1152,39 @@ def ReleaseResource(self,res): self.online_resource_list.remove(res) def AcquireResources(self,mode): - logging.info("acquiring resources from "+conf.resource_base) + logger.info("acquiring resources from "+conf.resource_base) idles = conf.resource_base idles += '/idle/' if conf.role == 'fu' else '/boxes/' try: dirlist = os.listdir(idles) except Exception as ex: - logging.info("exception encountered in looking for resources") - logging.info(ex) - logging.info(dirlist) + logger.info("exception encountered in looking for resources") + logger.info(ex) + logger.info(str(dirlist)) current_time = time.time() count = 0 cpu_group=[] #self.lock.acquire() + global machine_blacklist + if conf.role=='bu': + update_success,machine_blacklist=updateBlacklist() + if update_success==False: + logger.fatal("unable to check blacklist: giving up on run start") + return False + for cpu in dirlist: #skip self - if conf.role=='bu' and cpu == os.uname()[1]:continue - + if conf.role=='bu': + if cpu == os.uname()[1]:continue + if cpu in machine_blacklist: + logger.info("skipping blacklisted resource "+str(cpu)) + continue + count = count+1 cpu_group.append(cpu) age = current_time - os.path.getmtime(idles+cpu) - logging.info("found resource "+cpu+" which is "+str(age)+" seconds old") + logger.info("found resource "+cpu+" which is "+str(age)+" seconds old") if conf.role == 'fu': if count == nstreams: self.AcquireResource(cpu_group,'idle') @@ -909,12 +1194,13 @@ def AcquireResources(self,mode): if age < 10: cpus = [cpu] self.ContactResource(cpus) + return True #self.lock.release() def Start(self): self.is_active_run = True for resource in self.online_resource_list: - logging.info('start run '+str(self.runnumber)+' on cpu(s) '+str(resource.cpu)) + logger.info('start run '+str(self.runnumber)+' on cpu(s) '+str(resource.cpu)) if conf.role == 'fu': self.StartOnResource(resource) else: @@ -929,11 +1215,11 @@ def Start(self): self.startCompletedChecker() def StartOnResource(self, resource): - logging.debug("StartOnResource called") + logger.debug("StartOnResource called") resource.statefiledir=conf.watch_directory+'/run'+str(self.runnumber).zfill(conf.run_number_padding) mondir = os.path.join(resource.statefiledir,'mon') 
resource.associateddir=mondir - logging.info(str(nthreads)+' '+str(nstreams)) + logger.info(str(nthreads)+' '+str(nstreams)) resource.StartNewProcess(self.runnumber, self.online_resource_list.index(resource), self.arch, @@ -941,10 +1227,10 @@ def StartOnResource(self, resource): self.menu, int(round((len(resource.cpu)*float(nthreads)/nstreams))), len(resource.cpu)) - logging.debug("StartOnResource process started") - #logging.debug("StartOnResource going to acquire lock") + logger.debug("StartOnResource process started") + #logger.debug("StartOnResource going to acquire lock") #self.lock.acquire() - #logging.debug("StartOnResource lock acquired") + #logger.debug("StartOnResource lock acquired") try: os.makedirs(mondir) except OSError: @@ -954,7 +1240,7 @@ def StartOnResource(self, resource): fp=None stat = [] if not os.path.exists(monfile): - logging.debug("No log file "+monfile+" found, creating one") + logger.debug("No log file "+monfile+" found, creating one") fp=open(monfile,'w+') attempts=0 while True: @@ -966,12 +1252,12 @@ def StartOnResource(self, resource): attempts+=1 continue else: - logging.error("could not retrieve process parameters") - logging.exception(ex) + logger.error("could not retrieve process parameters") + logger.exception(ex) break else: - logging.debug("Updating existing log file "+monfile) + logger.debug("Updating existing log file "+monfile) fp=open(monfile,'r+') stat=json.load(fp) attempts=0 @@ -990,8 +1276,8 @@ def StartOnResource(self, resource): time.sleep(.05) continue else: - logging.error("could not retrieve process parameters") - logging.exception(ex) + logger.error("could not retrieve process parameters") + logger.exception(ex) break fp.seek(0) fp.truncate() @@ -1000,11 +1286,34 @@ def StartOnResource(self, resource): fp.flush() fp.close() #self.lock.release() - #logging.debug("StartOnResource lock released") + #logger.debug("StartOnResource lock released") + + def Stop(self): + #used to gracefully stop CMSSW and finish scripts + with open(os.path.join(self.dirname,"temp_CMSSW_STOP"),'w') as f: + writedoc = {} + bu_lumis = [] + try: + bu_eols_files = filter( lambda x: x.endswith("_EoLS.jsn"),os.listdir(self.rawinputdir)) + bu_lumis = (sorted([int(x.split('_')[1][2:]) for x in bu_eols_files])) + except: + logger.error("Unable to parse BU EoLS files") + if len(bu_lumis): + logger.info('last closed lumisection in ramdisk is '+str(bu_lumis[-1])) + writedoc['lastLS']=bu_lumis[-1]+2 #current+2 + else: writedoc['lastLS']=2 + json.dump(writedoc,f) + try: + os.rename(os.path.join(self.dirname,"temp_CMSSW_STOP"),os.path.join(self.dirname,"CMSSW_STOP")) + except:pass + - def Shutdown(self,herod=False): + def Shutdown(self,herod): #herod mode sends sigkill to all process, however waits for all scripts to finish - logging.debug("Run:Shutdown called") + logger.debug("Run:Shutdown called") + global runs_pending_shutdown + if self.runnumber in runs_pending_shutdown: runs_pending_shutdown.remove(self.runnumber) + self.is_active_run = False try: self.changeMarkerMaybe(Run.ABORTED) @@ -1017,16 +1326,16 @@ def Shutdown(self,herod=False): for resource in self.online_resource_list: if conf.role == 'fu': if resource.processstate==100: - logging.info('terminating process '+str(resource.process.pid)+ + logger.info('terminating process '+str(resource.process.pid)+ ' in state '+str(resource.processstate)) if herod:resource.process.kill() else:resource.process.terminate() - logging.info('process '+str(resource.process.pid)+' join watchdog thread') + logger.info('process 
'+str(resource.process.pid)+' join watchdog thread') # time.sleep(.1) resource.join() - logging.info('process '+str(resource.process.pid)+' terminated') - logging.info('releasing resource(s) '+str(resource.cpu)) + logger.info('process '+str(resource.process.pid)+' terminated') + logger.info('releasing resource(s) '+str(resource.cpu)) resource.clearQuarantined() resource_lock.acquire() @@ -1034,8 +1343,8 @@ def Shutdown(self,herod=False): try: os.rename(used+cpu,idles+cpu) except OSError: - #@SM:happens if t was quarantined - logging.warning('Unable to find resource file '+used+cpu+'.') + #@SM:happens if it was quarantined + logger.warning('Unable to find resource file '+used+cpu+'.') except Exception as ex: resource_lock.release() raise(ex) @@ -1055,8 +1364,8 @@ def Shutdown(self,herod=False): self.anelastic_monitor.terminate() self.anelastic_monitor.wait() except Exception as ex: - logging.info("exception encountered in shutting down anelastic.py "+ str(ex)) - #logging.exception(ex) + logger.info("exception encountered in shutting down anelastic.py "+ str(ex)) + #logger.exception(ex) if conf.use_elasticsearch == True: try: if self.elastic_monitor: @@ -1066,18 +1375,21 @@ def Shutdown(self,herod=False): self.elastic_monitor.terminate() self.elastic_monitor.wait() except Exception as ex: - logging.info("exception encountered in shutting down elastic.py") - logging.exception(ex) + logger.info("exception encountered in shutting down elastic.py") + if "No child processes" in str(ex):pass + else:logger.exception(ex) if self.waitForEndThread is not None: self.waitForEndThread.join() except Exception as ex: - logging.info("exception encountered in shutting down resources") - logging.exception(ex) + logger.info("exception encountered in shutting down resources") + logger.exception(ex) global active_runs + global active_runs_errors active_runs_copy = active_runs[:] for run_num in active_runs_copy: if run_num == self.runnumber: + active_runs_errors.pop(active_runs.index(run_num)) active_runs.remove(run_num) try: @@ -1087,7 +1399,7 @@ def Shutdown(self,herod=False): except: pass - logging.info('Shutdown of run '+str(self.runnumber).zfill(conf.run_number_padding)+' completed') + logger.info('Shutdown of run '+str(self.runnumber).zfill(conf.run_number_padding)+' completed') def ShutdownBU(self): @@ -1108,16 +1420,18 @@ def ShutdownBU(self): time.sleep(.1) self.elastic_monitor.wait() except Exception as ex: - logging.info("exception encountered in shutting down elasticbu.py: " + str(ex)) - #logging.exception(ex) + logger.info("exception encountered in shutting down elasticbu.py: " + str(ex)) + #logger.exception(ex) global active_runs + global active_runs_errors active_runs_copy = active_runs[:] for run_num in active_runs_copy: if run_num == self.runnumber: + active_runs_errors.pop(active_runs.index(run_num)) active_runs.remove(run_num) - logging.info('Shutdown of run '+str(self.runnumber).zfill(conf.run_number_padding)+' on BU completed') + logger.info('Shutdown of run '+str(self.runnumber).zfill(conf.run_number_padding)+' on BU completed') def StartWaitForEnd(self): @@ -1127,11 +1441,13 @@ def StartWaitForEnd(self): self.waitForEndThread = threading.Thread(target = self.WaitForEnd) self.waitForEndThread.start() except Exception as ex: - logging.info("exception encountered in starting run end thread") - logging.info(ex) + logger.info("exception encountered in starting run end thread") + logger.info(ex) def WaitForEnd(self): - logging.info("wait for end thread!") + logger.info("wait for end thread!") + 
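The graceful Stop() path above derives the lumisection at which CMSSW should stop from the BU's _EoLS.jsn markers before publishing CMSSW_STOP. A small sketch of that filename parsing, assuming the run..._ls..._EoLS.jsn naming and the +2 margin used in the patch; the sample filenames are invented:

def last_ls_from_eols(filenames):
    # BU end-of-lumisection markers look like run000123_ls0042_EoLS.jsn;
    # the second underscore-separated field carries the lumisection number
    eols = [f for f in filenames if f.endswith('_EoLS.jsn')]
    lumis = sorted(int(f.split('_')[1][2:]) for f in eols)
    # stop two lumisections after the last one already closed on the BU
    return (lumis[-1] + 2) if lumis else 2

sample = ['run000123_ls0001_EoLS.jsn', 'run000123_ls0002_EoLS.jsn', 'run000123_ls0002.jsn']
print(last_ls_from_eols(sample))   # -> 4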
global cloud_mode + global entering_cloud_mode try: for resource in self.online_resource_list: resource.disableRestart() @@ -1139,19 +1455,19 @@ def WaitForEnd(self): if resource.processstate is not None:#was:100 if resource.process is not None and resource.process.pid is not None: ppid = resource.process.pid else: ppid="None" - logging.info('waiting for process '+str(ppid)+ + logger.info('waiting for process '+str(ppid)+ ' in state '+str(resource.processstate) + ' to complete ') try: resource.join() - logging.info('process '+str(resource.process.pid)+' completed') + logger.info('process '+str(resource.process.pid)+' completed') except:pass # os.rename(used+resource.cpu,idles+resource.cpu) resource.clearQuarantined() resource.process=None self.online_resource_list = [] if conf.role == 'fu': - logging.info('writing complete file') + logger.info('writing complete file') self.changeMarkerMaybe(Run.COMPLETE) try: os.remove(conf.watch_directory+'/end'+str(self.runnumber).zfill(conf.run_number_padding)) @@ -1160,29 +1476,43 @@ def WaitForEnd(self): if conf.dqm_machine==False: self.anelastic_monitor.wait() except OSError,ex: - logging.info("Exception encountered in waiting for termination of anelastic:" +str(ex)) + logger.info("Exception encountered in waiting for termination of anelastic:" +str(ex)) if conf.use_elasticsearch == True: try: self.elastic_monitor.wait() except OSError,ex: - logging.info("Exception encountered in waiting for termination of anelastic:" +str(ex)) + logger.info("Exception encountered in waiting for termination of anelastic:" +str(ex)) if conf.delete_run_dir is not None and conf.delete_run_dir == True: try: shutil.rmtree(self.dirname) except Exception as ex: - logging.exception(ex) + logger.exception(ex) global active_runs - logging.info("active runs.."+str(active_runs)) + global active_runs_errors + logger.info("active runs.."+str(active_runs)) for run_num in active_runs: if run_num == self.runnumber: + active_runs_errors.pop(active_runs.index(run_num)) active_runs.remove(run_num) - logging.info("new active runs.."+str(active_runs)) + logger.info("new active runs.."+str(active_runs)) + + if cloud_mode==True: + resource_lock.acquire() + if len(active_runs)>=1: + logger.info("VM mode: waiting for runs: "+str(active_runs)+" to finish") + else: + logger.info("No active runs. 
moving all resource files to cloud") + #give resources to cloud and bail out + move_resources_to_cloud() + entering_cloud_mode=False + resource_lock.release() except Exception as ex: - logging.error("exception encountered in ending run") - logging.exception(ex) + resource_lock.release() + logger.error("exception encountered in ending run") + logger.exception(ex) def changeMarkerMaybe(self,marker): dir = self.dirname @@ -1192,7 +1522,7 @@ def changeMarkerMaybe(self,marker): fp = open(dir+'/'+marker,'w+') fp.close() else: - logging.error("There are more than one markers for run " + logger.error("There are more than one markers for run " +str(self.runnumber)) return @@ -1201,8 +1531,8 @@ def startAnelasticWatchdog(self): self.anelasticWatchdog = threading.Thread(target = self.runAnelasticWatchdog) self.anelasticWatchdog.start() except Exception as ex: - logging.info("exception encountered in starting anelastic watchdog thread") - logging.info(ex) + logger.info("exception encountered in starting anelastic watchdog thread") + logger.info(ex) def runAnelasticWatchdog(self): try: @@ -1210,8 +1540,8 @@ def runAnelasticWatchdog(self): if self.is_active_run == True: #abort the run self.anelasticWatchdog=None - logging.fatal("Premature end of anelastic.py") - self.Shutdown() + logger.fatal("Premature end of anelastic.py") + self.Shutdown(False) except: pass @@ -1223,14 +1553,14 @@ def stopAnelasticWatchdog(self): def startCompletedChecker(self): if conf.role == 'bu': #and conf.use_elasticsearch == True: try: - logging.info('start checking completition of run '+str(self.runnumber)) + logger.info('start checking completition of run '+str(self.runnumber)) #mode 1: check for complete entries in ES #mode 2: check for runs in 'boxes' files - self.endChecker = RunCompletedChecker(1,int(self.runnumber),self.online_resource_list,self.dirname, active_runs,self.elastic_monitor) + self.endChecker = RunCompletedChecker(conf,1,int(self.runnumber),self.online_resource_list,self.dirname,active_runs,active_runs_errors,self.elastic_monitor) self.endChecker.start() except Exception,ex: - logging.error('failure to start run completition checker:') - logging.exception(ex) + logger.error('failure to start run completition checker:') + logger.exception(ex) def checkQuarantinedLimit(self): allQuarantined=True @@ -1248,8 +1578,9 @@ def checkQuarantinedLimit(self): class RunRanger: - def __init__(self): + def __init__(self,instance): self.inotifyWrapper = InotifyWrapper(self) + self.instance = instance def register_inotify_path(self,path,mask): self.inotifyWrapper.registerPath(path,mask) @@ -1258,25 +1589,64 @@ def start_inotify(self): self.inotifyWrapper.start() def stop_inotify(self): - logging.info("RunRanger: Stop inotify wrapper") self.inotifyWrapper.stop() - logging.info("RunRanger: Join inotify wrapper") self.inotifyWrapper.join() - logging.info("RunRanger: Inotify wrapper returned") + logger.info("RunRanger: Inotify wrapper shutdown done") def process_IN_CREATE(self, event): nr=0 global run_list - logging.info('RunRanger: event '+event.fullpath) + global runs_pending_shutdown + global active_runs + global active_runs_errors + global cloud_mode + global entering_cloud_mode + logger.info('RunRanger: event '+event.fullpath) dirname=event.fullpath[event.fullpath.rfind("/")+1:] - logging.info('RunRanger: new filename '+dirname) + logger.info('RunRanger: new filename '+dirname) if dirname.startswith('run'): + + if os.path.islink(event.fullpath): + logger.info('directory ' + event.fullpath + ' is link. 
Ignoring this run') + return + if not os.path.isdir(event.fullpath): + logger.info(event.fullpath +' is a file. A directory is needed to start a run.') + return nr=int(dirname[3:]) if nr!=0: try: - logging.info('new run '+str(nr)) + logger.info('new run '+str(nr)) + #terminate quarantined runs + for q_runnumber in runs_pending_shutdown: + q_run = filter(lambda x: x.runnumber==q_runnumber,run_list) + if len(q_run): + q_run[0].Shutdown(True)#run abort in herod mode (wait for anelastic/elastic to shut down) + time.sleep(.1) + + if cloud_mode==True and entering_cloud_mode==False: + logger.info("received new run notification in VM mode. Checking if idle cores are available...") + try: + if len(os.listdir(idles))<1: + logger.info("this run is skipped because FU is in VM mode and resources have not been returned") + return + #return all resources to HLTD (TODO:check if VM tool is done) + while True: + resource_lock.acquire() + #retry this operation in case cores get moved around by other means + if cleanup_resources()==True: + resource_lock.release() + break + resource_lock.release() + time.sleep(0.1) + logger.warning("could not move all resources, retrying.") + cloud_mode=False + except Exception as ex: + #resource_lock.release() + logger.fatal("failed to disable VM mode when receiving notification for run "+str(nr)) + logger.exception(ex) if conf.role == 'fu': - bu_dir = bu_disk_list_ramdisk[0]+'/'+dirname + #bu_dir = random.choice(bu_disk_list_ramdisk_instance)+'/'+dirname + bu_dir = bu_disk_list_ramdisk_instance[0]+'/'+dirname try: os.symlink(bu_dir+'/jsd',event.fullpath+'/jsd') except: @@ -1294,17 +1664,23 @@ def process_IN_CREATE(self, event): # create an EoR file that will trigger all the running jobs to exit nicely open(EoR_file_name, 'w').close() - run_list.append(Run(nr,event.fullpath,bu_dir)) + run_list.append(Run(nr,event.fullpath,bu_dir,self.instance)) resource_lock.acquire() - run_list[-1].AcquireResources(mode='greedy') - run_list[-1].Start() + if run_list[-1].AcquireResources(mode='greedy'): + run_list[-1].Start() + else: + run_list.remove(run_list[-1]) resource_lock.release() + if conf.role == 'bu' and conf.instance != 'main': + logger.info('creating run symlink in main ramdisk directory') + main_ramdisk = os.path.dirname(os.path.normpath(conf.watch_directory)) + os.symlink(event.fullpath,os.path.join(main_ramdisk,os.path.basename(event.fullpath))) except OSError as ex: - logging.error("RunRanger: "+str(ex)+" "+ex.filename) - logging.exception(ex) + logger.error("RunRanger: "+str(ex)+" "+ex.filename) + logger.exception(ex) except Exception as ex: - logging.error("RunRanger: unexpected exception encountered in forking hlt slave") - logging.exception(ex) + logger.error("RunRanger: unexpected exception encountered in forking hlt slave") + logger.exception(ex) elif dirname.startswith('emu'): nr=int(dirname[3:]) @@ -1316,8 +1692,8 @@ def process_IN_CREATE(self, event): bu_emulator.startNewRun(nr) except Exception as ex: - logging.info("exception encountered in starting BU emulator run") - logging.info(ex) + logger.info("exception encountered in starting BU emulator run") + logger.info(ex) os.remove(event.fullpath) @@ -1330,7 +1706,7 @@ def process_IN_CREATE(self, event): try: runtoend = filter(lambda x: x.runnumber==nr,run_list) if len(runtoend)==1: - logging.info('end run '+str(nr)) + logger.info('end run '+str(nr)) #remove from run_list to prevent intermittent restarts #lock used to fix a race condition when core files are being moved around resource_lock.acquire() @@ -1341,34 
+1717,34 @@ def process_IN_CREATE(self, event): runtoend[0].StartWaitForEnd() if bu_emulator and bu_emulator.runnumber != None: bu_emulator.stop() - #logging.info('run '+str(nr)+' removing end-of-run marker') + #logger.info('run '+str(nr)+' removing end-of-run marker') #os.remove(event.fullpath) elif len(runtoend)==0: - logging.warning('request to end run '+str(nr) + logger.warning('request to end run '+str(nr) +' which does not exist') os.remove(event.fullpath) else: - logging.error('request to end run '+str(nr) + logger.error('request to end run '+str(nr) +' has more than one run object - this should ' +'*never* happen') except Exception as ex: resource_lock.release() - logging.info("exception encountered when waiting hltrun to end") - logging.info(ex) + logger.info("exception encountered when waiting hltrun to end") + logger.info(ex) else: - logging.error('request to end run '+str(nr) + logger.error('request to end run '+str(nr) +' which is an invalid run number - this should ' +'*never* happen') else: - logging.error('request to end run '+str(nr) + logger.error('request to end run '+str(nr) +' which is NOT a run number - this should ' +'*never* happen') elif dirname.startswith('herod'): os.remove(event.fullpath) if conf.role == 'fu': - logging.info("killing all CMSSW child processes") + logger.info("killing all CMSSW child processes") for run in run_list: run.Shutdown(True) elif conf.role == 'bu': @@ -1378,46 +1754,48 @@ def process_IN_CREATE(self, event): try: dirlist = os.listdir(boxdir) current_time = time.time() - logging.info("sending herod to child FUs") + logger.info("sending herod to child FUs") for name in dirlist: if name == os.uname()[1]:continue age = current_time - os.path.getmtime(boxdir+name) - logging.info('found box '+name+' with keepalive age '+str(age)) + logger.info('found box '+name+' with keepalive age '+str(age)) if age < 20: - connection = httplib.HTTPConnection(name, conf.cgi_port) + connection = httplib.HTTPConnection(name, conf.cgi_port - self.cgi_instance_port_offset) connection.request("GET",'cgi-bin/herod_cgi.py') response = connection.getresponse() - logging.info("sent herod to all child FUs") + logger.info("sent herod to all child FUs") except Exception as ex: - logging.error("exception encountered in contacting resources") - logging.info(ex) + logger.error("exception encountered in contacting resources") + logger.info(ex) run_list=[] + active_runs_errors=[] active_runs=[] - elif dirname.startswith('populationcontrol'): - logging.info("terminating all ongoing runs") - for run in run_list: - if conf.role=='fu': - run.Shutdown() - elif conf.role=='bu': - run.ShutdownBU() + if len(run_list)>0: + logger.info("terminating all ongoing runs via cgi interface (populationcontrol): "+str(run_list)) + for run in run_list: + if conf.role=='fu': + run.Shutdown(run.runnumber in runs_pending_shutdown) + elif conf.role=='bu': + run.ShutdownBU() + logger.info("terminated all ongoing runs via cgi interface (populationcontrol)") run_list = [] + active_runs_errors=[] active_runs=[] - logging.info("terminated all ongoing runs via cgi interface (populationcontrol)") os.remove(event.fullpath) elif dirname.startswith('harakiri') and conf.role == 'fu': os.remove(event.fullpath) pid=os.getpid() - logging.info('asked to commit seppuku:'+str(pid)) + logger.info('asked to commit seppuku:'+str(pid)) try: - logging.info('sending signal '+str(SIGKILL)+' to myself:'+str(pid)) + logger.info('sending signal '+str(SIGKILL)+' to myself:'+str(pid)) retval = os.kill(pid, SIGKILL) - 
logging.info('sent SIGINT to myself:'+str(pid)) - logging.info('got return '+str(retval)+'waiting to die...and hope for the best') + logger.info('sent SIGINT to myself:'+str(pid)) + logger.info('got return '+str(retval)+'waiting to die...and hope for the best') except Exception as ex: - logging.error("exception in committing harakiri - the blade is not sharp enough...") - logging.error(ex) + logger.error("exception in committing harakiri - the blade is not sharp enough...") + logger.error(ex) elif dirname.startswith('quarantined'): try: @@ -1431,30 +1809,35 @@ def process_IN_CREATE(self, event): runtoend = filter(lambda x: x.runnumber==nr,run_list) if len(runtoend)==1: if runtoend[0].checkQuarantinedLimit()==True: - runtoend[0].Shutdown(True)#run abort in herod mode (wait for anelastic/elastic to shut down) + hasHigherRuns = filter(lambda x: x.runnumber>nr,run_list) + if len(hasHigherRuns)>0: + runtoend[0].Shutdown(True) + else: + runs_pending_shutdown.append(nr) except Exception as ex: - logging.exception(ex) + logger.exception(ex) elif dirname.startswith('suspend') and conf.role == 'fu': - logging.info('suspend mountpoints initiated') + logger.info('suspend mountpoints initiated') + replyport = int(dirname[7:]) if dirname[7:].isdigit()==True else conf.cgi_port global suspended suspended=True for run in run_list: - run.Shutdown(False)#terminate all ongoing runs + run.Shutdown(run.runnumber in runs_pending_shutdown)#terminate all ongoing runs run_list=[] time.sleep(.5) umount_success = cleanup_mountpoints(remount=False) if umount_success==False: time.sleep(1) - logging.error("Suspend initiated from BU failed, trying again...") + logger.error("Suspend initiated from BU failed, trying again...") #notifying itself again try:os.remove(event.fullpath) except:pass fp = open(event.fullpath,"w+") fp.close() return - #logging.info("Suspend failed, preparing for harakiri...") + #logger.info("Suspend failed, preparing for harakiri...") #time.sleep(.1) #fp = open(os.path.join(os.path.dirname(event.fullpath.rstrip(os.path.sep)),'harakiri'),"w+") #fp.close() @@ -1471,15 +1854,15 @@ def process_IN_CREATE(self, event): #first report to BU that umount was done try: if bu_name==None: - logging.fatal("No BU name was found in the bus.config file. Leaving mount points unmounted until the hltd service restart.") + logger.fatal("No BU name was found in the bus.config file. 
Leaving mount points unmounted until the hltd service restart.") os.remove(event.fullpath) return - connection = httplib.HTTPConnection(bu_name, conf.cgi_port+5,timeout=5) + connection = httplib.HTTPConnection(bu_name, replyport+20,timeout=5) connection.request("GET",'cgi-bin/report_suspend_cgi.py?host='+os.uname()[1]) response = connection.getresponse() except Exception as ex: - logging.error("Unable to report suspend state to BU "+str(bu_name)+':'+str(conf.cgi_port+5)) - logging.exception(ex) + logger.error("Unable to report suspend state to BU "+str(bu_name)+':'+str(replyport+20)) + logger.exception(ex) #loop while BU is not reachable while True: @@ -1493,26 +1876,26 @@ def process_IN_CREATE(self, event): bu_name=line.split('.')[0] break except: - logging.info('exception test 1') + logger.info('exception test 1') time.sleep(5) continue if bu_name==None: - logging.info('exception test 2') + logger.info('exception test 2') time.sleep(5) continue - logging.info('checking if BU hltd is available...') - connection = httplib.HTTPConnection(bu_name, conf.cgi_port,timeout=5) + logger.info('checking if BU hltd is available...') + connection = httplib.HTTPConnection(bu_name, replyport,timeout=5) connection.request("GET",'cgi-bin/getcwd_cgi.py') response = connection.getresponse() - logging.info('BU hltd is running !...') + logger.info('BU hltd is running !...') #if we got here, the service is back up break except Exception as ex: try: - logging.info('Failed to contact BU hltd service: ' + str(ex.args[0]) +" "+ str(ex.args[1])) + logger.info('Failed to contact BU hltd service: ' + str(ex.args[0]) +" "+ str(ex.args[1])) except: - logging.info('Failed to contact BU hltd service: ') + logger.info('Failed to contact BU hltd service '+str(ex)) time.sleep(5) #mount again @@ -1520,12 +1903,59 @@ def process_IN_CREATE(self, event): try:os.remove(event.fullpath) except:pass suspended=False - logging.info("Remount is performed") + logger.info("Remount is performed") + + elif dirname.startswith('exclude') and conf.role == 'fu': + #service on this machine is asked to be excluded for cloud use + logger.info('machine exclude initiated') + resource_lock.acquire() + cloud_mode=True + entering_cloud_mode=True + try: + for run in run_list: + if run.runnumber in runs_pending_shutdown: + run.Shutdown(True) + else: + #write signal file for CMSSW to quit with 0 after certain LS + run.Stop() + except Exception as ex: + logger.fatal("Unable to clear runs. Will not enter VM mode.") + logger.exception(ex) + cloud_mode=False + resource_lock.release() + os.remove(event.fullpath) + + elif dirname.startswith('include') and conf.role == 'fu': + #TODO: pick up latest working run.. 
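# A minimal illustration (paths assumed, not part of the original patch) of how the 'exclude'/'include'
# markers handled in this section are meant to be driven from outside: they are plain files created in
# hltd's watch directory, which on an FU defaults to /fff/data.
#
#   open('/fff/data/exclude', 'w').close()   # drain/stop running CMSSW jobs, hand cores over to 'cloud'
#   # ... virtual machines use the freed cores ...
#   open('/fff/data/include', 'w').close()   # reclaim the cores into 'idle' so new HLT runs can start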
+ tries=1000 + if cloud_mode==True: + while True: + resource_lock.acquire() + #retry this operation in case cores get moved around by other means + if entering_cloud_mode==False and cleanup_resources()==True: + resource_lock.release() + break + resource_lock.release() + time.sleep(0.1) + tries-=1 + if tries==0: + logger.fatal("Timeout: taking resources from cloud after waiting for 100 seconds") + cleanup_resources() + entering_cloud_mode=False + break + if (tries%10)==0: + logger.warning("could not move all resources, retrying.") + cloud_mode=False + os.remove(event.fullpath) + elif dirname.startswith('logrestart'): + #hook to restart logcollector process manually + restartLogCollector(self.instance) + os.remove(event.fullpath) - logging.debug("RunRanger completed handling of event "+event.fullpath) + logger.debug("RunRanger completed handling of event "+event.fullpath) def process_default(self, event): - logging.info('RunRanger: event '+event.fullpath+' type '+str(event.mask)) + logger.info('RunRanger: event '+event.fullpath+' type '+str(event.mask)) filename=event.fullpath[event.fullpath.rfind("/")+1:] class ResourceRanger: @@ -1543,29 +1973,27 @@ def start_inotify(self): self.inotifyWrapper.start() def stop_managed_monitor(self): - logging.info("ResourceRanger: Stop managed monitor") self.managed_monitor.stop() - logging.info("ResourceRanger: Join managed monitor") self.managed_monitor.join() - logging.info("ResourceRanger: managed monitor returned") + logger.info("ResourceRanger: managed monitor shutdown done") def stop_inotify(self): - logging.info("ResourceRanger: Stop inotify wrapper") self.inotifyWrapper.stop() - logging.info("ResourceRanger: Join inotify wrapper") self.inotifyWrapper.join() - logging.info("ResourceRanger: Inotify wrapper returned") + logger.info("ResourceRanger: Inotify wrapper shutdown done") def process_IN_MOVED_TO(self, event): - logging.debug('ResourceRanger-MOVEDTO: event '+event.fullpath) + logger.debug('ResourceRanger-MOVEDTO: event '+event.fullpath) + basename = os.path.basename(event.fullpath) + if basename.startswith('resource_summary'):return try: resourcepath=event.fullpath[1:event.fullpath.rfind("/")] resourcestate=resourcepath[resourcepath.rfind("/")+1:] resourcename=event.fullpath[event.fullpath.rfind("/")+1:] resource_lock.acquire() - if not (resourcestate == 'online' or resourcestate == 'offline' + if not (resourcestate == 'online' or resourcestate == 'cloud' or resourcestate == 'quarantined'): - logging.debug('ResourceNotifier: new resource ' + logger.debug('ResourceNotifier: new resource ' +resourcename +' in ' +resourcepath @@ -1575,7 +2003,7 @@ def process_IN_MOVED_TO(self, event): ongoing_runs = filter(lambda x: x.is_active_run==True,run_list) if ongoing_runs: ongoing_run = ongoing_runs[0] - logging.info("ResourceRanger: found active run "+str(ongoing_run.runnumber)) + logger.info("ResourceRanger: found active run "+str(ongoing_run.runnumber)) """grab resources that become available #@@EM implement threaded acquisition of resources here """ @@ -1584,8 +2012,8 @@ def process_IN_MOVED_TO(self, event): try: reslist = os.listdir(idlesdir) except Exception as ex: - logging.info("exception encountered in looking for resources") - logging.exception(ex) + logger.info("exception encountered in looking for resources") + logger.exception(ex) #put inotify-ed resource as the first item for resindex,resname in enumerate(reslist): fileFound=False @@ -1614,9 +2042,9 @@ def process_IN_MOVED_TO(self, event): res = 
ongoing_run.AcquireResource(resourcenames,resourcestate) if acquired_sufficient: - logging.info("ResourceRanger: acquired resource(s) "+str(res.cpu)) + logger.info("ResourceRanger: acquired resource(s) "+str(res.cpu)) ongoing_run.StartOnResource(res) - logging.info("ResourceRanger: started process on resource " + logger.info("ResourceRanger: started process on resource " +str(res.cpu)) else: #if no run is active, move (x N threads) files from except to idle to be picked up for the next run @@ -1650,19 +2078,20 @@ def process_IN_MOVED_TO(self, event): os.rename(broken+resname,idles+resname) except Exception as ex: - logging.info("exception encountered in looking for resources in except") - logging.info(ex) + logger.info("exception encountered in looking for resources in except") + logger.info(ex) except Exception as ex: - logging.error("exception in ResourceRanger") - logging.error(ex) + logger.error("exception in ResourceRanger") + logger.error(ex) try: resource_lock.release() except:pass def process_IN_MODIFY(self, event): - - logging.debug('ResourceRanger-MODIFY: event '+event.fullpath) + logger.debug('ResourceRanger-MODIFY: event '+event.fullpath) + basename = os.path.basename(event.fullpath) + if basename.startswith('resource_summary'):return try: bus_config = os.path.join(os.path.dirname(conf.resource_base.rstrip(os.path.sep)),'bus.config') if event.fullpath == bus_config: @@ -1673,21 +2102,58 @@ def process_IN_MODIFY(self, event): if self.managed_monitor: self.managed_monitor = system_monitor() self.managed_monitor.start() - logging.info("ResouceRanger: managed monitor is "+str(self.managed_monitor)) + logger.info("ResourceRanger: managed monitor is "+str(self.managed_monitor)) except Exception as ex: - logging.error("exception in ResourceRanger") - logging.error(ex) + logger.error("exception in ResourceRanger") + logger.error(ex) def process_default(self, event): - logging.debug('ResourceRanger: event '+event.fullpath +' type '+ str(event.mask)) + logger.debug('ResourceRanger: event '+event.fullpath +' type '+ str(event.mask)) filename=event.fullpath[event.fullpath.rfind("/")+1:] + def process_IN_CLOSE_WRITE(self, event): + logger.debug('ResourceRanger-IN_CLOSE_WRITE: event '+event.fullpath) + global machine_blacklist + resourcepath=event.fullpath[0:event.fullpath.rfind("/")] + basename = os.path.basename(event.fullpath) + if basename.startswith('resource_summary'):return + if conf.role=='fu':return + if basename == os.uname()[1]:return + if basename == 'blacklist': + with open(os.path.join(conf.watch_directory,'appliance','blacklist'),'r') as fi: + try: + machine_blacklist = json.load(fi) + except: + pass + if resourcepath.endswith('boxes'): + global boxinfoFUMap + if basename in machine_blacklist: + try:boxinfoFUMap.pop(basename) + except:pass + else: + try: + infile = fileHandler(event.fullpath) + current_time = time.time() + boxinfoFUMap[basename] = [infile.data,current_time] + except Exception as ex: + logger.error("Unable to read or parse boxinfo file "+basename) + logger.exception(ex) + + class hltd(Daemon2,object): - def __init__(self, pidfile): - Daemon2.__init__(self,pidfile,'hltd') + def __init__(self, instance): + self.instance=instance + Daemon2.__init__(self,'hltd',instance,'hltd') def stop(self): + #read configuration file + try: + setFromConf(self.instance) + except Exception as ex: + print " CONFIGURATION error:",str(ex),"(check configuration file) [ \033[1;31mFAILED\033[0;39m ]" + sys.exit(4) + if self.silentStatus(): try: if
os.path.exists(conf.watch_directory+'/populationcontrol'): @@ -1697,13 +2163,18 @@ def stop(self): count = 10 while count: os.stat(conf.watch_directory+'/populationcontrol') - sys.stdout.write('o.o') + if count==10: + sys.stdout.write(' o.o') + else: + sys.stdout.write('o.o') sys.stdout.flush() - time.sleep(1.) + time.sleep(.5) count-=1 except OSError, err: + time.sleep(.1) pass except IOError, err: + time.sleep(.1) pass super(hltd,self).stop() @@ -1713,8 +2184,15 @@ def run(self): infer it from the name of the machine """ + #read configuration file + setFromConf(self.instance) + logger.info(" ") + logger.info(" ") + logger.info("<<<< ---- hltd start : instance " + self.instance + " ---- >>>>") + logger.info(" ") + if conf.enabled==False: - logging.warning("Service is currently disabled.") + logger.warning("Service is currently disabled.") sys.exit(1) if conf.role == 'fu': @@ -1722,8 +2200,11 @@ def run(self): """ cleanup resources """ + while True: + if cleanup_resources()==True:break + time.sleep(0.1) + logger.warning("retrying cleanup_resources") - cleanup_resources() """ recheck mount points this is done at start and whenever the file /etc/appliance/bus.config is modified @@ -1740,6 +2221,13 @@ def run(self): except: pass + if conf.role == 'bu': + global machine_blacklist + update_success,machine_blacklist=updateBlacklist() + global ramdisk_submount_size + if self.instance == 'main': + #if there are other instance mountpoints in ramdisk, they will be subtracted from size estimate + ramdisk_submount_size = submount_size(conf.watch_directory) """ the line below is a VERY DIRTY trick to address the fact that @@ -1751,39 +2239,47 @@ def run(self): watch_directory = os.readlink(conf.watch_directory) if os.path.islink(conf.watch_directory) else conf.watch_directory resource_base = os.readlink(conf.resource_base) if os.path.islink(conf.resource_base) else conf.resource_base + if conf.use_elasticsearch == True: + time.sleep(.2) + restartLogCollector(self.instance) + #start boxinfo elasticsearch updater + global nsslock boxInfo = None if conf.role == 'bu' and conf.use_elasticsearch == True: - boxInfo = BoxInfoUpdater(watch_directory) + boxInfo = BoxInfoUpdater(watch_directory,conf,nsslock) boxInfo.start() - logCollector = None - if conf.use_elasticsearch == True: - logging.info("starting logcollector.py") - logcolleccor_args = ['/opt/hltd/python/logcollector.py',] - logCollector = subprocess.Popen(['/opt/hltd/python/logcollector.py'],preexec_fn=preexec_function,close_fds=True) - - runRanger = RunRanger() + runRanger = RunRanger(self.instance) runRanger.register_inotify_path(watch_directory,inotify.IN_CREATE) runRanger.start_inotify() - logging.info("started RunRanger - watch_directory " + watch_directory) + logger.info("started RunRanger - watch_directory " + watch_directory) + + appliance_base=resource_base + if resource_base.endswith('/'): + resource_base = resource_base[:-1] + if resource_base.rfind('/')>0: + appliance_base = resource_base[:resource_base.rfind('/')] rr = ResourceRanger() try: - imask = inotify.IN_MOVED_TO | inotify.IN_CREATE | inotify.IN_DELETE | inotify.IN_MODIFY if conf.role == 'bu': + pass #currently does nothing on bu + imask = inotify.IN_MOVED_TO | inotify.IN_CLOSE_WRITE | inotify.IN_DELETE rr.register_inotify_path(resource_base, imask) rr.register_inotify_path(resource_base+'/boxes', imask) else: - rr.register_inotify_path(resource_base, imask) + imask_appl = inotify.IN_MODIFY + imask = inotify.IN_MOVED_TO + rr.register_inotify_path(appliance_base, imask_appl) 
rr.register_inotify_path(resource_base+'/idle', imask) rr.register_inotify_path(resource_base+'/except', imask) rr.start_inotify() - logging.info("started ResourceRanger - watch_directory "+resource_base) + logger.info("started ResourceRanger - watch_directory "+resource_base) except Exception as ex: - logging.error("Exception caught in starting notifier2") - logging.error(ex) + logger.error("Exception caught in starting ResourceRanger notifier") + logger.error(ex) try: cgitb.enable(display=0, logdir="/tmp") @@ -1791,48 +2287,53 @@ def run(self): # the following allows the base directory of the http # server to be 'conf.watch_directory, which is writeable # to everybody - if os.path.exists(conf.watch_directory+'/cgi-bin'): - os.remove(conf.watch_directory+'/cgi-bin') - os.symlink('/opt/hltd/cgi',conf.watch_directory+'/cgi-bin') + if os.path.exists(watch_directory+'/cgi-bin'): + os.remove(watch_directory+'/cgi-bin') + os.symlink('/opt/hltd/cgi',watch_directory+'/cgi-bin') handler.cgi_directories = ['/cgi-bin'] - logging.info("starting http server on port "+str(conf.cgi_port)) + logger.info("starting http server on port "+str(conf.cgi_port)) httpd = BaseHTTPServer.HTTPServer(("", conf.cgi_port), handler) - logging.info("hltd serving at port "+str(conf.cgi_port)+" with role "+conf.role) - os.chdir(conf.watch_directory) + logger.info("hltd serving at port "+str(conf.cgi_port)+" with role "+conf.role) + os.chdir(watch_directory) + logger.info("<<<< ---- hltd instance " + self.instance + ": init complete, starting httpd ---- >>>>") + logger.info("") httpd.serve_forever() except KeyboardInterrupt: - logging.info("terminating all ongoing runs") - for run in run_list: - if conf.role=='fu': - run.Shutdown() - elif conf.role=='bu': - run.ShutdownBU() - logging.info("terminated all ongoing runs") - logging.info("stopping run ranger inotify helper") + logger.info("stop signal detected") + if len(run_list)>0: + logger.info("terminating all ongoing runs") + for run in run_list: + if conf.role=='fu': + global runs_pending_shutdown + run.Shutdown(run.runnumber in runs_pending_shutdown) + elif conf.role=='bu': + run.ShutdownBU() + logger.info("terminated all ongoing runs") runRanger.stop_inotify() - logging.info("stopping resource ranger inotify helper") rr.stop_inotify() if boxInfo is not None: - logging.info("stopping boxinfo updater") + logger.info("stopping boxinfo updater") boxInfo.stop() + global logCollector if logCollector is not None: + logger.info("terminating logCollector") logCollector.terminate() - logging.info("stopping system monitor") + logger.info("stopping system monitor") rr.stop_managed_monitor() - logging.info("closing httpd socket") + logger.info("closing httpd socket") httpd.socket.close() - logging.info(threading.enumerate()) - logging.info("unmounting mount points") + logger.info(threading.enumerate()) + logger.info("unmounting mount points") if cleanup_mountpoints(remount=False)==False: time.sleep(1) cleanup_mountpoints(remount=False) - logging.info("shutdown of service completed") + logger.info("shutdown of service (main thread) completed") except Exception as ex: - logging.info("exception encountered in operating hltd") - logging.info(ex) + logger.info("exception encountered in operating hltd") + logger.info(ex) runRanger.stop_inotify() rr.stop_inotify() rr.stop_managed_monitor() @@ -1840,5 +2341,7 @@ def run(self): if __name__ == "__main__": - daemon = hltd('/var/run/hltd.pid') + import procname + procname.setprocname('hltd') + daemon = hltd(sys.argv[1]) daemon.start() diff 
--git a/python/hltdconf.py b/python/hltdconf.py index a93765c..70578d8 100644 --- a/python/hltdconf.py +++ b/python/hltdconf.py @@ -33,8 +33,14 @@ def __init__(self, conffile): self.use_elasticsearch = bool(self.use_elasticsearch=="True") self.close_es_index = bool(self.close_es_index=="True") self.cgi_port = int(self.cgi_port) + self.cgi_instance_port_offset = int(self.cgi_instance_port_offset) self.soap2file_port = int(self.soap2file_port) + try: + self.instance_same_destination=bool(self.instance_same_destination=="True") + except: + self.instance_same_destination = True + self.dqm_machine = bool(self.dqm_machine=="True") if self.dqm_machine: self.resource_base = self.dqm_resource_base @@ -48,7 +54,7 @@ def __init__(self, conffile): self.service_log_level = getattr(logging,self.service_log_level) self.autodetect_parameters() - #read cluster name from elastic search configuration file (used to specify index name) + #read cluster name from elastic search configuration file (if not set up directly) if not self.elastic_cluster and self.use_elasticsearch == True: f = None try: @@ -63,14 +69,10 @@ def __init__(self, conffile): self.elastic_cluster = line.split(':')[1].strip() def dump(self): - logging.info( '') - logging.info( 'conf.user '+self.user) - logging.info( 'conf.role '+ self.role) - logging.info( 'conf.cmssw_base '+ self.cmssw_base) - logging.info( '') + logging.info( '') def autodetect_parameters(self): - if not self.role and 'bu' in os.uname()[1]: + if not self.role and (os.uname()[1].startswith('bu-') or os.uname()[1].startswith('dvbu-')): self.role = 'bu' elif not self.role: self.role = 'fu' @@ -78,5 +80,12 @@ def autodetect_parameters(self): if self.role == 'bu': self.watch_directory='/fff/ramdisk' if self.role == 'fu': self.watch_directory='/fff/data' +def initConf(instance='main'): + conf=None + try: + if instance!='main': + conf = hltdConf('/etc/hltd-'+instance+'.conf') + except:pass + if conf==None and instance=='main': conf = hltdConf('/etc/hltd.conf') + return conf -conf = hltdConf('/etc/hltd.conf') diff --git a/python/logcollector.py b/python/logcollector.py index 4662558..ac82a9e 100755 --- a/python/logcollector.py +++ b/python/logcollector.py @@ -15,7 +15,7 @@ import _inotify as inotify import threading import Queue -import json +import simplejson as json import logging import collections import subprocess @@ -733,12 +733,7 @@ def __init__(self,es_server_url): ip_url=getURLwithIP(es_server_url) self.es = ElasticSearch(ip_url) #update in case of new documents added to mapping definition - for key in mappings.central_hltdlogs_mapping: - doc = mappings.central_hltdlogs_mapping[key] - res = requests.get(ip_url+'/'+self.index_name+'/'+key+'/_mapping') - #only update if mapping is empty - if res.status_code==200 and res.content.strip()=='{}': - requests.post(ip_url+'/'+self.index_name+'/'+key+'/_mapping',str(doc)) + self.updateMappingMaybe(ip_url) break except (ElasticHttpError,ConnectionError,Timeout) as ex: #try to reconnect with different IP from DNS load balancing @@ -783,6 +778,14 @@ def elasticize_log(self,type,severity,timestamp,msg): self.es.index(self.index_name,'hltdlog',document) except: logger.warning('failed connection attempts to ' + self.es_server_url) + + def updateMappingMaybe(self,ip_url): + for key in mappings.central_hltdlogs_mapping: + doc = mappings.central_hltdlogs_mapping[key] + res = requests.get(ip_url+'/'+self.index_name+'/'+key+'/_mapping') + #only update if mapping is empty + if res.status_code==200 and res.content.strip()=='{}': + 
requests.post(ip_url+'/'+self.index_name+'/'+key+'/_mapping',json.dumps(doc)) class HLTDLogParser(threading.Thread): def __init__(self,dir,file,loglevel,esHandler,skipToEnd): @@ -951,8 +954,14 @@ def registerSignal(eventRef): if __name__ == "__main__": + + import procname + procname.setprocname('logcol') + + conf=initConf(sys.argv[1]) + logging.basicConfig(filename=os.path.join(conf.log_dir,"logcollector.log"), - level=logging.INFO, + level=conf.service_log_level, format='%(levelname)s:%(asctime)s - %(funcName)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger(os.path.basename(__file__)) @@ -988,9 +997,10 @@ def registerSignal(eventRef): threadEvent = threading.Event() registerSignal(threadEvent) - hltdlogdir = '/var/log/hltd' + hltdlogdir = conf.log_dir hltdlogs = ['hltd.log','anelastic.log','elastic.log','elasticbu.log'] - cmsswlogdir = '/var/log/hltd/pid' + cmsswlogdir = os.path.join(conf.log_dir,'pid') + mask = inotify.IN_CREATE logger.info("starting CMSSW log collector for "+cmsswlogdir) diff --git a/python/mappings.py b/python/mappings.py index fbc6141..9da6b62 100644 --- a/python/mappings.py +++ b/python/mappings.py @@ -139,7 +139,7 @@ 'processed' :{'type':'integer'}, 'accepted' :{'type':'integer'}, 'errorEvents' :{'type':'integer'}, - 'size' :{'type':'integer'}, + 'size' :{'type':'long'}, } }, 'macromerge' : { @@ -154,7 +154,7 @@ 'processed' :{'type':'integer'}, 'accepted' :{'type':'integer'}, 'errorEvents' :{'type':'integer'}, - 'size' :{'type':'integer'}, + 'size' :{'type':'long'}, } } @@ -165,17 +165,22 @@ 'properties' : { 'fm_date' :{'type':'date'}, 'id' :{'type':'string'}, + 'host' :{'type':'string',"index":"not_analyzed"}, + 'appliance' :{'type':'string',"index":"not_analyzed"}, + 'instance' :{'type':'string',"index":"not_analyzed"}, 'broken' :{'type':'integer'}, 'used' :{'type':'integer'}, 'idles' :{'type':'integer'}, 'quarantined' :{'type':'integer'}, + 'cloud' :{'type':'integer'}, 'usedDataDir' :{'type':'integer'}, 'totalDataDir' :{'type':'integer'}, 'usedRamdisk' :{'type':'integer'}, 'totalRamdisk' :{'type':'integer'}, 'usedOutput' :{'type':'integer'}, 'totalOutput' :{'type':'integer'}, - 'activeRuns' :{'type':'string'} + 'activeRuns' :{'type':'string'}, + 'activeRunsErrors':{'type':'string',"index":"not_analyzed"}, }, '_timestamp' : { 'enabled' : True, @@ -193,6 +198,7 @@ 'used' :{'type':'integer'}, 'idles' :{'type':'integer'}, 'quarantined' :{'type':'integer'}, + 'cloud' :{'type':'integer'}, 'usedDataDir' :{'type':'integer'}, 'totalDataDir' :{'type':'integer'}, 'usedRamdisk' :{'type':'integer'}, @@ -200,38 +206,17 @@ 'usedOutput' :{'type':'integer'}, 'totalOutput' :{'type':'integer'}, 'activeRuns' :{'type':'string'}, - 'hosts' :{'type':'string',"index":"not_analyzed"} - }, - '_timestamp' : { - 'enabled' : True, - 'store' : "yes", - "path" : "fm_date" - } - }, - 'boxinfo_last' : {#deprecated - '_id' :{'path':'id'}, - 'properties' : { - 'fm_date' :{'type':'date'}, - 'id' :{'type':'string'}, - 'broken' :{'type':'integer'}, - 'used' :{'type':'integer'}, - 'idles' :{'type':'integer'}, - 'quarantined' :{'type':'integer'}, - 'usedDataDir' :{'type':'integer'}, - 'totalDataDir' :{'type':'integer'}, - 'usedRamdisk' :{'type':'integer'}, - 'totalRamdisk' :{'type':'integer'}, - 'usedOutput' :{'type':'integer'}, - 'totalOutput' :{'type':'integer'}, - 'activeRuns' :{'type':'string'} + 'hosts' :{'type':'string',"index":"not_analyzed"}, + 'blacklistedHosts':{'type':'string',"index":"not_analyzed"}, + 'host' :{'type':'string',"index":"not_analyzed"}, + 'instance' 
:{'type':'string',"index":"not_analyzed"} }, '_timestamp' : { 'enabled' : True, 'store' : "yes", "path" : "fm_date" } - } - + }, } diff --git a/python/setupmachine.py b/python/setupmachine.py index cf5dde4..9e875c9 100755 --- a/python/setupmachine.py +++ b/python/setupmachine.py @@ -2,6 +2,9 @@ import os,sys,socket import shutil +import json +import subprocess +import shutil import time @@ -34,10 +37,6 @@ dblogin = 'empty' dbpwd = 'empty' equipmentSet = 'latest' -default_eqset_daq2val = 'eq_140325_attributes' -#default_eqset_daq2 = 'eq_140430_mounttest' -#default_eqset_daq2 = 'eq_14-508_emu' -default_eqset_daq2 = 'eq_140522_emu' minidaq_list = ["bu-c2f13-21-01","bu-c2f13-23-01","bu-c2f13-25-01","bu-c2f13-27-01", "fu-c2f13-17-01","fu-c2f13-17-02","fu-c2f13-17-03","fu-c2f13-17-04" "fu-c2f13-19-01","fu-c2f13-19-02","fu-c2f13-19-03","fu-c2f13-19-04"] @@ -45,27 +44,52 @@ "fu-c2f13-39-03","fu-c2f13-39-04"] ed_list = ["bu-c2f13-29-01","fu-c2f13-41-01","fu-c2f13-41-02", "fu-c2f13-41-03","fu-c2f13-41-04"] + +#es_cdaq_list = ["srv-c2a11-07-01","srv-c2a11-08-01","srv-c2a11-09-01","srv-c2a11-10-01", +# "srv-c2a11-11-01","srv-c2a11-14-01","srv-c2a11-15-01","srv-c2a11-16-01", +# "srv-c2a11-17-01","srv-c2a11-18-01","srv-c2a11-19-01","srv-c2a11-20-01", +# "srv-c2a11-21-01","srv-c2a11-22-01","srv-c2a11-23-01","srv-c2a11-26-01", +# "srv-c2a11-27-01","srv-c2a11-28-01","srv-c2a11-29-01","srv-c2a11-30-01"] +# +#es_tribe_list = ["srv-c2a11-31-01","srv-c2a11-32-01","srv-c2a11-33-01","srv-c2a11-34-01", +# "srv-c2a11-35-01","srv-c2a11-38-01","srv-c2a11-39-01","srv-c2a11-40-01", +# "srv-c2a11-41-01","srv-c2a11-42-01"] + +tribe_ignore_list = ['bu-c2f13-29-01','bu-c2f13-31-01'] + myhost = os.uname()[1] -def countCPUs(): - fp=open('/proc/cpuinfo','r') - resource_count = 0 - for line in fp: - if line.startswith('processor'): - resource_count+=1 - return resource_count +#testing dual mount point +vm_override_buHNs = { + "fu-vm-01-01.cern.ch":["bu-vm-01-01","bu-vm-01-01"], + "fu-vm-01-02.cern.ch":["bu-vm-01-01"], + "fu-vm-02-01.cern.ch":["bu-vm-01-01","bu-vm-01-01"], + "fu-vm-02-02.cern.ch":["bu-vm-01-01"] + } def getmachinetype(): #print "running on host ",myhost if myhost.startswith('dvrubu-') or myhost.startswith('dvfu-') : return 'daq2val','fu' elif myhost.startswith('dvbu-') : return 'daq2val','bu' - elif myhost.startswith('bu-') : return 'daq2','bu' elif myhost.startswith('fu-') : return 'daq2','fu' - elif myhost.startswith('cmsdaq-401b28') : return 'test','fu' - elif myhost.startswith('dvfu-') : return 'test','fu' + elif myhost.startswith('bu-') : return 'daq2','bu' + elif myhost.startswith('srv-') : + try: + es_cdaq_list = socket.gethostbyname_ex('es-cdaq')[2] + es_tribe_list = socket.gethostbyname_ex('es-tribe')[2] + myaddr = socket.gethostbyname(myhost) + if myaddr in es_cdaq_list: + return 'es','escdaq' + elif myaddr in es_tribe_list: + return 'es','tribe' + else: + return 'unknown','unknown' + except socket.gaierror, ex: + print 'dns lookup error ',str(ex) + raise ex else: - print "debug" + print "unknown machine type" return 'unknown','unknown' @@ -94,7 +118,7 @@ def checkModifiedConfigInFile(file): else:zone=tzones[0] for l in lines: - if l.strip().startswith("#edited by fff meta rpm at "+getTimeString()): + if l.strip().startswith("#edited by fff meta rpm"): return True return False @@ -102,24 +126,36 @@ def checkModifiedConfigInFile(file): def checkModifiedConfig(lines): for l in lines: - if l.strip().startswith("#edited by fff meta rpm at "+getTimeString()): + if l.strip().startswith("#edited by fff meta 
rpm"): return True return False - + + +#alternates between two data inteface indices based on host naming convention +def name_identifier(): + try: + nameParts = os.uname()[1].split('-') + return (int(nameParts[-1]) * int(nameParts[-2]/2)) % 2 + except: + return 0 + + def getBUAddr(parentTag,hostname): global equipmentSet #con = cx_Oracle.connect('CMS_DAQ2_TEST_HW_CONF_W/'+dbpwd+'@'+dbhost+':10121/int2r_lb.cern.ch', - #equipmentSet = 'eq_140325_attributes' - - if equipmentSet == 'default': - if parentTag == 'daq2val': - equipmentSet = default_eqset_daq2val - if parentTag == 'daq2': - equipmentSet = default_eqset_daq2 if env == "vm": + + try: + #cluster in openstack that is not (yet) in mysql + retval = [] + for bu_hn in vm_override_buHNs[hostname]: + retval.append(["myBU",bu_hn]) + return retval + except: + pass con = MySQLdb.connect( host= dbhost, user = dblogin, passwd = dbpwd, db = dbsid) else: if parentTag == 'daq2': @@ -131,7 +167,7 @@ def getBUAddr(parentTag,hostname): con = cx_Oracle.connect(dblogin+'/'+dbpwd+'@'+dbhost+':10121/'+dbsid, cclass="FFFSETUP",purity = cx_Oracle.ATTR_PURITY_SELF) else: - con = cx_Oracle.connect('CMS_DAQ2_TEST_HW_CONF_W/'+dbpwd+'@int2r2-v.cern.ch:10121/int2r_lb.cern.ch', + con = cx_Oracle.connect('CMS_DAQ2_TEST_HW_CONF_R/'+dbpwd+'@int2r2-v.cern.ch:10121/int2r_lb.cern.ch', cclass="FFFSETUP",purity = cx_Oracle.ATTR_PURITY_SELF) #print con.version @@ -175,7 +211,6 @@ def getBUAddr(parentTag,hostname): cur.execute(qstring) else: print "query equipment set",parentTag+'/'+equipmentSet - #print '\n',qstring2 cur.execute(qstring2) retval = [] @@ -185,19 +220,64 @@ def getBUAddr(parentTag,hostname): #print retval return retval +def getAllBU(requireFU=False): + + #setups = ['daq2','daq2val'] + parentTag = 'daq2' + if True: + #if parentTag == 'daq2': + if dbhost.strip()=='null': + #con = cx_Oracle.connect('CMS_DAQ2_HW_CONF_W','pwd','cms_rcms', + con = cx_Oracle.connect(dblogin,dbpwd,dbsid, + cclass="FFFSETUP",purity = cx_Oracle.ATTR_PURITY_SELF) + else: + con = cx_Oracle.connect(dblogin+'/'+dbpwd+'@'+dbhost+':10121/'+dbsid, + cclass="FFFSETUP",purity = cx_Oracle.ATTR_PURITY_SELF) + #else: + # con = cx_Oracle.connect('CMS_DAQ2_TEST_HW_CONF_W/'+dbpwd+'@int2r2-v.cern.ch:10121/int2r_lb.cern.ch', + # cclass="FFFSETUP",purity = cx_Oracle.ATTR_PURITY_SELF) + + cur = con.cursor() + retval = [] + if requireFU==False: + qstring= "select dnsname from DAQ_EQCFG_DNSNAME where (dnsname like 'bu-%' OR dnsname like '__bu-%') \ + AND eqset_id = (select eqset_id from DAQ_EQCFG_EQSET where tag='"+parentTag.upper()+"' AND \ + ctime = (SELECT MAX(CTIME) FROM DAQ_EQCFG_EQSET WHERE tag='"+parentTag.upper()+"'))" + + else: + qstring = "select attr_value from \ + DAQ_EQCFG_HOST_ATTRIBUTE ha, \ + DAQ_EQCFG_HOST_NIC hn, \ + DAQ_EQCFG_DNSNAME d \ + where \ + ha.eqset_id=hn.eqset_id AND \ + hn.eqset_id=d.eqset_id AND \ + ha.host_id = hn.host_id AND \ + ha.attr_name like 'myBU%' AND \ + hn.nic_id = d.nic_id AND \ + d.dnsname like 'fu-%' \ + AND d.eqset_id = (select eqset_id from DAQ_EQCFG_EQSET \ + where tag='"+parentTag.upper()+"' AND \ + ctime = (SELECT MAX(CTIME) FROM DAQ_EQCFG_EQSET WHERE tag='"+parentTag.upper()+"'))" + + + + + cur.execute(qstring) + + for res in cur: + retval.append(res[0]) + cur.close() + retval = sorted(list(set(map(lambda v: v.split('.')[0], retval)))) + print retval + return retval + def getSelfDataAddr(parentTag): global equipmentSet #con = cx_Oracle.connect('CMS_DAQ2_TEST_HW_CONF_W/'+dbpwd+'@'+dbhost+':10121/int2r_lb.cern.ch', - #equipmentSet = 'eq_140325_attributes' 
- - if equipmentSet == 'default': - if parentTag == 'daq2val': - equipmentSet = default_eqset_daq2val - if parentTag == 'daq2': - equipmentSet = default_eqset_daq2 con = cx_Oracle.connect(dblogin+'/'+dbpwd+'@'+dbhost+':10121/'+dbsid, cclass="FFFSETUP",purity = cx_Oracle.ATTR_PURITY_SELF) @@ -235,13 +315,27 @@ def getSelfDataAddr(parentTag): return retval +def getInstances(hostname): + #instance.input example: + #{"cmsdaq-401b28.cern.ch":{"names":["main","ecal"],"sizes":[40,20]}} #size is in megabytes + #BU can have multiple instances, FU should have only one specified. If none, any host is assumed to have only main instance + try: + with open('/opt/fff/instances.input','r') as fi: + doc = json.load(fi) + return doc[hostname]['names'],doc[hostname]['sizes'] + except: + return ["main"],0 + class FileManager: - def __init__(self,file,sep,edited,os1='',os2=''): + def __init__(self,file,sep,edited,os1='',os2='',recreate=False): self.name = file - f = open(file,'r') - self.lines = f.readlines() - f.close() + if recreate==False: + f = open(file,'r') + self.lines = f.readlines() + f.close() + else: + self.lines=[] self.sep = sep self.regs = [] self.remove = [] @@ -259,7 +353,7 @@ def removeEntry(self,key): def commit(self): out = [] if self.edited == False: - out.append('#edited by fff meta rpm\n') + out.append('#edited by fff meta rpm at '+getTimeString()+'\n') #first removing elements for rm in self.remove: @@ -298,6 +392,8 @@ def commit(self): if insertionDone == False: self.lines.append(toAdd) for l in self.lines: + #already written + if l.startswith("#edited by fff meta rpm"):continue out.append(l) #print "file ",self.name,"\n\n" #for o in out: print o @@ -344,11 +440,6 @@ def restoreFileMaybe(file): if 'elasticsearch' in selection: restoreFileMaybe(elasticsysconf) restoreFileMaybe(elasticconf) - if 'hltd' in selection: - try: - os.remove(os.path.join(backup_dir,os.path.basename(busconfig))) - except: - pass sys.exit(0) @@ -444,13 +535,15 @@ def restoreFileMaybe(file): dqmmachine = 'False' execdir = '/opt/hltd' resourcefract = '0.5' + if cluster == 'daq2val': - runindex_name = 'dv' + runindex_name = 'dv' elif cluster == 'daq2': runindex_name = 'cdaq' if myhost in minidaq_list: runindex_name = 'minidaq' if myhost in dqm_list or myhost in ed_list: + use_elasticsearch = 'False' runindex_name = 'dqm' cmsswloglevel = 'DISABLED' @@ -470,66 +563,35 @@ def restoreFileMaybe(file): cmssw_base = '/home/dqmdevlocal' execdir = '/home/dqmdevlocal/output' ##not yet - #hardcode minidaq hosts until role is available - #if cnhostname == 'bu-c2f13-27-01.cms' or cnhostname == 'fu-c2f13-19-03.cms' or cnhostname == 'fu-c2f13-19-04.cms': - # runindex_name = 'runindex_minidaq' - #hardcode dqm hosts until role is available - #if cnhostname == 'bu-c2f13-31-01.cms' or cnhostname == 'fu-c2f13-39-01.cms' or cnhostname == 'fu-c2f13-39-02.cms' or cnhostname == 'fu-c2f13-39-03.cms' or cnhostname == 'fu-c2f13-39-04.cms': - # runindex_name = 'runindex_dqm' - else: - runindex_name = 'test' + buName = None + buDataAddr=[] - buName = '' - budomain = '' if type == 'fu': - if cluster == 'daq2val' or cluster == 'daq2': - addrList = getBUAddr(cluster,cnhostname) - selectedAddr = False - for addr in addrList: - #result = os.system("ping -c 1 "+ str(addr[1])+" >& /dev/null") - result = 0#ping disabled for now - #os.system("clear") - if result == 0: - buDataAddr = addr[1] - if addr[1].find('.'): - buName = addr[1].split('.')[0] - budomain = addr[1][addr[1].find('.'):] - else: - buName = addr[1] - selectedAddr=True - break - else: - 
print "failed to ping",str(addr[1]) + if cluster == 'daq2val' or cluster == 'daq2': + for addr in getBUAddr(cluster,cnhostname): + if buName==None: + buName = addr[1].split('.')[0] + elif buName != addr[1].split('.')[0]: + print "BU name not same for all interfaces:",buName,buNameCheck + continue + buDataAddr.append(addr[1]) #if none are pingable, first one is picked - if selectedAddr==False: - if len(addrList)>0: - addr = addrList[0] - buDataAddr = addr[1] - if addr[1].find('.'): - buName = addr[1].split('.')[0] - else: - buName = addr[1] - if buName == '': + if buName == None or len(buDataAddr)==0: print "no BU found for this FU in the dabatase" sys.exit(-1) + else: + print "FU configuration in cluster",cluster,"not supported yet !!" + sys.exit(-2) - elif cluster =='test': - hn = os.uname()[1].split(".")[0] - addrList = [hn] - buName = hn - buDataAddr = hn - else: - print "FU configuration in cluster",cluster,"not supported yet !!" - sys.exit(-2) - elif type == 'bu': if env == "vm": buName = os.uname()[1].split(".")[0] else: buName = os.uname()[1] - addrList = buName + elif type == 'tribe': + buDataAddr = getAllBU(requireFU=False) + buName='es-tribe' - #print "detected address", addrList," and name ",buName print "running configuration for machine",cnhostname,"of type",type,"in cluster",cluster,"; appliance bu is:",buName clusterName='appliance_'+buName @@ -543,7 +605,7 @@ def restoreFileMaybe(file): #print "will modify sysconfig elasticsearch configuration" #maybe backup vanilla versions essysEdited = checkModifiedConfigInFile(elasticsysconf) - if essysEdited == False and type == 'fu': #modified only on FU + if essysEdited == False: #print "elasticsearch sysconfig configuration was not yet modified" shutil.copy(elasticsysconf,os.path.join(backup_dir,os.path.basename(elasticsysconf))) @@ -551,97 +613,224 @@ def restoreFileMaybe(file): if esEdited == False: shutil.copy(elasticconf,os.path.join(backup_dir,os.path.basename(elasticconf))) - escfg = FileManager(elasticconf,':',esEdited,'',' ') + if type == 'fu' or type == 'bu': - escfg.reg('cluster.name',clusterName) - escfg.reg('node.name',cnhostname) - essyscfg = FileManager(elasticsysconf,'=',essysEdited) - essyscfg.reg('ES_HEAP_SIZE','1G') - essyscfg.commit() + essyscfg = FileManager(elasticsysconf,'=',essysEdited) + essyscfg.reg('ES_HEAP_SIZE','1G') + essyscfg.commit() - if type == 'fu': + escfg = FileManager(elasticconf,':',esEdited,'',' ') + escfg.reg('cluster.name',clusterName) + escfg.reg('node.name',cnhostname) escfg.reg('discovery.zen.ping.multicast.enabled','false') - if env=="vm": - escfg.reg('discovery.zen.ping.unicast.hosts',"[\"" + buName + "\"]") - else: - escfg.reg('discovery.zen.ping.unicast.hosts',"[\"" + buName + ".cms" + "\"]") escfg.reg('network.publish_host',es_publish_host) escfg.reg('transport.tcp.compress','true') - escfg.reg('indices.fielddata.cache.size', '50%') - if cluster != 'test': + + if type == 'fu': + if env=="vm": + escfg.reg('discovery.zen.ping.unicast.hosts',"[\"" + buName + "\"]") + else: + escfg.reg('discovery.zen.ping.unicast.hosts',"[\"" + buName + ".cms" + "\"]") + escfg.reg('indices.fielddata.cache.size', '50%') escfg.reg('node.master','false') escfg.reg('node.data','true') - if type == 'bu': - escfg.reg('network.publish_host',es_publish_host) - #escfg.reg('discovery.zen.ping.multicast.enabled','false') - #escfg.reg('discovery.zen.ping.unicast.hosts','[ \"'+elastic_host2+'\" ]') + if type == 'bu': + #escfg.reg('discovery.zen.ping.unicast.hosts','[ \"'+elastic_host2+'\" ]') + 
escfg.reg('node.master','true') + escfg.reg('node.data','false') + escfg.commit() + + if type == 'tribe': + essyscfg = FileManager(elasticsysconf,'=',essysEdited) + essyscfg.reg('ES_HEAP_SIZE','12G') + essyscfg.commit() + + escfg = FileManager(elasticconf,':',esEdited,'',' ',recreate=True) + escfg.reg('cluster.name','es-tribe') + escfg.reg('discovery.zen.ping.multicast.enabled','false') + #escfg.reg('discovery.zen.ping.unicast.hosts','['+','.join(buDataAddr)+']') + escfg.reg('transport.tcp.compress','true') + bustring = "[" + for bu in buDataAddr: + if bu in tribe_ignore_list:continue + + try: + socket.gethostbyname_ex(bu+'.cms') + except: + print "skipping",bu," - unable to lookup IP address" + continue + if bustring!="[":bustring+=',' + bustring+='"'+bu+'.cms'+'"' + bustring += "]" + escfg.reg('discovery.zen.ping.unicast.hosts',bustring) + + escfg.reg('tribe','') + i=1; + for bu in buDataAddr: + if bu in tribe_ignore_list:continue + + try: + socket.gethostbyname_ex(bu+'.cms') + except: + # print "skipping",bu," - unable to lookup IP address" + continue + + escfg.reg(' t'+str(i),'') + #escfg.reg(' discovery.zen.ping.unicast.hosts', '["'+bu+'.cms"]') + escfg.reg(' cluster.name', 'appliance_'+bu) + i=i+1 + escfg.commit() + + if type == 'escdaq': + essyscfg = FileManager(elasticsysconf,'=',essysEdited) + essyscfg.reg('ES_HEAP_SIZE','10G') + essyscfg.commit() + + escfg = FileManager(elasticconf,':',esEdited,'',' ',recreate=True) + escfg.reg('cluster.name','es-cdaq') + escfg.reg('discovery.zen.minimum_master_nodes','11') + escfg.reg('index.mapper.dynamic','false') + escfg.reg('action.auto_create_index','false') escfg.reg('transport.tcp.compress','true') escfg.reg('node.master','true') - escfg.reg('node.data','false') + escfg.reg('node.data','true') + escfg.commit() - escfg.commit() if "hltd" in selection: #first prepare bus.config file if type == 'fu': - try: - shutil.copy(busconfig,os.path.join(backup_dir,os.path.basename(busconfig))) - os.remove(busconfig) - except Exception,ex: - print "problem with copying bus.config? ",ex - pass + + #permissive:try to remove old bus.config + try:os.remove(os.path.join(backup_dir,os.path.basename(busconfig))) + except:pass + try:os.remove(busconfig) + except:pass #write bu ip address - print "WRITING BUS CONFIG ", busconfig f = open(busconfig,'w+') - f.writelines(getIPs(buDataAddr)[0]) + + #swap entries based on name (only C6100 hosts with two data interfaces): + if len(buDataAddr)>1 and name_identifier()==1: + temp = buDataAddr[0] + buDataAddr[0]=buDataAddr[1] + buDataAddr[1]=temp + + newline=False + for addr in buDataAddr: + if newline:f.writelines('\n') + newline=True + f.writelines(getIPs(addr)[0]) + #break after writing first entry. 
it is not yet safe to use secondary interface + break f.close() + #FU should have one instance assigned, BUs can have multiple + watch_dir_bu = '/fff/ramdisk' + out_dir_bu = '/fff/output' + log_dir_bu = '/var/log/hltd' + + instances,sizes=getInstances(os.uname()[1]) + if len(instances)==0: instances=['main'] + hltdEdited = checkModifiedConfigInFile(hltdconf) - #print "was modified?",hltdEdited + if hltdEdited == False: shutil.copy(hltdconf,os.path.join(backup_dir,os.path.basename(hltdconf))) - hltdcfg = FileManager(hltdconf,'=',hltdEdited,' ',' ') - hltdcfg.reg('enabled','True','[General]') if type=='bu': + try:os.remove('/etc/hltd.instances') + except:pass + + #do major ramdisk cleanup (unmount existing loop mount points, run directories and img files) + try: + subprocess.check_call(['/opt/hltd/scripts/unmountloopfs.sh','/fff/ramdisk']) + #delete existing run directories to ensure there is space (if this machine has a non-main instance) + if instances!=["main"]: + os.popen('rm -rf /fff/ramdisk/run*') + except subprocess.CalledProcessError, err1: + print 'failed to cleanup ramdisk',err1 + except Exception as ex: + print 'failed to cleanup ramdisk',ex + + cgibase=9000 + + for idx,val in enumerate(instances): + if idx!=0 and val=='main': + instances[idx]=instances[0] + instances[0]=val + break + for idx, instance in enumerate(instances): + + watch_dir_bu = '/fff/ramdisk' + out_dir_bu = '/fff/output' + log_dir_bu = '/var/log/hltd' + + cfile = hltdconf + if instance != 'main': + cfile = '/etc/hltd-'+instance+'.conf' + shutil.copy(hltdconf,cfile) + watch_dir_bu = os.path.join(watch_dir_bu,instance) + out_dir_bu = os.path.join(out_dir_bu,instance) + log_dir_bu = os.path.join(log_dir_bu,instance) + + #run loopback setup for non-main instances (is done on every boot since ramdisk is volatile) + try: + subprocess.check_call(['/opt/hltd/scripts/makeloopfs.sh','/fff/ramdisk',instance, str(sizes[idx])]) + except subprocess.CalledProcessError, err1: + print 'failed to configure loopback device mount in ramdisk' + + soap2file_port='0' + + if myhost in dqm_list or myhost in ed_list or cluster == 'daq2val' or env=='vm': + soap2file_port='8010' + + hltdcfg = FileManager(cfile,'=',hltdEdited,' ',' ') + + hltdcfg.reg('enabled','True','[General]') + hltdcfg.reg('role','bu','[General]') - #get needed info here hltdcfg.reg('user',username,'[General]') - hltdcfg.reg('cgi_port','9000','[Web]') + hltdcfg.reg('instance',instance,'[General]') + + #port for multiple instances + hltdcfg.reg('cgi_port',str(cgibase+idx),'[Web]') + hltdcfg.reg('cgi_instance_port_offset',str(idx),'[Web]') + hltdcfg.reg('soap2file_port',soap2file_port,'[Web]') + hltdcfg.reg('elastic_cluster',clusterName,'[Monitoring]') - hltdcfg.reg('watch_directory','/fff/ramdisk','[General]') - hltdcfg.reg('role','bu','[General]') - hltdcfg.reg('micromerge_output','/fff/output','[General]') + hltdcfg.reg('watch_directory',watch_dir_bu,'[General]') + #hltdcfg.reg('micromerge_output',out_dir_bu,'[General]') hltdcfg.reg('elastic_runindex_url',elastic_host,'[Monitoring]') hltdcfg.reg('elastic_runindex_name',runindex_name,'[Monitoring]') hltdcfg.reg('use_elasticsearch',use_elasticsearch,'[Monitoring]') hltdcfg.reg('es_cmssw_log_level',cmsswloglevel,'[Monitoring]') hltdcfg.reg('dqm_machine',dqmmachine,'[DQM]') - #hltdcfg.removeEntry('watch_directory') + hltdcfg.reg('log_dir',log_dir_bu,'[Logs]') hltdcfg.commit() - #remove /fff/data from BU (hack) - try: - shutil.rmtree('/fff/data') - except: - pass + + #write all instances in a file + if 'main' not in 
instances or len(instances)>1: + with open('/etc/hltd.instances',"w") as fi: + for instance in instances: fi.write(instance+"\n") + if type=='fu': + hltdcfg = FileManager(hltdconf,'=',hltdEdited,' ',' ') - #max_cores_done = False - #do_max_cores = True - #num_max_cores = countCPUs() + hltdcfg.reg('enabled','True','[General]') + hltdcfg.reg('role','fu','[General]') - #num_threads_done = False - #do_num_threads = True - #num_threads = nthreads - hltdcfg.reg('exec_directory',execdir,'[General]') hltdcfg.reg('user',username,'[General]') + #FU can only have one instance (so we take instance[0] and ignore others) + hltdcfg.reg('instance',instances[0],'[General]') + + hltdcfg.reg('exec_directory',execdir,'[General]') hltdcfg.reg('watch_directory','/fff/data','[General]') - hltdcfg.reg('role','fu','[General]') hltdcfg.reg('cgi_port','9000','[Web]') - #hltdcfg.reg('mount_options_output','rw,vers=4,rsize=65536,wsize=65536,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys','[General]') + hltdcfg.reg('cgi_instance_port_offset',"0",'[Web]') + hltdcfg.reg('soap2file_port','0','[Web]') hltdcfg.reg('elastic_cluster',clusterName,'[Monitoring]') hltdcfg.reg('es_cmssw_log_level',cmsswloglevel,'[Monitoring]') hltdcfg.reg('elastic_runindex_url',elastic_host,'[Monitoring]') @@ -653,6 +842,11 @@ def restoreFileMaybe(file): hltdcfg.reg('cmssw_threads',nthreads,'[CMSSW]') hltdcfg.reg('cmssw_streams',nfwkstreams,'[CMSSW]') hltdcfg.reg('resource_use_fraction',resourcefract,'[Resources]') - #hltdcfg.removeEntry('watch_directory') hltdcfg.commit() + if "web" in selection: + try:os.rmdir('/var/www/html') + except: + try:os.unlink('/var/www/html') + except:pass + os.symlink('/es-web','/var/www/html') diff --git a/python/soap2file b/python/soap2file new file mode 100755 index 0000000..9126c7d --- /dev/null +++ b/python/soap2file @@ -0,0 +1,64 @@ +#!/bin/env python +# +# chkconfig: 2345 81 03 +# + +import sys +import SOAPpy +import time +from subprocess import Popen +from subprocess import PIPE + +sys.path.append('/opt/hltd/python') +#sys.path.append('/opt/hltd/lib') + +from soap2file import Soap2file + + +def startService(daemon): + proc = Popen(["/opt/hltd/python/soap2file.py"], stdout=PIPE) + output = proc.communicate()[0] + time.sleep(.1) + if daemon.silentStatus() and proc.returncode==0: + print 'Starting soap2file:\t\t\t\t\t [ \033[1;32mOK\033[0;39m ]' + else: + if proc.returncode==3:sys.exit(0) + print 'Starting soap2file instance: [ \032[1;32mFAILED\033[0;39m ]' + print output + sys.exit(1) + + + +if __name__ == "__main__": + + soap2file = Soap2file() + + if not soap2file.checkEnabled(): + print "Soap2file service is disabled" + sys.exit(0) + + if len(sys.argv) == 2: + + if 'start' == sys.argv[1]: + startService(soap2file) + + elif 'stop' == sys.argv[1]: + sys.stdout.write('Stopping soap2file:') + soap2file.stop() + + elif 'restart' == sys.argv[1]: + sys.stdout.write('Stopping soap2file:') + soap2file.stop() + startService(soap2file) + + elif 'status' == sys.argv[1]: + soap2file.status() + + else: + print "Unknown command" + sys.exit(2) + sys.exit(0) + else: + print "usage: %s start|stop|restart|status" % sys.argv[0] + sys.exit(2) + diff --git a/python/soap2file.py b/python/soap2file.py index d8e6cae..ca63a88 100755 --- a/python/soap2file.py +++ b/python/soap2file.py @@ -4,12 +4,11 @@ # import os -import pwd import sys import SOAPpy sys.path.append('/opt/hltd/python') -sys.path.append('/opt/hltd/lib') +#sys.path.append('/opt/hltd/lib') import demote import hltdconf @@ -30,7 +29,6 @@ def 
writeToFile(filename,content,overwrite): except IOError as ex: return "Failed to write data: "+str(ex) - def createDirectory(dirname): try: os.mkdir(dirname) @@ -38,15 +36,25 @@ def createDirectory(dirname): except OSError as ex: return "Failed to create directory: "+str(ex) +def renamePath(oldpath,newpath): + try: + os.rename(oldpath,newpath) + return "Success" + except Exception as ex: + return "Failed to rename file: "+str(ex) class Soap2file(Daemon2): - def __init__(self,pidfile): - Daemon2.__init__(self,pidfile,'soap2file') + def __init__(self): + Daemon2.__init__(self,'soap2file','main','hltd') #SOAPpy.Config.debug = 1 self._conf=hltdconf.hltdConf('/etc/hltd.conf') self._hostname = os.uname()[1] + def checkEnabled(self): + if self._conf.soap2file_port>0:return True + return False + def run(self): dem = demote.demote(self._conf.user) dem() @@ -54,43 +62,13 @@ def run(self): server = SOAPpy.SOAPServer((self._hostname, self._conf.soap2file_port)) server.registerFunction(writeToFile) server.registerFunction(createDirectory) + server.registerFunction(renamePath) server.serve_forever() if __name__ == "__main__": - - pidfile = '/var/run/soap2file.pid' - soap2file = Soap2file(pidfile) - - if len(sys.argv) == 2: - - if 'start' == sys.argv[1]: - try: - soap2file.start() - if soap2file.silentStatus(): - print '[OK]' - else: - print '[Failed]' - except: - pass - - elif 'stop' == sys.argv[1]: - if soap2file.status(): - soap2file.stop() - elif os.path.exists(pidfile): - soap2file.delpid() - - elif 'restart' == sys.argv[1]: - soap2file.restart() - - elif 'status' == sys.argv[1]: - soap2file.status() - - else: - print "Unknown command" - sys.exit(2) - sys.exit(0) - else: - print "usage: %s start|stop|restart|status" % sys.argv[0] - sys.exit(2) + daemon = Soap2file() + import procname + procname.setprocname('soap2file') + daemon.start() diff --git a/python/testFUHistograms_cfg2.py b/python/testFUHistograms_cfg2.py index 796b453..4d69212 100644 --- a/python/testFUHistograms_cfg2.py +++ b/python/testFUHistograms_cfg2.py @@ -1,178 +1,40 @@ -import FWCore.ParameterSet.Config as cms -import FWCore.ParameterSet.VarParsing as VarParsing -import DQMServices.Components.test.checkBooking as booking -import DQMServices.Components.test.createElements as c -import os,sys - -cmsswbase = os.path.expandvars('$CMSSW_BASE/') - -options = VarParsing.VarParsing ('analysis') - -options.register ('runNumber', - 1, # default value - VarParsing.VarParsing.multiplicity.singleton, - VarParsing.VarParsing.varType.int, # string, int, or float - "Run Number") - -options.register ('buBaseDir', - '/fff/BU0', # default value - VarParsing.VarParsing.multiplicity.singleton, - VarParsing.VarParsing.varType.string, # string, int, or float - "BU base directory") - -options.register ('dataDir', - '/fff/data', # default value - VarParsing.VarParsing.multiplicity.singleton, - VarParsing.VarParsing.varType.string, # string, int, or float - "FU data directory") - -options.register ('numThreads', - 1, # default value - VarParsing.VarParsing.multiplicity.singleton, - VarParsing.VarParsing.varType.int, # string, int, or float - "Number of CMSSW threads") - -options.register ('numFwkStreams', - 1, # default value - VarParsing.VarParsing.multiplicity.singleton, - VarParsing.VarParsing.varType.int, # string, int, or float - "Number of CMSSW streams") - - - -options.parseArguments() - -process = cms.Process("HLT") - -# load DQM -process.load("DQMServices.Core.DQM_cfg") -process.load("DQMServices.Components.DQMEnvironment_cfi") - -#b = 
booking.BookingParams(sys.argv) -#b = booking.BookingParams(["CTOR","BJ","BR"]) -#b.doCheck(testOnly=False) - -elements = c.createElements() -readRunElements = c.createReadRunElements() -readLumiElements = c.createReadLumiElements() - +# /users/avetisya/LS1/DAQTest/HLT/V3 (CMSSW_7_2_1) +import FWCore.ParameterSet.Config as cms +process = cms.Process( "HLT" ) -process.maxEvents = cms.untracked.PSet( - input = cms.untracked.int32(-1) +process.HLTConfigVersion = cms.PSet( + tableName = cms.string('/users/avetisya/LS1/DAQTest/HLT/V3') ) -process.options = cms.untracked.PSet( - numberOfThreads = cms.untracked.uint32(options.numThreads), - numberOfStreams = cms.untracked.uint32(options.numFwkStreams), - multiProcesses = cms.untracked.PSet( - maxChildProcesses = cms.untracked.int32(0) - ) +process.streams = cms.PSet( + A = cms.vstring( 'A1' ), + B = cms.vstring( 'B' ), + DQM = cms.vstring( 'DQM1' ) +) +process.datasets = cms.PSet( + A1 = cms.vstring( 'p1' ), + B = cms.vstring( 'p3' ), + DQM1 = cms.vstring( 'p2' ) ) -process.MessageLogger = cms.Service("MessageLogger", - destinations = cms.untracked.vstring( 'cout' ), - cout = cms.untracked.PSet( FwkReport = - cms.untracked.PSet(reportEvery = cms.untracked.int32(10), - optionalPSet = cms.untracked.bool(True), - #limit = cms.untracked.int32(10000000) - ), - threshold = cms.untracked.string( "INFO" ) - ) - ) - -process.FastMonitoringService = cms.Service("FastMonitoringService", - sleepTime = cms.untracked.int32(1), - microstateDefPath = cms.untracked.string( cmsswbase+'/src/EventFilter/Utilities/plugins/microstatedef.jsd' ), - #fastMicrostateDefPath = cms.untracked.string( cmsswbase+'/src/EventFilter/Utilities/plugins/microstatedeffast.jsd' ), - fastName = cms.untracked.string( 'fastmoni' ), - slowName = cms.untracked.string( 'slowmoni' )) - -process.EvFDaqDirector = cms.Service("EvFDaqDirector", - buBaseDir = cms.untracked.string(options.buBaseDir), - baseDir = cms.untracked.string(options.dataDir), - directorIsBU = cms.untracked.bool(False ), - testModeNoBuilderUnit = cms.untracked.bool(False), - runNumber = cms.untracked.uint32(options.runNumber) - ) -process.PrescaleService = cms.Service( "PrescaleService", - lvl1DefaultLabel = cms.string( "B" ), - lvl1Labels = cms.vstring( 'A', - 'B' - ), - prescaleTable = cms.VPSet( - cms.PSet( pathName = cms.string( "p1" ), - prescales = cms.vuint32( 0, 10) - ), - cms.PSet( pathName = cms.string( "p2" ), - prescales = cms.vuint32( 0, 100) - ) - )) - - -process.source = cms.Source("FedRawDataInputSource", - getLSFromFilename = cms.untracked.bool(True), - testModeNoBuilderUnit = cms.untracked.bool(False), - eventChunkSize = cms.untracked.uint32(128), - numBuffers = cms.untracked.uint32(2), - eventChunkBlock = cms.untracked.uint32(128), - useL1EventID=cms.untracked.bool(True) - ) - - -process.filter1 = cms.EDFilter("HLTPrescaler", - L1GtReadoutRecordTag = cms.InputTag( "hltGtDigis" ) - ) -process.filter2 = cms.EDFilter("HLTPrescaler", - L1GtReadoutRecordTag = cms.InputTag( "hltGtDigis" ) - ) - -process.a = cms.EDAnalyzer("ExceptionGenerator", - defaultAction = cms.untracked.int32(0), - defaultQualifier = cms.untracked.int32(120)) - -process.b = cms.EDAnalyzer("ExceptionGenerator", - defaultAction = cms.untracked.int32(0), - defaultQualifier = cms.untracked.int32(0)) - - -process.filler = cms.EDAnalyzer("DummyBookFillDQMStoreMultiThread", - folder = cms.untracked.string("TestFolder/"), - elements = cms.untracked.VPSet(*elements), - fillRuns = cms.untracked.bool(True), - fillLumis = cms.untracked.bool(True), - 
book_at_constructor = cms.untracked.bool(False), - book_at_beginJob = cms.untracked.bool(False), - book_at_beginRun = cms.untracked.bool(True)) - - - - - -process.p1 = cms.Path(process.a*process.filter1) -process.p2 = cms.Path(process.b*process.filter2) - -process.dqmsave_step = cms.Path(process.filler*process.dqmSaver) - -### global options Online ### -process.add_(cms.Service("DQMStore")) -process.DQMStore.LSbasedMode = cms.untracked.bool(True) -process.DQMStore.verbose = cms.untracked.int32(5) -process.DQMStore.enableMultiThread = cms.untracked.bool(True) - -process.dqmSaver.workflow = '' -process.dqmSaver.convention = 'FilterUnit' -process.dqmSaver.saveByLumiSection = True -process.dqmSaver.fileFormat = cms.untracked.string('PB') -process.dqmSaver.fakeFilterUnitMode = cms.untracked.bool(False) - +process.source = cms.Source( "FedRawDataInputSource", + numBuffers = cms.untracked.uint32( 1 ), + useL1EventID = cms.untracked.bool( True ), + eventChunkSize = cms.untracked.uint32( 128 ), + eventChunkBlock = cms.untracked.uint32( 128 ), + getLSFromFilename = cms.untracked.bool( True ), + verifyAdler32 = cms.untracked.bool( True ) +) -process.GlobalTag = cms.ESSource( "PoolDBESSource", +process.PoolDBESSource = cms.ESSource( "PoolDBESSource", globaltag = cms.string( "GR_H_V39::All" ), - toGet = cms.VPSet( + RefreshEachRun = cms.untracked.bool( False ), + RefreshOpenIOVs = cms.untracked.bool( False ), + toGet = cms.VPSet( ), - DBParameters = cms.PSet( + DBParameters = cms.PSet( authenticationPath = cms.untracked.string( "." ), connectionRetrialTimeOut = cms.untracked.int32( 60 ), idleConnectionCleanupPeriod = cms.untracked.int32( 10 ), @@ -181,32 +43,306 @@ enableConnectionSharing = cms.untracked.bool( True ), enableReadOnlySessionOnUpdateConnection = cms.untracked.bool( False ), connectionTimeOut = cms.untracked.int32( 0 ), + authenticationSystem = cms.untracked.int32( 0 ), connectionRetrialPeriod = cms.untracked.int32( 10 ) ), RefreshAlways = cms.untracked.bool( False ), - ReconnectEachRun = cms.untracked.bool( False ), - RefreshEachRun = cms.untracked.bool( False ), - RefreshOpenIOVs = cms.untracked.bool( False ), connect = cms.string( "frontier://(proxyurl=http://localhost:3128)(serverurl=http://localhost:8000/FrontierOnProd)(serverurl=http://localhost:8000/FrontierOnProd)(retrieve-ziplevel=0)/CMS_COND_31X_GLOBALTAG" ), + ReconnectEachRun = cms.untracked.bool( False ), BlobStreamerName = cms.untracked.string( "TBufferBlobStreamingService" ) ) +process.FastTimerService = cms.Service( "FastTimerService", + dqmPath = cms.untracked.string( "HLT/TimerService" ), + dqmModuleTimeRange = cms.untracked.double( 40.0 ), + useRealTimeClock = cms.untracked.bool( True ), + enableTimingModules = cms.untracked.bool( True ), + enableDQM = cms.untracked.bool( True ), + enableDQMbyModule = cms.untracked.bool( False ), + enableTimingExclusive = cms.untracked.bool( False ), + skipFirstPath = cms.untracked.bool( False ), + enableDQMbyLumiSection = cms.untracked.bool( True ), + dqmPathTimeResolution = cms.untracked.double( 0.5 ), + dqmPathTimeRange = cms.untracked.double( 100.0 ), + dqmTimeRange = cms.untracked.double( 1000.0 ), + dqmLumiSectionsRange = cms.untracked.uint32( 2500 ), + enableDQMbyProcesses = cms.untracked.bool( True ), + enableDQMSummary = cms.untracked.bool( True ), + enableTimingSummary = cms.untracked.bool( False ), + enableDQMbyPathTotal = cms.untracked.bool( True ), + enableTimingPaths = cms.untracked.bool( True ), + enableDQMbyPathExclusive = cms.untracked.bool( True ), + dqmTimeResolution = 
cms.untracked.double( 5.0 ), + dqmModuleTimeResolution = cms.untracked.double( 0.2 ), + enableDQMbyPathActive = cms.untracked.bool( True ), + enableDQMbyPathDetails = cms.untracked.bool( True ), + enableDQMbyPathOverhead = cms.untracked.bool( True ), + enableDQMbyPathCounters = cms.untracked.bool( True ), + enableDQMbyModuleType = cms.untracked.bool( False ) +) +process.DQMStore = cms.Service( "DQMStore", + verbose = cms.untracked.int32( 0 ), + collateHistograms = cms.untracked.bool( False ), + enableMultiThread = cms.untracked.bool( True ), + forceResetOnBeginLumi = cms.untracked.bool( False ), + LSbasedMode = cms.untracked.bool( True ), + verboseQT = cms.untracked.int32( 0 ) +) +process.EvFDaqDirector = cms.Service( "EvFDaqDirector", + buBaseDir = cms.untracked.string( "." ), + runNumber = cms.untracked.uint32( 0 ), + outputAdler32Recheck = cms.untracked.bool( False ), + baseDir = cms.untracked.string( "." ) +) +process.FastMonitoringService = cms.Service( "FastMonitoringService", + slowName = cms.untracked.string( "slowmoni" ), + sleepTime = cms.untracked.int32( 1 ), + fastMonIntervals = cms.untracked.uint32( 2 ), + fastName = cms.untracked.string( "fastmoni" ) +) +process.PrescaleService = cms.Service( "PrescaleService", + forceDefault = cms.bool( False ), + prescaleTable = cms.VPSet( + cms.PSet( pathName = cms.string( "p3" ), + prescales = cms.vuint32( 50, 50, 50, 50, 50, 50, 50, 50, 50 ) + ), + cms.PSet( pathName = cms.string( "p2" ), + prescales = cms.vuint32( 100, 100, 100, 100, 100, 100, 100, 100, 100 ) + ), + cms.PSet( pathName = cms.string( "p1" ), + prescales = cms.vuint32( 10, 10, 10, 10, 10, 10, 10, 10, 10 ) + ) + ), + lvl1DefaultLabel = cms.string( "1e33" ), + lvl1Labels = cms.vstring( '2e33', + '1.4e33', + '1e33', + '7e32', + '5e32', + '3e32', + '2e32', + '1.4e32', + '1e32' ) +) +process.MessageLogger = cms.Service( "MessageLogger", + suppressInfo = cms.untracked.vstring( 'hltGtDigis' ), + debugs = cms.untracked.PSet( + threshold = cms.untracked.string( "INFO" ), + placeholder = cms.untracked.bool( True ), + ), + cout = cms.untracked.PSet( + threshold = cms.untracked.string( "ERROR" ), + ), + cerr_stats = cms.untracked.PSet( + threshold = cms.untracked.string( "WARNING" ), + output = cms.untracked.string( "cerr" ), + optionalPSet = cms.untracked.bool( True ) + ), + warnings = cms.untracked.PSet( + threshold = cms.untracked.string( "INFO" ), + placeholder = cms.untracked.bool( True ), + ), + statistics = cms.untracked.vstring( 'cerr' ), + cerr = cms.untracked.PSet( + INFO = cms.untracked.PSet( limit = cms.untracked.int32( 0 ) ), + noTimeStamps = cms.untracked.bool( False ), + FwkReport = cms.untracked.PSet( + reportEvery = cms.untracked.int32( 1 ), + limit = cms.untracked.int32( 0 ) + ), + default = cms.untracked.PSet( limit = cms.untracked.int32( 10000000 ) ), + Root_NoDictionary = cms.untracked.PSet( limit = cms.untracked.int32( 0 ) ), + FwkJob = cms.untracked.PSet( limit = cms.untracked.int32( 0 ) ), + FwkSummary = cms.untracked.PSet( + reportEvery = cms.untracked.int32( 1 ), + limit = cms.untracked.int32( 10000000 ) + ), + threshold = cms.untracked.string( "INFO" ), + ), + FrameworkJobReport = cms.untracked.PSet( + default = cms.untracked.PSet( limit = cms.untracked.int32( 0 ) ), + FwkJob = cms.untracked.PSet( limit = cms.untracked.int32( 10000000 ) ) + ), + suppressWarning = cms.untracked.vstring( 'hltGtDigis' ), + errors = cms.untracked.PSet( + threshold = cms.untracked.string( "INFO" ), + placeholder = cms.untracked.bool( True ), + ), + fwkJobReports = 
cms.untracked.vstring( 'FrameworkJobReport' ), + infos = cms.untracked.PSet( + threshold = cms.untracked.string( "INFO" ), + Root_NoDictionary = cms.untracked.PSet( limit = cms.untracked.int32( 0 ) ), + placeholder = cms.untracked.bool( True ), + ), + categories = cms.untracked.vstring( 'FwkJob', + 'FwkReport', + 'FwkSummary', + 'Root_NoDictionary' ), + destinations = cms.untracked.vstring( 'warnings', + 'errors', + 'infos', + 'debugs', + 'cout', + 'cerr' ), + threshold = cms.untracked.string( "INFO" ), + suppressError = cms.untracked.vstring( 'hltGtDigis' ) +) -process.hltTriggerJSONMonitoring = cms.EDAnalyzer('TriggerJSONMonitoring', - triggerResults = cms.InputTag( 'TriggerResults','','HLT') +process.ExceptionGenerator2 = cms.EDAnalyzer( "ExceptionGenerator", + defaultAction = cms.untracked.int32( 0 ), + defaultQualifier = cms.untracked.int32( 0 ) +) +process.HLTPrescaler = cms.EDFilter( "HLTPrescaler", + L1GtReadoutRecordTag = cms.InputTag( "hltGtDigis" ), + offset = cms.uint32( 0 ) +) +process.HLTPrescaler2 = cms.EDFilter( "HLTPrescaler", + L1GtReadoutRecordTag = cms.InputTag( "hltGtDigis" ), + offset = cms.uint32( 0 ) ) +process.hltL1GtObjectMap = cms.EDProducer( "L1GlobalTrigger", + TechnicalTriggersUnprescaled = cms.bool( True ), + ProduceL1GtObjectMapRecord = cms.bool( True ), + AlgorithmTriggersUnmasked = cms.bool( False ), + EmulateBxInEvent = cms.int32( 1 ), + AlgorithmTriggersUnprescaled = cms.bool( True ), + ProduceL1GtDaqRecord = cms.bool( False ), + ReadTechnicalTriggerRecords = cms.bool( True ), + RecordLength = cms.vint32( 3, 0 ), + TechnicalTriggersUnmasked = cms.bool( False ), + ProduceL1GtEvmRecord = cms.bool( False ), + GmtInputTag = cms.InputTag( "hltGtDigis" ), + TechnicalTriggersVetoUnmasked = cms.bool( True ), + AlternativeNrBxBoardEvm = cms.uint32( 0 ), + TechnicalTriggersInputTags = cms.VInputTag( 'simBscDigis' ), + CastorInputTag = cms.InputTag( "castorL1Digis" ), + GctInputTag = cms.InputTag( "hltGctDigis" ), + AlternativeNrBxBoardDaq = cms.uint32( 0 ), + WritePsbL1GtDaqRecord = cms.bool( False ), + BstLengthBytes = cms.int32( -1 ) +) +process.TriggerJSONMonitoring = cms.EDAnalyzer( "TriggerJSONMonitoring", + triggerResults = cms.InputTag( 'TriggerResults','','HLT' ) +) +process.DQMFileSaver = cms.EDAnalyzer( "DQMFileSaver", + runIsComplete = cms.untracked.bool( False ), + referenceHandling = cms.untracked.string( "all" ), + producer = cms.untracked.string( "DQM" ), + forceRunNumber = cms.untracked.int32( -1 ), + saveByRun = cms.untracked.int32( 1 ), + saveAtJobEnd = cms.untracked.bool( False ), + saveByLumiSection = cms.untracked.int32( 1 ), + version = cms.untracked.int32( 1 ), + referenceRequireStatus = cms.untracked.int32( 100 ), + convention = cms.untracked.string( "FilterUnit" ), + dirName = cms.untracked.string( "." 
), + fileFormat = cms.untracked.string( "PB" ) +) +process.ExceptionGenerator = cms.EDAnalyzer( "ExceptionGenerator", + defaultAction = cms.untracked.int32( 0 ), + defaultQualifier = cms.untracked.int32( 64 ) +) +process.ExceptionGenerator3 = cms.EDAnalyzer( "ExceptionGenerator", + defaultAction = cms.untracked.int32( 0 ), + defaultQualifier = cms.untracked.int32( 0 ) +) +process.HLTPrescaler3 = cms.EDFilter( "HLTPrescaler", + L1GtReadoutRecordTag = cms.InputTag( "hltGtDigis" ), + offset = cms.uint32( 0 ) +) + +process.hltOutputA = cms.OutputModule( "ShmStreamConsumer", + SelectEvents = cms.untracked.PSet( SelectEvents = cms.vstring( 'p1' ) ), + outputCommands = cms.untracked.vstring( 'drop *', + 'keep FEDRawDataCollection_rawDataCollector_*_*', + 'keep FEDRawDataCollection_source_*_*' ) +) +process.hltOutputB = cms.OutputModule( "ShmStreamConsumer", + SelectEvents = cms.untracked.PSet( SelectEvents = cms.vstring( 'p3' ) ), + outputCommands = cms.untracked.vstring( 'drop *', + 'keep FEDRawDataCollection_rawDataCollector_*_*', + 'keep FEDRawDataCollection_source_*_*' ) +) +process.hltOutputDQM = cms.OutputModule( "ShmStreamConsumer", + SelectEvents = cms.untracked.PSet( SelectEvents = cms.vstring( 'p2' ) ), + outputCommands = cms.untracked.vstring( 'drop *', + 'keep FEDRawDataCollection_rawDataCollector_*_*', + 'keep FEDRawDataCollection_source_*_*' ) +) + +process.p3 = cms.Path( process.ExceptionGenerator3 + process.HLTPrescaler3 ) +process.ep3 = cms.EndPath( process.hltOutputB ) +process.pDQMhisto = cms.Path( process.DQMFileSaver ) +process.json = cms.EndPath( process.TriggerJSONMonitoring ) +process.L1Gt = cms.Path( process.hltL1GtObjectMap ) +process.ep2 = cms.EndPath( process.hltOutputDQM ) +process.ep1 = cms.EndPath( process.hltOutputA ) +process.p2 = cms.Path( process.ExceptionGenerator2 + process.HLTPrescaler ) +process.p1 = cms.Path( process.ExceptionGenerator + process.HLTPrescaler2 ) + +process.transferSystem = cms.PSet( + destinations = cms.vstring("Tier0","DQM","ECAL","None"), + transferModes = cms.vstring("tier0_on","tier0_off","test"), + streamA = cms.PSet(tier0_on=cms.vstring( "Tier0" ),tier0_off=cms.vstring( "None" ),test=cms.vstring( "None" )), + streamB = cms.PSet(tier0_on=cms.vstring( "None" ),tier0_off=cms.vstring( "None" ),test=cms.vstring( "None" )), + streamDQM = cms.PSet(tier0_on=cms.vstring( "DQM","Tier0" ),tier0_off=cms.vstring( "DQM" ),test=cms.vstring( "None" )), + streamL1Rates = cms.PSet(tier0_on=cms.vstring( "Tier0" ),tier0_off=cms.vstring( "None" ),test=cms.vstring( "None" )), + streamHLTRates = cms.PSet(tier0_on=cms.vstring( "Tier0" ),tier0_off=cms.vstring( "None" ),test=cms.vstring( "None" )), + streamDQMHistograms = cms.PSet(tier0_on=cms.vstring( "DQM" ),tier0_off=cms.vstring( "DQM" ),test=cms.vstring( "None" )) +) + +import FWCore.ParameterSet.VarParsing as VarParsing + +import os + +cmsswbase = os.path.expandvars('$CMSSW_BASE/') + +options = VarParsing.VarParsing ('analysis') + +options.register ('runNumber', + 1, # default value + VarParsing.VarParsing.multiplicity.singleton, + VarParsing.VarParsing.varType.int, # string, int, or float + "Run Number") + +options.register ('buBaseDir', + '/fff/BU0', # default value + VarParsing.VarParsing.multiplicity.singleton, + VarParsing.VarParsing.varType.string, # string, int, or float + "BU base directory") + +options.register ('dataDir', + '/fff/data', # default value + VarParsing.VarParsing.multiplicity.singleton, + VarParsing.VarParsing.varType.string, # string, int, or float + "FU data directory") + 
+options.register ('numThreads', + 1, # default value + VarParsing.VarParsing.multiplicity.singleton, + VarParsing.VarParsing.varType.int, # string, int, or float + "Number of CMSSW threads") +options.register ('numFwkStreams', + 1, # default value + VarParsing.VarParsing.multiplicity.singleton, + VarParsing.VarParsing.varType.int, # string, int, or float + "Number of CMSSW streams") -process.streamA = cms.OutputModule("EvFOutputModule", - SelectEvents = cms.untracked.PSet(SelectEvents = cms.vstring( 'p1' )) - ) +options.parseArguments() -process.streamDQM = cms.OutputModule("EvFOutputModule", - SelectEvents = cms.untracked.PSet(SelectEvents = cms.vstring( 'p2' )) - ) +process.options = cms.untracked.PSet( + numberOfThreads = cms.untracked.uint32(options.numThreads), + numberOfStreams = cms.untracked.uint32(options.numFwkStreams), + multiProcesses = cms.untracked.PSet( + maxChildProcesses = cms.untracked.int32(0) + ) +) -process.ep = cms.EndPath(process.streamA+process.streamDQM+process.hltTriggerJSONMonitoring) +process.PoolDBESSource.connect = 'frontier://FrontierProd/CMS_COND_31X_GLOBALTAG' +process.PoolDBESSource.pfnPrefix = cms.untracked.string('frontier://FrontierProd/') -process.GlobalTag.connect = 'frontier://FrontierProd/CMS_COND_31X_GLOBALTAG' -process.GlobalTag.pfnPrefix = cms.untracked.string('frontier://FrontierProd/') +process.EvFDaqDirector.buBaseDir = options.buBaseDir +process.EvFDaqDirector.baseDir = options.dataDir +process.EvFDaqDirector.runNumber = options.runNumber diff --git a/rpm/fffmeta-1.5.3-6.noarch.rpm b/rpm/fffmeta-1.5.3-6.noarch.rpm deleted file mode 100644 index 158f1fe..0000000 Binary files a/rpm/fffmeta-1.5.3-6.noarch.rpm and /dev/null differ diff --git a/rpm/fffmeta-1.6.0-0.noarch.rpm b/rpm/fffmeta-1.6.0-0.noarch.rpm new file mode 100644 index 0000000..4723008 Binary files /dev/null and b/rpm/fffmeta-1.6.0-0.noarch.rpm differ diff --git a/rpm/fffmeta-vm-1.5.3-6.noarch.rpm b/rpm/fffmeta-vm-1.5.3-6.noarch.rpm deleted file mode 100644 index 62405c9..0000000 Binary files a/rpm/fffmeta-vm-1.5.3-6.noarch.rpm and /dev/null differ diff --git a/rpm/fffmeta-vm-1.6.0-0.noarch.rpm b/rpm/fffmeta-vm-1.6.0-0.noarch.rpm new file mode 100644 index 0000000..f809c9b Binary files /dev/null and b/rpm/fffmeta-vm-1.6.0-0.noarch.rpm differ diff --git a/rpm/hltd-1.5.3-6.x86_64.rpm b/rpm/hltd-1.5.3-6.x86_64.rpm deleted file mode 100644 index 11c46c5..0000000 Binary files a/rpm/hltd-1.5.3-6.x86_64.rpm and /dev/null differ diff --git a/rpm/hltd-1.6.0-0.x86_64.rpm b/rpm/hltd-1.6.0-0.x86_64.rpm new file mode 100644 index 0000000..54dc3e5 Binary files /dev/null and b/rpm/hltd-1.6.0-0.x86_64.rpm differ diff --git a/scripts/hltdrpm.sh b/scripts/hltdrpm.sh index eb09fa0..4da465c 100755 --- a/scripts/hltdrpm.sh +++ b/scripts/hltdrpm.sh @@ -36,15 +36,17 @@ mkdir -p etc/init.d mkdir -p etc/logrotate.d mkdir -p etc/appliance/resources/idle mkdir -p etc/appliance/resources/online -mkdir -p etc/appliance/resources/offline mkdir -p etc/appliance/resources/except mkdir -p etc/appliance/resources/quarantined +mkdir -p etc/appliance/resources/cloud mkdir -p usr/lib64/python2.6/site-packages mkdir -p usr/lib64/python2.6/site-packages/pyelasticsearch ls cp -r $BASEDIR/python/hltd $TOPDIR/etc/init.d/hltd -cp -r $BASEDIR/python/soap2file.py $TOPDIR/etc/init.d/soap2file +cp -r $BASEDIR/python/soap2file $TOPDIR/etc/init.d/soap2file cp -r $BASEDIR/* $TOPDIR/opt/hltd +rm -rf $TOPDIR/opt/hltd/python/hltd +rm -rf $TOPDIR/opt/hltd/python/soap2file cp -r $BASEDIR/etc/hltd.conf $TOPDIR/etc/ cp -r 
$BASEDIR/etc/logrotate.d/hltd $TOPDIR/etc/logrotate.d/ echo "working in $PWD" @@ -53,9 +55,9 @@ ls opt/hltd echo "Creating DQM directories" mkdir -p etc/appliance/dqm_resources/idle mkdir -p etc/appliance/dqm_resources/online -mkdir -p etc/appliance/dqm_resources/offline mkdir -p etc/appliance/dqm_resources/except mkdir -p etc/appliance/dqm_resources/quarantined +mkdir -p etc/appliance/dqm_resources/cloud cd $TOPDIR #pyelasticsearch @@ -152,12 +154,18 @@ Classifier: Topic :: System :: Filesystems Classifier: Topic :: System :: Monitoring EOF + +cd $TOPDIR +cd opt/hltd/lib/python-procname/ +./setup.py -q build +cp build/lib.linux-x86_64-2.6/procname.so $TOPDIR/usr/lib64/python2.6/site-packages + cd $TOPDIR # we are done here, write the specs and make the fu***** rpm cat > hltd.spec < in fffmeta -#/sbin/service hltd restart #restart delegated to fffmeta! %files %dir %attr(777, -, -) /var/log/hltd %dir %attr(777, -, -) /var/log/hltd/pid @@ -212,10 +216,11 @@ rm -rf /etc/appliance/except/* /usr/lib64/python2.6/site-packages/*_inotify.so* /usr/lib64/python2.6/site-packages/*python_inotify* /usr/lib64/python2.6/site-packages/pyelasticsearch +/usr/lib64/python2.6/site-packages/procname.so %preun if [ \$1 == 0 ]; then - /sbin/service hltd stop - /sbin/service hltd stop + /sbin/service hltd stop || true + /sbin/service soap2file stop || true fi EOF mkdir -p RPMBUILD/{RPMS/{noarch},SPECS,BUILD,SOURCES,SRPMS} diff --git a/scripts/makeloopfs.sh b/scripts/makeloopfs.sh new file mode 100755 index 0000000..f745ad0 --- /dev/null +++ b/scripts/makeloopfs.sh @@ -0,0 +1,106 @@ +#!/bin/bash +if [ -n "$1" ]; then + if [ -n "$2" ]; then + if [ -n "$3" ]; then + + if [ -d $1 ]; then + + basedir=`readlink -e $1` + image=$basedir/$2.img + mountpoint=$basedir/$2 + sizemb=$3 + ret=0 + umask 0 + + #protect from going wrong + if [ "$mountpoint" == "/" ]; then exit 99; fi + if [ "$mountpoint" == "//" ]; then exit 99; fi + if [ "$mountpoint" == "/fff" ]; then exit 99; fi + if [ "$mountpoint" == "/fff/" ]; then exit 99; fi + if [ "$mountpoint" == "/fff/ramdisk" ]; then exit 99; fi + if [ "$mountpoint" == "/fff/ramdisk/" ]; then exit 99; fi + if [ "$mountpoint" == "fff/ramdisk" ]; then exit 99; fi + if [ "$mountpoint" == "fff/ramdisk/" ]; then exit 99; fi + + echo "makeloop script invoked for creating loop device disk $2 in ${basedir} of size $3 MB" + + if [ -d $mountpoint ]; then + + point=`mount | grep $mountpoint | grep /dev/loop | awk '{print $3}'` + + if [ "$point" != "" ]; then + #kill any processes that might use the mount point and remove from NFS + fuser -km $point + exportfs -u *:$point + #unmunt loop device + umount $point + if [ $? != 0 ]; then + sleep 0.1 + fuser -km $point + exportfs -u *:$point + umount $point + if [ $? != 0 ]; then + echo "Unsuccessful umount of $point !" + exit 1 + fi + fi + exportfs -u *:$point + fi + fi + #deleting mount point + rm -rf $mountpoint + if [ $? != 0 ]; then + echo "Unsuccessful delete of unmounted mount point $mountpoint !" + exit 2 + fi + + if [ -f $image ]; then + chmod 755 $image + rm -rf $image + if [ $? != 0 ]; then + echo "Unsuccessful delete old image file $image" + exit 3 + fi + fi + + dd if=/dev/zero of=$image bs=1048576 count=$sizemb >& /dev/null + echo y | mkfs.ext3 $image > /dev/null + #try mount + mkdir $mountpoint + if [ $? != 0 ]; then + echo "Unsuccessful make mount point directory!" + exit 4 + fi + + echo "mounting image directory..." + mount -o loop,noatime $image $mountpoint + if [ $? 
!= 0 ]; then + echo "Unsuccessful mount with parameters $image $mountpoint" + exit 5 + fi + + chmod -R 777 $mountpoint + + exportfs -o rw,sync,no_root_squash,no_subtree_check *:$mountpoint + if [ $? != 0 ]; then + echo "exportfs command failed for $mountpoint !" + exit 6 + fi + exit 0 + #end + else + echo "base directory not found!" + fi + else + echo "No parameter 3 given!" + fi + else + echo "No parameter 2 given!" + fi +else + echo "No parameter 1 given!" +fi + +echo "Usage: makeloopfs.sh basedir subdir imgsize(MB)" +exit 1 + diff --git a/scripts/metarpm.sh b/scripts/metarpm.sh index b303dc3..56c66d4 100755 --- a/scripts/metarpm.sh +++ b/scripts/metarpm.sh @@ -4,16 +4,11 @@ SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" cd $SCRIPTDIR/.. BASEDIR=$PWD -PACKAGENAME="fffmeta" - PARAMCACHE="paramcache" if [ -n "$1" ]; then - PARAMCACHE=$1 -fi - -if [ -n "$2" ]; then - PACKAGENAME=$2 + #PARAMCACHE=$1 + PARAMCACHE=${1##*/} fi echo "Using cache file $PARAMCACHE" @@ -32,10 +27,6 @@ else done fi - - - - echo "Enviroment (prod,vm) (press enter for \"${lines[0]}\"):" readin="" read readin @@ -86,7 +77,7 @@ if [ ${#readin} != "0" ]; then lines[6]=$readin fi -echo "Equipment set (press enter for: \"${lines[7]}\") - type 'latest' to use latest eq set or 'default' for default one or 'test' for VM enviroment:" +echo "Equipment set (press enter for: \"${lines[7]}\") - type 'latest' or enter a specific one:" readin="" read readin if [ ${#readin} != "0" ]; then @@ -107,7 +98,6 @@ if [ ${#readin} != "0" ]; then lines[9]=$readin fi - echo "number of framework streams per process (press enter for: ${lines[10]}):" readin="" read readin @@ -115,8 +105,6 @@ if [ ${#readin} != "0" ]; then lines[10]=$readin fi - - echo "CMSSW log collection level (DEBUG,INFO,WARNING,ERROR or FATAL) (press enter for: ${lines[11]}):" readin="" read readin @@ -145,12 +133,21 @@ done chmod 500 $SCRIPTDIR/$PARAMCACHE # create a build area +if [ ${lines[0]} == "prod" ]; then + PACKAGENAME="fffmeta" +elif [ ${lines[0]} == "vm" ]; then + PACKAGENAME="fffmeta-vm" +else + echo "Environment ${lines[0]} not supported. 
Available: prod or vm" + exit 1 +fi + echo "removing old build area" -rm -rf /tmp/fffmeta-build-tmp +rm -rf /tmp/$PACKAGENAME-build-tmp echo "creating new build area" -mkdir /tmp/fffmeta-build-tmp +mkdir /tmp/$PACKAGENAME-build-tmp ls -cd /tmp/fffmeta-build-tmp +cd /tmp/$PACKAGENAME-build-tmp mkdir BUILD mkdir RPMS TOPDIR=$PWD @@ -160,13 +157,19 @@ ls pluginpath="/opt/fff/esplugins/" pluginname1="bigdesk" pluginfile1="lukas-vlcek-bigdesk-v2.4.0-2-g9807b92-mod.zip" +pluginname2="head" +pluginfile2="head-master.zip" +pluginname3="HQ" +pluginfile3="hq-master.zip" +pluginname4="paramedic" +pluginfile4="paramedic-master.zip" cd $TOPDIR # we are done here, write the specs and make the fu***** rpm cat > fffmeta.spec <= 1.2.0, hltd >= 1.5.3, cx_Oracle >= 5.1.2, java-1.7.0-openjdk +Requires:elasticsearch >= 1.4.2, hltd >= 1.6.0, cx_Oracle >= 5.1.2, java-1.7.0-openjdk Provides:/opt/fff/configurefff.sh Provides:/opt/fff/setupmachine.py +Provides:/opt/fff/instances.input Provides:/etc/init.d/fffmeta #Provides:/opt/fff/backup/elasticsearch.yml @@ -203,10 +207,15 @@ mkdir -p opt/fff/esplugins mkdir -p opt/fff/backup mkdir -p etc/init.d/ cp $BASEDIR/python/setupmachine.py %{buildroot}/opt/fff/setupmachine.py +cp $BASEDIR/etc/instances.input %{buildroot}/opt/fff/instances.input echo "#!/bin/bash" > %{buildroot}/opt/fff/configurefff.sh +echo python2.6 /opt/hltd/python/fillresources.py >> %{buildroot}/opt/fff/configurefff.sh echo python2.6 /opt/fff/setupmachine.py elasticsearch,hltd $params >> %{buildroot}/opt/fff/configurefff.sh cp $BASEDIR/esplugins/$pluginfile1 %{buildroot}/opt/fff/esplugins/$pluginfile1 +cp $BASEDIR/esplugins/$pluginfile2 %{buildroot}/opt/fff/esplugins/$pluginfile2 +cp $BASEDIR/esplugins/$pluginfile3 %{buildroot}/opt/fff/esplugins/$pluginfile3 +cp $BASEDIR/esplugins/$pluginfile4 %{buildroot}/opt/fff/esplugins/$pluginfile4 cp $BASEDIR/esplugins/install.sh %{buildroot}/opt/fff/esplugins/install.sh cp $BASEDIR/esplugins/uninstall.sh %{buildroot}/opt/fff/esplugins/uninstall.sh @@ -234,9 +243,13 @@ echo "fi" >> %{buildroot}/etc/init.d/fffmeta %attr( 755 ,root, root) /opt/fff/setupmachine.py %attr( 755 ,root, root) /opt/fff/setupmachine.pyc %attr( 755 ,root, root) /opt/fff/setupmachine.pyo +%attr( 755 ,root, root) /opt/fff/instances.input %attr( 700 ,root, root) /opt/fff/configurefff.sh %attr( 755 ,root, root) /etc/init.d/fffmeta %attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile1 +%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile2 +%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile3 +%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile4 %attr( 755 ,root, root) /opt/fff/esplugins/install.sh %attr( 755 ,root, root) /opt/fff/esplugins/uninstall.sh @@ -254,10 +267,20 @@ python2.6 /opt/fff/setupmachine.py elasticsearch $params #update permissions in case new rpm changed uid/guid chown -R elasticsearch:elasticsearch /var/log/elasticsearch chown -R elasticsearch:elasticsearch /var/lib/elasticsearch -echo /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname1 -/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname1 -echo /opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile1 $pluginname1 + +#plugins +/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname1 > /dev/null /opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile1 $pluginname1 + +/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname2 > /dev/null +/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile2 $pluginname2 + 
+/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname3 > /dev/null +/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile3 $pluginname3 + +/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname4 > /dev/null +/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile4 $pluginname4 + /sbin/service elasticsearch start chkconfig --del elasticsearch chkconfig --add elasticsearch @@ -271,7 +294,11 @@ chkconfig --add elasticsearch %triggerin -- hltd #echo "triggered on hltd update or install" + /sbin/service hltd stop || true +/sbin/service soap2file stop || true +rm -rf /etc/hltd.instances + python2.6 /opt/fff/setupmachine.py restore,hltd python2.6 /opt/fff/setupmachine.py hltd $params @@ -288,11 +315,14 @@ fi #set up resources for hltd /opt/hltd/python/fillresources.py -/sbin/service hltd restart +/sbin/service hltd restart || true +/sbin/service soap2file restart || true + chkconfig --del hltd -#chkconfig --del soap2file +chkconfig --del soap2file + chkconfig --add hltd -#chkconfig --add soap2file +chkconfig --add soap2file %preun if [ \$1 == 0 ]; then @@ -300,12 +330,16 @@ if [ \$1 == 0 ]; then chkconfig --del fffmeta chkconfig --del elasticsearch chkconfig --del hltd -# chkconfig --del soap2file + chkconfig --del soap2file + + /sbin/service hltd stop || true /sbin/service elasticsearch stop || true /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname1 || true + /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname2 || true + /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname3 || true + /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname4 || true - /sbin/service hltd stop || true python2.6 /opt/fff/setupmachine.py restore,hltd,elasticsearch fi diff --git a/scripts/paramcache-vm b/scripts/paramcache-vm index e70c022..170fc26 100755 --- a/scripts/paramcache-vm +++ b/scripts/paramcache-vm @@ -1,12 +1,12 @@ vm http://cu-01.cern.ch:9200 -/opt/cmssw +/opt/offline rcms-flightsim fffsetup rcms ominozzo2 test -bufu +daqlocal 1 1 INFO diff --git a/scripts/tribe-metarpm.sh b/scripts/tribe-metarpm.sh new file mode 100755 index 0000000..2521664 --- /dev/null +++ b/scripts/tribe-metarpm.sh @@ -0,0 +1,248 @@ +#!/bin/bash -e +BUILD_ARCH=noarch +SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +cd $SCRIPTDIR/.. 
+BASEDIR=$PWD + +PACKAGENAME="fffmeta-tribe" + +PARAMCACHE="paramcache" + +echo "Using cache file $PARAMCACHE" + +if [ -f $SCRIPTDIR/$PARAMCACHE ]; +then + readarray lines < $SCRIPTDIR/$PARAMCACHE + for (( i=0; i < 12; i++ )) + do + lines[$i]=`echo -n ${lines[$i]} | tr -d "\n"` + done +else + for (( i=0; i < 12; i++ )) + do + lines[$i]="" + done +fi + +echo "Enviroment (prod,vm) (press enter for \"${lines[0]}\"):" +readin="" +read readin +if [ ${#readin} != "0" ]; then +lines[0]=$readin +fi +nousevar=$readin +nousevar=$readin +lines[1]="null" +lines[2]="null" + +echo "HWCFG DB server (press enter for \"${lines[3]}\"):" +readin="" +read readin +if [ ${#readin} != "0" ]; then +lines[3]=$readin +fi + +echo "HWCFG DB SID (or db name in VM enviroment) (press enter for: \"${lines[4]}\"):" +echo "(SPECIFIES address in TNSNAMES.ORA file if DB server field was \"null\"!)" +readin="" +read readin +if [ ${#readin} != "0" ]; then +lines[4]=$readin +fi + +echo "HWCFG DB username (press enter for: \"${lines[5]}\"):" +readin="" +read readin +if [ ${#readin} != "0" ]; then +lines[5]=$readin +fi + +echo "HWCFG DB password (press enter for: \"${lines[6]}\"):" +readin="" +read readin +if [ ${#readin} != "0" ]; then +lines[6]=$readin +fi + +echo "Equipment set (press enter for: \"${lines[7]}\") - type 'latest' or enter a specific one:" +readin="" +read readin +if [ ${#readin} != "0" ]; then +lines[7]=$readin +fi + +lines[8]="null" +lines[9]="null" +lines[10]="null" +lines[11]="null" + +params="" +for (( i=0; i < 12; i++ )) +do + params="$params ${lines[i]}" +done + +# create a build area + +echo "removing old build area" +rm -rf /tmp/fffmeta-tribe-build-tmp +echo "creating new build area" +mkdir /tmp/fffmeta-tribe-build-tmp +ls +cd /tmp/fffmeta-tribe-build-tmp +mkdir BUILD +mkdir RPMS +TOPDIR=$PWD +echo "working in $PWD" +ls + +pluginpath="/opt/fff/esplugins/" +pluginname1="bigdesk" +pluginfile1="lukas-vlcek-bigdesk-v2.4.0-2-g9807b92-mod.zip" +pluginname2="head" +pluginfile2="head-master.zip" +pluginname3="HQ" +pluginfile3="hq-master.zip" +pluginname4="paramedic" +pluginfile4="paramedic-master.zip" + +cd $TOPDIR +# we are done here, write the specs and make the fu***** rpm +cat > fffmeta-tribe.spec <= 1.4.2, cx_Oracle >= 5.1.2, java-1.7.0-openjdk, httpd >= 2.2.15, php >= 5.3.3, php-oci8 >= 1.4.9 + +Provides:/opt/fff/configurefff.sh +Provides:/opt/fff/setupmachine.py +Provides:/etc/init.d/fffmeta + +%description +fffmeta configuration setup package + +%prep +%build + +%install +rm -rf \$RPM_BUILD_ROOT +mkdir -p \$RPM_BUILD_ROOT +%__install -d "%{buildroot}/opt/fff" +%__install -d "%{buildroot}/opt/fff/backup" +%__install -d "%{buildroot}/opt/fff/esplugins" +%__install -d "%{buildroot}/etc/init.d" + +mkdir -p opt/fff/esplugins +mkdir -p opt/fff/backup +mkdir -p etc/init.d/ +cp $BASEDIR/python/setupmachine.py %{buildroot}/opt/fff/setupmachine.py +echo "#!/bin/bash" > %{buildroot}/opt/fff/configurefff.sh +echo python2.6 /opt/fff/setupmachine.py elasticsearch,web $params >> %{buildroot}/opt/fff/configurefff.sh + +cp $BASEDIR/esplugins/$pluginfile1 %{buildroot}/opt/fff/esplugins/$pluginfile1 +cp $BASEDIR/esplugins/$pluginfile2 %{buildroot}/opt/fff/esplugins/$pluginfile2 +cp $BASEDIR/esplugins/$pluginfile3 %{buildroot}/opt/fff/esplugins/$pluginfile3 +cp $BASEDIR/esplugins/$pluginfile4 %{buildroot}/opt/fff/esplugins/$pluginfile4 +cp $BASEDIR/esplugins/install.sh %{buildroot}/opt/fff/esplugins/install.sh +cp $BASEDIR/esplugins/uninstall.sh %{buildroot}/opt/fff/esplugins/uninstall.sh + +echo "#!/bin/bash" >> 
%{buildroot}/etc/init.d/fffmeta +echo "#" >> %{buildroot}/etc/init.d/fffmeta +echo "# chkconfig: 2345 79 22" >> %{buildroot}/etc/init.d/fffmeta +echo "#" >> %{buildroot}/etc/init.d/fffmeta +echo "if [ \\\$1 == \"start\" ]; then" >> %{buildroot}/etc/init.d/fffmeta +echo " /opt/fff/configurefff.sh" >> %{buildroot}/etc/init.d/fffmeta +echo " exit 0" >> %{buildroot}/etc/init.d/fffmeta +echo "fi" >> %{buildroot}/etc/init.d/fffmeta +echo "if [ \\\$1 == \"restart\" ]; then" >> %{buildroot}/etc/init.d/fffmeta +echo "/opt/fff/configurefff.sh" >> %{buildroot}/etc/init.d/fffmeta +echo " exit 0" >> %{buildroot}/etc/init.d/fffmeta +echo "fi" >> %{buildroot}/etc/init.d/fffmeta +echo "if [ \\\$1 == \"status\" ]; then" >> %{buildroot}/etc/init.d/fffmeta +echo "echo fffmeta does not have status" >> %{buildroot}/etc/init.d/fffmeta +echo " exit 0" >> %{buildroot}/etc/init.d/fffmeta +echo "fi" >> %{buildroot}/etc/init.d/fffmeta + + +%files +%defattr(-, root, root, -) +#/opt/fff +%attr( 755 ,root, root) /opt/fff/setupmachine.py +%attr( 755 ,root, root) /opt/fff/setupmachine.pyc +%attr( 755 ,root, root) /opt/fff/setupmachine.pyo +%attr( 700 ,root, root) /opt/fff/configurefff.sh +%attr( 755 ,root, root) /etc/init.d/fffmeta +%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile1 +%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile2 +%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile3 +%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile4 +%attr( 755 ,root, root) /opt/fff/esplugins/install.sh +%attr( 755 ,root, root) /opt/fff/esplugins/uninstall.sh + +%post +#echo "post install trigger" +chkconfig --del fffmeta +chkconfig --add fffmeta +#disabled, can be run manually for now + +%triggerin -- elasticsearch +#echo "triggered on elasticsearch update or install" +/sbin/service elasticsearch stop +python2.6 /opt/fff/setupmachine.py restore,elasticsearch +python2.6 /opt/fff/setupmachine.py elasticsearch,web $params +#update permissions in case new rpm changed uid/guid +chown -R elasticsearch:elasticsearch /var/log/elasticsearch +chown -R elasticsearch:elasticsearch /var/lib/elasticsearch + +/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname1 > /dev/null +/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile1 $pluginname1 + +/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname2 > /dev/null +/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile2 $pluginname2 + +/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname3 > /dev/null +/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile3 $pluginname3 + +/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname4 > /dev/null +/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile4 $pluginname4 + +chkconfig --del elasticsearch +chkconfig --add elasticsearch +chkconfig --add httpd +#todo:kill java process if running to have clean restart +/sbin/service elasticsearch start +/sbin/service httpd restart || true + +%preun + +if [ \$1 == 0 ]; then + + chkconfig --del fffmeta + chkconfig --del elasticsearch + chkconfig --del httpd + + /sbin/service elasticsearch stop || true + /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname1 || true + /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname2 || true + /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname3 || true + /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname4 || true + /sbin/service httpd stop || true + + + python2.6 /opt/fff/setupmachine.py 
restore,elasticsearch +fi + +#%verifyscript + +EOF + +rpmbuild --target noarch --define "_topdir `pwd`/RPMBUILD" -bb fffmeta-tribe.spec + diff --git a/scripts/unmountloopfs.sh b/scripts/unmountloopfs.sh new file mode 100755 index 0000000..7079446 --- /dev/null +++ b/scripts/unmountloopfs.sh @@ -0,0 +1,74 @@ +#!/bin/bash +if [ -n "$1" ]; then + if [ -d $1 ]; then + + basedir=`readlink -e $1` + umask 0 + points=`mount | grep $basedir/ | grep /dev/loop | awk '{print $3}'` + imgs=`mount | grep $basedir/ | grep /dev/loop | awk '{print $1}'` + pointarr=( $points ) + imgarr=( $imgs ) + + len=${#pointarr[@]} + len2=${#imgarr[@]} + if [[ $len == 0 ]]; then + exit 0 + fi + max=$((len)) + + for i in $(seq 0 1 $max) + do + if [ $i == $max ]; then continue; fi + point=${pointarr[$i]} + image=${imgarr[$i]} + #protect from dangerous action + if [ $point == "/" ]; then continue; fi + if [ $point == "//" ]; then continue; fi + if [ $point == "/fff" ]; then continue; fi + if [ $point == "/fff/" ]; then continue; fi + if [ $point == "/fff/ramdisk" ]; then continue; fi + if [ $point == "/fff/ramdisk/" ]; then continue; fi + if [ $point == "fff/ramdisk" ]; then continue; fi + if [ $point == "fff/ramdisk/" ]; then continue; fi + + echo "found mountpoint $point $image" + #kill any processes that might use the mount point and remove from NFS + fuser -km $point + #unmunt loop device + sleep 0.2 + exportfs -u *:$point + umount $point + if [ $? != 0 ]; then + sleep 0.1 + fuser -km $point + sleep 0.2 + exportfs -u *:$point + umount $point + if [ $? != 0 ]; then + echo "Unsuccessful unmount of $point !" + exit 1 + fi + fi + + #deleting mount point + exportfs -u *:$point + rm -rf $point + if [ $? != 0 ]; then + echo "Unsuccessful delete of unmounted mount point $point !" + exit 2 + fi + + #remove image + chmod 755 $image + rm -rf $image + if [ $? != 0 ]; then + echo "Unsuccessful delete of image file $image" + exit 3 + fi + done + exit 0 + else + echo "base directory not found!" + fi +fi +exit 1 diff --git a/test/crashtest.py b/test/crashtest.py index 52d8d46..72b6d49 100644 --- a/test/crashtest.py +++ b/test/crashtest.py @@ -88,7 +88,7 @@ def process(self): dirname = sys.argv[1] dirname = os.path.basename(os.path.normpath(dirname)) watchDir = os.path.join(conf.watch_directory,dirname) - outputDir = conf.micromerge_output + #outputDir = conf.micromerge_output @@ -119,4 +119,4 @@ def process(self): notifier.stop() print "Quit" - sys.exit(0) \ No newline at end of file + sys.exit(0)
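
The tribe branch added to setupmachine.py above builds the discovery.zen.ping.unicast.hosts value by appending ".cms" to every BU data address and skipping entries that are on the ignore list or cannot be resolved via DNS. The standalone sketch below (Python 2.6 style, as used throughout the repository) only illustrates that filtering; the host names are placeholders, and the real code writes the resulting string into elasticsearch.yml through FileManager.

    # Illustrative sketch of the unicast-host string assembled for the es-tribe
    # cluster; the example hosts are made up and will not resolve.
    import socket

    def unicast_hosts(bu_data_addr, ignore_list):
        entries = []
        for bu in bu_data_addr:
            if bu in ignore_list:
                continue
            try:
                # hosts without a resolvable .cms alias are left out
                socket.gethostbyname_ex(bu + '.cms')
            except Exception:
                print "skipping", bu, "- unable to lookup IP address"
                continue
            entries.append('"' + bu + '.cms' + '"')
        return '[' + ','.join(entries) + ']'

    if __name__ == '__main__':
        print unicast_hosts(['bu-example-01', 'bu-example-02'], ['bu-example-02'])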
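
On a BU, setupmachine.py now loops over all instances assigned to the host (with "main" moved to the front so it keeps the default port), writing one hltd config file per instance with its own watch, output and log directories and a CGI port offset from the base port 9000. The sketch below shows only that bookkeeping, assuming "main", if present, is already first in the list; FileManager, getInstances and the actual config writing are the patch's own code and are not reproduced here.

    # Illustrative per-instance layout for a multi-instance BU; returns, for each
    # instance, the config file, directories and CGI port that setupmachine.py
    # would register. Purely a sketch, not the installer itself.
    import os

    def instance_layout(instances, cgibase=9000):
        layout = []
        for idx, instance in enumerate(instances):
            watch_dir = '/fff/ramdisk'
            out_dir = '/fff/output'
            log_dir = '/var/log/hltd'
            cfile = '/etc/hltd.conf'
            if instance != 'main':
                # non-main instances get their own config file and subdirectories
                cfile = '/etc/hltd-' + instance + '.conf'
                watch_dir = os.path.join(watch_dir, instance)
                out_dir = os.path.join(out_dir, instance)
                log_dir = os.path.join(log_dir, instance)
            # cgi_port = base + index, cgi_instance_port_offset = index
            layout.append((instance, cfile, watch_dir, out_dir, log_dir,
                           cgibase + idx, idx))
        return layout

    if __name__ == '__main__':
        for entry in instance_layout(['main', 'testing']):
            print entry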
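
The ramdisk area of a non-main instance lives on a loop-mounted image file created by the new scripts/makeloopfs.sh (arguments: base directory, instance name, size in MB), while scripts/unmountloopfs.sh first tears down any existing loop mounts under the base directory; both scripts need root and report failures through non-zero exit codes. Below is a hedged sketch of the caller side, mirroring the subprocess calls added in setupmachine.py; the instance name and size are made-up examples, real sizes come from the host's entry in instances.input.

    # Sketch of driving the new loopback scripts the way setupmachine.py does;
    # must run as root, and the 'testing'/20000 values are examples only.
    import subprocess

    def rebuild_instance_ramdisks(instances_and_sizes, ramdisk='/fff/ramdisk'):
        # tear down any stale loop-mounted instance areas under the ramdisk first
        subprocess.check_call(['/opt/hltd/scripts/unmountloopfs.sh', ramdisk])
        for instance, size_mb in instances_and_sizes:
            # create <ramdisk>/<instance>.img, mount it on <ramdisk>/<instance>
            # and export it over NFS
            subprocess.check_call(['/opt/hltd/scripts/makeloopfs.sh', ramdisk,
                                   instance, str(size_mb)])

    if __name__ == '__main__':
        try:
            rebuild_instance_ramdisks([('testing', 20000)])
        except subprocess.CalledProcessError, err:
            print 'failed to configure loopback device mount in ramdisk', err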
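
The soap2file daemon is reworked into a Daemon2-based service with its own init script; it only runs when soap2file_port in hltd.conf is non-zero (the BU setup above writes 8010 for hosts in the dqm/ed lists, the daq2val cluster or the VM environment, and 0 elsewhere), it drops privileges to the configured user, and it now exposes renamePath alongside writeToFile and createDirectory. A hedged client-side sketch using SOAPpy against a host where the service is enabled; the paths are made-up examples and each call returns a status string from the server-side helper.

    # Minimal SOAPpy client sketch for the soap2file service; assumes the service
    # is enabled (soap2file_port = 8010) on this host. Paths are examples only.
    import os
    import SOAPpy

    server = SOAPpy.SOAPProxy('http://%s:8010/' % os.uname()[1])
    print server.createDirectory('/fff/ramdisk/example_dir')
    print server.renamePath('/fff/ramdisk/example_dir',
                            '/fff/ramdisk/example_dir_renamed')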