diff --git a/cgi/cloud_mode_active_cgi.py b/cgi/cloud_mode_active_cgi.py
new file mode 100755
index 0000000..c6d4e80
--- /dev/null
+++ b/cgi/cloud_mode_active_cgi.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python2.6
+import cgi
+import os
+print "Content-Type: text/html" # HTML is following
+print
+
+try:
+ cloud = os.listdir('/etc/appliance/resources/cloud')
+ print len(cloud)
+except Exception as ex:
+ print ex
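A minimal sketch of how this new endpoint could be queried from another node, mirroring the httplib pattern used for suspend_cgi.py in applianceumount.py below; the host, port, and helper name are placeholders, not part of this change:

```python
# Sketch only: the CGI prints the number of entries in /etc/appliance/resources/cloud
# (or the exception text), which arrives as the HTTP response body.
import httplib

def cloud_resource_count(host, cgi_port=9000):
    connection = httplib.HTTPConnection(host, cgi_port, timeout=5)
    connection.request("GET", '/cgi-bin/cloud_mode_active_cgi.py')
    return connection.getresponse().read().strip()
```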
diff --git a/cgi/exclude_cgi.py b/cgi/exclude_cgi.py
new file mode 100755
index 0000000..5fc9a86
--- /dev/null
+++ b/cgi/exclude_cgi.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python2.6
+import cgi
+import os
+form = cgi.FieldStorage()
+print "Content-Type: text/html" # HTML is following
+print
+print "
CGI script exclude"
+
+try:
+ os.unlink('exclude')
+except:
+ pass
+fp = open('exclude','w+')
+fp.close()
+
diff --git a/cgi/include_cgi.py b/cgi/include_cgi.py
new file mode 100755
index 0000000..34d37e5
--- /dev/null
+++ b/cgi/include_cgi.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python2.6
+import cgi
+import os
+form = cgi.FieldStorage()
+print "Content-Type: text/html" # HTML is following
+print
+print "CGI script exclude"
+
+try:
+ os.unlink('include')
+except:
+ pass
+fp = open('include','w+')
+fp.close()
+
diff --git a/cgi/suspend_cgi.py b/cgi/suspend_cgi.py
index 6a2f97c..ad8e9b6 100755
--- a/cgi/suspend_cgi.py
+++ b/cgi/suspend_cgi.py
@@ -5,10 +5,15 @@
print "Content-Type: text/html" # HTML is following
print
print "CGI script suspend"
+
+portsuffix=""
+if "port" in form:
+ portsuffix=form["port"].value
+
try:
- os.unlink('suspend')
+ os.unlink('suspend'+portsuffix)
except:
pass
-fp = open('suspend','w+')
+fp = open('suspend'+portsuffix,'w+')
fp.close()
diff --git a/esplugins/head-master.zip b/esplugins/head-master.zip
new file mode 100644
index 0000000..4d16a1e
Binary files /dev/null and b/esplugins/head-master.zip differ
diff --git a/esplugins/hq-master.zip b/esplugins/hq-master.zip
new file mode 100644
index 0000000..6f50d38
Binary files /dev/null and b/esplugins/hq-master.zip differ
diff --git a/esplugins/install.sh b/esplugins/install.sh
index 7bd2e8f..1e63fda 100644
--- a/esplugins/install.sh
+++ b/esplugins/install.sh
@@ -1,4 +1,4 @@
cd $1
-echo installing elasticsearch plugins...
-bin/plugin --url file:///opt/fff/esplugins/$2 --install $3
+echo installing elasticsearch plugin $3 ...
+bin/plugin -s --url file:///opt/fff/esplugins/$2 --install $3
diff --git a/esplugins/paramedic-master.zip b/esplugins/paramedic-master.zip
new file mode 100644
index 0000000..b0fc5e5
Binary files /dev/null and b/esplugins/paramedic-master.zip differ
diff --git a/esplugins/uninstall.sh b/esplugins/uninstall.sh
index c22303c..301411a 100644
--- a/esplugins/uninstall.sh
+++ b/esplugins/uninstall.sh
@@ -1,4 +1,5 @@
#!/bin/bash
cd $1
-bin/plugin --remove $2
+echo uninstalling elastic plugin $2 ...
+bin/plugin -s --remove $2
diff --git a/etc/hltd.conf b/etc/hltd.conf
index a91699b..c1643a4 100644
--- a/etc/hltd.conf
+++ b/etc/hltd.conf
@@ -1,5 +1,6 @@
[General]
enabled = False
+instance = main
exec_directory = /opt/hltd
user = daqlocal
watch_directory = /fff/data
@@ -9,19 +10,19 @@ mount_command = mount
mount_type = nfs4
mount_options_ramdisk = rw,noatime,vers=4,rsize=65536,wsize=65536,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,noac
mount_options_output = rw,vers=4,rsize=65536,wsize=65536,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys
-micromerge_output = /fff/BU0/output
delete_run_dir = True
output_adler32 = True
[Monitoring]
use_elasticsearch = True
-close_es_index = False
+close_es_index = True
es_cmssw_log_level = DISABLED
es_hltd_log_level = ERROR
es_local = localhost
[Web]
cgi_port = 9000
+cgi_instance_port_offset = 0
soap2file_port = 8010
[Resources]
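The new keys are consumed elsewhere in this change set: daemon2.py selects /etc/hltd-&lt;instance&gt;.conf for non-main instances, and applianceumount.py subtracts cgi_instance_port_offset to reach a peer's base CGI port. A small illustrative sketch; the function name and lookups are assumptions based on those files:

```python
# Illustrative only: mirrors the instance/config handling in daemon2.py and applianceumount.py
import ConfigParser

def load_web_ports(instance):
    conffile = '/etc/hltd.conf' if instance == 'main' else '/etc/hltd-' + instance + '.conf'
    cfg = ConfigParser.SafeConfigParser()
    cfg.read(conffile)
    cgi_port = cfg.getint('Web', 'cgi_port')
    cgi_offset = cfg.getint('Web', 'cgi_instance_port_offset')
    # peers are contacted on the base port (cgi_port - cgi_offset), see applianceumount.py
    return cgi_port, cgi_port - cgi_offset
```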
diff --git a/etc/instances.input b/etc/instances.input
new file mode 100644
index 0000000..f826192
--- /dev/null
+++ b/etc/instances.input
@@ -0,0 +1,27 @@
+{
+ "DISABLED-dvbu-c2f34-30-01":
+ {
+ "names":["main","testing"],
+ "sizes":[20,30]
+ },
+ "DISABLED-dvrubu-c2f34-17-03":
+ {
+ "names":["testing"],
+ "sizes":[0]
+ },
+ "DISABLED-dvrubu-c2f34-17-04":
+ {
+ "names":["testing"],
+ "sizes":[0]
+ },
+ "bu-vm-01-01.cern.ch":
+ {
+ "names":["main","testing"],
+ "sizes":[1000,500]
+ },
+ "fu-vm-02-02.cern.ch":
+ {
+ "names":["testing"],
+ "sizes":[0]
+ }
+}
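Each top-level key is an appliance host, with "names" and "sizes" as parallel per-instance lists. A hypothetical reader (not part of this change) could consume the file like this:

```python
# Hypothetical parser for etc/instances.input
import simplejson as json

def read_instances(path='/etc/instances.input'):
    with open(path, 'r') as fp:
        doc = json.load(fp)
    instances = {}
    for host, entry in doc.items():
        # hosts prefixed with "DISABLED-" are presumably skipped by the real consumer
        instances[host] = zip(entry['names'], entry['sizes'])
    return instances
```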
diff --git a/json/runapplianceTemplate.json b/json/runapplianceTemplate.json
index d410952..8578066 100644
--- a/json/runapplianceTemplate.json
+++ b/json/runapplianceTemplate.json
@@ -274,50 +274,35 @@
}
}
},
- "hltrates-legend": {
+ "qstatus": {
"properties": {
- "path-names": {
- "type": "string",
- "index": "not_analyzed"
- },
- "dataset-names": {
- "type": "string",
- "index": "not_analyzed"
- }
- }
- },
- "hltrates": {
- "properties": {
- "ls": {
- "type": "integer"
- },
- "pid": {
- "type": "integer"
- },
- "processed": {
- "type": "integer"
- },
- "path-wasrun": {
+ "numQueuedLS": {
"type": "integer"
},
- "path-afterl1seed": {
+ "maxQueuedLS": {
"type": "integer"
},
- "path-afterprescale": {
+ "numReadFromQueueLS": {
"type": "integer"
},
- "path-accepted": {
+ "maxClosedLS": {
"type": "integer"
},
- "path-rejected": {
+ "numReadOpenLS": {
"type": "integer"
},
- "path-errors": {
- "type": "integer"
+ "fm_date": {
+ "type": "date"
},
- "dataset-accepted": {
- "type": "integer"
+ "host": {
+ "type": "string",
+ "index":"not_analyzed"
}
+ },
+ "_timestamp": {
+ "enabled": true,
+ "store": "yes",
+ "path": "fm_date"
}
},
"cmsswlog": {
diff --git a/lib/python-procname/procnamemodule.c b/lib/python-procname/procnamemodule.c
new file mode 100644
index 0000000..e447032
--- /dev/null
+++ b/lib/python-procname/procnamemodule.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2008 Eugene A. Lisitsky
+ *
+ * The procname library for Python.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ * */
+
+#include <Python.h>
+#include <sys/prctl.h>
+
+void Py_GetArgcArgv(int*, char***);
+
+PyDoc_STRVAR(procname__doc__, "Module for setting/getting process name");
+
+static PyObject *
+procname_check(PyObject *self, PyObject *args) {
+ return Py_BuildValue("i", 1);
+};
+
+
+static PyObject *
+procname_getprocname(PyObject *self, PyObject *args) {
+ int argc;
+ char **argv;
+ Py_GetArgcArgv(&argc, &argv);
+ return Py_BuildValue("s", argv[0]);
+};
+
+
+static PyObject *
+procname_setprocname(PyObject *self, PyObject *args) {
+ int argc;
+ char **argv;
+ char *name;
+ if (!PyArg_ParseTuple(args, "s", &name))
+ return NULL;
+ Py_GetArgcArgv(&argc, &argv);
+ strncpy(argv[0], name , strlen(name));
+ memset(&argv[0][strlen(name)], '\0', strlen(&argv[0][strlen(name)]));
+ prctl (15 /* PR_SET_NAME */, name, 0, 0, 0);
+ Py_INCREF(Py_None);
+ return Py_None;
+};
+
+
+static PyMethodDef procname_methods[] = {
+ {"check", procname_check, METH_VARARGS, "Test func"},
+ {"getprocname", procname_getprocname, METH_VARARGS,
+ "Get procname.\nReturns name (string)"},
+ {"setprocname", procname_setprocname, METH_VARARGS,
+ "Set procname.\n name (string) -> new process name.\nReturns None."},
+ {NULL, NULL, 0, NULL}
+};
+
+PyMODINIT_FUNC
+initprocname(void) {
+ (void) Py_InitModule3("procname", procname_methods, procname__doc__);
+}
+
diff --git a/lib/python-procname/setup.py b/lib/python-procname/setup.py
new file mode 100755
index 0000000..cf97d9e
--- /dev/null
+++ b/lib/python-procname/setup.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+
+import distutils.core
+import distutils.util
+
+platform = distutils.util.get_platform()
+
+
+distutils.core.setup(
+ name='procname',
+ version='0.1',
+ description='Process name renaming',
+ author="Eugene A Lisitsky",
+ license='LGPL',
+ platforms='Linux',
+ ext_modules=[distutils.core.Extension('procname', sources=['procnamemodule.c'])],
+ )
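The daemons in this diff use the extension right after startup (anelastic.py, elastic.py, elasticbu.py); after building it with distutils, usage looks like:

```python
# After "python setup.py install" in lib/python-procname:
import procname

procname.setprocname('anelastic')   # as done in anelastic.py's __main__
print procname.getprocname()        # -> 'anelastic'
```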
diff --git a/python/aUtils.py b/python/aUtils.py
index df226e3..76c9499 100644
--- a/python/aUtils.py
+++ b/python/aUtils.py
@@ -2,17 +2,19 @@
import os,stat
import time,datetime
import shutil
-import json
+import simplejson as json
import logging
import zlib
import subprocess
+import threading
+#import fcntl
from inotifywrapper import InotifyWrapper
import _inotify as inotify
ES_DIR_NAME = "TEMP_ES_DIRECTORY"
-UNKNOWN,OUTPUTJSD,JSD,STREAM,INDEX,FAST,SLOW,OUTPUT,STREAMERR,STREAMDQMHISTOUTPUT,INI,EOLS,EOR,COMPLETE,DAT,PDAT,PJSNDATA,PIDPB,PB,CRASH,MODULELEGEND,PATHLEGEND,BOX,BOLS,HLTRATES,HLTRATESLEGEND = range(26) #file types
+UNKNOWN,OUTPUTJSD,DEFINITION,STREAM,INDEX,FAST,SLOW,OUTPUT,STREAMERR,STREAMDQMHISTOUTPUT,INI,EOLS,EOR,COMPLETE,DAT,PDAT,PJSNDATA,PIDPB,PB,CRASH,MODULELEGEND,PATHLEGEND,BOX,BOLS,QSTATUS = range(25) #file types
TO_ELASTICIZE = [STREAM,INDEX,OUTPUT,STREAMERR,STREAMDQMHISTOUTPUT,EOLS,EOR,COMPLETE]
TEMPEXT = ".recv"
ZEROLS = 'ls0000'
@@ -40,6 +42,16 @@ def __init__(self,recursiveMode=False):
self.logger = logging.getLogger(self.__class__.__name__)
self.eventQueue = False
self.inotifyWrapper = InotifyWrapper(self,recursiveMode)
+ self.queueStatusPath = None
+ self.queueStatusPathMon = None
+ self.queueStatusPathDir = None
+ self.queuedLumiList = []
+ self.maxQueuedLumi=-1
+ #max seen/closed by anelastic thread
+ self.maxReceivedEoLS=-1
+ self.maxClosedLumi=-1
+ self.numOpenLumis=-1
+ self.lock = threading.Lock()
def register_inotify_path(self,path,mask):
self.inotifyWrapper.registerPath(path,mask)
@@ -48,20 +60,101 @@ def start_inotify(self):
self.inotifyWrapper.start()
def stop_inotify(self):
- logging.info("MonitorRanger: Stop inotify wrapper")
+ self.logger.info("MonitorRanger: Stop inotify wrapper")
self.inotifyWrapper.stop()
- logging.info("MonitorRanger: Join inotify wrapper")
+ self.logger.info("MonitorRanger: Join inotify wrapper")
self.inotifyWrapper.join()
- logging.info("MonitorRanger: Inotify wrapper returned")
+ self.logger.info("MonitorRanger: Inotify wrapper returned")
def process_default(self, event):
self.logger.debug("event: %s on: %s" %(str(event.mask),event.fullpath))
if self.eventQueue:
- self.eventQueue.put(event)
+
+ if self.queueStatusPath!=None:
+ if self.checkNewLumi(event):
+ self.eventQueue.put(event)
+ else:
+ self.eventQueue.put(event)
def setEventQueue(self,queue):
self.eventQueue = queue
+ def checkNewLumi(self,event):
+ if event.fullpath.endswith("_EoLS.jsn"):
+ try:
+ queuedLumi = int(os.path.basename(event.fullpath).split('_')[1][2:])
+ self.lock.acquire()
+ if queuedLumi not in self.queuedLumiList:
+ if queuedLumi>self.maxQueuedLumi:
+ self.maxQueuedLumi=queuedLumi
+ self.queuedLumiList.append(queuedLumi)
+ self.lock.release()
+ self.updateQueueStatusFile()
+ else:
+ self.lock.release()
+ #skip if EoL for LS in queue has already been written once (e.g. double file create race)
+ return False
+ except Exception as ex:
+ self.logger.warning("Problem checking new EoLS filename: "+str(os.path.basename(event.fullpath)) + " error:"+str(ex))
+ try:self.lock.release()
+ except:pass
+ return True
+
+ def notifyLumi(self,ls,maxReceivedEoLS,maxClosedLumi,numOpenLumis):
+ if self.queueStatusPath==None:return
+ self.lock.acquire()
+ if ls!=None and ls in self.queuedLumiList:
+ self.queuedLumiList.remove(ls)
+ self.maxReceivedEoLS=maxReceivedEoLS
+ self.maxClosedLumi=maxClosedLumi
+ self.numOpenLumis=numOpenLumis
+ self.lock.release()
+ self.updateQueueStatusFile()
+
+ def setQueueStatusPath(self,path,monpath):
+ self.queueStatusPath = path
+ self.queueStatusPathMon = monpath
+ self.queueStatusPathDir = path[:path.rfind('/')]
+
+ def updateQueueStatusFile(self):
+ if self.queueStatusPath==None:return
+ num_queued_lumis = len(self.queuedLumiList)
+ if not os.path.exists(self.queueStatusPathDir):
+ self.logger.error("No directory to write queueStatusFile: "+str(self.queueStatusPathDir))
+ else:
+ self.logger.info("Update status file - queued lumis:"+str(num_queued_lumis)+ " EoLS:: max queued:"+str(self.maxQueuedLumi) \
+ +" un-queued:"+str(self.maxReceivedEoLS)+" Lumis:: last closed:"+str(self.maxClosedLumi)+ " num open:"+str(self.numOpenLumis))
+ #write json
+ doc = {"numQueuedLS":num_queued_lumis,
+ "maxQueuedLS":self.maxQueuedLumi,
+ "numReadFromQueueLS:":self.maxReceivedEoLS,
+ "maxClosedLS":self.maxClosedLumi,
+ "numReadOpenLS":self.numOpenLumis
+ }
+ try:
+ if self.queueStatusPath!=None:
+ attempts=3
+ while attempts>0:
+ try:
+ with open(self.queueStatusPath+TEMPEXT,"w") as fp:
+ #fcntl.flock(fp, fcntl.LOCK_EX)
+ json.dump(doc,fp)
+ os.rename(self.queueStatusPath+TEMPEXT,self.queueStatusPath)
+ break
+ except Exception as ex:
+ attempts-=1
+ if attempts==0:
+ raise ex
+ self.logger.warning("Unable to write status file, with error:" + str(ex)+".retrying...")
+ time.sleep(0.05)
+ try:
+ shutil.copyfile(self.queueStatusPath,self.queueStatusPathMon)
+ except:
+ pass
+ except Exception as ex:
+ self.logger.error("Unable to open/write " + self.queueStatusPath)
+ self.logger.exception(ex)
+
class fileHandler(object):
def __eq__(self,other):
@@ -106,6 +199,7 @@ def getFiletype(self,filepath = None):
if not filepath: filepath = self.filepath
filename = self.basename
name,ext = self.name,self.ext
+ if ext==TEMPEXT:return UNKNOWN
name = name.upper()
if "mon" not in filepath:
if ext == ".dat" and "_PID" not in name: return DAT
@@ -113,26 +207,26 @@ def getFiletype(self,filepath = None):
if ext == ".jsndata" and "_PID" in name: return PJSNDATA
if ext == ".ini" and "_PID" in name: return INI
if ext == ".jsd" and "OUTPUT_" in name: return OUTPUTJSD
- if ext == ".jsd" : return JSD
+ if ext == ".jsd" : return DEFINITION
if ext == ".jsn":
if STREAMERRORNAME.upper() in name: return STREAMERR
- elif "BOLS" in name : return BOLS
- elif "STREAM" in name and "_PID" in name: return STREAM
- elif "INDEX" in name and "_PID" in name: return INDEX
- elif "CRASH" in name and "_PID" in name: return CRASH
- elif "EOLS" in name: return EOLS
- elif "EOR" in name: return EOR
+ elif "_BOLS" in name : return BOLS
+ elif "_STREAM" in name and "_PID" in name: return STREAM
+ elif "_INDEX" in name and "_PID" in name: return INDEX
+ elif "_CRASH" in name and "_PID" in name: return CRASH
+ elif "_EOLS" in name: return EOLS
+ elif "_EOR" in name: return EOR
+ elif "_TRANSFER" in name: return DEFINITION
if ext==".jsn":
if STREAMDQMHISTNAME.upper() in name and "_PID" not in name: return STREAMDQMHISTOUTPUT
- if "STREAM" in name and "_PID" not in name: return OUTPUT
- if "_HLTRATESLEGEND" in name: return HLTRATESLEGEND
- elif "_HLTRATES" in name: return HLTRATES
+ if "_STREAM" in name and "_PID" not in name: return OUTPUT
+ if name.startswith("QUEUE_STATUS"): return QSTATUS
if ext==".pb":
if "_PID" not in name: return PB
else: return PIDPB
if name.endswith("COMPLETE"): return COMPLETE
- if ".fast" in filename: return FAST
- if "slow" in filename: return SLOW
+ if ext == ".fast" in filename: return FAST
+ if ext == ".slow" in filename: return SLOW
if ext == ".leg" and "MICROSTATELEGEND" in name: return MODULELEGEND
if ext == ".leg" and "PATHLEGEND" in name: return PATHLEGEND
if "boxes" in filepath : return BOX
@@ -149,7 +243,6 @@ def getFileHeaders(self):
elif filetype in [DAT,PB,OUTPUT,STREAMERR,STREAMDQMHISTOUTPUT]: self.run,self.ls,self.stream,self.host = splitname
elif filetype == INDEX: self.run,self.ls,self.index,self.pid = splitname
elif filetype == EOLS: self.run,self.ls,self.eols = splitname
- elif filetype == HLTRATES:self.run,self.ls,self.ftype,self.pid = splitname
else:
self.logger.warning("Bad filetype: %s" %self.filepath)
self.run,self.ls,self.stream = [None]*3
@@ -167,11 +260,12 @@ def getBoxData(self,filepath = None):
data = fi.read()
data = data.strip(sep).split(sep)
data = dict([d.split('=') for d in data])
+ except IOError,e:
+ data = {}
except StandardError,e:
self.logger.exception(e)
data = {}
-
return data
#get data from json file
@@ -247,7 +341,12 @@ def setFieldByName(self,field,value,warning=True):
#get definitions from jsd file
def getDefinitions(self):
if self.filetype in [STREAM]:
+ #try:
self.jsdfile = self.data["definition"]
+ #except:
+ # self.logger.error("no definition field in "+str(self.filepath))
+ # self.definitions = {}
+ # return False
elif not self.jsdfile:
self.logger.warning("jsd file not set")
self.definitions = {}
@@ -256,10 +355,11 @@ def getDefinitions(self):
return True
- def deleteFile(self):
+ def deleteFile(self,silent=False):
#return True
filepath = self.filepath
- self.logger.info(filepath)
+ if silent==False:
+ self.logger.info(filepath)
if os.path.isfile(filepath):
try:
os.remove(filepath)
@@ -389,21 +489,37 @@ def writeout(self,empty=False):
return False
return True
+ #TODO:make sure that the file is copied only once
def esCopy(self):
if not self.exists(): return
if self.filetype in TO_ELASTICIZE:
esDir = os.path.join(self.dir,ES_DIR_NAME)
if os.path.isdir(esDir):
+ newpathTemp = os.path.join(esDir,self.basename+TEMPEXT)
newpath = os.path.join(esDir,self.basename)
retries = 5
while True:
try:
- shutil.copy(self.filepath,newpath)
+ shutil.copy(self.filepath,newpathTemp)
+ break
+ except (OSError,IOError),e:
+ retries-=1
+ if retries == 0:
+ self.logger.exception(e)
+ return
+ #raise e #non-critical exception
+ else:
+ time.sleep(0.5)
+ retries = 5
+ while True:
+ try:
+ os.rename(newpathTemp,newpath)
break
except (OSError,IOError),e:
retries-=1
if retries == 0:
self.logger.exception(e)
+ return
#raise e #non-critical exception
else:
time.sleep(0.5)
diff --git a/python/anelastic.py b/python/anelastic.py
index 99428ef..63db0b2 100755
--- a/python/anelastic.py
+++ b/python/anelastic.py
@@ -11,7 +11,7 @@
import _inotify as inotify
import threading
import Queue
-import json
+import simplejson as json
import logging
@@ -21,8 +21,9 @@
class LumiSectionRanger():
host = os.uname()[1]
- def __init__(self,tempdir,outdir,run_number):
+ def __init__(self,mr,tempdir,outdir,run_number):
self.logger = logging.getLogger(self.__class__.__name__)
+ self.mr = mr
self.stoprequest = threading.Event()
self.emptyQueue = threading.Event()
self.firstStream = threading.Event()
@@ -41,7 +42,10 @@ def __init__(self,tempdir,outdir,run_number):
self.jsdfile = None
self.buffer = [] # file list before the first stream file
self.emptyOutTemplate = None
-
+ self.useTimeout=60
+ self.maxQueuedLumi=0
+ self.maxReceivedEoLS=0
+ self.maxClosedLumi=0
def join(self, stop=False, timeout=None):
@@ -52,7 +56,8 @@ def join(self, stop=False, timeout=None):
def start(self):
self.run()
- def stop(self):
+ def stop(self,timeout=60):
+ self.useTimeout=timeout
self.stoprequest.set()
def setSource(self,source):
@@ -71,11 +76,15 @@ def run(self):
self.process()
except (KeyboardInterrupt,Queue.Empty) as e:
self.emptyQueue.set()
+ except Exception as ex:
+ self.logger.exception(ex)
+ self.logger.fatal("Exiting on unhandled exception")
+ os._exit(1)
else:
time.sleep(0.5)
#allow timeout in case 'complete' file is received and lumi is not closed
if self.stoprequest.isSet() and self.emptyQueue.isSet() and self.checkClosure()==False:
- if endTimeout<=-1: endTimeout=100
+ if endTimeout<=-1: endTimeout=self.useTimeout*2
if endTimeout==0: break
endTimeout-=1
@@ -105,8 +114,8 @@ def process(self):
eventtype = self.eventtype
if eventtype:# & inotify.IN_CLOSE_WRITE:
- if filetype == JSD:
- self.processJsdFile()
+ if filetype == DEFINITION:
+ self.processDefinitionFile()
if filetype == OUTPUTJSD and not self.jsdfile:
self.jsdfile=self.infile.filepath
self.createEmptyOutputTemplate()
@@ -119,18 +128,28 @@ def process(self):
elif filetype in [STREAM,STREAMDQMHISTOUTPUT,INDEX,EOLS,DAT,PB]:
run,ls = (self.infile.run,self.infile.ls)
key = (run,ls)
+ ls_num=int(ls[2:])
if filetype == EOLS :
+ if self.maxReceivedEoLS=0:
- if numFiles == 1:
- #fastHadd crashes trying to merge only one file
- os.rename(command_args[4],command_args[3])
- else:
- p = subprocess.Popen(command_args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
- p.wait()
- if p.returncode!=0:
- self.logger.error('fastHadd returned with exit code '+str(p.returncode)+' and response: ' + str(p.communicate()) + '. Merging parameters given:'+str(command_args) +' ,file sizes(B):'+str(inFileSizes))
- #DQM more verbose debugging
- try:
- filesize = os.stat(fullOutputPath).st_size
- self.logger.error('fastHadd reported to fail at merging, while output pb file exists! '+ fullOutputPath + ' with size(B): '+str(filesize))
- except:
- pass
- outfile.setFieldByName('ReturnCodeMask', str(p.returncode))
- hasError=True
- if True:
- if numFiles==1:
- try:
- filesize = os.stat(fullOutputPath).st_size
- except:
- self.logger.error('Error checking fastHadd output file size: '+ fullOutputPath)
- hasError=True
- try:
- os.chmod(fullOutputPath,0666)
- except:
- self.logger.error('Error fixing permissions of fastHadd output file: '+ fullOutputPath)
- if numFiles>1:
- for f in command_args[4:]:
- try:
- if hasError==False:os.remove(f)
- except OSError as ex:
- self.logger.warning('exception removing file '+f+' : '+str(ex))
+ p = subprocess.Popen(command_args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
+ p.wait()
+ if p.returncode!=0:
+ self.logger.error('fastHadd returned with exit code '+str(p.returncode)+' and response: ' + str(p.communicate()) + '. Merging parameters given:'+str(command_args) +' ,file sizes(B):'+str(inFileSizes))
+ #DQM more verbose debugging
+ try:
+ filesize = os.stat(fullOutputPath).st_size
+ self.logger.error('fastHadd reported to fail at merging, while output pb file exists! '+ fullOutputPath + ' with size(B): '+str(filesize))
+ except:
+ pass
+ outfile.setFieldByName('ReturnCodeMask', str(p.returncode))
+ hasError=True
+
+ for f in command_args[4:]:
+ try:
+ if hasError==False:os.remove(f)
+ except OSError as ex:
+ self.logger.warning('exception removing file '+f+' : '+str(ex))
else:
hasError=True
@@ -830,8 +869,14 @@ def abortMerging(self):
if __name__ == "__main__":
+
+ import procname
+ procname.setprocname('anelastic')
+
+ conf=initConf()
+
logging.basicConfig(filename=os.path.join(conf.log_dir,"anelastic.log"),
- level=logging.INFO,
+ level=conf.service_log_level,
format='%(levelname)s:%(asctime)s - %(funcName)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(os.path.basename(__file__))
@@ -848,7 +893,7 @@ def abortMerging(self):
rawinputdir = sys.argv[3]
dirname = os.path.basename(os.path.normpath(dirname))
watchDir = os.path.join(conf.watch_directory,dirname)
- outputDir = conf.micromerge_output
+ outputDir = sys.argv[4]
dqmHandler = None
@@ -868,11 +913,12 @@ def abortMerging(self):
#starting inotify thread
mr = MonitorRanger()
mr.setEventQueue(eventQueue)
+ mr.setQueueStatusPath(os.path.join(watchDir,"open","queue_status.jsn"),os.path.join(watchDir,"mon","queue_status.jsn"))
mr.register_inotify_path(watchDir,mask)
mr.start_inotify()
#starting lsRanger thread
- ls = LumiSectionRanger(watchDir,outputDir,run_number)
+ ls = LumiSectionRanger(mr,watchDir,outputDir,run_number)
ls.setSource(eventQueue)
ls.start()
diff --git a/python/applianceumount.py b/python/applianceumount.py
index 523e259..ee3c13f 100644
--- a/python/applianceumount.py
+++ b/python/applianceumount.py
@@ -36,8 +36,8 @@ def run(self):
os.symlink('/opt/hltd/cgi',self.watch_directory+'/cgi-bin')
handler.cgi_directories = ['/cgi-bin']
- print("starting http server on port "+str(self.cgi_port+5))
- self.httpd = BaseHTTPServer.HTTPServer(("", self.cgi_port+5), handler)
+ print("starting http server on port "+str(self.cgi_port+20))
+ self.httpd = BaseHTTPServer.HTTPServer(("", self.cgi_port+20), handler)
self.httpd.serve_forever()
self.finished=True
@@ -51,9 +51,10 @@ def run(self):
def stop(self):
self.httpd.shutdown()
-def checkMode():
+def checkMode(instance):
try:
hltdconf='/etc/hltd.conf'
+ if instance != "main": hltdconf='/etc/hltd-'+instance+'.conf'
with open(hltdconf,'r') as f:
for l in f.readlines():
ls=l.strip(' \n')
@@ -63,31 +64,37 @@ def checkMode():
pass
return "unknown"
-def stopFUs():
+def stopFUs(instance):
hltdconf='/etc/hltd.conf'
watch_directory='/fff/ramdisk'
+ if instance != "main": hltdconf='/etc/hltd-'+instance+'.conf'
machine_is_bu=False
machine_is_fu=False
cgi_port=9000
+ cgi_offset=0
try:
f=open(hltdconf,'r')
for l in f.readlines():
ls=l.strip(' \n')
- if not ls.startswith('#') and ls.startswith('watch_directory'):
+ if ls.startswith('watch_directory'):
watch_directory=ls.split('=')[1].strip(' ')
- if not ls.startswith('#') and ls.startswith('role'):
+ elif ls.startswith('role'):
if 'bu' in ls.split('=')[1].strip(' '): machine_is_bu=True
if 'fu' in ls.split('=')[1].strip(' ')=='fu': machine_is_fu=True
- if not ls.startswith('#') and ls.startswith('cgi_port'):
+ elif ls.startswith('cgi_instance_port_offset'):
+ cgi_offset=int(ls.split('=')[1].strip(' '))
+ elif ls.startswith('cgi_port'):
cgi_port=int(ls.split('=')[1].strip(' '))
f.close()
except Exception as ex:
- print "Unable to read parameters",str(ex),"using defaults"
+ if instance!="main": raise ex
+ else:
+ print "Unable to read parameters",str(ex),"using defaults"
if machine_is_bu==False:return True
- syslog.syslog("hltd:Initiating FU unmount procedure")
+ syslog.syslog("hltd-"+str(instance)+": initiating FU unmount procedure")
#continue with notifying FUs
boxinfodir=os.path.join(watch_directory,'appliance/boxes')
@@ -106,15 +113,16 @@ def stopFUs():
current_time = time.time()
age = current_time - os.path.getmtime(os.path.join(boxinfodir,machine))
print "found machine",machine," which is ",str(age)," seconds old"
- syslog.syslog("hltd: found machine "+str(machine) + " which is "+ str(age)+" seconds old")
+ syslog.syslog("hltd-"+str(instance)+": found machine "+str(machine) + " which is "+ str(age)+" seconds old")
if age < 30:
if receiver==None:
receiver = UmountResponseReceiver(watch_directory,cgi_port)
receiver.start()
time.sleep(1)
try:
- connection = httplib.HTTPConnection(machine, cgi_port,timeout=5)
- connection.request("GET",'cgi-bin/suspend_cgi.py')
+ #subtract cgi offset when connecting machine
+ connection = httplib.HTTPConnection(machine, cgi_port-cgi_offset,timeout=5)
+ connection.request("GET",'cgi-bin/suspend_cgi.py?port='+str(cgi_port))
response = connection.getresponse()
machinelist.append(machine)
except:
@@ -133,7 +141,7 @@ def stopFUs():
machinePending=True
activeMachines.append(machine)
- syslog.syslog("hltd: waiting for machines to respond:"+str(activeMachines))
+ syslog.syslog("hltd-"+str(instance)+": waiting for machines to respond:"+str(activeMachines))
if machinePending:
usedTimeout+=2
time.sleep(2)
@@ -142,12 +150,12 @@ def stopFUs():
except:
#handle interrupt
print "Interrupted!"
- syslog.syslog("hltd: FU suspend was interrupted")
+ syslog.syslog("hltd-"+str(instance)+": FU suspend was interrupted")
count=0
if receiver!=None:
while receiver.finished==False:
count+=1
- if count%100==0:syslog.syslog("hltd stop: trying to stop suspend receiver HTTP server thread (script interrupted)")
+ if count%100==0:syslog.syslog("hltd-"+str(instance)+": stop: trying to stop suspend receiver HTTP server thread (script interrupted)")
try:
receiver.stop()
time.sleep(.1)
@@ -161,7 +169,7 @@ def stopFUs():
if receiver!=None:
while receiver.finished==False:
count+=1
- if count%100==0:syslog.syslog("hltd stop: trying to stop suspend receiver HTTP server thread")
+ if count%100==0:syslog.syslog("hltd-"+str(instance)+": stop: trying to stop suspend receiver HTTP server thread")
try:
receiver.stop()
time.sleep(.1)
@@ -172,10 +180,10 @@ def stopFUs():
print "Finished FU suspend for:",str(machinelist)
print "Not successful:",str(activeMachines)
- syslog.syslog("hltd: unmount script completed. remaining machines :"+str(activeMachines))
+ syslog.syslog("hltd-"+str(instance)+": unmount script completed. remaining machines :"+str(activeMachines))
if usedTimeout==maxTimeout:
print "FU suspend failed for hosts:",activeMachines
- syslog.syslog("hltd: FU suspend failed for hosts"+str(activeMachines))
+ syslog.syslog("hltd-"+str(instance)+": FU suspend failed for hosts"+str(activeMachines))
return False
return True
diff --git a/python/daemon2.py b/python/daemon2.py
index 97e51f6..a5c78c6 100644
--- a/python/daemon2.py
+++ b/python/daemon2.py
@@ -17,12 +17,25 @@ class Daemon2:
attn: May change in the near future to use PEP daemon
"""
- def __init__(self, pidfile, processname, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'):
+ def __init__(self, processname, instance, confname=None, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'):
self.stdin = stdin
self.stdout = stdout
self.stderr = stderr
- self.pidfile = pidfile
self.processname = processname
+ self.instance = instance
+ if confname==None:confname=processname
+ if instance=="main":
+ instsuffix=""
+ self.instancemsg=""
+ else:
+ instsuffix="-"+instance
+ self.instancemsg=" instance"+instance
+
+ self.pidfile = "/var/run/" + processname + instsuffix + ".pid"
+ self.conffile = "/etc/" + confname + instsuffix + ".conf"
+ self.lockfile = '/var/lock/subsys/'+processname + instsuffix
+
+
def daemonize(self):
@@ -35,7 +48,7 @@ def daemonize(self):
pid = os.fork()
if pid > 0:
# exit first parent
- sys.exit(0)
+ return -1
except OSError, e:
sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
sys.exit(1)
@@ -71,14 +84,21 @@ def daemonize(self):
atexit.register(self.delpid)
pid = str(os.getpid())
file(self.pidfile,'w+').write("%s\n" % pid)
+ return 0
def delpid(self):
- os.remove(self.pidfile)
+ if os.path.exists(self.pidfile):
+ os.remove(self.pidfile)
def start(self):
"""
Start the daemon
"""
+ if not os.path.exists(self.conffile):
+ print "Missing "+self.conffile+" - can not start instance"
+ #raise Exception("Missing "+self.conffile)
+ sys.exit(4)
# Check for a pidfile to see if the daemon already runs
+
try:
pf = file(self.pidfile,'r')
pid = int(pf.read().strip())
@@ -89,10 +109,13 @@ def start(self):
if pid:
message = "pidfile %s already exists. Daemon already running?\n"
sys.stderr.write(message % self.pidfile)
- sys.exit(1)
+ sys.exit(3)
# Start the daemon
- self.daemonize()
- self.run()
+ ret = self.daemonize()
+ if ret == 0:
+ self.run()
+ ret = 0
+ return ret
def status(self):
"""
@@ -107,16 +130,22 @@ def status(self):
except IOError:
pid = None
if not pid:
- message = self.processname+" not running, no pidfile %s\n"
+ message = self.processname + self.instancemsg +" not running, no pidfile %s\n"
else:
try:
os.kill(pid,0)
- message = self.processname+" is running with pidfile %s\n"
+ message = self.processname + self.instancemsg + " is running with pidfile %s\n"
retval = True
+ except OSError as ex:
+ if ex.errno==1:
+ message = self.processname + self.instancemsg + " is running with pidfile %s\n"
+ else:
+ message = self.processname + self.instancemsg + " pid exist in %s but process is not running\n"
except:
- message = self.processname+" pid exist in %s but process is not running\n"
+ message = self.processname + self.instancemsg + " pid exist in %s but process is not running\n"
+ #should return true for puppet to detect service crash (also when stopped)
- sys.stderr.write(message % self.pidfile)
+ sys.stdout.write(message % self.pidfile)
return retval
def silentStatus(self):
@@ -132,7 +161,7 @@ def silentStatus(self):
except IOError:
pid = None
if not pid:
- message = self.processname+" not running, no pidfile %s\n"
+ message = self.processname + self.instancemsg +" not running, no pidfile %s\n"
else:
try:
os.kill(pid,0)
@@ -155,12 +184,18 @@ def stop(self):
pid = None
if not pid:
- message = "pidfile %s does not exist. Daemon not running?\n"
- sys.stderr.write(message % self.pidfile)
+ message = " not running, no pidfile %s\n"
+ sys.stdout.write(message % self.pidfile)
+ sys.stdout.flush()
return # not an error in a restart
# Try killing the daemon process
+ processPresent=False
try:
+ #check is process is alive
+ os.kill(pid,0)
+ processPresent=True
+ sys.stdout.flush()
# signal the daemon to stop
timeout = 5.0 #kill timeout
os.kill(pid, SIGINT)
@@ -183,25 +218,37 @@ def stop(self):
time.sleep(0.5)
timeout-=0.5
except OSError, err:
+ time.sleep(.1)
err = str(err)
if err.find("No such process") > 0:
#this handles the successful stopping of the daemon...
if os.path.exists(self.pidfile):
- print 'removing pidfile'
- os.remove(self.pidfile)
- sys.stdout.write('[OK]\n')
- sys.stdout.flush()
+ if processPresent==False:
+ sys.stdout.write(" process "+str(pid)+" is dead. Removing pidfile" + self.pidfile+ " pid:" + str(pid))
+ try:
+ os.remove(self.pidfile)
+ except Exception as ex:
+ sys.stdout.write(' [ \033[1;31mFAILED\033[0;39m ]\n')
+ sys.stderr.write(str(ex)+'\n')
+ sys.exit(1)
+ elif not os.path.exists(self.pidfile):
+ if processPresent==False:
+ sys.stdout.write(' service is not running')
else:
- print str(err)
+ sys.stdout.write(' [ \033[1;31mFAILED\033[0;39m ]\n')
+ sys.stderr.write(str(err)+'\n')
sys.exit(1)
- sys.stdout.write('[OK]\n')
+
+ if (self.processname!="hltd"):sys.stdout.write("\t\t")
+ sys.stdout.write('\t\t\t [ \033[1;32mOK\033[0;39m ]\n')
+ sys.stdout.flush()
def restart(self):
"""
Restart the daemon
"""
self.stop()
- self.start()
+ return self.start()
def run(self):
"""
@@ -212,7 +259,7 @@ def run(self):
def emergencyUmount(self):
cfg = ConfigParser.SafeConfigParser()
- cfg.read('/etc/hltd.conf')
+ cfg.read(self.conffile)
bu_base_dir=None#/fff/BU0?
ramdisk_subdirectory = 'ramdisk'
@@ -229,7 +276,7 @@ def emergencyUmount(self):
process = subprocess.Popen(['mount'],stdout=subprocess.PIPE)
out = process.communicate()[0]
mounts = re.findall('/'+bu_base_dir+'[0-9]+',out)
- if len(mounts)>1 and mounts[0]==mounts[1]: mounts=[mounts[0]]
+ mounts = sorted(list(set(mounts)))
for point in mounts:
sys.stdout.write("trying emergency umount of "+point+"\n")
try:
@@ -237,7 +284,8 @@ def emergencyUmount(self):
except subprocess.CalledProcessError, err1:
pass
except Exception as ex:
- sys.stdout.write(ex.args[0]+"\n")
+ #ok(legacy mountpoint)
+ pass
try:
subprocess.check_call(['umount',os.path.join('/'+point,ramdisk_subdirectory)])
except subprocess.CalledProcessError, err1:
@@ -252,4 +300,20 @@ def emergencyUmount(self):
sys.stdout.write(str(err1.returncode)+"\n")
except Exception as ex:
sys.stdout.write(ex.args[0]+"\n")
+
+
+ def touchLockFile(self):
+ try:
+ with open(self.lockfile,"w+") as fi:
+ pass
+ except:
+ pass
+
+ def removeLockFile(self):
+ try:
+ os.unlink(self.lockfile)
+ except:
+ pass
+
+
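A sketch of how the reworked constructor derives its per-instance paths; real daemons presumably subclass Daemon2 and override run(), and the names below are only an example:

```python
# Illustrative only: per-instance path derivation in Daemon2
from daemon2 import Daemon2

class MyDaemon(Daemon2):
    def run(self):
        pass  # a real daemon would run its main loop here

d = MyDaemon('hltd', 'testing')
# d.pidfile  == '/var/run/hltd-testing.pid'
# d.conffile == '/etc/hltd-testing.conf'
# d.lockfile == '/var/lock/subsys/hltd-testing'
# start() now exits with code 4 if the conffile is missing
```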
diff --git a/python/elastic.py b/python/elastic.py
index 28fccdd..f5c6048 100755
--- a/python/elastic.py
+++ b/python/elastic.py
@@ -25,7 +25,6 @@ def __init__(self, esDir, inMonDir):
self.inputMonDir = inMonDir
self.movedModuleLegend = False
self.movedPathLegend = False
- self.processedHLTRatesLegend = False
def start(self):
self.run()
@@ -44,7 +43,11 @@ def run(self):
self.emptyQueue.clear()
self.process()
except (KeyboardInterrupt,Queue.Empty) as e:
- self.emptyQueue.set()
+ self.emptyQueue.set()
+ except Exception as ex:
+ self.logger.exception(ex)
+ self.logger.fatal("Exiting on unhandled exception")
+ os._exit(1)
else:
time.sleep(0.5)
@@ -60,8 +63,8 @@ def process(self):
infile = self.infile
filetype = infile.filetype
eventtype = self.eventtype
- if eventtype & inotify.IN_CLOSE_WRITE:
- if filetype in [FAST,SLOW]:
+ if eventtype & (inotify.IN_CLOSE_WRITE | inotify.IN_MOVED_TO) :
+ if filetype in [FAST,SLOW,QSTATUS]:
self.elasticize()
elif self.esDirName in infile.dir:
if filetype in [INDEX,STREAM,OUTPUT,STREAMDQMHISTOUTPUT]:self.elasticize()
@@ -85,13 +88,6 @@ def process(self):
logger.error(ex)
pass
self.movedPathLegend = True
- elif filetype == HLTRATES:
- self.logger.debug('received json HLT rates')
- self.elasticize()
- elif filetype == HLTRATESLEGEND and self.processedHLTRatesLegend==False:
- self.logger.debug('received json HLT legend rates')
- self.elasticize()
-
@@ -106,47 +102,46 @@ def elasticize(self):
elif filetype == SLOW:
es.elasticize_prc_sstate(infile)
self.logger.debug(name+" going into prc-sstate")
- self.infile.deleteFile()
+ self.infile.deleteFile(silent=True)
elif filetype == INDEX:
self.logger.info(name+" going into prc-in")
es.elasticize_prc_in(infile)
- self.infile.deleteFile()
+ self.infile.deleteFile(silent=True)
elif filetype == STREAM:
self.logger.info(name+" going into prc-out")
es.elasticize_prc_out(infile)
- self.infile.deleteFile()
+ self.infile.deleteFile(silent=True)
elif filetype in [OUTPUT,STREAMDQMHISTOUTPUT]:
self.logger.info(name+" going into fu-out")
es.elasticize_fu_out(infile)
- self.infile.deleteFile()
+ self.infile.deleteFile(silent=True)
+ elif filetype == QSTATUS:
+ self.logger.debug(name+" going into qstatus")
+ es.elasticize_queue_status(infile)
elif filetype == COMPLETE:
self.logger.info(name+" going into fu-complete")
dt=os.path.getctime(infile.filepath)
completed = datetime.datetime.utcfromtimestamp(dt).isoformat()
es.elasticize_fu_complete(completed)
- self.infile.deleteFile()
+ self.infile.deleteFile(silent=True)
self.stop()
- elif filetype == HLTRATESLEGEND:
- if self.processedHLTRatesLegend==False:
- es.elasticize_hltrateslegend(infile)
- self.processedHLTRatesLegend=True
- self.infile.deleteFile()
- elif filetype == HLTRATES:
- self.logger.info(name+" going into hlt-rates")
- es.elasticize_hltrates(infile)
- self.infile.deleteFile()
def elasticizeLS(self):
ls = self.infile.ls
es.flushLS(ls)
- self.infile.deleteFile()
+ self.infile.deleteFile(silent=True)
if __name__ == "__main__":
+
+ import procname
+ procname.setprocname('elastic')
+
+ conf=initConf()
logging.basicConfig(filename=os.path.join(conf.log_dir,"elastic.log"),
- level=logging.INFO,
+ level=conf.service_log_level,
format='%(levelname)s:%(asctime)s - %(funcName)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(os.path.basename(__file__))
@@ -165,17 +160,14 @@ def elasticizeLS(self):
expected_processes = int(sys.argv[3])
indexSuffix = conf.elastic_cluster
update_modulo=conf.fastmon_insert_modulo
- dirname = os.path.basename(os.path.normpath(dirname))
- watchDir = os.path.join(conf.watch_directory,dirname)#???
- outputDir = conf.micromerge_output
- monDir = os.path.join(watchDir,"mon")
- tempDir = os.path.join(watchDir,ES_DIR_NAME)
+ rundirname = os.path.basename(os.path.normpath(dirname))
+ monDir = os.path.join(dirname,"mon")
+ tempDir = os.path.join(dirname,ES_DIR_NAME)
- mask = inotify.IN_CLOSE_WRITE | inotify.IN_MOVED_TO
- monMask = inotify.IN_CLOSE_WRITE
- tempMask = inotify.IN_CLOSE_WRITE
+ monMask = inotify.IN_CLOSE_WRITE | inotify.IN_MOVED_TO
+ tempMask = inotify.IN_CLOSE_WRITE | inotify.IN_MOVED_TO
- logger.info("starting elastic for "+dirname)
+ logger.info("starting elastic for "+rundirname[:3]+' '+rundirname[3:])
try:
os.makedirs(monDir)
@@ -191,12 +183,11 @@ def elasticizeLS(self):
#starting inotify thread
mr = MonitorRanger()
mr.setEventQueue(eventQueue)
- #mr.register_inotify_path(watchDir,mask)
mr.register_inotify_path(monDir,monMask)
mr.register_inotify_path(tempDir,tempMask)
mr.start_inotify()
- es = elasticBand.elasticBand('http://'+conf.es_local+':9200',dirname,indexSuffix,expected_processes,update_modulo)
+ es = elasticBand.elasticBand('http://'+conf.es_local+':9200',rundirname,indexSuffix,expected_processes,update_modulo)
#starting elasticCollector thread
ec = elasticCollector(ES_DIR_NAME,inmondir)
diff --git a/python/elasticBand.py b/python/elasticBand.py
index 20c5093..978feed 100644
--- a/python/elasticBand.py
+++ b/python/elasticBand.py
@@ -5,15 +5,13 @@
from pyelasticsearch.client import ElasticHttpError
from pyelasticsearch.client import ConnectionError
from pyelasticsearch.client import Timeout
-import json
+import simplejson as json
import csv
import math
import logging
from aUtils import *
-#MONBUFFERSIZE = 50
-es_server_url = 'http://localhost:9200'
class elasticBand():
@@ -24,7 +22,7 @@ def __init__(self,es_server_url,runstring,indexSuffix,monBufferSize,fastUpdateMo
self.prcinBuffer = {}
self.prcoutBuffer = {}
self.fuoutBuffer = {}
- self.es = ElasticSearch(es_server_url,timeout=20)
+ self.es = ElasticSearch(es_server_url,timeout=20,revival_delay=60)
self.hostname = os.uname()[1]
self.hostip = socket.gethostbyname_ex(self.hostname)[2][0]
#self.number_of_data_nodes = self.es.health()['number_of_data_nodes']
@@ -36,12 +34,13 @@ def __init__(self,es_server_url,runstring,indexSuffix,monBufferSize,fastUpdateMo
aliasName = runstring + "_" + indexSuffix
self.indexName = aliasName# + "_" + self.hostname
- def imbue_jsn(self,infile):
+ def imbue_jsn(self,infile,silent=False):
with open(infile.filepath,'r') as fp:
try:
document = json.load(fp)
except json.scanner.JSONDecodeError,ex:
- logger.exception(ex)
+ if silent==False:
+ self.logger.exception(ex)
return None,-1
return document,0
@@ -155,58 +154,24 @@ def elasticize_prc_in(self,infile):
document['data']=datadict
document['ls']=int(ls[2:])
document['index']=int(index[5:])
- document['dest']=os.uname()[1]
+ document['dest']=self.hostname
document['process']=int(prc[3:])
try:document.pop('definition')
except:pass
self.prcinBuffer.setdefault(ls,[]).append(document)
#self.es.index(self.indexName,'prc-in',document)
-
- def elasticize_hltrateslegend(self,infile):
- document,ret = self.imbue_jsn(infile)
+ def elasticize_queue_status(self,infile):
+ document,ret = self.imbue_jsn(infile,silent=True)
if ret<0:return False
- datadict={}
- #datadict['pid'] = int(infile.pid[3:])
- try:
- paths=document['data'][0].strip('[]')
- datasets=document['data'][1].strip('[]')
- datadict['dataset-names']=datasets.split(',') if len(datasets)>0 else []
- datadict['path-names']=paths.split(',') if len(paths)>0 else []
- except:
- pass
- self.tryIndex('hltrates-legend',datadict)
+ document['fm_date']=str(infile.mtime)
+ document['host']=self.hostname
+ self.tryIndex('qstatus',document)
return True
-
- def elasticize_hltrates(self,infile):
- document,ret = self.imbue_jsn(infile)
- if ret<0:return False
- datadict={}
- try:
- datadict['ls'] = int(infile.ls[2:])
- datadict['pid'] = int(infile.pid[3:])
- try:
- if json.loads(document['data'][0])[0]==0:return True
- except:
- pass
- datadict['processed']=json.loads(document['data'][0])[0]
- datadict['path-wasrun']=json.loads(document['data'][1])
- datadict['path-afterl1seed']=json.loads(document['data'][2])
- datadict['path-afterprescale']=json.loads(document['data'][3])
- datadict['path-accepted']=json.loads(document['data'][4])
- datadict['path-rejected']=json.loads(document['data'][5])
- datadict['path-errors']=json.loads(document['data'][6])
- datadict['dataset-accepted']=json.loads(document['data'][7])
- except:
- return False
- self.tryIndex('hltrates',datadict)
- return True
-
-
def elasticize_fu_complete(self,timestamp):
document = {}
- document['host']=os.uname()[1]
+ document['host']=self.hostname
document['fm_date']=timestamp
self.tryIndex('fu-complete',document)
@@ -264,7 +229,7 @@ def tryBulkIndex(self,docname,documents,attempts=1):
if attempts==0:
self.indexFailures+=1
if self.indexFailures<2:
- self.logger.error("Elasticsearch connection error.")
+ self.logger.warning("Elasticsearch connection error.")
time.sleep(5)
except ElasticHttpError as ex:
if attempts==0:
diff --git a/python/elasticbu.py b/python/elasticbu.py
index 8565615..2fad372 100755
--- a/python/elasticbu.py
+++ b/python/elasticbu.py
@@ -23,10 +23,12 @@
import requests
import simplejson as json
-
import socket
-def getURLwithIP(url):
+#silence HTTP connection info from requests package
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+
+def getURLwithIP(url,nsslock=None):
try:
prefix = ''
if url.startswith('http://'):
@@ -41,7 +43,17 @@ def getURLwithIP(url):
logging.error('could not parse URL ' +url)
raise(ex)
if url!='localhost':
- ip = socket.gethostbyname(url)
+ if nsslock is not None:
+ try:
+ nsslock.acquire()
+ ip = socket.gethostbyname(url)
+ nsslock.release()
+ except Exception as ex:
+ try:nsslock.release()
+ except:pass
+ raise ex
+ else:
+ ip = socket.gethostbyname(url)
else: ip='127.0.0.1'
return prefix+str(ip)+suffix
@@ -49,8 +61,9 @@ def getURLwithIP(url):
class elasticBandBU:
- def __init__(self,runnumber,startTime,runMode=True):
+ def __init__(self,conf,runnumber,startTime,runMode=True,nsslock=None):
self.logger = logging.getLogger(self.__class__.__name__)
+ self.conf=conf
self.es_server_url=conf.elastic_runindex_url
self.runindex_write="runindex_"+conf.elastic_runindex_name+"_write"
self.runindex_read="runindex_"+conf.elastic_runindex_name+"_read"
@@ -66,8 +79,14 @@ def __init__(self,runnumber,startTime,runMode=True):
self.runMode=runMode
self.boxinfoFUMap = {}
self.ip_url=None
+ self.nsslock=nsslock
self.updateIndexMaybe(self.runindex_name,self.runindex_write,self.runindex_read,mappings.central_es_settings,mappings.central_runindex_mapping)
self.updateIndexMaybe(self.boxinfo_name,self.boxinfo_write,self.boxinfo_read,mappings.central_es_settings,mappings.central_boxinfo_mapping)
+ self.black_list=None
+ if self.conf.instance=='main':
+ self.hostinst = self.host
+ else:
+ self.hostinst = self.host+'_'+self.conf.instance
#write run number document
if runMode == True and self.stopping==False:
@@ -89,14 +108,19 @@ def updateIndexMaybe(self,index_name,alias_write,alias_read,settings,mapping):
connectionAttempts+=1
try:
if retry or self.ip_url==None:
- self.ip_url=getURLwithIP(self.es_server_url)
- self.es = ElasticSearch(self.es_server_url)
+ self.ip_url=getURLwithIP(self.es_server_url,self.nsslock)
+ self.es = ElasticSearch(self.ip_url,timeout=20,revival_delay=60)
#check if runindex alias exists
- self.logger.info('writing to elastic index '+alias_write)
if requests.get(self.es_server_url+'/_alias/'+alias_write).status_code == 200:
+ self.logger.info('writing to elastic index '+alias_write + ' on '+self.es_server_url+' - '+self.ip_url )
self.createDocMappingsMaybe(alias_write,mapping)
- break
+ break
+ else:
+ time.sleep(.5)
+ if (connectionAttempts%10)==0:
+ self.logger.error('unable to access to elasticsearch alias ' + alias_write + ' on '+self.es_server_url+' / '+self.ip_url)
+ continue
except ElasticHttpError as ex:
#es error, retry
self.logger.error(ex)
@@ -110,7 +134,7 @@ def updateIndexMaybe(self,index_name,alias_write,alias_read,settings,mapping):
retry=True
continue
- except (ConnectionError,Timeout) as ex:
+ except (socket.gaierror,ConnectionError,Timeout) as ex:
#try to reconnect with different IP from DNS load balancing
if self.runMode and connectionAttempts>100:
self.logger.error('elastic (BU): exiting after 100 connection attempts to '+ self.es_server_url)
@@ -128,12 +152,19 @@ def createDocMappingsMaybe(self,index_name,mapping):
doc = {key:mapping[key]}
res = requests.get(self.ip_url+'/'+index_name+'/'+key+'/_mapping')
#only update if mapping is empty
- if res.status_code==200 and res.content.strip()=='{}':
- requests.post(self.ip_url+'/'+index_name+'/'+key+'/_mapping',json.dumps(doc))
-
- def resetURL(url):
- self.es = None
- self.es = ElasticSearch(url)
+ if res.status_code==200:
+ if res.content.strip()=='{}':
+ requests.post(self.ip_url+'/'+index_name+'/'+key+'/_mapping',json.dumps(doc))
+ else:
+ #still check if number of properties is identical in each type
+ inmapping = json.loads(res.content)
+ for indexname in inmapping:
+ properties = inmapping[indexname]['mappings'][key]['properties']
+ #should be size 1
+ for pdoc in mapping[key]['properties']:
+ if pdoc not in properties:
+ requests.post(self.ip_url+'/'+index_name+'/'+key+'/_mapping',json.dumps(doc))
+ break
def read_line(self,fullpath):
with open(fullpath,'r') as fp:
@@ -177,7 +208,31 @@ def elasticize_box(self,infile):
basename = infile.basename
self.logger.debug(basename)
current_time = time.time()
- if basename.startswith('fu'):
+
+ if infile.data=={}:return
+
+ bu_doc=False
+ if basename.startswith('bu') or basename.startswith('dvbu'):
+ bu_doc=True
+
+ #check box file against blacklist
+ if bu_doc or self.black_list==None:
+ self.black_list=[]
+
+ try:
+ with open(os.path.join(self.conf.watch_directory,'appliance','blacklist'),"r") as fi:
+ try:
+ self.black_list = json.load(fi)
+ except ValueError:
+ #file is being written or corrupted
+ return
+ except:
+ #blacklist file is not present, do not filter
+ pass
+
+ if basename in self.black_list:return
+
+ if bu_doc==False:
try:
self.boxinfoFUMap[basename] = [infile.data,current_time]
except Exception as ex:
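The blacklist checked above is read from &lt;watch_directory&gt;/appliance/blacklist as a plain JSON list of box names; a hypothetical writer is sketched here, since the producer of that file is not shown in this section:

```python
# Hypothetical example of producing the blacklist file read by elasticize_box()
import os
import simplejson as json

def write_blacklist(watch_directory, hosts):
    path = os.path.join(watch_directory, 'appliance', 'blacklist')
    with open(path + '.tmp', 'w') as fp:
        json.dump(hosts, fp)        # e.g. ["fu-vm-02-02.cern.ch"]
    os.rename(path + '.tmp', path)  # rename to limit readers seeing a partial file

write_blacklist('/fff/ramdisk', ['fu-vm-02-02.cern.ch'])
```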
@@ -185,34 +240,56 @@ def elasticize_box(self,infile):
return
try:
document = infile.data
- document['id']=basename
+ #unique id for separate instances
+ if bu_doc:
+ document['id']=self.hostinst
+ else:
+ document['id']=basename
+
+ #both here and in "boxinfo_appliance"
+ document['appliance']=self.host
+ document['instance']=self.conf.instance
+ #only here
+ document['host']=basename
+
self.index_documents('boxinfo',[document])
except Exception as ex:
self.logger.warning('box info not injected: '+str(ex))
return
- if basename.startswith('bu') or basename.startswith('dvbu'):
+ if bu_doc:
try:
document = infile.data
+ try:
+ document.pop('id')
+ except:pass
+ try:
+ document.pop('host')
+ except:pass
#aggregation from FUs
document['idles']=0
document['used']=0
document['broken']=0
document['quarantined']=0
+ document['cloud']=0
document['usedDataDir']=0
document['totalDataDir']=0
document['hosts']=[basename]
+ document['blacklistedHosts']=[]
for key in self.boxinfoFUMap:
- dpair = self.boxinfoFUMap[key]
- d = dpair[0]
- #check if entry is not older than 10 seconds
- if current_time - dpair[1] > 10:continue
- document['idles']+=int(d['idles'])
- document['used']+=int(d['used'])
- document['broken']+=int(d['broken'])
- document['quarantined']+=int(d['quarantined'])
- document['usedDataDir']+=int(d['usedDataDir'])
- document['totalDataDir']+=int(d['totalDataDir'])
- document['hosts'].append(key)
+ dpair = self.boxinfoFUMap[key]
+ d = dpair[0]
+ #check if entry is not older than 10 seconds
+ if current_time - dpair[1] > 10:continue
+ document['idles']+=int(d['idles'])
+ document['used']+=int(d['used'])
+ document['broken']+=int(d['broken'])
+ document['quarantined']+=int(d['quarantined'])
+ document['cloud']+=int(d['cloud'])
+ document['usedDataDir']+=int(d['usedDataDir'])
+ document['totalDataDir']+=int(d['totalDataDir'])
+ document['hosts'].append(key)
+ for blacklistedHost in self.black_list:
+ document['blacklistedHosts'].append(blacklistedHost)
self.index_documents('boxinfo_appliance',[document],bulk=False)
except Exception as ex:
#in case of malformed box info
@@ -238,8 +315,10 @@ def elasticize_eols(self,infile):
def index_documents(self,name,documents,bulk=True):
attempts=0
destination_index = ""
+ is_box=False
if name.startswith("boxinfo"):
destination_index = self.boxinfo_write
+ is_box=True
else:
destination_index = self.runindex_write
while True:
@@ -253,16 +332,18 @@ def index_documents(self,name,documents,bulk=True):
except ElasticHttpError as ex:
if attempts<=1:continue
self.logger.error('elasticsearch HTTP error. skipping document '+name)
+ if is_box==True:break
#self.logger.exception(ex)
return False
- except (ConnectionError,Timeout) as ex:
+ except (socket.gaierror,ConnectionError,Timeout) as ex:
if attempts>100 and self.runMode:
raise(ex)
self.logger.error('elasticsearch connection error. retry.')
+ if is_box==True:break
if self.stopping:return False
time.sleep(0.1)
- ip_url=getURLwithIP(self.es_server_url)
- self.es = ElasticSearch(ip_url)
+ ip_url=getURLwithIP(self.es_server_url,self.nsslock)
+ self.es = ElasticSearch(ip_url,timeout=20,revival_delay=60)
return False
@@ -290,7 +371,7 @@ def stop(self):
self.stoprequest.set()
def run(self):
- self.logger.info("Start main loop")
+ self.logger.info("elasticCollectorBU: start main loop (monitoring:"+self.inRunDir+")")
count = 0
while not (self.stoprequest.isSet() and self.emptyQueue.isSet()) :
if self.source:
@@ -300,16 +381,16 @@ def run(self):
self.infile = fileHandler(event.fullpath)
self.emptyQueue.clear()
if self.infile.filetype==EOR:
- if self.es:
- try:
- dt=os.path.getctime(event.fullpath)
- endtime = datetime.datetime.utcfromtimestamp(dt).isoformat()
- self.es.elasticize_runend_time(endtime)
- except Exception as ex:
- self.logger.warning(str(ex))
- endtime = datetime.datetime.utcnow().isoformat()
- self.es.elasticize_runend_time(endtime)
- break
+ if self.es:
+ try:
+ dt=os.path.getctime(event.fullpath)
+ endtime = datetime.datetime.utcfromtimestamp(dt).isoformat()
+ self.es.elasticize_runend_time(endtime)
+ except Exception as ex:
+ self.logger.warning(str(ex))
+ endtime = datetime.datetime.utcnow().isoformat()
+ self.es.elasticize_runend_time(endtime)
+ break
self.process()
except (KeyboardInterrupt,Queue.Empty) as e:
self.emptyQueue.set()
@@ -325,9 +406,9 @@ def run(self):
#if run dir deleted
if os.path.exists(self.inRunDir)==False:
self.logger.info("Exiting because run directory in has disappeared")
- #nevertheless put run end time
if self.es:
- endtime = datetime.datetime.utcnow().isoformat()
+ #write end timestamp in case EoR file was not seen
+ endtime = datetime.datetime.utcnow().isoformat()
self.es.elasticize_runend_time(endtime)
break
self.logger.info("Stop main loop (watching directory " + str(self.inRunDir) + ")")
@@ -374,7 +455,7 @@ def stop(self):
self.stoprequest.set()
def run(self):
- self.logger.info("Start main loop")
+ self.logger.info("elasticBoxCollectorBU: start main loop")
while not (self.stoprequest.isSet() and self.emptyQueue.isSet()) :
if self.source:
try:
@@ -391,7 +472,7 @@ def run(self):
self.logger.warning("IOError on reading "+event.fullpath)
else:
time.sleep(1.0)
- self.logger.info("Stop main loop")
+ self.logger.info("elasticBoxCollectorBU: stop main loop")
def setSource(self,source):
self.source = source
@@ -408,9 +489,12 @@ def process(self):
class BoxInfoUpdater(threading.Thread):
- def __init__(self,ramdisk):
+ def __init__(self,ramdisk,conf,nsslock):
self.logger = logging.getLogger(self.__class__.__name__)
self.stopping = False
+ self.es=None
+ self.conf=conf
+ self.nsslock=nsslock
try:
threading.Thread.__init__(self)
@@ -435,7 +519,7 @@ def __init__(self,ramdisk):
def run(self):
try:
- self.es = elasticBandBU(0,'',False)
+ self.es = elasticBandBU(self.conf,0,'',False,self.nsslock)
if self.stopping:return
self.ec = elasticBoxCollectorBU(self.es)
@@ -450,7 +534,7 @@ def stop(self):
try:
self.stopping=True
self.threadEvent.set()
- if self.es:
+ if self.es is not None:
self.es.stopping=True
self.es.threadEvent.set()
if self.mr is not None:
@@ -465,16 +549,19 @@ def stop(self):
class RunCompletedChecker(threading.Thread):
- def __init__(self,mode,nr,nresources,run_dir,active_runs,elastic_process):
+ def __init__(self,conf,mode,nr,nresources,run_dir,active_runs,active_runs_errors,elastic_process):
self.logger = logging.getLogger(self.__class__.__name__)
+ self.conf=conf
self.mode = mode
self.nr = nr
self.nresources = nresources
- self.rundirCheckPath = conf.watch_directory +'/run'+ str(nr).zfill(conf.run_number_padding)
+ rundir = 'run'+ str(nr).zfill(conf.run_number_padding)
+ self.rundirCheckPath = os.path.join(conf.watch_directory, rundir)
self.eorCheckPath = os.path.join(self.rundirCheckPath,'run' + str(nr).zfill(conf.run_number_padding) + '_ls0000_EoR.jsn')
- self.url = 'http://localhost:9200/run'+str(nr).zfill(conf.run_number_padding)+'*/fu-complete/_count'
- self.urlclose = 'http://localhost:9200/run'+str(nr).zfill(conf.run_number_padding)+'*/_close'
- self.urlsearch = 'http://localhost:9200/run'+str(nr).zfill(conf.run_number_padding)+'*/fu-complete/_search?size=1'
+ self.indexPrefix = 'run'+str(nr).zfill(conf.run_number_padding) + '_' + conf.elastic_cluster
+ self.url = 'http://'+conf.es_local+':9200/' + self.indexPrefix + '*/fu-complete/_count'
+ self.urlclose = 'http://'+conf.es_local+':9200/' + self.indexPrefix + '*/_close'
+ self.urlsearch = 'http://'+conf.es_local+':9200/' + self.indexPrefix + '*/fu-complete/_search?size=1'
self.url_query = '{ "query": { "filtered": {"query": {"match_all": {}}}}, "sort": { "fm_date": { "order": "desc" }}}'
@@ -482,6 +569,7 @@ def __init__(self,mode,nr,nresources,run_dir,active_runs,elastic_process):
self.threadEvent = threading.Event()
self.run_dir = run_dir
self.active_runs = active_runs
+ self.active_runs_errors = active_runs_errors
self.elastic_process=elastic_process
try:
threading.Thread.__init__(self)
@@ -492,7 +580,6 @@ def __init__(self,mode,nr,nresources,run_dir,active_runs,elastic_process):
def checkBoxes(self,dir):
-
files = os.listdir(dir)
endAllowed=True
runFound=False
@@ -553,7 +640,7 @@ def run(self):
if os.path.exists(self.eorCheckPath) or os.path.exists(self.rundirCheckPath)==False:
break
- dir = conf.resource_base+'/boxes/'
+ dir = self.conf.resource_base+'/boxes/'
check_boxes=True
check_es_complete=True
total_es_elapsed=0
@@ -563,9 +650,14 @@ def run(self):
check_boxes = self.checkBoxes(dir)
if check_boxes==False:
+ try:
+ self.active_runs_errors.pop(self.active_runs.index(int(self.nr)))
+ except:
+ pass
try:
self.active_runs.remove(int(self.nr))
- except:pass
+ except:
+ pass
if check_es_complete:
try:
@@ -578,29 +670,21 @@ def run(self):
fm_time = str(dataq['hits']['hits'][0]['_source']['fm_date'])
#fill in central index completition time
postq = "{runNumber\":\"" + str(self.nr) + "\",\"completedTime\" : \"" + fm_time + "\"}"
- requests.post(conf.elastic_runindex_url+'/'+"runindex_"+conf.elastic_runindex_name+'_write/run',postq,timeout=5)
- self.logger.info("filled in completition time for run"+str(self.nr))
+ requests.post(self.conf.elastic_runindex_url+'/'+"runindex_"+self.conf.elastic_runindex_name+'_write/run',postq,timeout=5)
+ self.logger.info("filled in completition time for run "+str(self.nr))
except IndexError:
# 0 FU resources present in this run, skip writing completition time
pass
except Exception as ex:
self.logger.exception(ex)
- try:
- if conf.close_es_index==True:
- #wait a bit for central ES queries to complete
- time.sleep(10)
- resp = requests.post(self.urlclose,timeout=5)
- self.logger.info('closed appliance ES index for run '+str(self.nr))
- except Exception as exc:
- self.logger.error('Error in run completition check')
- self.logger.exception(exc)
check_es_complete=False
continue
else:
+ #TODO:do this only using active runs
time.sleep(5)
total_es_elapsed+=5
if total_es_elapsed>600:
- self.logger.error('run index complete flag was not written by all FUs, giving up after 10 minutes.')
+ self.logger.warning('run index complete flag was not written by all FUs, giving up checks after 10 minutes.')
check_es_complete=False
continue
except Exception,ex:
@@ -609,7 +693,17 @@ def run(self):
check_es_complete=False
#exit if both checks are complete
- if check_boxes==False and check_es_complete==False:break
+ if check_boxes==False and check_es_complete==False:
+ try:
+ if self.conf.close_es_index==True:
+ #wait a bit for queries to complete
+ time.sleep(10)
+ resp = requests.post(self.urlclose,timeout=5)
+ self.logger.info('closed appliance ES index for run '+str(self.nr))
+ except Exception as exc:
+ self.logger.error('Error in closing run index')
+ self.logger.exception(exc)
+ break
#check every 10 seconds
self.threadEvent.wait(10)
@@ -622,10 +716,15 @@ def stop(self):
self.threadEvent.set()
-
if __name__ == "__main__":
+
+ import procname
+ procname.setprocname('elasticbu')
+
+ conf=initConf(sys.argv[1])
+
logging.basicConfig(filename=os.path.join(conf.log_dir,"elasticbu.log"),
- level=logging.INFO,
+ level=conf.service_log_level,
format='%(levelname)s:%(asctime)s - %(funcName)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(os.path.basename(__file__))
@@ -636,9 +735,8 @@ def stop(self):
eventQueue = Queue.Queue()
- runnumber = sys.argv[1]
+ runnumber = sys.argv[2]
watchdir = conf.watch_directory
-
mainDir = os.path.join(watchdir,'run'+ runnumber.zfill(conf.run_number_padding))
dt=os.path.getctime(mainDir)
startTime = datetime.datetime.utcfromtimestamp(dt).isoformat()
@@ -668,7 +766,7 @@ def stop(self):
mr.start_inotify()
- es = elasticBandBU(runnumber,startTime)
+ es = elasticBandBU(conf,runnumber,startTime)
#starting elasticCollector thread
ec = elasticCollectorBU(es,mainDir)
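For reference, a minimal sketch of the kind of REST calls RunCompletedChecker issues against the per-run index whose URLs are built above; the host, index prefix and resource count are placeholders, and the actual polling loop lives in the unchanged body of run(), so this is only an illustration, not the shipped code.

    import json
    import time
    import requests

    es_host = 'localhost'                  # assumption: value of conf.es_local
    index_prefix = 'run000001_escluster'   # assumption: 'run' + padded number + '_' + conf.elastic_cluster
    count_url = 'http://' + es_host + ':9200/' + index_prefix + '*/fu-complete/_count'
    close_url = 'http://' + es_host + ':9200/' + index_prefix + '*/_close'
    nresources = 16                        # expected number of FU resources reporting completion

    elapsed = 0
    while elapsed <= 600:
        # count how many FUs have written their fu-complete document
        if json.loads(requests.get(count_url, timeout=5).content)['count'] >= nresources:
            break
        time.sleep(5)
        elapsed += 5

    # with close_es_index = True, the per-run appliance index is closed at the end
    requests.post(close_url, timeout=5)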
diff --git a/python/fillresources.py b/python/fillresources.py
index 902c548..cc3c7d1 100755
--- a/python/fillresources.py
+++ b/python/fillresources.py
@@ -3,6 +3,18 @@
import os
import shutil
import hltdconf
+import time
+
+def clearDir(dir):
+ try:
+ files = os.listdir(dir)
+ for file in files:
+ try:
+ os.unlink(os.path.join(dir,file))
+ except:
+ pass
+ except:
+ pass
conf=hltdconf.hltdConf('/etc/hltd.conf')
@@ -13,26 +25,14 @@
elif 'fu' in os.uname()[1]: role='fu'
else: role = conf.role
-if role=='fu' and conf.dqm_machine=="False":
-
- try:
- shutil.rmtree('/etc/appliance/online/*')
- except:
- pass
- try:
- shutil.rmtree('/etc/appliance/offline/*')
- except:
- pass
- try:
- shutil.rmtree('/etc/appliance/except/*')
- except:
- pass
- try:
- shutil.rmtree('/etc/appliance/quarantined/*')
- except:
- pass
-
+if role=='fu' and not conf.dqm_machine:
+ clearDir(conf.resource_base+'/idle')
+ clearDir(conf.resource_base+'/online')
+ clearDir(conf.resource_base+'/except')
+ clearDir(conf.resource_base+'/quarantined')
+ clearDir(conf.resource_base+'/cloud')
+
fp=open('/proc/cpuinfo','r')
resource_count = 0
for line in fp:
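The remainder of fillresources.py is untouched by this patch; for orientation, a rough sketch of what the core-counting part amounts to, assuming (as the idle/online/except/cloud handling elsewhere in hltd suggests) that each core ends up as one marker file under conf.resource_base/idle. The file naming below is purely hypothetical.

    import os
    import hltdconf

    conf = hltdconf.hltdConf('/etc/hltd.conf')

    # count logical cores listed in /proc/cpuinfo
    resource_count = 0
    with open('/proc/cpuinfo', 'r') as fp:
        for line in fp:
            if line.startswith('processor'):
                resource_count += 1

    # hypothetical naming: one empty marker file per core in the idle pool
    idle_dir = os.path.join(conf.resource_base, 'idle')
    for i in range(resource_count):
        open(os.path.join(idle_dir, 'core%d' % i), 'a').close()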
diff --git a/python/genTestFakeBu_cfg.py b/python/genTestFakeBu_cfg.py
index 39424da..f1963f5 100644
--- a/python/genTestFakeBu_cfg.py
+++ b/python/genTestFakeBu_cfg.py
@@ -50,7 +50,7 @@
process.source = cms.Source("EmptySource",
firstRun= cms.untracked.uint32(options.runNumber),
- numberEventsInLuminosityBlock = cms.untracked.uint32(500),
+ numberEventsInLuminosityBlock = cms.untracked.uint32(200),
numberEventsInRun = cms.untracked.uint32(0)
)
@@ -79,7 +79,7 @@
process.out = cms.OutputModule("RawStreamFileWriterForBU",
ProductLabel = cms.untracked.string("s"),
- numEventsPerFile = cms.untracked.uint32(100),
+ numEventsPerFile = cms.untracked.uint32(50),
jsonDefLocation = cms.untracked.string(cmsswbase+"/src/EventFilter/Utilities/plugins/budef.jsd"),
debug = cms.untracked.bool(True)
)
diff --git a/python/hltd b/python/hltd
index edaedd4..6b125e5 100755
--- a/python/hltd
+++ b/python/hltd
@@ -13,35 +13,47 @@ from applianceumount import checkMode
import time
import syslog
-def touchLockFile():
- try:
- with open('/var/lock/subsys/hltd',"w+") as fi:
- pass
- except:
- pass
-
-def removeLockFile():
- try:
- os.unlink('/var/lock/subsys/hltd')
- except:
- pass
+
+def startService(daemon,srvInstance):
+ daemon.touchLockFile()
+ proc = Popen(["/opt/hltd/python/hltd.py",srvInstance], stdout=PIPE)
+ output = proc.communicate()[0]
+ time.sleep(.1)
+ if daemon.silentStatus() and proc.returncode==0:
+ print 'Starting hltd instance',srvInstance,':\t\t\t\t [ \033[1;32mOK\033[0;39m ]'
+
+ daemon.touchLockFile()
+ else:
+ if proc.returncode==3:sys.exit(0)
+ print 'Starting hltd instance',srvInstance,':\t\t\t\t [ \033[1;31mFAILED\033[0;39m ]'
+ print output
+ sys.exit(1)
if __name__ == "__main__":
- daemon = hltd('/var/run/hltd.pid')
- if len(sys.argv) == 2:
+
+ if len(sys.argv) <=2 or sys.argv[2]=="all":
+ try:
+ instances=[]
+ with open('/etc/hltd.instances','r') as fi:
+ for line in fi.readlines():
+ lnstrip = line.strip(' \n')
+ if len(lnstrip)>0 and lnstrip.startswith("#")==False:
+ instances.append(lnstrip)
+ except:
+ instances = ["main"]
+ else:
+ instances = [sys.argv[2]]
+
+ for instance in instances:
+ daemon = hltd(instance)
+
+ if len(sys.argv) >= 2:
if 'start' == sys.argv[1]:
- touchLockFile()
- output = Popen(["/opt/hltd/python/hltd.py"], stdout=PIPE).communicate()[0]
- if daemon.silentStatus():
- print '[OK]'
- else:
- print '[Failed]'
- print output
+ startService(daemon,instance)
+
elif 'stop' == sys.argv[1]:
- if daemon.status():
- daemon.stop()
- elif os.path.exists('/var/run/hltd.pid'):
- daemon.delpid()
+ sys.stdout.write('Stopping hltd instance '+instance+':')
+ daemon.stop()
#determine runlevel
std_out=""
@@ -52,58 +64,55 @@ if __name__ == "__main__":
from_level = std_out.split('\t')[0].rstrip('\n').strip().split(' ')[0]
to_level = std_out.split('\t')[0].rstrip('\n').strip().split(' ')[1]
if to_level.isdigit() and int(to_level) in [0,1,6] and str(from_level)!="1":
-
- if stopFUs()==False:
+
+ if stopFUs(instance)==False:
msg = "Shutdown or reboot is cancelled by hltd - FU umount failed! Switching to runlevel 3..."
- syslog.syslog(msg)
+ syslog.syslog("hltd-"+str(instance)+":"+msg)
time.sleep(2)
p = Popen("init 3", shell=True, stdout=PIPE)
p.wait()
else:
- removeLockFile()
+ daemon.removeLockFile()
else:
- if checkMode()=="fu":
- removeLockFile()
+ if checkMode(instance)=="fu":
+ daemon.removeLockFile()
else:
print "Lock file remains. Run stop-appliance to unmount FUs."
except:
print "Runlevel:",std_out
- syslog.syslog("Exception when determining runlevel:"+str(std_out))
-
+ syslog.syslog("hltd-"+str(instance)+":Exception when determining runlevel:"+str(std_out))
+
elif 'stop-appliance' == sys.argv[1]:
- if daemon.status():
- daemon.stop()
- elif os.path.exists('/var/run/hltd.pid'):
- daemon.delpid()
-
- if checkMode()=="fu":
- print "This command is not supported on FU."
-
- elif stopFUs()==False:
- print "FU umount failed, lock file remains. FU umount failed."
+ sys.stdout.write('Stopping hltd instance '+instance+':')
+ daemon.stop()
+
+ if checkMode(instance)=="fu":
+ print "This command is not supported on FU. Performed only service stop."
+
+ elif stopFUs(instance)==False:
+ print "FU umount failed, lock file remains."
else:
- removeLockFile()
+ daemon.removeLockFile()
elif 'stop-light' == sys.argv[1]:
- if daemon.status():
- daemon.stop()
- elif os.path.exists('/var/run/hltd.pid'):
- daemon.delpid()
- removeLockFile()
-
+ sys.stdout.write('Stopping hltd instance '+instance+':')
+ daemon.stop()
+ daemon.removeLockFile()
+
elif 'restart' == sys.argv[1]:
- daemon.restart()
- touchLockFile()
+ sys.stdout.write('Stopping hltd instance '+instance+':')
+ daemon.stop()
+ startService(daemon,instance)
+
elif 'status' == sys.argv[1]:
daemon.status()
else:
print "Unknown command"
sys.exit(2)
-# print "hltd "+sys.argv[1]+"ed"
-# logging.debug("executed "+sys.argv[1])
- sys.exit(0)
else:
- print "usage: %s start|stop|stop-light|restart|status" % sys.argv[0]
+ print "usage: %s start|stop|stop-light|restart|status |all|main|instance" % sys.argv[0]
sys.exit(2)
+
+sys.exit(0)
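To make the multi-instance init handling concrete: the script reads /etc/hltd.instances, skipping blank lines and lines starting with '#', and falls back to a single "main" instance when the file is missing. A hypothetical instances file and a few corresponding invocations follow; the instance names are made up and the direct script path is shown only for illustration.

    # /etc/hltd.instances
    main
    # extra instance used for tests
    testinstance

    /opt/hltd/python/hltd start all            # start every listed instance
    /opt/hltd/python/hltd stop main            # stop only the 'main' instance
    /opt/hltd/python/hltd restart testinstance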
diff --git a/python/hltd.py b/python/hltd.py
index 3a1ea6f..5adbb70 100755
--- a/python/hltd.py
+++ b/python/hltd.py
@@ -9,10 +9,9 @@
import subprocess
from signal import SIGKILL
from signal import SIGINT
-import json
+import simplejson as json
#import SOAPpy
import threading
-import fcntl
import CGIHTTPServer
import BaseHTTPServer
import cgitb
@@ -21,6 +20,8 @@
import re
import shutil
import socket
+#import fcntl
+#import random
#modules distributed with hltd
import prctl
@@ -34,26 +35,63 @@
from elasticbu import BoxInfoUpdater
from elasticbu import RunCompletedChecker
-idles = conf.resource_base+'/idle/'
-used = conf.resource_base+'/online/'
-broken = conf.resource_base+'/except/'
-quarantined = conf.resource_base+'/quarantined/'
+from aUtils import fileHandler
+
nthreads = None
nstreams = None
expected_processes = None
run_list=[]
+runs_pending_shutdown=[]
bu_disk_list_ramdisk=[]
bu_disk_list_output=[]
+bu_disk_list_ramdisk_instance=[]
+bu_disk_list_output_instance=[]
active_runs=[]
+active_runs_errors=[]
resource_lock = threading.Lock()
+nsslock = threading.Lock()
suspended=False
+entering_cloud_mode=False
+cloud_mode=False
+
+ramdisk_submount_size=0
+machine_blacklist=[]
+boxinfoFUMap = {}
+
+logCollector = None
+
+def setFromConf(myinstance):
+
+ global conf
+ global logger
+ global idles
+ global used
+ global broken
+ global quarantined
+ global cloud
+
+ conf=initConf(myinstance)
-logging.basicConfig(filename=os.path.join(conf.log_dir,"hltd.log"),
+ idles = conf.resource_base+'/idle/'
+ used = conf.resource_base+'/online/'
+ broken = conf.resource_base+'/except/'
+ quarantined = conf.resource_base+'/quarantined/'
+ cloud = conf.resource_base+'/cloud/'
+
+ #prepare log directory
+ if myinstance!='main':
+ if not os.path.exists(conf.log_dir): os.makedirs(conf.log_dir)
+ if not os.path.exists(os.path.join(conf.log_dir,'pid')): os.makedirs(os.path.join(conf.log_dir,'pid'))
+ os.chmod(conf.log_dir,0777)
+ os.chmod(os.path.join(conf.log_dir,'pid'),0777)
+
+ logging.basicConfig(filename=os.path.join(conf.log_dir,"hltd.log"),
level=conf.service_log_level,
format='%(levelname)s:%(asctime)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S')
+ logger = logging.getLogger(os.path.basename(__file__))
+ conf.dump()
-conf.dump()
def preexec_function():
dem = demote.demote(conf.user)
@@ -62,35 +100,72 @@ def preexec_function():
# os.setpgrp()
def cleanup_resources():
+ try:
+ dirlist = os.listdir(cloud)
+ for cpu in dirlist:
+ os.rename(cloud+cpu,idles+cpu)
+ dirlist = os.listdir(broken)
+ for cpu in dirlist:
+ os.rename(broken+cpu,idles+cpu)
+ dirlist = os.listdir(used)
+ for cpu in dirlist:
+ os.rename(used+cpu,idles+cpu)
+ dirlist = os.listdir(quarantined)
+ for cpu in dirlist:
+ os.rename(quarantined+cpu,idles+cpu)
+ dirlist = os.listdir(idles)
+ #quarantine files beyond use fraction limit (rounded to closest integer)
+ num_excluded = round(len(dirlist)*(1.-conf.resource_use_fraction))
+ for i in range(0,int(num_excluded)):
+ os.rename(idles+dirlist[i],quarantined+dirlist[i])
+ return True
+ except Exception as ex:
+ logger.warning(str(ex))
+ return False
+def move_resources_to_cloud():
dirlist = os.listdir(broken)
for cpu in dirlist:
- os.rename(broken+cpu,idles+cpu)
+ os.rename(broken+cpu,cloud+cpu)
dirlist = os.listdir(used)
for cpu in dirlist:
- os.rename(used+cpu,idles+cpu)
+ os.rename(used+cpu,cloud+cpu)
dirlist = os.listdir(quarantined)
for cpu in dirlist:
- os.rename(quarantined+cpu,idles+cpu)
+ os.rename(quarantined+cpu,cloud+cpu)
dirlist = os.listdir(idles)
- #quarantine files beyond use fraction limit (rounded to closest integer)
- num_excluded = round(len(dirlist)*(1.-conf.resource_use_fraction))
- for i in range(0,int(num_excluded)):
- os.rename(idles+dirlist[i],quarantined+dirlist[i])
+ for cpu in dirlist:
+ os.rename(idles+cpu,cloud+cpu)
+ dirlist = os.listdir(idles)
+ for cpu in dirlist:
+ os.rename(idles+cpu,cloud+cpu)
+
def cleanup_mountpoints(remount=True):
- bu_disk_list_ramdisk[:] = []
- bu_disk_list_output[:] = []
+
+ global bu_disk_list_ramdisk
+ global bu_disk_list_ramdisk_instance
+ global bu_disk_list_output
+ global bu_disk_list_output_instance
+
+ bu_disk_list_ramdisk = []
+ bu_disk_list_output = []
+ bu_disk_list_ramdisk_instance = []
+ bu_disk_list_output_instance = []
+
if conf.bu_base_dir[0] == '/':
- bu_disk_list_ramdisk[:] = [os.path.join(conf.bu_base_dir,conf.ramdisk_subdirectory)]
- bu_disk_list_output[:] = [os.path.join(conf.bu_base_dir,conf.output_subdirectory)]
+ bu_disk_list_ramdisk = [os.path.join(conf.bu_base_dir,conf.ramdisk_subdirectory)]
+ bu_disk_list_output = [os.path.join(conf.bu_base_dir,conf.output_subdirectory)]
+ if conf.instance=="main":
+ bu_disk_list_ramdisk_instance = bu_disk_list_ramdisk
+ bu_disk_list_output_instance = bu_disk_list_output
+ else:
+ bu_disk_list_ramdisk_instance = [os.path.join(bu_disk_list_ramdisk[0],conf.instance)]
+ bu_disk_list_output_instance = [os.path.join(bu_disk_list_output[0],conf.instance)]
+
#make subdirectories if necessary and return
if remount==True:
- try:
- os.makedirs(conf.bu_base_dir)
- except OSError:
- pass
try:
os.makedirs(os.path.join(conf.bu_base_dir,conf.ramdisk_subdirectory))
except OSError:
@@ -104,59 +179,63 @@ def cleanup_mountpoints(remount=True):
process = subprocess.Popen(['mount'],stdout=subprocess.PIPE)
out = process.communicate()[0]
mounts = re.findall('/'+conf.bu_base_dir+'[0-9]+',out)
- mounts = list(set(mounts))
- #if len(mounts)>1 and mounts[0]==mounts[1]: mounts=[mounts[0]]
- logging.info("cleanup_mountpoints: found following mount points ")
- logging.info(mounts)
+ mounts = sorted(list(set(mounts)))
+ logger.info("cleanup_mountpoints: found following mount points: ")
+ logger.info(mounts)
umount_failure=False
for point in mounts:
- logging.info("trying umount of "+point)
+
try:
+ #try to unmount old style mountpoint(ok if fails)
subprocess.check_call(['umount','/'+point])
- except subprocess.CalledProcessError, err1:
- pass
- except Exception as ex:
- logging.exception(ex)
+ except:pass
try:
subprocess.check_call(['umount',os.path.join('/'+point,conf.ramdisk_subdirectory)])
except subprocess.CalledProcessError, err1:
- logging.error("Error calling umount in cleanup_mountpoints")
- logging.error(str(err1.returncode))
- umount_failure=True
+ logger.info("trying to kill users of ramdisk")
+ try:
+ subprocess.check_call(['fuser','-km',os.path.join('/'+point,conf.ramdisk_subdirectory)])
+ except subprocess.CalledProcessError, err2:
+ logger.error("Error calling umount in cleanup_mountpoints (ramdisk), return code:"+str(err2.returncode))
+ try:
+ subprocess.check_call(['umount',os.path.join('/'+point,conf.ramdisk_subdirectory)])
+ except subprocess.CalledProcessError, err2:
+ logger.error("Error calling umount in cleanup_mountpoints (ramdisk), return code:"+str(err2.returncode))
+ umount_failure=True
try:
subprocess.check_call(['umount',os.path.join('/'+point,conf.output_subdirectory)])
except subprocess.CalledProcessError, err1:
- logging.error("Error calling umount in cleanup_mountpoints")
- logging.error(str(err1.returncode))
- umount_failure=True
- #this will remove directories only if they are empty (as unomunted mount point should be)
+ logger.info("trying to kill users of output")
+ try:
+ subprocess.check_call(['fuser','-km',os.path.join('/'+point,conf.output_subdirectory)])
+ except subprocess.CalledProcessError, err2:
+ logger.error("Error calling umount in cleanup_mountpoints (output), return code:"+str(err2.returncode))
+ try:
+ subprocess.check_call(['umount',os.path.join('/'+point,conf.output_subdirectory)])
+ except subprocess.CalledProcessError, err2:
+ logger.error("Error calling umount in cleanup_mountpoints (output), return code:"+str(err2.returncode))
+ umount_failure=True
+
+ #this will remove directories only if they are empty (as unmounted mount point should be)
try:
if os.path.join('/'+point,conf.ramdisk_subdirectory)!='/':
os.rmdir(os.path.join('/'+point,conf.ramdisk_subdirectory))
except Exception as ex:
- logging.exception(ex)
+ logger.exception(ex)
try:
if os.path.join('/'+point,conf.output_subdirectory)!='/':
os.rmdir(os.path.join('/'+point,conf.output_subdirectory))
except Exception as ex:
- logging.exception(ex)
- try:
- if os.path.join('/',point)!='/':
- os.rmdir('/'+point)
- except Exception as ex:
- logging.exception(ex)
+ logger.exception(ex)
if remount==False:
if umount_failure:return False
return True
i = 0
bus_config = os.path.join(os.path.dirname(conf.resource_base.rstrip(os.path.sep)),'bus.config')
if os.path.exists(bus_config):
+ busconfig_age = os.path.getmtime(bus_config)
for line in open(bus_config):
- logging.info("found BU to mount at "+line.strip())
- try:
- os.makedirs('/'+conf.bu_base_dir+str(i))
- except OSError:
- pass
+ logger.info("found BU to mount at "+line.strip())
try:
os.makedirs(os.path.join('/'+conf.bu_base_dir+str(i),conf.ramdisk_subdirectory))
except OSError:
@@ -174,16 +253,20 @@ def cleanup_mountpoints(remount=True):
break
else:
p_end = datetime.datetime.now()
- logging.warn('unable to ping '+line.strip())
+ logger.warn('unable to ping '+line.strip())
dt = p_end - p_begin
if dt.seconds < 10:
time.sleep(10-dt.seconds)
attemptsLeft-=1
- if attemptsLeft==0:
- logging.fatal('hltd was unable to ping BU '+line.strip())
- sys.exit(1)
- else:
- logging.info("trying to mount "+line.strip()+':/fff/'+conf.ramdisk_subdirectory+' '+os.path.join('/'+conf.bu_base_dir+str(i),conf.ramdisk_subdirectory))
+ if attemptsLeft==0:
+ logger.fatal('hltd was unable to ping BU '+line.strip())
+ #check if bus.config has been updated
+ if (os.path.getmtime(bus_config) - busconfig_age)>1:
+ return cleanup_mountpoints(remount)
+ attemptsLeft=8
+ #sys.exit(1)
+ if True:
+ logger.info("trying to mount "+line.strip()+':/fff/'+conf.ramdisk_subdirectory+' '+os.path.join('/'+conf.bu_base_dir+str(i),conf.ramdisk_subdirectory))
try:
subprocess.check_call(
[conf.mount_command,
@@ -194,13 +277,18 @@ def cleanup_mountpoints(remount=True):
line.strip()+':/fff/'+conf.ramdisk_subdirectory,
os.path.join('/'+conf.bu_base_dir+str(i),conf.ramdisk_subdirectory)]
)
- bu_disk_list_ramdisk.append(os.path.join('/'+conf.bu_base_dir+str(i),conf.ramdisk_subdirectory))
+ toappend = os.path.join('/'+conf.bu_base_dir+str(i),conf.ramdisk_subdirectory)
+ bu_disk_list_ramdisk.append(toappend)
+ if conf.instance=="main":
+ bu_disk_list_ramdisk_instance.append(toappend)
+ else:
+ bu_disk_list_ramdisk_instance.append(os.path.join(toappend,conf.instance))
except subprocess.CalledProcessError, err2:
- logging.exception(err2)
- logging.fatal("Unable to mount ramdisk - exiting.")
+ logger.exception(err2)
+ logger.fatal("Unable to mount ramdisk - exiting.")
sys.exit(1)
- logging.info("trying to mount "+line.strip()+':/fff/'+conf.output_subdirectory+' '+os.path.join('/'+conf.bu_base_dir+str(i),conf.output_subdirectory))
+ logger.info("trying to mount "+line.strip()+':/fff/'+conf.output_subdirectory+' '+os.path.join('/'+conf.bu_base_dir+str(i),conf.output_subdirectory))
try:
subprocess.check_call(
[conf.mount_command,
@@ -211,26 +299,49 @@ def cleanup_mountpoints(remount=True):
line.strip()+':/fff/'+conf.output_subdirectory,
os.path.join('/'+conf.bu_base_dir+str(i),conf.output_subdirectory)]
)
- bu_disk_list_output.append(os.path.join('/'+conf.bu_base_dir+str(i),conf.output_subdirectory))
+ toappend = os.path.join('/'+conf.bu_base_dir+str(i),conf.output_subdirectory)
+ bu_disk_list_output.append(toappend)
+ if conf.instance=="main" or conf.instance_same_destination==True:
+ bu_disk_list_output_instance.append(toappend)
+ else:
+ bu_disk_list_output_instance.append(os.path.join(toappend,conf.instance))
except subprocess.CalledProcessError, err2:
- logging.exception(err2)
- logging.fatal("Unable to mount output - exiting.")
+ logger.exception(err2)
+ logger.fatal("Unable to mount output - exiting.")
sys.exit(1)
-
i+=1
#clean up suspended state
try:
- if remount==True:os.unlink(conf.watch_directory+'/suspend')
+ if remount==True:os.popen('rm -rf '+conf.watch_directory+'/suspend*')
except:pass
except Exception as ex:
- logging.error("Exception in cleanup_mountpoints")
- logging.exception(ex)
+ logger.error("Exception in cleanup_mountpoints")
+ logger.exception(ex)
if remount==True:
- logging.fatal("Unable to handle (un)mounting")
+ logger.fatal("Unable to handle (un)mounting")
return False
else:return False
+def submount_size(basedir):
+ loop_size=0
+ try:
+ p = subprocess.Popen("mount", shell=False, stdout=subprocess.PIPE)
+ p.wait()
+ std_out=p.stdout.read().split("\n")
+ for l in std_out:
+ try:
+ ls = l.strip()
+ toks = ls.split()
+ if toks[0].startswith(basedir) and toks[2].startswith(basedir) and 'loop' in toks[5]:
+ imgstat = os.stat(toks[0])
+ imgsize = imgstat.st_size
+ loop_size+=imgsize
+ except:pass
+ except:pass
+ return loop_size
+
+
def calculate_threadnumber():
global nthreads
global nstreams
@@ -240,12 +351,58 @@ def calculate_threadnumber():
nthreads = idlecount/conf.cmssw_threads_autosplit
nstreams = idlecount/conf.cmssw_threads_autosplit
if nthreads*conf.cmssw_threads_autosplit != nthreads:
- logging.error("idle cores can not be evenly split to cmssw threads")
+ logger.error("idle cores can not be evenly split to cmssw threads")
else:
nthreads = conf.cmssw_threads
- nstreams = conf.cmssw_threads
+ nstreams = conf.cmssw_streams
expected_processes = idlecount/nstreams
+
+def updateBlacklist():
+ black_list=[]
+ active_black_list=[]
+ #TODO:this will be updated to read blacklist from database
+ if conf.role=='bu':
+ try:
+ if os.stat('/etc/appliance/blacklist').st_size>0:
+ with open('/etc/appliance/blacklist','r') as fi:
+ try:
+ static_black_list = json.load(fi)
+ for item in static_black_list:
+ black_list.append(item)
+ logger.info("found these resources in /etc/appliance/blacklist: "+str(black_list))
+ except ValueError:
+ logger.error("error parsing /etc/appliance/blacklist")
+ except:
+ #no blacklist file, this is ok
+ pass
+ black_list=list(set(black_list))
+ try:
+ forceUpdate=False
+ with open(os.path.join(conf.watch_directory,'appliance','blacklist'),'r') as fi:
+ active_black_list = json.load(fi)
+ except:
+ forceUpdate=True
+ if forceUpdate==True or active_black_list != black_list:
+ try:
+ with open(os.path.join(conf.watch_directory,'appliance','blacklist'),'w') as fi:
+ json.dump(black_list,fi)
+ except:
+ return False,black_list
+ #TODO:check on FU if blacklisted
+ return True,black_list
+
+def restartLogCollector(instanceParam):
+ global logCollector
+ if logCollector!=None:
+ logger.info("terminating logCollector")
+ logCollector.terminate()
+ logCollector = None
+ logger.info("starting logcollector.py")
+ logcollector_args = ['/opt/hltd/python/logcollector.py']
+ logcollector_args.append(instanceParam)
+ logCollector = subprocess.Popen(logcollector_args,preexec_fn=preexec_function,close_fds=True)
+
class system_monitor(threading.Thread):
def __init__(self):
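updateBlacklist above expects /etc/appliance/blacklist to contain a JSON list; a minimal example with made-up hostnames, which would be merged and republished to <watch_directory>/appliance/blacklist:

    ["fu-example-01", "fu-example-02"]

On the BU, AcquireResources later skips any box whose name appears in this list.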
@@ -259,65 +416,152 @@ def __init__(self):
def rehash(self):
if conf.role == 'fu':
- self.directory = ['/'+x+'/appliance/boxes/' for x in bu_disk_list_ramdisk]
+ self.directory = [os.path.join(bu_disk_list_ramdisk_instance[0],'appliance','boxes')]
+ #self.directory = ['/'+x+'/appliance/boxes/' for x in bu_disk_list_ramdisk_instance]
+ #write only in one location
else:
- self.directory = [conf.watch_directory+'/appliance/boxes/']
- self.file = [x+self.hostname for x in self.directory]
- for dir in self.directory:
+ self.directory = [os.path.join(conf.watch_directory,'appliance/boxes/')]
try:
- os.makedirs(dir)
+ #if directory does not exist: check if it is renamed to specific name (non-main instance)
+ if not os.path.exists(self.directory[0]) and conf.instance=="main":
+ os.makedirs(self.directory[0])
except OSError:
pass
- logging.info("system_monitor: rehash found the following BU disks")
+
+ self.file = [os.path.join(x,self.hostname) for x in self.directory]
+
+ logger.info("system_monitor: rehash found the following BU disk(s):"+str(self.file))
for disk in self.file:
- logging.info(disk)
+ logger.info(disk)
def run(self):
try:
- logging.debug('entered system monitor thread ')
+ logger.debug('entered system monitor thread ')
global suspended
+ global ramdisk_submount_size
+ res_path_temp = os.path.join(conf.watch_directory,'appliance','resource_summary_temp')
+ res_path = os.path.join(conf.watch_directory,'appliance','resource_summary')
+ selfhost = os.uname()[1]
+ counter=0
while self.running:
-# logging.info('system monitor - running '+str(self.running))
- self.threadEvent.wait(5)
+ self.threadEvent.wait(5 if counter>0 else 1)
+ counter+=1
+ counter=counter%5
if suspended:continue
tstring = datetime.datetime.utcfromtimestamp(time.time()).isoformat()
- fp = None
+ ramdisk = None
+ if conf.role == 'bu':
+ ramdisk = os.statvfs(conf.watch_directory)
+ ramdisk_occ=1
+ try:ramdisk_occ = float((ramdisk.f_blocks - ramdisk.f_bavail)*ramdisk.f_bsize - ramdisk_submount_size)/float(ramdisk.f_blocks*ramdisk.f_bsize - ramdisk_submount_size)
+ except:pass
+ if ramdisk_occ<0:
+ ramdisk_occ=0
+ logger.info('incorrect ramdisk occupancy: '+str(ramdisk_occ))
+ if ramdisk_occ>1:
+ ramdisk_occ=1
+ logger.info('incorrect ramdisk occupancy: '+str(ramdisk_occ))
+
+ resource_count_idle = 0
+ resource_count_used = 0
+ resource_count_broken = 0
+ cloud_count = 0
+ lastFURuns = []
+ lastFURun=-1
+ activeRunQueuedLumisNum = -1
+ current_time = time.time()
+ for key in boxinfoFUMap:
+ if key==selfhost:continue
+ entry = boxinfoFUMap[key]
+ if current_time - entry[1] > 10:continue
+ resource_count_idle+=int(entry[0]['idles'])
+ resource_count_used+=int(entry[0]['used'])
+ resource_count_broken+=int(entry[0]['broken'])
+ cloud_count+=int(entry[0]['cloud'])
+ try:
+ lastFURuns.append(int(entry[0]['activeRuns'].strip('[]').split(',')[-1]))
+ except:pass
+ fuRuns = sorted(list(set(lastFURuns)))
+ if len(fuRuns)>0:
+ lastFURun = fuRuns[-1]
+ #second pass
+ for key in boxinfoFUMap:
+ if key==selfhost:continue
+ entry = boxinfoFUMap[key]
+ if current_time - entry[1] > 10:continue
+ try:
+ lastrun = int(entry[0]['activeRuns'].strip('[]').split(',')[-1])
+ if lastrun==lastFURun:
+ qlumis = int(entry[0]['activeRunNumQueuedLS'])
+ if qlumis>activeRunQueuedLumisNum:activeRunQueuedLumisNum=qlumis
+ except:pass
+ res_doc = {
+ "active_resources":resource_count_idle+resource_count_used,
+ "idle":resource_count_idle,
+ "used":resource_count_used,
+ "broken":resource_count_broken,
+ "cloud":cloud_count,
+ "activeFURun":lastFURun,
+ "activeRunNumQueuedLS":activeRunQueuedLumisNum,
+ "ramdisk_occupancy":ramdisk_occ
+ }
+ with open(res_path_temp,'w') as fp:
+ json.dump(res_doc,fp)
+ os.rename(res_path_temp,res_path)
+
for mfile in self.file:
if conf.role == 'fu':
dirstat = os.statvfs(conf.watch_directory)
- fp=open(mfile,'w+')
- fp.write('fm_date='+tstring+'\n')
- fp.write('idles='+str(len(os.listdir(idles)))+'\n')
- fp.write('used='+str(len(os.listdir(used)))+'\n')
- fp.write('broken='+str(len(os.listdir(broken)))+'\n')
- fp.write('quarantined='+str(len(os.listdir(quarantined)))+'\n')
- fp.write('usedDataDir='+str(((dirstat.f_blocks - dirstat.f_bavail)*dirstat.f_bsize)>>20)+'\n')
- fp.write('totalDataDir='+str((dirstat.f_blocks*dirstat.f_bsize)>>20)+'\n')
- #two lines with active runs (used to check file consistency)
- fp.write('activeRuns='+str(active_runs).strip('[]')+'\n')
- fp.write('activeRuns='+str(active_runs).strip('[]')+'\n')
- fp.write('entriesComplete=True')
- fp.close()
+ try:
+ with open(mfile,'w+') as fp:
+ fp.write('fm_date='+tstring+'\n')
+ if cloud_mode==True and entering_cloud_mode==True:
+ #lie about cores in cloud if cloud mode enabled, even if still processing
+ fp.write('idles=0\n')
+ fp.write('used=0\n')
+ fp.write('broken=0\n')
+ fp.write('cloud='+str(len(os.listdir(cloud))+len(os.listdir(idles))+len(os.listdir(used))+len(os.listdir(broken)))+'\n')
+ else:
+ fp.write('idles='+str(len(os.listdir(idles)))+'\n')
+ fp.write('used='+str(len(os.listdir(used)))+'\n')
+ fp.write('broken='+str(len(os.listdir(broken)))+'\n')
+ fp.write('cloud='+str(len(os.listdir(cloud)))+'\n')
+
+ fp.write('quarantined='+str(len(os.listdir(quarantined)))+'\n')
+ fp.write('usedDataDir='+str(((dirstat.f_blocks - dirstat.f_bavail)*dirstat.f_bsize)>>20)+'\n')
+ fp.write('totalDataDir='+str((dirstat.f_blocks*dirstat.f_bsize)>>20)+'\n')
+ #two lines with active runs (used to check file consistency)
+ fp.write('activeRuns='+str(active_runs).strip('[]')+'\n')
+ fp.write('activeRuns='+str(active_runs).strip('[]')+'\n')
+ fp.write('activeRunsErrors='+str(active_runs_errors).strip('[]')+'\n')
+ fp.write('activeRunNumQueuedLS='+self.getLumiQueueStat()+'\n')
+ fp.write('entriesComplete=True')
+ except Exception as ex:
+ logger.warning('boxinfo file write failed: '+str(ex))
+ if counter==0:
+ #in case something happened with the BU server, try remount
+ cleanup_mountpoints()
+
if conf.role == 'bu':
- ramdisk = os.statvfs(conf.watch_directory)
+ #ramdisk = os.statvfs(conf.watch_directory)
outdir = os.statvfs('/fff/output')
- fp=open(mfile,'w+')
-
- fp.write('fm_date='+tstring+'\n')
- fp.write('idles=0\n')
- fp.write('used=0\n')
- fp.write('broken=0\n')
- fp.write('quarantined=0\n')
- fp.write('usedRamdisk='+str(((ramdisk.f_blocks - ramdisk.f_bavail)*ramdisk.f_bsize)>>20)+'\n')
- fp.write('totalRamdisk='+str((ramdisk.f_blocks*ramdisk.f_bsize)>>20)+'\n')
- fp.write('usedOutput='+str(((outdir.f_blocks - outdir.f_bavail)*outdir.f_bsize)>>20)+'\n')
- fp.write('totalOutput='+str((outdir.f_blocks*outdir.f_bsize)>>20)+'\n')
- fp.write('activeRuns='+str(active_runs).strip('[]')+'\n')
- fp.write('activeRuns='+str(active_runs).strip('[]')+'\n')
- fp.write('entriesComplete=True')
- fp.close()
-
+ with open(mfile,'w+') as fp:
+ fp.write('fm_date='+tstring+'\n')
+ fp.write('idles=0\n')
+ fp.write('used=0\n')
+ fp.write('broken=0\n')
+ fp.write('quarantined=0\n')
+ fp.write('cloud=0\n')
+ fp.write('usedRamdisk='+str(((ramdisk.f_blocks - ramdisk.f_bavail)*ramdisk.f_bsize - ramdisk_submount_size)>>20)+'\n')
+ fp.write('totalRamdisk='+str((ramdisk.f_blocks*ramdisk.f_bsize - ramdisk_submount_size)>>20)+'\n')
+ fp.write('usedOutput='+str(((outdir.f_blocks - outdir.f_bavail)*outdir.f_bsize)>>20)+'\n')
+ fp.write('totalOutput='+str((outdir.f_blocks*outdir.f_bsize)>>20)+'\n')
+ fp.write('activeRuns='+str(active_runs).strip('[]')+'\n')
+ fp.write('activeRuns='+str(active_runs).strip('[]')+'\n')
+ fp.write('entriesComplete=True')
+
+ #deprecated
if conf.role == 'bu':
mfile = conf.resource_base+'/disk.jsn'
stat=[]
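For clarity, roughly what the two monitoring documents written above look like on disk; all values are invented. The appliance-level summary is first written to resource_summary_temp and then renamed, so readers never observe a partially written JSON:

    {
      "active_resources": 24,
      "idle": 8,
      "used": 16,
      "broken": 0,
      "cloud": 0,
      "activeFURun": 230151,
      "activeRunNumQueuedLS": 2,
      "ramdisk_occupancy": 0.35
    }

The per-host box file written on an FU is a plain key=value text file (activeRuns is written twice on purpose, as a consistency check):

    fm_date=2014-01-01T12:00:00
    idles=8
    used=16
    broken=0
    cloud=0
    quarantined=0
    usedDataDir=1024
    totalDataDir=10240
    activeRuns=230151
    activeRuns=230151
    activeRunsErrors=0
    activeRunNumQueuedLS=2
    entriesComplete=True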
@@ -336,7 +580,7 @@ def run(self):
json.dump(stat,fp)
fp.close()
except Exception as ex:
- logging.error(ex)
+ logger.error(ex)
for mfile in self.file:
try:
@@ -344,10 +588,20 @@ def run(self):
except OSError:
pass
- logging.debug('exiting system monitor thread ')
+ logger.debug('exiting system monitor thread ')
+
+ def getLumiQueueStat(self):
+ try:
+ with open(os.path.join(conf.watch_directory,'run'+str(active_runs[-1]).zfill(conf.run_number_padding),
+ 'open','queue_status.jsn'),'r') as fp:
+ #fcntl.flock(fp, fcntl.LOCK_EX)
+ statusDoc = json.load(fp)
+ return str(statusDoc["numQueuedLS"])
+ except:
+ return "-1"
def stop(self):
- logging.debug("system_monitor: request to stop")
+ logger.debug("system_monitor: request to stop")
self.running = False
self.threadEvent.set()
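getLumiQueueStat above only extracts numQueuedLS from the queue status document maintained in the run directory; a hypothetical <run dir>/open/queue_status.jsn (other fields may be present but are ignored here):

    {"numQueuedLS": 3}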
@@ -358,13 +612,13 @@ def __init__(self):
def startNewRun(self,nr):
if self.runnumber:
- logging.error("Another BU emulator run "+str(self.runnumber)+" is already ongoing")
+ logger.error("Another BU emulator run "+str(self.runnumber)+" is already ongoing")
return
self.runnumber = nr
configtouse = conf.test_bu_config
destination_base = None
if role == 'fu':
- destination_base = bu_disk_list_ramdisk[startindex%len(bu_disk_list_ramdisk)]
+ destination_base = bu_disk_list_ramdisk_instance[startindex%len(bu_disk_list_ramdisk_instance)]
else:
destination_base = conf.watch_directory
@@ -393,8 +647,8 @@ def startNewRun(self,nr):
close_fds=True
)
except Exception as ex:
- logging.error("Error in forking BU emulator process")
- logging.error(ex)
+ logger.error("Error in forking BU emulator process")
+ logger.error(ex)
def stop(self):
os.kill(self.process.pid,SIGINT)
@@ -424,22 +678,22 @@ def ping(self):
def NotifyNewRun(self,runnumber):
self.runnumber = runnumber
- logging.info("calling start of run on "+self.cpu[0]);
+ logger.info("calling start of run on "+self.cpu[0]);
try:
- connection = httplib.HTTPConnection(self.cpu[0], conf.cgi_port)
+ connection = httplib.HTTPConnection(self.cpu[0], conf.cgi_port - conf.cgi_instance_port_offset)
connection.request("GET",'cgi-bin/start_cgi.py?run='+str(runnumber))
response = connection.getresponse()
#do something intelligent with the response code
- logging.error("response was "+str(response.status))
+ logger.error("response was "+str(response.status))
if response.status > 300: self.hoststate = 1
else:
- logging.info(response.read())
+ logger.info(response.read())
except Exception as ex:
- logging.exception(ex)
+ logger.exception(ex)
def NotifyShutdown(self):
try:
- connection = httplib.HTTPConnection(self.cpu[0], conf.cgi_port)
+ connection = httplib.HTTPConnection(self.cpu[0], conf.cgi_port - conf.cgi_instance_port_offset)
connection.request("GET",'cgi-bin/stop_cgi.py?run='+str(self.runnumber))
time.sleep(0.05)
response = connection.getresponse()
@@ -447,10 +701,10 @@ def NotifyShutdown(self):
#do something intelligent with the response code
if response.status > 300: self.hoststate = 0
except Exception as ex:
- logging.exception(ex)
+ logger.exception(ex)
def StartNewProcess(self ,runnumber, startindex, arch, version, menu,num_threads,num_streams):
- logging.debug("OnlineResource: StartNewProcess called")
+ logger.debug("OnlineResource: StartNewProcess called")
self.runnumber = runnumber
"""
@@ -458,10 +712,10 @@ def StartNewProcess(self ,runnumber, startindex, arch, version, menu,num_threads
independent mounts of the BU - it should not be necessary in due course
IFF it is necessary, it should address "any" number of mounts, not just 2
"""
- input_disk = bu_disk_list_ramdisk[startindex%len(bu_disk_list_ramdisk)]
+ input_disk = bu_disk_list_ramdisk_instance[startindex%len(bu_disk_list_ramdisk_instance)]
#run_dir = input_disk + '/run' + str(self.runnumber).zfill(conf.run_number_padding)
- logging.info("starting process with "+version+" and run number "+str(runnumber))
+ logger.info("starting process with "+version+" and run number "+str(runnumber))
if "_patch" in version:
full_release="cmssw-patch"
@@ -492,7 +746,7 @@ def StartNewProcess(self ,runnumber, startindex, arch, version, menu,num_threads
if self.watchdog:
new_run_args.append("skipFirstLumis=True")
- logging.info("arg array "+str(new_run_args).translate(None, "'"))
+ logger.info("arg array "+str(new_run_args).translate(None, "'"))
try:
# dem = demote.demote(conf.user)
self.process = subprocess.Popen(new_run_args,
@@ -500,29 +754,29 @@ def StartNewProcess(self ,runnumber, startindex, arch, version, menu,num_threads
close_fds=True
)
self.processstate = 100
- logging.info("started process "+str(self.process.pid))
+ logger.info("started process "+str(self.process.pid))
# time.sleep(1.)
if self.watchdog==None:
self.watchdog = ProcessWatchdog(self,self.lock)
self.watchdog.start()
- logging.debug("watchdog thread for "+str(self.process.pid)+" is alive "
+ logger.debug("watchdog thread for "+str(self.process.pid)+" is alive "
+ str(self.watchdog.is_alive()))
else:
self.watchdog.join()
self.watchdog = ProcessWatchdog(self,self.lock)
self.watchdog.start()
- logging.debug("watchdog thread restarted for "+str(self.process.pid)+" is alive "
+ logger.debug("watchdog thread restarted for "+str(self.process.pid)+" is alive "
+ str(self.watchdog.is_alive()))
except Exception as ex:
- logging.info("OnlineResource: exception encountered in forking hlt slave")
- logging.info(ex)
+ logger.info("OnlineResource: exception encountered in forking hlt slave")
+ logger.info(ex)
def join(self):
- logging.debug('calling join on thread ' +self.watchdog.name)
+ logger.debug('calling join on thread ' +self.watchdog.name)
self.watchdog.join()
def disableRestart(self):
- logging.debug("OnlineResource "+str(self.cpu)+" restart is now disabled")
+ logger.debug("OnlineResource "+str(self.cpu)+" restart is now disabled")
if self.watchdog:
self.watchdog.disableRestart()
@@ -530,11 +784,11 @@ def clearQuarantined(self):
resource_lock.acquire()
try:
for cpu in self.quarantined:
- logging.info('Clearing quarantined resource '+cpu)
+ logger.info('Clearing quarantined resource '+cpu)
os.rename(quarantined+cpu,idles+cpu)
self.quarantined = []
except Exception as ex:
- logging.exception(ex)
+ logger.exception(ex)
resource_lock.release()
class ProcessWatchdog(threading.Thread):
@@ -549,16 +803,16 @@ def __init__(self,resource,lock):
def run(self):
try:
monfile = self.resource.associateddir+'/hltd.jsn'
- logging.info('watchdog for process '+str(self.resource.process.pid))
+ logger.info('watchdog for process '+str(self.resource.process.pid))
self.resource.process.wait()
returncode = self.resource.process.returncode
pid = self.resource.process.pid
#update json process monitoring file
self.resource.processstate=returncode
- logging.debug('ProcessWatchdog: acquire lock thread '+str(pid))
+ logger.debug('ProcessWatchdog: acquire lock thread '+str(pid))
self.lock.acquire()
- logging.debug('ProcessWatchdog: acquired lock thread '+str(pid))
+ logger.debug('ProcessWatchdog: acquired lock thread '+str(pid))
try:
with open(monfile,"r+") as fp:
@@ -573,13 +827,13 @@ def run(self):
fp.flush()
except IOError,ex:
- logging.exception(ex)
+ logger.exception(ex)
except ValueError:
pass
- logging.debug('ProcessWatchdog: release lock thread '+str(pid))
+ logger.debug('ProcessWatchdog: release lock thread '+str(pid))
self.lock.release()
- logging.debug('ProcessWatchdog: released lock thread '+str(pid))
+ logger.debug('ProcessWatchdog: released lock thread '+str(pid))
abortedmarker = self.resource.statefiledir+'/'+Run.ABORTED
@@ -591,20 +845,24 @@ def run(self):
try:
os.rename(used+cpu,idles+cpu)
except Exception as ex:
- logging.exception(ex)
+ logger.exception(ex)
except:pass
resource_lock.release()
return
- #quit codes (configuration errors):
- quit_codes = [127,90,65,73]
+ #bump error count in active_runs_errors which is logged in the box file
+ if returncode!=0:
+ try:
+ global active_runs
+ global active_runs_errors
+ active_runs_errors[active_runs.index(self.resource.runnumber)]+=1
+ except:
+ pass
- #cleanup actions- remove process from list and
- # attempt restart on same resource
- #dqm mode will treat configuration error as a crash and eventually move to quarantined
- if returncode != 0 and ( returncode not in quit_codes or conf.dqm_machine==True):
+ #cleanup actions- remove process from list and attempt restart on same resource
+ if returncode != 0:
if returncode < 0:
- logging.error("process "+str(pid)
+ logger.error("process "+str(pid)
+" for run "+str(self.resource.runnumber)
+" on resource(s) " + str(self.resource.cpu)
+" exited with signal "
@@ -613,7 +871,7 @@ def run(self):
+str(self.retry_enabled)
)
else:
- logging.error("process "+str(pid)
+ logger.error("process "+str(pid)
+" for run "+str(self.resource.runnumber)
+" on resource(s) " + str(self.resource.cpu)
+" exited with code "
@@ -621,8 +879,23 @@ def run(self):
+" restart is enabled ? "
+str(self.retry_enabled)
)
-
-
+ #quit codes (configuration errors):
+ quit_codes = [127,90,73]
+
+ #removed 65 because it is not only configuration error
+ #quit_codes = [127,90,65,73]
+
+ #dqm mode will treat configuration error as a crash and eventually move to quarantined
+ if conf.dqm_machine==False and returncode in quit_codes:
+ if self.resource.retry_attempts < self.retry_limit:
+ logger.warning('for this type of error, restarting this process is disabled')
+ self.resource.retry_attempts=self.retry_limit
+ if returncode==127:
+ logger.fatal('Exit code indicates that CMSSW environment might not be available (cmsRun executable not in path).')
+ elif returncode==90:
+ logger.fatal('Exit code indicates that there might be a python error in the CMSSW configuration.')
+ else:
+ logger.fatal('Exit code indicates that there might be a C/C++ error in the CMSSW configuration.')
#generate crashed pid json file like: run000001_ls0000_crash_pid12345.jsn
oldpid = "pid"+str(pid).zfill(5)
@@ -635,8 +908,8 @@ def run(self):
try:
with open(filepath,"w+") as fi:
json.dump(document,fi)
- except: logging.exception("unable to create %r" %filename)
- logging.info("pid crash file: %r" %filename)
+ except: logger.exception("unable to create %r" %filename)
+ logger.info("pid crash file: %r" %filename)
if self.resource.retry_attempts < self.retry_limit:
@@ -649,7 +922,7 @@ def run(self):
self.resource.process = None
self.resource.retry_attempts += 1
- logging.info("try to restart process for resource(s) "
+ logger.info("try to restart process for resource(s) "
+str(self.resource.cpu)
+" attempt "
+ str(self.resource.retry_attempts))
@@ -657,10 +930,10 @@ def run(self):
for cpu in self.resource.cpu:
os.rename(used+cpu,broken+cpu)
resource_lock.release()
- logging.debug("resource(s) " +str(self.resource.cpu)+
+ logger.debug("resource(s) " +str(self.resource.cpu)+
" successfully moved to except")
elif self.resource.retry_attempts >= self.retry_limit:
- logging.error("process for run "
+ logger.error("process for run "
+str(self.resource.runnumber)
+" on resources " + str(self.resource.cpu)
+" reached max retry limit "
@@ -680,20 +953,11 @@ def run(self):
fp = open(conf.watch_directory+'/quarantined'+str(self.resource.runnumber).zfill(conf.run_number_padding),'w+')
fp.close()
except Exception as ex:
- logging.exception(ex)
+ logger.exception(ex)
#successful end= release resource (TODO:maybe should mark aborted for non-0 error codes)
- elif returncode == 0 or returncode in quit_codes:
- if returncode==0:
- logging.info('releasing resource, exit 0 meaning end of run '+str(self.resource.cpu))
- elif returncode==127:
- logging.fatal('error executing start script. Maybe CMSSW environment is not available (cmsRun executable not in path).')
- elif returncode==90:
- logging.fatal('error executing start script: python error.')
- elif returncode in quit_codes:
- logging.fatal('error executing start script: CMSSW configuration error.')
- else:
- logging.fatal('error executing start script: unspecified error.')
+ elif returncode == 0:
+ logger.info('releasing resource, exit 0 meaning end of run '+str(self.resource.cpu))
# generate an end-of-run marker if it isn't already there - it will be picked up by the RunRanger
endmarker = conf.watch_directory+'/end'+str(self.resource.runnumber).zfill(conf.run_number_padding)
@@ -714,12 +978,12 @@ def run(self):
#self.resource.process=None
- # logging.info('exiting thread '+str(self.resource.process.pid))
+ # logger.info('exiting thread '+str(self.resource.process.pid))
except Exception as ex:
resource_lock.release()
- logging.info("OnlineResource watchdog: exception")
- logging.exception(ex)
+ logger.info("OnlineResource watchdog: exception")
+ logger.exception(ex)
return
def disableRestart(self):
@@ -736,7 +1000,8 @@ class Run:
VALID_MARKERS = [STARTING,ACTIVE,STOPPING,COMPLETE,ABORTED]
- def __init__(self,nr,dirname,bu_dir):
+ def __init__(self,nr,dirname,bu_dir,instance):
+ self.instance = instance
self.runnumber = nr
self.dirname = dirname
self.online_resource_list = []
@@ -754,22 +1019,23 @@ def __init__(self,nr,dirname,bu_dir):
self.anelasticWatchdog = None
self.threadEvent = threading.Event()
global active_runs
+ global active_runs_errors
if conf.role == 'fu':
self.changeMarkerMaybe(Run.STARTING)
if int(self.runnumber) in active_runs:
raise Exception("Run "+str(self.runnumber)+ "already active")
active_runs.append(int(self.runnumber))
+ active_runs_errors.append(0)
else:
- #currently unused on BU
active_runs.append(int(self.runnumber))
+ active_runs_errors.append(0)
self.menu_directory = bu_dir+'/'+conf.menu_directory
readMenuAttempts=0
#polling for HLT menu directory
while os.path.exists(self.menu_directory)==False and conf.dqm_machine==False and conf.role=='fu':
- time.sleep(.2)
readMenuAttempts+=1
#10 seconds allowed before defaulting to local configuration
if readMenuAttempts>50: break
@@ -780,19 +1046,17 @@ def __init__(self,nr,dirname,bu_dir):
while True:
self.menu = self.menu_directory+'/'+conf.menu_name
if os.path.exists(self.menu_directory+'/'+conf.arch_file):
- fp = open(self.menu_directory+'/'+conf.arch_file,'r')
- self.arch = fp.readline().strip()
- fp.close()
+ with open(self.menu_directory+'/'+conf.arch_file,'r') as fp:
+ self.arch = fp.readline().strip()
if os.path.exists(self.menu_directory+'/'+conf.version_file):
- fp = open(self.menu_directory+'/'+conf.version_file,'r')
- self.version = fp.readline().strip()
- fp.close()
+ with open(self.menu_directory+'/'+conf.version_file,'r') as fp:
+ self.version = fp.readline().strip()
try:
- logging.info("Run "+str(self.runnumber)+" uses "+ self.version+" ("+self.arch+") with "+self.menu)
+ logger.info("Run "+str(self.runnumber)+" uses "+ self.version+" ("+self.arch+") with "+self.menu)
break
except Exception as ex:
- logging.exception(ex)
- logging.error("Run parameters obtained for run "+str(self.runnumber)+": "+ str(self.version)+" ("+str(self.arch)+") with "+str(self.menu))
+ logger.exception(ex)
+ logger.error("Run parameters obtained for run "+str(self.runnumber)+": "+ str(self.version)+" ("+str(self.arch)+") with "+str(self.menu))
time.sleep(.5)
readMenuAttempts+=1
if readMenuAttempts==3: raise Exception("Unable to parse HLT parameters")
@@ -802,73 +1066,83 @@ def __init__(self,nr,dirname,bu_dir):
self.version = conf.cmssw_default_version
self.menu = conf.test_hlt_config1
if conf.role=='fu':
- logging.warn("Using default values for run "+str(self.runnumber)+": "+self.version+" ("+self.arch+") with "+self.menu)
+ logger.warn("Using default values for run "+str(self.runnumber)+": "+self.version+" ("+self.arch+") with "+self.menu)
self.rawinputdir = None
+ #
if conf.role == "bu":
try:
self.rawinputdir = conf.watch_directory+'/run'+str(self.runnumber).zfill(conf.run_number_padding)
- self.buoutputdir = conf.micromerge_output+'/run'+str(self.runnumber).zfill(conf.run_number_padding)
+ #if conf.instance!="main" and conf.instance_same_destination==False:
+ # try:os.mkdir(os.path.join(conf.micromerge_output,conf.instance))
+ # except:pass
+ # self.buoutputdir = os.path.join(conf.micromerge_output,instance,'run'+str(self.runnumber).zfill(conf.run_number_padding))
+ #else:
+ # self.buoutputdir = os.path.join(conf.micromerge_output,'run'+str(self.runnumber).zfill(conf.run_number_padding))
os.mkdir(self.rawinputdir+'/mon')
except Exception, ex:
- logging.error("could not create mon dir inside the run input directory")
+ logger.error("could not create mon dir inside the run input directory")
else:
- self.rawinputdir= bu_disk_list_ramdisk[0]+'/run' + str(self.runnumber).zfill(conf.run_number_padding)
+ #self.rawinputdir= os.path.join(random.choice(bu_disk_list_ramdisk_instance),'run' + str(self.runnumber).zfill(conf.run_number_padding))
+ self.rawinputdir= os.path.join(bu_disk_list_ramdisk_instance[0],'run' + str(self.runnumber).zfill(conf.run_number_padding))
self.lock = threading.Lock()
- #conf.use_elasticsearch = False
- #note: start elastic.py first!
+
if conf.use_elasticsearch == True:
+ global nsslock
try:
if conf.role == "bu":
- logging.info("starting elasticbu.py with arguments:"+self.dirname)
- elastic_args = ['/opt/hltd/python/elasticbu.py',str(self.runnumber)]
+ nsslock.acquire()
+ logger.info("starting elasticbu.py with arguments:"+self.dirname)
+ elastic_args = ['/opt/hltd/python/elasticbu.py',self.instance,str(self.runnumber)]
else:
- logging.info("starting elastic.py with arguments:"+self.dirname)
- elastic_args = ['/opt/hltd/python/elastic.py',self.dirname,self.rawinputdir+'/mon',str(expected_processes),str(conf.elastic_cluster)]
+ logger.info("starting elastic.py with arguments:"+self.dirname)
+ elastic_args = ['/opt/hltd/python/elastic.py',self.dirname,self.rawinputdir+'/mon',str(expected_processes)]
self.elastic_monitor = subprocess.Popen(elastic_args,
preexec_fn=preexec_function,
close_fds=True
)
-
except OSError as ex:
- logging.error("failed to start elasticsearch client")
- logging.error(ex)
+ logger.error("failed to start elasticsearch client")
+ logger.error(ex)
+ try:nsslock.release()
+ except:pass
if conf.role == "fu" and conf.dqm_machine==False:
try:
- logging.info("starting anelastic.py with arguments:"+self.dirname)
- elastic_args = ['/opt/hltd/python/anelastic.py',self.dirname,str(self.runnumber), self.rawinputdir]
+ logger.info("starting anelastic.py with arguments:"+self.dirname)
+ #elastic_args = ['/opt/hltd/python/anelastic.py',self.dirname,str(self.runnumber), self.rawinputdir,random.choice(bu_disk_list_output_instance)]
+ elastic_args = ['/opt/hltd/python/anelastic.py',self.dirname,str(self.runnumber), self.rawinputdir,bu_disk_list_output_instance[0]]
self.anelastic_monitor = subprocess.Popen(elastic_args,
preexec_fn=preexec_function,
close_fds=True
)
except OSError as ex:
- logging.fatal("failed to start anelastic.py client:")
- logging.exception(ex)
+ logger.fatal("failed to start anelastic.py client:")
+ logger.exception(ex)
sys.exit(1)
def AcquireResource(self,resourcenames,fromstate):
idles = conf.resource_base+'/'+fromstate+'/'
try:
- logging.debug("Trying to acquire resource "
+ logger.debug("Trying to acquire resource "
+str(resourcenames)
+" from "+fromstate)
for resourcename in resourcenames:
os.rename(idles+resourcename,used+resourcename)
if not filter(lambda x: x.cpu==resourcenames,self.online_resource_list):
- logging.debug("resource(s) "+str(resourcenames)
+ logger.debug("resource(s) "+str(resourcenames)
+" not found in online_resource_list, creating new")
self.online_resource_list.append(OnlineResource(resourcenames,self.lock))
return self.online_resource_list[-1]
- logging.debug("resource(s) "+str(resourcenames)
+ logger.debug("resource(s) "+str(resourcenames)
+" found in online_resource_list")
return filter(lambda x: x.cpu==resourcenames,self.online_resource_list)[0]
except Exception as ex:
- logging.info("exception encountered in looking for resources")
- logging.info(ex)
+ logger.info("exception encountered in looking for resources")
+ logger.info(ex)
def ContactResource(self,resourcename):
self.online_resource_list.append(OnlineResource(resourcename,self.lock))
@@ -878,28 +1152,39 @@ def ReleaseResource(self,res):
self.online_resource_list.remove(res)
def AcquireResources(self,mode):
- logging.info("acquiring resources from "+conf.resource_base)
+ logger.info("acquiring resources from "+conf.resource_base)
idles = conf.resource_base
idles += '/idle/' if conf.role == 'fu' else '/boxes/'
try:
dirlist = os.listdir(idles)
except Exception as ex:
- logging.info("exception encountered in looking for resources")
- logging.info(ex)
- logging.info(dirlist)
+ logger.info("exception encountered in looking for resources")
+ logger.info(ex)
+ logger.info(str(dirlist))
current_time = time.time()
count = 0
cpu_group=[]
#self.lock.acquire()
+ global machine_blacklist
+ if conf.role=='bu':
+ update_success,machine_blacklist=updateBlacklist()
+ if update_success==False:
+ logger.fatal("unable to check blacklist: giving up on run start")
+ return False
+
for cpu in dirlist:
#skip self
- if conf.role=='bu' and cpu == os.uname()[1]:continue
-
+ if conf.role=='bu':
+ if cpu == os.uname()[1]:continue
+ if cpu in machine_blacklist:
+ logger.info("skipping blacklisted resource "+str(cpu))
+ continue
+
count = count+1
cpu_group.append(cpu)
age = current_time - os.path.getmtime(idles+cpu)
- logging.info("found resource "+cpu+" which is "+str(age)+" seconds old")
+ logger.info("found resource "+cpu+" which is "+str(age)+" seconds old")
if conf.role == 'fu':
if count == nstreams:
self.AcquireResource(cpu_group,'idle')
@@ -909,12 +1194,13 @@ def AcquireResources(self,mode):
if age < 10:
cpus = [cpu]
self.ContactResource(cpus)
+ return True
#self.lock.release()
def Start(self):
self.is_active_run = True
for resource in self.online_resource_list:
- logging.info('start run '+str(self.runnumber)+' on cpu(s) '+str(resource.cpu))
+ logger.info('start run '+str(self.runnumber)+' on cpu(s) '+str(resource.cpu))
if conf.role == 'fu':
self.StartOnResource(resource)
else:
@@ -929,11 +1215,11 @@ def Start(self):
self.startCompletedChecker()
def StartOnResource(self, resource):
- logging.debug("StartOnResource called")
+ logger.debug("StartOnResource called")
resource.statefiledir=conf.watch_directory+'/run'+str(self.runnumber).zfill(conf.run_number_padding)
mondir = os.path.join(resource.statefiledir,'mon')
resource.associateddir=mondir
- logging.info(str(nthreads)+' '+str(nstreams))
+ logger.info(str(nthreads)+' '+str(nstreams))
resource.StartNewProcess(self.runnumber,
self.online_resource_list.index(resource),
self.arch,
@@ -941,10 +1227,10 @@ def StartOnResource(self, resource):
self.menu,
int(round((len(resource.cpu)*float(nthreads)/nstreams))),
len(resource.cpu))
- logging.debug("StartOnResource process started")
- #logging.debug("StartOnResource going to acquire lock")
+ logger.debug("StartOnResource process started")
+ #logger.debug("StartOnResource going to acquire lock")
#self.lock.acquire()
- #logging.debug("StartOnResource lock acquired")
+ #logger.debug("StartOnResource lock acquired")
try:
os.makedirs(mondir)
except OSError:
@@ -954,7 +1240,7 @@ def StartOnResource(self, resource):
fp=None
stat = []
if not os.path.exists(monfile):
- logging.debug("No log file "+monfile+" found, creating one")
+ logger.debug("No log file "+monfile+" found, creating one")
fp=open(monfile,'w+')
attempts=0
while True:
@@ -966,12 +1252,12 @@ def StartOnResource(self, resource):
attempts+=1
continue
else:
- logging.error("could not retrieve process parameters")
- logging.exception(ex)
+ logger.error("could not retrieve process parameters")
+ logger.exception(ex)
break
else:
- logging.debug("Updating existing log file "+monfile)
+ logger.debug("Updating existing log file "+monfile)
fp=open(monfile,'r+')
stat=json.load(fp)
attempts=0
@@ -990,8 +1276,8 @@ def StartOnResource(self, resource):
time.sleep(.05)
continue
else:
- logging.error("could not retrieve process parameters")
- logging.exception(ex)
+ logger.error("could not retrieve process parameters")
+ logger.exception(ex)
break
fp.seek(0)
fp.truncate()
@@ -1000,11 +1286,34 @@ def StartOnResource(self, resource):
fp.flush()
fp.close()
#self.lock.release()
- #logging.debug("StartOnResource lock released")
+ #logger.debug("StartOnResource lock released")
+
+ def Stop(self):
+ #used to gracefully stop CMSSW and finish scripts
+ with open(os.path.join(self.dirname,"temp_CMSSW_STOP"),'w') as f:
+ writedoc = {}
+ bu_lumis = []
+ try:
+ bu_eols_files = filter( lambda x: x.endswith("_EoLS.jsn"),os.listdir(self.rawinputdir))
+ bu_lumis = (sorted([int(x.split('_')[1][2:]) for x in bu_eols_files]))
+ except:
+ logger.error("Unable to parse BU EoLS files")
+ if len(bu_lumis):
+ logger.info('last closed lumisection in ramdisk is '+str(bu_lumis[-1]))
+ writedoc['lastLS']=bu_lumis[-1]+2 #current+2
+ else: writedoc['lastLS']=2
+ json.dump(writedoc,f)
+ try:
+ os.rename(os.path.join(self.dirname,"temp_CMSSW_STOP"),os.path.join(self.dirname,"CMSSW_STOP"))
+ except:pass
+
- def Shutdown(self,herod=False):
+ def Shutdown(self,herod):
#herod mode sends sigkill to all process, however waits for all scripts to finish
- logging.debug("Run:Shutdown called")
+ logger.debug("Run:Shutdown called")
+ global runs_pending_shutdown
+ if self.runnumber in runs_pending_shutdown: runs_pending_shutdown.remove(self.runnumber)
+
self.is_active_run = False
try:
self.changeMarkerMaybe(Run.ABORTED)
@@ -1017,16 +1326,16 @@ def Shutdown(self,herod=False):
for resource in self.online_resource_list:
if conf.role == 'fu':
if resource.processstate==100:
- logging.info('terminating process '+str(resource.process.pid)+
+ logger.info('terminating process '+str(resource.process.pid)+
' in state '+str(resource.processstate))
if herod:resource.process.kill()
else:resource.process.terminate()
- logging.info('process '+str(resource.process.pid)+' join watchdog thread')
+ logger.info('process '+str(resource.process.pid)+' join watchdog thread')
# time.sleep(.1)
resource.join()
- logging.info('process '+str(resource.process.pid)+' terminated')
- logging.info('releasing resource(s) '+str(resource.cpu))
+ logger.info('process '+str(resource.process.pid)+' terminated')
+ logger.info('releasing resource(s) '+str(resource.cpu))
resource.clearQuarantined()
resource_lock.acquire()
@@ -1034,8 +1343,8 @@ def Shutdown(self,herod=False):
try:
os.rename(used+cpu,idles+cpu)
except OSError:
- #@SM:happens if t was quarantined
- logging.warning('Unable to find resource file '+used+cpu+'.')
+ #@SM:happens if it was quarantined
+ logger.warning('Unable to find resource file '+used+cpu+'.')
except Exception as ex:
resource_lock.release()
raise(ex)
@@ -1055,8 +1364,8 @@ def Shutdown(self,herod=False):
self.anelastic_monitor.terminate()
self.anelastic_monitor.wait()
except Exception as ex:
- logging.info("exception encountered in shutting down anelastic.py "+ str(ex))
- #logging.exception(ex)
+ logger.info("exception encountered in shutting down anelastic.py "+ str(ex))
+ #logger.exception(ex)
if conf.use_elasticsearch == True:
try:
if self.elastic_monitor:
@@ -1066,18 +1375,21 @@ def Shutdown(self,herod=False):
self.elastic_monitor.terminate()
self.elastic_monitor.wait()
except Exception as ex:
- logging.info("exception encountered in shutting down elastic.py")
- logging.exception(ex)
+ logger.info("exception encountered in shutting down elastic.py")
+ if "No child processes" in str(ex):pass
+ else:logger.exception(ex)
if self.waitForEndThread is not None:
self.waitForEndThread.join()
except Exception as ex:
- logging.info("exception encountered in shutting down resources")
- logging.exception(ex)
+ logger.info("exception encountered in shutting down resources")
+ logger.exception(ex)
global active_runs
+ global active_runs_errors
active_runs_copy = active_runs[:]
for run_num in active_runs_copy:
if run_num == self.runnumber:
+ active_runs_errors.pop(active_runs.index(run_num))
active_runs.remove(run_num)
try:
@@ -1087,7 +1399,7 @@ def Shutdown(self,herod=False):
except:
pass
- logging.info('Shutdown of run '+str(self.runnumber).zfill(conf.run_number_padding)+' completed')
+ logger.info('Shutdown of run '+str(self.runnumber).zfill(conf.run_number_padding)+' completed')
def ShutdownBU(self):
@@ -1108,16 +1420,18 @@ def ShutdownBU(self):
time.sleep(.1)
self.elastic_monitor.wait()
except Exception as ex:
- logging.info("exception encountered in shutting down elasticbu.py: " + str(ex))
- #logging.exception(ex)
+ logger.info("exception encountered in shutting down elasticbu.py: " + str(ex))
+ #logger.exception(ex)
global active_runs
+ global active_runs_errors
active_runs_copy = active_runs[:]
for run_num in active_runs_copy:
if run_num == self.runnumber:
+ active_runs_errors.pop(active_runs.index(run_num))
active_runs.remove(run_num)
- logging.info('Shutdown of run '+str(self.runnumber).zfill(conf.run_number_padding)+' on BU completed')
+ logger.info('Shutdown of run '+str(self.runnumber).zfill(conf.run_number_padding)+' on BU completed')
def StartWaitForEnd(self):
@@ -1127,11 +1441,13 @@ def StartWaitForEnd(self):
self.waitForEndThread = threading.Thread(target = self.WaitForEnd)
self.waitForEndThread.start()
except Exception as ex:
- logging.info("exception encountered in starting run end thread")
- logging.info(ex)
+ logger.info("exception encountered in starting run end thread")
+ logger.info(ex)
def WaitForEnd(self):
- logging.info("wait for end thread!")
+ logger.info("wait for end thread!")
+ global cloud_mode
+ global entering_cloud_mode
try:
for resource in self.online_resource_list:
resource.disableRestart()
@@ -1139,19 +1455,19 @@ def WaitForEnd(self):
if resource.processstate is not None:#was:100
if resource.process is not None and resource.process.pid is not None: ppid = resource.process.pid
else: ppid="None"
- logging.info('waiting for process '+str(ppid)+
+ logger.info('waiting for process '+str(ppid)+
' in state '+str(resource.processstate) +
' to complete ')
try:
resource.join()
- logging.info('process '+str(resource.process.pid)+' completed')
+ logger.info('process '+str(resource.process.pid)+' completed')
except:pass
# os.rename(used+resource.cpu,idles+resource.cpu)
resource.clearQuarantined()
resource.process=None
self.online_resource_list = []
if conf.role == 'fu':
- logging.info('writing complete file')
+ logger.info('writing complete file')
self.changeMarkerMaybe(Run.COMPLETE)
try:
os.remove(conf.watch_directory+'/end'+str(self.runnumber).zfill(conf.run_number_padding))
@@ -1160,29 +1476,43 @@ def WaitForEnd(self):
if conf.dqm_machine==False:
self.anelastic_monitor.wait()
except OSError,ex:
- logging.info("Exception encountered in waiting for termination of anelastic:" +str(ex))
+ logger.info("Exception encountered in waiting for termination of anelastic:" +str(ex))
if conf.use_elasticsearch == True:
try:
self.elastic_monitor.wait()
except OSError,ex:
- logging.info("Exception encountered in waiting for termination of anelastic:" +str(ex))
+ logger.info("Exception encountered in waiting for termination of anelastic:" +str(ex))
if conf.delete_run_dir is not None and conf.delete_run_dir == True:
try:
shutil.rmtree(self.dirname)
except Exception as ex:
- logging.exception(ex)
+ logger.exception(ex)
global active_runs
- logging.info("active runs.."+str(active_runs))
+ global active_runs_errors
+ logger.info("active runs.."+str(active_runs))
for run_num in active_runs:
if run_num == self.runnumber:
+ active_runs_errors.pop(active_runs.index(run_num))
active_runs.remove(run_num)
- logging.info("new active runs.."+str(active_runs))
+ logger.info("new active runs.."+str(active_runs))
+
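+ #in VM (cloud) mode, resources are handed over to the cloud only after the last active run has finished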
+ if cloud_mode==True:
+ resource_lock.acquire()
+ if len(active_runs)>=1:
+ logger.info("VM mode: waiting for runs: "+str(active_runs)+" to finish")
+ else:
+ logger.info("No active runs. moving all resource files to cloud")
+ #give resources to cloud and bail out
+ move_resources_to_cloud()
+ entering_cloud_mode=False
+ resource_lock.release()
except Exception as ex:
- logging.error("exception encountered in ending run")
- logging.exception(ex)
+ resource_lock.release()
+ logger.error("exception encountered in ending run")
+ logger.exception(ex)
def changeMarkerMaybe(self,marker):
dir = self.dirname
@@ -1192,7 +1522,7 @@ def changeMarkerMaybe(self,marker):
fp = open(dir+'/'+marker,'w+')
fp.close()
else:
- logging.error("There are more than one markers for run "
+ logger.error("There are more than one markers for run "
+str(self.runnumber))
return
@@ -1201,8 +1531,8 @@ def startAnelasticWatchdog(self):
self.anelasticWatchdog = threading.Thread(target = self.runAnelasticWatchdog)
self.anelasticWatchdog.start()
except Exception as ex:
- logging.info("exception encountered in starting anelastic watchdog thread")
- logging.info(ex)
+ logger.info("exception encountered in starting anelastic watchdog thread")
+ logger.info(ex)
def runAnelasticWatchdog(self):
try:
@@ -1210,8 +1540,8 @@ def runAnelasticWatchdog(self):
if self.is_active_run == True:
#abort the run
self.anelasticWatchdog=None
- logging.fatal("Premature end of anelastic.py")
- self.Shutdown()
+ logger.fatal("Premature end of anelastic.py")
+ self.Shutdown(False)
except:
pass
@@ -1223,14 +1553,14 @@ def stopAnelasticWatchdog(self):
def startCompletedChecker(self):
if conf.role == 'bu': #and conf.use_elasticsearch == True:
try:
- logging.info('start checking completition of run '+str(self.runnumber))
+ logger.info('start checking completion of run '+str(self.runnumber))
#mode 1: check for complete entries in ES
#mode 2: check for runs in 'boxes' files
- self.endChecker = RunCompletedChecker(1,int(self.runnumber),self.online_resource_list,self.dirname, active_runs,self.elastic_monitor)
+ self.endChecker = RunCompletedChecker(conf,1,int(self.runnumber),self.online_resource_list,self.dirname,active_runs,active_runs_errors,self.elastic_monitor)
self.endChecker.start()
except Exception,ex:
- logging.error('failure to start run completition checker:')
- logging.exception(ex)
+ logger.error('failure to start run completion checker:')
+ logger.exception(ex)
def checkQuarantinedLimit(self):
allQuarantined=True
@@ -1248,8 +1578,9 @@ def checkQuarantinedLimit(self):
class RunRanger:
- def __init__(self):
+ def __init__(self,instance):
self.inotifyWrapper = InotifyWrapper(self)
+ self.instance = instance
def register_inotify_path(self,path,mask):
self.inotifyWrapper.registerPath(path,mask)
@@ -1258,25 +1589,64 @@ def start_inotify(self):
self.inotifyWrapper.start()
def stop_inotify(self):
- logging.info("RunRanger: Stop inotify wrapper")
self.inotifyWrapper.stop()
- logging.info("RunRanger: Join inotify wrapper")
self.inotifyWrapper.join()
- logging.info("RunRanger: Inotify wrapper returned")
+ logger.info("RunRanger: Inotify wrapper shutdown done")
def process_IN_CREATE(self, event):
nr=0
global run_list
- logging.info('RunRanger: event '+event.fullpath)
+ global runs_pending_shutdown
+ global active_runs
+ global active_runs_errors
+ global cloud_mode
+ global entering_cloud_mode
+ logger.info('RunRanger: event '+event.fullpath)
dirname=event.fullpath[event.fullpath.rfind("/")+1:]
- logging.info('RunRanger: new filename '+dirname)
+ logger.info('RunRanger: new filename '+dirname)
if dirname.startswith('run'):
+
+ if os.path.islink(event.fullpath):
+ logger.info('directory ' + event.fullpath + ' is a link. Ignoring this run')
+ return
+ if not os.path.isdir(event.fullpath):
+ logger.info(event.fullpath +' is a file. A directory is needed to start a run.')
+ return
nr=int(dirname[3:])
if nr!=0:
try:
- logging.info('new run '+str(nr))
+ logger.info('new run '+str(nr))
+ #terminate quarantined runs
+ for q_runnumber in runs_pending_shutdown:
+ q_run = filter(lambda x: x.runnumber==q_runnumber,run_list)
+ if len(q_run):
+ q_run[0].Shutdown(True)#run abort in herod mode (wait for anelastic/elastic to shut down)
+ time.sleep(.1)
+
+ if cloud_mode==True and entering_cloud_mode==False:
+ logger.info("received new run notification in VM mode. Checking if idle cores are available...")
+ try:
+ if len(os.listdir(idles))<1:
+ logger.info("this run is skipped because FU is in VM mode and resources have not been returned")
+ return
+ #return all resources to HLTD (TODO:check if VM tool is done)
+ while True:
+ resource_lock.acquire()
+ #retry this operation in case cores get moved around by other means
+ if cleanup_resources()==True:
+ resource_lock.release()
+ break
+ resource_lock.release()
+ time.sleep(0.1)
+ logger.warning("could not move all resources, retrying.")
+ cloud_mode=False
+ except Exception as ex:
+ #resource_lock.release()
+ logger.fatal("failed to disable VM mode when receiving notification for run "+str(nr))
+ logger.exception(ex)
if conf.role == 'fu':
- bu_dir = bu_disk_list_ramdisk[0]+'/'+dirname
+ #bu_dir = random.choice(bu_disk_list_ramdisk_instance)+'/'+dirname
+ bu_dir = bu_disk_list_ramdisk_instance[0]+'/'+dirname
try:
os.symlink(bu_dir+'/jsd',event.fullpath+'/jsd')
except:
@@ -1294,17 +1664,23 @@ def process_IN_CREATE(self, event):
# create an EoR file that will trigger all the running jobs to exit nicely
open(EoR_file_name, 'w').close()
- run_list.append(Run(nr,event.fullpath,bu_dir))
+ run_list.append(Run(nr,event.fullpath,bu_dir,self.instance))
resource_lock.acquire()
- run_list[-1].AcquireResources(mode='greedy')
- run_list[-1].Start()
+ if run_list[-1].AcquireResources(mode='greedy'):
+ run_list[-1].Start()
+ else:
+ run_list.remove(run_list[-1])
resource_lock.release()
+ if conf.role == 'bu' and conf.instance != 'main':
+ logger.info('creating run symlink in main ramdisk directory')
+ main_ramdisk = os.path.dirname(os.path.normpath(conf.watch_directory))
+ os.symlink(event.fullpath,os.path.join(main_ramdisk,os.path.basename(event.fullpath)))
except OSError as ex:
- logging.error("RunRanger: "+str(ex)+" "+ex.filename)
- logging.exception(ex)
+ logger.error("RunRanger: "+str(ex)+" "+ex.filename)
+ logger.exception(ex)
except Exception as ex:
- logging.error("RunRanger: unexpected exception encountered in forking hlt slave")
- logging.exception(ex)
+ logger.error("RunRanger: unexpected exception encountered in forking hlt slave")
+ logger.exception(ex)
elif dirname.startswith('emu'):
nr=int(dirname[3:])
@@ -1316,8 +1692,8 @@ def process_IN_CREATE(self, event):
bu_emulator.startNewRun(nr)
except Exception as ex:
- logging.info("exception encountered in starting BU emulator run")
- logging.info(ex)
+ logger.info("exception encountered in starting BU emulator run")
+ logger.info(ex)
os.remove(event.fullpath)
@@ -1330,7 +1706,7 @@ def process_IN_CREATE(self, event):
try:
runtoend = filter(lambda x: x.runnumber==nr,run_list)
if len(runtoend)==1:
- logging.info('end run '+str(nr))
+ logger.info('end run '+str(nr))
#remove from run_list to prevent intermittent restarts
#lock used to fix a race condition when core files are being moved around
resource_lock.acquire()
@@ -1341,34 +1717,34 @@ def process_IN_CREATE(self, event):
runtoend[0].StartWaitForEnd()
if bu_emulator and bu_emulator.runnumber != None:
bu_emulator.stop()
- #logging.info('run '+str(nr)+' removing end-of-run marker')
+ #logger.info('run '+str(nr)+' removing end-of-run marker')
#os.remove(event.fullpath)
elif len(runtoend)==0:
- logging.warning('request to end run '+str(nr)
+ logger.warning('request to end run '+str(nr)
+' which does not exist')
os.remove(event.fullpath)
else:
- logging.error('request to end run '+str(nr)
+ logger.error('request to end run '+str(nr)
+' has more than one run object - this should '
+'*never* happen')
except Exception as ex:
resource_lock.release()
- logging.info("exception encountered when waiting hltrun to end")
- logging.info(ex)
+ logger.info("exception encountered when waiting hltrun to end")
+ logger.info(ex)
else:
- logging.error('request to end run '+str(nr)
+ logger.error('request to end run '+str(nr)
+' which is an invalid run number - this should '
+'*never* happen')
else:
- logging.error('request to end run '+str(nr)
+ logger.error('request to end run '+str(nr)
+' which is NOT a run number - this should '
+'*never* happen')
elif dirname.startswith('herod'):
os.remove(event.fullpath)
if conf.role == 'fu':
- logging.info("killing all CMSSW child processes")
+ logger.info("killing all CMSSW child processes")
for run in run_list:
run.Shutdown(True)
elif conf.role == 'bu':
@@ -1378,46 +1754,48 @@ def process_IN_CREATE(self, event):
try:
dirlist = os.listdir(boxdir)
current_time = time.time()
- logging.info("sending herod to child FUs")
+ logger.info("sending herod to child FUs")
for name in dirlist:
if name == os.uname()[1]:continue
age = current_time - os.path.getmtime(boxdir+name)
- logging.info('found box '+name+' with keepalive age '+str(age))
+ logger.info('found box '+name+' with keepalive age '+str(age))
if age < 20:
- connection = httplib.HTTPConnection(name, conf.cgi_port)
+ connection = httplib.HTTPConnection(name, conf.cgi_port - conf.cgi_instance_port_offset)
connection.request("GET",'cgi-bin/herod_cgi.py')
response = connection.getresponse()
- logging.info("sent herod to all child FUs")
+ logger.info("sent herod to all child FUs")
except Exception as ex:
- logging.error("exception encountered in contacting resources")
- logging.info(ex)
+ logger.error("exception encountered in contacting resources")
+ logger.info(ex)
run_list=[]
+ active_runs_errors=[]
active_runs=[]
-
elif dirname.startswith('populationcontrol'):
- logging.info("terminating all ongoing runs")
- for run in run_list:
- if conf.role=='fu':
- run.Shutdown()
- elif conf.role=='bu':
- run.ShutdownBU()
+ if len(run_list)>0:
+ logger.info("terminating all ongoing runs via cgi interface (populationcontrol): "+str(run_list))
+ for run in run_list:
+ if conf.role=='fu':
+ run.Shutdown(run.runnumber in runs_pending_shutdown)
+ elif conf.role=='bu':
+ run.ShutdownBU()
+ logger.info("terminated all ongoing runs via cgi interface (populationcontrol)")
run_list = []
+ active_runs_errors=[]
active_runs=[]
- logging.info("terminated all ongoing runs via cgi interface (populationcontrol)")
os.remove(event.fullpath)
elif dirname.startswith('harakiri') and conf.role == 'fu':
os.remove(event.fullpath)
pid=os.getpid()
- logging.info('asked to commit seppuku:'+str(pid))
+ logger.info('asked to commit seppuku:'+str(pid))
try:
- logging.info('sending signal '+str(SIGKILL)+' to myself:'+str(pid))
+ logger.info('sending signal '+str(SIGKILL)+' to myself:'+str(pid))
retval = os.kill(pid, SIGKILL)
- logging.info('sent SIGINT to myself:'+str(pid))
- logging.info('got return '+str(retval)+'waiting to die...and hope for the best')
+ logger.info('sent SIGKILL to myself:'+str(pid))
+ logger.info('got return '+str(retval)+', waiting to die...and hope for the best')
except Exception as ex:
- logging.error("exception in committing harakiri - the blade is not sharp enough...")
- logging.error(ex)
+ logger.error("exception in committing harakiri - the blade is not sharp enough...")
+ logger.error(ex)
elif dirname.startswith('quarantined'):
try:
@@ -1431,30 +1809,35 @@ def process_IN_CREATE(self, event):
runtoend = filter(lambda x: x.runnumber==nr,run_list)
if len(runtoend)==1:
if runtoend[0].checkQuarantinedLimit()==True:
- runtoend[0].Shutdown(True)#run abort in herod mode (wait for anelastic/elastic to shut down)
+ hasHigherRuns = filter(lambda x: x.runnumber>nr,run_list)
+ if len(hasHigherRuns)>0:
+ runtoend[0].Shutdown(True)
+ else:
+ runs_pending_shutdown.append(nr)
except Exception as ex:
- logging.exception(ex)
+ logger.exception(ex)
elif dirname.startswith('suspend') and conf.role == 'fu':
- logging.info('suspend mountpoints initiated')
+ logger.info('suspend mountpoints initiated')
+ replyport = int(dirname[7:]) if dirname[7:].isdigit()==True else conf.cgi_port
global suspended
suspended=True
for run in run_list:
- run.Shutdown(False)#terminate all ongoing runs
+ run.Shutdown(run.runnumber in runs_pending_shutdown)#terminate all ongoing runs
run_list=[]
time.sleep(.5)
umount_success = cleanup_mountpoints(remount=False)
if umount_success==False:
time.sleep(1)
- logging.error("Suspend initiated from BU failed, trying again...")
+ logger.error("Suspend initiated from BU failed, trying again...")
#notifying itself again
try:os.remove(event.fullpath)
except:pass
fp = open(event.fullpath,"w+")
fp.close()
return
- #logging.info("Suspend failed, preparing for harakiri...")
+ #logger.info("Suspend failed, preparing for harakiri...")
#time.sleep(.1)
#fp = open(os.path.join(os.path.dirname(event.fullpath.rstrip(os.path.sep)),'harakiri'),"w+")
#fp.close()
@@ -1471,15 +1854,15 @@ def process_IN_CREATE(self, event):
#first report to BU that umount was done
try:
if bu_name==None:
- logging.fatal("No BU name was found in the bus.config file. Leaving mount points unmounted until the hltd service restart.")
+ logger.fatal("No BU name was found in the bus.config file. Leaving mount points unmounted until the hltd service restart.")
os.remove(event.fullpath)
return
- connection = httplib.HTTPConnection(bu_name, conf.cgi_port+5,timeout=5)
+ connection = httplib.HTTPConnection(bu_name, replyport+20,timeout=5)
connection.request("GET",'cgi-bin/report_suspend_cgi.py?host='+os.uname()[1])
response = connection.getresponse()
except Exception as ex:
- logging.error("Unable to report suspend state to BU "+str(bu_name)+':'+str(conf.cgi_port+5))
- logging.exception(ex)
+ logger.error("Unable to report suspend state to BU "+str(bu_name)+':'+str(replyport+20))
+ logger.exception(ex)
#loop while BU is not reachable
while True:
@@ -1493,26 +1876,26 @@ def process_IN_CREATE(self, event):
bu_name=line.split('.')[0]
break
except:
- logging.info('exception test 1')
+ logger.info('failed to read bus.config, retrying')
time.sleep(5)
continue
if bu_name==None:
- logging.info('exception test 2')
+ logger.info('no BU name found in bus.config, retrying')
time.sleep(5)
continue
- logging.info('checking if BU hltd is available...')
- connection = httplib.HTTPConnection(bu_name, conf.cgi_port,timeout=5)
+ logger.info('checking if BU hltd is available...')
+ connection = httplib.HTTPConnection(bu_name, replyport,timeout=5)
connection.request("GET",'cgi-bin/getcwd_cgi.py')
response = connection.getresponse()
- logging.info('BU hltd is running !...')
+ logger.info('BU hltd is running !...')
#if we got here, the service is back up
break
except Exception as ex:
try:
- logging.info('Failed to contact BU hltd service: ' + str(ex.args[0]) +" "+ str(ex.args[1]))
+ logger.info('Failed to contact BU hltd service: ' + str(ex.args[0]) +" "+ str(ex.args[1]))
except:
- logging.info('Failed to contact BU hltd service: ')
+ logger.info('Failed to contact BU hltd service '+str(ex))
time.sleep(5)
#mount again
@@ -1520,12 +1903,59 @@ def process_IN_CREATE(self, event):
try:os.remove(event.fullpath)
except:pass
suspended=False
- logging.info("Remount is performed")
+ logger.info("Remount is performed")
+
+ elif dirname.startswith('exclude') and conf.role == 'fu':
+ #service on this machine is asked to be excluded for cloud use
+ logger.info('machine exclude initiated')
+ resource_lock.acquire()
+ cloud_mode=True
+ entering_cloud_mode=True
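+ #runs already pending shutdown are aborted; the others are asked to stop gracefully at a later lumisection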
+ try:
+ for run in run_list:
+ if run.runnumber in runs_pending_shutdown:
+ run.Shutdown(True)
+ else:
+ #write signal file for CMSSW to quit with 0 after certain LS
+ run.Stop()
+ except Exception as ex:
+ logger.fatal("Unable to clear runs. Will not enter VM mode.")
+ logger.exception(ex)
+ cloud_mode=False
+ resource_lock.release()
+ os.remove(event.fullpath)
+
+ elif dirname.startswith('include') and conf.role == 'fu':
+ #TODO: pick up latest working run..
+ tries=1000
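+ #up to 1000 attempts with 0.1 s sleeps (~100 seconds) before resources are forcibly reclaimed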
+ if cloud_mode==True:
+ while True:
+ resource_lock.acquire()
+ #retry this operation in case cores get moved around by other means
+ if entering_cloud_mode==False and cleanup_resources()==True:
+ resource_lock.release()
+ break
+ resource_lock.release()
+ time.sleep(0.1)
+ tries-=1
+ if tries==0:
+ logger.fatal("Timeout: taking resources from cloud after waiting for 100 seconds")
+ cleanup_resources()
+ entering_cloud_mode=False
+ break
+ if (tries%10)==0:
+ logger.warning("could not move all resources, retrying.")
+ cloud_mode=False
+ os.remove(event.fullpath)
+ elif dirname.startswith('logrestart'):
+ #hook to restart logcollector process manually
+ restartLogCollector(self.instance)
+ os.remove(event.fullpath)
- logging.debug("RunRanger completed handling of event "+event.fullpath)
+ logger.debug("RunRanger completed handling of event "+event.fullpath)
def process_default(self, event):
- logging.info('RunRanger: event '+event.fullpath+' type '+str(event.mask))
+ logger.info('RunRanger: event '+event.fullpath+' type '+str(event.mask))
filename=event.fullpath[event.fullpath.rfind("/")+1:]
class ResourceRanger:
@@ -1543,29 +1973,27 @@ def start_inotify(self):
self.inotifyWrapper.start()
def stop_managed_monitor(self):
- logging.info("ResourceRanger: Stop managed monitor")
self.managed_monitor.stop()
- logging.info("ResourceRanger: Join managed monitor")
self.managed_monitor.join()
- logging.info("ResourceRanger: managed monitor returned")
+ logger.info("ResourceRanger: managed monitor shutdown done")
def stop_inotify(self):
- logging.info("ResourceRanger: Stop inotify wrapper")
self.inotifyWrapper.stop()
- logging.info("ResourceRanger: Join inotify wrapper")
self.inotifyWrapper.join()
- logging.info("ResourceRanger: Inotify wrapper returned")
+ logger.info("ResourceRanger: Inotify wrapper shutdown done")
def process_IN_MOVED_TO(self, event):
- logging.debug('ResourceRanger-MOVEDTO: event '+event.fullpath)
+ logger.debug('ResourceRanger-MOVEDTO: event '+event.fullpath)
+ basename = os.path.basename(event.fullpath)
+ if basename.startswith('resource_summary'):return
try:
resourcepath=event.fullpath[1:event.fullpath.rfind("/")]
resourcestate=resourcepath[resourcepath.rfind("/")+1:]
resourcename=event.fullpath[event.fullpath.rfind("/")+1:]
resource_lock.acquire()
- if not (resourcestate == 'online' or resourcestate == 'offline'
+ if not (resourcestate == 'online' or resourcestate == 'cloud'
or resourcestate == 'quarantined'):
- logging.debug('ResourceNotifier: new resource '
+ logger.debug('ResourceNotifier: new resource '
+resourcename
+' in '
+resourcepath
@@ -1575,7 +2003,7 @@ def process_IN_MOVED_TO(self, event):
ongoing_runs = filter(lambda x: x.is_active_run==True,run_list)
if ongoing_runs:
ongoing_run = ongoing_runs[0]
- logging.info("ResourceRanger: found active run "+str(ongoing_run.runnumber))
+ logger.info("ResourceRanger: found active run "+str(ongoing_run.runnumber))
"""grab resources that become available
#@@EM implement threaded acquisition of resources here
"""
@@ -1584,8 +2012,8 @@ def process_IN_MOVED_TO(self, event):
try:
reslist = os.listdir(idlesdir)
except Exception as ex:
- logging.info("exception encountered in looking for resources")
- logging.exception(ex)
+ logger.info("exception encountered in looking for resources")
+ logger.exception(ex)
#put inotify-ed resource as the first item
for resindex,resname in enumerate(reslist):
fileFound=False
@@ -1614,9 +2042,9 @@ def process_IN_MOVED_TO(self, event):
res = ongoing_run.AcquireResource(resourcenames,resourcestate)
if acquired_sufficient:
- logging.info("ResourceRanger: acquired resource(s) "+str(res.cpu))
+ logger.info("ResourceRanger: acquired resource(s) "+str(res.cpu))
ongoing_run.StartOnResource(res)
- logging.info("ResourceRanger: started process on resource "
+ logger.info("ResourceRanger: started process on resource "
+str(res.cpu))
else:
#if no run is active, move (x N threads) files from except to idle to be picked up for the next run
@@ -1650,19 +2078,20 @@ def process_IN_MOVED_TO(self, event):
os.rename(broken+resname,idles+resname)
except Exception as ex:
- logging.info("exception encountered in looking for resources in except")
- logging.info(ex)
+ logger.info("exception encountered in looking for resources in except")
+ logger.info(ex)
except Exception as ex:
- logging.error("exception in ResourceRanger")
- logging.error(ex)
+ logger.error("exception in ResourceRanger")
+ logger.error(ex)
try:
resource_lock.release()
except:pass
def process_IN_MODIFY(self, event):
-
- logging.debug('ResourceRanger-MODIFY: event '+event.fullpath)
+ logger.debug('ResourceRanger-MODIFY: event '+event.fullpath)
+ basename = os.path.basename(event.fullpath)
+ if basename.startswith('resource_summary'):return
try:
bus_config = os.path.join(os.path.dirname(conf.resource_base.rstrip(os.path.sep)),'bus.config')
if event.fullpath == bus_config:
@@ -1673,21 +2102,58 @@ def process_IN_MODIFY(self, event):
if self.managed_monitor:
self.managed_monitor = system_monitor()
self.managed_monitor.start()
- logging.info("ResouceRanger: managed monitor is "+str(self.managed_monitor))
+ logger.info("ResouceRanger: managed monitor is "+str(self.managed_monitor))
except Exception as ex:
- logging.error("exception in ResourceRanger")
- logging.error(ex)
+ logger.error("exception in ResourceRanger")
+ logger.error(ex)
def process_default(self, event):
- logging.debug('ResourceRanger: event '+event.fullpath +' type '+ str(event.mask))
+ logger.debug('ResourceRanger: event '+event.fullpath +' type '+ str(event.mask))
filename=event.fullpath[event.fullpath.rfind("/")+1:]
+ def process_IN_CLOSE_WRITE(self, event):
+ logger.debug('ResourceRanger-IN_CLOSE_WRITE: event '+event.fullpath)
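+ #BU only: track appliance blacklist updates and FU box info files written into the boxes directory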
+ global machine_blacklist
+ resourcepath=event.fullpath[0:event.fullpath.rfind("/")]
+ basename = os.path.basename(event.fullpath)
+ if basename.startswith('resource_summary'):return
+ if conf.role=='fu':return
+ if basename == os.uname()[1]:return
+ if basename == 'blacklist':
+ with open(os.path.join(conf.watch_directory,'appliance','blacklist'),'r') as fi:
+ try:
+ machine_blacklist = json.load(fi)
+ except:
+ pass
+ if resourcepath.endswith('boxes'):
+ global boxinfoFUMap
+ if basename in machine_blacklist:
+ try:boxinfoFUMap.remove(basename)
+ except:pass
+ else:
+ try:
+ infile = fileHandler(event.fullpath)
+ current_time = time.time()
+ boxinfoFUMap[basename] = [infile.data,current_time]
+ except Exception as ex:
+ logger.error("Unable to read of parse boxinfo file "+basename)
+ logger.exception(ex)
+
+
class hltd(Daemon2,object):
- def __init__(self, pidfile):
- Daemon2.__init__(self,pidfile,'hltd')
+ def __init__(self, instance):
+ self.instance=instance
+ Daemon2.__init__(self,'hltd',instance,'hltd')
def stop(self):
+ #read configuration file
+ try:
+ setFromConf(self.instance)
+ except Exception as ex:
+ print " CONFIGURATION error:",str(ex),"(check configuration file) [ \033[1;31mFAILED\033[0;39m ]"
+ sys.exit(4)
+
if self.silentStatus():
try:
if os.path.exists(conf.watch_directory+'/populationcontrol'):
@@ -1697,13 +2163,18 @@ def stop(self):
count = 10
while count:
os.stat(conf.watch_directory+'/populationcontrol')
- sys.stdout.write('o.o')
+ if count==10:
+ sys.stdout.write(' o.o')
+ else:
+ sys.stdout.write('o.o')
sys.stdout.flush()
- time.sleep(1.)
+ time.sleep(.5)
count-=1
except OSError, err:
+ time.sleep(.1)
pass
except IOError, err:
+ time.sleep(.1)
pass
super(hltd,self).stop()
@@ -1713,8 +2184,15 @@ def run(self):
infer it from the name of the machine
"""
+ #read configuration file
+ setFromConf(self.instance)
+ logger.info(" ")
+ logger.info(" ")
+ logger.info("<<<< ---- hltd start : instance " + self.instance + " ---- >>>>")
+ logger.info(" ")
+
if conf.enabled==False:
- logging.warning("Service is currently disabled.")
+ logger.warning("Service is currently disabled.")
sys.exit(1)
if conf.role == 'fu':
@@ -1722,8 +2200,11 @@ def run(self):
"""
cleanup resources
"""
+ while True:
+ if cleanup_resources()==True:break
+ time.sleep(0.1)
+ logger.warning("retrying cleanup_resources")
- cleanup_resources()
"""
recheck mount points
this is done at start and whenever the file /etc/appliance/bus.config is modified
@@ -1740,6 +2221,13 @@ def run(self):
except:
pass
+ if conf.role == 'bu':
+ global machine_blacklist
+ update_success,machine_blacklist=updateBlacklist()
+ global ramdisk_submount_size
+ if self.instance == 'main':
+ #if there are other instance mountpoints in ramdisk, they will be subtracted from size estimate
+ ramdisk_submount_size = submount_size(conf.watch_directory)
"""
the line below is a VERY DIRTY trick to address the fact that
@@ -1751,39 +2239,47 @@ def run(self):
watch_directory = os.readlink(conf.watch_directory) if os.path.islink(conf.watch_directory) else conf.watch_directory
resource_base = os.readlink(conf.resource_base) if os.path.islink(conf.resource_base) else conf.resource_base
+ if conf.use_elasticsearch == True:
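+ #(re)start the per-instance logcollector process before other services are started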
+ time.sleep(.2)
+ restartLogCollector(self.instance)
+
#start boxinfo elasticsearch updater
+ global nsslock
boxInfo = None
if conf.role == 'bu' and conf.use_elasticsearch == True:
- boxInfo = BoxInfoUpdater(watch_directory)
+ boxInfo = BoxInfoUpdater(watch_directory,conf,nsslock)
boxInfo.start()
- logCollector = None
- if conf.use_elasticsearch == True:
- logging.info("starting logcollector.py")
- logcolleccor_args = ['/opt/hltd/python/logcollector.py',]
- logCollector = subprocess.Popen(['/opt/hltd/python/logcollector.py'],preexec_fn=preexec_function,close_fds=True)
-
- runRanger = RunRanger()
+ runRanger = RunRanger(self.instance)
runRanger.register_inotify_path(watch_directory,inotify.IN_CREATE)
runRanger.start_inotify()
- logging.info("started RunRanger - watch_directory " + watch_directory)
+ logger.info("started RunRanger - watch_directory " + watch_directory)
+
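+ #derive the appliance base directory (parent of resource_base) to watch appliance-level files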
+ appliance_base=resource_base
+ if resource_base.endswith('/'):
+ resource_base = resource_base[:-1]
+ if resource_base.rfind('/')>0:
+ appliance_base = resource_base[:resource_base.rfind('/')]
rr = ResourceRanger()
try:
- imask = inotify.IN_MOVED_TO | inotify.IN_CREATE | inotify.IN_DELETE | inotify.IN_MODIFY
if conf.role == 'bu':
+ pass
#currently does nothing on bu
+ imask = inotify.IN_MOVED_TO | inotify.IN_CLOSE_WRITE | inotify.IN_DELETE
rr.register_inotify_path(resource_base, imask)
rr.register_inotify_path(resource_base+'/boxes', imask)
else:
- rr.register_inotify_path(resource_base, imask)
+ imask_appl = inotify.IN_MODIFY
+ imask = inotify.IN_MOVED_TO
+ rr.register_inotify_path(appliance_base, imask_appl)
rr.register_inotify_path(resource_base+'/idle', imask)
rr.register_inotify_path(resource_base+'/except', imask)
rr.start_inotify()
- logging.info("started ResourceRanger - watch_directory "+resource_base)
+ logger.info("started ResourceRanger - watch_directory "+resource_base)
except Exception as ex:
- logging.error("Exception caught in starting notifier2")
- logging.error(ex)
+ logger.error("Exception caught in starting ResourceRanger notifier")
+ logger.error(ex)
try:
cgitb.enable(display=0, logdir="/tmp")
@@ -1791,48 +2287,53 @@ def run(self):
# the following allows the base directory of the http
# server to be 'conf.watch_directory, which is writeable
# to everybody
- if os.path.exists(conf.watch_directory+'/cgi-bin'):
- os.remove(conf.watch_directory+'/cgi-bin')
- os.symlink('/opt/hltd/cgi',conf.watch_directory+'/cgi-bin')
+ if os.path.exists(watch_directory+'/cgi-bin'):
+ os.remove(watch_directory+'/cgi-bin')
+ os.symlink('/opt/hltd/cgi',watch_directory+'/cgi-bin')
handler.cgi_directories = ['/cgi-bin']
- logging.info("starting http server on port "+str(conf.cgi_port))
+ logger.info("starting http server on port "+str(conf.cgi_port))
httpd = BaseHTTPServer.HTTPServer(("", conf.cgi_port), handler)
- logging.info("hltd serving at port "+str(conf.cgi_port)+" with role "+conf.role)
- os.chdir(conf.watch_directory)
+ logger.info("hltd serving at port "+str(conf.cgi_port)+" with role "+conf.role)
+ os.chdir(watch_directory)
+ logger.info("<<<< ---- hltd instance " + self.instance + ": init complete, starting httpd ---- >>>>")
+ logger.info("")
httpd.serve_forever()
except KeyboardInterrupt:
- logging.info("terminating all ongoing runs")
- for run in run_list:
- if conf.role=='fu':
- run.Shutdown()
- elif conf.role=='bu':
- run.ShutdownBU()
- logging.info("terminated all ongoing runs")
- logging.info("stopping run ranger inotify helper")
+ logger.info("stop signal detected")
+ if len(run_list)>0:
+ logger.info("terminating all ongoing runs")
+ for run in run_list:
+ if conf.role=='fu':
+ global runs_pending_shutdown
+ run.Shutdown(run.runnumber in runs_pending_shutdown)
+ elif conf.role=='bu':
+ run.ShutdownBU()
+ logger.info("terminated all ongoing runs")
runRanger.stop_inotify()
- logging.info("stopping resource ranger inotify helper")
rr.stop_inotify()
if boxInfo is not None:
- logging.info("stopping boxinfo updater")
+ logger.info("stopping boxinfo updater")
boxInfo.stop()
+ global logCollector
if logCollector is not None:
+ logger.info("terminating logCollector")
logCollector.terminate()
- logging.info("stopping system monitor")
+ logger.info("stopping system monitor")
rr.stop_managed_monitor()
- logging.info("closing httpd socket")
+ logger.info("closing httpd socket")
httpd.socket.close()
- logging.info(threading.enumerate())
- logging.info("unmounting mount points")
+ logger.info(threading.enumerate())
+ logger.info("unmounting mount points")
if cleanup_mountpoints(remount=False)==False:
time.sleep(1)
cleanup_mountpoints(remount=False)
- logging.info("shutdown of service completed")
+ logger.info("shutdown of service (main thread) completed")
except Exception as ex:
- logging.info("exception encountered in operating hltd")
- logging.info(ex)
+ logger.info("exception encountered in operating hltd")
+ logger.info(ex)
runRanger.stop_inotify()
rr.stop_inotify()
rr.stop_managed_monitor()
@@ -1840,5 +2341,7 @@ def run(self):
if __name__ == "__main__":
- daemon = hltd('/var/run/hltd.pid')
+ import procname
+ procname.setprocname('hltd')
+ daemon = hltd(sys.argv[1])
daemon.start()
diff --git a/python/hltdconf.py b/python/hltdconf.py
index a93765c..70578d8 100644
--- a/python/hltdconf.py
+++ b/python/hltdconf.py
@@ -33,8 +33,14 @@ def __init__(self, conffile):
self.use_elasticsearch = bool(self.use_elasticsearch=="True")
self.close_es_index = bool(self.close_es_index=="True")
self.cgi_port = int(self.cgi_port)
+ self.cgi_instance_port_offset = int(self.cgi_instance_port_offset)
self.soap2file_port = int(self.soap2file_port)
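+ #optional setting: defaults to True when absent from the configuration file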
+ try:
+ self.instance_same_destination=bool(self.instance_same_destination=="True")
+ except:
+ self.instance_same_destination = True
+
self.dqm_machine = bool(self.dqm_machine=="True")
if self.dqm_machine:
self.resource_base = self.dqm_resource_base
@@ -48,7 +54,7 @@ def __init__(self, conffile):
self.service_log_level = getattr(logging,self.service_log_level)
self.autodetect_parameters()
- #read cluster name from elastic search configuration file (used to specify index name)
+ #read cluster name from elastic search configuration file (if not set up directly)
if not self.elastic_cluster and self.use_elasticsearch == True:
f = None
try:
@@ -63,14 +69,10 @@ def __init__(self, conffile):
self.elastic_cluster = line.split(':')[1].strip()
def dump(self):
- logging.info( '')
- logging.info( 'conf.user '+self.user)
- logging.info( 'conf.role '+ self.role)
- logging.info( 'conf.cmssw_base '+ self.cmssw_base)
- logging.info( '')
+ logging.info( '')
def autodetect_parameters(self):
- if not self.role and 'bu' in os.uname()[1]:
+ if not self.role and (os.uname()[1].startswith('bu-') or os.uname()[1].startswith('dvbu-')):
self.role = 'bu'
elif not self.role:
self.role = 'fu'
@@ -78,5 +80,12 @@ def autodetect_parameters(self):
if self.role == 'bu': self.watch_directory='/fff/ramdisk'
if self.role == 'fu': self.watch_directory='/fff/data'
+def initConf(instance='main'):
+ conf=None
+ try:
+ if instance!='main':
+ conf = hltdConf('/etc/hltd-'+instance+'.conf')
+ except:pass
+ if conf==None and instance=='main': conf = hltdConf('/etc/hltd.conf')
+ return conf
-conf = hltdConf('/etc/hltd.conf')
diff --git a/python/logcollector.py b/python/logcollector.py
index 4662558..ac82a9e 100755
--- a/python/logcollector.py
+++ b/python/logcollector.py
@@ -15,7 +15,7 @@
import _inotify as inotify
import threading
import Queue
-import json
+import simplejson as json
import logging
import collections
import subprocess
@@ -733,12 +733,7 @@ def __init__(self,es_server_url):
ip_url=getURLwithIP(es_server_url)
self.es = ElasticSearch(ip_url)
#update in case of new documents added to mapping definition
- for key in mappings.central_hltdlogs_mapping:
- doc = mappings.central_hltdlogs_mapping[key]
- res = requests.get(ip_url+'/'+self.index_name+'/'+key+'/_mapping')
- #only update if mapping is empty
- if res.status_code==200 and res.content.strip()=='{}':
- requests.post(ip_url+'/'+self.index_name+'/'+key+'/_mapping',str(doc))
+ self.updateMappingMaybe(ip_url)
break
except (ElasticHttpError,ConnectionError,Timeout) as ex:
#try to reconnect with different IP from DNS load balancing
@@ -783,6 +778,14 @@ def elasticize_log(self,type,severity,timestamp,msg):
self.es.index(self.index_name,'hltdlog',document)
except:
logger.warning('failed connection attempts to ' + self.es_server_url)
+
+ def updateMappingMaybe(self,ip_url):
+ for key in mappings.central_hltdlogs_mapping:
+ doc = mappings.central_hltdlogs_mapping[key]
+ res = requests.get(ip_url+'/'+self.index_name+'/'+key+'/_mapping')
+ #only update if mapping is empty
+ if res.status_code==200 and res.content.strip()=='{}':
+ requests.post(ip_url+'/'+self.index_name+'/'+key+'/_mapping',json.dumps(doc))
class HLTDLogParser(threading.Thread):
def __init__(self,dir,file,loglevel,esHandler,skipToEnd):
@@ -951,8 +954,14 @@ def registerSignal(eventRef):
if __name__ == "__main__":
+
+ import procname
+ procname.setprocname('logcol')
+
+ conf=initConf(sys.argv[1])
+
logging.basicConfig(filename=os.path.join(conf.log_dir,"logcollector.log"),
- level=logging.INFO,
+ level=conf.service_log_level,
format='%(levelname)s:%(asctime)s - %(funcName)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(os.path.basename(__file__))
@@ -988,9 +997,10 @@ def registerSignal(eventRef):
threadEvent = threading.Event()
registerSignal(threadEvent)
- hltdlogdir = '/var/log/hltd'
+ hltdlogdir = conf.log_dir
hltdlogs = ['hltd.log','anelastic.log','elastic.log','elasticbu.log']
- cmsswlogdir = '/var/log/hltd/pid'
+ cmsswlogdir = os.path.join(conf.log_dir,'pid')
+
mask = inotify.IN_CREATE
logger.info("starting CMSSW log collector for "+cmsswlogdir)
diff --git a/python/mappings.py b/python/mappings.py
index fbc6141..9da6b62 100644
--- a/python/mappings.py
+++ b/python/mappings.py
@@ -139,7 +139,7 @@
'processed' :{'type':'integer'},
'accepted' :{'type':'integer'},
'errorEvents' :{'type':'integer'},
- 'size' :{'type':'integer'},
+ 'size' :{'type':'long'},
}
},
'macromerge' : {
@@ -154,7 +154,7 @@
'processed' :{'type':'integer'},
'accepted' :{'type':'integer'},
'errorEvents' :{'type':'integer'},
- 'size' :{'type':'integer'},
+ 'size' :{'type':'long'},
}
}
@@ -165,17 +165,22 @@
'properties' : {
'fm_date' :{'type':'date'},
'id' :{'type':'string'},
+ 'host' :{'type':'string',"index":"not_analyzed"},
+ 'appliance' :{'type':'string',"index":"not_analyzed"},
+ 'instance' :{'type':'string',"index":"not_analyzed"},
'broken' :{'type':'integer'},
'used' :{'type':'integer'},
'idles' :{'type':'integer'},
'quarantined' :{'type':'integer'},
+ 'cloud' :{'type':'integer'},
'usedDataDir' :{'type':'integer'},
'totalDataDir' :{'type':'integer'},
'usedRamdisk' :{'type':'integer'},
'totalRamdisk' :{'type':'integer'},
'usedOutput' :{'type':'integer'},
'totalOutput' :{'type':'integer'},
- 'activeRuns' :{'type':'string'}
+ 'activeRuns' :{'type':'string'},
+ 'activeRunsErrors':{'type':'string',"index":"not_analyzed"},
},
'_timestamp' : {
'enabled' : True,
@@ -193,6 +198,7 @@
'used' :{'type':'integer'},
'idles' :{'type':'integer'},
'quarantined' :{'type':'integer'},
+ 'cloud' :{'type':'integer'},
'usedDataDir' :{'type':'integer'},
'totalDataDir' :{'type':'integer'},
'usedRamdisk' :{'type':'integer'},
@@ -200,38 +206,17 @@
'usedOutput' :{'type':'integer'},
'totalOutput' :{'type':'integer'},
'activeRuns' :{'type':'string'},
- 'hosts' :{'type':'string',"index":"not_analyzed"}
- },
- '_timestamp' : {
- 'enabled' : True,
- 'store' : "yes",
- "path" : "fm_date"
- }
- },
- 'boxinfo_last' : {#deprecated
- '_id' :{'path':'id'},
- 'properties' : {
- 'fm_date' :{'type':'date'},
- 'id' :{'type':'string'},
- 'broken' :{'type':'integer'},
- 'used' :{'type':'integer'},
- 'idles' :{'type':'integer'},
- 'quarantined' :{'type':'integer'},
- 'usedDataDir' :{'type':'integer'},
- 'totalDataDir' :{'type':'integer'},
- 'usedRamdisk' :{'type':'integer'},
- 'totalRamdisk' :{'type':'integer'},
- 'usedOutput' :{'type':'integer'},
- 'totalOutput' :{'type':'integer'},
- 'activeRuns' :{'type':'string'}
+ 'hosts' :{'type':'string',"index":"not_analyzed"},
+ 'blacklistedHosts':{'type':'string',"index":"not_analyzed"},
+ 'host' :{'type':'string',"index":"not_analyzed"},
+ 'instance' :{'type':'string',"index":"not_analyzed"}
},
'_timestamp' : {
'enabled' : True,
'store' : "yes",
"path" : "fm_date"
}
- }
-
+ },
}
diff --git a/python/setupmachine.py b/python/setupmachine.py
index cf5dde4..9e875c9 100755
--- a/python/setupmachine.py
+++ b/python/setupmachine.py
@@ -2,6 +2,9 @@
import os,sys,socket
import shutil
+import json
+import subprocess
+import shutil
import time
@@ -34,10 +37,6 @@
dblogin = 'empty'
dbpwd = 'empty'
equipmentSet = 'latest'
-default_eqset_daq2val = 'eq_140325_attributes'
-#default_eqset_daq2 = 'eq_140430_mounttest'
-#default_eqset_daq2 = 'eq_14-508_emu'
-default_eqset_daq2 = 'eq_140522_emu'
minidaq_list = ["bu-c2f13-21-01","bu-c2f13-23-01","bu-c2f13-25-01","bu-c2f13-27-01",
"fu-c2f13-17-01","fu-c2f13-17-02","fu-c2f13-17-03","fu-c2f13-17-04"
"fu-c2f13-19-01","fu-c2f13-19-02","fu-c2f13-19-03","fu-c2f13-19-04"]
@@ -45,27 +44,52 @@
"fu-c2f13-39-03","fu-c2f13-39-04"]
ed_list = ["bu-c2f13-29-01","fu-c2f13-41-01","fu-c2f13-41-02",
"fu-c2f13-41-03","fu-c2f13-41-04"]
+
+#es_cdaq_list = ["srv-c2a11-07-01","srv-c2a11-08-01","srv-c2a11-09-01","srv-c2a11-10-01",
+# "srv-c2a11-11-01","srv-c2a11-14-01","srv-c2a11-15-01","srv-c2a11-16-01",
+# "srv-c2a11-17-01","srv-c2a11-18-01","srv-c2a11-19-01","srv-c2a11-20-01",
+# "srv-c2a11-21-01","srv-c2a11-22-01","srv-c2a11-23-01","srv-c2a11-26-01",
+# "srv-c2a11-27-01","srv-c2a11-28-01","srv-c2a11-29-01","srv-c2a11-30-01"]
+#
+#es_tribe_list = ["srv-c2a11-31-01","srv-c2a11-32-01","srv-c2a11-33-01","srv-c2a11-34-01",
+# "srv-c2a11-35-01","srv-c2a11-38-01","srv-c2a11-39-01","srv-c2a11-40-01",
+# "srv-c2a11-41-01","srv-c2a11-42-01"]
+
+tribe_ignore_list = ['bu-c2f13-29-01','bu-c2f13-31-01']
+
myhost = os.uname()[1]
-def countCPUs():
- fp=open('/proc/cpuinfo','r')
- resource_count = 0
- for line in fp:
- if line.startswith('processor'):
- resource_count+=1
- return resource_count
+#testing dual mount point
+vm_override_buHNs = {
+ "fu-vm-01-01.cern.ch":["bu-vm-01-01","bu-vm-01-01"],
+ "fu-vm-01-02.cern.ch":["bu-vm-01-01"],
+ "fu-vm-02-01.cern.ch":["bu-vm-01-01","bu-vm-01-01"],
+ "fu-vm-02-02.cern.ch":["bu-vm-01-01"]
+ }
def getmachinetype():
#print "running on host ",myhost
if myhost.startswith('dvrubu-') or myhost.startswith('dvfu-') : return 'daq2val','fu'
elif myhost.startswith('dvbu-') : return 'daq2val','bu'
- elif myhost.startswith('bu-') : return 'daq2','bu'
elif myhost.startswith('fu-') : return 'daq2','fu'
- elif myhost.startswith('cmsdaq-401b28') : return 'test','fu'
- elif myhost.startswith('dvfu-') : return 'test','fu'
+ elif myhost.startswith('bu-') : return 'daq2','bu'
+ elif myhost.startswith('srv-') :
+ try:
+ es_cdaq_list = socket.gethostbyname_ex('es-cdaq')[2]
+ es_tribe_list = socket.gethostbyname_ex('es-tribe')[2]
+ myaddr = socket.gethostbyname(myhost)
+ if myaddr in es_cdaq_list:
+ return 'es','escdaq'
+ elif myaddr in es_tribe_list:
+ return 'es','tribe'
+ else:
+ return 'unknown','unknown'
+ except socket.gaierror, ex:
+ print 'dns lookup error ',str(ex)
+ raise ex
else:
- print "debug"
+ print "unknown machine type"
return 'unknown','unknown'
@@ -94,7 +118,7 @@ def checkModifiedConfigInFile(file):
else:zone=tzones[0]
for l in lines:
- if l.strip().startswith("#edited by fff meta rpm at "+getTimeString()):
+ if l.strip().startswith("#edited by fff meta rpm"):
return True
return False
@@ -102,24 +126,36 @@ def checkModifiedConfigInFile(file):
def checkModifiedConfig(lines):
for l in lines:
- if l.strip().startswith("#edited by fff meta rpm at "+getTimeString()):
+ if l.strip().startswith("#edited by fff meta rpm"):
return True
return False
-
+
+
+#alternates between two data interface indices based on host naming convention
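+#assumption: host names end in two numeric fields (e.g. fu-c2f13-39-03); anything else falls back to index 0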
+def name_identifier():
+ try:
+ nameParts = os.uname()[1].split('-')
+ return (int(nameParts[-1]) * (int(nameParts[-2])/2)) % 2
+ except:
+ return 0
+
+
def getBUAddr(parentTag,hostname):
global equipmentSet
#con = cx_Oracle.connect('CMS_DAQ2_TEST_HW_CONF_W/'+dbpwd+'@'+dbhost+':10121/int2r_lb.cern.ch',
- #equipmentSet = 'eq_140325_attributes'
-
- if equipmentSet == 'default':
- if parentTag == 'daq2val':
- equipmentSet = default_eqset_daq2val
- if parentTag == 'daq2':
- equipmentSet = default_eqset_daq2
if env == "vm":
+
+ try:
+ #cluster in openstack that is not (yet) in mysql
+ retval = []
+ for bu_hn in vm_override_buHNs[hostname]:
+ retval.append(["myBU",bu_hn])
+ return retval
+ except:
+ pass
con = MySQLdb.connect( host= dbhost, user = dblogin, passwd = dbpwd, db = dbsid)
else:
if parentTag == 'daq2':
@@ -131,7 +167,7 @@ def getBUAddr(parentTag,hostname):
con = cx_Oracle.connect(dblogin+'/'+dbpwd+'@'+dbhost+':10121/'+dbsid,
cclass="FFFSETUP",purity = cx_Oracle.ATTR_PURITY_SELF)
else:
- con = cx_Oracle.connect('CMS_DAQ2_TEST_HW_CONF_W/'+dbpwd+'@int2r2-v.cern.ch:10121/int2r_lb.cern.ch',
+ con = cx_Oracle.connect('CMS_DAQ2_TEST_HW_CONF_R/'+dbpwd+'@int2r2-v.cern.ch:10121/int2r_lb.cern.ch',
cclass="FFFSETUP",purity = cx_Oracle.ATTR_PURITY_SELF)
#print con.version
@@ -175,7 +211,6 @@ def getBUAddr(parentTag,hostname):
cur.execute(qstring)
else:
print "query equipment set",parentTag+'/'+equipmentSet
- #print '\n',qstring2
cur.execute(qstring2)
retval = []
@@ -185,19 +220,64 @@ def getBUAddr(parentTag,hostname):
#print retval
return retval
+def getAllBU(requireFU=False):
+
+ #setups = ['daq2','daq2val']
+ parentTag = 'daq2'
+ if True:
+ #if parentTag == 'daq2':
+ if dbhost.strip()=='null':
+ #con = cx_Oracle.connect('CMS_DAQ2_HW_CONF_W','pwd','cms_rcms',
+ con = cx_Oracle.connect(dblogin,dbpwd,dbsid,
+ cclass="FFFSETUP",purity = cx_Oracle.ATTR_PURITY_SELF)
+ else:
+ con = cx_Oracle.connect(dblogin+'/'+dbpwd+'@'+dbhost+':10121/'+dbsid,
+ cclass="FFFSETUP",purity = cx_Oracle.ATTR_PURITY_SELF)
+ #else:
+ # con = cx_Oracle.connect('CMS_DAQ2_TEST_HW_CONF_W/'+dbpwd+'@int2r2-v.cern.ch:10121/int2r_lb.cern.ch',
+ # cclass="FFFSETUP",purity = cx_Oracle.ATTR_PURITY_SELF)
+
+ cur = con.cursor()
+ retval = []
+ if requireFU==False:
+ qstring= "select dnsname from DAQ_EQCFG_DNSNAME where (dnsname like 'bu-%' OR dnsname like '__bu-%') \
+ AND eqset_id = (select eqset_id from DAQ_EQCFG_EQSET where tag='"+parentTag.upper()+"' AND \
+ ctime = (SELECT MAX(CTIME) FROM DAQ_EQCFG_EQSET WHERE tag='"+parentTag.upper()+"'))"
+
+ else:
+ qstring = "select attr_value from \
+ DAQ_EQCFG_HOST_ATTRIBUTE ha, \
+ DAQ_EQCFG_HOST_NIC hn, \
+ DAQ_EQCFG_DNSNAME d \
+ where \
+ ha.eqset_id=hn.eqset_id AND \
+ hn.eqset_id=d.eqset_id AND \
+ ha.host_id = hn.host_id AND \
+ ha.attr_name like 'myBU%' AND \
+ hn.nic_id = d.nic_id AND \
+ d.dnsname like 'fu-%' \
+ AND d.eqset_id = (select eqset_id from DAQ_EQCFG_EQSET \
+ where tag='"+parentTag.upper()+"' AND \
+ ctime = (SELECT MAX(CTIME) FROM DAQ_EQCFG_EQSET WHERE tag='"+parentTag.upper()+"'))"
+
+
+
+
+ cur.execute(qstring)
+
+ for res in cur:
+ retval.append(res[0])
+ cur.close()
+ retval = sorted(list(set(map(lambda v: v.split('.')[0], retval))))
+ print retval
+ return retval
+
def getSelfDataAddr(parentTag):
global equipmentSet
#con = cx_Oracle.connect('CMS_DAQ2_TEST_HW_CONF_W/'+dbpwd+'@'+dbhost+':10121/int2r_lb.cern.ch',
- #equipmentSet = 'eq_140325_attributes'
-
- if equipmentSet == 'default':
- if parentTag == 'daq2val':
- equipmentSet = default_eqset_daq2val
- if parentTag == 'daq2':
- equipmentSet = default_eqset_daq2
con = cx_Oracle.connect(dblogin+'/'+dbpwd+'@'+dbhost+':10121/'+dbsid,
cclass="FFFSETUP",purity = cx_Oracle.ATTR_PURITY_SELF)
@@ -235,13 +315,27 @@ def getSelfDataAddr(parentTag):
return retval
+def getInstances(hostname):
+ #instance.input example:
+ #{"cmsdaq-401b28.cern.ch":{"names":["main","ecal"],"sizes":[40,20]}} #size is in megabytes
+ #BU can have multiple instances, FU should have only one specified. If none, any host is assumed to have only main instance
+ try:
+ with open('/opt/fff/instances.input','r') as fi:
+ doc = json.load(fi)
+ return doc[hostname]['names'],doc[hostname]['sizes']
+ except:
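+ #fall back to a single 'main' instance when instances.input is missing or unreadable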
+ return ["main"],0
+
class FileManager:
- def __init__(self,file,sep,edited,os1='',os2=''):
+ def __init__(self,file,sep,edited,os1='',os2='',recreate=False):
self.name = file
- f = open(file,'r')
- self.lines = f.readlines()
- f.close()
+ if recreate==False:
+ f = open(file,'r')
+ self.lines = f.readlines()
+ f.close()
+ else:
+ self.lines=[]
self.sep = sep
self.regs = []
self.remove = []
@@ -259,7 +353,7 @@ def removeEntry(self,key):
def commit(self):
out = []
if self.edited == False:
- out.append('#edited by fff meta rpm\n')
+ out.append('#edited by fff meta rpm at '+getTimeString()+'\n')
#first removing elements
for rm in self.remove:
@@ -298,6 +392,8 @@ def commit(self):
if insertionDone == False:
self.lines.append(toAdd)
for l in self.lines:
+ #already written
+ if l.startswith("#edited by fff meta rpm"):continue
out.append(l)
#print "file ",self.name,"\n\n"
#for o in out: print o
@@ -344,11 +440,6 @@ def restoreFileMaybe(file):
if 'elasticsearch' in selection:
restoreFileMaybe(elasticsysconf)
restoreFileMaybe(elasticconf)
- if 'hltd' in selection:
- try:
- os.remove(os.path.join(backup_dir,os.path.basename(busconfig)))
- except:
- pass
sys.exit(0)
@@ -444,13 +535,15 @@ def restoreFileMaybe(file):
dqmmachine = 'False'
execdir = '/opt/hltd'
resourcefract = '0.5'
+
if cluster == 'daq2val':
- runindex_name = 'dv'
+ runindex_name = 'dv'
elif cluster == 'daq2':
runindex_name = 'cdaq'
if myhost in minidaq_list:
runindex_name = 'minidaq'
if myhost in dqm_list or myhost in ed_list:
+
use_elasticsearch = 'False'
runindex_name = 'dqm'
cmsswloglevel = 'DISABLED'
@@ -470,66 +563,35 @@ def restoreFileMaybe(file):
cmssw_base = '/home/dqmdevlocal'
execdir = '/home/dqmdevlocal/output' ##not yet
- #hardcode minidaq hosts until role is available
- #if cnhostname == 'bu-c2f13-27-01.cms' or cnhostname == 'fu-c2f13-19-03.cms' or cnhostname == 'fu-c2f13-19-04.cms':
- # runindex_name = 'runindex_minidaq'
- #hardcode dqm hosts until role is available
- #if cnhostname == 'bu-c2f13-31-01.cms' or cnhostname == 'fu-c2f13-39-01.cms' or cnhostname == 'fu-c2f13-39-02.cms' or cnhostname == 'fu-c2f13-39-03.cms' or cnhostname == 'fu-c2f13-39-04.cms':
- # runindex_name = 'runindex_dqm'
- else:
- runindex_name = 'test'
+ buName = None
+ buDataAddr=[]
- buName = ''
- budomain = ''
if type == 'fu':
- if cluster == 'daq2val' or cluster == 'daq2':
- addrList = getBUAddr(cluster,cnhostname)
- selectedAddr = False
- for addr in addrList:
- #result = os.system("ping -c 1 "+ str(addr[1])+" >& /dev/null")
- result = 0#ping disabled for now
- #os.system("clear")
- if result == 0:
- buDataAddr = addr[1]
- if addr[1].find('.'):
- buName = addr[1].split('.')[0]
- budomain = addr[1][addr[1].find('.'):]
- else:
- buName = addr[1]
- selectedAddr=True
- break
- else:
- print "failed to ping",str(addr[1])
+ if cluster == 'daq2val' or cluster == 'daq2':
+ for addr in getBUAddr(cluster,cnhostname):
+ if buName==None:
+ buName = addr[1].split('.')[0]
+ elif buName != addr[1].split('.')[0]:
+ print "BU name not same for all interfaces:",buName,buNameCheck
+ continue
+ buDataAddr.append(addr[1])
#if none are pingable, first one is picked
- if selectedAddr==False:
- if len(addrList)>0:
- addr = addrList[0]
- buDataAddr = addr[1]
- if addr[1].find('.'):
- buName = addr[1].split('.')[0]
- else:
- buName = addr[1]
- if buName == '':
+ if buName == None or len(buDataAddr)==0:
print "no BU found for this FU in the dabatase"
sys.exit(-1)
+ else:
+ print "FU configuration in cluster",cluster,"not supported yet !!"
+ sys.exit(-2)
- elif cluster =='test':
- hn = os.uname()[1].split(".")[0]
- addrList = [hn]
- buName = hn
- buDataAddr = hn
- else:
- print "FU configuration in cluster",cluster,"not supported yet !!"
- sys.exit(-2)
-
elif type == 'bu':
if env == "vm":
buName = os.uname()[1].split(".")[0]
else:
buName = os.uname()[1]
- addrList = buName
+ elif type == 'tribe':
+ buDataAddr = getAllBU(requireFU=False)
+ buName='es-tribe'
- #print "detected address", addrList," and name ",buName
print "running configuration for machine",cnhostname,"of type",type,"in cluster",cluster,"; appliance bu is:",buName
clusterName='appliance_'+buName
@@ -543,7 +605,7 @@ def restoreFileMaybe(file):
#print "will modify sysconfig elasticsearch configuration"
#maybe backup vanilla versions
essysEdited = checkModifiedConfigInFile(elasticsysconf)
- if essysEdited == False and type == 'fu': #modified only on FU
+ if essysEdited == False:
#print "elasticsearch sysconfig configuration was not yet modified"
shutil.copy(elasticsysconf,os.path.join(backup_dir,os.path.basename(elasticsysconf)))
@@ -551,97 +613,224 @@ def restoreFileMaybe(file):
if esEdited == False:
shutil.copy(elasticconf,os.path.join(backup_dir,os.path.basename(elasticconf)))
- escfg = FileManager(elasticconf,':',esEdited,'',' ')
+ if type == 'fu' or type == 'bu':
- escfg.reg('cluster.name',clusterName)
- escfg.reg('node.name',cnhostname)
- essyscfg = FileManager(elasticsysconf,'=',essysEdited)
- essyscfg.reg('ES_HEAP_SIZE','1G')
- essyscfg.commit()
+ essyscfg = FileManager(elasticsysconf,'=',essysEdited)
+ essyscfg.reg('ES_HEAP_SIZE','1G')
+ essyscfg.commit()
- if type == 'fu':
+ escfg = FileManager(elasticconf,':',esEdited,'',' ')
+ escfg.reg('cluster.name',clusterName)
+ escfg.reg('node.name',cnhostname)
escfg.reg('discovery.zen.ping.multicast.enabled','false')
- if env=="vm":
- escfg.reg('discovery.zen.ping.unicast.hosts',"[\"" + buName + "\"]")
- else:
- escfg.reg('discovery.zen.ping.unicast.hosts',"[\"" + buName + ".cms" + "\"]")
escfg.reg('network.publish_host',es_publish_host)
escfg.reg('transport.tcp.compress','true')
- escfg.reg('indices.fielddata.cache.size', '50%')
- if cluster != 'test':
+
+ if type == 'fu':
+ if env=="vm":
+ escfg.reg('discovery.zen.ping.unicast.hosts',"[\"" + buName + "\"]")
+ else:
+ escfg.reg('discovery.zen.ping.unicast.hosts',"[\"" + buName + ".cms" + "\"]")
+ escfg.reg('indices.fielddata.cache.size', '50%')
escfg.reg('node.master','false')
escfg.reg('node.data','true')
- if type == 'bu':
- escfg.reg('network.publish_host',es_publish_host)
- #escfg.reg('discovery.zen.ping.multicast.enabled','false')
- #escfg.reg('discovery.zen.ping.unicast.hosts','[ \"'+elastic_host2+'\" ]')
+ if type == 'bu':
+ #escfg.reg('discovery.zen.ping.unicast.hosts','[ \"'+elastic_host2+'\" ]')
+ escfg.reg('node.master','true')
+ escfg.reg('node.data','false')
+ escfg.commit()
+
+ if type == 'tribe':
+ essyscfg = FileManager(elasticsysconf,'=',essysEdited)
+ essyscfg.reg('ES_HEAP_SIZE','12G')
+ essyscfg.commit()
+
+ escfg = FileManager(elasticconf,':',esEdited,'',' ',recreate=True)
+ escfg.reg('cluster.name','es-tribe')
+ escfg.reg('discovery.zen.ping.multicast.enabled','false')
+ #escfg.reg('discovery.zen.ping.unicast.hosts','['+','.join(buDataAddr)+']')
+ escfg.reg('transport.tcp.compress','true')
+ bustring = "["
+ for bu in buDataAddr:
+ if bu in tribe_ignore_list:continue
+
+ try:
+ socket.gethostbyname_ex(bu+'.cms')
+ except:
+ print "skipping",bu," - unable to lookup IP address"
+ continue
+ if bustring!="[":bustring+=','
+ bustring+='"'+bu+'.cms'+'"'
+ bustring += "]"
+ escfg.reg('discovery.zen.ping.unicast.hosts',bustring)
+
+ escfg.reg('tribe','')
+ i=1;
+ for bu in buDataAddr:
+ if bu in tribe_ignore_list:continue
+
+ try:
+ socket.gethostbyname_ex(bu+'.cms')
+ except:
+ # print "skipping",bu," - unable to lookup IP address"
+ continue
+
+ escfg.reg(' t'+str(i),'')
+ #escfg.reg(' discovery.zen.ping.unicast.hosts', '["'+bu+'.cms"]')
+ escfg.reg(' cluster.name', 'appliance_'+bu)
+ i=i+1
+ escfg.commit()
+
+ if type == 'escdaq':
+ essyscfg = FileManager(elasticsysconf,'=',essysEdited)
+ essyscfg.reg('ES_HEAP_SIZE','10G')
+ essyscfg.commit()
+
+ escfg = FileManager(elasticconf,':',esEdited,'',' ',recreate=True)
+ escfg.reg('cluster.name','es-cdaq')
+ escfg.reg('discovery.zen.minimum_master_nodes','11')
+ escfg.reg('index.mapper.dynamic','false')
+ escfg.reg('action.auto_create_index','false')
escfg.reg('transport.tcp.compress','true')
escfg.reg('node.master','true')
- escfg.reg('node.data','false')
+ escfg.reg('node.data','true')
+ escfg.commit()
- escfg.commit()
if "hltd" in selection:
#first prepare bus.config file
if type == 'fu':
- try:
- shutil.copy(busconfig,os.path.join(backup_dir,os.path.basename(busconfig)))
- os.remove(busconfig)
- except Exception,ex:
- print "problem with copying bus.config? ",ex
- pass
+
+ #permissive:try to remove old bus.config
+ try:os.remove(os.path.join(backup_dir,os.path.basename(busconfig)))
+ except:pass
+ try:os.remove(busconfig)
+ except:pass
#write bu ip address
- print "WRITING BUS CONFIG ", busconfig
f = open(busconfig,'w+')
- f.writelines(getIPs(buDataAddr)[0])
+
+ #swap entries based on name (only C6100 hosts with two data interfaces):
+ if len(buDataAddr)>1 and name_identifier()==1:
+ temp = buDataAddr[0]
+ buDataAddr[0]=buDataAddr[1]
+ buDataAddr[1]=temp
+
+ newline=False
+ for addr in buDataAddr:
+ if newline:f.writelines('\n')
+ newline=True
+ f.writelines(getIPs(addr)[0])
+ #break after writing first entry. it is not yet safe to use secondary interface
+ break
f.close()
+ #FU should have one instance assigned, BUs can have multiple
+ watch_dir_bu = '/fff/ramdisk'
+ out_dir_bu = '/fff/output'
+ log_dir_bu = '/var/log/hltd'
+
+ instances,sizes=getInstances(os.uname()[1])
+ if len(instances)==0: instances=['main']
+
hltdEdited = checkModifiedConfigInFile(hltdconf)
- #print "was modified?",hltdEdited
+
if hltdEdited == False:
shutil.copy(hltdconf,os.path.join(backup_dir,os.path.basename(hltdconf)))
- hltdcfg = FileManager(hltdconf,'=',hltdEdited,' ',' ')
- hltdcfg.reg('enabled','True','[General]')
if type=='bu':
+ try:os.remove('/etc/hltd.instances')
+ except:pass
+
+ #do major ramdisk cleanup (unmount existing loop mount points, run directories and img files)
+ try:
+ subprocess.check_call(['/opt/hltd/scripts/unmountloopfs.sh','/fff/ramdisk'])
+ #delete existing run directories to ensure there is space (if this machine has a non-main instance)
+ if instances!=["main"]:
+ os.popen('rm -rf /fff/ramdisk/run*')
+ except subprocess.CalledProcessError, err1:
+ print 'failed to cleanup ramdisk',err1
+ except Exception as ex:
+ print 'failed to cleanup ramdisk',ex
+
+ cgibase=9000
+
+ for idx,val in enumerate(instances):
+ if idx!=0 and val=='main':
+ instances[idx]=instances[0]
+ instances[0]=val
+ break
+ for idx, instance in enumerate(instances):
+
+ watch_dir_bu = '/fff/ramdisk'
+ out_dir_bu = '/fff/output'
+ log_dir_bu = '/var/log/hltd'
+
+ cfile = hltdconf
+ if instance != 'main':
+ cfile = '/etc/hltd-'+instance+'.conf'
+ shutil.copy(hltdconf,cfile)
+ watch_dir_bu = os.path.join(watch_dir_bu,instance)
+ out_dir_bu = os.path.join(out_dir_bu,instance)
+ log_dir_bu = os.path.join(log_dir_bu,instance)
+
+ #run loopback setup for non-main instances (is done on every boot since ramdisk is volatile)
+ try:
+ subprocess.check_call(['/opt/hltd/scripts/makeloopfs.sh','/fff/ramdisk',instance, str(sizes[idx])])
+ except subprocess.CalledProcessError, err1:
+ print 'failed to configure loopback device mount in ramdisk'
+
+ soap2file_port='0'
+
+ if myhost in dqm_list or myhost in ed_list or cluster == 'daq2val' or env=='vm':
+ soap2file_port='8010'
+
+ hltdcfg = FileManager(cfile,'=',hltdEdited,' ',' ')
+
+ hltdcfg.reg('enabled','True','[General]')
+ hltdcfg.reg('role','bu','[General]')
- #get needed info here
hltdcfg.reg('user',username,'[General]')
- hltdcfg.reg('cgi_port','9000','[Web]')
+ hltdcfg.reg('instance',instance,'[General]')
+
+ #port for multiple instances
+ hltdcfg.reg('cgi_port',str(cgibase+idx),'[Web]')
+ hltdcfg.reg('cgi_instance_port_offset',str(idx),'[Web]')
+ hltdcfg.reg('soap2file_port',soap2file_port,'[Web]')
+
hltdcfg.reg('elastic_cluster',clusterName,'[Monitoring]')
- hltdcfg.reg('watch_directory','/fff/ramdisk','[General]')
- hltdcfg.reg('role','bu','[General]')
- hltdcfg.reg('micromerge_output','/fff/output','[General]')
+ hltdcfg.reg('watch_directory',watch_dir_bu,'[General]')
+ #hltdcfg.reg('micromerge_output',out_dir_bu,'[General]')
hltdcfg.reg('elastic_runindex_url',elastic_host,'[Monitoring]')
hltdcfg.reg('elastic_runindex_name',runindex_name,'[Monitoring]')
hltdcfg.reg('use_elasticsearch',use_elasticsearch,'[Monitoring]')
hltdcfg.reg('es_cmssw_log_level',cmsswloglevel,'[Monitoring]')
hltdcfg.reg('dqm_machine',dqmmachine,'[DQM]')
- #hltdcfg.removeEntry('watch_directory')
+ hltdcfg.reg('log_dir',log_dir_bu,'[Logs]')
hltdcfg.commit()
- #remove /fff/data from BU (hack)
- try:
- shutil.rmtree('/fff/data')
- except:
- pass
+
+ #write all instances in a file
+ if 'main' not in instances or len(instances)>1:
+ with open('/etc/hltd.instances',"w") as fi:
+ for instance in instances: fi.write(instance+"\n")
+
if type=='fu':
+ hltdcfg = FileManager(hltdconf,'=',hltdEdited,' ',' ')
- #max_cores_done = False
- #do_max_cores = True
- #num_max_cores = countCPUs()
+ hltdcfg.reg('enabled','True','[General]')
+ hltdcfg.reg('role','fu','[General]')
- #num_threads_done = False
- #do_num_threads = True
- #num_threads = nthreads
- hltdcfg.reg('exec_directory',execdir,'[General]')
hltdcfg.reg('user',username,'[General]')
+ #FU can only have one instance (so we take instance[0] and ignore others)
+ hltdcfg.reg('instance',instances[0],'[General]')
+
+ hltdcfg.reg('exec_directory',execdir,'[General]')
hltdcfg.reg('watch_directory','/fff/data','[General]')
- hltdcfg.reg('role','fu','[General]')
hltdcfg.reg('cgi_port','9000','[Web]')
- #hltdcfg.reg('mount_options_output','rw,vers=4,rsize=65536,wsize=65536,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys','[General]')
+ hltdcfg.reg('cgi_instance_port_offset',"0",'[Web]')
+ hltdcfg.reg('soap2file_port','0','[Web]')
hltdcfg.reg('elastic_cluster',clusterName,'[Monitoring]')
hltdcfg.reg('es_cmssw_log_level',cmsswloglevel,'[Monitoring]')
hltdcfg.reg('elastic_runindex_url',elastic_host,'[Monitoring]')
@@ -653,6 +842,11 @@ def restoreFileMaybe(file):
hltdcfg.reg('cmssw_threads',nthreads,'[CMSSW]')
hltdcfg.reg('cmssw_streams',nfwkstreams,'[CMSSW]')
hltdcfg.reg('resource_use_fraction',resourcefract,'[Resources]')
- #hltdcfg.removeEntry('watch_directory')
hltdcfg.commit()
+ if "web" in selection:
+ try:os.rmdir('/var/www/html')
+ except:
+ try:os.unlink('/var/www/html')
+ except:pass
+ os.symlink('/es-web','/var/www/html')
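
The BU branch above derives one hltd configuration per instance: 'main' keeps /etc/hltd.conf and the base directories, while any extra instance gets /etc/hltd-<name>.conf, per-instance watch/output/log subdirectories, and a CGI port offset from 9000. A minimal sketch of that mapping (not part of the diff), assuming a hypothetical host carrying instances 'main' and 'bu2':

# Sketch of the per-instance values derived above; names are illustrative.
import os

CGI_BASE = 9000

def instance_layout(instances):
    layout = []
    for idx, instance in enumerate(instances):
        if instance == 'main':
            cfile = '/etc/hltd.conf'
            watch, out, log = '/fff/ramdisk', '/fff/output', '/var/log/hltd'
        else:
            cfile = '/etc/hltd-' + instance + '.conf'
            watch = os.path.join('/fff/ramdisk', instance)
            out = os.path.join('/fff/output', instance)
            log = os.path.join('/var/log/hltd', instance)
        layout.append({'instance': instance,
                       'conf': cfile,
                       'watch_directory': watch,
                       'output': out,
                       'log_dir': log,
                       'cgi_port': CGI_BASE + idx,
                       'cgi_instance_port_offset': idx})
    return layout

if __name__ == '__main__':
    for entry in instance_layout(['main', 'bu2']):
        print entry
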
diff --git a/python/soap2file b/python/soap2file
new file mode 100755
index 0000000..9126c7d
--- /dev/null
+++ b/python/soap2file
@@ -0,0 +1,64 @@
+#!/bin/env python
+#
+# chkconfig: 2345 81 03
+#
+
+import sys
+import SOAPpy
+import time
+from subprocess import Popen
+from subprocess import PIPE
+
+sys.path.append('/opt/hltd/python')
+#sys.path.append('/opt/hltd/lib')
+
+from soap2file import Soap2file
+
+
+def startService(daemon):
+ proc = Popen(["/opt/hltd/python/soap2file.py"], stdout=PIPE)
+ output = proc.communicate()[0]
+ time.sleep(.1)
+ if daemon.silentStatus() and proc.returncode==0:
+ print 'Starting soap2file:\t\t\t\t\t [ \033[1;32mOK\033[0;39m ]'
+ else:
+ if proc.returncode==3:sys.exit(0)
+ print 'Starting soap2file instance: [ \033[1;32mFAILED\033[0;39m ]'
+ print output
+ sys.exit(1)
+
+
+
+if __name__ == "__main__":
+
+ soap2file = Soap2file()
+
+ if not soap2file.checkEnabled():
+ print "Soap2file service is disabled"
+ sys.exit(0)
+
+ if len(sys.argv) == 2:
+
+ if 'start' == sys.argv[1]:
+ startService(soap2file)
+
+ elif 'stop' == sys.argv[1]:
+ sys.stdout.write('Stopping soap2file:')
+ soap2file.stop()
+
+ elif 'restart' == sys.argv[1]:
+ sys.stdout.write('Stopping soap2file:')
+ soap2file.stop()
+ startService(soap2file)
+
+ elif 'status' == sys.argv[1]:
+ soap2file.status()
+
+ else:
+ print "Unknown command"
+ sys.exit(2)
+ sys.exit(0)
+ else:
+ print "usage: %s start|stop|restart|status" % sys.argv[0]
+ sys.exit(2)
+
diff --git a/python/soap2file.py b/python/soap2file.py
index d8e6cae..ca63a88 100755
--- a/python/soap2file.py
+++ b/python/soap2file.py
@@ -4,12 +4,11 @@
#
import os
-import pwd
import sys
import SOAPpy
sys.path.append('/opt/hltd/python')
-sys.path.append('/opt/hltd/lib')
+#sys.path.append('/opt/hltd/lib')
import demote
import hltdconf
@@ -30,7 +29,6 @@ def writeToFile(filename,content,overwrite):
except IOError as ex:
return "Failed to write data: "+str(ex)
-
def createDirectory(dirname):
try:
os.mkdir(dirname)
@@ -38,15 +36,25 @@ def createDirectory(dirname):
except OSError as ex:
return "Failed to create directory: "+str(ex)
+def renamePath(oldpath,newpath):
+ try:
+ os.rename(oldpath,newpath)
+ return "Success"
+ except Exception as ex:
+ return "Failed to rename file: "+str(ex)
class Soap2file(Daemon2):
- def __init__(self,pidfile):
- Daemon2.__init__(self,pidfile,'soap2file')
+ def __init__(self):
+ Daemon2.__init__(self,'soap2file','main','hltd')
#SOAPpy.Config.debug = 1
self._conf=hltdconf.hltdConf('/etc/hltd.conf')
self._hostname = os.uname()[1]
+ def checkEnabled(self):
+ if self._conf.soap2file_port>0:return True
+ return False
+
def run(self):
dem = demote.demote(self._conf.user)
dem()
@@ -54,43 +62,13 @@ def run(self):
server = SOAPpy.SOAPServer((self._hostname, self._conf.soap2file_port))
server.registerFunction(writeToFile)
server.registerFunction(createDirectory)
+ server.registerFunction(renamePath)
server.serve_forever()
if __name__ == "__main__":
-
- pidfile = '/var/run/soap2file.pid'
- soap2file = Soap2file(pidfile)
-
- if len(sys.argv) == 2:
-
- if 'start' == sys.argv[1]:
- try:
- soap2file.start()
- if soap2file.silentStatus():
- print '[OK]'
- else:
- print '[Failed]'
- except:
- pass
-
- elif 'stop' == sys.argv[1]:
- if soap2file.status():
- soap2file.stop()
- elif os.path.exists(pidfile):
- soap2file.delpid()
-
- elif 'restart' == sys.argv[1]:
- soap2file.restart()
-
- elif 'status' == sys.argv[1]:
- soap2file.status()
-
- else:
- print "Unknown command"
- sys.exit(2)
- sys.exit(0)
- else:
- print "usage: %s start|stop|restart|status" % sys.argv[0]
- sys.exit(2)
+ daemon = Soap2file()
+ import procname
+ procname.setprocname('soap2file')
+ daemon.start()
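
The rewritten soap2file daemon registers writeToFile, createDirectory and the new renamePath over SOAP on soap2file_port. A minimal client sketch (not part of the diff), assuming SOAPpy on the caller and a BU publishing the service on port 8010 as configured above; the host name and paths are placeholders:

# Call the soap2file service exposed by the daemon above.
import SOAPpy

bu_host = 'bu-c2e18-09-01.cms'   # hypothetical host
proxy = SOAPpy.SOAPProxy('http://%s:%d/' % (bu_host, 8010))

# createDirectory and renamePath return "Success" or an error string
print proxy.createDirectory('/fff/ramdisk/testdir')
print proxy.renamePath('/fff/ramdisk/testdir', '/fff/ramdisk/testdir.renamed')
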
diff --git a/python/testFUHistograms_cfg2.py b/python/testFUHistograms_cfg2.py
index 796b453..4d69212 100644
--- a/python/testFUHistograms_cfg2.py
+++ b/python/testFUHistograms_cfg2.py
@@ -1,178 +1,40 @@
-import FWCore.ParameterSet.Config as cms
-import FWCore.ParameterSet.VarParsing as VarParsing
-import DQMServices.Components.test.checkBooking as booking
-import DQMServices.Components.test.createElements as c
-import os,sys
-
-cmsswbase = os.path.expandvars('$CMSSW_BASE/')
-
-options = VarParsing.VarParsing ('analysis')
-
-options.register ('runNumber',
- 1, # default value
- VarParsing.VarParsing.multiplicity.singleton,
- VarParsing.VarParsing.varType.int, # string, int, or float
- "Run Number")
-
-options.register ('buBaseDir',
- '/fff/BU0', # default value
- VarParsing.VarParsing.multiplicity.singleton,
- VarParsing.VarParsing.varType.string, # string, int, or float
- "BU base directory")
-
-options.register ('dataDir',
- '/fff/data', # default value
- VarParsing.VarParsing.multiplicity.singleton,
- VarParsing.VarParsing.varType.string, # string, int, or float
- "FU data directory")
-
-options.register ('numThreads',
- 1, # default value
- VarParsing.VarParsing.multiplicity.singleton,
- VarParsing.VarParsing.varType.int, # string, int, or float
- "Number of CMSSW threads")
-
-options.register ('numFwkStreams',
- 1, # default value
- VarParsing.VarParsing.multiplicity.singleton,
- VarParsing.VarParsing.varType.int, # string, int, or float
- "Number of CMSSW streams")
-
-
-
-options.parseArguments()
-
-process = cms.Process("HLT")
-
-# load DQM
-process.load("DQMServices.Core.DQM_cfg")
-process.load("DQMServices.Components.DQMEnvironment_cfi")
-
-#b = booking.BookingParams(sys.argv)
-#b = booking.BookingParams(["CTOR","BJ","BR"])
-#b.doCheck(testOnly=False)
-
-elements = c.createElements()
-readRunElements = c.createReadRunElements()
-readLumiElements = c.createReadLumiElements()
-
+# /users/avetisya/LS1/DAQTest/HLT/V3 (CMSSW_7_2_1)
+import FWCore.ParameterSet.Config as cms
+process = cms.Process( "HLT" )
-process.maxEvents = cms.untracked.PSet(
- input = cms.untracked.int32(-1)
+process.HLTConfigVersion = cms.PSet(
+ tableName = cms.string('/users/avetisya/LS1/DAQTest/HLT/V3')
)
-process.options = cms.untracked.PSet(
- numberOfThreads = cms.untracked.uint32(options.numThreads),
- numberOfStreams = cms.untracked.uint32(options.numFwkStreams),
- multiProcesses = cms.untracked.PSet(
- maxChildProcesses = cms.untracked.int32(0)
- )
+process.streams = cms.PSet(
+ A = cms.vstring( 'A1' ),
+ B = cms.vstring( 'B' ),
+ DQM = cms.vstring( 'DQM1' )
+)
+process.datasets = cms.PSet(
+ A1 = cms.vstring( 'p1' ),
+ B = cms.vstring( 'p3' ),
+ DQM1 = cms.vstring( 'p2' )
)
-process.MessageLogger = cms.Service("MessageLogger",
- destinations = cms.untracked.vstring( 'cout' ),
- cout = cms.untracked.PSet( FwkReport =
- cms.untracked.PSet(reportEvery = cms.untracked.int32(10),
- optionalPSet = cms.untracked.bool(True),
- #limit = cms.untracked.int32(10000000)
- ),
- threshold = cms.untracked.string( "INFO" )
- )
- )
-
-process.FastMonitoringService = cms.Service("FastMonitoringService",
- sleepTime = cms.untracked.int32(1),
- microstateDefPath = cms.untracked.string( cmsswbase+'/src/EventFilter/Utilities/plugins/microstatedef.jsd' ),
- #fastMicrostateDefPath = cms.untracked.string( cmsswbase+'/src/EventFilter/Utilities/plugins/microstatedeffast.jsd' ),
- fastName = cms.untracked.string( 'fastmoni' ),
- slowName = cms.untracked.string( 'slowmoni' ))
-
-process.EvFDaqDirector = cms.Service("EvFDaqDirector",
- buBaseDir = cms.untracked.string(options.buBaseDir),
- baseDir = cms.untracked.string(options.dataDir),
- directorIsBU = cms.untracked.bool(False ),
- testModeNoBuilderUnit = cms.untracked.bool(False),
- runNumber = cms.untracked.uint32(options.runNumber)
- )
-process.PrescaleService = cms.Service( "PrescaleService",
- lvl1DefaultLabel = cms.string( "B" ),
- lvl1Labels = cms.vstring( 'A',
- 'B'
- ),
- prescaleTable = cms.VPSet(
- cms.PSet( pathName = cms.string( "p1" ),
- prescales = cms.vuint32( 0, 10)
- ),
- cms.PSet( pathName = cms.string( "p2" ),
- prescales = cms.vuint32( 0, 100)
- )
- ))
-
-
-process.source = cms.Source("FedRawDataInputSource",
- getLSFromFilename = cms.untracked.bool(True),
- testModeNoBuilderUnit = cms.untracked.bool(False),
- eventChunkSize = cms.untracked.uint32(128),
- numBuffers = cms.untracked.uint32(2),
- eventChunkBlock = cms.untracked.uint32(128),
- useL1EventID=cms.untracked.bool(True)
- )
-
-
-process.filter1 = cms.EDFilter("HLTPrescaler",
- L1GtReadoutRecordTag = cms.InputTag( "hltGtDigis" )
- )
-process.filter2 = cms.EDFilter("HLTPrescaler",
- L1GtReadoutRecordTag = cms.InputTag( "hltGtDigis" )
- )
-
-process.a = cms.EDAnalyzer("ExceptionGenerator",
- defaultAction = cms.untracked.int32(0),
- defaultQualifier = cms.untracked.int32(120))
-
-process.b = cms.EDAnalyzer("ExceptionGenerator",
- defaultAction = cms.untracked.int32(0),
- defaultQualifier = cms.untracked.int32(0))
-
-
-process.filler = cms.EDAnalyzer("DummyBookFillDQMStoreMultiThread",
- folder = cms.untracked.string("TestFolder/"),
- elements = cms.untracked.VPSet(*elements),
- fillRuns = cms.untracked.bool(True),
- fillLumis = cms.untracked.bool(True),
- book_at_constructor = cms.untracked.bool(False),
- book_at_beginJob = cms.untracked.bool(False),
- book_at_beginRun = cms.untracked.bool(True))
-
-
-
-
-
-process.p1 = cms.Path(process.a*process.filter1)
-process.p2 = cms.Path(process.b*process.filter2)
-
-process.dqmsave_step = cms.Path(process.filler*process.dqmSaver)
-
-### global options Online ###
-process.add_(cms.Service("DQMStore"))
-process.DQMStore.LSbasedMode = cms.untracked.bool(True)
-process.DQMStore.verbose = cms.untracked.int32(5)
-process.DQMStore.enableMultiThread = cms.untracked.bool(True)
-
-process.dqmSaver.workflow = ''
-process.dqmSaver.convention = 'FilterUnit'
-process.dqmSaver.saveByLumiSection = True
-process.dqmSaver.fileFormat = cms.untracked.string('PB')
-process.dqmSaver.fakeFilterUnitMode = cms.untracked.bool(False)
-
+process.source = cms.Source( "FedRawDataInputSource",
+ numBuffers = cms.untracked.uint32( 1 ),
+ useL1EventID = cms.untracked.bool( True ),
+ eventChunkSize = cms.untracked.uint32( 128 ),
+ eventChunkBlock = cms.untracked.uint32( 128 ),
+ getLSFromFilename = cms.untracked.bool( True ),
+ verifyAdler32 = cms.untracked.bool( True )
+)
-process.GlobalTag = cms.ESSource( "PoolDBESSource",
+process.PoolDBESSource = cms.ESSource( "PoolDBESSource",
globaltag = cms.string( "GR_H_V39::All" ),
- toGet = cms.VPSet(
+ RefreshEachRun = cms.untracked.bool( False ),
+ RefreshOpenIOVs = cms.untracked.bool( False ),
+ toGet = cms.VPSet(
),
- DBParameters = cms.PSet(
+ DBParameters = cms.PSet(
authenticationPath = cms.untracked.string( "." ),
connectionRetrialTimeOut = cms.untracked.int32( 60 ),
idleConnectionCleanupPeriod = cms.untracked.int32( 10 ),
@@ -181,32 +43,306 @@
enableConnectionSharing = cms.untracked.bool( True ),
enableReadOnlySessionOnUpdateConnection = cms.untracked.bool( False ),
connectionTimeOut = cms.untracked.int32( 0 ),
+ authenticationSystem = cms.untracked.int32( 0 ),
connectionRetrialPeriod = cms.untracked.int32( 10 )
),
RefreshAlways = cms.untracked.bool( False ),
- ReconnectEachRun = cms.untracked.bool( False ),
- RefreshEachRun = cms.untracked.bool( False ),
- RefreshOpenIOVs = cms.untracked.bool( False ),
connect = cms.string( "frontier://(proxyurl=http://localhost:3128)(serverurl=http://localhost:8000/FrontierOnProd)(serverurl=http://localhost:8000/FrontierOnProd)(retrieve-ziplevel=0)/CMS_COND_31X_GLOBALTAG" ),
+ ReconnectEachRun = cms.untracked.bool( False ),
BlobStreamerName = cms.untracked.string( "TBufferBlobStreamingService" )
)
+process.FastTimerService = cms.Service( "FastTimerService",
+ dqmPath = cms.untracked.string( "HLT/TimerService" ),
+ dqmModuleTimeRange = cms.untracked.double( 40.0 ),
+ useRealTimeClock = cms.untracked.bool( True ),
+ enableTimingModules = cms.untracked.bool( True ),
+ enableDQM = cms.untracked.bool( True ),
+ enableDQMbyModule = cms.untracked.bool( False ),
+ enableTimingExclusive = cms.untracked.bool( False ),
+ skipFirstPath = cms.untracked.bool( False ),
+ enableDQMbyLumiSection = cms.untracked.bool( True ),
+ dqmPathTimeResolution = cms.untracked.double( 0.5 ),
+ dqmPathTimeRange = cms.untracked.double( 100.0 ),
+ dqmTimeRange = cms.untracked.double( 1000.0 ),
+ dqmLumiSectionsRange = cms.untracked.uint32( 2500 ),
+ enableDQMbyProcesses = cms.untracked.bool( True ),
+ enableDQMSummary = cms.untracked.bool( True ),
+ enableTimingSummary = cms.untracked.bool( False ),
+ enableDQMbyPathTotal = cms.untracked.bool( True ),
+ enableTimingPaths = cms.untracked.bool( True ),
+ enableDQMbyPathExclusive = cms.untracked.bool( True ),
+ dqmTimeResolution = cms.untracked.double( 5.0 ),
+ dqmModuleTimeResolution = cms.untracked.double( 0.2 ),
+ enableDQMbyPathActive = cms.untracked.bool( True ),
+ enableDQMbyPathDetails = cms.untracked.bool( True ),
+ enableDQMbyPathOverhead = cms.untracked.bool( True ),
+ enableDQMbyPathCounters = cms.untracked.bool( True ),
+ enableDQMbyModuleType = cms.untracked.bool( False )
+)
+process.DQMStore = cms.Service( "DQMStore",
+ verbose = cms.untracked.int32( 0 ),
+ collateHistograms = cms.untracked.bool( False ),
+ enableMultiThread = cms.untracked.bool( True ),
+ forceResetOnBeginLumi = cms.untracked.bool( False ),
+ LSbasedMode = cms.untracked.bool( True ),
+ verboseQT = cms.untracked.int32( 0 )
+)
+process.EvFDaqDirector = cms.Service( "EvFDaqDirector",
+ buBaseDir = cms.untracked.string( "." ),
+ runNumber = cms.untracked.uint32( 0 ),
+ outputAdler32Recheck = cms.untracked.bool( False ),
+ baseDir = cms.untracked.string( "." )
+)
+process.FastMonitoringService = cms.Service( "FastMonitoringService",
+ slowName = cms.untracked.string( "slowmoni" ),
+ sleepTime = cms.untracked.int32( 1 ),
+ fastMonIntervals = cms.untracked.uint32( 2 ),
+ fastName = cms.untracked.string( "fastmoni" )
+)
+process.PrescaleService = cms.Service( "PrescaleService",
+ forceDefault = cms.bool( False ),
+ prescaleTable = cms.VPSet(
+ cms.PSet( pathName = cms.string( "p3" ),
+ prescales = cms.vuint32( 50, 50, 50, 50, 50, 50, 50, 50, 50 )
+ ),
+ cms.PSet( pathName = cms.string( "p2" ),
+ prescales = cms.vuint32( 100, 100, 100, 100, 100, 100, 100, 100, 100 )
+ ),
+ cms.PSet( pathName = cms.string( "p1" ),
+ prescales = cms.vuint32( 10, 10, 10, 10, 10, 10, 10, 10, 10 )
+ )
+ ),
+ lvl1DefaultLabel = cms.string( "1e33" ),
+ lvl1Labels = cms.vstring( '2e33',
+ '1.4e33',
+ '1e33',
+ '7e32',
+ '5e32',
+ '3e32',
+ '2e32',
+ '1.4e32',
+ '1e32' )
+)
+process.MessageLogger = cms.Service( "MessageLogger",
+ suppressInfo = cms.untracked.vstring( 'hltGtDigis' ),
+ debugs = cms.untracked.PSet(
+ threshold = cms.untracked.string( "INFO" ),
+ placeholder = cms.untracked.bool( True ),
+ ),
+ cout = cms.untracked.PSet(
+ threshold = cms.untracked.string( "ERROR" ),
+ ),
+ cerr_stats = cms.untracked.PSet(
+ threshold = cms.untracked.string( "WARNING" ),
+ output = cms.untracked.string( "cerr" ),
+ optionalPSet = cms.untracked.bool( True )
+ ),
+ warnings = cms.untracked.PSet(
+ threshold = cms.untracked.string( "INFO" ),
+ placeholder = cms.untracked.bool( True ),
+ ),
+ statistics = cms.untracked.vstring( 'cerr' ),
+ cerr = cms.untracked.PSet(
+ INFO = cms.untracked.PSet( limit = cms.untracked.int32( 0 ) ),
+ noTimeStamps = cms.untracked.bool( False ),
+ FwkReport = cms.untracked.PSet(
+ reportEvery = cms.untracked.int32( 1 ),
+ limit = cms.untracked.int32( 0 )
+ ),
+ default = cms.untracked.PSet( limit = cms.untracked.int32( 10000000 ) ),
+ Root_NoDictionary = cms.untracked.PSet( limit = cms.untracked.int32( 0 ) ),
+ FwkJob = cms.untracked.PSet( limit = cms.untracked.int32( 0 ) ),
+ FwkSummary = cms.untracked.PSet(
+ reportEvery = cms.untracked.int32( 1 ),
+ limit = cms.untracked.int32( 10000000 )
+ ),
+ threshold = cms.untracked.string( "INFO" ),
+ ),
+ FrameworkJobReport = cms.untracked.PSet(
+ default = cms.untracked.PSet( limit = cms.untracked.int32( 0 ) ),
+ FwkJob = cms.untracked.PSet( limit = cms.untracked.int32( 10000000 ) )
+ ),
+ suppressWarning = cms.untracked.vstring( 'hltGtDigis' ),
+ errors = cms.untracked.PSet(
+ threshold = cms.untracked.string( "INFO" ),
+ placeholder = cms.untracked.bool( True ),
+ ),
+ fwkJobReports = cms.untracked.vstring( 'FrameworkJobReport' ),
+ infos = cms.untracked.PSet(
+ threshold = cms.untracked.string( "INFO" ),
+ Root_NoDictionary = cms.untracked.PSet( limit = cms.untracked.int32( 0 ) ),
+ placeholder = cms.untracked.bool( True ),
+ ),
+ categories = cms.untracked.vstring( 'FwkJob',
+ 'FwkReport',
+ 'FwkSummary',
+ 'Root_NoDictionary' ),
+ destinations = cms.untracked.vstring( 'warnings',
+ 'errors',
+ 'infos',
+ 'debugs',
+ 'cout',
+ 'cerr' ),
+ threshold = cms.untracked.string( "INFO" ),
+ suppressError = cms.untracked.vstring( 'hltGtDigis' )
+)
-process.hltTriggerJSONMonitoring = cms.EDAnalyzer('TriggerJSONMonitoring',
- triggerResults = cms.InputTag( 'TriggerResults','','HLT')
+process.ExceptionGenerator2 = cms.EDAnalyzer( "ExceptionGenerator",
+ defaultAction = cms.untracked.int32( 0 ),
+ defaultQualifier = cms.untracked.int32( 0 )
+)
+process.HLTPrescaler = cms.EDFilter( "HLTPrescaler",
+ L1GtReadoutRecordTag = cms.InputTag( "hltGtDigis" ),
+ offset = cms.uint32( 0 )
+)
+process.HLTPrescaler2 = cms.EDFilter( "HLTPrescaler",
+ L1GtReadoutRecordTag = cms.InputTag( "hltGtDigis" ),
+ offset = cms.uint32( 0 )
)
+process.hltL1GtObjectMap = cms.EDProducer( "L1GlobalTrigger",
+ TechnicalTriggersUnprescaled = cms.bool( True ),
+ ProduceL1GtObjectMapRecord = cms.bool( True ),
+ AlgorithmTriggersUnmasked = cms.bool( False ),
+ EmulateBxInEvent = cms.int32( 1 ),
+ AlgorithmTriggersUnprescaled = cms.bool( True ),
+ ProduceL1GtDaqRecord = cms.bool( False ),
+ ReadTechnicalTriggerRecords = cms.bool( True ),
+ RecordLength = cms.vint32( 3, 0 ),
+ TechnicalTriggersUnmasked = cms.bool( False ),
+ ProduceL1GtEvmRecord = cms.bool( False ),
+ GmtInputTag = cms.InputTag( "hltGtDigis" ),
+ TechnicalTriggersVetoUnmasked = cms.bool( True ),
+ AlternativeNrBxBoardEvm = cms.uint32( 0 ),
+ TechnicalTriggersInputTags = cms.VInputTag( 'simBscDigis' ),
+ CastorInputTag = cms.InputTag( "castorL1Digis" ),
+ GctInputTag = cms.InputTag( "hltGctDigis" ),
+ AlternativeNrBxBoardDaq = cms.uint32( 0 ),
+ WritePsbL1GtDaqRecord = cms.bool( False ),
+ BstLengthBytes = cms.int32( -1 )
+)
+process.TriggerJSONMonitoring = cms.EDAnalyzer( "TriggerJSONMonitoring",
+ triggerResults = cms.InputTag( 'TriggerResults','','HLT' )
+)
+process.DQMFileSaver = cms.EDAnalyzer( "DQMFileSaver",
+ runIsComplete = cms.untracked.bool( False ),
+ referenceHandling = cms.untracked.string( "all" ),
+ producer = cms.untracked.string( "DQM" ),
+ forceRunNumber = cms.untracked.int32( -1 ),
+ saveByRun = cms.untracked.int32( 1 ),
+ saveAtJobEnd = cms.untracked.bool( False ),
+ saveByLumiSection = cms.untracked.int32( 1 ),
+ version = cms.untracked.int32( 1 ),
+ referenceRequireStatus = cms.untracked.int32( 100 ),
+ convention = cms.untracked.string( "FilterUnit" ),
+ dirName = cms.untracked.string( "." ),
+ fileFormat = cms.untracked.string( "PB" )
+)
+process.ExceptionGenerator = cms.EDAnalyzer( "ExceptionGenerator",
+ defaultAction = cms.untracked.int32( 0 ),
+ defaultQualifier = cms.untracked.int32( 64 )
+)
+process.ExceptionGenerator3 = cms.EDAnalyzer( "ExceptionGenerator",
+ defaultAction = cms.untracked.int32( 0 ),
+ defaultQualifier = cms.untracked.int32( 0 )
+)
+process.HLTPrescaler3 = cms.EDFilter( "HLTPrescaler",
+ L1GtReadoutRecordTag = cms.InputTag( "hltGtDigis" ),
+ offset = cms.uint32( 0 )
+)
+
+process.hltOutputA = cms.OutputModule( "ShmStreamConsumer",
+ SelectEvents = cms.untracked.PSet( SelectEvents = cms.vstring( 'p1' ) ),
+ outputCommands = cms.untracked.vstring( 'drop *',
+ 'keep FEDRawDataCollection_rawDataCollector_*_*',
+ 'keep FEDRawDataCollection_source_*_*' )
+)
+process.hltOutputB = cms.OutputModule( "ShmStreamConsumer",
+ SelectEvents = cms.untracked.PSet( SelectEvents = cms.vstring( 'p3' ) ),
+ outputCommands = cms.untracked.vstring( 'drop *',
+ 'keep FEDRawDataCollection_rawDataCollector_*_*',
+ 'keep FEDRawDataCollection_source_*_*' )
+)
+process.hltOutputDQM = cms.OutputModule( "ShmStreamConsumer",
+ SelectEvents = cms.untracked.PSet( SelectEvents = cms.vstring( 'p2' ) ),
+ outputCommands = cms.untracked.vstring( 'drop *',
+ 'keep FEDRawDataCollection_rawDataCollector_*_*',
+ 'keep FEDRawDataCollection_source_*_*' )
+)
+
+process.p3 = cms.Path( process.ExceptionGenerator3 + process.HLTPrescaler3 )
+process.ep3 = cms.EndPath( process.hltOutputB )
+process.pDQMhisto = cms.Path( process.DQMFileSaver )
+process.json = cms.EndPath( process.TriggerJSONMonitoring )
+process.L1Gt = cms.Path( process.hltL1GtObjectMap )
+process.ep2 = cms.EndPath( process.hltOutputDQM )
+process.ep1 = cms.EndPath( process.hltOutputA )
+process.p2 = cms.Path( process.ExceptionGenerator2 + process.HLTPrescaler )
+process.p1 = cms.Path( process.ExceptionGenerator + process.HLTPrescaler2 )
+
+process.transferSystem = cms.PSet(
+ destinations = cms.vstring("Tier0","DQM","ECAL","None"),
+ transferModes = cms.vstring("tier0_on","tier0_off","test"),
+ streamA = cms.PSet(tier0_on=cms.vstring( "Tier0" ),tier0_off=cms.vstring( "None" ),test=cms.vstring( "None" )),
+ streamB = cms.PSet(tier0_on=cms.vstring( "None" ),tier0_off=cms.vstring( "None" ),test=cms.vstring( "None" )),
+ streamDQM = cms.PSet(tier0_on=cms.vstring( "DQM","Tier0" ),tier0_off=cms.vstring( "DQM" ),test=cms.vstring( "None" )),
+ streamL1Rates = cms.PSet(tier0_on=cms.vstring( "Tier0" ),tier0_off=cms.vstring( "None" ),test=cms.vstring( "None" )),
+ streamHLTRates = cms.PSet(tier0_on=cms.vstring( "Tier0" ),tier0_off=cms.vstring( "None" ),test=cms.vstring( "None" )),
+ streamDQMHistograms = cms.PSet(tier0_on=cms.vstring( "DQM" ),tier0_off=cms.vstring( "DQM" ),test=cms.vstring( "None" ))
+)
+
+import FWCore.ParameterSet.VarParsing as VarParsing
+
+import os
+
+cmsswbase = os.path.expandvars('$CMSSW_BASE/')
+
+options = VarParsing.VarParsing ('analysis')
+
+options.register ('runNumber',
+ 1, # default value
+ VarParsing.VarParsing.multiplicity.singleton,
+ VarParsing.VarParsing.varType.int, # string, int, or float
+ "Run Number")
+
+options.register ('buBaseDir',
+ '/fff/BU0', # default value
+ VarParsing.VarParsing.multiplicity.singleton,
+ VarParsing.VarParsing.varType.string, # string, int, or float
+ "BU base directory")
+
+options.register ('dataDir',
+ '/fff/data', # default value
+ VarParsing.VarParsing.multiplicity.singleton,
+ VarParsing.VarParsing.varType.string, # string, int, or float
+ "FU data directory")
+
+options.register ('numThreads',
+ 1, # default value
+ VarParsing.VarParsing.multiplicity.singleton,
+ VarParsing.VarParsing.varType.int, # string, int, or float
+ "Number of CMSSW threads")
+options.register ('numFwkStreams',
+ 1, # default value
+ VarParsing.VarParsing.multiplicity.singleton,
+ VarParsing.VarParsing.varType.int, # string, int, or float
+ "Number of CMSSW streams")
-process.streamA = cms.OutputModule("EvFOutputModule",
- SelectEvents = cms.untracked.PSet(SelectEvents = cms.vstring( 'p1' ))
- )
+options.parseArguments()
-process.streamDQM = cms.OutputModule("EvFOutputModule",
- SelectEvents = cms.untracked.PSet(SelectEvents = cms.vstring( 'p2' ))
- )
+process.options = cms.untracked.PSet(
+ numberOfThreads = cms.untracked.uint32(options.numThreads),
+ numberOfStreams = cms.untracked.uint32(options.numFwkStreams),
+ multiProcesses = cms.untracked.PSet(
+ maxChildProcesses = cms.untracked.int32(0)
+ )
+)
-process.ep = cms.EndPath(process.streamA+process.streamDQM+process.hltTriggerJSONMonitoring)
+process.PoolDBESSource.connect = 'frontier://FrontierProd/CMS_COND_31X_GLOBALTAG'
+process.PoolDBESSource.pfnPrefix = cms.untracked.string('frontier://FrontierProd/')
-process.GlobalTag.connect = 'frontier://FrontierProd/CMS_COND_31X_GLOBALTAG'
-process.GlobalTag.pfnPrefix = cms.untracked.string('frontier://FrontierProd/')
+process.EvFDaqDirector.buBaseDir = options.buBaseDir
+process.EvFDaqDirector.baseDir = options.dataDir
+process.EvFDaqDirector.runNumber = options.runNumber
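
The regenerated configuration still parses runNumber, buBaseDir, dataDir, numThreads and numFwkStreams through VarParsing, so they are passed to cmsRun as key=value arguments. A sketch (not part of the diff) of launching it that way, with placeholder values:

# Launch the FU test configuration with VarParsing arguments.
import subprocess

cmd = ['cmsRun', 'testFUHistograms_cfg2.py',
       'runNumber=230500',        # placeholder run number
       'buBaseDir=/fff/BU0',
       'dataDir=/fff/data',
       'numThreads=4',
       'numFwkStreams=4']
subprocess.check_call(cmd)
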
diff --git a/rpm/fffmeta-1.5.3-6.noarch.rpm b/rpm/fffmeta-1.5.3-6.noarch.rpm
deleted file mode 100644
index 158f1fe..0000000
Binary files a/rpm/fffmeta-1.5.3-6.noarch.rpm and /dev/null differ
diff --git a/rpm/fffmeta-1.6.0-0.noarch.rpm b/rpm/fffmeta-1.6.0-0.noarch.rpm
new file mode 100644
index 0000000..4723008
Binary files /dev/null and b/rpm/fffmeta-1.6.0-0.noarch.rpm differ
diff --git a/rpm/fffmeta-vm-1.5.3-6.noarch.rpm b/rpm/fffmeta-vm-1.5.3-6.noarch.rpm
deleted file mode 100644
index 62405c9..0000000
Binary files a/rpm/fffmeta-vm-1.5.3-6.noarch.rpm and /dev/null differ
diff --git a/rpm/fffmeta-vm-1.6.0-0.noarch.rpm b/rpm/fffmeta-vm-1.6.0-0.noarch.rpm
new file mode 100644
index 0000000..f809c9b
Binary files /dev/null and b/rpm/fffmeta-vm-1.6.0-0.noarch.rpm differ
diff --git a/rpm/hltd-1.5.3-6.x86_64.rpm b/rpm/hltd-1.5.3-6.x86_64.rpm
deleted file mode 100644
index 11c46c5..0000000
Binary files a/rpm/hltd-1.5.3-6.x86_64.rpm and /dev/null differ
diff --git a/rpm/hltd-1.6.0-0.x86_64.rpm b/rpm/hltd-1.6.0-0.x86_64.rpm
new file mode 100644
index 0000000..54dc3e5
Binary files /dev/null and b/rpm/hltd-1.6.0-0.x86_64.rpm differ
diff --git a/scripts/hltdrpm.sh b/scripts/hltdrpm.sh
index eb09fa0..4da465c 100755
--- a/scripts/hltdrpm.sh
+++ b/scripts/hltdrpm.sh
@@ -36,15 +36,17 @@ mkdir -p etc/init.d
mkdir -p etc/logrotate.d
mkdir -p etc/appliance/resources/idle
mkdir -p etc/appliance/resources/online
-mkdir -p etc/appliance/resources/offline
mkdir -p etc/appliance/resources/except
mkdir -p etc/appliance/resources/quarantined
+mkdir -p etc/appliance/resources/cloud
mkdir -p usr/lib64/python2.6/site-packages
mkdir -p usr/lib64/python2.6/site-packages/pyelasticsearch
ls
cp -r $BASEDIR/python/hltd $TOPDIR/etc/init.d/hltd
-cp -r $BASEDIR/python/soap2file.py $TOPDIR/etc/init.d/soap2file
+cp -r $BASEDIR/python/soap2file $TOPDIR/etc/init.d/soap2file
cp -r $BASEDIR/* $TOPDIR/opt/hltd
+rm -rf $TOPDIR/opt/hltd/python/hltd
+rm -rf $TOPDIR/opt/hltd/python/soap2file
cp -r $BASEDIR/etc/hltd.conf $TOPDIR/etc/
cp -r $BASEDIR/etc/logrotate.d/hltd $TOPDIR/etc/logrotate.d/
echo "working in $PWD"
@@ -53,9 +55,9 @@ ls opt/hltd
echo "Creating DQM directories"
mkdir -p etc/appliance/dqm_resources/idle
mkdir -p etc/appliance/dqm_resources/online
-mkdir -p etc/appliance/dqm_resources/offline
mkdir -p etc/appliance/dqm_resources/except
mkdir -p etc/appliance/dqm_resources/quarantined
+mkdir -p etc/appliance/dqm_resources/cloud
cd $TOPDIR
#pyelasticsearch
@@ -152,12 +154,18 @@ Classifier: Topic :: System :: Filesystems
Classifier: Topic :: System :: Monitoring
EOF
+
+cd $TOPDIR
+cd opt/hltd/lib/python-procname/
+./setup.py -q build
+cp build/lib.linux-x86_64-2.6/procname.so $TOPDIR/usr/lib64/python2.6/site-packages
+
cd $TOPDIR
# we are done here, write the specs and make the fu***** rpm
cat > hltd.spec <<EOF
-#/sbin/service hltd restart #restart delegated to fffmeta!
%files
%dir %attr(777, -, -) /var/log/hltd
%dir %attr(777, -, -) /var/log/hltd/pid
@@ -212,10 +216,11 @@ rm -rf /etc/appliance/except/*
/usr/lib64/python2.6/site-packages/*_inotify.so*
/usr/lib64/python2.6/site-packages/*python_inotify*
/usr/lib64/python2.6/site-packages/pyelasticsearch
+/usr/lib64/python2.6/site-packages/procname.so
%preun
if [ \$1 == 0 ]; then
- /sbin/service hltd stop
- /sbin/service hltd stop
+ /sbin/service hltd stop || true
+ /sbin/service soap2file stop || true
fi
EOF
mkdir -p RPMBUILD/{RPMS/{noarch},SPECS,BUILD,SOURCES,SRPMS}
diff --git a/scripts/makeloopfs.sh b/scripts/makeloopfs.sh
new file mode 100755
index 0000000..f745ad0
--- /dev/null
+++ b/scripts/makeloopfs.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+if [ -n "$1" ]; then
+ if [ -n "$2" ]; then
+ if [ -n "$3" ]; then
+
+ if [ -d $1 ]; then
+
+ basedir=`readlink -e $1`
+ image=$basedir/$2.img
+ mountpoint=$basedir/$2
+ sizemb=$3
+ ret=0
+ umask 0
+
+ #protect from going wrong
+ if [ "$mountpoint" == "/" ]; then exit 99; fi
+ if [ "$mountpoint" == "//" ]; then exit 99; fi
+ if [ "$mountpoint" == "/fff" ]; then exit 99; fi
+ if [ "$mountpoint" == "/fff/" ]; then exit 99; fi
+ if [ "$mountpoint" == "/fff/ramdisk" ]; then exit 99; fi
+ if [ "$mountpoint" == "/fff/ramdisk/" ]; then exit 99; fi
+ if [ "$mountpoint" == "fff/ramdisk" ]; then exit 99; fi
+ if [ "$mountpoint" == "fff/ramdisk/" ]; then exit 99; fi
+
+ echo "makeloop script invoked for creating loop device disk $2 in ${basedir} of size $3 MB"
+
+ if [ -d $mountpoint ]; then
+
+ point=`mount | grep $mountpoint | grep /dev/loop | awk '{print $3}'`
+
+ if [ "$point" != "" ]; then
+ #kill any processes that might use the mount point and remove from NFS
+ fuser -km $point
+ exportfs -u *:$point
+ #unmount loop device
+ umount $point
+ if [ $? != 0 ]; then
+ sleep 0.1
+ fuser -km $point
+ exportfs -u *:$point
+ umount $point
+ if [ $? != 0 ]; then
+ echo "Unsuccessful umount of $point !"
+ exit 1
+ fi
+ fi
+ exportfs -u *:$point
+ fi
+ fi
+ #deleting mount point
+ rm -rf $mountpoint
+ if [ $? != 0 ]; then
+ echo "Unsuccessful delete of unmounted mount point $mountpoint !"
+ exit 2
+ fi
+
+ if [ -f $image ]; then
+ chmod 755 $image
+ rm -rf $image
+ if [ $? != 0 ]; then
+ echo "Unsuccessful delete old image file $image"
+ exit 3
+ fi
+ fi
+
+ dd if=/dev/zero of=$image bs=1048576 count=$sizemb >& /dev/null
+ echo y | mkfs.ext3 $image > /dev/null
+ #try mount
+ mkdir $mountpoint
+ if [ $? != 0 ]; then
+ echo "Unsuccessful make mount point directory!"
+ exit 4
+ fi
+
+ echo "mounting image directory..."
+ mount -o loop,noatime $image $mountpoint
+ if [ $? != 0 ]; then
+ echo "Unsuccessful mount with parameters $image $mountpoint"
+ exit 5
+ fi
+
+ chmod -R 777 $mountpoint
+
+ exportfs -o rw,sync,no_root_squash,no_subtree_check *:$mountpoint
+ if [ $? != 0 ]; then
+ echo "exportfs command failed for $mountpoint !"
+ exit 6
+ fi
+ exit 0
+ #end
+ else
+ echo "base directory not found!"
+ fi
+ else
+ echo "No parameter 3 given!"
+ fi
+ else
+ echo "No parameter 2 given!"
+ fi
+else
+ echo "No parameter 1 given!"
+fi
+
+echo "Usage: makeloopfs.sh basedir subdir imgsize(MB)"
+exit 1
+
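
makeloopfs.sh builds an ext3 image under the ramdisk, loop-mounts it at <basedir>/<instance> and exports it over NFS; unmountloopfs.sh (below) tears all such mounts down. A sketch (not part of the diff) of the per-boot sequence as driven from Python, loosely mirroring the subprocess calls added to setupmachine.py above; instance names and sizes are illustrative:

# Rebuild loopback disks for secondary BU instances on a volatile ramdisk.
import subprocess

RAMDISK = '/fff/ramdisk'

def rebuild_instance_disks(instances, sizes_mb):
    # drop any leftover loop mounts and their image files first
    subprocess.check_call(['/opt/hltd/scripts/unmountloopfs.sh', RAMDISK])
    for instance, size in zip(instances, sizes_mb):
        if instance == 'main':
            continue  # the main instance uses the ramdisk directly
        subprocess.check_call(['/opt/hltd/scripts/makeloopfs.sh',
                               RAMDISK, instance, str(size)])

if __name__ == '__main__':
    rebuild_instance_disks(['main', 'bu2'], [0, 16000])
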
diff --git a/scripts/metarpm.sh b/scripts/metarpm.sh
index b303dc3..56c66d4 100755
--- a/scripts/metarpm.sh
+++ b/scripts/metarpm.sh
@@ -4,16 +4,11 @@ SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cd $SCRIPTDIR/..
BASEDIR=$PWD
-PACKAGENAME="fffmeta"
-
PARAMCACHE="paramcache"
if [ -n "$1" ]; then
- PARAMCACHE=$1
-fi
-
-if [ -n "$2" ]; then
- PACKAGENAME=$2
+ #PARAMCACHE=$1
+ PARAMCACHE=${1##*/}
fi
echo "Using cache file $PARAMCACHE"
@@ -32,10 +27,6 @@ else
done
fi
-
-
-
-
echo "Environment (prod,vm) (press enter for \"${lines[0]}\"):"
readin=""
read readin
@@ -86,7 +77,7 @@ if [ ${#readin} != "0" ]; then
lines[6]=$readin
fi
-echo "Equipment set (press enter for: \"${lines[7]}\") - type 'latest' to use latest eq set or 'default' for default one or 'test' for VM enviroment:"
+echo "Equipment set (press enter for: \"${lines[7]}\") - type 'latest' or enter a specific one:"
readin=""
read readin
if [ ${#readin} != "0" ]; then
@@ -107,7 +98,6 @@ if [ ${#readin} != "0" ]; then
lines[9]=$readin
fi
-
echo "number of framework streams per process (press enter for: ${lines[10]}):"
readin=""
read readin
@@ -115,8 +105,6 @@ if [ ${#readin} != "0" ]; then
lines[10]=$readin
fi
-
-
echo "CMSSW log collection level (DEBUG,INFO,WARNING,ERROR or FATAL) (press enter for: ${lines[11]}):"
readin=""
read readin
@@ -145,12 +133,21 @@ done
chmod 500 $SCRIPTDIR/$PARAMCACHE
# create a build area
+if [ ${lines[0]} == "prod" ]; then
+ PACKAGENAME="fffmeta"
+elif [ ${lines[0]} == "vm" ]; then
+ PACKAGENAME="fffmeta-vm"
+else
+ echo "Environment ${lines[0]} not supported. Available: prod or vm"
+ exit 1
+fi
+
echo "removing old build area"
-rm -rf /tmp/fffmeta-build-tmp
+rm -rf /tmp/$PACKAGENAME-build-tmp
echo "creating new build area"
-mkdir /tmp/fffmeta-build-tmp
+mkdir /tmp/$PACKAGENAME-build-tmp
ls
-cd /tmp/fffmeta-build-tmp
+cd /tmp/$PACKAGENAME-build-tmp
mkdir BUILD
mkdir RPMS
TOPDIR=$PWD
@@ -160,13 +157,19 @@ ls
pluginpath="/opt/fff/esplugins/"
pluginname1="bigdesk"
pluginfile1="lukas-vlcek-bigdesk-v2.4.0-2-g9807b92-mod.zip"
+pluginname2="head"
+pluginfile2="head-master.zip"
+pluginname3="HQ"
+pluginfile3="hq-master.zip"
+pluginname4="paramedic"
+pluginfile4="paramedic-master.zip"
cd $TOPDIR
# we are done here, write the specs and make the fu***** rpm
cat > fffmeta.spec <<EOF
-Requires:elasticsearch >= 1.2.0, hltd >= 1.5.3, cx_Oracle >= 5.1.2, java-1.7.0-openjdk
+Requires:elasticsearch >= 1.4.2, hltd >= 1.6.0, cx_Oracle >= 5.1.2, java-1.7.0-openjdk
Provides:/opt/fff/configurefff.sh
Provides:/opt/fff/setupmachine.py
+Provides:/opt/fff/instances.input
Provides:/etc/init.d/fffmeta
#Provides:/opt/fff/backup/elasticsearch.yml
@@ -203,10 +207,15 @@ mkdir -p opt/fff/esplugins
mkdir -p opt/fff/backup
mkdir -p etc/init.d/
cp $BASEDIR/python/setupmachine.py %{buildroot}/opt/fff/setupmachine.py
+cp $BASEDIR/etc/instances.input %{buildroot}/opt/fff/instances.input
echo "#!/bin/bash" > %{buildroot}/opt/fff/configurefff.sh
+echo python2.6 /opt/hltd/python/fillresources.py >> %{buildroot}/opt/fff/configurefff.sh
echo python2.6 /opt/fff/setupmachine.py elasticsearch,hltd $params >> %{buildroot}/opt/fff/configurefff.sh
cp $BASEDIR/esplugins/$pluginfile1 %{buildroot}/opt/fff/esplugins/$pluginfile1
+cp $BASEDIR/esplugins/$pluginfile2 %{buildroot}/opt/fff/esplugins/$pluginfile2
+cp $BASEDIR/esplugins/$pluginfile3 %{buildroot}/opt/fff/esplugins/$pluginfile3
+cp $BASEDIR/esplugins/$pluginfile4 %{buildroot}/opt/fff/esplugins/$pluginfile4
cp $BASEDIR/esplugins/install.sh %{buildroot}/opt/fff/esplugins/install.sh
cp $BASEDIR/esplugins/uninstall.sh %{buildroot}/opt/fff/esplugins/uninstall.sh
@@ -234,9 +243,13 @@ echo "fi" >> %{buildroot}/etc/init.d/fffmeta
%attr( 755 ,root, root) /opt/fff/setupmachine.py
%attr( 755 ,root, root) /opt/fff/setupmachine.pyc
%attr( 755 ,root, root) /opt/fff/setupmachine.pyo
+%attr( 755 ,root, root) /opt/fff/instances.input
%attr( 700 ,root, root) /opt/fff/configurefff.sh
%attr( 755 ,root, root) /etc/init.d/fffmeta
%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile1
+%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile2
+%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile3
+%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile4
%attr( 755 ,root, root) /opt/fff/esplugins/install.sh
%attr( 755 ,root, root) /opt/fff/esplugins/uninstall.sh
@@ -254,10 +267,20 @@ python2.6 /opt/fff/setupmachine.py elasticsearch $params
#update permissions in case new rpm changed uid/guid
chown -R elasticsearch:elasticsearch /var/log/elasticsearch
chown -R elasticsearch:elasticsearch /var/lib/elasticsearch
-echo /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname1
-/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname1
-echo /opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile1 $pluginname1
+
+#plugins
+/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname1 > /dev/null
/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile1 $pluginname1
+
+/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname2 > /dev/null
+/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile2 $pluginname2
+
+/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname3 > /dev/null
+/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile3 $pluginname3
+
+/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname4 > /dev/null
+/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile4 $pluginname4
+
/sbin/service elasticsearch start
chkconfig --del elasticsearch
chkconfig --add elasticsearch
@@ -271,7 +294,11 @@ chkconfig --add elasticsearch
%triggerin -- hltd
#echo "triggered on hltd update or install"
+
/sbin/service hltd stop || true
+/sbin/service soap2file stop || true
+rm -rf /etc/hltd.instances
+
python2.6 /opt/fff/setupmachine.py restore,hltd
python2.6 /opt/fff/setupmachine.py hltd $params
@@ -288,11 +315,14 @@ fi
#set up resources for hltd
/opt/hltd/python/fillresources.py
-/sbin/service hltd restart
+/sbin/service hltd restart || true
+/sbin/service soap2file restart || true
+
chkconfig --del hltd
-#chkconfig --del soap2file
+chkconfig --del soap2file
+
chkconfig --add hltd
-#chkconfig --add soap2file
+chkconfig --add soap2file
%preun
if [ \$1 == 0 ]; then
@@ -300,12 +330,16 @@ if [ \$1 == 0 ]; then
chkconfig --del fffmeta
chkconfig --del elasticsearch
chkconfig --del hltd
-# chkconfig --del soap2file
+ chkconfig --del soap2file
+
+ /sbin/service hltd stop || true
/sbin/service elasticsearch stop || true
/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname1 || true
+ /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname2 || true
+ /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname3 || true
+ /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname4 || true
- /sbin/service hltd stop || true
python2.6 /opt/fff/setupmachine.py restore,hltd,elasticsearch
fi
diff --git a/scripts/paramcache-vm b/scripts/paramcache-vm
index e70c022..170fc26 100755
--- a/scripts/paramcache-vm
+++ b/scripts/paramcache-vm
@@ -1,12 +1,12 @@
vm
http://cu-01.cern.ch:9200
-/opt/cmssw
+/opt/offline
rcms-flightsim
fffsetup
rcms
ominozzo2
test
-bufu
+daqlocal
1
1
INFO
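
metarpm.sh and tribe-metarpm.sh read this paramcache file into lines[0]..lines[11]. The mapping below is inferred from the interactive prompts and from the VM example above, so treat it as an assumption rather than a documented format:

# Assumed paramcache line mapping (not confirmed by the diff).
PARAMCACHE_FIELDS = [
    'environment (prod|vm)',            # lines[0]  e.g. vm
    'elasticsearch runindex URL',       # lines[1]  e.g. http://cu-01.cern.ch:9200
    'CMSSW/offline base directory',     # lines[2]  e.g. /opt/offline
    'HWCFG DB server',                  # lines[3]
    'HWCFG DB SID / db name',           # lines[4]
    'HWCFG DB username',                # lines[5]
    'HWCFG DB password',                # lines[6]
    'equipment set',                    # lines[7]  e.g. test or latest
    'hltd user',                        # lines[8]  e.g. daqlocal
    'CMSSW threads per process',        # lines[9]
    'framework streams per process',    # lines[10]
    'CMSSW log collection level',       # lines[11] e.g. INFO
]
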
diff --git a/scripts/tribe-metarpm.sh b/scripts/tribe-metarpm.sh
new file mode 100755
index 0000000..2521664
--- /dev/null
+++ b/scripts/tribe-metarpm.sh
@@ -0,0 +1,248 @@
+#!/bin/bash -e
+BUILD_ARCH=noarch
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+cd $SCRIPTDIR/..
+BASEDIR=$PWD
+
+PACKAGENAME="fffmeta-tribe"
+
+PARAMCACHE="paramcache"
+
+echo "Using cache file $PARAMCACHE"
+
+if [ -f $SCRIPTDIR/$PARAMCACHE ];
+then
+ readarray lines < $SCRIPTDIR/$PARAMCACHE
+ for (( i=0; i < 12; i++ ))
+ do
+ lines[$i]=`echo -n ${lines[$i]} | tr -d "\n"`
+ done
+else
+ for (( i=0; i < 12; i++ ))
+ do
+ lines[$i]=""
+ done
+fi
+
+echo "Environment (prod,vm) (press enter for \"${lines[0]}\"):"
+readin=""
+read readin
+if [ ${#readin} != "0" ]; then
+lines[0]=$readin
+fi
+nousevar=$readin
+nousevar=$readin
+lines[1]="null"
+lines[2]="null"
+
+echo "HWCFG DB server (press enter for \"${lines[3]}\"):"
+readin=""
+read readin
+if [ ${#readin} != "0" ]; then
+lines[3]=$readin
+fi
+
+echo "HWCFG DB SID (or db name in VM enviroment) (press enter for: \"${lines[4]}\"):"
+echo "HWCFG DB SID (or db name in VM environment) (press enter for: \"${lines[4]}\"):"
+readin=""
+read readin
+if [ ${#readin} != "0" ]; then
+lines[4]=$readin
+fi
+
+echo "HWCFG DB username (press enter for: \"${lines[5]}\"):"
+readin=""
+read readin
+if [ ${#readin} != "0" ]; then
+lines[5]=$readin
+fi
+
+echo "HWCFG DB password (press enter for: \"${lines[6]}\"):"
+readin=""
+read readin
+if [ ${#readin} != "0" ]; then
+lines[6]=$readin
+fi
+
+echo "Equipment set (press enter for: \"${lines[7]}\") - type 'latest' or enter a specific one:"
+readin=""
+read readin
+if [ ${#readin} != "0" ]; then
+lines[7]=$readin
+fi
+
+lines[8]="null"
+lines[9]="null"
+lines[10]="null"
+lines[11]="null"
+
+params=""
+for (( i=0; i < 12; i++ ))
+do
+ params="$params ${lines[i]}"
+done
+
+# create a build area
+
+echo "removing old build area"
+rm -rf /tmp/fffmeta-tribe-build-tmp
+echo "creating new build area"
+mkdir /tmp/fffmeta-tribe-build-tmp
+ls
+cd /tmp/fffmeta-tribe-build-tmp
+mkdir BUILD
+mkdir RPMS
+TOPDIR=$PWD
+echo "working in $PWD"
+ls
+
+pluginpath="/opt/fff/esplugins/"
+pluginname1="bigdesk"
+pluginfile1="lukas-vlcek-bigdesk-v2.4.0-2-g9807b92-mod.zip"
+pluginname2="head"
+pluginfile2="head-master.zip"
+pluginname3="HQ"
+pluginfile3="hq-master.zip"
+pluginname4="paramedic"
+pluginfile4="paramedic-master.zip"
+
+cd $TOPDIR
+# we are done here, write the specs and make the fu***** rpm
+cat > fffmeta-tribe.spec <<EOF
+Requires:elasticsearch >= 1.4.2, cx_Oracle >= 5.1.2, java-1.7.0-openjdk, httpd >= 2.2.15, php >= 5.3.3, php-oci8 >= 1.4.9
+
+Provides:/opt/fff/configurefff.sh
+Provides:/opt/fff/setupmachine.py
+Provides:/etc/init.d/fffmeta
+
+%description
+fffmeta configuration setup package
+
+%prep
+%build
+
+%install
+rm -rf \$RPM_BUILD_ROOT
+mkdir -p \$RPM_BUILD_ROOT
+%__install -d "%{buildroot}/opt/fff"
+%__install -d "%{buildroot}/opt/fff/backup"
+%__install -d "%{buildroot}/opt/fff/esplugins"
+%__install -d "%{buildroot}/etc/init.d"
+
+mkdir -p opt/fff/esplugins
+mkdir -p opt/fff/backup
+mkdir -p etc/init.d/
+cp $BASEDIR/python/setupmachine.py %{buildroot}/opt/fff/setupmachine.py
+echo "#!/bin/bash" > %{buildroot}/opt/fff/configurefff.sh
+echo python2.6 /opt/fff/setupmachine.py elasticsearch,web $params >> %{buildroot}/opt/fff/configurefff.sh
+
+cp $BASEDIR/esplugins/$pluginfile1 %{buildroot}/opt/fff/esplugins/$pluginfile1
+cp $BASEDIR/esplugins/$pluginfile2 %{buildroot}/opt/fff/esplugins/$pluginfile2
+cp $BASEDIR/esplugins/$pluginfile3 %{buildroot}/opt/fff/esplugins/$pluginfile3
+cp $BASEDIR/esplugins/$pluginfile4 %{buildroot}/opt/fff/esplugins/$pluginfile4
+cp $BASEDIR/esplugins/install.sh %{buildroot}/opt/fff/esplugins/install.sh
+cp $BASEDIR/esplugins/uninstall.sh %{buildroot}/opt/fff/esplugins/uninstall.sh
+
+echo "#!/bin/bash" >> %{buildroot}/etc/init.d/fffmeta
+echo "#" >> %{buildroot}/etc/init.d/fffmeta
+echo "# chkconfig: 2345 79 22" >> %{buildroot}/etc/init.d/fffmeta
+echo "#" >> %{buildroot}/etc/init.d/fffmeta
+echo "if [ \\\$1 == \"start\" ]; then" >> %{buildroot}/etc/init.d/fffmeta
+echo " /opt/fff/configurefff.sh" >> %{buildroot}/etc/init.d/fffmeta
+echo " exit 0" >> %{buildroot}/etc/init.d/fffmeta
+echo "fi" >> %{buildroot}/etc/init.d/fffmeta
+echo "if [ \\\$1 == \"restart\" ]; then" >> %{buildroot}/etc/init.d/fffmeta
+echo "/opt/fff/configurefff.sh" >> %{buildroot}/etc/init.d/fffmeta
+echo " exit 0" >> %{buildroot}/etc/init.d/fffmeta
+echo "fi" >> %{buildroot}/etc/init.d/fffmeta
+echo "if [ \\\$1 == \"status\" ]; then" >> %{buildroot}/etc/init.d/fffmeta
+echo "echo fffmeta does not have status" >> %{buildroot}/etc/init.d/fffmeta
+echo " exit 0" >> %{buildroot}/etc/init.d/fffmeta
+echo "fi" >> %{buildroot}/etc/init.d/fffmeta
+
+
+%files
+%defattr(-, root, root, -)
+#/opt/fff
+%attr( 755 ,root, root) /opt/fff/setupmachine.py
+%attr( 755 ,root, root) /opt/fff/setupmachine.pyc
+%attr( 755 ,root, root) /opt/fff/setupmachine.pyo
+%attr( 700 ,root, root) /opt/fff/configurefff.sh
+%attr( 755 ,root, root) /etc/init.d/fffmeta
+%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile1
+%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile2
+%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile3
+%attr( 444 ,root, root) /opt/fff/esplugins/$pluginfile4
+%attr( 755 ,root, root) /opt/fff/esplugins/install.sh
+%attr( 755 ,root, root) /opt/fff/esplugins/uninstall.sh
+
+%post
+#echo "post install trigger"
+chkconfig --del fffmeta
+chkconfig --add fffmeta
+#disabled, can be run manually for now
+
+%triggerin -- elasticsearch
+#echo "triggered on elasticsearch update or install"
+/sbin/service elasticsearch stop
+python2.6 /opt/fff/setupmachine.py restore,elasticsearch
+python2.6 /opt/fff/setupmachine.py elasticsearch,web $params
+#update permissions in case new rpm changed uid/guid
+chown -R elasticsearch:elasticsearch /var/log/elasticsearch
+chown -R elasticsearch:elasticsearch /var/lib/elasticsearch
+
+/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname1 > /dev/null
+/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile1 $pluginname1
+
+/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname2 > /dev/null
+/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile2 $pluginname2
+
+/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname3 > /dev/null
+/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile3 $pluginname3
+
+/opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname4 > /dev/null
+/opt/fff/esplugins/install.sh /usr/share/elasticsearch $pluginfile4 $pluginname4
+
+chkconfig --del elasticsearch
+chkconfig --add elasticsearch
+chkconfig --add httpd
+#todo:kill java process if running to have clean restart
+/sbin/service elasticsearch start
+/sbin/service httpd restart || true
+
+%preun
+
+if [ \$1 == 0 ]; then
+
+ chkconfig --del fffmeta
+ chkconfig --del elasticsearch
+ chkconfig --del httpd
+
+ /sbin/service elasticsearch stop || true
+ /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname1 || true
+ /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname2 || true
+ /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname3 || true
+ /opt/fff/esplugins/uninstall.sh /usr/share/elasticsearch $pluginname4 || true
+ /sbin/service httpd stop || true
+
+
+ python2.6 /opt/fff/setupmachine.py restore,elasticsearch
+fi
+
+#%verifyscript
+
+EOF
+
+rpmbuild --target noarch --define "_topdir `pwd`/RPMBUILD" -bb fffmeta-tribe.spec
+
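
The fffmeta-tribe package configures an elasticsearch tribe node that federates the per-appliance clusters. A sketch (not part of the diff) of checking it from Python 2.6 via the REST API, assuming a placeholder host name:

# Query cluster health through the tribe node.
import json
import urllib2

tribe_host = 'es-tribe-host.cms'   # hypothetical host
resp = urllib2.urlopen('http://%s:9200/_cluster/health' % tribe_host, timeout=5)
print json.dumps(json.loads(resp.read()), indent=2)
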
diff --git a/scripts/unmountloopfs.sh b/scripts/unmountloopfs.sh
new file mode 100755
index 0000000..7079446
--- /dev/null
+++ b/scripts/unmountloopfs.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+if [ -n "$1" ]; then
+ if [ -d $1 ]; then
+
+ basedir=`readlink -e $1`
+ umask 0
+ points=`mount | grep $basedir/ | grep /dev/loop | awk '{print $3}'`
+ imgs=`mount | grep $basedir/ | grep /dev/loop | awk '{print $1}'`
+ pointarr=( $points )
+ imgarr=( $imgs )
+
+ len=${#pointarr[@]}
+ len2=${#imgarr[@]}
+ if [[ $len == 0 ]]; then
+ exit 0
+ fi
+ max=$((len))
+
+ for i in $(seq 0 1 $max)
+ do
+ if [ $i == $max ]; then continue; fi
+ point=${pointarr[$i]}
+ image=${imgarr[$i]}
+ #protect from dangerous action
+ if [ $point == "/" ]; then continue; fi
+ if [ $point == "//" ]; then continue; fi
+ if [ $point == "/fff" ]; then continue; fi
+ if [ $point == "/fff/" ]; then continue; fi
+ if [ $point == "/fff/ramdisk" ]; then continue; fi
+ if [ $point == "/fff/ramdisk/" ]; then continue; fi
+ if [ $point == "fff/ramdisk" ]; then continue; fi
+ if [ $point == "fff/ramdisk/" ]; then continue; fi
+
+ echo "found mountpoint $point $image"
+ #kill any processes that might use the mount point and remove from NFS
+ fuser -km $point
+ #unmount loop device
+ sleep 0.2
+ exportfs -u *:$point
+ umount $point
+ if [ $? != 0 ]; then
+ sleep 0.1
+ fuser -km $point
+ sleep 0.2
+ exportfs -u *:$point
+ umount $point
+ if [ $? != 0 ]; then
+ echo "Unsuccessful unmount of $point !"
+ exit 1
+ fi
+ fi
+
+ #deleting mount point
+ exportfs -u *:$point
+ rm -rf $point
+ if [ $? != 0 ]; then
+ echo "Unsuccessful delete of unmounted mount point $point !"
+ exit 2
+ fi
+
+ #remove image
+ chmod 755 $image
+ rm -rf $image
+ if [ $? != 0 ]; then
+ echo "Unsuccessful delete of image file $image"
+ exit 3
+ fi
+ done
+ exit 0
+ else
+ echo "base directory not found!"
+ fi
+fi
+exit 1
diff --git a/test/crashtest.py b/test/crashtest.py
index 52d8d46..72b6d49 100644
--- a/test/crashtest.py
+++ b/test/crashtest.py
@@ -88,7 +88,7 @@ def process(self):
dirname = sys.argv[1]
dirname = os.path.basename(os.path.normpath(dirname))
watchDir = os.path.join(conf.watch_directory,dirname)
- outputDir = conf.micromerge_output
+ #outputDir = conf.micromerge_output
@@ -119,4 +119,4 @@ def process(self):
notifier.stop()
print "Quit"
- sys.exit(0)
\ No newline at end of file
+ sys.exit(0)