配置文件:
###############################################################################
# Config file to monitor CHM processes, YAML format
#
# procTask DataFile Record fields:
# Date Time Node Command PID PCPU VSZ(KB) RSS(KB) StartTime Info
#
# fileTask DataFile Record fields:
# FileName CreateTime Size(KB) INFO
#
###############################################################################
# Common settings for the monitor
logLevel: debug # debug, info, warn, error, fatal, default: debug
logFile: monitor.log # log filename, default: monitor.log
# Process-name patterns. They are single-quoted on purpose: inside a
# double-quoted YAML scalar "\." is an invalid escape sequence and the file
# fails to parse, whereas a single-quoted scalar keeps the backslash literally.
allProc: 'ASM|asm|MGMT|mgmt|APX|apx|d\.bin|cssdagent|cssdmonitor|orarootagent.bin|oraagent.bin|scriptagent.bin|tnslsnr'
asmProc: 'ASM|asm'
mgmtProc: 'MGMT|mgmt'
apxProc: 'APX|apx'
crsProc: 'd\.bin|cssdagent|cssdmonitor|orarootagent.bin|oraagent.bin|scriptagent.bin|tnslsnr'
# monitoring process settings
OCSSDProcTask:
    type: processTask # processTask - process monitor task, fileTask - file monitor task
    exec: true # true - execute this task, false - don't execute this task, default true
    process: ocssd.bin|ohasd.bin # process name list to be monitored, separated by '|', used by grep to select
    nodes: [rws00fys, rws00fyt, rws00fyu, rws00fyv] # optional, default is local host
    checkInterval: 1 # interval seconds to check status, default: 1 seconds
    statCount: 5 # count of checkInterval to statistic, default: 5
    cpuThreshold: 10 # cpu usage percent threshold, default: 10.0%
    memThreshold: 409600 # rss memory threshold, in KB, default: 400MB
    DataFile: procmonitor.dat
    actions: ~
# monitoring file settings: use regular expression to match filenames
OCSSDFileTask:
    type: fileTask # processTask - process monitor task, fileTask - file monitor task
    exec: true # true - execute this task, false - don't execute this task
    file: # file to be monitored
        - '/u01/app/crsusr/diag/crs/<LOCAL_HOST>/crs/trace/ohasd[_0-9]*.trc'
        - '/u01/app/crsusr/diag/crs/<LOCAL_HOST>/crs/trace/ocssd[_0-9]*.trc'
    checkInterval: 60 # interval seconds to check status, default: 60 seconds
    statCount: 5 # count of checkInterval to statistic, default: 5
    wrapTime: 600
    wrapCount: 20 # in wrapTime seconds, no more than wrapCount files generated
    DataFile: filemonitor.dat
    actions: ~
# monitoring process settings
CHAMProcTask:
    type: processTask # processTask - process monitor task, fileTask - file monitor task
    exec: false # true - execute this task, false - don't execute this task
    process: osysmond|ologgerd|mdb_pmon # process name list to be monitored, separated by '|', used by grep to select
    nodes: [rws00fys, rws00fyt, rws00fyu, rws00fyv] # optional, default is local host
    checkInterval: 1 # interval seconds to check status, default: 1 seconds
    statCount: 5 # count of checkInterval to statistic, default: 5
    cpuThreshold: 10 # cpu usage percent threshold, default: 10.0%
    memThreshold: 409600 # rss memory threshold, in KB, default: 400MB
    DataFile: procmonitor.dat
    actions: ~
CHAMFileTask:
    type: fileTask # processTask - process monitor task, fileTask - file monitor task
    exec: false # true - execute this task, false - don't execute this task
    file: # file to be monitored
        - '/u01/app/crsusr/diag/crs/<LOCAL_HOST>/crs/trace/osysmond[_0-9]*.trc'
        - '/u01/app/crsusr/diag/crs/<LOCAL_HOST>/crs/trace/ologgerd[_0-9]*.trc'
        - '/u01/app/12.2.0/grid/crf/db/<LOCAL_HOST>/json/jsondump.log'
    checkInterval: 60 # interval seconds to check status, default: 60 seconds
    statCount: 5 # count of checkInterval to statistic, default: 5
    wrapTime: 600
    wrapCount: 20 # in wrapTime seconds, no more than wrapCount files generated
    DataFile: filemonitor.dat
    actions: ~
运行代码
#!/usr/bin/env python
"""
This script runs a sequence of commands on a remote host using SSH, it will monitor assigned
processes, it will report a warning if the process reboot or quit, or CPU/MEMORY exceed assigned
limits.
It also perform some simple system health checks such as open file descriptors number,
port status and memory usage etc.
"""
import sys
import os
import platform
import time
from datetime import datetime,timedelta
import getopt
import getpass
import logging
import traceback
import subprocess
import yaml
import socket
import signal
import time
import threading
import re
# NOTE(review): a `global` statement at module level has no effect, and neither
# LOGGER nor OS_NAME is assigned in the visible part of this file.
global LOGGER, OS_NAME
# NOTE(review): presumably a timeout in seconds; not referenced in the visible code - confirm.
TIME_OUT = 20
# Fallback log file name and log level name used when the config file omits them.
DEFAULT_LOGFILE = "ractest.log";
DEFAULT_LOGLEVEL = "debug"
# Markers used by parseString(): <NAME> delimits a placeholder, and a
# backslash in front of a marker escapes it.
EXPR_NAME_ESC = '\\'
EXPR_NAME_BEGIN = '<'
EXPR_NAME_END = '>'
# Operating-system name strings; presumably meant to be compared against
# platform.system() / platform.release() - TODO confirm against callers.
SYSTEM_LINUX = 'Linux'
SYSTEM_SOLARIS = 'SunOS'
SYSTEM_WINDOWS = 'Windows'
SYSTEM_MACOS = 'Darwin'
SYSTEM_AIX = 'AIX'
SYSTEM_HPUX = 'HPUX'
# Short host name: everything before the first '.' of socket.gethostname().
LOCAL_HOST = socket.gethostname().split(".")[0]
# Main-loop run flag; sigHandler() sets it to False and the main loop polls it.
g_bRunApplication = True
def findString(file_name, value):
    """
    Return the first line of a text file that contains a given substring.
    file_name: path of the file to scan
    value: substring to look for
    return the first matching line (line terminator included), or None when
    no line of the file contains value
    """
    # open() replaces the Python-2-only file() builtin, and the with-block
    # closes the handle on every exit path (the original leaked it).
    with open(file_name, 'r') as file1:
        for line in file1:
            if value in line:
                return line
    # Only give up after every line has been examined.
    return None
def find_in_list(array_list, value):
    """
    Find every position at which a value occurs in a sequence.
    array_list: iterable to search
    value: value to compare each element with (using ==)
    return a list of the zero-based indexes of all equal elements, in
    ascending order; empty when there is no match
    """
    # enumerate() replaces the manual counter, and == replaces cmp(), which
    # no longer exists in Python 3.
    return [i for i, item in enumerate(array_list) if item == value]
def strpTimeDelta(strDayTime):
    '''
    Build a timedelta from an elapsed-time string.
    strDayTime: string, format: [days-][hour:]minutes:seconds
    return the timedelta described by the string
    '''
    # Optional "days-" prefix: the text before the first '-' is the day count.
    parts = strDayTime.split('-')
    if len(parts) > 1:
        day_count = int(parts[0])
        clock = parts[1]
    else:
        day_count = 0
        clock = parts[0]

    # Clock part: three or more fields means the first one is the hour.
    fields = clock.split(':')
    if len(fields) > 2:
        hours, minutes, seconds = [int(item) for item in fields[:3]]
    else:
        hours = 0
        minutes = int(fields[0])
        seconds = int(fields[1])

    return timedelta(days=day_count, hours=hours, minutes=minutes, seconds=seconds)
def runSSHOneCmd(host, command):
    """
    Run one command, locally when host is LOCAL_HOST and through ssh otherwise.
    host: target host name
    command: command text; it is handed to the local shell (shell=True), so
             for a remote host it is expanded locally before ssh sees it
    return the combined stdout/stderr output as a list of lines, with one
    trailing blank line removed
    """
    if host == LOCAL_HOST:
        shell_cmd = "%s" % (command)
    else:
        shell_cmd = "ssh %s %s" % (host, command)

    # stderr is folded into stdout so that callers get a single stream.
    child = subprocess.Popen(shell_cmd, shell=True,
                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    merged_output = child.communicate()[0]

    lines = merged_output.split('\n')
    # Drop the empty element produced by the final newline, but keep a lone
    # element so that empty output still yields a one-item list.
    if len(lines) > 1 and not lines[-1].split():
        del lines[-1]
    return lines
def readPropertyFile(strFileName, sep='=', comment_char='#'):
    """
    Load a key/value properties file into a dict.
    Blank lines and lines starting with comment_char are skipped. Each
    remaining line is cut at the first sep: the text before it is the key and
    the text after it is the value, both stripped of surrounding whitespace,
    the value additionally of surrounding double quotes. A line without sep
    yields an empty value. Later duplicates overwrite earlier ones.
    """
    props = {}
    with open(strFileName, "rt") as prop_file:
        for raw_line in prop_file:
            entry = raw_line.strip()
            if not entry or entry.startswith(comment_char):
                continue
            key, _, rest = entry.partition(sep)
            props[key.strip()] = rest.strip().strip('"')
    return props
def parseString(strExp, dict = None):
    """
    Expand <NAME> placeholders in strExp with values looked up in dict.
    strExp: string that may contain placeholders delimited by
            EXPR_NAME_BEGIN / EXPR_NAME_END
    dict: mapping of placeholder name to value; when None, only LOCAL_HOST
          is known
    return the expanded string

    Rules:
      - a marker preceded by EXPR_NAME_ESC is literal, and the escape
        character is dropped from the result
      - a name that is not in dict is replaced by the bare name (the markers
        are dropped)
      - an empty placeholder is removed
      - when a begin mark has no matching end mark, the rest of the string is
        copied unchanged
    Example:
      if LOCAL_HOST = "rws00fys",
      /u01/app/crsusr/diag/<LOCAL_HOST>/...
      will be parsed as:
      /u01/app/crsusr/diag/rws00fys/...
    """
    if (dict is None):
        dict = {"LOCAL_HOST":LOCAL_HOST}
    nNormal = 0           # start of the text not yet copied to the result
    strResult = ""
    # find the first begin mark
    nBegin = strExp.find(EXPR_NAME_BEGIN, nNormal)
    while (nBegin >= 0):
        # nBegin > 0 is required: at index 0 there is no preceding character,
        # and strExp[-1] would wrongly inspect the LAST character instead.
        if (nBegin > 0 and strExp[nBegin-1] == EXPR_NAME_ESC):
            # escaped begin mark: copy the text before the ESC, then the mark
            strResult += strExp[nNormal:nBegin-1]
            strResult += EXPR_NAME_BEGIN
            nNormal = nBegin+1
            nBegin = strExp.find(EXPR_NAME_BEGIN, nNormal)
            continue
        # find the end mark, skipping escaped end marks inside the name
        nName = nBegin+1
        strName = ""
        nEnd = strExp.find(EXPR_NAME_END, nName)
        while (nEnd > 0 and strExp[nEnd-1] == EXPR_NAME_ESC):
            strName += strExp[nName:nEnd-1]     # text before the ESC
            strName += EXPR_NAME_END            # the escaped end mark itself
            nName = nEnd+1
            nEnd = strExp.find(EXPR_NAME_END, nName)
        if (nEnd < 0):
            # unterminated placeholder: stop and copy the remainder verbatim
            break
        strName += strExp[nName:nEnd]
        strName = strName.replace(EXPR_NAME_ESC+EXPR_NAME_BEGIN, EXPR_NAME_BEGIN)
        strName = strName.replace(EXPR_NAME_ESC+EXPR_NAME_END, EXPR_NAME_END)
        strResult += strExp[nNormal:nBegin]
        if (strName is not None and len(strName) > 0):
            if strName not in dict:
                # unknown name: keep the name, drop the markers
                strResult += strName
            else:
                strResult += str(dict[strName])
        # find next begin mark
        nNormal = nEnd+1
        nBegin = strExp.find(EXPR_NAME_BEGIN, nNormal)
    # end of string parse: copy whatever is left, then drop remaining escapes
    strResult += strExp[nNormal:]
    strResult = strResult.replace(EXPR_NAME_ESC+EXPR_NAME_BEGIN, EXPR_NAME_BEGIN)
    strResult = strResult.replace(EXPR_NAME_ESC+EXPR_NAME_END, EXPR_NAME_END)
    return strResult
def sigHandler(sig, frame):
    """
    Signal handler that asks the main loop to stop.
    sig: number of the signal that was caught
    frame: current stack frame (unused)
    Clears the module-level g_bRunApplication flag and logs before and after.
    """
    global g_bRunApplication
    logging.info("sigHandler: Caught signal: %s", sig)
    g_bRunApplication = False
    logging.info("sigHandler: g_bRunApplication: %s", g_bRunApplication)
def exitWithUsage():
    """
    Print the module docstring as usage text and terminate with status 0.
    Does not return.
    """
    # Parenthesised single-argument print works in both Python 2 and 3.
    print(globals()['__doc__'])
    # os._exit() ends the process without the normal interpreter cleanup, so
    # buffered stdout must be flushed explicitly or the usage text can be
    # lost when output is piped or redirected.
    sys.stdout.flush()
    os._exit(0)
def initLogging(nLevel = logging.DEBUG, strLogFile = DEFAULT_LOGFILE):
    """
    Configure the root logger to write to a file next to this script.
    nLevel: numeric logging level
    strLogFile: log file name, joined onto the directory of this script
    The chosen file name, level and format are also printed to stdout.
    """
    log_format = "%(asctime)s : %(levelname)s: %(message)s"
    # realpath() resolves symlinks, so the log lands beside the real script.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    log_path = os.path.join(script_dir, strLogFile)
    print("initLogging:\n filename = %s\n level = %d DEBUG=%d\n format = %s" % (log_path, nLevel, logging.DEBUG, log_format))
    logging.basicConfig(filename = log_path, level = nLevel, format = log_format)
###############################################################################
# class RACAppBase
class RACAppBase():
    """
    Base application object: holds the logging settings read from a YAML
    config file and provides start/stop hooks that only log.

    Attributes:
      strLogFile - log file name, DEFAULT_LOGFILE until readConfig() succeeds
      nLogLevel  - numeric logging level, logging.DEBUG by default
      dictTask   - dict, created empty and not filled by this class
    """

    def __init__(self):
        self.strLogFile = DEFAULT_LOGFILE
        self.nLogLevel = logging.DEBUG
        self.dictTask = {}

    def readConfig(self, strFileName):
        """
        readConfig: read config items from config file strFileName
        config items used here:
          logFile:  log file name, default is DEFAULT_LOGFILE
          logLevel: one of info, warn, error, fatal; any other value
                    (including debug) selects logging.DEBUG
        return True on success, False when the file cannot be opened, is not
        valid YAML, or its top level is not a mapping
        """
        try:
            with open(strFileName, 'r') as stream:
                # safe_load builds only plain YAML types; yaml.load without
                # an explicit Loader can construct arbitrary objects and is
                # rejected by recent PyYAML releases.
                yamlObj = yaml.safe_load(stream)
        except (IOError, OSError) as ex:
            # missing or unreadable file: report it the same way as a parse error
            print(ex)
            return False
        except yaml.YAMLError as ex:
            print(ex)
            return False

        # An empty file loads as None and a bare list/scalar has no .get().
        if not isinstance(yamlObj, dict):
            print("Config file '%s' does not contain a YAML mapping." % (strFileName))
            return False

        self.strLogFile = yamlObj.get("logFile", DEFAULT_LOGFILE)
        strLogLevel = yamlObj.get("logLevel", DEFAULT_LOGLEVEL)
        if (strLogLevel == "info"):
            self.nLogLevel = logging.INFO
        elif (strLogLevel == "warn"):
            self.nLogLevel = logging.WARN
        elif (strLogLevel == "error"):
            self.nLogLevel = logging.ERROR
        elif (strLogLevel == "fatal"):
            self.nLogLevel = logging.FATAL
        else:
            self.nLogLevel = logging.DEBUG
        print("Read Config file OK:\n logFile = %s\n logLevel = %s(%d)\n" % (self.strLogFile, strLogLevel, self.nLogLevel))
        return True

    def StartAllTasks(self):
        """Start hook; this base implementation only logs."""
        logging.info("Starting all tasks ...")
        logging.info("Start all tasks OK.")

    def StopAllTasks(self):
        """Stop hook; this base implementation only logs."""
        logging.info("Stopping all tasks ...")
        logging.info("Stop all tasks OK.")
#def main():
if __name__ == "__main__":
    # Script entry point: parse the command line, load the config, set up
    # logging and signal handling, then idle until a signal clears
    # g_bRunApplication.
    appTest = RACAppBase()
    configFile = "process.yml"
    ######################################################################
    ## Parse the options, arguments, get ready, etc.
    ######################################################################
    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'h?f:', ['help','h','?'])
    except getopt.GetoptError as e:
        # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
        # old "except X, e" form; GetoptError is what getopt raises for
        # unknown options or missing option arguments.
        print(str(e))
        exitWithUsage()

    if len(args) > 1:
        exitWithUsage()

    options = dict(optlist)
    if any(elem in ('-h', '--h', '-?', '--?', '--help') for elem in options):
        print("Help:")
        exitWithUsage()

    if '-f' in options:
        configFile = options['-f']
        if not configFile:
            print("The value of \'-f\' is empty, please enter a valid file name!!!")
            sys.exit(0)

    print("config file: %s" % (configFile))
    if not appTest.readConfig(configFile):
        print("Read config file '%s' failed!" % (configFile))
        sys.exit(-1)

    initLogging(appTest.nLogLevel, appTest.strLogFile)
    # Both signals only clear the run flag; shutdown happens in the loop below.
    signal.signal(signal.SIGTERM, sigHandler)
    signal.signal(signal.SIGINT, sigHandler)
    logging.info("Begin testing racutil.")
    appTest.StartAllTasks()
    # Poll the flag twice a second until a signal handler clears it.
    while g_bRunApplication:
        time.sleep(0.5)
    appTest.StopAllTasks()
    logging.info("End test racutil.")
if __name__ == "__main__":
    #global g_bRunMonitor
    appTest = RACAppBase()
    #global g_bRunApplication
    configFile = "process.yml"
    ######################################################################
    ## Parse the options, arguments, get ready, etc.
    ######################################################################
    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'h?f:', ['help','h','?'])
    except Exception, e:
        print str(e)
        exitWithUsage()

    if len(args) > 1:
        exitWithUsage()
    options = dict(optlist)
    if [elem for elem in options if elem in ['-h','--h','-?','--?','--help']]:
        print "Help:"
        exitWithUsage()
    if '-f' in options:
        configFile = options['-f']
        #print "filename=", configFile
        if not configFile:
            print "The value of \'-f\' is empty, please enter a valid file name!!!"
            sys.exit(0)
?
|