# simplemonitor.py -- forked from jamesoff/simplemonitor
# coding=utf-8
"""Execution logic for SimpleMonitor."""
import signal
import copy
import pickle
import time
import logging
import Loggers
import Monitors
module_logger = logging.getLogger('simplemonitor')


class SimpleMonitor:
    """Registry of monitors, loggers and alerters plus the logic to run them.

    Monitors are run in dependency order by run_tests(); afterwards their
    state is handed to every registered alerter and logger. run_loop()
    performs one complete pass.
    """

    # TODO: move this outside into monitor.py?
    # could give better control over restarting the listener thread
    need_hup = False

    def __init__(self, allow_pickle=True):
        """Main class turn on.

        allow_pickle: when true, update_remote_monitor() will fall back to
        unpickling remote state that is not supplied as a dict.
        """
        self.allow_pickle = allow_pickle
        self.monitors = {}
        self.failed = []
        self.still_failing = []
        self.skipped = []
        self.warning = []
        self.remote_monitors = {}
        self.loggers = {}
        self.alerters = {}

        try:
            signal.signal(signal.SIGHUP, self.hup_loggers)
        except (ValueError, AttributeError):  # pragma: no cover
            # AttributeError: the platform has no SIGHUP.
            # ValueError: signal.signal() refused the registration.
            module_logger.warning("Unable to trap SIGHUP... maybe it doesn't exist on this platform.\n")

    def hup_loggers(self, sig_number, stack_frame):
        """Handle a SIGHUP (rotate logfiles).

        We set a variable to say we want to do this later (so it's done at
        the right time); do_logs() acts on it. Both arguments are the
        standard signal-handler parameters and are not used.
        """
        self.need_hup = True
        module_logger.info("We get signal.")

    def add_monitor(self, name, monitor):
        """Register (or replace) the monitor stored under name."""
        self.monitors[name] = monitor

    def set_tolerance(self, monitor, tolerance):
        """Forward tolerance to the monitor registered under the name monitor."""
        self.monitors[monitor].set_tolerance(tolerance)

    def set_urgency(self, monitor, urgency):
        """Forward urgency to the monitor registered under the name monitor."""
        self.monitors[monitor].set_urgency(urgency)

    def set_dependencies(self, name, dependencies):
        """Update a monitor's dependencies."""
        self.monitors[name].set_dependencies(dependencies)

    def reset_monitors(self):
        """Clear all the monitors' dependency info back to default."""
        for key in list(self.monitors.keys()):
            self.monitors[key].reset_dependencies()

    def verify_dependencies(self):
        """Check that every configured dependency names a known monitor.

        Logs one critical message per missing dependency and returns False
        if any were found, True otherwise.
        """
        ok = True
        for k in list(self.monitors.keys()):
            for dependency in self.monitors[k]._dependencies:
                # Membership is tested against the dict itself rather than a
                # freshly built list of its keys.
                if dependency not in self.monitors:
                    module_logger.critical("Configuration error: dependency %s of monitor %s is not defined!", dependency, k)
                    ok = False
        return ok

    def run_tests(self):
        """Run every monitor once, honouring dependencies.

        Each pass walks the current job list. A monitor which still has
        outstanding dependencies is deferred to the next pass, unless one of
        those dependencies has failed, in which case it is recorded as
        skipped and counted as failed itself. A monitor with no outstanding
        dependencies is run (if it wants to be) and, on success, is reported
        to the other monitors in the pass via dependency_succeeded().

        If a pass defers every monitor it looked at, nothing can change on
        the next pass either, so the loop stops instead of spinning forever.
        """
        self.reset_monitors()

        joblist = list(self.monitors.keys())
        failed = []
        not_run = False

        while joblist:
            new_joblist = []
            module_logger.debug("Starting loop with joblist %s", joblist)
            for monitor in joblist:
                module_logger.debug("Trying monitor: %s", monitor)
                if len(self.monitors[monitor].get_dependencies()) > 0:
                    # this monitor has outstanding deps, put it on the new joblist for next loop
                    new_joblist.append(monitor)
                    module_logger.debug("Added %s to new joblist, is now %s", monitor, new_joblist)
                    for dep in self.monitors[monitor].get_dependencies():
                        module_logger.debug("considering %s's dependency %s (failed monitors: %s)", monitor, dep, failed)
                        if dep in failed:
                            # oh wait, actually one of its deps failed, so we'll never be able to run it
                            module_logger.info("Doesn't look like %s worked, skipping %s", dep, monitor)
                            failed.append(monitor)
                            self.monitors[monitor].record_skip(dep)
                            try:
                                new_joblist.remove(monitor)
                            except Exception:
                                module_logger.exception("Exception caught while trying to remove monitor %s with failed deps from new joblist.", monitor)
                                module_logger.debug("new_joblist is currently: %s", new_joblist)
                            break
                    continue

                try:
                    if self.monitors[monitor].should_run():
                        not_run = False
                        start_time = time.time()
                        self.monitors[monitor].run_test()
                        end_time = time.time()
                        self.monitors[monitor].last_run_duration = end_time - start_time
                    else:
                        not_run = True
                        self.monitors[monitor].record_skip(None)
                        module_logger.info("Not run: %s", monitor)
                except Exception:
                    module_logger.exception("Monitor %s threw exception during run_test()", monitor)

                if self.monitors[monitor].get_error_count() > 0:
                    if self.monitors[monitor].virtual_fail_count() == 0:
                        module_logger.warning("monitor failed but within tolerance: %s", monitor)
                    else:
                        module_logger.error("monitor failed: %s (%s)", monitor, self.monitors[monitor].last_result)
                    failed.append(monitor)
                else:
                    if not not_run:
                        module_logger.info("monitor passed: %s", monitor)
                    for monitor2 in joblist:
                        self.monitors[monitor2].dependency_succeeded(monitor)

            # new_joblist only ever holds deferred members of joblist, so equal
            # lengths mean nothing ran and nothing was skipped in this pass:
            # the next pass would be identical (circular or unknown dependency).
            if len(new_joblist) == len(joblist):
                module_logger.critical(
                    "Unable to run monitors %s: their dependencies can never be satisfied "
                    "(circular or undefined dependency?)",
                    new_joblist)
                break
            joblist = new_joblist

    def log_result(self, logger):
        """Use the given logger object to log our state.

        Local monitors are logged first, then remote monitors; a failure
        while logging remote monitors is logged and does not prevent the
        batch from being closed.
        """
        logger.check_dependencies(self.failed + self.still_failing + self.skipped)
        logger.start_batch()
        for key in list(self.monitors.keys()):
            self.monitors[key].log_result(key, logger)
        try:
            for key in list(self.remote_monitors.keys()):
                module_logger.info('remote logging for %s', key)
                self.remote_monitors[key].log_result(key, logger)
        except Exception:  # pragma: no cover
            module_logger.exception("exception while logging remote monitors")
        logger.end_batch()

    def do_alert(self, alerter):
        """Use the given alerter object to send an alert, if needed.

        Local monitors are offered to the alerter when they are in one of
        the alerter's groups, have notify set and do not ask for remote
        alerting. Remote monitors are offered only when they do ask for
        remote alerting.
        """
        alerter.check_dependencies(self.failed + self.still_failing + self.skipped)
        for key in list(self.monitors.keys()):
            # Don't generate alerts for monitors which want it done remotely
            if self.monitors[key].remote_alerting:
                # TODO: could potentially disable alerts by setting a monitor to remote alerting, but not having anywhere to send it!
                module_logger.debug("skipping alert for monitor %s as it wants remote alerting", key)
                continue
            module_logger.debug("considering alert for monitor %s (group: %s) with alerter %s (groups: %s)",
                                self.monitors[key].name,
                                self.monitors[key].group,
                                alerter.name,
                                alerter.groups
                                )
            try:
                if self.monitors[key].group in alerter.groups:
                    # Only notifications for services that have it enabled
                    if self.monitors[key].notify:
                        module_logger.debug("notifying alerter %s", alerter.name)
                        alerter.send_alert(key, self.monitors[key])
                    else:
                        module_logger.info("skipping alerters for disabled monitor %s", key)
                else:
                    module_logger.info("skipping alerter %s as monitor is not in group", alerter.name)
            except Exception:  # pragma: no cover
                module_logger.exception("exception caught while alerting for %s", key)
        for key in list(self.remote_monitors.keys()):
            try:
                if self.remote_monitors[key].remote_alerting:
                    alerter.send_alert(key, self.remote_monitors[key])
                else:
                    module_logger.debug("not alerting for monitor %s as it doesn't want remote alerts", key)
            except Exception:  # pragma: no cover
                module_logger.exception("exception caught while alerting for %s", key)

    def count_monitors(self):
        """Gets the number of monitors we have defined."""
        return len(self.monitors)

    def add_alerter(self, name, alerter):
        """Register (or replace) the alerter stored under name."""
        self.alerters[name] = alerter

    def add_logger(self, name, logger):
        """Register logger under name if it is a Loggers.logger.Logger.

        Anything else is rejected with a critical log message.
        """
        if isinstance(logger, Loggers.logger.Logger):
            self.loggers[name] = logger
        else:
            module_logger.critical('Failed to add logger because it is not the right type')

    def do_alerts(self):
        """Run do_alert() for every registered alerter."""
        for key in list(self.alerters.keys()):
            self.do_alert(self.alerters[key])

    def do_recovery(self):
        """Ask every monitor to attempt its recovery action."""
        for key in list(self.monitors.keys()):
            self.monitors[key].attempt_recover()

    def do_logs(self):
        """Process a pending HUP, then log current state to every logger."""
        if self.need_hup:
            module_logger.info("Processing HUP.")
            for logger in self.loggers:
                self.loggers[logger].hup()
            self.need_hup = False

        for key in list(self.loggers.keys()):
            self.log_result(self.loggers[key])

    def update_remote_monitor(self, data, hostname):
        """Store monitor state received from a remote instance.

        data maps monitor names to their state. A dict state is rebuilt via
        the monitor class named in state['cls_type']; any other state is
        treated as a pickle if allow_pickle is set, and rejected otherwise.
        hostname is accepted for interface compatibility and is not used.
        """
        for (name, state) in data.items():
            module_logger.info("updating remote monitor %s", name)
            if isinstance(state, dict):
                remote_monitor = Monitors.monitor.get_class(state['cls_type']) \
                    .from_python_dict(state['data'])
                self.remote_monitors[name] = remote_monitor
            elif self.allow_pickle:
                # Fallback for old remote monitors
                # SECURITY: unpickling can execute arbitrary code; only enable
                # allow_pickle when every remote sender is trusted.
                try:
                    remote_monitor = pickle.loads(state)
                except Exception:
                    # pickle.loads raises more than UnpicklingError on bad
                    # input (EOFError, AttributeError, ImportError, ...); one
                    # bad payload must not abort the rest of the update.
                    module_logger.critical('Could not unpickle monitor %s', name)
                else:
                    self.remote_monitors[name] = remote_monitor
            else:
                module_logger.critical(
                    'Could not deserialize state of monitor %s. '
                    'If the remote host uses an old version of '
                    'simplemonitor, you need to set allow_pickle = true '
                    'in the [monitor] section.',
                    name)

    def run_loop(self):
        """Run the complete monitor loop once."""
        module_logger.debug('Running tests')
        self.run_tests()
        module_logger.debug('Running recovery')
        self.do_recovery()
        module_logger.debug('Running alerts')
        self.do_alerts()
        module_logger.debug('Running logs')
        self.do_logs()
        module_logger.debug('Loop complete')