-
Notifications
You must be signed in to change notification settings - Fork 0
/
faultguard.py
196 lines (153 loc) · 6.81 KB
/
faultguard.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
from multiprocessing import Process, Manager
import pickle
from collections.abc import MutableMapping
import signal
class FaultguardDict(MutableMapping):
    """
    Mapping wrapper that keeps every value pickled inside a backing dictionary.

    Values are serialized with pickle when they are assigned and deserialized
    again on every read; keys are passed to the backing dictionary unchanged.
    When created through `start`, the backing dictionary is a multiprocessing
    manager dictionary, so its content lives outside the application process
    and should be preserved if that process experiences a fault.
    """

    def __init__(self, managed_dict):
        # Backing mapping; holds the pickled byte strings, not the live objects.
        self.store = managed_dict

    def __getitem__(self, key):
        serialized = self.store[key]
        return pickle.loads(serialized)

    def __setitem__(self, key, value):
        serialized = pickle.dumps(value)
        self.store[key] = serialized

    def __delitem__(self, key):
        del self.store[key]

    def __iter__(self):
        # Iterates over the keys of the backing mapping.
        return iter(self.store)

    def __len__(self):
        return len(self.store)
def wrapped_launch(launch, managed_dict, signal_handlers, args):
    """
    Entry point of the monitored application process.

    Installs the given signal handlers, wraps the managed dictionary in a
    `FaultguardDict` and hands it to the application's main method.

    :param launch: The applications main method. Called with the faultguard data dictionary and, if `args` is not None, with `args` as second parameter.
    :param managed_dict: Backing dictionary for the faultguard data.
    :param signal_handlers: Mapping from signal to the handler to install for it in this process.
    :param args: Data passed on to `launch`; omitted from the call when None.
    """
    # Attach signal handlers
    for sig in signal_handlers:
        handler = signal_handlers[sig]
        if handler is None:
            # signal.signal() reports None for a previous handler that was not
            # installed from Python, and it rejects None as a new handler with
            # a TypeError. Such an entry cannot be re-installed, so skip it
            # instead of crashing before the application is even launched.
            continue
        signal.signal(sig, handler)

    faultguard_data = FaultguardDict(managed_dict)
    if args is None:
        launch(faultguard_data)
    else:
        launch(faultguard_data, args)
def is_active(autosave_file):
    """
    Report whether an autosave file was accessed within the last two seconds.

    The file's access time is compared against the current time; a recent
    access is taken as a sign that the process creating the file is running.

    :param autosave_file: Path to autosave file.
    :returns: True if the access time differs from the current time by less than two seconds.
    """
    import os
    import time

    last_access = os.stat(autosave_file).st_atime
    now = time.time()
    return abs(last_access - now) < 2
def recover(rescue, autosave_file):
    """
    Read faultguard data back from an autosave file and hand it to a rescue function.

    The main autosave file is tried first. If loading it fails for any reason,
    the issue is printed and the backup file (same path with a ".tmp" suffix)
    is tried instead.

    NOTE: the files are read with pickle, so only recover autosave files from a trusted source.

    :param rescue: The method to call with the recovered faultguard data dictionary.
    :param autosave_file: The file to recover the data from.
    :returns: Whether the main (0) or the backup file (1) was used for recovery
    :raises RuntimeError: If the autosave file still appears to be in use, or if the main file cannot be loaded and no backup file exists.
    """
    # Compression library
    import lzma
    import os

    # A file that looks active gets a second check after two seconds before
    # recovery is refused.
    if is_active(autosave_file):
        import time
        time.sleep(2)
        if is_active(autosave_file):
            raise RuntimeError("Trying to access a backup of a process that is still running.")

    try:
        with lzma.open(autosave_file, "r") as main_file:
            faultguard_data = FaultguardDict(pickle.load(main_file))
    except Exception as e:
        print("The following issue occured during recovery:", e)
    else:
        # Runs outside the guarded region: errors raised by rescue propagate
        # to the caller and do not trigger the backup fallback.
        rescue(faultguard_data)
        return 0

    backup_path = autosave_file + ".tmp"
    if not os.path.isfile(backup_path):
        raise RuntimeError("Recovery unsuccessful.")

    print("Switching to try recovery of backup file")
    with lzma.open(backup_path, "r") as backup_file:
        faultguard_data = FaultguardDict(pickle.load(backup_file))
    rescue(faultguard_data)
    return 1
def start(launch, rescue, args=None, autosave_interval=None, autosave_file=None):
    """
    Start application through faultguard.

    Launch and rescue have access to the same dictionary. Each entry in this dictionary is stored as serialized data using the python internal 'pickle' method. The "launch" method runs in a separate process so a fault in that process should not affect the data stored in the dictionary.

    If the autosave parameters are set, the dictionary is compressed and saved in the specified time interval to the specified path; the previous save is kept next to it with a ".tmp" suffix. Once the monitored application has exited (and, on a fault, after `rescue` has returned), the autosave files are deleted.

    :param launch: The applications main method. Accepts faultguard data dictionary as first and args (if not None) as second parameter.
    :param rescue: The method to call if the application exits with a non-zero exit code. Accepts faultguard data dictionary as first, the exit code as second and args (if not None) as third parameter.
    :param args: Data passed to launch and rescue methods.
    :param autosave_interval: Time in seconds between each autosave of the `faultguard` dictionary.
    :param autosave_file: Path to file to use for autosaves.
    :returns: The applications exit code.
    :raises TypeError: If only one of `autosave_interval` and `autosave_file` is given.
    :raises RuntimeError: If the autosave file already exists or is not writable.
    """
    # Imported unconditionally so that later code never depends on which
    # parameter branch happened to run.
    import lzma  # Compression library
    import os
    import time

    # Ensure valid parameters
    if autosave_interval is not None or autosave_file is not None:
        if autosave_interval is None or autosave_file is None:
            raise TypeError("Only one of the arguments 'autosave_interval' and 'autosave_file' is defined")

        if os.path.isfile(autosave_file):
            raise RuntimeError("The given autosave file already exists")
        # Creates the (empty) autosave file; the autosave loop relies on it existing.
        with open(autosave_file, "w") as f:
            if not f.writable():
                raise RuntimeError("The given autosave file is not writable")
        if os.path.isfile(autosave_file + ".tmp"):
            os.remove(autosave_file + ".tmp")

    # Signals that are left untouched by the faultguard process.
    untouched_signals = ("CTRL_C_EVENT", "CTRL_BREAK_EVENT", "SIGKILL", "SIGSTOP", "SIGCHLD")

    orig_handlers = {}
    try:
        # Detach signal handlers from faultguard process
        # Ensures faultguard does not interfere with signals like SIGINT
        for sig in signal.Signals:
            if sig.name in untouched_signals:
                continue
            if signal.getsignal(sig) is None:
                # Handler was not installed from Python: it could be neither
                # handed to the application process nor restored afterwards
                # (signal.signal() rejects None), so leave it in place.
                continue
            orig_handlers[sig] = signal.signal(sig, signal.SIG_IGN)

        # Setup process
        manager = Manager()
        managed_dict = manager.dict()
        p = Process(target=wrapped_launch, args=(launch, managed_dict, orig_handlers, args,))

        # Run process
        p.start()
        if autosave_interval is None:
            p.join()
        else:
            while p.is_alive():
                # Autosave: keep the previous save as backup while the new one is written
                if os.path.isfile(autosave_file + ".tmp"):
                    os.remove(autosave_file + ".tmp")
                os.rename(autosave_file, autosave_file + ".tmp")
                with lzma.open(autosave_file, "w") as f:
                    pickle.dump(dict(managed_dict), f)
                mod_time = time.time()

                # Wait for next autosave
                remaining_interval = autosave_interval
                while remaining_interval > 1:
                    remaining_interval -= 1
                    p.join(1)
                    # Refresh the access time every second (modification time
                    # is kept) so `is_active` can tell the file is in use.
                    os.utime(autosave_file, (time.time(), mod_time))
                p.join(remaining_interval)

        # Copy the data out before the Manager process goes away
        if p.exitcode != 0:
            faultguard_data = FaultguardDict(dict(managed_dict))

        # Close Manager process
        # If this is not done and the faultguard process is terminated, the Manager process
        # would keep running.
        manager.shutdown()
    finally:
        # Re-attach signal handlers, also when something above raised, so the
        # calling process is never left with its signals ignored.
        for sig in orig_handlers:
            signal.signal(sig, orig_handlers[sig])

    if p.exitcode != 0:
        if args is None:
            rescue(faultguard_data, p.exitcode)
        else:
            rescue(faultguard_data, p.exitcode, args)

    if autosave_interval is not None and os.path.isfile(autosave_file):
        # Remove autosave file
        os.remove(autosave_file)
        if os.path.isfile(autosave_file + ".tmp"):
            os.remove(autosave_file + ".tmp")

    return p.exitcode