-
Notifications
You must be signed in to change notification settings - Fork 1
/
update_mgmt_health_check.py
executable file
·397 lines (323 loc) · 17.2 KB
/
update_mgmt_health_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
#!/usr/bin/env python
# Title - OMS Health Check Script for Update Management Linux VM
# Version - 2.0
import sys
import os
import os.path
import socket
import datetime
import imp
import codecs
import json
import string
import platform
import re
import subprocess
from os import walk
# Accumulators filled in by write_log_output() while the checks run.
rule_info_list = []  # one RuleInfo per non-debug result
output = []          # "<status>: <message>\n" report lines

# Files and directories inspected by the checks.
oms_agent_dir = "/var/opt/microsoft/omsagent"
oms_agent_log = "/var/opt/microsoft/omsagent/log/omsagent.log"
oms_admin_conf_path = "/etc/opt/microsoft/omsagent/conf/omsadmin.conf"
current_mof = "/etc/opt/omi/conf/omsconfig/configuration/Current.mof"

# Status labels used both in the text report and in the result message ids.
status_passed, status_failed, status_debug = "Passed", "Failed", "Debug"

# Sentinel meaning "no specific failure reason".
empty_failure_reason = ""

# Workspace id; main() overwrites this with the value from get_workspace().
workspace = ""
class RuleInfo:
    """Result of a single health-check rule.

    main() serializes instances through ``__dict__``, so the attribute names
    below are exactly the keys that appear in the JSON output.
    """

    def __init__(self, rule_id, rule_group_id, status, result_msg_id):
        self.RuleId, self.RuleGroupId = rule_id, rule_group_id
        self.CheckResult, self.CheckResultMessageId = status, result_msg_id
        # Starts empty; the caller appends the message arguments afterwards.
        self.CheckResultMessageArguments = []
def main(output_path=None, return_json_output="False"):
    """Run every health check and report the results.

    Args:
        output_path: Optional directory. When given, the plain-text report is
            also written to ``healthcheck-<UTC ISO timestamp>.log`` inside it;
            the directory is created if it does not exist.
        return_json_output: The string ``"True"`` prints the collected
            RuleInfo entries as a JSON array; any other value prints the
            plain-text report lines instead.

    Raises:
        OSError: if ``output_path`` cannot be created and is not already a
            directory.
    """
    # NOTE: print(...) with a single argument behaves identically as a
    # Python 2 print statement, which is what this script targets.
    if os.geteuid() != 0:
        print("Please run this script as root")
        # sys.exit() instead of the site-provided exit() helper, which is not
        # guaranteed to exist (e.g. python -S). Exit status is unchanged.
        sys.exit()

    # supported python version 2.4.x to 2.7.x
    major, minor = sys.version_info[0], sys.version_info[1]
    if not (major == 2 and 4 <= minor < 8):
        log_msg = "Unsupport python version:(%s). Supported python version (2.4 to 2.7)" % str(sys.version_info)
        print(log_msg)
        sys.exit()

    global workspace
    workspace = get_workspace()

    get_machine_info()
    check_os_version()
    check_oms_agent_installed()
    check_oms_agent_running()
    check_multihoming()
    check_hybrid_worker_package_present()
    check_hybrid_worker_running()
    check_general_internet_connectivity()
    check_agent_service_endpoint()
    check_jrds_endpoint(workspace)
    check_log_analytics_endpoints()

    if return_json_output == "True":
        print(json.dumps([obj.__dict__ for obj in rule_info_list]))
    else:
        for line in output:
            print(line)

    if output_path is not None:
        try:
            os.makedirs(output_path)
        except OSError:
            # An already-existing directory is fine; anything else is fatal.
            if not os.path.isdir(output_path):
                raise
        log_path = "%s/healthcheck-%s.log" % (output_path, datetime.datetime.utcnow().isoformat())
        log_file = open(log_path, "w")
        try:
            log_file.write("".join(output))
        finally:
            # Release the handle even when the write fails.
            log_file.close()
        print("Output is written to " + log_path)
def get_machine_info():
    """Log the output of ``hostnamectl`` as a debug entry when it is installed.

    Nothing is logged when ``which hostnamectl`` exits with a non-zero status.
    """
    devnull = open(os.devnull, "w")
    try:
        has_hostnamectl = subprocess.call(["which", "hostnamectl"], stdout=devnull, stderr=devnull) == 0
    finally:
        # Close the handle even if launching `which` raises.
        devnull.close()

    if not has_hostnamectl:
        return

    pipe = os.popen("hostnamectl")
    try:
        hostname_output = pipe.read()
    finally:
        pipe.close()
    write_log_output(None, None, status_debug, empty_failure_reason, "Machine Information:" + hostname_output)
def check_os_version():
    """Check that ``platform.platform()`` names a supported distribution.

    Logs a Passed result when the platform string contains one of the
    supported distribution tags (case-insensitive), otherwise a Failed result
    carrying the URL of the supported-versions page.
    """
    rule_id = "Linux.OperatingSystemCheck"
    rule_group_id = "prerequisites"
    os_version = platform.platform()
    supported_os_url = "https://docs.microsoft.com/en-us/azure/automation/automation-update-management#clients"

    # We support (Ubuntu 14.04, Ubuntu 16.04, Ubuntu 18.04, SuSE 11, SuSE 12, Redhat 6, Redhat 7, CentOs 6, CentOs 7)
    supported_os_tags = (
        "Ubuntu-14.04",
        "Ubuntu-16.04",
        "Ubuntu-18.04",
        "SuSE-11",
        "SuSE-12",
        "redhat-6",
        "redhat-7",
        "centos-6",
        "centos-7",
    )
    # re.escape keeps the "." in the version numbers literal; unescaped it
    # would match any character (e.g. "Ubuntu-14x04").
    is_supported = any(
        re.search(re.escape(tag), os_version, re.IGNORECASE)
        for tag in supported_os_tags
    )

    if is_supported:
        write_log_output(rule_id, rule_group_id, status_passed, empty_failure_reason, "Operating system version is supported")
    else:
        log_msg = "Operating System version (%s) is not supported. Supported versions listed here: %s" % (os_version, supported_os_url)
        write_log_output(rule_id, rule_group_id, status_failed, empty_failure_reason, log_msg, supported_os_url)
def check_oms_agent_installed():
    """Check that the agent's omsadmin.conf and omsagent.log files both exist.

    On success the contents of omsadmin.conf are also logged as a debug entry;
    on failure a pointer to the troubleshooting guide is logged.
    """
    rule_id = "Linux.OMSAgentInstallCheck"
    rule_group_id = "servicehealth"
    oms_agent_troubleshooting_url = "https://github.com/Microsoft/OMS-Agent-for-Linux/blob/master/docs/Troubleshooting.md"

    if not (os.path.isfile(oms_admin_conf_path) and os.path.isfile(oms_agent_log)):
        write_log_output(rule_id, rule_group_id, status_failed, empty_failure_reason, "Microsoft Monitoring agent is not installed", oms_agent_troubleshooting_url)
        write_log_output(rule_id, rule_group_id, status_debug, empty_failure_reason, "Microsoft Monitoring agent troubleshooting guide:" + oms_agent_troubleshooting_url)
        return

    write_log_output(rule_id, rule_group_id, status_passed, empty_failure_reason, "Microsoft Monitoring agent is installed")

    oms_admin_file = open(oms_admin_conf_path, "r")
    try:
        # Every line is followed by a tab, with one leading tab, so the dump
        # appears indented in the report.
        oms_admin_file_content = "\t" + "".join([line + "\t" for line in oms_admin_file])
    finally:
        # Close the handle even if reading fails.
        oms_admin_file.close()
    write_log_output(rule_id, rule_group_id, status_debug, empty_failure_reason, "omsadmin.conf file contents:\n" + oms_admin_file_content)
def check_oms_agent_running():
    """Report whether an ``omsagent`` process shows up in the process list.

    When it does not, the captured ``ps`` output and a link to the
    troubleshooting guide are logged as debug entries.
    """
    rule_id = "Linux.OMSAgentStatusCheck"
    rule_group_id = "servicehealth"
    oms_agent_troubleshooting_url = "https://github.com/Microsoft/OMS-Agent-for-Linux/blob/master/docs/Troubleshooting.md"

    running, process_listing = is_process_running("omsagent", ["omsagent.log", "omsagent.conf"], "OMS Agent")
    if running:
        write_log_output(rule_id, rule_group_id, status_passed, empty_failure_reason, "Microsoft Monitoring agent is running")
        return

    write_log_output(rule_id, rule_group_id, status_failed, empty_failure_reason, "Microsoft Monitoring agent is not running", oms_agent_troubleshooting_url)
    write_log_output(rule_id, rule_group_id, status_debug, empty_failure_reason, process_listing)
    write_log_output(rule_id, rule_group_id, status_debug, empty_failure_reason, "Microsoft Monitoring agent troubleshooting guide:" + oms_agent_troubleshooting_url)
def check_multihoming():
    """Check that the machine is registered with exactly one workspace.

    Top-level sub-directories of the agent directory whose names are at least
    32 characters long are treated as workspaces.
    NOTE(review): presumably those are workspace-id (GUID) directories -
    confirm against the agent's on-disk layout.

    Results:
        * Failed / ``NoWorkspace`` - the agent directory is missing, or it
          contains no workspace directory at all.
        * Failed - more than one workspace directory was found.
        * Passed - exactly one workspace directory was found.
    """
    rule_id = "Linux.MultiHomingCheck"
    rule_group_id = "servicehealth"
    not_registered_msg = "Machine is not registered with log analytics workspace."

    if not os.path.isdir(oms_agent_dir):
        write_log_output(rule_id, rule_group_id, status_failed, "NoWorkspace", not_registered_msg)
        return

    # Only the top level matters: take the first entry walk() yields (empty
    # if it yields nothing) and keep its directory names.
    top_level_dirs = next(walk(oms_agent_dir), (None, [], None))[1]
    potential_workspaces = [name for name in top_level_dirs if len(name) >= 32]

    if not potential_workspaces:
        # No workspace directory means the machine is not registered; this
        # must not be reported as a pass with an empty workspace list.
        write_log_output(rule_id, rule_group_id, status_failed, "NoWorkspace", not_registered_msg)
        return

    workspace_id_list = str(potential_workspaces)
    if len(potential_workspaces) > 1:
        write_log_output(rule_id, rule_group_id, status_failed, empty_failure_reason, "Machine registered with more than one log analytics workspace. List of workspaces:" + workspace_id_list, workspace_id_list)
    else:
        write_log_output(rule_id, rule_group_id, status_passed, empty_failure_reason, "Machine registered with log analytics workspace:" + workspace_id_list, workspace_id_list)
def check_hybrid_worker_package_present():
    """Report whether the nxOMSAutomationWorker module files exist on disk."""
    rule_id = "Linux.HybridWorkerPackgeCheck"
    rule_group_id = "servicehealth"

    # Both files must exist for the package to count as present.
    required_files = (
        "/opt/microsoft/omsconfig/modules/nxOMSAutomationWorker/VERSION",
        "/opt/microsoft/omsconfig/modules/nxOMSAutomationWorker/DSCResources/MSFT_nxOMSAutomationWorkerResource/automationworker/worker/configuration.py",
    )
    package_present = all(os.path.isfile(required) for required in required_files)

    if not package_present:
        write_log_output(rule_id, rule_group_id, status_failed, empty_failure_reason, "Hybrid worker package is not present")
    else:
        write_log_output(rule_id, rule_group_id, status_passed, empty_failure_reason, "Hybrid worker package is present")
def check_hybrid_worker_running():
    """Check the hybrid worker state using the settings stored in Current.mof.

    Steps:
        1. Fail with ``MissingCurrentMofFile`` when Current.mof is absent.
        2. Detect the file encoding with ``file -b --mime-encoding`` and read
           the ``ResourceSettings`` line.
        3. Load ``nxOMSAutomationWorker.py`` from the Scripts directory that
           matches the running Python version, then use its
           ``read_settings_from_mof_json`` and ``Test_Marshall`` functions to
           decide the result.
    """
    rule_id = "Linux.HybridWorkerStatusCheck"
    rule_group_id = "servicehealth"

    if not os.path.isfile(current_mof):
        write_log_output(rule_id, rule_group_id, status_failed, "MissingCurrentMofFile", "Hybrid worker is not running. current_mof file:(" + current_mof + ") is missing", current_mof)
        return

    pipe = os.popen("file -b --mime-encoding " + current_mof)
    try:
        # Drop the trailing newline so a clean codec name reaches codecs.open.
        current_mof_encoding = pipe.read().strip()
    finally:
        pipe.close()

    resourceSetting = find_line_in_file("ResourceSettings", current_mof, current_mof_encoding)
    if resourceSetting is None:
        write_log_output(rule_id, rule_group_id, status_failed, empty_failure_reason, "Hybrid worker is not running")
        write_log_output(rule_id, rule_group_id, status_debug, empty_failure_reason, "Unable to get ResourceSettings from current_mof file:(" + current_mof + ") with file encoding:" + current_mof_encoding)
        return

    # Remove the MOF escaping (backslashes, the trailing ';' and the quotes
    # around the [...] array) so that only the array text remains.
    resourceSetting = resourceSetting.replace("\\", "")
    resourceSetting = resourceSetting.replace(";", "")
    resourceSetting = resourceSetting.replace("\"[", "[")
    resourceSetting = resourceSetting.replace("]\"", "]")
    # Split on the first '=' only; the value itself may contain '='.
    resourceSetting = resourceSetting.split("=", 1)[1].strip()

    automation_worker_path = "/opt/microsoft/omsconfig/Scripts/"
    # Index access rather than .major/.minor: the named attributes only exist
    # from Python 2.7 on, while this script also accepts older 2.x versions.
    if sys.version_info[0] == 2:
        if sys.version_info[1] >= 6:
            automation_worker_path += "2.6x-2.7x"
        else:
            automation_worker_path += "2.4x-2.5x"

    # The module is loaded through a path relative to the working directory.
    # Restore the caller's working directory afterwards so later relative
    # paths (e.g. main()'s output_path) are not silently redirected.
    original_cwd = os.getcwd()
    os.chdir(automation_worker_path)
    try:
        nxOMSAutomationWorker = imp.load_source("nxOMSAutomationWorker", "./Scripts/nxOMSAutomationWorker.py")
        settings = nxOMSAutomationWorker.read_settings_from_mof_json(resourceSetting)

        if not settings.auto_register_enabled:
            write_log_output(rule_id, rule_group_id, status_failed, "UpdateDeploymentDisabled", "Hybrid worker is not running", current_mof)
            write_log_output(rule_id, rule_group_id, status_debug, empty_failure_reason, "Update deployment solution is not enabled. ResourceSettings:" + resourceSetting)
            return

        if nxOMSAutomationWorker.Test_Marshall(resourceSetting) == [0]:
            write_log_output(rule_id, rule_group_id, status_passed, empty_failure_reason, "Hybrid worker is running")
        else:
            write_log_output(rule_id, rule_group_id, status_failed, empty_failure_reason, "Hybrid worker is not running")
            write_log_output(rule_id, rule_group_id, status_debug, empty_failure_reason, "ResourceSettings:" + resourceSetting + " read from current_mof file:(" + current_mof + ")")
            write_log_output(rule_id, rule_group_id, status_debug, empty_failure_reason, "nxOMSAutomationWorker.py path:" + automation_worker_path)
    finally:
        os.chdir(original_cwd)
def check_general_internet_connectivity():
    """Report internet connectivity by probing bing.com and google.com.

    Both probes must succeed; google.com is only probed when bing.com was
    reachable.
    """
    rule_id = "Linux.InternetConnectionCheck"
    rule_group_id = "connectivity"

    is_connected = check_endpoint(None, "bing.com") and check_endpoint(None, "google.com")
    if not is_connected:
        write_log_output(rule_id, rule_group_id, status_failed, empty_failure_reason, "Machine is not connected to internet")
        return
    write_log_output(rule_id, rule_group_id, status_passed, empty_failure_reason, "Machine is connected to internet")
def check_agent_service_endpoint():
    """Probe the registration (agent service) endpoint taken from omsadmin.conf.

    Fails with ``UnableToGetEndpoint`` when the endpoint cannot be determined;
    otherwise the result reflects the TCP probe.
    """
    rule_id = "Linux.AgentServiceConnectivityCheck"
    rule_group_id = "connectivity"

    endpoint_host = get_agent_endpoint()
    if endpoint_host is None:
        write_log_output(rule_id, rule_group_id, status_failed, "UnableToGetEndpoint", "Unable to get the registration (agent service) endpoint")
        return

    if not check_endpoint(None, endpoint_host):
        write_log_output(rule_id, rule_group_id, status_failed, empty_failure_reason, "TCP test for {" + endpoint_host + "} (port 443) failed", endpoint_host)
        return

    write_log_output(rule_id, rule_group_id, status_passed, empty_failure_reason, "TCP test for {" + endpoint_host + "} (port 443) succeeded", endpoint_host)
def check_jrds_endpoint(workspace):
    """Probe the operations (JRDS) endpoint configured for ``workspace``.

    Args:
        workspace: Workspace id used to locate the worker configuration and
            to expand a wildcard endpoint; may be None.
    """
    rule_id = "Linux.JRDSConnectivityCheck"
    rule_group_id = "connectivity"

    endpoint_host = get_jrds_endpoint(workspace)
    if endpoint_host is None:
        write_log_output(rule_id, rule_group_id, status_failed, "UnableToGetEndpoint", "Unable to get the operations (JRDS) endpoint")
        return

    # From here on the endpoint is known to be set.
    if not check_endpoint(workspace, endpoint_host):
        write_log_output(rule_id, rule_group_id, status_failed, empty_failure_reason, "TCP test for {" + endpoint_host + "} (port 443) failed", endpoint_host)
        return

    write_log_output(rule_id, rule_group_id, status_passed, empty_failure_reason, "TCP test for {" + endpoint_host + "} (port 443) succeeded", endpoint_host)
def check_log_analytics_endpoints():
    """Probe each Log Analytics endpoint for the detected region.

    The endpoint list depends on is_fairfax_region(). A ``*`` in an endpoint
    is replaced with the module-level ``workspace`` id when one is known.
    Each endpoint gets its own numbered rule id (rule id + 1, 2, ...).
    """
    rule_id = "Linux.LogAnalyticsConnectivityCheck"
    rule_group_id = "connectivity"

    if is_fairfax_region() is True:
        endpoints = ["usge-jobruntimedata-prod-1.usgovtrafficmanager.net", "usge-agentservice-prod-1.usgovtrafficmanager.net",
                     "*.ods.opinsights.azure.us", "*.oms.opinsights.azure.us"]
    else:
        endpoints = ["*.ods.opinsights.azure.com", "*.oms.opinsights.azure.com", "ods.systemcenteradvisor.com"]

    for position, endpoint in enumerate(endpoints, 1):
        if "*" in endpoint and workspace is not None:
            endpoint = endpoint.replace("*", workspace)

        numbered_rule_id = rule_id + str(position)
        if check_endpoint(workspace, endpoint):
            write_log_output(numbered_rule_id, rule_group_id, status_passed, empty_failure_reason, "TCP test for {" + endpoint + "} (port 443) succeeded", endpoint)
        else:
            write_log_output(numbered_rule_id, rule_group_id, status_failed, empty_failure_reason, "TCP test for {" + endpoint + "} (port 443) failed", endpoint)
def check_endpoint(workspace, endpoint):
    """Return True when a TCP connection to ``endpoint`` on port 443 succeeds.

    Args:
        workspace: Workspace id substituted for ``*`` in ``endpoint``; may be
            None.
        endpoint: Host name, optionally containing a ``*`` placeholder.

    Returns:
        bool: False when the placeholder cannot be expanded (``workspace`` is
        None), when the connection attempt returns a non-zero code, or when
        it raises (e.g. the host name cannot be resolved).
    """
    if "*" in endpoint:
        if workspace is None:
            # Wildcard endpoints cannot be probed without a workspace id.
            return False
        endpoint = endpoint.replace("*", workspace)

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        return sock.connect_ex((endpoint, 443)) == 0
    except Exception:
        # Deliberately best-effort: any failure counts as "not reachable".
        return False
    finally:
        # Always release the socket; it used to be left open.
        sock.close()
def get_jrds_endpoint(workspace):
    """Return the JRDS host name from the workspace's worker.conf, or None.

    None is returned when ``workspace`` is None or when no ``jrds_base_uri``
    line is found.
    """
    if workspace is None:
        return None

    worker_conf_path = "/var/opt/microsoft/omsagent/%s/state/automationworker/worker.conf" % (workspace)
    uri_line = find_line_in_file("jrds_base_uri", worker_conf_path)
    if uri_line is None:
        return None

    # "<key>=<scheme>://<host>/..." -> the third "/"-separated piece is the host.
    uri = uri_line.split("=")[1]
    host = uri.split("/")[2]
    return host.strip()
def get_agent_endpoint():
    """Return the agent-service host name found in omsadmin.conf, or None."""
    endpoint_line = find_line_in_file("agentservice", oms_admin_conf_path)
    if endpoint_line is None:
        return None

    # Fetch the text after https://
    uri = endpoint_line.split("=")[1]
    host = uri.split("/")[2]
    return host.strip()
def is_process_running(process_name, search_criteria, output_name):
    """Look for ``process_name`` in the ``ps aux`` listing.

    Args:
        process_name: Text passed to ``grep`` to filter the process list.
        search_criteria: Iterable of strings; finding any one of them in the
            filtered listing counts as "running".
        output_name: Not used by this function.

    Returns:
        tuple: ``(is_running, filtered_ps_output)``.
    """
    command = "ps aux | grep %s | grep -v grep" % (process_name)
    listing = os.popen(command).read()
    found = any(marker in listing for marker in search_criteria)
    return found, listing
def get_workspace():
    """Return the value of the first ``WORKSPACE`` line in omsadmin.conf, or None."""
    workspace_line = find_line_in_file("WORKSPACE", oms_admin_conf_path)
    if workspace_line is None:
        return None
    value = workspace_line.split("=")[1]
    return value.strip()
def is_fairfax_region():
    """Tell whether the ``OMS_ENDPOINT`` value in omsadmin.conf contains ".us".

    Returns:
        True or False when an ``OMS_ENDPOINT`` line exists, None otherwise.
    """
    endpoint_line = find_line_in_file("OMS_ENDPOINT", oms_admin_conf_path)
    if endpoint_line is None:
        return None
    endpoint_value = endpoint_line.split("=")[1]
    return ".us" in endpoint_value
def find_line_in_file(search_text, path, file_encoding=""):
    """Return the first line of ``path`` that contains ``search_text``.

    Args:
        search_text: Substring to look for.
        path: File to scan.
        file_encoding: Codec name used to decode the file; the empty string
            (default) opens the file with the built-in ``open``.

    Returns:
        The matching line, including its line terminator, or None when
        ``path`` is not a regular file or no line matches.
    """
    if not os.path.isfile(path):
        return None

    if file_encoding == "":
        current_file = open(path, "r")
    else:
        current_file = codecs.open(path, "r", file_encoding)

    try:
        for line in current_file:
            if search_text in line:
                return line
        return None
    finally:
        # Close the file even if reading or decoding a line raises.
        current_file.close()
def write_log_output(rule_id, rule_group_id, status, failure_reason, log_msg, *result_msg_args):
    """Record one check result.

    Every call appends ``"<status>: <log_msg>\\n"`` to the module-level
    ``output`` list. Calls whose status is not the debug status additionally
    append a RuleInfo to ``rule_info_list``.

    Args:
        rule_id: Rule identifier; may be None for debug entries.
        rule_group_id: Rule group identifier; may be None for debug entries.
        status: One of the module's status labels.
        failure_reason: Suffix appended to the result message id, or
            ``empty_failure_reason`` for none.
        log_msg: Message text; non-string values are converted to text.
        *result_msg_args: Values stored as the RuleInfo's message arguments.
    """
    if not isinstance(log_msg, str):
        try:
            log_msg = str(log_msg)
        except UnicodeError:
            # Python 2: str() cannot convert unicode text containing
            # non-ASCII characters (e.g. lines read through codecs.open).
            log_msg = log_msg.encode("utf-8")

    if status != status_debug:
        result_msg_id = rule_id + "." + status
        if failure_reason != empty_failure_reason:
            result_msg_id += "." + failure_reason

        current_rule_info = RuleInfo(rule_id, rule_group_id, status, result_msg_id)
        current_rule_info.CheckResultMessageArguments.extend(result_msg_args)
        rule_info_list.append(current_rule_info)

    output.append(status + ": " + log_msg + "\n")
if __name__ == "__main__":
    # Forward at most the first two command-line arguments
    # (output_path, return_json_output); main() supplies defaults for any
    # that are missing.
    main(*sys.argv[1:3])