系统监控程序

it2022-05-05  131

系统监控程序

只使用 Python 标准库,监控 CPU、load、memory、traffic、connection、diskspace、diskio

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""Nagios-style system checks built only on the Python standard library.

Each collector reads Linux /proc (or statvfs) data, compares it with a
critical threshold and reports through nagios_handle(), which prints one
status line with perfdata and terminates the process with the status code.

Rate-based checks (cpu, traffic, diskio) keep the previous sample in a
per-user JSON cache file under /tmp; the first run only creates the cache.
NOTE(review): the cache path is predictable and lives in a world-writable
directory; consider a private directory if untrusted local users exist.

The code runs unchanged on Python 2.6+ and Python 3.
"""

from optparse import OptionParser
import json
import os
import pwd
import re
import sys
import time

STATUS_OK = 0
STATUS_CRITICAL = 2
STATUS_UNKNOWN = 3

# Decimal (SI) scaling factors, largest first.
_UNITS = (
    (10 ** 15, 'P'),
    (10 ** 12, 'T'),
    (10 ** 9, 'G'),
    (10 ** 6, 'M'),
    (10 ** 3, 'K'),
)


def unit_convert(data):
    """Return *data* as a short decimal string with an SI suffix.

    Values below 1000 get no suffix.  Exact powers of 1000 are scaled too
    (the previous strict comparisons returned None for them).
    """
    value = float(data)
    for factor, suffix in _UNITS:
        if value >= factor:
            return str(round(value / factor, 2)) + suffix
    return str(round(value, 2))


def nagios_handle(status, status_info, perf_data):
    """Print the plugin output line and exit with *status*.

    status      -- 0 (OK), 1 (WARNING), 2 (CRITICAL) or 3 (UNKNOWN)
    status_info -- human readable text appended to the status label
    perf_data   -- perfdata as one string, or an iterable of strings that
                   is sorted and joined with spaces

    Raises SystemExit (via sys.exit); it never returns normally.
    """
    labels = {'0': 'OK', '1': 'WARNING', '2': 'CRITICAL', '3': 'UNKNOWN'}
    if not isinstance(perf_data, str):
        perf_data = ' '.join(sorted(perf_data))
    print(labels[str(status)] + status_info + ' |' + perf_data)
    sys.exit(status)


def _now():
    """Return the current Unix time as an integer number of seconds."""
    return int(time.time())


def _cache_path(kind):
    """Return the per-user cache file path for the given check kind."""
    return '/tmp/cache_%s_%s' % (kind, pwd.getpwuid(os.getuid())[0])


def _load_cache(path):
    """Return the cached sample dict, or None if missing or unreadable."""
    try:
        with open(path, 'r') as f:
            data = json.load(f)
    except (IOError, OSError, ValueError):
        return None
    return data if isinstance(data, dict) else None


def _store_cache(path, data):
    """Write *data* as JSON to *path*, replacing any previous sample."""
    with open(path, 'w') as f:
        json.dump(data, f, sort_keys=True)


def _counter_delta(new, old):
    """Return the increase of a monotonic counter between two samples.

    A smaller new value means the counter restarted (reboot, interface
    reset), so the new value itself is the best available estimate.
    """
    return new - old if new >= old else new


def _cpu_ticks(sample):
    """Return the sum of the tick counters tracked for one CPU sample."""
    return (sample['idle'] + sample['system'] +
            sample['user'] + sample['iowait'])


class CPUCollector(object):
    '''CPU Collector'''

    _CPU_LINE = re.compile(
        r'^(cpu\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)')

    def get_stats(self):
        """Return per-CPU tick counters read from /proc/stat.

        The result has the keys 'timestamp', 'total_cpu' and 'cpu', where
        'cpu' maps names such as 'cpu0' to user/system/idle/iowait ticks.
        """
        cpu = {}
        with open('/proc/stat', 'r') as f:
            for line in f:
                m = self._CPU_LINE.match(line)
                if m:
                    cpu[m.group(1)] = {
                        'user': int(m.group(2)),
                        'system': int(m.group(4)),
                        'idle': int(m.group(5)),
                        'iowait': int(m.group(6)),
                    }
        return {'timestamp': _now(), 'total_cpu': len(cpu), 'cpu': cpu}

    def check_status(self, threshold):
        """Report average CPU usage since the previous run.

        CRITICAL when the average usage in percent is >= *threshold*.
        """
        cache_file = _cache_path('cpu')
        old = _load_cache(cache_file)
        new = self.get_stats()
        # Store before reporting: nagios_handle() exits the process.
        _store_cache(cache_file, new)
        if old is None:
            nagios_handle(STATUS_OK, ': Buffer creation...', '')
            return

        perf_data = []
        usages = []
        old_cpus = old.get('cpu', {})
        for name, current in new['cpu'].items():
            previous = old_cpus.get(name)
            if previous is None:
                # CPU appeared since the last sample; no delta available.
                continue
            delta_total = _cpu_ticks(current) - _cpu_ticks(previous)
            delta_idle = current['idle'] - previous['idle']
            if delta_total > 0:
                cpu_usage = 100 - (100.0 * delta_idle / delta_total)
            else:
                # No ticks elapsed (or counters reset): avoid dividing by 0.
                cpu_usage = 0.0
            perf_data.append("'%s'=%.2f%%;;;0;100" % (name, cpu_usage))
            usages.append(cpu_usage)

        avg_cpu = sum(usages) / len(usages) if usages else 0.0
        perf_data.append("'total_cpu_avg'=%.2f%%;0:%d;0:%d;0;100"
                         % (avg_cpu, threshold, threshold))
        status = STATUS_OK if avg_cpu < threshold else STATUS_CRITICAL
        status_info = ': CPU(s) average usage is: %.2f%%' % avg_cpu
        nagios_handle(status, status_info, perf_data)


class LoadCollector(object):
    '''Load Collector'''

    def get_stats(self):
        """Return the 1/5/15 minute load averages as strings."""
        with open('/proc/loadavg', 'r') as f:
            load1, load5, load15 = f.readline().split()[0:3]
        return {'load1': load1, 'load5': load5, 'load15': load15}

    def get_cpu_core(self):
        """Return the number of CPU cores listed in /proc/cpuinfo.

        Cores are counted as distinct (physical id, core id) pairs so that
        identical core ids on different sockets are not merged.  When no
        'core id' lines exist the number of 'processor' entries is used,
        and the result is never below 1.
        """
        cores = set()
        processors = 0
        physical_id = None
        with open('/proc/cpuinfo', 'r') as f:
            for line in f:
                key, _, value = line.partition(':')
                key = key.strip()
                if key == 'processor':
                    processors += 1
                    physical_id = None
                elif key == 'physical id':
                    physical_id = value.strip()
                elif key == 'core id':
                    cores.add((physical_id, value.strip()))
        return len(cores) or processors or 1

    def check_status(self):
        """Report load averages; CRITICAL when load5 >= 8 * core count."""
        stats = self.get_stats()
        threshold = self.get_cpu_core() * 8
        # The values are strings; compare numerically, and OK means BELOW.
        if float(stats['load5']) < threshold:
            status = STATUS_OK
        else:
            status = STATUS_CRITICAL
        status_info = ': Load average: %s, %s, %s' % (
            stats['load1'], stats['load5'], stats['load15'])
        perf_data = [
            "'%s'=%s;0:%d;0:%d;0;" % (name, value, threshold, threshold)
            for name, value in stats.items()
        ]
        nagios_handle(status, status_info, perf_data)


class MemoryCollector(object):
    '''Memory Collector'''

    _MEM_LINE = re.compile(
        r'^(MemTotal|MemFree|Buffers|Cached):\s+(\d+) kB')

    def get_stats(self):
        """Return (total, free, buffers, cached) in bytes as floats.

        Fields absent from /proc/meminfo are reported as 0.0.
        """
        values = {'MemTotal': 0.0, 'MemFree': 0.0,
                  'Buffers': 0.0, 'Cached': 0.0}
        with open('/proc/meminfo', 'r') as f:
            for line in f:
                m = self._MEM_LINE.match(line)
                if m:
                    values[m.group(1)] = float(m.group(2)) * 1024
        return (values['MemTotal'], values['MemFree'],
                values['Buffers'], values['Cached'])

    def check_status(self, threshold):
        """Report memory usage; CRITICAL when used percent >= *threshold*."""
        total, free, buffers, cached = self.get_stats()
        used = total - (free + buffers + cached)
        percent = round(100.0 * used / total, 2) if total else 0.0
        status = STATUS_OK if percent < threshold else STATUS_CRITICAL
        status_info = (
            ': Ram Used (+buffers/cache): %sB (%.2f%%), Buffer: %sB, '
            'Cached: %sB, Total: %sB'
            % (unit_convert(used), percent, unit_convert(buffers),
               unit_convert(cached), unit_convert(total)))
        # The byte threshold is a share of TOTAL memory, not of used memory.
        limit = total * threshold / 100
        perf_data = (
            "'cached'=%d;;;0; 'buffer'=%d;;;0; 'used'=%d;0:%d;0:%d;0;%d"
            % (cached, buffers, used, limit, limit, total))
        nagios_handle(status, status_info, perf_data)


class ConnectionsCollector(object):
    '''Connections Collector'''

    # Hex state codes as printed in the 'st' column of /proc/net/tcp*.
    STATES = {
        '00': 'ERROR',
        '01': 'ESTABLISHED',
        '02': 'SYN_SENT',
        '03': 'SYN_RECV',
        '04': 'FIN_WAIT1',
        '05': 'FIN_WAIT2',
        '06': 'TIME_WAIT',
        '07': 'CLOSE',
        '08': 'CLOSE_WAIT',
        '09': 'LAST_ACK',
        '0A': 'LISTEN',
        '0B': 'CLOSING',
    }

    def get_stats(self, paths=('/proc/net/tcp6', '/proc/net/tcp')):
        """Return a dict mapping TCP state names to socket counts.

        paths -- socket tables to read; files that do not exist (for
                 example tcp6 with IPv6 disabled) are skipped.
        """
        data = dict.fromkeys(self.STATES.values(), 0)
        for path in paths:
            if not os.path.exists(path):
                continue
            with open(path, 'r') as f:
                for line in f:
                    fields = line.split()
                    # Socket rows start with '<slot>:'; the header does not.
                    if len(fields) < 4 or not fields[0].endswith(':'):
                        continue
                    if not fields[0][:-1].isdigit():
                        continue
                    state = self.STATES.get(fields[3].upper())
                    if state is not None:
                        data[state] += 1
        return data

    def check_status(self, threshold):
        """Report socket counts; CRITICAL when the total >= *threshold*."""
        stats = self.get_stats()
        total = sum(stats.values())
        status = STATUS_OK if total < threshold else STATUS_CRITICAL
        status_info = ': Total connections: %d' % total
        perf_data = ["'%s'=%d;;;0;" % (name.lower(), value)
                     for name, value in stats.items()]
        nagios_handle(status, status_info, perf_data)


class TrafficCollector(object):
    '''Traffic Collector'''

    _NAME_LINE = re.compile(r'\s*(\S+):\s*(.*)')
    _STAT_LINE = re.compile(r'\s*(\S+):\s*' + r'(\d+)\s+' * 16)

    def list_interface(self):
        """Print the name of every interface found in /proc/net/dev."""
        with open('/proc/net/dev', 'r') as f:
            for line in f:
                m = self._NAME_LINE.match(line)
                if m:
                    print(m.group(1))

    def get_stats(self):
        """Return received/transmitted byte counters per interface.

        The result has the keys 'timestamp' and 'nic'; 'nic' maps an
        interface name to {'in': rx_bytes, 'out': tx_bytes}.
        """
        nic = {}
        with open('/proc/net/dev', 'r') as f:
            for line in f:
                m = self._STAT_LINE.match(line)
                if m:
                    nic[m.group(1)] = {'in': int(m.group(2)),
                                       'out': int(m.group(10))}
        return {'timestamp': _now(), 'nic': nic}

    def check_status(self, interface, speed, percent):
        """Report the bit rate of *interface* since the previous run.

        interface -- interface name as shown in /proc/net/dev
        speed     -- link speed in Mb/s
        percent   -- critical threshold as a percentage of *speed*
        """
        max_bps = speed * 10 ** 6
        threshold = percent / 100.0 * max_bps
        cache_file = _cache_path('traffic')
        old = _load_cache(cache_file)
        new = self.get_stats()
        _store_cache(cache_file, new)

        if interface not in new['nic']:
            nagios_handle(STATUS_UNKNOWN,
                          ': Interface %s not found' % interface, '')
            return
        if old is None or interface not in old.get('nic', {}):
            nagios_handle(STATUS_OK, ': Buffer creation...', '')
            return

        current = new['nic'][interface]
        previous = old['nic'][interface]
        # Samples taken within the same second would otherwise divide by 0.
        elapsed = float(max(
            new['timestamp'] - old.get('timestamp', new['timestamp']), 1))
        traffic_in = _counter_delta(
            current['in'], previous['in']) / elapsed * 8
        traffic_out = _counter_delta(
            current['out'], previous['out']) / elapsed * 8

        if traffic_in < threshold and traffic_out < threshold:
            status = STATUS_OK
        else:
            status = STATUS_CRITICAL
        status_info = (
            ': Interface %s Traffic In : %sb/s (%.2f%%), Out : %sb/s (%.2f%%)'
            % (interface, unit_convert(traffic_in),
               traffic_in / max_bps * 100,
               unit_convert(traffic_out), traffic_out / max_bps * 100))
        perf_data = (
            "'traffic_in'=%.2fb/s;;0:%.1f;0;%.1f "
            "'traffic_out'=%.2fb/s;;0:%.1f;0;%.1f"
            % (traffic_in, threshold, max_bps,
               traffic_out, threshold, max_bps))
        nagios_handle(status, status_info, perf_data)


class DiskSpaceCollector(object):
    '''Disk Space Collector'''

    _MOUNT_LINE = re.compile(
        r'/dev/(sd[a-z][0-9]|md\S+)\s+(/\S*)\s+(\S+)\s(.*)')

    def list_mountpoint(self):
        """Print device, mount point and filesystem type of sd*/md* mounts."""
        with open('/proc/mounts', 'r') as f:
            for line in f:
                m = self._MOUNT_LINE.match(line)
                if m:
                    print('%s %s %s' % (m.group(1), m.group(2), m.group(3)))

    def get_stats(self, mountpoint):
        """Return (total, used, free, used_percent) for *mountpoint*.

        Sizes are in bytes.  'free' is the space available to unprivileged
        users (f_bavail).  Raises OSError when statvfs fails.
        """
        disk = os.statvfs(mountpoint)
        free = disk.f_bavail * disk.f_frsize
        total = disk.f_blocks * disk.f_frsize
        used = (disk.f_blocks - disk.f_bfree) * disk.f_frsize
        # Pseudo filesystems can report zero blocks.
        percent = 100.0 * used / total if total else 0.0
        return total, used, free, percent

    def check_status(self, mountpoint, threshold):
        """Report disk space; CRITICAL when free space <= *threshold* GB."""
        try:
            total, used, free, percent = self.get_stats(mountpoint)
        except OSError as err:
            nagios_handle(STATUS_UNKNOWN,
                          ": Storage '%s' unavailable: %s" % (mountpoint, err),
                          '')
            return
        threshold = threshold * 10 ** 9
        status = STATUS_OK if free > threshold else STATUS_CRITICAL
        status_info = (
            ": Storage '%s' Total: %sB Used: %sB (%.2f%%) Free: %sB (%.2f%%)"
            % (mountpoint, unit_convert(total), unit_convert(used), percent,
               unit_convert(free), 100 - percent))
        perf_data = ("'free'=%dB;@0:%d;@0:%d;0;%d"
                     % (free, threshold, threshold, total))
        nagios_handle(status, status_info, perf_data)


class DiskIOCollector(object):
    '''Disk IO Collector'''

    _DISK_NAME = re.compile(r'\s*\d+\s+\d+\s+(sd[a-z]|md\d+)\s+(.*)')
    _CPU_TOTAL = re.compile(r'^cpu\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)')
    _CPU_N = re.compile(r'cpu\d+')
    _DISK_LINE = re.compile(
        r'^\s*(\d+)\s+(\d+)\s+(sd[a-z]|md\d+)\s+' + r'(\d+)\s+' * 11)

    def list_partition(self):
        """Print every sdX/mdN device listed in /proc/diskstats."""
        with open('/proc/diskstats', 'r') as f:
            for line in f:
                m = self._DISK_NAME.match(line)
                if m:
                    print(m.group(1))

    def get_stats(self):
        """Return aggregate CPU ticks and per-disk I/O counters.

        The result has the keys 'timestamp', 'cpu' (aggregate ticks plus
        'total_cpu', the number of cpuN lines) and 'disk' (device name ->
        sector counts, read/write milliseconds and I/O busy milliseconds).
        """
        cpu = {'total_cpu': 0}
        disk = {}
        with open('/proc/stat', 'r') as f:
            for line in f:
                m = self._CPU_TOTAL.match(line)
                if m:
                    cpu['user'] = int(m.group(1))
                    cpu['system'] = int(m.group(3))
                    cpu['idle'] = int(m.group(4))
                    cpu['iowait'] = int(m.group(5))
                if self._CPU_N.match(line):
                    cpu['total_cpu'] += 1
        with open('/proc/diskstats', 'r') as f:
            for line in f:
                m = self._DISK_LINE.match(line)
                if m:
                    disk[m.group(3)] = {
                        'read_sector': int(m.group(6)),
                        'read_ms': int(m.group(7)),
                        'write_sector': int(m.group(10)),
                        'write_ms': int(m.group(11)),
                        'ms_ticks': int(m.group(13)),
                    }
        return {'timestamp': _now(), 'cpu': cpu, 'disk': disk}

    def check_status(self, partition, threshold):
        """Report I/O rates and utilisation of *partition*.

        CRITICAL when the utilisation in percent is >= *threshold*.
        """
        bytes_per_sector = 512.0
        interrupt_frequency = 1000.0
        cache_file = _cache_path('diskio')
        old = _load_cache(cache_file)
        new = self.get_stats()
        # Refresh the cache on every run, before nagios_handle() exits;
        # otherwise all later runs are measured against the first sample.
        _store_cache(cache_file, new)

        if partition not in new['disk']:
            nagios_handle(STATUS_UNKNOWN,
                          ': Partition %s not found' % partition, '')
            return
        if old is None or partition not in old.get('disk', {}):
            nagios_handle(STATUS_OK, ': Buffer creation...', '')
            return

        current = new['disk'][partition]
        previous = old['disk'][partition]
        elapsed = max(new['timestamp'] - old['timestamp'], 1)
        read_bytes = ((current['read_sector'] - previous['read_sector'])
                      * bytes_per_sector / elapsed)
        write_bytes = ((current['write_sector'] - previous['write_sector'])
                       * bytes_per_sector / elapsed)
        read_ms = current['read_ms'] - previous['read_ms']
        write_ms = current['write_ms'] - previous['write_ms']

        # Elapsed wall time in ms derived from the aggregate CPU tick delta
        # (ticks assumed to be 1/100 s), averaged over the CPU count.
        cpu_count = old['cpu'].get('total_cpu') or 1
        delta_ms = ((_cpu_ticks(new['cpu']) - _cpu_ticks(old['cpu']))
                    * interrupt_frequency / cpu_count / 100)
        if delta_ms > 0:
            utils = 100 * (current['ms_ticks'] - previous['ms_ticks']) / delta_ms
        else:
            utils = 0.0
        utils = 100 if utils > 100 else utils

        status = STATUS_OK if utils < threshold else STATUS_CRITICAL
        perf_data = (
            "'readio'=%sB/s;;;0; 'writeio'=%sB/s;;;0; 'readtime'=%dms;;;0; "
            "'writetime'=%dms;;;0; 'utils'=%.2f%%;;;0;100"
            % (read_bytes, write_bytes, read_ms, write_ms, utils))
        # Labels now follow the argument order (read first, then write).
        status_info = (
            ': Partition %s Read I/O: %sB/s, Write I/O: %sB/s, '
            'Read Time: %dms, Write Time: %dms, %%Utils: %.2f%%'
            % (partition, unit_convert(read_bytes), unit_convert(write_bytes),
               read_ms, write_ms, utils))
        nagios_handle(status, status_info, perf_data)


def main():
    """Parse the command line and run the requested check or listing.

    Checks terminate the process through nagios_handle().  Returns 0 after
    a listing option and 3 (UNKNOWN) when the arguments select nothing.
    """
    usage = ''' %prog [options] arg1 arg2

    example:
    . %prog --mode=cpu --critical=98 (unit: %)
    . %prog --mode=memory --critical=98 (unit: %)
    . %prog --mode=load
    . %prog --mode=connections --critical=30000 (unit: int)
    . %prog --mode=traffic --interface=em1 --speed=1000 --critical=90 (unit: %)
    . %prog --mode=diskspace --name=/opt --critical=5 (unit: GB)
    . %prog --mode=diskio --name=sda --critical=95 (unit: %)
    . %prog --mode=traffic --list-interface
    . %prog --mode=diskspace --list-partition
    . %prog --mode=diskio --list-disk
    '''
    parser = OptionParser(usage)
    parser.add_option('--mode', action='store', type='string', dest='mode',
                      help='check mode')
    # --interface is accepted as an alias because the usage text shows it.
    parser.add_option('--name', '--interface', action='store', type='string',
                      dest='name', help='check item name')
    parser.add_option('--critical', action='store', type='int',
                      dest='critical', help='Threshold critical')
    parser.add_option('--speed', action='store', type='int', dest='speed',
                      help='interface max speed, unit is Mb')
    parser.add_option('--list-interface', action='store_true',
                      dest='interfaces', help='list all NIC interfaces')
    parser.add_option('--list-partition', action='store_true',
                      dest='partitions', help='list all partitions')
    parser.add_option('--list-disk', action='store_true', dest='disks',
                      help='list all disks')
    (options, args) = parser.parse_args()

    mode = options.mode
    critical = options.critical
    is_percent = critical is not None and 1 <= critical <= 100

    if mode == 'cpu' and is_percent:
        CPUCollector().check_status(critical)
    if mode == 'load':
        LoadCollector().check_status()
    if mode == 'memory' and is_percent:
        MemoryCollector().check_status(critical)
    if mode == 'connections' and critical:
        ConnectionsCollector().check_status(critical)
    if mode == 'traffic' and options.name and options.speed and critical:
        TrafficCollector().check_status(options.name, options.speed, critical)
    if mode == 'diskspace' and options.name and critical:
        DiskSpaceCollector().check_status(options.name, critical)
    # 'disksio' is kept as an alias: older usage text spelled it that way.
    if mode in ('diskio', 'disksio') and options.name and critical:
        DiskIOCollector().check_status(options.name, critical)

    listed = False
    if options.interfaces:
        TrafficCollector().list_interface()
        listed = True
    if options.partitions:
        DiskSpaceCollector().list_mountpoint()
        listed = True
    if options.disks:
        DiskIOCollector().list_partition()
        listed = True
    if listed:
        return 0

    # Nothing matched: a silent exit 0 would look like a passing check.
    parser.print_help()
    return STATUS_UNKNOWN


if __name__ == '__main__':
    sys.exit(main())

# posted
on 2016-02-19 14:37 北京涛子 阅读( ...) 评论( ...) 编辑 收藏

转载于:https://www.cnblogs.com/liujitao79/p/5200956.html

相关资源:系统监控程序Upsilon.zip

最新回复(0)