需求
我们的业务报警比较频繁。以前的做法是逐条处理每个报警,但同样的问题有时还会重复出现,后续处理也时常忘记跟进。因此我们对报警短信进行统计,以便针对突出的问题与业务方跟进,明确后续的优化方向。
实现
实现原理如下图:
其中核心部分zbx_statis,其实就是我编写的一个python脚本,它会从zabbixDB中查询过去一周的所有报警信息,并按不同维度统计每周的报表上传到公司的git上,同时将一条汇总的sql插入到cmdb的库表中展示。
报警格式依赖
报表的分析统计可以分两个维度:
报警类型维度;
业务维度;
不管从哪个维度进行的统计,都需要一个前提:报警格式规范化。
针对报警内容的需求,我们对zabbix的trigger名称、主机名hostname等进行了规范化。
举例:
[17][15:31:04][productname-test-mysql-00][PROBLEM][005][cpu idle too low (<30%)][0.10 %][负责人:张学岩][15:31:07]
productname-test-mysql-00 是主机名,按业务等级进行命名,用于报警统计的业务维度统计;
cpu idle too low (<30%) 是报警的类型,可以据此项进行类型维度的统计;
通过维护一个主要业务列表,然后根据hostname匹配,可以从业务维度进行统计;
通过将报警类型规范化,用固定的格式放在报警信息的固定位置,可以按类型进行统计。
报表展示
以下是截取的部分报表的展示。
按报警类型维度:
最下面的详细信息跳转即业务维度的统计。
按业务维度:
CMDB统计图表:
很直观的展示每周的报警数量,如果优化比较好的话,会看到整体应该是下降的趋势。
附件
上文提到的报警统计 python 脚本,写的时间比较久了,现在看内容还是比较杂乱,我也懒得改了,放出来供大家参考,内容如下:
#!/usr/bin/env python26
# encoding: utf-8
"""Weekly Zabbix alert statistics.

Reads last week's alert messages from the Zabbix database, groups them by
business line and by alert type, writes Markdown reports under BASE_DIR,
pushes them with git and inserts one summary row into the CMDB database.

The syntax is kept valid for both Python 2.6 (the interpreter named in the
shebang) and Python 3.
"""
import traceback
import copy
import datetime
import time
import operator
import sys

try:
    import MySQLdb
except ImportError:
    # Keep the pure statistics helpers importable without the driver; the
    # failure is reported by Connection.get_connection() instead.
    MySQLdb = None

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str/unicode conversions use UTF-8 so the
    # Chinese report text can be concatenated with database values.
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding("utf-8")

HOST = 'zabbix_db_host'
DB = 'zabbix'
PORT = 3306
RETRY_TIMES = 3
# Business lines; an alert belongs to the first entry found in its subject.
GROUP_TYPE = ['A', 'B', 'C', '...', 'X', 'Y', ]
BASE_DIR = 'alerts_statistic/'

# Reporting window: Monday of last week (inclusive) to Monday of this week
# (exclusive), both formatted as YYYYMMDD.
_TODAY = datetime.datetime.now()
_THIS_MONDAY = _TODAY - datetime.timedelta(days=_TODAY.weekday())
START_TIME = (_THIS_MONDAY - datetime.timedelta(days=7)).strftime("%Y%m%d")
END_TIME = _THIS_MONDAY.strftime("%Y%m%d")

# Totals filled in by write_group() and read when building the CMDB insert.
DAY_SUM = 0
NIGHT_SUM = 0

# Alerts at or after this hour count as daytime, earlier ones as night.
DAY_START_HOUR = 7
# Type used for subjects that do not follow the '][' separated format.
UNKNOWN_ALERT_TYPE = 'unknown'

# NOTE(review): the query groups by a.subject while selecting a.clock, which
# strict ONLY_FULL_GROUP_BY servers reject - confirm against the target DB.
ALERT_SQL = """select from_unixtime(a.clock),a.subject from alerts a,events b left join triggers c on b.objectid=c.triggerid where a.eventid=b.eventid and a.alerttype=0 and a.subject not like '%test-%' 
and a.subject not like '%-test%' and a.clock>={start_time} and a.clock< {end_time} group by a.subject order by a.clock"""


class Connection(object):
    """Holds MySQLdb connection arguments and opens the connection."""

    def __init__(self, *args, **kwargs):
        """Store the arguments, filling in credentials and defaults.

        'port' defaults to 3306 and 'db' to "information_schema" when the
        caller does not pass them; the connect timeout is forced to 1 second.
        """
        self.args = args
        self.kwargs = kwargs
        self.kwargs['user'] = "user"
        self.kwargs['passwd'] = "password"
        self.kwargs.setdefault('port', 3306)
        self.kwargs.setdefault('db', "information_schema")
        self.kwargs['connect_timeout'] = 1

    def get_connection(self):
        """Try to connect up to RETRY_TIMES times.

        Returns a dict with 'errno' (0 on success, -1 on failure), 'errmsg'
        and 'value' (the connection object, or None on failure).
        """
        ret = {"errno": 0, 'errmsg': "", 'value': None}
        if MySQLdb is None:
            ret['errno'] = -1
            ret['errmsg'] = "MySQLdb driver is not installed"
            sys.stderr.write(ret['errmsg'] + "\n")
            return ret
        for _ in range(RETRY_TIMES):
            # The try lives inside the loop so a failed attempt is retried.
            try:
                conn = MySQLdb.connect(*self.args, **self.kwargs)
            except Exception as err:
                ret['errno'] = -1
                # host may have been passed positionally, hence .get().
                ret['errmsg'] = str(self.kwargs.get('host', '')) + str(err)
                traceback.print_exc()
                continue
            if conn:
                ret['errno'] = 0
                ret['errmsg'] = ""
                ret['value'] = conn
                break
        return ret


def create_connection(*args, **kwargs):
    """Return an open MySQLdb connection, or None when connecting failed."""
    ret = Connection(*args, **kwargs).get_connection()
    if ret['errno']:
        return None
    return ret['value']


def _day_start_timestamp(day):
    """Return the local-time Unix timestamp of 00:00:00 on a YYYYMMDD day."""
    moment = datetime.datetime.strptime(day + ' 00:00:00', "%Y%m%d %H:%M:%S")
    return int(time.mktime(moment.timetuple()))


def get_alert():
    """Fetch the (time, subject) rows of the reporting window.

    Returns the rows from cursor.fetchall(), or None when the connection or
    the query failed (the traceback is printed).
    """
    start_timestamp = _day_start_timestamp(START_TIME)
    end_timestamp = _day_start_timestamp(END_TIME)
    conn = create_connection(host=HOST, db=DB, port=PORT, charset='utf8')
    if not conn:
        return None
    try:
        # Both placeholders are integers computed above, so formatting them
        # into the statement cannot inject SQL.
        sql = ALERT_SQL.format(start_time=start_timestamp,
                               end_time=end_timestamp)
        print(sql)
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            return cursor.fetchall()
        finally:
            cursor.close()
    except Exception:
        traceback.print_exc()
        return None
    finally:
        conn.close()


def alert_statistic(alert_list):
    """Split alerts by business line.

    alert_list is a sequence of (time, subject) rows. Each row is assigned to
    the first GROUP_TYPE entry that occurs in its subject; unmatched rows go
    to 'other'. The result also carries 'status': 0 when there were alerts,
    1 (and no other keys) when alert_list was empty or None.
    """
    result = {}
    if not alert_list:
        result['status'] = 1
        return result
    remaining = list(alert_list)
    for group in GROUP_TYPE:
        matched = []
        unmatched = []
        for alert in remaining:
            if group in alert[1]:
                matched.append(alert)
            else:
                unmatched.append(alert)
        result[group] = matched
        remaining = unmatched
    result['other'] = remaining
    result['status'] = 0
    return result


def write_to_file(result, day):
    """Write the per-business detail report for one half of the day.

    result maps group name to a list of (time, subject) rows; day is the
    label used in the file name and the heading. Nothing is written when
    result is empty.
    """
    if not result:
        return
    file_name = (BASE_DIR + 'detail/' + START_TIME + '-' + END_TIME +
                 '_' + day + '.md')
    # 'status' holds an int, not a list of alerts, so it is never counted.
    groups = [(name, alerts) for name, alerts in result.items()
              if name != 'status']
    alert_sum = sum(len(alerts) for _, alerts in groups)
    with open(file_name, 'w') as writer:
        writer.write('## ' + START_TIME + ' - ' + END_TIME + ': ' + day)
        writer.write('\n\n**短信总数:' + str(alert_sum) + '**')
        for name, alerts in groups:
            if len(alerts) == 0:
                continue
            writer.write('\n\n### ' + name + '(' + str(len(alerts)) + ')' +
                         '\n\n')
            writer.write('|报警内容|报警时间|\n|---|---|\n')
            for alert in alerts:
                writer.write('|' + str(alert[1]) + '|' + str(alert[0]) +
                             '|\n')


def _in_period(alert_time, day):
    """Tell whether alert_time falls in the requested half of the day."""
    is_daytime = alert_time.hour >= DAY_START_HOUR
    if day == 'light':
        return is_daytime
    if day == 'night':
        return not is_daytime
    # Any other selector keeps every alert, as the original branches did.
    return True


def day_night_split(result, day='light'):
    """Keep only the daytime ('light') or night-time ('night') alerts.

    Returns a new dict with the same groups as result minus 'status'; an
    empty dict when result is empty.
    """
    results = {}
    if not result:
        return results
    for group in result:
        if group == 'status':
            continue
        results[group] = [alert for alert in result[group]
                          if _in_period(alert[0], day)]
    return results


def _parse_subject(subject):
    """Return (alert_type, hostname) extracted from an alert subject.

    Subjects are '][' separated. With nine fields (the variant that carries
    a trigger id) the type is field 5, otherwise field 4; text after the
    first comma is dropped. Subjects too short to contain the field yield
    UNKNOWN_ALERT_TYPE and/or a None hostname instead of raising.
    """
    fields = subject.split('][')
    if len(fields) == 9:
        type_field = fields[5]
    elif len(fields) > 4:
        type_field = fields[4]
    else:
        type_field = None
    if type_field is None:
        alert_type = UNKNOWN_ALERT_TYPE
    else:
        alert_type = type_field.split(',')[0]
    hostname = fields[2] if len(fields) > 2 else None
    return alert_type, hostname


def alert_groupby(alert_list):
    """Count alerts per alert type.

    alert_list maps group name to (time, subject) rows; a 'status' key is
    ignored. Returns a list of dicts with 'type', 'hostlist' (distinct
    hostnames joined by ',') and 'count' (as a string), sorted by count in
    descending order; ties keep first-appearance order.
    """
    type_order = []
    stats = {}
    for group in alert_list:
        if group == 'status':
            continue
        for alert in alert_list[group]:
            alert_type, hostname = _parse_subject(alert[1])
            entry = stats.get(alert_type)
            if entry is None:
                entry = {'count': 0, 'hosts': []}
                stats[alert_type] = entry
                type_order.append(alert_type)
            entry['count'] += 1
            if hostname is not None and hostname not in entry['hosts']:
                entry['hosts'].append(hostname)
    group_list = []
    for alert_type in type_order:
        entry = stats[alert_type]
        group_list.append({
            'type': alert_type,
            'hostlist': ','.join(entry['hosts']),
            'count': str(entry['count']),
        })
    group_list.sort(key=lambda item: int(item['count']), reverse=True)
    return group_list


def _write_type_section(writer, title, alert_sum, groups, detail_file):
    """Write one heading, its per-type table and the detail-report link."""
    writer.write('### ' + title + str(alert_sum) + '\n\n')
    writer.write("|报警类型|报警数量|报警主机|\n|---|---|---|\n")
    for group in groups:
        writer.write("|" + group['type'] + "|" + group['count'] + "|" +
                     group['hostlist'] + "|\n")
    writer.write("\n[详细报警信息](detail/" + detail_file + ")\n\n")


def write_group(group_light, group_night):
    """Write the per-type summary report and record the day/night totals.

    Both arguments are lists as returned by alert_groupby(). DAY_SUM and
    NIGHT_SUM are always updated and the report is always written, so a week
    without alerts in one half of the day is still reported correctly.
    """
    global DAY_SUM, NIGHT_SUM
    group_light = group_light or []
    group_night = group_night or []
    DAY_SUM = sum(int(group['count']) for group in group_light)
    NIGHT_SUM = sum(int(group['count']) for group in group_night)

    period = START_TIME + '-' + END_TIME
    file_name = BASE_DIR + period + '.md'
    with open(file_name, 'w') as writer:
        writer.write('## ' + period + '\n\n')
        _write_type_section(writer, '白天:', DAY_SUM, group_light,
                            period + '_白天' + '.md')
        _write_type_section(writer, '夜间:', NIGHT_SUM, group_night,
                            period + '_夜间' + '.md')


def write_trend(sql):
    """Execute sql against the CMDB database and commit.

    Returns the rows from cursor.fetchall(), or None when the connection or
    the statement failed (the error is printed).
    """
    host = 'cmdb_host'
    db = 'cmdb_db'
    port = 3306
    conn = create_connection(host=host, db=db, port=port, charset='utf8')
    if not conn:
        return None
    try:
        print(sql)
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            ret = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return ret
    except Exception as e:
        print(e)
        return None
    finally:
        conn.close()


def git_push():
    """Commit and push the generated reports with the git command line."""
    import os
    # NOTE(review): the directory here is 'alerts_statis' while BASE_DIR is
    # 'alerts_statistic/' - confirm which one is the real repository path.
    status = os.system("cd alerts_statis && git add alerts_statis && git commit -m 'update' && git push")
    if status != 0:
        sys.stderr.write("git push failed with status %s\n" % status)


def main():
    """Run the weekly report: query, group, write files, push, record."""
    # Alert list
    alert_list = get_alert()
    # Statistics per business line
    result = alert_statistic(alert_list)
    if result['status'] != 0:
        print('there\'s no alert warning or something error.')
        return
    # Separate daytime from night-time
    result_day = day_night_split(result, 'light')
    result_night = day_night_split(result, 'night')
    # Group by alert type
    light_alert = alert_groupby(result_day)
    night_alert = alert_groupby(result_night)
    # Write the report files
    write_group(light_alert, night_alert)
    write_to_file(result_day, '白天')
    write_to_file(result_night, '夜间')
    git_push()
    sql = 'insert into alerts (start_time,end_time,all_count,day_count,night_count)values("' + START_TIME + '","' + END_TIME + '",' + str(DAY_SUM + NIGHT_SUM) + ',' + str(DAY_SUM) + ',' + str(NIGHT_SUM) + ');'
    # Record the totals in the CMDB
    write_trend(sql)


if __name__ == '__main__':
    main()