nginx 日志分析，python 实现

kaola4549 发表于 2018-8-9 07:34:07

环境：　　

　　
OS：
　　
Linux: Linux version 2.6.32-431.el6.x86_64 (mockbuild@c6b8.bsys.dev.centos.org)
　　
Build: (gcc version 4.4.7 20120313 (Red Hat 4.4.7-4) (GCC) )
　　
Release: 2.6.32-431.el6.x86_64
　　
Version: #1 SMP Fri Nov 22 03:15:09 UTC 2013
　　
cpuinfo: GenuineIntel Intel(R) Core(TM) i3 CPU M 380 @ 2.53GHz
　　
cpuinfo: Hz=2527.069 bogomips=5054.13
　　
cpuinfo: ProcessorChips=1 PhyscalCores=2
　　
cpuinfo: Hyperthreads=0 VirtualCPUs =2
　　

　　
Python：Python 3.5.1
　　

　　
功能：
　　
   1. 支持总访问量和总流量的分析；
　　
   2. 支持HTTP各状态码的统计分析；
　　
   3. 支持对结果进行指定条目显示；
　　
   4. 支持分析指定时间内的统计；
　　

　　

　　
#!/usr/bin/env python3
　　
#-*- coding: utf8 -*-
　　

　　
import fileinput
　　
import re
　　
import time
　　
from collections import Counter
　　
import math
　　
import sys
　　
from datetime import datetime, timedelta
　　

　　
# Number of log entries to display; None means show all of them.
　　
records = None
　　

　　
def usage():
    """Print the command-line help text and terminate the script.

    The help covers the three positional arguments the script reads from
    ``sys.argv``: the log file, an optional number of entries to display
    and an optional look-back window such as ``2h``.

    Raises:
        SystemExit: always, with exit status 0.
    """
    # sys.argv[0] is the program name; formatting the whole list would print
    # its repr instead.
    print('Usage: %s nginx_log_file [records] [time]' % sys.argv[0])
    print('Usage: [records] for int number. eg: 10 ')
    print('Usage: [time] for <N>d, <N>h, <N>m or <N>s. eg: 2h ')
    print('eg: ./ngx.py /var/log/nginx/access.log ')
    sys.exit(0)
　　

　　
def tmstamp():
    """Return the timestamp that marks the start of the analysis window.

    The window is read from ``sys.argv[3]`` and must be a whole number
    followed by one unit letter: ``d`` (days), ``h`` (hours), ``m``
    (minutes) or ``s`` (seconds), e.g. ``2h``.

    Returns:
        ``0`` when no window argument was given, otherwise the POSIX
        timestamp (float) of "now minus the requested span".

    An argument that does not match the expected form is handed to
    ``usage()``.
    """
    if len(sys.argv) <= 3:
        # No window requested: a cut-off of 0 is older than any log entry.
        return 0

    parsed = re.match(r'^(\d+)([dhms])$', sys.argv[3])
    if parsed is None:
        usage()
        return None

    amount, unit = parsed.groups()
    # Translate the unit letter into the matching timedelta keyword.
    keyword = {'d': 'days', 'h': 'hours', 'm': 'minutes', 's': 'seconds'}[unit]
    span = timedelta(**{keyword: float(amount)})
    return (datetime.now() - span).timestamp()
　　

　　

　　

　　
def convertBytes(bytes, lst=('B', 'KB', 'MB', 'GB', 'TB', 'PB')):
    """Format a byte count as a string with a binary (1024-based) unit.

    Args:
        bytes: the number of bytes to format.
        lst: unit suffixes in ascending order; values too large for the
            last suffix are still expressed in that last unit.

    Returns:
        The value with two decimals followed by a space and the unit,
        e.g. ``'1.50 KB'``. Zero is rendered as ``'0.00 B'``.
    """
    value = float(bytes)
    i = 0
    # Repeated division avoids math.log(), which raises on 0 and yields a
    # negative index for values below 1.
    while value >= 1024 and i < len(lst) - 1:
        value /= 1024
        i += 1
    return '%.2f %s' % (value, lst[i])
　　

　　
def ngx():
    """Yield one parsed record per line of the log file named in sys.argv[1].

    Each line is split on runs of whitespace and double quotes. The fields
    are taken by position: the 1st is the client IP, the 4th the time, the
    9th the status code and the 10th the response size.

    Yields:
        ``(ip, status, size, dtstamp)`` where ``ip``, ``status`` and
        ``size`` are strings and ``dtstamp`` is the entry time converted
        with ``time.mktime`` (the UTC offset field is ignored).

    Lines with too few fields or an unparsable time are skipped. If the
    file cannot be opened, or no file argument was given, ``usage()`` is
    called.
    """
    try:
        with fileinput.input(sys.argv[1]) as f:
            for line in f:
                try:
                    ip, _, _, dtime, _, _, _, _, status, size, *_ = re.split(r'[\s"]+', line)
                    dtstamp = time.mktime(time.strptime(dtime.lstrip('['), '%d/%b/%Y:%H:%M:%S'))
                except ValueError:
                    # Malformed line: skip it rather than abort the whole run.
                    continue
                yield ip, status, size, dtstamp
    except (OSError, IndexError):
        usage()
　　

　　

　　
# Validate the command line: argv[1] is the log file, the optional argv[2] is
# the number of entries to display and the optional argv[3] is the look-back
# window (its format is checked by tmstamp()).
if len(sys.argv) < 2 or len(sys.argv) > 4:
    usage()
if len(sys.argv) < 3:
    # No entry limit requested: show every IP.
    records = None
elif re.match(r'^\d+$', sys.argv[2]):
    # Applies whether or not a time window follows as argv[3].
    records = int(sys.argv[2])
else:
    usage()
　　

　　

　　
# Per-IP counters: request count, bytes sent, and hits for each tracked HTTP
# status code. totsize accumulates the bytes sent across all IPs.
iptotal, ipsize = Counter(), Counter()
ip200, ip302, ip304, ip403 = Counter(), Counter(), Counter(), Counter()
ip404, ip500, ip502, ip503 = Counter(), Counter(), Counter(), Counter()
totsize = 0

# Maps a status code string to the counter that tracks it. Requests with any
# other status still count towards the per-IP totals.
counter_by_status = {
    '200': ip200,
    '302': ip302,
    '304': ip304,
    '403': ip403,
    '404': ip404,
    '500': ip500,
    '502': ip502,
    '503': ip503,
}

# Field names paired, in order, with each record expected from ngx().
header = ['ip', 'statuscode', 'size', 'dtstamp']

# Resolve the cut-off once so every line is compared against the same instant.
since = tmstamp()

for line in ngx():
    # Turn the positional record into a dict keyed by field name.
    datadict = dict(zip(header, line))

    # Only entries newer than the cut-off take part in the statistics.
    if datadict['dtstamp'] <= since:
        continue

    ip = datadict['ip']
    size = int(datadict['size'])

    # Bytes sent to this IP, and in total.
    ipsize[ip] += size
    totsize += size

    # Requests made by this IP.
    iptotal[ip] += 1

    # Requests per tracked status code.
    status_counter = counter_by_status.get(datadict['statuscode'])
    if status_counter is not None:
        status_counter[ip] += 1
　　

　　
# Print the report only when some traffic was counted; otherwise say so.
if totsize:
    # Site-wide totals: bytes transferred and number of requests.
    print("\nTotal traffic : %s  Total request times : %d\n" % (convertBytes(totsize), sum(iptotal.values())))

    # One fixed-width layout shared by the header, the separator and the rows.
    row_format = '%-15s %-10s %-12s %-8s %-8s %-8s %-8s %-8s %-8s %-8s %-8s'
    status_columns = (ip200, ip302, ip304, ip403, ip404, ip500, ip502, ip503)

    print(row_format % ('Ip', 'Times', 'Traffic', '200', '302', '304', '403', '404', '500', '502', '503'))
    print(row_format % ('-'*15, '-'*10, '-'*12, '-'*8, '-'*8, '-'*8, '-'*8, '-'*8, '-'*8, '-'*8, '-'*8))

    # Busiest IPs first; records limits the number of rows (None shows all).
    for k, v in iptotal.most_common(records):
        # Look up this IP's own value in every counter; a Counter returns 0
        # for an IP it has never seen.
        per_status = tuple(counter[k] for counter in status_columns)
        print(row_format % ((k, v, convertBytes(ipsize[k])) + per_status))
else:
    print('Not found data!')

页: [1]

运维网's Archiver

nginx 日志分析，python 实现