二分法查找指定时间的日志

二分法是一种很基础的查找方法。试想一个场景:应用的访问量非常大,单天的日志单个文件就超过100G,需要准实时地统计出TPM(每分钟请求数)。如果没有Storm之类的高级工具,只能自己写脚本统计的话,其实不太好搞。这个时候可以试试每次用二分法找出上一分钟的日志所在的偏移量,然后从该偏移量开始顺序读入日志进行处理,这样可以比较高效地跳过大量无关的日志。下面用Python简单写了一个:
[python]
#!/usr/bin/env python
import datetime
import re
import sys
import time
class logtools(object):
    """Count the log lines written during the previous minute (TPM).

    The log is assumed to be ordered by time and to carry an ``HH:MM:SS``
    time of day, preceded by whitespace, on its lines.  Instead of reading
    the whole file, a binary search over byte offsets locates a position
    shortly before the entries of interest, and only the tail of the file
    is scanned from there.

    NOTE: timestamps in the log have no date, so every parsed time is
    combined with today's date.  A file that spans midnight is therefore
    not ordered from this class's point of view.

    The code runs unchanged on Python 2 and Python 3.
    """

    # First whitespace-preceded "H:M:S" group on a line.  A bytes pattern is
    # used because the log is read in binary mode.
    _TIME_RE = re.compile(br"\s+(?P<hour>\d+):(?P<min>\d+):(?P<sec>\d+)")

    def __init__(self, filename="/xx/acess.log"):
        """Open *filename* for reading; print a message and exit(1) on failure."""
        self.logname = filename
        # Set before open() so __del__ always finds the attribute.
        self.fd = None
        try:
            # Binary mode keeps seek()/tell() working on plain byte offsets,
            # which the binary search relies on.
            self.fd = open(filename, "rb")
        except IOError:
            print("open log failed")
            sys.exit(1)

    def __del__(self):
        """Close the log file if it was opened."""
        fd = getattr(self, "fd", None)
        if fd is None:
            return
        try:
            fd.close()
        except (IOError, OSError):
            print("close fd failed")

    @staticmethod
    def _to_epoch(moment):
        """Return the naive local datetime *moment* as integer epoch seconds."""
        # time.mktime is the portable equivalent of the glibc-only
        # strftime("%s") directive.
        return int(time.mktime(moment.timetuple()))

    def get_last_min(self):
        """Describe the minute to be counted.

        Returns a 3-tuple:
          * epoch seconds of "two minutes ago", the time the binary search
            looks for (it always precedes the start of the counted minute);
          * a regex prefix ``\\s+HH:MM:`` for the minute one minute ago;
          * the plain ``HH:MM`` label of that minute.
        """
        # Sample the clock once so all three values describe the same instant.
        now = datetime.datetime.now()
        search_from = now - datetime.timedelta(minutes=2)
        label = (now - datetime.timedelta(minutes=1)).strftime("%H:%M")
        return (self._to_epoch(search_from), r"\s+" + label + ":", label)

    def get_current_min(self):
        """Return the time of the next timestamped line as epoch seconds.

        Reads forward from the current file position until a line with a
        valid time of day is found.  Returns ``None`` when the end of the
        file is reached first.
        """
        today = datetime.datetime.now()
        while True:
            line = self.fd.readline()
            if not line:
                return None
            match = self._TIME_RE.search(line)
            if not match:
                continue
            try:
                stamp = datetime.datetime(
                    year=today.year,
                    month=today.month,
                    day=today.day,
                    hour=int(match.group("hour")),
                    minute=int(match.group("min")),
                    second=int(match.group("sec")),
                )
            except ValueError:
                # Digits that look like a time but are out of range
                # (e.g. "99:99:99"): treat the line as carrying no timestamp.
                continue
            return self._to_epoch(stamp)

    def get_last_seek(self, last_time):
        """Binary-search for a byte offset at or before *last_time*.

        *last_time* is in epoch seconds.  The returned offset is the lower
        bound of the final search interval, so no line stamped at or after
        *last_time* lies before it.  The end-of-file offset is returned
        (after printing a message) when *last_time* is outside the range
        covered by the file, or when the file has no timestamp at all.
        """
        fd = self.fd
        fd.seek(0, 0)
        start_seek = 0
        start_time = self.get_current_min()
        # Twice the length of the line after the first timestamped one is
        # used as the granularity of the search.
        pos_off = len(fd.readline()) * 2
        fd.seek(0, 2)
        end_seek = fd.tell()
        if start_time is None:
            # No timestamp anywhere in the file: nothing can be counted.
            return end_seek

        # Probe the tail of the file for a timestamp.  If the window is too
        # small to contain one, keep doubling it; at offset 0 the probe is
        # guaranteed to succeed because start_time was found.
        window = max(pos_off, 1)
        while True:
            probe = max(end_seek - window, 0)
            fd.seek(probe, 0)
            end_time = self.get_current_min()
            if end_time is not None or probe == 0:
                break
            window *= 2

        if last_time < start_time:
            print("error last-time <start-time")
            return end_seek
        elif last_time > end_time:
            print("error %d > %d" % (last_time, end_time))
            return end_seek

        # A span of at least 2 bytes guarantees the midpoint lies strictly
        # inside the interval, so every iteration shrinks it.
        min_span = 2 * max(pos_off, 1)
        while end_seek - start_seek > min_span and end_time - start_time > 3:
            half_seek = (start_seek + end_seek) // 2
            fd.seek(half_seek, 0)
            half_time = self.get_current_min()
            if half_time is None or last_time <= half_time:
                end_seek = half_seek
                if half_time is not None:
                    end_time = half_time
            else:
                start_seek = half_seek
                start_time = half_time
        return start_seek

    def get_tpm(self):
        """Print and return the number of lines logged in the previous minute.

        The output line has the form ``"HH:MM qps N"``.
        """
        # One call only: three separate calls could straddle a minute
        # boundary and disagree with each other.
        search_from, minute_prefix, label = self.get_last_min()
        pattern = re.compile((minute_prefix + r"\d{2}").encode("ascii"))
        self.fd.seek(self.get_last_seek(search_from), 0)
        query = 0
        # readline() rather than file iteration, so the position set by
        # seek() is honoured on every Python version.
        for line in iter(self.fd.readline, b""):
            if pattern.search(line):
                query += 1
        print("%s qps %d" % (label, query))
        return query
# Script entry: the log file to analyse is the first command-line argument.
if len(sys.argv) < 2:
    # Without this check a missing argument surfaces as a bare IndexError.
    sys.stderr.write("usage: %s <logfile>\n" % sys.argv[0])
    sys.exit(1)
a = logtools(filename=sys.argv[1])
a.get_tpm()

[/python]

此条目发表在python分类目录。将固定链接加入收藏夹。

发表回复