二分法是很基础的一个查询方法。试想一个场景,应用的访问量非常大,单天的日志单个文件上100G,要准实时地统计出TPM的大小。没有什么storm之类的高级玩意,就自己写脚本进行统计的话其实不太好搞。这个时候可以试试每次用二分法找出上一分钟的日志所在的偏移量,然后再顺序读入日志进行处理,可以比较高效地跳过大量的日志。python简单写了个[python]#!/usr/bin/env pythonimport reimport datetimeimport sysclass logtools:"""this tools can get the bind qps and the ips which query with high frequency"""def __init__(self,filename="/xx/acess.log"):self.logname=filenametry:#print "logs is",filenameself.fd=open(filename,"r")except IOError:print "open log failed"sys.exit(1)def __del__(self):try:self.fd.close()except:print "close fd failed"def get_last_min(self):now=datetime.datetime.now()last=datetime.datetime.now()+datetime.timedelta(minutes=-2)qps_time=datetime.datetime.now()+datetime.timedelta(minutes=-1)t=qps_time.strftime(‘\s+%H:%M:’)t2=qps_time.strftime(‘%H:%M’)return (int(last.strftime("%s")),t,t2)def get_current_min(self):time_reg=re.compile("\s+(?P\d+):(?P\d+):(?P\d+)")now=datetime.datetime.now()i=1while True:line=self.fd.readline()if not  line:return Nonematch=time_reg.search(line)i=i+1if  match:match_time=datetime.datetime(year=now.year,month=now.month,day=now.day,hour=int(match.group("hour")),minute=int(match.group("min")),second=int(match.group("sec")),)breakreturn int(match_time.strftime("%s"))def get_last_seek(self,last_time):old_seek=self.fd.tell()self.fd.seek(0,0)start_seek=self.fd.tell()start_time=self.get_current_min()pos_off=len(self.fd.readline())*2self.fd.seek(0,2)end_seek=self.fd.tell()self.fd.seek(-pos_off,2)end_time=self.get_current_min()#print "time range:",start_time,last_time,end_time#print "pos_off:",pos_offif last_time < start_time:print "error last-timereturn end_seekelif  last_time > end_time:print "error %d > %d"%(last_time,end_time)return end_seektime=0while (end_seek – start_seek > 2*pos_off and end_time – start_time > 3) :half_seek=int((end_seek+start_seek)/2)self.fd.seek(half_seek,0)half_time=self.get_current_min()#print "%d –<%d>—%d"%(start_seek,half_seek,end_seek)if last_time<=half_time:end_seek=half_seekself.fd.seek(end_seek,0)end_time=self.get_current_min()else:start_seek=half_seekself.fd.seek(start_seek,0)start_time=self.get_current_min()time+=1#print "search %d times"%timereturn half_seekdef get_tpm(self):reg=self.get_last_min()[1]+"\d{2}"reg_time=self.get_last_min()[2]regex=re.compile(reg)time_pre=self.get_last_min()[0]pos=self.get_last_seek(time_pre)self.fd.seek(pos,0)query=0line=self.fd.readline()while line:if line == None:breakelif regex.search(line):query+=1line=self.fd.readline()print "%s qps %d"%(str(reg_time),query)a=logtools(filename=sys.argv[1])a.get_tpm()

[/python]