#!/usr/bin/env python3
"""Check that every external resource referenced by a page actually loads.

From the original (Chinese) blog post: "We often run into pages where some
js file or similar is missing; normally firefox's httpfox or chrome's
built-in tools can show which elements are broken. I wrote a python script
that parses the elements a page references, then requests each one to see
whether any of them fail to load. The urllib2 exception handling isn't
fully polished; this is mostly an exercise while getting used to Python
after switching from perl."

Reconstructed from a garbled Python 2 / urllib2 listing: ported to
urllib.request, smart quotes fixed, structure restored.
"""
import gzip
import re
import sys
import urllib.error
import urllib.request
from io import BytesIO

# Headers sent with every request; some servers reject empty user agents,
# and we advertise gzip so the body may come back compressed.
_REQUEST_HEADERS = {
    "User-agent": "python urllib browser",
    "Accept-Encoding": "gzip",
}


def gunziptxt(data):
    """Decompress a gzip-encoded response body and return the raw bytes."""
    buf = BytesIO(data)
    of = gzip.GzipFile(fileobj=buf, mode="rb")
    return of.read()


def http_code(url):
    """Return the HTTP status code for *url*.

    Returns the real status code on success or on an HTTP-level error
    (4xx/5xx), and the sentinel 1000 when the URL cannot be reached at
    all (DNS failure, connection refused, timeout, ...).
    """
    request = urllib.request.Request(url, headers=_REQUEST_HEADERS)
    try:
        response = urllib.request.urlopen(request, timeout=5)
        return response.getcode()
    except urllib.error.HTTPError as error:
        # BUG FIX: the original printed the literal string "url:" instead
        # of the URL that failed.
        print("%s:" % url, error.reason)
        return error.code
    except urllib.error.URLError as error:
        print(url, error.reason)
        return 1000


def http_client(url):
    """Fetch *url* and return its body as text, or None on any error.

    Transparently gunzips the body when the server honoured our
    Accept-Encoding: gzip header.
    """
    request = urllib.request.Request(url, headers=_REQUEST_HEADERS)
    try:
        response = urllib.request.urlopen(request, timeout=5)
        info = response.info()
        data = response.read()
    except urllib.error.HTTPError as error:
        print("%s error:%s" % (url, error.reason))
        return None
    except urllib.error.URLError as error:
        print(error.reason)
        return None
    if info.get("content-encoding", None) == "gzip":
        outdata = gunziptxt(data)
    else:
        outdata = data
    # urllib returns bytes; decode so get_src() can regex over text.
    return outdata.decode("utf-8", "replace")


def get_src(page):
    """Return the set of absolute http(s) URLs referenced via src= in *page*.

    Returns an empty set for None/empty input.
    """
    # BUG FIX: the original character class ["|\'] also treated a literal
    # '|' as a quote delimiter; only real quote characters belong here.
    src_re = re.compile(r'src\s*=\s*["\']\s*(https?://[^"\']+?)["\']')
    if page:
        return set(src_re.findall(page))
    return set()


if __name__ == "__main__":
    # Usage check moved under the __main__ guard so importing this module
    # never exits the interpreter.
    if len(sys.argv) < 2:
        print("usage:\n\t", sys.argv[0], "url")
        sys.exit(1)
    page = http_client(sys.argv[1])
    if not page:
        sys.exit(1)
    for link in get_src(page):
        code = http_code(link)
        if code > 399:
            # ANSI red for broken resources ...
            print("%s \x1B[1;31m%d\x1B[m" % (link, code))
        else:
            # ... green for ones that loaded fine.
            print("%s \x1B[1;32m%d\x1B[m" % (link, code))
else:
    print("pagecheck test")