Implementing Web Crawlers and Spiders in Python

How do I extract the main text of a web page in Python? Thanks.

Fetch the page first:

import urllib.request
url = "http://..."  # URL truncated in the original
response = urllib.request.urlopen(url)
page = response.read()

Extracting the text from a web page in Python (Python 2, using sgmllib):

import os, sys, datetime
import httplib, urllib, re
from sgmllib import SGMLParser
import types

class Html2txt(SGMLParser):
    def reset(self):
        self.text = ''
        self.inbody = True
        SGMLParser.reset(self)

    def handle_data(self, text):
        # collect character data only while we are inside the body
        if self.inbody:
            self.text += text

    def start_head(self, text):
        self.inbody = False

    def end_head(self):
        self.inbody = True

if __name__ == "__main__":
    parser = Html2txt()
    parser.feed(urllib.urlopen("http://...").read())  # URL truncated in the original
    parser.close()
    print parser.text.strip()

Downloading a web page in Python:

import httplib
conn = httplib.HTTPConnection("...")  # host name truncated in the original
conn.request("GET", "/index.html")
r1 = conn.getresponse()
print r1.status, r1.reason
data = r1.read()
print data
conn.close()

Downloading a web page with Python is extremely simple:

from urllib import urlopen
webdata = urlopen("http://...").read()  # URL truncated in the original
print webdata

Dive Into Python also covers downloading web page content.
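
The sgmllib module used above only exists in Python 2, while the question's urllib.request import is Python 3. A minimal hedged sketch of the same idea for Python 3, using the standard html.parser module (the class name, helper names and example URL here are mine, not from the original):

from html.parser import HTMLParser
from urllib.request import urlopen

class Html2Txt(HTMLParser):
    """Collect text nodes, skipping the contents of <script> and <style> tags."""
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.parts = []
        self._skip = 0

    def handle_starttag(self, tag, attrs):
        if tag in ('script', 'style'):
            self._skip += 1

    def handle_endtag(self, tag):
        if tag in ('script', 'style') and self._skip:
            self._skip -= 1

    def handle_data(self, data):
        if not self._skip:
            self.parts.append(data)

    def text(self):
        return ''.join(self.parts).strip()

if __name__ == '__main__':
    parser = Html2Txt()
    html = urlopen('http://example.com/').read().decode('utf-8', 'ignore')  # placeholder URL
    parser.feed(html)
    print(parser.text())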

Downloading web page content in Python with the pycurl module

Downloading page content with Python works quite well. I had previously experimented with the urllib module, but I heard that the pycurl module is better than urllib, so I gave it a try. Without further ado, here is the code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import StringIO
import pycurl

def writefile(fstr, xfilename):
    f = open(xfilename, 'w')
    f.write(fstr)
    f.close()

html = StringIO.StringIO()
c = pycurl.Curl()
myurl = 'http://...'  # URL truncated in the original

c.setopt(pycurl.URL, myurl)

# write callback
c.setopt(pycurl.WRITEFUNCTION, html.write)

c.setopt(pycurl.FOLLOWLOCATION, 1)

# maximum number of redirects, to guard against redirect traps
c.setopt(pycurl.MAXREDIRS, 5)

# connection timeout settings
c.setopt(pycurl.CONNECTTIMEOUT, 60)
c.setopt(pycurl.TIMEOUT, 300)

# pretend to be a browser
c.setopt(pycurl.USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)")

# perform the request; this blocks until the transfer finishes
c.perform()

# print the HTTP status code, e.g. 200 (optional)
print c.getinfo(pycurl.HTTP_CODE)

# print the page content
print html.getvalue()
# save it to down.txt
writefile(html.getvalue(), "down.txt")

The pycurl module itself can be downloaded from http://... (URL truncated in the original).
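
A small hedged addition, not in the original: in practice it helps to catch pycurl.error and free the handle when the transfer is done, roughly like this (example.com is a placeholder URL):

import StringIO
import pycurl

buf = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, 'http://example.com/')  # placeholder URL
c.setopt(pycurl.WRITEFUNCTION, buf.write)
c.setopt(pycurl.CONNECTTIMEOUT, 60)
try:
    c.perform()
    print c.getinfo(pycurl.HTTP_CODE), len(buf.getvalue())
except pycurl.error, err:
    # err is a (curl error code, message) tuple
    print 'download failed:', err
finally:
    c.close()  # release the libcurl handle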

Several ways to download a web page in Python. These snippets are Python 2 and assume import urllib2, httplib, urlparse, socket, traceback at the top.

1. The most concise way, which is of course a GET:

fd = urllib2.urlopen(url_link)
data = fd.read()

2. GET via urllib2.Request:

def GetHtmlSource(url):
    try:
        htmSource = ''
        req = urllib2.Request(url)
        fd = urllib2.urlopen(req)
        while 1:
            data = fd.read(1024)
            if not len(data):
                break
            htmSource += data
        fd.close()
        del fd
        del req
        htmSource = htmSource.decode('cp936')
        htmSource = formatStr(htmSource)   # formatStr is a helper defined elsewhere in the original
        return htmSource
    except socket.error, err:
        str_err = "%s" % err
        return ""

3. GET via httplib:

def GetHtmlSource_Get(htmurl):
    htmSource = ""
    try:
        urlx = urlparse.urlsplit(htmurl)
        loc = urlx[1]                      # host[:port] part of the URL
        conn = httplib.HTTPConnection(loc)
        conn.connect()
        conn.putrequest("GET", htmurl, None)
        conn.putheader("Content-Length", 0)
        conn.putheader("Connection", "close")
        conn.endheaders()
        res = conn.getresponse()
        htmSource = res.read()
    except Exception, err:
        traceback.print_exc()
        conn.close()
    return htmSource

4. POST via httplib:

def GetHtmlSource_Post(getString):
    htmSource = ""
    try:
        url = urlparse.urlsplit("http://...:8080")  # host truncated in the original
        loc = url[1]                                # host[:port] part of the URL
        conn = httplib.HTTPConnection(loc)
        conn.connect()
        conn.putrequest("POST", "/sipo/zljs/hyjs-jieguo.jsp")
        conn.putheader("Content-Length", len(getString))
        conn.putheader("Content-Type", "application/x-www-form-urlencoded")
        conn.putheader("Connection", "Keep-Alive")
        conn.endheaders()
        conn.send(getString)
        f = conn.getresponse()
        if not f:
            raise socket.error, "timed out"
        htmSource = f.read()
        f.close()
        conn.close()
        return htmSource
    except Exception, err:
        traceback.print_exc()
        conn.close()
        return htmSource
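
The snippets above are all Python 2 (urllib2/httplib). For reference, a hedged Python 3 sketch of the same GET-over-a-raw-connection idea using http.client; the function name and example URL are mine, not from the original:

from urllib.parse import urlsplit
import http.client

def get_html_source(url):
    """Fetch a page with a raw http.client GET, mirroring GetHtmlSource_Get above."""
    parts = urlsplit(url)
    conn = http.client.HTTPConnection(parts.netloc, timeout=60)
    try:
        conn.request("GET", parts.path or "/", headers={"Connection": "close"})
        resp = conn.getresponse()
        return resp.read()
    finally:
        conn.close()

print(get_html_source("http://example.com/")[:200])  # placeholder URL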

This article comes from a CSDN blog; please credit the source when reprinting: http://... (URL truncated in the original)

Set up a management platform to coordinate the crawling work. Because I like the Django admin backend, this time I use it to manage the crawled links, so that my crawler can handle whatever comes up later, such as crawling in separate time slots or periodically re-crawling addresses that have already been fetched. The database is Python's built-in sqlite3, which makes things very convenient. These days I happen to be building a movie recommendation system and need some movie data, so the example in this article crawls specific data from Douban movies.

Step 1: Create the Django model

This imitates the crawling approach of Nutch, simplified here. Each crawl task starts by finding the links in the database that have not yet been saved (is_save = False) and putting them into the crawl list. You can also filter the links according to your own needs.

Python code:

class Crawl_URL(models.Model):
    url = models.URLField('抓取地址', max_length=100, unique=True)          # crawl address
    weight = models.SmallIntegerField('抓取深度', default=0)               # crawl depth, starting at 1
    is_save = models.BooleanField('是否已保存', default=False)             # already saved?
    date = models.DateTimeField('保存时间', auto_now_add=True, blank=True, null=True)   # save time

    def __unicode__(self):
        return self.url

Then generate the corresponding table. An admin backend is also needed:

class Crawl_URLAdmin(admin.ModelAdmin):
    list_display = ('url', 'weight', 'is_save', 'date',)
    ordering = ('-id',)
    list_filter = ('is_save', 'weight', 'date',)
    fields = ('url', 'weight', 'is_save',)

admin.site.register(Crawl_URL, Crawl_URLAdmin)
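
The post later notes that the first crawl needs a seed link added through the admin backend. As a hedged alternative, not shown in the original, the same thing can be done from a Django shell with the model above; the movie app name is taken from the imports further down, and the seed URL is a placeholder because the original's example link is truncated:

# python manage.py shell
from movie.models import Crawl_URL

# seed the crawler with a starting address (placeholder URL)
Crawl_URL.objects.get_or_create(url='http://...', defaults={'weight': 1})

# this is the queue the crawler works through: links not yet processed
print Crawl_URL.objects.filter(is_save=False).count()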

Step 2: Write the crawler code

The crawler is single-threaded and pauses after every fetch, because Douban blocks crawlers that hit it too hard. The crawler is driven by depth: on each pass it first collects the links to fetch, fetches them and parses out more links, then marks the fetched links with is_save=True and stores the new links in the database. After each depth level it takes quite a while to import the links into the database, because every link has to be checked against what is already stored. Data is parsed only for addresses that match the regular expression http://... (the pattern is truncated in the original), and links that are not in the movie section are simply ignored. For the first crawl you need to add a seed link in the admin backend, for example http://... (truncated). A way to run the crawler is sketched after the code below.

# coding=UTF-8
import re
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
from pysqlite2 import dbapi2 as sqlite
from movie.models import *
from django.contrib.auth.models import User
from time import sleep

image_path = 'C:/Users/soul/djcodetest/picture/'

user = User.objects.get(id=1)

def crawl(depth=10):
    for i in range(1, depth):
        print '开始抓取 for %d' % i   # "start crawling" pass number
        pages = Crawl_URL.objects.filter(is_save=False)
        newurls = {}
        for crawl_page in pages:
            page = crawl_page.url
            try:
                c = urllib2.urlopen(page)
            except:
                continue
            try:
                # parse metadata and urls
                soup = BeautifulSoup(c.read())
                # parse the movie page itself when the address matches the movie pattern
                if re.search(r'http://...', page):   # pattern truncated in the original
                    read_html(soup)
                # collect the links on the page
                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("...") != -1:    # the substring tested here is truncated in the original
                            continue
                        if len(url) > 60:
                            continue
                        url = url.split('#')[0]      # remove the location portion
                        if re.search(r'http://...', url):   # pattern truncated in the original
                            # the link is valid; store it in the dict
                            newurls[url] = crawl_page.weight + 1
                            try:
                                print 'add url :'
                            except:
                                pass
            except Exception, args:
                try:
                    print "Could not parse : %s" % args
                except:
                    pass
            # newurls goes into the database with is_save=False, weight=i
            crawl_page.is_save = True
            crawl_page.save()
            # sleep for 2.5 seconds
            sleep(2.5)
        save_url(newurls)

# save the urls into the database
def save_url(newurls):
    for (url, weight) in newurls.items():
        url = Crawl_URL(url=url, weight=weight)
        try:
            url.save()
        except:
            try:
                print 'url重复:'   # duplicate url
            except:
                pass
    return True
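
The original does not show how crawl() is started. A hedged sketch of one way to run it, as a Django management command; the file path and the assumption that the crawler code above lives in movie/crawler.py are mine:

# hypothetical file: movie/management/commands/run_crawler.py
from django.core.management.base import BaseCommand
from movie.crawler import crawl   # assumes the crawler code above was saved as movie/crawler.py

class Command(BaseCommand):
    help = 'Crawl Douban movie pages starting from the unsaved Crawl_URL entries'

    def handle(self, *args, **options):
        # a shallow run; the function above defaults to depth=10
        crawl(depth=3)

It could then be run with: python manage.py run_crawler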

Step 3: Parse the pages with BeautifulSoup

Extract the movie title, image, plot introduction, leading actors, tags and region. For how to use BeautifulSoup, see the BeautifulSoup technical documentation.

# scrape the data out of one movie page
def read_html(soup):
    # parse the title
    html_title = soup.html.head.title.string
    title = html_title[:len(html_title) - 5]
    # parse the movie introduction
    try:
        intro = soup.find('span', attrs={'class': 'all hidden'}).text
    except:
        try:
            node = soup.find('div', attrs={'class': 'blank20'}).previousSibling
            intro = node.contents[0] + node.contents[2]
        except:
            try:
                contents = soup.find('div', attrs={'class': 'blank20'}).previousSibling.previousSibling.text
                intro = contents[:len(contents) - 22]
            except:
                intro = u'暂无'   # "not available yet"

    # fetch the poster image; the regex and the tail of this expression are truncated in the original
    html_image = soup('a', href=re.compile('...'))[0]['href']
    data = urllib2.urlopen(html_image).read()
    image = '201003/' + html_image[html_image.rfind('/') + 1:]
    f = file(image_path + image, 'wb')
    f.write(data)
    f.close()

    # parse the region
    try:
        soup_obmo = soup.find('div', attrs={'class': 'obmo'}).findAll('span')
        html_area = soup_obmo[0].nextSibling.split('/')
        area = html_area[0].lstrip()
    except:
        area = ''

    # time = soup_obmo[1].nextSibling.split(' ')[1]
    # time = time.strptime(html_time, '%Y-%m-%d')

    # create the movie object
    new_movie = Movie(title=title, intro=intro, area=area, version=u'暂无', upload_user=user, image=image)
    new_movie.save()
    try:
        actors = soup.find('div', attrs={'id': 'info'}).findAll('span')[5].nextSibling.nextSibling.string.split(' ')[0]
        actors_list = Actor.objects.filter(name=actors)
        if len(actors_list) >= 1:
            actor = actors_list[0]
            new_movie.actors.add(actor)
        else:
            actor = Actor(name=actors)
            actor.save()
            new_movie.actors.add(actor)
    except:
        pass

    # tags
    tags = soup.find('div', attrs={'class': 'blank20'}).findAll('a')
    for tag_html in tags:
        tag_str = tag_html.string
        if len(tag_str) > 4:
            continue
        tag_list = Tag.objects.filter(name=tag_str)
        if len(tag_list) >= 1:
            tag = tag_list[0]
            new_movie.tags.add(tag)
        else:
            tag = Tag(name=tag_str)
            tag.save()
            new_movie.tags.add(tag)
    # try:
    # except Exception, args:
    #     print "Could not download : %s" % args
    print 'download success'

Douban's movie pages are not entirely uniform, so the scraped results may occasionally be a little off.

This article comes from a CSDN blog; please credit the source when reprinting: http://... (URL truncated in the original)

[...] an API key, so the only option is to prepare the URL for the query yourself and then crawl the page behind that link with a script. Suppose we want to crawl a page like this: http://... (URL truncated in the original)

import sys, urllib2, urlparse, gzip
from StringIO import StringIO

USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; CIBA; InfoPath.2; 360SE)'

class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        result.status = code
        return result

    def http_error_302(self, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        result.status = code
        return result

class DefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        result = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
        result.status = code
        return result

def openAnything(source, etag=None, lastmodified=None, agent=USER_AGENT):
    if hasattr(source, 'read'):
        return source
    if source == '-':
        return sys.stdin
    if urlparse.urlparse(source)[0] == 'http':
        request = urllib2.Request(source)
        request.add_header('User-Agent', agent)
        request.add_header('Cookie', 'PREF=ID=5e8d1d15fe369be8:U=95d9eb627acc6c79:LD=en:NW=1:CR=2:TM=1270616770:LM=1270616914:S=FAiBW5rEW2azKJXk; NID=33=rXimIAwO_TvEyFlE4lBRkxr1x3TTVh36maim2Cn0gk3b3SAbtn79qkAtgIli18d382TnTCFMOXjzgqxQFCEWLHEbnyf-MtVwfa4-pYmXSMkUMPYqDi61ZmmqyPcBbwzP')
        if etag:
            request.add_header('If-None-Match', etag)
        if lastmodified:
            request.add_header('If-Modified-Since', lastmodified)
        # request.add_header('Accept-encoding', 'gzip')
        # request.add_header('Connection', 'Keep-Alive')
        opener = urllib2.build_opener(SmartRedirectHandler(), DefaultErrorHandler())
        return opener.open(request)
    try:
        return open(source)
    except (IOError, OSError):
        pass
    return StringIO(str(source))

def fetch(source, etag=None, last_modified=None, agent=USER_AGENT):
    result = {}
    f = openAnything(source, etag, last_modified, agent)
    result['data'] = f.read()
    if hasattr(f, 'headers'):
        result['etag'] = f.headers.get('ETag')
        result['lastmodified'] = f.headers.get('Last-Modified')
        if f.headers.get('content-encoding', '') == 'gzip':
            result['data'] = gzip.GzipFile(fileobj=StringIO(result['data'])).read()
    if hasattr(f, 'url'):
        result['url'] = f.url
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    f.close()
    return result

if __name__ == '__main__':
    result = fetch("http://...")   # URL truncated in the original
    print result['data']
    print result['url']
    print result['status']

A few points to note:

1. Add a User-Agent header, otherwise Google returns a 403 Forbidden response, because Python's default User-Agent identifies itself as Python.
2. If you do not want to be redirected to Google China (oh no, Google Hong Kong), remember to dig the corresponding cookie out of your own browser and attach it to the request.
3. For this kind of work, capture packets and compare them byte by byte. Believe it: the browser is just a program too, and anything the browser can do, your program can do as well.
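
A hedged usage note, not in the original: because openAnything() sends If-None-Match / If-Modified-Since when you pass the etag and lastmodified values back in, fetch() can be used for conditional re-fetching, roughly like this (example.com is a placeholder URL):

first = fetch('http://example.com/')   # placeholder URL
second = fetch('http://example.com/',
               etag=first.get('etag'),
               last_modified=first.get('lastmodified'))
if second.get('status') == 304:
    # the server says the page has not changed; keep using first['data']
    print 'not modified'
else:
    print 'fetched %d bytes' % len(second['data'])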

This article comes from a CSDN blog; please credit the source when reprinting: http://... (URL truncated in the original)

A downloader with a simple logger, this time in Python 3 (the source text breaks off partway through the code):

#coding:utf-8

import urllib.request
import xml.dom.minidom
import sqlite3
import threading
import time

class logger(object):
    def log(self, *msg):
        for i in msg:
            print(i)

Log = logger()
Log.log('测试下')   # "just testing"

class downloader(object):

    def __init__(self, url):
        self.url = url

    def download(self):
        Log.log('开始下载', self.url)   # "start downloading"
        try:
            content = urllib.request.urlopen(self.url).read()
            # req = urllib.request.Request(url)
            # response = urllib.request.urlopen(req)
            # content = response.read()
            Log.log('下载完毕')   # "download finished"
            return(content)
        except:
            Log.log(   # the source text breaks off here
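
The imports above (threading, sqlite3, xml.dom.minidom) suggest the full, cut-off version downloads pages in threads and stores the results. A hedged sketch, mine rather than the original's, of how the visible downloader class could be driven from threads (the URLs are placeholders):

import threading

def run(url):
    d = downloader(url)
    content = d.download()
    if content:
        Log.log('got', len(content), 'bytes from', url)

urls = ['http://example.com/', 'http://example.org/']   # placeholder URLs
threads = [threading.Thread(target=run, args=(u,)) for u in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()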
