Python Web Crawler

1:

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#filename:splider.py
#author:wfu([email protected])

from spdUtility import PriorityQueue,Parser
import urllib2
import sys
import os


def updatePriQueue( priQueue, url ):
    "更新优先级队列"
    extraPrior = url.endswith('.html') and 2 or 0 #这里优先下载以html结尾的url
    extraMyBlog = 'www.kgblog.net' in url and 5 or 0 #优先抓取含有指定内容的网页,竞价抓取排名??
    item = priQueue.getitem(url)
    if item :
        newitem = ( item[0]+1+extraPrior+extraMyBlog, item[1] )
        priQueue.remove(item)
        priQueue.push( newitem )
    else :
        priQueue.push( (1+extraPrior+extraMyBlog,url) )

def getmainurl(url):
    "获得该url的主站地址,用于添加在相对url地址的开头"
    ix = url.find('/',len('http://') )
    if ix > 0 :
        return url[:ix]
    else :
        return url

def analyseHtml(url,html, priQueue,downlist):
    "分析html的超链接,并更新优先级队列"
    p = Parser()
    try :
        p.feed(html)
        p.close()
    except:
        return
    mainurl = getmainurl(url)
    for k, v in p.anchors.items():
        for u in v :
            if not u.startswith('http://'):  # resolve relative URLs against the site root
                u = mainurl + u
            if not downlist.count(u) :    # skip URLs that have already been downloaded
                updatePriQueue( priQueue, u )

def downloadUrl(id, url, priQueue , downlist,downFolder):
    "下载指定url内容,并分析html超链接"
    downFileName = downFolder+'/%d.html' % (id,)
    print 'downloading',url,'as', downFileName ,
    try:
        fp = urllib2.urlopen(url)
    except:
        print '[ failed ]'
        return False
    else :
        print '[ success ]'
        downlist.push( url )  # record the url as already downloaded
        op = open(downFileName,"wb")
        html = fp.read()
        unicode(html,"gb18030","ignore").encode("utf8");
        op.write( html )
        op.close()
        fp.close()
        analyseHtml(url,html,priQueue,downlist)
        return True

def spider(beginurl, pages,downFolder):
    "爬虫主程序,循环从优先级队列中取出最高优先级的结点处理"
    priQueue = PriorityQueue()
    downlist = PriorityQueue() # set of already-downloaded URLs, to avoid fetching the same page twice
    priQueue.push( (1,beginurl) )
    i = 0
    while not priQueue.empty() and i < pages :
        k, url = priQueue.pop()
        if downloadUrl(i+1, url, priQueue , downlist,downFolder):
            i += 1
    print '\nDownloaded', i, 'pages in total.'

def main():
    "主函数,设定相关参数:开始url,抓取的网页数目,保存的文件夹"
    beginurl = 'http://www.csdn.net'  #开始抓取的URL地址
    pages = 10   #抓取网页的数目
    downloadFolder = './down' #指定保存网页的文件夹
    if not os.path.isdir( downloadFolder ):
        os.mkdir( downloadFolder )
    spider( beginurl, pages, downloadFolder)

if __name__ == '__main__':
    main()
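
The listing above is Python 2 code: urllib2, the print statement, and the htmllib module used in spdUtility.py no longer exist in Python 3. As a rough sketch only (not part of the original program), the download-and-extract-links step could look roughly like this on Python 3 with just the standard library; the names LinkParser and fetch_links are invented here for illustration:

# Hypothetical Python 3 sketch: fetch a page and collect the absolute URLs of its anchors
from html.parser import HTMLParser
from urllib.parse import urljoin
from urllib.request import urlopen

class LinkParser(HTMLParser):
    "Collect the href of every <a> tag, roughly what Parser in spdUtility.py does."
    def __init__(self):
        super().__init__()
        self.links = []
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for key, value in attrs:
                if key == 'href' and value:
                    self.links.append(value)

def fetch_links(url):
    "Download url and return the absolute form of every link found in it."
    html = urlopen(url).read().decode('utf-8', 'ignore')
    parser = LinkParser()
    parser.feed(html)
    parser.close()
    return [urljoin(url, link) for link in parser.links]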


2:

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#filename:spdUtility.py
#author:wfu([email protected])
import bisect
import string
import htmllib
import formatter
class PriorityQueue(list):
    "优先级队列,用于存储url,及它的优先级"
    def __init__(self):
        list.__init__(self)
        self.map  =  {}
    def push(self, item):
        #  按顺序插入,防止重复元素;若要按升序排列,可使用bisect.insort_left
        if  self.count(item)  ==  0:
            bisect.insort(self,  item)
            self.map[  item[1]  ]  =  item
    def pop(self):
        r  =  list.pop(self)
        del  self.map[  r[1]  ]
        return  r
    def getitem(self,url):
        if  self.map.has_key(  url  ):
            return  self.map[url]
        else  :
            return  None
    def empty(self):
        return  len(self)  ==  0
    def remove(self,item):
        list.remove(self,  item)
        del  self.map[  item[1]  ]

    def count(self, item):
        if len(self) == 0:
            return 0
        # binary search, since the list is kept sorted
        left = 0
        right = len(self) - 1
        mid = -1
        while left <= right:
            mid = (left + right) / 2
            if self[mid] < item:
                left = mid + 1
            elif self[mid] > item:
                right = mid - 1
            else:
                break
        return self[mid] == item and 1 or 0


class Parser(htmllib.HTMLParser):
    # HTML parser that records the anchors found in a page as {link text: [hrefs]}
    def __init__(self, verbose=0):
        self.anchors = {}
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)

    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        self.anchor = href

    def anchor_end(self):
        text = string.strip(self.save_end())
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]


def main():  # simple self-test
    pq = PriorityQueue()
    # add items out of order
    pq.push( (1, 'http://www.baidu.com') )
    pq.push( (2, 'http://www.sina.com') )
    pq.push( (3, 'http://www.google.com') )
    pq.push( (1, 'http://www.163.com') )

    item = pq.getitem('http://www.sina.com')
    print item
    print pq.count(item)
    pq.remove(item)
    print pq.count(item)
    # print the queue contents in priority order
    while not pq.empty():
        print pq.pop()

if __name__ == '__main__':
    main()
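
The PriorityQueue above keeps the whole list sorted with bisect, so every push costs O(n). A heap gives O(log n) pushes; the following is only a minimal sketch of that alternative, assuming nothing beyond the standard heapq module and using the invented name HeapPriorityQueue. It covers push/pop/empty only and omits getitem/remove, which the crawler needs for re-prioritising URLs and which a plain heap does not support directly:

import heapq

class HeapPriorityQueue(object):
    "Heap-backed queue: pop() returns the (priority, url) pair with the highest priority."
    def __init__(self):
        self.heap = []   # stores (-priority, url); negating turns heapq's min-heap into a max-heap
        self.seen = {}   # url -> (priority, url), playing the role of PriorityQueue.map
    def push(self, item):
        priority, url = item
        if url not in self.seen:
            heapq.heappush(self.heap, (-priority, url))
            self.seen[url] = item
    def pop(self):
        neg_priority, url = heapq.heappop(self.heap)
        del self.seen[url]
        return (-neg_priority, url)
    def empty(self):
        return len(self.heap) == 0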


