Crawling a Website with Python

I wrote this a while ago, drawing on examples found online and adapting them to my specific situation. In short: the Retrieve class saves a single URL to a local file that mirrors its path and extracts its links with lxml, and the Crawl class follows those links within a single domain, logging every decision to urlall.txt.


# -*- coding: utf-8 -*-

import socket
import lxml.html
import lxml.etree
# import chardet

import time
from urllib import urlretrieve
from urlparse import urlparse
from os import makedirs, remove, removedirs
from os.path import exists, splitext, dirname
import sys

# from httptools import strToUnicode, unicodeToStr

class Retrieve(object):

    # def __init__(self, url, baseUrl):
    def __init__(self, url):
        self.url = url
        self.file = self.fileName()
        self.baseUrl = url
        # self.filetype = splitext(self.file)[1][1:]
        # print 'filetype: %s' % self.filetype
        # self.charset = ''

    def fileName(self):
        """根據url創建目錄文件"""

        urlPart = urlparse(self.url)
        path = urlPart[1] + urlPart[2]
        if not urlPart[2]:
            path = urlPart[1] + '/'
        ext = splitext(path)
        if ext[1] == '':
            path += 'index.html'

        # avoid shadowing the builtin 'file'
        filePath = path
        path = dirname(path)

        if not exists(path):
            makedirs(path)

        return filePath
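
    # Illustrative examples: the target URL below,
    # http://uctest.ucweb.com:81/wml/index.wml, maps to the local file
    # uctest.ucweb.com:81/wml/index.wml under the current directory, while a
    # bare directory URL such as http://example.com/ becomes
    # example.com/index.html.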

    def downLoad(self):
        """下載文件"""
        if exists(self.file):
            return ('** file exists',)
        socket.setdefaulttimeout(25)
        try:
            result = urlretrieve(self.url, self.file)
        except Exception, e:
            print 'download error:', e
            result = ('** invalid url', )
            return result
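
        # urlretrieve returns (local filename, headers); a body shorter than
        # 250 bytes is treated as an error page, and both the file and any
        # directories left empty are removed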
        if ('content-length' in result[1]) and int(result[1]['content-length'])<250:
            path = dirname(self.file)
            remove(self.file)
            try:
                removedirs(path)
            except OSError:
                pass
            result = ('** invalid url', )
        return result


    def getLinks(self):
        """獲取文件中的鏈接"""
        f = open(self.file)
        html = f.read()
        f.close()

        # Encoding detection and conversion
        # charJust = chardet.detect(html)
        # try:
            # if charJust['encoding'].lower() == 'gb2312':
                # charJust['encoding'] = 'gb18030'
        # except Exception, e:
            # charJust['encoding'] = 'utf-8'
        # self.charset = charJust['encoding']
        # html = strToUnicode(html, encoding=self.charset)

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(base_url=self.baseUrl, resolve_base_href=False)

        linkList = []

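        # iterlinks yields (element, attribute, link, pos) 4-tuples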
        for link in lxml.html.iterlinks(doc):
            linkList.append(link[2])

        # Replace absolute paths with relative ones
        # self.linkReplFunc(doc)
           
        return linkList

class Crawl(object):
   
    def __init__(self, url, domain):
        self.url = url
        self.domain = domain
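        # seen: URLs already fetched; vlink: the queue of URLs to visit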
        self.seen = []
        self.vlink = [url]
        self.baseUrl = url

    def getPage(self):
        # rv = Retrieve(self.url, self.baseUrl)
        rv = Retrieve(self.url)
        # record this URL as visited
        self.seen.append(self.url)
        result = rv.downLoad()


        if result[0] == '** invalid url':
            self.log('** download error')
            return

        # asset and binary files were saved above but are not parsed for links
        ext = splitext(self.url)[1].lower()
        if ext in ['.css', '.js', '.jpg', '.png', '.gif', '.bmp', '.mp4', '.exe', '.bin', '.swf', '.ico']:
            return

        if 'sina' in self.url:
            self.log('** sina url')
            return

        try:
            links = rv.getLinks()
        except Exception, e:
            print 'getLinks error:', e
            return

        self.log('successful download')
        for link in links:
            self.log('get link', link)
            link = link.split('?')[0].split('#')[0]
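            # queue only links that are new, inside the target domain,
            # contain a single '://', and point at the test server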
            if (link not in self.seen) and (link not in self.vlink) \
                    and (self.domain in link) and link.count('://') < 2 \
                    and link.startswith('http://uctest.ucweb.com:81'):
                self.log('++app', link)
                print 'appendlink: %s' % link
                self.vlink.append(link)
            else:
                self.log('--drop', link)

    def go(self):
        while self.vlink:
#            time.sleep(2)
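            # pop() takes from the end of the list, so the crawl runs depth-first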
            link = self.vlink.pop()
            self.url = link
#            print 'download list: ', self.vlink
            print 'download: ', self.url
            self.getPage()
            # sys.exit(0)

    def log(self, st, link=''):
        """Append a status line for the current URL (or for link, if given) to urlall.txt"""
        f = open('urlall.txt', 'a')
        if not link:
            f.write((u'%s:\t%s\n' % (st, self.url)).encode("utf-8"))
        else:
            f.write((u'%s:\t%s\n' % (st, link)).encode("utf-8"))
        f.close()

if __name__ == '__main__':

    #url = "http://www.phpv.net/topics/79.html"
    #url = 'http://uctest.ucweb.com:81/wml/Graphics/htmlcachepic/1p_11.html'
    url = "http://uctest.ucweb.com:81/wml/index.wml"
    domain = 'uctest.ucweb.com'
    cr = Crawl(url, domain)
    cr.go()
    print 'download complete'
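
The script is Python 2 throughout (print statements, "except Exception, e", and the old urllib/urlparse module layout). As a rough sketch of what a Python 3 port would touch, the two stdlib imports move as shown below; the print statements and except clauses would need updating as well:

from urllib.request import urlretrieve   # replaces: from urllib import urlretrieve
from urllib.parse import urlparse        # replaces: from urlparse import urlparse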

