Reading Notes on Web Scraping with Python (《用Python寫網絡爬蟲》), Part 2

Download Caching

There are many ways to use a cache. For a site with millions of pages, re-crawling everything is very expensive; caching from the very first crawl means each page has to be downloaded only once.

Adding cache support to the link crawler

我們把上一張的download函數寫成一個類,讓其擁有緩存的功能,用dict保存訪問過的鏈接來過濾是一個比較不錯的主意,key設置爲url,以下的Downloader保存了上一章的核心代碼

#!/usr/bin/env python
# encoding: utf-8

from random import choice
import requests

class Downloader:
    """ Downloader class to use cache and requests for downloading pages.
        For the constructor, pass:
            delay (int): # of secs delay between requests (default: 5)
            user_agent (str): user agent string (default: 'wswp')
            proxies (list[dict]): list of possible proxies, each
                must be a dict with http / https keys and proxy values
            cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
            timeout (float/int): number of seconds to wait until timeout
    """
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={},
                 timeout=60):
        self.user_agent = user_agent
        self.proxies = proxies
        self.cache = cache
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    def __call__(self, url, num_retries=2):
        """ Call the downloader class, which will return HTML from cache
            or download it
            args:
                url (str): url to download
            kwargs:
                num_retries (int): # times to retry if 5xx code (default: 2)
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        """ Download the URL with requests; core logic from the previous chapter """
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.status_code)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    # recursively retry 5xx HTTP errors
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}

if __name__ == "__main__":
    downloader = Downloader()
    downloader('http://example.python-scraping.com') # first visit, nothing cached yet
    downloader('http://example.python-scraping.com') # served from the cache
    downloader('http://www.baidu.com') # another URL with no cache entry

The output is as follows:

[Screenshot of the console output]

Disk Cache

The disk cache described in the book does not write the raw HTML file to disk; it stores a JSON-serialized result record (which contains the HTML), compressed and decompressed with zlib.compress and zlib.decompress. Cached entries are valid for 30 days.

#!/usr/bin/env python
# encoding: utf-8

import os
import json
import re
import zlib

from datetime import datetime, timedelta
from urllib.parse import urlsplit

import requests

class DiskCache:
    """ DiskCache helps store urls and their responses to disk
        Initialization components:
            cache_dir (str): abs file path or relative file path
                for cache directory (default: ./data/cache)
            max_len (int): maximum filename length (default: 255)
            compress (bool): use zlib compression (default: True)
            encoding (str): character encoding for compression (default: utf-8)
            expires (datetime.timedelta): timedelta after which content expires
                (default: 30 days)
    """
    def __init__(self, cache_dir='./data/cache', max_len=255, compress=True,
                 encoding='utf-8', expires=timedelta(days=30)):
        self.cache_dir = cache_dir
        self.max_len = max_len
        self.compress = compress
        self.encoding = encoding
        self.expires = expires

    def url_to_path(self, url):
        """ Return file system path string for given URL """
        components = urlsplit(url)
        # append index.html to empty paths
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict maximum number of characters
        filename = '/'.join(seg[:self.max_len] for seg in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def __getitem__(self, url):
        """Load data from disk for given URL"""
        path = self.url_to_path(url)
        if os.path.exists(path):
            mode = ('rb' if self.compress else 'r')
            with open(path, mode) as fp:
                if self.compress:
                    data = zlib.decompress(fp.read()).decode(self.encoding)
                    data = json.loads(data)
                else:
                    data = json.load(fp)
            exp_date = data.get('expires')
            if exp_date and datetime.strptime(exp_date,
                                              '%Y-%m-%dT%H:%M:%S') <= datetime.utcnow():
                print('Cache expired!', exp_date)
                raise KeyError(url + ' has expired.')
            return data
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to disk for the given URL"""
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        mode = ('wb' if self.compress else 'w')
        # Note: the timespec command requires Py3.6+ (if using 3.X you can
        # export using isoformat() and import with '%Y-%m-%dT%H:%M:%S.%f'
        result['expires'] = (datetime.utcnow() + self.expires).isoformat(
            timespec='seconds')
        with open(path, mode) as fp:
            if self.compress:
                data = bytes(json.dumps(result), self.encoding)
                fp.write(zlib.compress(data))
            else:
                json.dump(result, fp)

if __name__ == "__main__":
    dc = DiskCache()
    url = "http://example.python-scraping.com"
    resp = requests.get(url)
    # cache the downloaded page as a result dict (html + status code),
    # matching the convention used by the Downloader class above
    dc[url] = {'html': resp.text, 'code': resp.status_code}
    print(dc[url]['code'])

After running this test, you can see that the cache directory has been created, and the file inside is a zlib-compressed JSON record of the HTML page.

[Screenshot of the generated cache files]
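Because DiskCache implements __getitem__ and __setitem__, it is dict-like and can be plugged straight into the Downloader class from the start of these notes as its cache. A minimal sketch, assuming both classes are available in the same script:

# a sketch: use DiskCache as the cache for the Downloader defined earlier
downloader = Downloader(cache=DiskCache())
downloader('http://example.python-scraping.com') # downloaded and written to the disk cache
downloader('http://example.python-scraping.com') # served from the disk cache on the second call

The KeyError that DiskCache raises for missing or expired entries is exactly what Downloader.__call__ catches, so the two compose cleanly.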

Drawbacks of the disk cache

Even after replacing invalid characters, different URLs can still collide on the same filename; the fix is to hash the URL instead (see the sketch below). Another problem is that a site with very many sub-pages produces very many files, so filesystem lookups become slow; the fix there is to merge multiple pages into a single store and use another data structure for lookups.
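As a minimal sketch of the hashing idea (this helper is not from the book, and the directory layout is just an assumption), the URL can be mapped to a fixed-length, collision-resistant filename:

import os
from hashlib import sha1
from urllib.parse import urlsplit

def hashed_cache_path(url, cache_dir='./data/cache'):
    """Hypothetical helper: map a URL to a cache path via a hash of the full URL."""
    digest = sha1(url.encode('utf-8')).hexdigest()
    host = urlsplit(url).netloc or 'unknown-host'
    # one sub-folder per host keeps any single directory from growing too large
    return os.path.join(cache_dir, host, digest + '.zlib')

print(hashed_cache_path('http://example.python-scraping.com/places/default/view/1'))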

Key-value store cache

Implementing the cache with Redis

Redis is an ideal fit for this kind of cache. It can also expire keys for us, replacing the manual expiry handling we wrote for the disk cache, although compression is still something we have to do ourselves (a short expiry sketch follows).
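A minimal sketch of Redis key expiry, assuming a local Redis server on the default port:

from datetime import timedelta
from redis import StrictRedis

client = StrictRedis(host='localhost', port=6379, db=0)
# SETEX stores the value and lets Redis delete it automatically after the timeout
client.setex('wswp-demo-key', timedelta(seconds=30), b'cached value')
print(client.ttl('wswp-demo-key')) # remaining lifetime in seconds, e.g. 30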

The code below is a test that uses Redis as the cache; first make sure the corresponding package is installed (pip install redis).

#!/usr/bin/env python
# encoding: utf-8

import json
import zlib
import requests
from datetime import timedelta
from redis import StrictRedis

class RedisCache:
    def __init__(self, client=None, expires=timedelta(days=30), encoding='utf-8', compress=True):
        self.client = (StrictRedis(host='localhost', port=6379, db=0)
                       if client is None else client)
        self.expires = expires
        self.encoding = encoding
        self.compress = compress

    def __getitem__(self, url):
        """Load data from Redis for given URL"""
        record = self.client.get(url)
        if record:
            if self.compress:
                record = zlib.decompress(record)
            return json.loads(record.decode(self.encoding))
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to Redis for given url"""
        data = bytes(json.dumps(result), self.encoding)
        if self.compress:
            data = zlib.compress(data)
        self.client.setex(url, self.expires, data)

if __name__ == "__main__":
    dc = RedisCache()
    url = "http://example.python-scraping.com"
    resp = requests.get(url)
    d = {"name": "zengraoli", "password": "123456"}
    html = resp.text
    # dc[url] = html
    dc[url] = d
    print(dc[url])

Exploring requests-cache

requests-cache saves us from implementing a cache class ourselves. It supports multiple backends: Redis, MongoDB, SQLite, and in-memory. It needs to be installed before use:

pip install requests-cache

The code below uses requests-cache with a 30-day expiry. Both timings share the same start point, so the difference between the two printed values (roughly 0.01s) is how long the second, cached request took.

import time
import requests
import requests_cache
from datetime import timedelta

if __name__ == "__main__":
    requests_cache.install_cache(backend='redis', expire_after=timedelta(days=30))
    url = "http://example.python-scraping.com"
    start = time.time()
    resp = requests.get(url)
    end = time.time()
    print("Elapsed time: %.2fs" % (end - start)) # e.g. 1.01s for the first, uncached request

    resp = requests.get(url)
    end = time.time()
    print("Elapsed time: %.2fs" % (end - start)) # e.g. 1.02s in total: the cached request added only ~0.01s

Concurrent Downloads

One million web pages

The "one million pages" come from a zipped CSV downloaded from Amazon S3 (the Alexa top-1-million list). Requesting it with requests is fairly slow, so it is easier to download it in a browser first and then read it from the local file.

#!/usr/bin/env python
# encoding: utf-8

import csv
from zipfile import ZipFile
from io import TextIOWrapper, BytesIO
import requests

if __name__ == "__main__":
    # resp = requests.get('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip', stream=True)
    urls = []  # top 1 million URL's will be stored in this list
    content = ""
    # with ZipFile(BytesIO(content)) as zf:
    with ZipFile("top-1m.csv.zip", "r") as zf: # read the URLs from the local zip file
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            for _, website in csv.reader(TextIOWrapper(csv_file)):
                urls.append('http://' + website)

Multithreaded crawler

With the other features stripped out, the following simulates a multithreaded crawler.

#!/usr/bin/env python
# encoding: utf-8

import csv
from zipfile import ZipFile
from io import TextIOWrapper
import requests
import threading
import socket
import time

SLEEP_TIME = 1
socket.setdefaulttimeout(60)
requests.packages.urllib3.disable_warnings() # we are not interested in the SSL warnings

def threaded_crawler(start_url, max_threads=5):
    if isinstance(start_url, list):
        crawl_queue = start_url
    else:
        crawl_queue = [start_url]

    def process_queue():
        while crawl_queue:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # another thread emptied the queue first
                break
            headers = {'User-Agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"}
            try:
                html = requests.get(url, headers=headers, verify=False)
                print(url, html.status_code)
            except Exception as ee:
                print(url, " ee:", ee)

    # wait for all download threads to finish
    threads = []
    print(max_threads)
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # daemon so the main thread can exit with ctrl-c
            thread.start()
            threads.append(thread)
        print(threads)
        for thread in threads:
            thread.join()
        time.sleep(SLEEP_TIME)

if __name__ == "__main__":
    urls = []  # top 1 million URL's will be stored in this list
    content = ""
    with ZipFile("top-1m.csv.zip", "r") as zf: # read the URLs from the local zip file
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            for _, website in csv.reader(TextIOWrapper(csv_file)):
                urls.append('http://' + website)

    # take the first 10 URLs for the multithreaded crawling demo
    url_list = urls[:10]
    threaded_crawler(url_list)

The output is as follows:

[Screenshot of the console output]

Multiprocess crawler

Because of the GIL, multiprocessing gives Python a speed-up over multithreading, but separate processes cannot share the same in-memory download queue, so Redis is used as the intermediary.

#!/usr/bin/env python
# encoding: utf-8

import csv
from zipfile import ZipFile
from io import TextIOWrapper
import requests
import threading
import multiprocessing
import socket
import time
from redis_queue import RedisQueue

SLEEP_TIME = 1
socket.setdefaulttimeout(60)
requests.packages.urllib3.disable_warnings() # we are not interested in the SSL warnings

def threaded_crawler_rq(start_url, max_threads=5):
    crawl_queue = RedisQueue()
    def process_queue():
        while crawl_queue:
            url = crawl_queue.pop()
            if url is None:
                # another worker drained the queue first
                break
            headers = {'User-Agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"}
            try:
                html = requests.get(url, headers=headers, verify=False)
                print(url, html.status_code)
            except Exception as ee:
                print(url, " ee:", ee)

    # wait for all download threads to finish
    threads = []
    print(max_threads)
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # daemon so the main thread can exit with ctrl-c
            thread.start()
            threads.append(thread)
        print(threads)
        for thread in threads:
            thread.join()
        time.sleep(SLEEP_TIME)

def mp_threaded_crawler(*args, **kwargs):
    """ create a multiprocessing threaded crawler """
    processes = []
    num_procs = kwargs.pop('num_procs', None)
    if not num_procs:
        num_procs = multiprocessing.cpu_count()
    for _ in range(num_procs):
        proc = multiprocessing.Process(target=threaded_crawler_rq,
                                       args=args, kwargs=kwargs)
        proc.start()
        processes.append(proc)
    # wait for processes to complete
    for proc in processes:
        proc.join()

if __name__ == "__main__":
    urls = []  # top 1 million URL's will be stored in this list
    content = ""
    crawl_queue = RedisQueue()
    with ZipFile("top-1m.csv.zip", "r") as zf: # read the URLs from the local zip file
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            for _, website in csv.reader(TextIOWrapper(csv_file)):
                urls.append('http://' + website)
    # take the first 20 URLs for the multiprocess crawling demo
    url_list = urls[:20]
    crawl_queue.push(url_list) # push the 20 links onto the Redis queue
    # threaded_crawler(url_list)
    start_time = time.time()
    mp_threaded_crawler(url_list, num_procs=4)
    print('Total time: %ss' % (time.time() - start_time))

The companion Redis queue code; save it as redis_queue.py next to the crawler so that the RedisQueue import above works.

# Based loosely on the Redis Cookbook FIFO Queue: http://www.rediscookbook.org/implement_a_fifo_queue.html
from redis import StrictRedis

class RedisQueue:
    def __init__(self, client=None, db=0, queue_name='wswp'):
        self.client = (StrictRedis(host='localhost', port=6379, db=db)
                       if client is None else client)
        self.name = "queue:%s" % queue_name
        self.seen_set = "seen:%s" % queue_name
        self.depth = "depth:%s" % queue_name

    def __len__(self):
        return self.client.llen(self.name)

    def push(self, element):
        """Push an element (or a list of elements) onto the tail of the queue"""
        if isinstance(element, list):
            element = [e for e in element if not self.already_seen(e)]
            if element:
                self.client.lpush(self.name, *element)
                self.client.sadd(self.seen_set, *element)
        elif not self.already_seen(element):
            self.client.lpush(self.name, element)
            self.client.sadd(self.seen_set, element)

    def already_seen(self, element):
        """ determine if an element has already been seen """
        return self.client.sismember(self.seen_set, element)

    def set_depth(self, element, depth):
        """ Set the seen hash and depth """
        self.client.hset(self.depth, element, depth)

    def get_depth(self, element):
        """ Get the seen hash and depth """
        return (lambda dep: int(dep) if dep else 0)(self.client.hget(self.depth, element))

    def pop(self):
        """Pop an element from the head of the queue (returns None if the queue is empty)"""
        element = self.client.rpop(self.name)
        return element.decode('utf-8') if element is not None else None

The output is as follows:

[Screenshot of the console output]

