下載緩存
緩存的使用方式有很多種,對於擁有數百萬網頁的網站來說,重新爬取會非常費勁,從一開始就對下載結果做緩存,可以讓每個網頁只下載一次
爲鏈接爬蟲添加緩存支持
我們把上一章的download函數寫成一個類,讓其擁有緩存的功能,用dict保存訪問過的鏈接來過濾重複下載是一個比較不錯的主意,key設置爲url,以下的Downloader保留了上一章的核心代碼
#!/usr/bin/env python
# encoding: utf-8
from random import choice
class Downloader:
    """Callable page downloader that consults a cache before fetching.

    For constructor, pass:
        delay (int): # of secs delay between requests (default: 5). The value
            is only stored on the instance; this class does not throttle.
        user_agent (str): user agent string (default: 'wswp')
        proxies (list[dict]): list of possible proxies, each
            must be a dict with http / https keys and proxy values
        cache (dict or dict-like obj): keys: urls, values: dicts with keys
            (html, code). A fresh dict is created when omitted.
        timeout (float/int): number of seconds to wait until timeout
    """

    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache=None,
                 timeout=60):
        self.delay = delay
        self.user_agent = user_agent
        self.proxies = proxies
        # A ``{}`` default would be one dict shared by every instance, so the
        # default is created per instance. ``is None`` (not truthiness) keeps
        # an empty cache object supplied by the caller.
        self.cache = {} if cache is None else cache
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    def __call__(self, url, num_retries=2):
        """Return the HTML for ``url`` from the cache, downloading if needed.

        args:
            url (str): url to download
        kwargs:
            num_retries (int): # times to retry if 5xx code (default: 2)

        Returns the 'html' entry of the cached or downloaded result, which is
        None when the download failed.
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
        except KeyError:
            result = None
        else:
            print('Loaded from cache:', url)
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        """Fetch ``url`` with requests and return a cacheable result dict.

        args:
            url (str): url to download
            headers (dict): request headers to send
            proxies (dict or None): proxy mapping passed to requests

        Returns a dict with keys 'html' (str, or None on failure) and 'code'
        (HTTP status; 500 is used when the request itself failed). 5xx
        responses are retried while ``self.num_retries`` is positive.
        """
        # Imported here so that the class only needs requests when a page is
        # actually downloaded.
        import requests

        print('Downloading:', url)
        while True:
            try:
                resp = requests.get(url, headers=headers, proxies=proxies,
                                    timeout=self.timeout)
            except requests.exceptions.RequestException as exc:
                print('Download error:', exc)
                return {'html': None, 'code': 500}
            if resp.status_code < 400:
                return {'html': resp.text, 'code': resp.status_code}
            print('Download error:', resp.status_code)
            if self.num_retries and 500 <= resp.status_code < 600:
                # server-side failure: spend one retry and try again
                self.num_retries -= 1
                continue
            return {'html': None, 'code': resp.status_code}
if __name__ == "__main__":
    # Demo: the same URL twice, then a different one.
    downloader = Downloader()
    demo_urls = (
        'http://example.python-scraping.com',  # first visit: nothing cached yet
        'http://example.python-scraping.com',  # second visit: a cache entry exists
        'http://www.baidu.com',  # one more URL that is not cached
    )
    for demo_url in demo_urls:
        downloader(demo_url)
執行結果如下
磁盤緩存
書中提到的磁盤緩存並不是直接把html文件保存到本地,而是把下載結果(dict)序列化成json字符串後寫入文件,並用zlib.compress和zlib.decompress對這個json字符串進行壓縮和解壓,緩存有效期爲30天
#!/usr/bin/env python
# encoding: utf-8
import os
import json
import re
import zlib
from datetime import datetime
from urllib.parse import urlsplit
import requests
from datetime import timedelta
class DiskCache:
    """Dict-like cache that stores each URL's result as a JSON file on disk.

    Initialization components:
        cache_dir (str): abs file path or relative file path
            for cache directory (default: ./data/cache)
        max_len (int): maximum length of each path segment (default: 255)
        compress (bool): use zlib compression (default: True)
        encoding (str): character encoding for compression (default: utf-8)
        expires (datetime.timedelta): how long a stored entry stays valid
            (default: 30 days)
    """

    def __init__(self, cache_dir='./data/cache', max_len=255, compress=True,
                 encoding='utf-8', expires=timedelta(days=30)):
        self.cache_dir = cache_dir
        self.max_len = max_len
        self.compress = compress
        self.encoding = encoding
        self.expires = expires

    @staticmethod
    def _utcnow():
        """Return the current UTC time as a naive datetime."""
        # datetime.utcnow() is deprecated; build the same naive value from an
        # aware one so the stored timestamp format does not change.
        from datetime import timezone
        return datetime.now(timezone.utc).replace(tzinfo=None)

    def url_to_path(self, url):
        """Return file system path string for given URL.

        The result always stays below ``cache_dir``: leading slashes are
        dropped and '.' / '..' segments are replaced with underscores.
        """
        components = urlsplit(url)
        # append index.html to empty paths
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # a leading '/' (URL without a host) would make os.path.join discard
        # cache_dir entirely
        filename = filename.lstrip('/')
        segments = []
        for seg in filename.split('/'):
            if seg in ('.', '..'):
                # neutralise directory traversal
                seg = '_' * len(seg)
            # restrict maximum number of characters
            segments.append(seg[:self.max_len])
        return os.path.join(self.cache_dir, '/'.join(segments))

    def __getitem__(self, url):
        """Load data from disk for given URL.

        Raises KeyError when the URL has not been cached or has expired.
        """
        path = self.url_to_path(url)
        mode = 'rb' if self.compress else 'r'
        try:
            fp = open(path, mode)
        except FileNotFoundError:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist') from None
        with fp:
            if self.compress:
                data = json.loads(
                    zlib.decompress(fp.read()).decode(self.encoding))
            else:
                data = json.load(fp)
        exp_date = data.get('expires')
        if exp_date and datetime.strptime(
                exp_date, '%Y-%m-%dT%H:%M:%S') <= self._utcnow():
            print('Cache expired!', exp_date)
            raise KeyError(url + ' has expired.')
        return data

    def __setitem__(self, url, result):
        """Save data to disk for given url.

        ``result`` must be a dict; it is copied before the 'expires'
        timestamp is added, so the caller's object is left untouched.
        """
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if folder:
            # exist_ok avoids a race with another writer creating the folder
            os.makedirs(folder, exist_ok=True)
        record = dict(result)
        # Note: the timespec argument requires Py3.6+
        record['expires'] = (self._utcnow() + self.expires).isoformat(
            timespec='seconds')
        mode = 'wb' if self.compress else 'w'
        with open(path, mode) as fp:
            if self.compress:
                data = json.dumps(record).encode(self.encoding)
                fp.write(zlib.compress(data))
            else:
                json.dump(record, fp)
if __name__ == "__main__":
    # Demo: fetch a page, then write a sample record into the disk cache.
    url = "http://example.python-scraping.com"
    dc = DiskCache()
    dc.url_to_path(url)
    resp = requests.get(url)
    html = resp.text
    d = {"name": "zengraoli", "password": "123456"}
    # Caching the page text itself (dc[url] = html) is left disabled; the
    # sample dict is stored instead.
    dc[url] = d
使用測試後,可以看到文件夾生成了,內容是一個壓縮過的html頁面
磁盤緩存缺點
文件名即使我們做了替換,還是會出現重複的情況,解決方案是用哈希處理;另外一個情況是網站子類太多,那麼查找起來會很慢,解決方案是使用多個網頁合併到一個裏面,用其他數據結構進行查找
鍵值對存儲緩存
redis實現緩存
如果使用redis作爲緩存就再合適不過了,redis還能直接設置鍵的有效期,替換了我們之前的手工處理,但是壓縮還是需要靠自己來做
下面的代碼是使用redis作爲緩存的測試,請首先確保安裝了對應的包
#!/usr/bin/env python
# encoding: utf-8
import json
import zlib
import requests
from datetime import timedelta
from redis import StrictRedis
class RedisCache:
    """Dict-like cache that keeps JSON-serialised results in Redis.

    Arguments:
        client: Redis client to use; when None a StrictRedis client for
            localhost:6379, db 0 is created
        expires (datetime.timedelta): lifetime passed to SETEX for each entry
        encoding (str): character encoding of the stored JSON text
        compress (bool): zlib-compress the stored payload
    """

    def __init__(self, client=None, expires=timedelta(days=30), encoding='utf-8', compress=True):
        if client is None:
            client = StrictRedis(host='localhost', port=6379, db=0)
        self.client = client
        self.expires = expires
        self.encoding = encoding
        self.compress = compress

    def __getitem__(self, url):
        """Return the value stored for ``url``; raise KeyError if absent."""
        payload = self.client.get(url)
        if not payload:
            # nothing has been stored under this URL
            raise KeyError(url + ' does not exist')
        if self.compress:
            payload = zlib.decompress(payload)
        text = payload.decode(self.encoding)
        return json.loads(text)

    def __setitem__(self, url, result):
        """Serialise ``result`` to JSON and store it under ``url`` with a TTL."""
        payload = json.dumps(result).encode(self.encoding)
        if self.compress:
            payload = zlib.compress(payload)
        self.client.setex(url, self.expires, payload)
if __name__ == "__main__":
    # Demo: store a sample record in Redis and read it back.
    url = "http://example.python-scraping.com"
    dc = RedisCache()
    resp = requests.get(url)
    html = resp.text
    d = {"name": "zengraoli", "password": "123456"}
    # Caching the page text itself (dc[url] = html) is left disabled; the
    # sample dict is stored instead.
    dc[url] = d
    cached = dc[url]
    print(cached)
探索requests-cache
requests-cache讓我們自己免於實現cache類,他支持多種後端,redis、mongodb、sqlite以及內存。使用之前需要安裝
pip install requests-cache
下面的代碼是使用requests-cache的示例,設置緩存之後,第二次的訪問基本不需要多少時間
import time
import requests_cache
if __name__ == "__main__":
    # Demo: time the same request twice with requests-cache installed.
    requests_cache.install_cache(backend='redis', expire_after=timedelta(days=30))
    url = "http://example.python-scraping.com"

    # First request.
    start = time.perf_counter()
    resp = requests.get(url)
    end = time.perf_counter()
    print("循環運行時間:%.2f秒" % (end - start))

    # Second request. The timer is restarted so the figure covers this
    # request alone instead of the time elapsed since the first one began.
    start = time.perf_counter()
    resp = requests.get(url)
    end = time.perf_counter()
    print("循環運行時間:%.2f秒" % (end - start))
併發下載
100萬個網頁
所謂的100w個網頁是從亞馬遜下載的一個壓縮文件,裏面是一個csv,用requests請求會比較慢,所以可以先用瀏覽器下載,然後再從本地讀取
#!/usr/bin/env python
# encoding: utf-8
import csv
from zipfile import ZipFile
from io import TextIOWrapper, BytesIO
import requests
if __name__ == "__main__":
    # The archive can also be fetched over HTTP and opened from memory
    # (requests.get('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip', stream=True)
    # followed by ZipFile(BytesIO(content))); here a local copy is read instead.
    urls = []  # top 1 million URL's will be stored in this list
    content = ""
    with ZipFile("top-1m.csv.zip", "r") as zf:  # read the URLs from the local archive
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            rows = csv.reader(TextIOWrapper(csv_file))
            # each row is (rank, domain); only the domain is kept
            urls.extend('http://' + website for _, website in rows)
多線程爬蟲
去掉一些其他功能,來模擬一下多線程的爬蟲
#!/usr/bin/env python
# encoding: utf-8
import csv
from zipfile import ZipFile
from io import TextIOWrapper
import requests
import threading
import socket
import time
# Seconds the crawler's main loop sleeps at the end of each pass.
SLEEP_TIME = 1
# Default timeout, in seconds, for socket objects created from now on.
socket.setdefaulttimeout(60)
requests.packages.urllib3.disable_warnings()  # we are not interested in the SSL warnings that get printed
def threaded_crawler(start_url, max_threads=5):
    """Download every given URL using a pool of worker threads.

    Args:
        start_url (str or list[str]): one URL, or a list of URLs. A list is
            copied, so the caller's list is left untouched.
        max_threads (int): maximum number of download threads running at
            the same time (default: 5)

    Raises:
        ValueError: if ``max_threads`` is smaller than 1 (no worker could
            ever be started and the loop would never finish).
    """
    if max_threads < 1:
        raise ValueError('max_threads must be at least 1')
    if isinstance(start_url, list):
        crawl_queue = list(start_url)
    else:
        crawl_queue = [start_url]
    headers = {'User-Agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",}

    def process_queue():
        """Worker: pop URLs until the shared queue is empty."""
        while True:
            try:
                # Pop directly instead of testing the queue first: another
                # worker may take the last URL between a check and the pop.
                url = crawl_queue.pop()
            except IndexError:
                return
            try:
                html = requests.get(url, headers=headers, verify=False)
                print(url, html.status_code)
            except Exception as ee:
                # best effort: report the failure and go on with the next URL
                print(url, " ee:", ee)

    # wait for all download threads to finish
    threads = []
    print(max_threads)
    while threads or crawl_queue:
        # Build a new list rather than removing while iterating, which
        # skips the element that follows each removed one.
        threads = [thread for thread in threads if thread.is_alive()]
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            # daemon so main thread can exit w/ ctrl-c
            thread = threading.Thread(target=process_queue, daemon=True)
            thread.start()
            threads.append(thread)
        if not threads:
            # nothing is running and nothing was left to start
            break
        print(threads)
        for thread in threads:
            thread.join()
        time.sleep(SLEEP_TIME)
if __name__ == "__main__":
    urls = []  # top 1 million URL's will be stored in this list
    content = ""
    with ZipFile("top-1m.csv.zip", "r") as zf:  # read the URLs from the local archive
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            rows = csv.reader(TextIOWrapper(csv_file))
            # each row is (rank, domain); only the domain is kept
            urls.extend('http://' + website for _, website in rows)
    # take the first 10 URLs as input for the multithreaded crawler demo
    url_list = urls[:10]
    threaded_crawler(url_list)
輸出如下
多進程爬蟲
由於GIL的緣故,py中的多進程相對多線程會有速度的提升,但多個進程無法直接共享內存中的同一個下載隊列,因此可以採用redis作爲中間介質
#!/usr/bin/env python
# encoding: utf-8
import csv
from zipfile import ZipFile
from io import TextIOWrapper
import requests
import threading
import multiprocessing
import socket
import time
from redis_queue import RedisQueue
# Seconds the crawler's main loop sleeps at the end of each pass.
SLEEP_TIME = 1
# Default timeout, in seconds, for socket objects created from now on.
socket.setdefaulttimeout(60)
requests.packages.urllib3.disable_warnings()  # we are not interested in the SSL warnings that get printed
def threaded_crawler_rq(start_url, max_threads=5):
    """Download the URLs held in the Redis queue using worker threads.

    Args:
        start_url: accepted for signature compatibility but not used; the
            URLs are read from a ``RedisQueue()`` that has to be filled
            before this function is called.
        max_threads (int): maximum number of download threads running at
            the same time (default: 5)

    Raises:
        ValueError: if ``max_threads`` is smaller than 1 (no worker could
            ever be started and the loop would never finish).
    """
    if max_threads < 1:
        raise ValueError('max_threads must be at least 1')
    crawl_queue = RedisQueue()
    headers = {'User-Agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",}

    def process_queue():
        """Worker: pop URLs until the shared queue is empty."""
        while crawl_queue:
            try:
                url = crawl_queue.pop()
            except (IndexError, AttributeError):
                # Another thread or process emptied the queue between the
                # check above and this pop. Popping an empty queue surfaces
                # as an exception (e.g. AttributeError from decoding the
                # None that Redis returns for an empty list).
                return
            try:
                html = requests.get(url, headers=headers, verify=False)
                print(url, html.status_code)
            except Exception as ee:
                # best effort: report the failure and go on with the next URL
                print(url, " ee:", ee)

    # wait for all download threads to finish
    threads = []
    print(max_threads)
    while threads or crawl_queue:
        # Build a new list rather than removing while iterating, which
        # skips the element that follows each removed one.
        threads = [thread for thread in threads if thread.is_alive()]
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            # daemon so main thread can exit w/ ctrl-c
            thread = threading.Thread(target=process_queue, daemon=True)
            thread.start()
            threads.append(thread)
        if not threads:
            # nothing is running and nothing was left to start
            break
        print(threads)
        for thread in threads:
            thread.join()
        time.sleep(SLEEP_TIME)
def mp_threaded_crawler(*args, **kwargs):
    """Run ``threaded_crawler_rq`` in several processes and wait for them.

    All positional and keyword arguments are forwarded to
    ``threaded_crawler_rq``, except:

        num_procs (int, optional keyword): number of processes to start.
            When omitted or falsy, ``multiprocessing.cpu_count()`` is used.
    """
    # A default is supplied so that leaving num_procs out falls through to
    # cpu_count() instead of raising KeyError.
    num_procs = kwargs.pop('num_procs', None)
    if not num_procs:
        num_procs = multiprocessing.cpu_count()
    processes = []
    for _ in range(num_procs):
        proc = multiprocessing.Process(target=threaded_crawler_rq,
                                       args=args, kwargs=kwargs)
        proc.start()
        processes.append(proc)
    # wait for processes to complete
    for proc in processes:
        proc.join()
if __name__ == "__main__":
    urls = []  # top 1 million URL's will be stored in this list
    content = ""
    crawl_queue = RedisQueue()
    with ZipFile("top-1m.csv.zip", "r") as zf:  # read the URLs from the local archive
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            rows = csv.reader(TextIOWrapper(csv_file))
            # each row is (rank, domain); only the domain is kept
            urls.extend('http://' + website for _, website in rows)
    # take the first 20 URLs as input for the crawler demo
    url_list = urls[:20]
    crawl_queue.push(url_list)  # push the 20 links onto the redis queue
    # the single-process variant would be: threaded_crawler(url_list)
    start_time = time.time()
    mp_threaded_crawler(url_list, num_procs=4)
    elapsed = time.time() - start_time
    print('Total time: %ss' % elapsed)
配套的redis代碼,注意存放路徑
# Based loosely on the Redis Cookbook FIFO Queue: http://www.rediscookbook.org/implement_a_fifo_queue.html
from redis import StrictRedis
class RedisQueue:
    """FIFO queue kept in Redis, with a 'seen' set and a depth hash.

    Arguments:
        client: Redis client to use; when None a StrictRedis client for
            localhost:6379 is created
        db (int): Redis database number for the default client
        queue_name (str): suffix of the Redis keys used by this queue
    """

    def __init__(self, client=None, db=0, queue_name='wswp'):
        self.client = (StrictRedis(host='localhost', port=6379, db=db)
                       if client is None else client)
        self.name = "queue:%s" % queue_name
        self.seen_set = "seen:%s" % queue_name
        self.depth = "depth:%s" % queue_name

    def __len__(self):
        """Return the number of elements currently queued."""
        return self.client.llen(self.name)

    def push(self, element):
        """Push an element, or a list of elements, to the tail of the queue.

        Elements that were already seen are skipped, as are repeats inside
        one list. Nothing is sent to Redis when no new element remains.
        """
        if isinstance(element, list):
            # dict.fromkeys drops repeats within the batch and keeps order
            fresh = [e for e in dict.fromkeys(element)
                     if not self.already_seen(e)]
            if not fresh:
                # LPUSH / SADD need at least one value
                return
            self.client.lpush(self.name, *fresh)
            self.client.sadd(self.seen_set, *fresh)
        elif not self.already_seen(element):
            self.client.lpush(self.name, element)
            self.client.sadd(self.seen_set, element)

    def already_seen(self, element):
        """ determine if an element has already been seen """
        return self.client.sismember(self.seen_set, element)

    def set_depth(self, element, depth):
        """ Set the seen hash and depth """
        self.client.hset(self.depth, element, depth)

    def get_depth(self, element):
        """Return the depth stored for ``element``, or 0 if none is stored."""
        depth = self.client.hget(self.depth, element)
        return int(depth) if depth else 0

    def pop(self):
        """Pop an element from the head of the queue.

        Raises IndexError when the queue is empty.
        """
        item = self.client.rpop(self.name)
        if item is None:
            raise IndexError('pop from an empty RedisQueue')
        # clients created with decode_responses=True already return str
        return item.decode('utf-8') if isinstance(item, bytes) else item
輸出如下
參考網站
python3 __call__方法
python計算時間的兩種方式:time與datetime
python中zipfile模塊實例化解析
python requests提示警告InsecureRequestWarning