使用 Python 抓取網頁內容，並下載網頁內容中的圖片。
使用 urllib 庫抓取網頁內容；該庫是 Python 自帶的標準庫，不需要另外安裝。
由於我這邊抓取的是國外的網站，所以用到了代理，並且設置了 User-Agent，以防止網站過濾掉沒有 User-Agent 的請求。
爬蟲類代碼Scrapy.py:
import os
import urllib.parse
import urllib.request
class Scrapy:
    """Small urllib-based fetcher that sends HTTP traffic through a proxy.

    ``getHtml`` returns a page body as text and ``downImg`` saves a remote
    file to disk. Both rebuild the proxy opener on every call (so a changed
    ``proxy_host`` takes effect) and install it process-wide via
    ``urllib.request.install_opener``.
    """

    # Placeholders; setProxy() replaces them with a ProxyHandler and an
    # OpenerDirector respectively.
    proxy = ''
    opener = ''
    # "host:port" of the proxy used for the 'http' scheme.
    proxy_host = '192.168.88.17:8333'
    user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36'
    # Form fields that getHtml() url-encodes and sends as the request body.
    values = {'test': 1}
    # Placeholder; __init__ replaces it with a per-instance header dict.
    headers = ''

    def __init__(self):
        """Build the browser-like header set stored on ``self.headers``.

        NOTE(review): no method visible in this class sends this dict;
        getHtml() and downImg() only send the User-Agent header.
        """
        self.headers = {
            'User-Agent': self.user_agent,
            'Host': 'mangakakalot.com',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Proxy-Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    def setProxy(self):
        """Create the proxy handler and opener from ``self.proxy_host``.

        Only the 'http' scheme is mapped to the proxy.
        """
        self.proxy = urllib.request.ProxyHandler({'http': self.proxy_host})
        self.opener = urllib.request.build_opener(
            self.proxy, urllib.request.HTTPHandler
        )

    def getHtml(self, url):
        """Fetch ``url`` and return the response body decoded as UTF-8.

        ``self.values`` is url-encoded and sent as the request body; for
        HTTP URLs urllib therefore issues a POST rather than a GET.

        Raises:
            urllib.error.URLError: if the request fails.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        self.setProxy()
        # urlopen() below uses whichever opener is installed globally.
        urllib.request.install_opener(self.opener)
        payload = urllib.parse.urlencode(self.values).encode(encoding='UTF8')
        req = urllib.request.Request(
            url, payload, {'User-Agent': self.user_agent}
        )
        # Close the response deterministically instead of leaking the socket.
        with urllib.request.urlopen(req) as response:
            return response.read().decode('utf8')

    def downImg(self, url, folder, filename):
        """Download ``url`` and save it as ``filename`` inside ``folder``.

        ``folder`` is treated as a directory: it may be given with or
        without a trailing separator and is created if it does not exist.
        An empty ``folder`` saves relative to the current directory.

        Raises:
            urllib.error.URLError: if the download fails.
            OSError: if the directory or file cannot be written.
        """
        self.setProxy()
        self.opener.addheaders = [('User-Agent', self.user_agent)]
        # urlretrieve() uses whichever opener is installed globally.
        urllib.request.install_opener(self.opener)
        if folder:
            os.makedirs(folder, exist_ok=True)
        # os.path.join avoids the "dirfile.jpg" path that plain concatenation
        # produced when folder lacked a trailing separator.
        urllib.request.urlretrieve(url, os.path.join(folder, filename))
調用例子:
# -*- coding: UTF-8 -*-
# Usage example: fetch one page and download one image with Scrapy.
from Scrapy import Scrapy

# Placeholder addresses and output location; substitute real values.
PAGE_URL = 'http://xxxxxxxxxx'
IMAGE_URL = 'http://xxxxxxxxxx.jpg'
IMAGE_DIR = './data/pic/'
IMAGE_NAME = 'xx.jpg'

scrapy = Scrapy()

# Fetch the page content.
html = scrapy.getHtml(PAGE_URL)

# Download the image, saved as IMAGE_NAME under IMAGE_DIR.
scrapy.downImg(IMAGE_URL, IMAGE_DIR, IMAGE_NAME)