1. 單線程版
import requests
from lxml import etree
import time
import re
class HaiBaoSpider():
    """Single-threaded crawler that downloads poster images from 818ps.com.

    Walks the poster-template listing pages one by one, extracting each
    poster's title and image URL, and saves every image into
    ``./data/海報圖片爬蟲/`` (the directory must already exist).
    """

    def __init__(self):
        # Browser-like UA so the site serves the normal listing markup.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }
        # First listing page of the poster category.
        self.start_url = "https://818ps.com/muban/haibao.html?user_source=r44926&bd_vid=9395005922550959355&sdclkid=b5gpA5fs15eG15eG"

    def get_html(self, url):
        """Fetch *url* and return its body decoded as UTF-8, or None on a non-200 status."""
        response = requests.get(url, headers=self.headers)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None

    def parse_page(self, html_str, flag=1):
        """Parse one listing page.

        Returns ``(data, next_url)`` where *data* is a list of
        ``{"img_name", "img_url"}`` dicts and *next_url* is the next listing
        page, or None when no pagination link is present (last page).

        flag=0 picks the first ``toppage`` link (the start page has no
        "previous" link, so index 0 is "next"); flag=1 picks the second.
        """
        html = etree.HTML(html_str)
        data = []
        for img in html.xpath("//img[@class='lazy']"):
            item = {}
            item["img_name"] = img.xpath("./@alt")[0]
            # img-original holds a protocol-relative URL ("//..."): prefix the scheme.
            item["img_url"] = "https:" + img.xpath("./@img-original")[0]
            data.append(item)
        # Guard against IndexError on the final page, where the pagination
        # block may be missing; returning None ends the run() loop cleanly.
        links = html.xpath("//li[@class='toppage']/a/@href")
        index = 0 if flag == 0 else 1
        next_url = ("https://818ps.com" + links[index]) if len(links) > index else None
        print(next_url)
        return data, next_url

    def save_img(self, title, url):
        """Download *url* and write it as <CJK-only title><extension> under ./data/海報圖片爬蟲/."""
        # Keep only CJK characters so the title is a safe file name.
        title = re.sub(r"[^\u4E00-\u9FFF]", "", title)
        # URLs look like "....png?v=..." — grab ".xxx" before the query marker.
        # NOTE: the original class [\?|!] also matched a literal '|' — unintended.
        match = re.search(r"\.[a-z]{3}[?!]", url)
        if match is None:
            # Unrecognized extension: skip instead of crashing on .group().
            print("跳過無法識別的圖片地址: " + url)
            return
        file_name = title + match.group()[:4]
        # `with` guarantees the handle is closed even if the download raises.
        with open("./data/海報圖片爬蟲/" + file_name, "wb") as fp:
            fp.write(requests.get(url).content)
        print(file_name + "寫入成功...")

    def run(self):
        """Crawl from the start page, saving every poster, until pagination ends."""
        html_str = self.get_html(self.start_url)
        data, next_url = self.parse_page(html_str, flag=0)
        for d in data:
            self.save_img(d["img_name"], d["img_url"])
        while next_url:
            html_str = self.get_html(next_url)
            data, next_url = self.parse_page(html_str)
            for d in data:
                self.save_img(d["img_name"], d["img_url"])
if __name__ == '__main__':
    # Build the spider and start the crawl in one expression.
    HaiBaoSpider().run()
2. 多線程版
import requests
from lxml import etree
import time
import re
import threading
import queue
class HaiBaoSpider():
    """Multi-threaded poster crawler for 818ps.com.

    Producer/consumer pipeline over three bounded queues:
    listing-page URLs -> listing-page HTML -> image (title, url) items.
    Images are saved into ``./data/海報圖片爬蟲/`` (directory must exist).
    """

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }
        self.start_url = "https://818ps.com/muban/haibao.html?user_source=r44926&bd_vid=9395005922550959355&sdclkid=b5gpA5fs15eG15eG"
        # Bounded queues give back-pressure between pipeline stages.
        self.list_url = queue.Queue(300)         # listing-page URLs
        self.html_text = queue.Queue(300)        # raw listing-page HTML
        self.img_urlAndTitle = queue.Queue(2000) # per-image work items

    def get_html(self, url):
        """Fetch *url* and return its body decoded as UTF-8, or None on a non-200 status."""
        response = requests.get(url, headers=self.headers)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None

    def get_next_url(self, html_str, flag=1):
        """Return the next listing-page URL, or None when pagination ends.

        flag=0 picks the first ``toppage`` link (start page has no "previous"
        link); flag=1 picks the second.
        """
        html = etree.HTML(html_str)
        links = html.xpath("//li[@class='toppage']/a/@href")
        index = 0 if flag == 0 else 1
        if len(links) <= index:
            # Last page: no next link — ends the producer loop cleanly
            # instead of raising IndexError.
            return None
        return "https://818ps.com" + links[index]

    def get_list_url(self):
        """Producer: walk the serial pagination chain, enqueuing every listing URL."""
        # Enqueue the start page itself so its posters are downloaded too
        # (the original skipped it, unlike the single-threaded version).
        self.list_url.put(self.start_url)
        html_str = self.get_html(self.start_url)
        next_url = self.get_next_url(html_str, flag=0)
        while next_url:
            self.list_url.put(next_url)
            html_str = self.get_html(next_url)
            next_url = self.get_next_url(html_str, flag=1)

    def get_list_html(self):
        """Consumer/producer: fetch each queued listing URL and queue its HTML."""
        while True:
            url = self.list_url.get()
            list_html = self.get_html(url)
            self.html_text.put(list_html)
            self.list_url.task_done()

    def get_img_urlAndTitle(self):
        """Consumer/producer: parse listing HTML into per-image work items."""
        while True:
            html_text = self.html_text.get()
            html = etree.HTML(html_text)
            for img in html.xpath("//img[@class='lazy']"):
                item = {}
                item["img_name"] = img.xpath("./@alt")[0]
                # img-original is protocol-relative ("//..."): prefix the scheme.
                item["img_url"] = "https:" + img.xpath("./@img-original")[0]
                self.img_urlAndTitle.put(item)
            self.html_text.task_done()

    def save_imgs(self):
        """Consumer: download each image and save it named by its CJK title."""
        while True:
            img_item = self.img_urlAndTitle.get()
            # Keep only CJK characters so the title is a safe file name.
            title = re.sub(r"[^\u4E00-\u9FFF]", "", img_item["img_name"])
            # NOTE: the original class [\?|!] also matched a literal '|' — unintended.
            match = re.search(r"\.[a-z]{3}[?!]", img_item["img_url"])
            if match is None:
                # Unrecognized extension: skip instead of crashing on .group().
                print("跳過無法識別的圖片地址: " + img_item["img_url"])
                self.img_urlAndTitle.task_done()
                continue
            file_name = title + match.group()[:4]
            # `with` guarantees the handle is closed even if the download raises.
            with open("./data/海報圖片爬蟲/" + file_name, "wb") as fp:
                fp.write(requests.get(img_item["img_url"]).content)
            print(file_name + "寫入成功...")
            self.img_urlAndTitle.task_done()

    def run(self):
        """Start all pipeline threads and wait until every queue is drained."""
        thread_list = []
        # Pagination is a serial chain (each page links only to the next), so
        # exactly ONE producer thread must walk it. The original started 3
        # such threads, each re-crawling the whole chain, so every page was
        # enqueued ~3 times and every image downloaded ~3 times.
        thread_list.append(threading.Thread(target=self.get_list_url))
        # 5 threads fetch listing-page HTML (network-bound).
        for _ in range(5):
            thread_list.append(threading.Thread(target=self.get_list_html))
        # Parsing does no network I/O, so one thread suffices.
        thread_list.append(threading.Thread(target=self.get_img_urlAndTitle))
        # 10 threads download and save images (network-bound).
        for _ in range(10):
            thread_list.append(threading.Thread(target=self.save_imgs))
        for t in thread_list:
            # Daemon threads die with the main thread; the deprecated
            # setDaemon() call is replaced by the `daemon` attribute.
            t.daemon = True
            t.start()
        # Give the producer a moment to prime the queues before join()ing,
        # otherwise the joins could all succeed on still-empty queues.
        time.sleep(1)
        # Block until every queue has been fully processed (task_done'd).
        self.list_url.join()
        self.html_text.join()
        self.img_urlAndTitle.join()
if __name__ == '__main__':
    # Build the spider and start the pipeline in one expression.
    HaiBaoSpider().run()
爬取結果如下:
卡通立夏節氣動態海報.png寫入成功…
卡通風格預防接種停診通知宣傳海報.jpg寫入成功…
創意扁平風青年節動態海報.png寫入成功…
廢棄口罩處理方式垃圾分類黃色卡通手機海報.jpg寫入成功…
廢棄口罩處理方式垃圾分類橙色卡通手機海報.jpg寫入成功…
行動表達愛母親節對話手繪海報.jpg寫入成功…
簡約風旅遊出行橫板海報.jpg寫入成功…
母親節花式曬單宣傳手機海報.jpg寫入成功…
簡約創意五月你好日籤海報.png寫入成功…
簡約創意五四節超市促銷橫版海報.jpg寫入成功…
手繪風五一音樂趴樂器演奏豎版海報.jpg寫入成功…
簡約風醫護人員招聘手機海報.jpg寫入成功…
Process finished with exit code -1
寫入圖片如下: