學習爬蟲不知道第多少天了,今天爬一些你懂的網站,冗餘代碼太多,只用來相互交流
import requests
from lxml import etree
class Sprider():
    """Simple image crawler: fetch a listing page, extract image URLs and
    names from it, and download each image to disk.

    Note: certificate verification is deliberately disabled (verify=False)
    because the target host resets TLS handshakes; the matching
    InsecureRequestWarning is suppressed once here.
    """

    # Shared request headers; reused by every request instead of being
    # re-declared in each method.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    }

    def __init__(self, url):
        self.url = url
        # Silence the InsecureRequestWarning produced by verify=False
        # (the "new ERROR" described below the code).
        requests.packages.urllib3.disable_warnings()

    def sprider(self):
        """Fetch ``self.url`` and return the page source as a str.

        Bug fixes vs. the original:
        - read ``self.url`` instead of the module-level global ``url``
          (the class only worked when run from this exact script);
        - removed a trailing comma that turned ``html`` into a 1-tuple
          of the page text instead of the text itself.
        """
        html = requests.get(self.url, headers=self.HEADERS, verify=False).text
        return html

    def resolving(self, html):
        """Parse the page source and return two parallel lists:
        (image URLs from @data-original, image names from @alt).
        """
        tree = etree.HTML(html)
        content = tree.xpath("//div//a/img/@data-original")
        name = tree.xpath("//div//a/img/@alt")
        return content, name

    def spriderPicture(self, url, name):
        """Download one image from *url* and save it as <name>.jpg.

        The destination directory is hard-coded and must already exist.
        """
        html = requests.get(url, headers=self.HEADERS, verify=False).content
        # NOTE(review): non-raw Windows path works here only because no
        # backslash sequence forms a valid escape; a raw string or pathlib
        # would be safer — left unchanged to preserve the exact path.
        with open("D:\專業學習\Python\pythonGet\爬取javbuff\\"+name+".jpg","wb") as f:
            f.write(html)
        print("ok")
if __name__=="__main__":
"""1.獲取要爬去的url
2.爬指定的url的頁面將返回的源碼做解析找到需要的圖片的url並且返回列表
3.將圖片地址的列表利用for循環傳遞給爬取得函數
"""
url=input("請輸入抓取的地址")
"""將url傳遞給爬蟲模塊"""
sprider=Sprider(url)
html=sprider.sprider()
"""將返回的頁面源代碼傳遞解析函數,且接受兩個返回的列表"""
pictureUrlList,nameList=sprider.resolving(html)
for i in range(0,len(pictureUrlList)):
sprider.spriderPicture(pictureUrlList[i],nameList[i])
print("over")
但是問題出現了,
requests.exceptions.SSLError: HTTPSConnectionPool(host='imgb.xboot.bid', port=443): Max retries exceeded with url: /digital/video/rbb00152/rbb00152ps.jpg (Caused by SSLError(SSLError("bad handshake: SysCallError(10054, 'WSAECONNRESET')")))
所以怎麼解決呢?加上 verify=False,讓證書不再驗證
加上 verify=False 出現了新的 ERROR
InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
InsecureRequestWarning)