前言
因爲煎蛋網的妹子圖加入了反爬手段,網頁中圖片的鏈接已經加密,需要分析 js 才能找出圖片的真實請求地址,所以就用 selenium 對它進行簡單爬取。
導入包
from selenium import webdriver
from bs4 import BeautifulSoup
import urllib.request
import os
模擬谷歌瀏覽器
# Launch one shared Chrome instance (requires chromedriver on PATH); the
# scraping functions below all drive this module-level browser.
# NOTE(review): "broswer" is a typo for "browser" — kept as-is because every
# function in the file references this exact name.
broswer = webdriver.Chrome()
將頁面中的圖片的url放入image_url_list列表
def get_one_page(url):
    """Open *url* in the shared browser and collect the image links on it.

    The site encrypts image links in the raw HTML, so the page must be
    rendered by the browser first; we then read the decrypted DOM.

    Parameters
    ----------
    url : str
        Address of the gallery page to scrape.

    Returns
    -------
    list[str]
        href values of the ``a.view_img_link`` anchors, typically
        protocol-relative (``//host/...``); GIF links are excluded.
    """
    # Let the browser execute the page's JS before reading the source.
    broswer.get(url)
    soup = BeautifulSoup(broswer.page_source, 'lxml')
    hrefs = (anchor.get('href') for anchor in soup.select("a.view_img_link"))
    # The truthiness guard drops anchors without an href: the original
    # str('gif') in str(hf) test let None slip through ("None" has no "gif"),
    # which later crashed the downloader on "http:" + None.
    return [href for href in hrefs if href and 'gif' not in href]
將頁面中的圖片保存到本地
def download_one_page(image_url_list,toPath):
    """Download every image URL in *image_url_list* into directory *toPath*.

    Parameters
    ----------
    image_url_list : list[str]
        Protocol-relative ("//host/...") or absolute image URLs.
    toPath : str
        Existing local directory that receives the files.
    """
    for link in image_url_list:
        # The site emits protocol-relative links; only prepend the scheme
        # when one is actually missing, so absolute URLs are not mangled.
        http_url = link if link.startswith("http") else "http:" + link
        # os.path.split(...)[1] is the file name part (e.g. "abc.jpg");
        # the original called it "extension", which was misleading.
        filename = os.path.split(link)[1]
        path = os.path.join(toPath, filename)
        try:
            print("照片正在下載......", http_url)
            urllib.request.urlretrieve(http_url, filename=path)
        except urllib.error.URLError as e:
            # Log and continue: one bad link must not abort the whole batch.
            print(e)
爬取多頁目標
def main():
    """Scrape gallery pages 38-39 and download their images to disk."""
    toPath = r"E:\python\requests和正則爬取圖片\jiandan"
    urls = ["http://jandan.net/ooxx/page-{}#comments".format(i)
            for i in range(38, 40)]
    try:
        for url in urls:
            download_one_page(get_one_page(url), toPath)
    finally:
        # quit() shuts down the whole driver and its chromedriver process;
        # the original close() only closed the current window, leaking the
        # driver process — and was skipped entirely if an iteration raised.
        broswer.quit()
完整代碼展示
import os
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
from selenium import webdriver
# Launch one shared Chrome instance (requires chromedriver on PATH); the
# scraping functions below all drive this module-level browser.
# NOTE(review): "broswer" is a typo for "browser" — kept as-is because every
# function in the file references this exact name.
broswer = webdriver.Chrome()
def get_one_page(url):
    """Open *url* in the shared browser and collect the image links on it.

    The site encrypts image links in the raw HTML, so the page must be
    rendered by the browser first; we then read the decrypted DOM.

    Parameters
    ----------
    url : str
        Address of the gallery page to scrape.

    Returns
    -------
    list[str]
        href values of the ``a.view_img_link`` anchors, typically
        protocol-relative (``//host/...``); GIF links are excluded.
    """
    # Let the browser execute the page's JS before reading the source.
    broswer.get(url)
    soup = BeautifulSoup(broswer.page_source, 'lxml')
    hrefs = (anchor.get('href') for anchor in soup.select("a.view_img_link"))
    # The truthiness guard drops anchors without an href: the original
    # str('gif') in str(hf) test let None slip through ("None" has no "gif"),
    # which later crashed the downloader on "http:" + None.
    return [href for href in hrefs if href and 'gif' not in href]
def download_one_page(image_url_list,toPath):
    """Download every image URL in *image_url_list* into directory *toPath*.

    Parameters
    ----------
    image_url_list : list[str]
        Protocol-relative ("//host/...") or absolute image URLs.
    toPath : str
        Existing local directory that receives the files.
    """
    for link in image_url_list:
        # The site emits protocol-relative links; only prepend the scheme
        # when one is actually missing, so absolute URLs are not mangled.
        http_url = link if link.startswith("http") else "http:" + link
        # os.path.split(...)[1] is the file name part (e.g. "abc.jpg");
        # the original called it "extension", which was misleading.
        filename = os.path.split(link)[1]
        path = os.path.join(toPath, filename)
        try:
            print("圖片正在下載......", http_url)
            urllib.request.urlretrieve(http_url, filename=path)
        except urllib.error.URLError as e:
            # Log and continue: one bad link must not abort the whole batch.
            print(e)
def main():
    """Scrape gallery pages 38-39 and download their images to disk."""
    toPath = r"E:\python\requests和正則爬取圖片\jiandan"
    urls = ["http://jandan.net/ooxx/page-{}#comments".format(i)
            for i in range(38, 40)]
    try:
        for url in urls:
            download_one_page(get_one_page(url), toPath)
    finally:
        # quit() shuts down the whole driver and its chromedriver process;
        # the original close() only closed the current window, leaking the
        # driver process — and was skipped entirely if an iteration raised.
        broswer.quit()
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()