python項目之 爬蟲爬取煎蛋jandan的妹子圖-上
抓取妹子圖練練手。
網頁url格式
http://jandan.net/ooxx/page-1777#comments
只需改變頁碼1777即可
分析頁面源碼發現妹子圖有兩個
一個是縮略圖
<img src="http://ww1.sinaimg.cn/mw600/4bf31e43jw1f09htnzkh5j20dw0kumz0.jpg" /></p>
另一個是原圖
<a href="http://ww1.sinaimg.cn/large/4bf31e43jw1f09htnzkh5j20dw0kumz0.jpg" target="_blank" class="view_img_link">[查看原圖]</a>
這裏我們抓取原圖,使用class和target這個屬性查找。
最終得到每一頁的TXT文件,下篇是文件合併與圖片存取。
源碼如下
代理ip文件請自行查找:-D
# coding:utf-8
####################################################
# coding by 劉雲飛
####################################################
import requests
import os
import time
import random
from bs4 import BeautifulSoup
import threading
# Base URL of the jandan.net "ooxx" gallery; the page number is appended to it.
url = "http://jandan.net/ooxx/page-"

img_lists = []      # original-image URLs collected across all pages
url_lists = []      # gallery page URLs still to be crawled
not_url_lists = []  # page URLs that failed and should be retried
ips = []            # proxy endpoints loaded from ip2.txt, as "http://host:port"
thread_list = []    # reserved for worker threads (not used in this part)

# Each line of ip2.txt is a bare "host:port" proxy address; prefix the scheme.
with open('ip2.txt', 'r') as f:
    ips = ["http://" + line.strip() for line in f]

# Browser-like headers so the site treats the crawler as a normal visitor.
headers = {
    'Host': 'jandan.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/42.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Referer': 'http://jandan.net/ooxx/',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
}

# Enumerate the gallery pages to crawl (pages 1530 through 1882 inclusive).
url_lists = [url + str(i) + '#comments' for i in range(1530, 1883)]
def writeToTxt(name, lines):
    """Write each string in *lines* to the file *name*, one per line.

    Args:
        name: path of the output text file; it is created or truncated.
        lines: iterable of strings (here: image URLs) to write.
    """
    # Renamed the second parameter from 'list' (shadowed the builtin) and use
    # plain 'w' — the read capability of the original 'w+' was never used.
    with open(name, 'w') as f:
        f.writelines(item + "\n" for item in lines)
def get_img_url(url):
    """Fetch one gallery page through a random proxy and collect image URLs.

    Extracts every original-image link (the <a target="_blank"
    class="view_img_link"> anchors) from the page, appends each href to the
    global img_lists, and writes the page's links to "<page>.txt" via
    writeToTxt.  On a non-200 response or any request/parse failure, the page
    URL is appended to not_url_lists for a later retry.

    Args:
        url: full page URL, e.g. "http://jandan.net/ooxx/page-1777#comments".
    """
    single_ip_addr = random.choice(ips)
    lists_tmp = []
    # Parse the page number out of the URL text instead of the original
    # fixed-offset slice url[28:32], which only worked for 4-digit pages.
    page = int(url.split('page-')[1].split('#')[0])
    filename = str(page) + ".txt"
    proxies = {'http': single_ip_addr}
    try:
        res = requests.get(url, headers=headers, proxies=proxies)
        print(res.status_code)
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, 'lxml')
            # Original-size images carry class="view_img_link"; the inline
            # thumbnails do not, so they are skipped automatically.
            results = soup.find_all("a", target="_blank", class_="view_img_link")
            for img in results:
                lists_tmp.append(img['href'])
                # BUG FIX: the original appended to url_lists here, which left
                # img_lists permanently empty AND mutated the very list the
                # caller is iterating over; the href belongs in img_lists.
                img_lists.append(img['href'])
            print(url + " --->>>>抓取完畢!!")
            writeToTxt(filename, lists_tmp)
        else:
            not_url_lists.append(url)
            print("not ok")
    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit are
        # no longer swallowed; any request or parse error marks a retry.
        not_url_lists.append(url)
        print("not ok")
# Crawl every page.  A page whose "<page>.txt" already exists was finished on
# a previous run, so the script can resume after an interruption.
for url in url_lists:
    # Parse the page number from the URL text; the original fixed-offset
    # slice url[28:32] would break for pages outside the 4-digit range.
    page = int(url.split('page-')[1].split('#')[0])
    filename = str(page) + ".txt"
    if os.path.exists(filename):
        print(url + " is pass")
    else:
        # time.sleep(1)
        get_img_url(url)

# Dump every collected original-image URL into a single summary file.
print(img_lists)
with open("img_url.txt", 'w+') as f:
    for url in img_lists:
        f.write(url + "\n")

print("共有 " + str(len(img_lists)) + " 張圖片。")
print("all done!!!")

# Record the pages that could not be fetched so they can be retried later.
with open("not_url_lists.txt", 'w+') as f:
    for url in not_url_lists:
        f.write(url + "\n")