Python project: a crawler for jandan.net's girl pics - Part 1

Scraping these pictures makes for good practice.

Page URL format

http://jandan.net/ooxx/page-1777#comment

Just change the page number (1777 here) to move to a different page.
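
Sweeping a range of pages therefore just means formatting the number into the URL. A minimal sketch (the page range here is arbitrary):

# Build page URLs for an arbitrary range of pages.
base = "http://jandan.net/ooxx/page-"
page_urls = [base + str(n) + "#comment" for n in range(1770, 1780)]
print(page_urls[0])  # http://jandan.net/ooxx/page-1770#comment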

Inspecting the page source shows each picture appears twice.

One is the thumbnail:

<img src="http://ww1.sinaimg.cn/mw600/4bf31e43jw1f09htnzkh5j20dw0kumz0.jpg" /></p>

The other is the full-size original:

<a href="http://ww1.sinaimg.cn/large/4bf31e43jw1f09htnzkh5j20dw0kumz0.jpg" target="_blank" class="view_img_link">[查看原圖]</a>

Here we grab the originals, locating their links by the class and target attributes.
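
As a quick sanity check, here is a minimal, self-contained sketch of that lookup against a fragment like the one above:

# Extract full-size image links from an HTML fragment.
from bs4 import BeautifulSoup

html = ('<a href="http://ww1.sinaimg.cn/large/4bf31e43jw1f09htnzkh5j20dw0kumz0.jpg" '
        'target="_blank" class="view_img_link">[查看原圖]</a>')
soup = BeautifulSoup(html, 'lxml')
for a in soup.find_all("a", target="_blank", class_="view_img_link"):
    print(a['href'])  # http://ww1.sinaimg.cn/large/...jpg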

The result is one TXT file of image URLs per page; the next post covers merging those files and downloading the images.

The source code is as follows.

Please find a proxy IP file yourself :-D
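
For reference, the loading code below assumes ip2.txt holds one proxy per line in host:port form. Example contents (these addresses are placeholders, not working proxies):

121.40.108.76:80
110.73.2.224:8123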

# coding:utf-8
####################################################
# coding by 劉雲飛
####################################################

import os
import random
import time

import requests
from bs4 import BeautifulSoup

base_url = "http://jandan.net/ooxx/page-"
img_lists = []      # full-size image URLs collected across all pages
url_lists = []      # page URLs to scrape
not_url_lists = []  # pages that failed and should be retried
ips = []            # proxy pool loaded from ip2.txt

# Load proxy addresses from ip2.txt, one per line, as "http://host:port".
with open('ip2.txt', 'r') as f:
    for line in f:
        ips.append("http://" + line.strip())

headers = {
    'Host': 'jandan.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/42.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Referer': 'http://jandan.net/ooxx/',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
}

# Build the list of page URLs to scrape (pages 1530 through 1882).
for i in range(1530, 1883):
    url_lists.append(base_url + str(i) + '#comments')


def writeToTxt(name, urls):
    # Write one URL per line to the given text file.
    with open(name, 'w+') as f:
        for url_one in urls:
            f.write(url_one + "\n")


def get_img_url(url):
    single_ip_addr = random.choice(ips)  # pick a random proxy for each request
    lists_tmp = []
    # The page number sits right after ".../page-", 28 characters into the URL.
    page = int(url[28:32])
    filename = str(page) + ".txt"
    proxies = {'http': single_ip_addr}
    try:
        res = requests.get(url, headers=headers, proxies=proxies)
        print(res.status_code)
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, 'lxml')
            # Full-size links carry target="_blank" and class="view_img_link".
            results = soup.find_all("a", target="_blank", class_="view_img_link")
            for img in results:
                lists_tmp.append(img['href'])
                img_lists.append(img['href'])  # collect globally for img_url.txt
            print(url + "  --->>>> done!!")
            writeToTxt(filename, lists_tmp)
        else:
            not_url_lists.append(url)
            print("not ok")
    except Exception:
        not_url_lists.append(url)
        print("not ok")


# Scrape every page, skipping any page whose TXT file already exists.
for url in url_lists:
    page = int(url[28:32])
    filename = str(page) + ".txt"
    if os.path.exists(filename):
        print(url + "   already scraped, skipping")
    else:
        # time.sleep(1)  # uncomment to throttle requests
        get_img_url(url)

print(img_lists)

with open("img_url.txt", 'w+') as f:
    for url in img_lists:
        f.write(url + "\n")

print("共有 " + str(len(img_lists)) + " 張圖片。")
print("all done!!!")

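# Record the pages that failed so they can be retried later.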
with open("not_url_lists.txt", 'w+') as f:
    for url in not_url_lists:
        f.write(url + "\n")
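
Since the failed pages end up in not_url_lists.txt, a simple follow-up is to re-run just those pages. This is a sketch, not part of the original script; it assumes it is appended to the same file so get_img_url is in scope:

# Hypothetical retry helper: re-scrape pages recorded in not_url_lists.txt.
def retry_failed(path="not_url_lists.txt"):
    with open(path) as f:
        failed = [line.strip() for line in f if line.strip()]
    for page_url in failed:
        get_img_url(page_url)

# retry_failed()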