Python網絡爬蟲(4)煎蛋網妹子圖片抓取

from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup
import urllib.request
import re
import os

def get_html(url):
    """Fetch *url* and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend,
        or ``None`` when the request fails or parsing raises
        ``AttributeError`` (the error is printed in both cases).
    """
    try:
        response = urlopen(url)
    except URLError as e:
        # HTTPError is a subclass of URLError, so this single handler
        # covers both HTTP status errors and connection-level failures.
        print(e)
        return None
    # Parse inside the ``with`` so the response is closed once it has been
    # read, instead of leaving the connection open.
    with response:
        try:
            return BeautifulSoup(response, "html.parser")
        except AttributeError as e:
            print(e)
            return None
    
# Protocol-relative Sina image URLs of the form
# //ww<1-9>.sinaimg.cn/mw600/<32 alphanumeric characters>.jpg
# Raw string avoids invalid-escape warnings; "A-Z" (not "A-z") keeps the
# character class strictly alphanumeric.
IMG_SRC_PATTERN = re.compile(
    r"//ww[1-9]\.sinaimg\.cn/mw600/[0-9a-zA-Z]{32}\.jpg"
)


def get_img(bsObj):
    """Collect the ``<img>`` tags whose ``src`` matches IMG_SRC_PATTERN.

    Args:
        bsObj: Parsed page as returned by ``get_html``; may be ``None``
            when the page could not be fetched.

    Returns:
        The result of ``bsObj.findAll`` for matching ``<img>`` tags, or an
        empty list when ``bsObj`` is ``None``.
    """
    if bsObj is None:
        return []
    return bsObj.findAll("img", {"src": IMG_SRC_PATTERN})

def save_img(path,img_addrs,i):
    """Download every image in *img_addrs* into the directory *path*.

    Files are named ``<i>.jpg``, with ``i`` incremented after every
    attempt, whether or not the download succeeded.

    Args:
        path: Directory that receives the downloaded files.
        img_addrs: Iterable of items whose ``["src"]`` is a
            protocol-relative image URL (``//host/...``).
        i: Number used for the first file name.

    Returns:
        The next unused file number, to be passed to the following call.
    """
    for each in img_addrs:
        # os.path.join inserts the separator, so the file lands inside
        # *path* even when the caller did not end it with a slash.
        filename = os.path.join(path, str(i) + '.jpg')
        try:
            urllib.request.urlretrieve('http:' + each["src"], filename)
            print("%d.jpg download success!" % i)
        except URLError as e:
            # URLError also covers HTTPError and ContentTooShortError, so
            # one failed download no longer aborts the whole run.
            print(e)
        i = i + 1
    return i

if __name__ == "__main__":
    # Keep asking until a directory can be freshly created. OSError covers
    # FileExistsError as well as other failures such as an empty path.
    while True:
        path = input("Please input the path:")
        try:
            os.makedirs(path)
            break
        except OSError as e:
            print(e)

    # Re-prompt instead of crashing when the page count is not an integer.
    while True:
        try:
            page = int(input("Please input the pages:"))
            break
        except ValueError as e:
            print(e)

    # Page URLs count down from this hard-coded number; the first page
    # fetched is start_page - 1.
    start_page = 2308
    i = 0
    for n in range(1, page + 1):
        page_number = start_page - n
        if page_number < 1:
            # Stop rather than request zero or negative page numbers.
            break
        url = "http://jandan.net/ooxx/page-" + str(page_number) + "#comments"
        print(url)
        html = get_html(url)
        if html is None:
            # The fetch failed and was already reported; try the next page.
            continue
        img_addrs = get_img(html)
        i = save_img(path, img_addrs, i)





這只是一個簡單的爬蟲,沒有對 IP、POST 和 GET 做任何處理。

我在做的時候沒有看清頁面之間的關係,導致抓不到正確的圖片,浪費了一上午啊!!!

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章