from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup
import urllib.request
import re
import os
def get_html(url):
    """Fetch *url* and return a parsed BeautifulSoup tree, or None on failure.

    HTTPError/URLError from the request are printed and mapped to a None
    return so the caller can skip the page instead of crashing.
    """
    try:
        html = urlopen(url)
    except HTTPError as e:
        print(e)
        return None
    except URLError as e:
        print(e)
        return None
    try:
        bsObj = BeautifulSoup(html, "html.parser")
        return bsObj
    except AttributeError as e:
        print(e)
        return None
    finally:
        # The original leaked the HTTP response; close it once parsing
        # (successful or not) is done so the socket is released.
        html.close()
def get_img(bsObj):
    """Return all <img> tags in *bsObj* whose src is a sinaimg mw600 jpg.

    Matches protocol-relative URLs like //ww1.sinaimg.cn/mw600/<32 hex-ish chars>.jpg.
    """
    # Raw string for the regex, and fix the character class: the original
    # [0-9a-zA-z] accidentally spans the ASCII punctuation between 'Z' and
    # 'a' ([ \ ] ^ _ `), so it matched non-alphanumeric filenames too.
    pattern = re.compile(r"\/\/ww[1-9]\.sinaimg\.cn\/mw600\/[0-9a-zA-Z]{32}\.jpg")
    return bsObj.findAll("img", {"src": pattern})
def save_img(path, img_addrs, i):
    """Download every image tag in *img_addrs* into directory *path*.

    Files are named <i>.jpg, <i+1>.jpg, ... ; returns the next unused
    index so successive calls continue the numbering.
    """
    for tag in img_addrs:
        # os.path.join works whether or not *path* ends with a separator;
        # the original `path + str(i)` wrote prefixed files into the
        # parent directory unless the user typed a trailing slash.
        filename = os.path.join(path, str(i) + '.jpg')
        try:
            # src values are protocol-relative (//ww?.sinaimg.cn/...),
            # so prepend the scheme.
            urllib.request.urlretrieve('http:' + tag["src"], filename)
            print("%d.jpg download success!" % i)
        except (HTTPError, URLError) as e:
            # Also catch URLError so a dead host/DNS failure skips one
            # image instead of aborting the whole run.
            print(e)
        i = i + 1
    return i
if __name__ == "__main__":
while 1 :
path = input("Please input the path:")
try:
os.makedirs(path)
break
except FileExistsError as e:
print(e)
continue
page = int(input("Please input the pages:"))
i = 0
for n in range(1,page + 1):
url = "http://jandan.net/ooxx/page-" + str(2308 - n) + "#comments"
print(url)
html = get_html(url)
img_addrs = get_img(html)
i = save_img(path,img_addrs,i)
# 沒有對IP和post和get處理 簡單的爬蟲
# (No handling of IP throttling or POST/GET differences — a simple crawler.)
# 我在做的時候沒有審清頁面關係 導致抓不到正確圖片 一上午啊!!!
# (I misread the page structure at first, so the correct images could not
#  be fetched — cost a whole morning!)