使用 Python 的 urllib.request 與 lxml.etree 爬取百度貼吧的圖片並存儲到本地,源代碼如下:
import re
import time
import urllib.parse
import urllib.request

from lxml import etree
# ------ Fetch the raw source of a web page ------
def getHtml(url):
    """Download *url* and return the response body as raw bytes.

    The request is sent with an explicit ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The undecoded response body, exactly as read from the response.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for a failed request
        (for example ``urllib.error.URLError``); nothing is caught here.
    """
    # The header value must not repeat the header name itself.
    headers = {'User-Agent': 'Mozilla/5.0'}
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(request) as response:
        return response.read()
# ------ Fetch the page to scan; the URL of any post can be used here ------
page_url = 'https://tieba.baidu.com/index.html'
html = getHtml(page_url)
# ------ Decoding the result as UTF-8 is left disabled, so html keeps ------
# ------ exactly the value that getHtml() returned ------
# ------ Collect every <img> element of a page ------
def getImg(html):
    """Return all ``<img>`` elements found in *html*.

    Args:
        html: Page source to parse (the script passes the value returned
            by ``getHtml``).

    Returns:
        A list with one parsed element per ``<img>`` tag, in the order
        returned by the ``//img`` XPath query. The list is empty when
        *html* is empty or when parsing yields no document.
    """
    # Nothing to parse: avoid handing an empty document to the parser.
    if not html:
        return []
    tree = etree.HTML(html)
    # The parser can return no root element; there is nothing to query then.
    if tree is None:
        return []
    return tree.xpath('//img')
# ------ Download every image found on the page into D:\Temp ------
imgList = getImg(html)
# Running count of <img> elements handled; nothing in the visible script reads it.
imgNamenum = 0
for one in imgList:
    # NOTE: images are fetched one at a time; the original author suggested
    # multithreading as a possible improvement.
    imgNamenum += 1
    imgPath = one.get('src')
    if not imgPath:
        # <img> without a usable src: report it and move on instead of
        # failing on a None value further down.
        print(str(imgPath) + " error")
        continue
    try:
        # urljoin leaves absolute URLs untouched and resolves relative,
        # root-relative ("/path") and protocol-relative ("//host/path")
        # ones against the site root.
        imgPath = urllib.parse.urljoin('https://tieba.baidu.com/', imgPath)
        # Name the file after the alt text, replacing characters that are
        # not allowed in Windows file names.
        imgName = re.sub(r'[\\/:*?"<>|]', '_', (one.get('alt') or '').strip())
        if not imgName:
            # No usable alt text: fall back to a timestamp so images do
            # not all end up in the same ".jpg" file.
            imgName = str(time.time())
        # Download before opening the target file so a failed request
        # does not leave an empty file behind.
        with urllib.request.urlopen(imgPath) as response:
            data = response.read()
        with open('D:\\Temp\\' + imgName + ".jpg", 'wb') as f:
            f.write(data)
        print(imgPath)
        # Short pause between consecutive requests.
        time.sleep(0.1)
    except Exception:
        # Best effort: one failing image must not abort the remaining ones.
        print(imgPath + " error")
print("All Done!")
結果如下:
注:本文僅用於技術交流,不得用於商業用途。如有違反,一切後果與本文作者無關。