爬取說明
爬取的是小黃人(xiaohuangren)的圖片並保存到本地
使用模塊主要作用說明
import requests
from urllib.request import urlretrieve
import re
import os
requests:發送HTTP請求,獲取網頁內容和圖片數據
urlretrieve:保存下載的圖片
os:判斷文件目錄是否存在和文件目錄的創建
re:正則模塊,查找需要的內容
代碼解釋
設置了請求頭:
# Gallery page that lists the images to download.
url = "http://www.ivsky.com/tupian/xiaohuangren_t21343/"

# Headers sent with the page request: a desktop Safari User-Agent, the
# gallery page itself as Referer, and a keep-alive connection.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) "
        "AppleWebKit/534.50 (KHTML, like Gecko) "
        "Version/5.1 Safari/534.50"
    ),
    "Referer": url,
    "Connection": "Keep-alive",
}
提交請求:
# Fetch the gallery page. The timeout keeps the script from waiting forever
# on an unresponsive server, and raise_for_status() stops on an HTTP error
# instead of handing an error page to the regex step.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
# Decoded HTML of the page; this is what the regex is run against.
s = response.text
使用re匹配需要的數據:
# Capture the src of the <img> that follows each <div class="il_img">.
# The dot before "jpg" is escaped so that only a literal ".jpg" matches.
pattern = r'<div class="il_img".*?<img src="(.*?\.jpg)" width'
pa = re.compile(pattern)
# List of image URLs found in the page HTML.
uls = pa.findall(s)
使用urlretrieve保存圖片:
for item in uls:
    # Example item:
    # http://img.ivsky.com/img/tupian/t/201411/01/xiaohuangren-004.jpg
    # The group captures "/<name>.jpg" that follows a two-digit path segment.
    match = re.search(r"/[0-9]{2}(/.*?\.jpg)", item)
    if match is None:
        # URL does not have the expected shape; skip it instead of crashing.
        print('skip (unexpected URL): %s' % item)
        continue
    path = '/root/python/python/taobao%s' % match.group(1)
    # Create the target directory, including any missing parents.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    print(path)
    urlretrieve(item, path)
使用文件流保存圖片:
for item in uls:
    # The group captures "/<name>.jpg" that follows a two-digit path segment.
    match = re.search(r"/[0-9]{2}(/.*?\.jpg)", item)
    if match is None:
        # URL does not have the expected shape; skip it instead of crashing.
        print('skip (unexpected URL): %s' % item)
        continue
    path = '/root/python/python/taobao%s' % match.group(1)
    # open() does not create directories, so make sure the target exists.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    response = requests.get(item, timeout=10)
    # Fail here rather than saving an HTTP error body as a .jpg file.
    response.raise_for_status()
    imgedata = response.content
    print(path)
    with open(path, "wb") as f:
        f.write(imgedata)
總結:圖片有兩種保存方式(urlretrieve 與文件流),其中文件流方式比 urlretrieve 快
完整代碼
#coding:utf-8
import requests
from urllib.request import urlretrieve
import re
import os
# Gallery page that lists the images to download.
url = "http://www.ivsky.com/tupian/xiaohuangren_t21343/"

# Headers sent with the gallery page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Referer': url,
    'Connection': 'Keep-alive'
}

# Directory prefix the images are written under.
SAVE_ROOT = '/root/python/python/taobao'

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Captures the src of the <img> that follows each <div class="il_img">.
# The dot before "jpg" is escaped so that only a literal ".jpg" matches.
pattern = r'<div class="il_img".*?<img src="(.*?\.jpg)" width'
pa = re.compile(pattern)

# Captures "/<name>.jpg" that follows a two-digit path segment, e.g.
# ".../201411/01/xiaohuangren-004.jpg" -> "/xiaohuangren-004.jpg".
NAME_PATTERN = re.compile(r"/[0-9]{2}(/.*?\.jpg)")


def extract_image_urls(html):
    """Return the list of image URLs matched by ``pa`` in *html*."""
    return pa.findall(html)


def local_path_for(image_url, root=SAVE_ROOT):
    """Return the local file path for *image_url*, or None if it is unexpected.

    The path is *root* followed by the "/<name>.jpg" part of the URL that
    comes after a two-digit path segment. None is returned when the URL
    does not contain such a part.
    """
    match = NAME_PATTERN.search(image_url)
    if match is None:
        return None
    return '%s%s' % (root, match.group(1))


def save_with_urlretrieve(image_url, path):
    """Download *image_url* to *path* using urllib's urlretrieve."""
    urlretrieve(image_url, path)


def save_with_stream(image_url, path):
    """Download *image_url* with requests and write the bytes to *path*.

    Raises requests.HTTPError on an HTTP error status, so an error page is
    never written to disk as a .jpg file.
    """
    response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    with open(path, "wb") as f:
        f.write(response.content)


def main():
    """Fetch the gallery page and save every image it lists under SAVE_ROOT."""
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    for item in extract_image_urls(response.text):
        path = local_path_for(item)
        if path is None:
            # URL does not have the expected shape; skip it instead of crashing.
            print('skip (unexpected URL): %s' % item)
            continue
        # open() does not create directories, so make sure the target exists.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        print(path)
        # save_with_urlretrieve(item, path) is the urllib-based alternative.
        save_with_stream(item, path)


if __name__ == "__main__":
    main()