簡介
今天學習了Requests與BeautifulSoup的用法,簡單結合網上的示例,做了一個簡單的爬取網站圖片的腳本,還發現了一個不錯的網站,值得推薦給大家(你懂的)。
示例代碼
#!/usr/bin/python3
# -*- coding:UTF-8 -*-
'''
Script Name : downLoadImage.py
Author : svoid
Created : 2015-03-14
Last Modified :
Version : 1.0
Modifications :
Description : 網站爬取圖片
'''
import requests
import threading
from bs4 import BeautifulSoup
import re
"""
Description : 將網頁圖片保存本地
@param imgUrl : 待保存圖片URL
@param imgName : 待保存圖片名稱
@return 無
"""
def saveImage(imgUrl, imgName="default.jpg", dstDir="D:\\persion\\picture\\downimg\\"):
    """Download the image at imgUrl and save it to disk.

    @param imgUrl  : URL of the image to download
    @param imgName : file name to save under (default "default.jpg")
    @param dstDir  : destination directory, must already exist
                     (new parameter; default preserves the old hard-coded path)
    @return None
    """
    response = requests.get(imgUrl, stream=True)
    image = response.content
    print("保存文件" + dstDir + imgName + "\n")
    try:
        # `with` closes the file even on error; the original
        # `finally: jpg.close` was a no-op attribute access and would
        # raise NameError if open() itself had failed.
        with open(dstDir + imgName, "wb") as jpg:
            jpg.write(image)
    except IOError:
        print("IO Error\n")
"""
Description : 開啓多線程執行下載任務
@param filelist:待下載圖片URL列表
@return 無
"""
def downImageViaMutiThread(filelist):
    """Download every image URL in filelist concurrently.

    One thread per URL is created, each running saveImage; all threads
    are started and then joined before returning.

    @param filelist : list of image URLs to download
    @return None
    """
    com_split = re.compile(r'com-')  # hoisted: compile once, not per URL
    task_threads = []
    for url in filelist:
        # build a flat file name from the URL path
        filename = url.replace("/", "-")
        if 'com-' in filename:
            print(filename)
            # keep only the part after the first 'com-'
            filename = com_split.split(filename)[1]
        t = threading.Thread(target=saveImage, args=(url, filename))
        task_threads.append(t)
    # removed unused `count` counter from the original
    for task in task_threads:
        task.start()
    for task in task_threads:
        task.join()
"""
Description : 獲取圖片地址
@param pageUrl : 網頁URL
@return : 圖片地址列表
"""
def getfilelist(pageUrl):
    """Fetch pageUrl and collect the src of every <img> tag on it.

    @param pageUrl : URL of the HTML page to scan
    @return list of src attribute values (an entry may be None when an
            <img> tag has no src attribute)
    """
    web = requests.get(pageUrl)
    # name the parser explicitly: without it BeautifulSoup picks
    # whichever parser happens to be installed, which can change
    # results between machines and emits a warning
    soup = BeautifulSoup(web.text, 'html.parser')
    return [photo.get('src') for photo in soup.find_all('img')]
def getweblist(webUrl):
    """Fetch webUrl and collect the href of every link found inside
    <div class="metaRight"> blocks.

    @param webUrl : URL of the index page to scan
    @return list of href attribute values (an entry may be None when an
            <a> tag has no href attribute)
    """
    web = requests.get(webUrl)
    # explicit parser for deterministic behavior (see getfilelist)
    soup = BeautifulSoup(web.text, 'html.parser')
    return [
        link.get('href')
        for block in soup.find_all('div', {'class': 'metaRight'})
        for link in block.find_all('a')
    ]
if __name__ == "__main__":
    # Crawl the site index, then download every image from each page.
    webUrl = 'http://www.meizitu.com/'
    # renamed from `list`, which shadowed the builtin
    pagelist = getweblist(webUrl)
    for page in pagelist:
        imagelist = getfilelist(page)
        downImageViaMutiThread(imagelist)
後記
程序還是有很多不足的地方,比如網絡連接狀態均未作判斷,圖片也未作很多處理,簡單的保存在本地。
程序運行之後,發現爬取了很多小圖,屌絲也要高標準要求自己不是,又補充了一個小程序刪掉。
#!/usr/bin/python
# -*- coding:UTF-8 -*-
'''
Script Name : deleteSmallImage.py
Author : svoid
Created : 2015-03-14
Last Modified :
Version : 1.0
Modifications :
Description : 文件相關操作
'''
import os
import sys
import stat
import time
def get_dir_file(dirname):
    """Return the full path of every entry directly inside dirname.

    Not recursive: only the first level of the directory is listed.

    @param dirname : directory to list
    @return list of dirname-joined paths
    """
    return [os.path.join(dirname, name) for name in os.listdir(dirname)]
def get_file_size(filename):
    """Return the size of filename in bytes.

    @param filename : path to an existing file
    @return file size in bytes
    @raises OSError if the file does not exist
    """
    # os.path.getsize is the idiomatic equivalent of
    # os.stat(filename)[stat.ST_SIZE]
    return os.path.getsize(filename)
if __name__ == '__main__':
    # Delete every file in WORKDIR that is 50 KB or smaller.
    WORKDIR = 'D:\\persion\\picture\\downimg'
    for path in get_dir_file(WORKDIR):
        # test existence BEFORE stat'ing: the original called
        # get_file_size first, which raises OSError on a missing file,
        # so its exists() check could never take effect
        if os.path.exists(path) and get_file_size(path) <= 1024 * 50:
            os.remove(path)
            print("delete file %s" % (path))
        else:
            # fixed: the original line was missing its closing ')'
            print("file not exists or file is well")
好了,不多說了,又是一個不眠的夜晚。。。