简介
今天学习了Requests与BeautifulSoup的用法,简单结合网上的示例,做了一个简单的爬取网站图片的脚本,还发现了一个不错的网站,值得推荐给大家(你懂的)。
示例代码
# -*- coding:UTF-8 -*-
#!/usr/bin/python3
'''
Script Name : downLoadImage.py
Author : svoid
Created : 2015-03-14
Last Modified :
Version : 1.0
Modifications :
Description : 网站爬取图片
'''
import requests
import threading
from bs4 import BeautifulSoup
import re
"""
Description : 将网页图片保存本地
@param imgUrl : 待保存图片URL
@param imgName : 待保存图片名称
@return 无
"""
def saveImage( imgUrl,imgName ="default.jpg" ):
response = requests.get(imgUrl, stream=True)
image = response.content
DstDir="D:\\persion\\picture\\downimg\\"
print("保存文件"+DstDir+imgName+"\n")
try:
with open(DstDir+imgName ,"wb") as jpg:
jpg.write( image)
return
except IOError:
print("IO Error\n")
return
finally:
jpg.close
"""
Description : 开启多线程执行下载任务
@param filelist:待下载图片URL列表
@return 无
"""
def downImageViaMutiThread( filelist ):
task_threads=[] #存储线程
count=1
for file in filelist:
filename = file.replace("/","-")
if 'com-' in filename:
p = re.compile(r'com-')
print(filename)
filename = p.split(filename)[1]
t = threading.Thread(target=saveImage,args=(file,filename))
count = count+1
task_threads.append(t)
for task in task_threads:
task.start()
for task in task_threads:
task.join()
"""
Description : 获取图片地址
@param pageUrl : 网页URL
@return : 图片地址列表
"""
def getfilelist(pageUrl):
web = requests.get(pageUrl)
soup = BeautifulSoup(web.text)
filelist=[]
# for photo in soup.find_all('img',{'class':'scrollLoading'}):
for photo in soup.find_all('img'):
filelist.append(photo.get('src'))
# filelist.append(photo.get('data-original'))
return filelist
def getweblist(webUrl):
    """Fetch webUrl and return the href of every link inside div.metaRight.

    @param webUrl : URL of the index page to scan
    @return list of page URLs (an entry is None when an <a> has no href)
    """
    web = requests.get(webUrl)
    # Explicit parser for deterministic results (same fix as getfilelist).
    soup = BeautifulSoup(web.text, 'html.parser')
    return [
        link.get('href')
        for pagelist in soup.find_all('div', {'class': 'metaRight'})
        for link in pagelist.find_all('a')
    ]
if __name__ == "__main__":
webUrl = 'http://www.meizitu.com/'
list = getweblist(webUrl)
for page in list:
imagelist=getfilelist(page)
downImageViaMutiThread(imagelist)
后记
程序还是有很多不足的地方,比如网络连接状态均未作判断,图片也未作很多处理,简单的保存在本地。
程序运行之后,发现爬取了很多小图,屌丝也要高标准要求自己不是,又补充了一个小程序删掉。
# -*- coding:UTF-8 -*-
#!/usr/bin/python
'''
Script Name : deleteSmallImage.py
Author : svoid
Created : 2015-03-14
Last Modified :
Version : 1.0
Modifications :
Description : 文件相关操作
'''
import os
import sys
import stat
import time
def get_dir_file(dirname):
    """Return the full path of every entry directly inside dirname.

    @param dirname : directory to list (not recursed into)
    @return list of joined paths, in os.listdir order
    """
    return [os.path.join(dirname, name) for name in os.listdir(dirname)]
def get_file_size(filename):
    """Return the size of filename in bytes.

    os.path.getsize is the idiomatic equivalent of the original
    os.stat(filename)[stat.ST_SIZE] lookup.
    """
    return os.path.getsize(filename)
if __name__ == '__main__':
    # Entry point: remove every downloaded image of 50 KB or less
    # (thumbnail-sized files the scraper picked up by accident).
    WORKDIR = 'D:\\persion\\picture\\downimg'
    filelist = get_dir_file(WORKDIR)
    for file in filelist:
        if get_file_size(file) <= 1024 * 50 and os.path.exists(file):
            os.remove(file)
            print("delete file %s" % (file))
        else:
            # BUG FIX: the original line was missing its closing
            # parenthesis, which made the whole script a SyntaxError.
            print("file not exists or file is well")
好了,不多说了,又是一个不眠的夜晚。。。