Python 遍歷網頁代碼抓取文字和圖片
通過 Python 的幾個工具 requests、BeautifulSoup、json、Pool,暴力遍歷 URL 抓取內容
- 獲取全量的文字和圖片鏈接
- 獲取圖片
獲取全量的文字和圖片鏈接
#!/usr/bin/python
#-*- coding:utf-8 -*-
import requests
import bs4
from multiprocessing import Pool
import json
import time
## Base URL of the target site (redacted — the real address is intentionally not pasted here).
url_root = 'xxxx'
def geturl(num):
    """Build the full page URL for page number *num*."""
    return '%s%s' % (url_root, num)
def geturls(num, start=14):
    """Return the page URLs for page numbers ``start`` .. ``num - 1``.

    ``start`` defaults to 14, the first valid page on this site
    (previously hard-coded); exposing it keeps callers backward
    compatible while allowing partial crawls.
    """
    return map(geturl, range(start, num))
def getdata(url):
    """Fetch one page and extract its title index, description text and
    first content image URL.

    Returns a dict that may contain the keys ``index``, ``content`` and
    ``img`` (each UTF-8 encoded), or ``{"noValue": "noValue"}`` when the
    page is unavailable. Missing pieces are simply omitted instead of
    crashing (the original raised AttributeError on a missing <title>
    and IndexError when the page had fewer than two <img> tags).
    """
    datalist = {}
    response = requests.get(url)
    if response.status_code != 200:
        return {"noValue": "noValue"}
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    # Title characters [4:8] hold the page index on this site — TODO confirm.
    if soup.title is not None and soup.title.string is not None:
        datalist['index'] = soup.title.string[4:8].encode('utf-8')
    # The page text lives in the <meta name="description"> tag.
    for meta in soup.select('meta'):
        if meta.get('name') == 'description':
            content = meta.get('content')
            if content is not None:
                datalist['content'] = content.encode('utf-8')
    # The second <img> on the page is the content image; guard against
    # pages that have fewer images or an <img> without a src attribute.
    imgs = soup.find_all('img')
    if len(imgs) > 1 and imgs[1].get('src'):
        datalist['img'] = imgs[1]['src'].encode('utf-8')
    return datalist
if __name__ == '__main__':
pool = Pool(processes=10)
datalist = []
urls = geturls(1314);
start = time.time()
datalist = pool.map(getdata, urls)
end = time.time()
print 'use:%.2f s' %(end -start)
jsondata = json.dumps({'data':datalist}, ensure_ascii=False)
with open('data.txt','w' ) as outfile:
outfile.write(jsondata)
outfile.close()
獲取圖片
解析第一個 script 得到的格式化內容,通過 pool 併發地去下載圖片
#!/usr/bin/python
#-*- coding:utf-8 -*-
import json
import requests
from multiprocessing import Pool
def downImge(imgurl):
    """Download one image and save it under ``image/`` using the URL basename.

    Bug fix: the output file must be opened in binary mode (``'wb'``) —
    text mode corrupts image bytes on platforms that translate newlines.
    """
    # Last path component of the URL is used as the local file name.
    file_name = imgurl.split('/')[-1]
    response = requests.get(imgurl, stream=True)
    if response.status_code == 200:
        with open("image/" + file_name, 'wb') as f:
            f.write(response.content)
if __name__ == "__main__":
    # Load the scraped metadata and download every referenced image.
    # 'with' closes the file automatically; the explicit close() was redundant.
    with open('data.txt', 'r') as f:
        datalist = json.loads(f.read())
    # 'in' replaces dict.has_key(), which is Python-2-only and deprecated.
    imglist = [item['img'] for item in datalist['data'] if 'img' in item]
    pool = Pool(10)
    pool.map(downImge, imglist)
    # Shut the pool down cleanly so worker processes don't linger.
    pool.close()
    pool.join()