# Scrape images (爬取圖片) from xiaohuar.com listing pages.

import requests                   # two styles of module import are shown here
from multiprocessing import Pool
import re

def get(url):
    """Fetch *url* and return its body decoded as GBK.

    Returns ``None`` when the server answers with a non-200 status.
    A timeout is set so a stalled connection cannot hang a pool worker
    forever (the original call could block indefinitely).
    """
    ret = requests.get(url, timeout=10)
    if ret.status_code == 200:
        # The target site serves GBK-encoded HTML.
        return ret.content.decode('gbk')
    return None

def call_back(arg):
    """Pool callback: parse one listing page, download and save each image.

    ``arg`` is the HTML text returned by :func:`get`, or ``None`` when that
    request failed.  Returns the list of parsed entry dicts (possibly empty).
    """
    # ``get`` returns None on a non-200 response; the original code would
    # then crash with TypeError inside com.finditer(None).
    if arg is None:
        return []
    dict_lst = [
        {
            'png': m.group('png'),
            'name': m.group('name'),
            'place': m.group('place'),
        }
        for m in com.finditer(arg)
    ]
    for entry in dict_lst:
        data = subget(entry['png'])
        # subget returns None on failure; writing None would raise TypeError.
        if data is not None:
            write_func(entry['name'], entry['place'], data)
    return dict_lst

def subget(url):
    """Download one image and return its raw bytes, or ``None`` on failure.

    Listing pages mix absolute and site-relative image paths; relative
    paths are resolved against the site root.  BUGFIX: the original test
    (``'https' in url``) wrongly treated absolute ``http://`` links as
    relative and prepended the host, producing a broken URL — check the
    scheme prefix instead.  The duplicated request branches are merged.
    """
    if not url.startswith(('http://', 'https://')):
        url = 'http://www.xiaohuar.com' + url
    ret = requests.get(url, timeout=10)
    if ret.status_code == 200:
        return ret.content
    return None

def write_func(path, place, picture):
    """Persist one downloaded image as ``<name>-<place>.png``.

    ``picture`` is the raw image bytes; ``path`` and ``place`` are the
    parsed name/place fields used to build the file name.
    """
    target = r'E:\text1\爬蟲\text_png\%s-%s.png' % (path, place)
    with open(target, 'wb') as out:
        out.write(picture)

'''URL pattern of the pages to crawl:'''
'''http://www.xiaohuar.com/list-1-0.html'''
'''http://www.xiaohuar.com/list-1-43.html'''

if __name__ == '__main__':
    # Regex that pulls the image src (png), name and place out of one
    # listing entry; re.S lets .*? span newlines.  Module-level so the
    # pool callback (which runs in the parent process) can see it.
    com = re.compile(
        '<div class="item_t">(?:.*?)src="(?P<png>.*?)"(?:.*?)<span class="price">(?P<name>.*?)</span>(?:.*?)'
        '<a href="http://www.xiaohuar.com/" class="img_album_btn">(?P<place>.*?)</a>', re.S)

    # Three worker processes fetch pages; parsing/saving happens in the
    # parent via the callback.  (Removed the unused ``res_lst`` list.)
    pool = Pool(3)
    for page in range(40):
        pool.apply_async(get,
                         args=('http://www.xiaohuar.com/list-1-%s.html' % page,),
                         callback=call_back)

    pool.close()
    pool.join()

# Known drawback: crawling is slow — at most ~17 pages were fetched successfully.
