python簡單爬取圖片的一點總結

折騰了好幾天,終於開發了一個能夠爬取mzitu的單進程程序,雖然只有短短的幾十行code,

但由於沒怎麼搞過爬蟲,有很多的坑都是費了很大勁兒才爬出來,不過不斷的查詢、實驗等學到的東西還真挺受用的:

學習了:

1、requests,urllib2,BeautifulSoup,selenium+webdriver(mzitu沒涉及到,但還是學了下)

2、每級URL的變化分析和提取

3、路徑和字符串的處理

4、防盜鏈  'Referer'

終於能夠完整爬取主頁上第一頁的24個連接的所有圖片,在此記錄下!

       

#coding=utf-8

import requests
from bs4 import BeautifulSoup
import urllib2
import re
import time
import os

'''
運行平臺:Mac OS python 2.7
'''
# (The note above says: runtime platform is Mac OS, Python 2.7.)

# Entry point of the crawl: the site's front page.
url = 'http://www.mzitu.com'
# Every photo set is saved below this directory, one sub-folder per set.
localDir = os.path.expanduser('~/Desktop/mzitu')
# Browser-like headers used for all HTML page requests; the site rejects
# the default urllib2 User-Agent. NOTE(review): the Cookie was captured
# from a real browser session and presumably expires — refresh it if
# requests start failing.
header = {
            'Host':'www.mzitu.com',
            'Accept-Language': 'en-us',
            'Connection': 'close',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15',
            "Cookie":"Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1586961981; Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1586939099"
            }

def get_url_content(aURL):
    """Fetch *aURL* with the shared browser-like `header` dict and return
    the raw response body as a byte string.

    Raises urllib2.URLError (or socket.timeout) on network failure.
    """
    request = urllib2.Request(aURL, None, header)
    html = urllib2.urlopen(request, timeout=10)
    try:
        return html.read()
    finally:
        # urllib2 responses are not context managers on Python 2; close
        # explicitly so sockets are not leaked across many requests.
        html.close()


def get_all_link_from_main_url():
    """Scrape the site's front page and return the list of photo-set URLs.

    Returns the href strings captured from the front page's
    `<li><a href="..." target="_blank">` entries.
    """
    data = get_url_content(url)
    # Non-greedy (.*?) stops the capture at the FIRST closing quote.
    # The original greedy (.*) could swallow everything up to the last
    # `" target="_blank` on the line, yielding a corrupted URL.
    pan = r'<li><a href="(.*?)" target="_blank.* target="_blank.*'
    https = re.compile(pan).findall(data)
    return https


def get_max_page(pin):
    """Return the page count of one photo set as a string: the text of
    the second-to-last <span> inside the set page's pagination bar."""
    soup = BeautifulSoup(get_url_content(pin), 'lxml')
    navi = soup.find_all('div', class_='pagenavi')[0]
    spans = navi.find_all('span')
    return spans[-2].string

def imageHref(pageURL):
    """Return the src URL of the main image on one single-photo page."""
    soup = BeautifulSoup(get_url_content(pageURL), 'lxml')
    main_div = soup.find_all('div', class_='main-image')[0]
    return main_div.img['src']

def downloadImage(pageURL,imgURL):
    """Download *imgURL* into localDir/<set-id>/<filename>.

    *pageURL* is used only to derive the per-set sub-directory name (its
    second-to-last path segment). The image request carries a Referer
    header because the site uses anti-hotlink protection.
    """
    subdir = pageURL.split('/')[-2]
    mkDir = localDir+'/'+subdir
    if not os.path.exists(mkDir):
        # makedirs also creates localDir itself on the very first run;
        # plain os.mkdir would raise if ~/Desktop/mzitu did not exist yet.
        os.makedirs(mkDir)
    filename = imgURL.split('/')[-1]
    localURL = mkDir+'/'+filename
    headersURL = {
        # Anti-hotlink: image requests without a Referer are rejected.
        'Referer': 'https://www.mzitu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    }
    rst = urllib2.Request(imgURL,None,headersURL)
    rsp = urllib2.urlopen(rst)
    try:
        # `with` already closes the file; the original's extra f.close()
        # inside the with-block was redundant and has been dropped.
        with open(localURL,'wb') as f:
            f.write(rsp.read())
    finally:
        # Close the HTTP response so the socket is not leaked per image.
        rsp.close()


def download_one_set(pOne):
    print pOne
    max_page = int(get_max_page(pOne))
    for m in range(max_page):
        each_url = pOne+'/'+str(m+1)
        print each_url
        img_url = imageHref(each_url)
        print img_url
        downloadImage(each_url,img_url)
        print '\t--',img_url,' -- ok'
        time.sleep(1)


if __name__ == '__main__':
    p_list = get_all_link_from_main_url()
    for page in p_list:
        download_one_set(page)
    print '下載 ok'

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章