python3 crawler for the 27270 image gallery site (Part 2)

An experiment in crawling the gallery pages of the 27270 site.

Built with:

python3
Libraries:
urllib
BeautifulSoup
lxml

The goal is to download the images embedded in static pages.

The flow: first scrape a proxy pool from an IP-proxy listing site, then extract the image links from each static page, and finally download the images.

1. Crawl a list of proxy IPs
2. Download while impersonating a browser

Analyzing the page

<body>
.... other markup omitted
<div>
.... other markup omitted
 <div class="MeinvTuPianBox">
  <ul>
.... other markup omitted
   <li> <a href="*****" title="******" class="MMPic" target="_blank"><i><img src="*****" width="190" height="280" alt="*****" /></i></a>
.... other markup omitted
   </li>
.... other markup omitted
  </ul>
 </div>
</div>
</body>

The nesting above shows how the elements relate, which pins down the selector path to the target <img>:

body > div > div class=MeinvTuPianBox > ul > li > a class=MMPic > i > img
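
A quick way to sanity-check this path is to run it over a small stand-in snippet (the HTML below is illustrative, not the real page):

from bs4 import BeautifulSoup

sample = '''
<body><div><div class="MeinvTuPianBox"><ul>
<li><a href="/pic/1.html" title="demo" class="MMPic"><i><img src="http://example.com/1.jpg" alt="demo" /></i></a></li>
</ul></div></div></body>
'''
soup = BeautifulSoup(sample, 'lxml')
for img in soup.select('body > div > div.MeinvTuPianBox > ul > li > a.MMPic > i > img'):
    print(img.get('src'), img.get('alt'))   # -> http://example.com/1.jpg demo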

The complete code:

from urllib.request import urlopen, urlretrieve
from urllib.error import HTTPError, URLError
import urllib.request
from bs4 import BeautifulSoup
import os, sys, time
import http.cookiejar
import random

base_url = 'http://www.27270.com/'  #ent/meinvtupian/  #list_11_%s.html
one_url = ['word']

base_dir = ''
proxy_ip = []



#class myThread (threading.Thread):
#    def __init__(self, start, end):
#        threading.Thread.__init__(self)
#        #self.threadID = threadID
#        self.start = start   # NB: this shadows Thread.start(), so .start() would break
#        self.end = end
#
#    def run(self):
#        print("Starting thread: " + self.name)
#        get_url_list(self.start, self.end)
#        print("Exiting thread: " + self.name)

# build the proxy pool from xicidaili.com
def getProxyIp():
    proxy = []
    for i in range(1, 3):
        header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                                'AppleWebKit/537.36 (KHTML, like Gecko) '
                                'Ubuntu Chromium/44.0.2403.89 '
                                'Chrome/44.0.2403.89 '
                                'Safari/537.36'}
        req = urllib.request.Request(url='http://www.xicidaili.com/nt/{0}'.format(i), headers=header)
        r = urllib.request.urlopen(req)
        soup = BeautifulSoup(r, 'html.parser', from_encoding='utf-8')
        table = soup.find('table', attrs={'id': 'ip_list'})
        tr = table.find_all('tr')[1:]
        # each row holds the proxy's address, port and type
        for item in tr:
            tds = item.find_all('td')
            kind = "{0}:{1}".format(tds[1].get_text(), tds[2].get_text())
            proxy.append("http://" + kind)
    return proxy
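
# The raw pool will contain many dead proxies. A possible liveness check
# (a sketch; the test URL and timeout here are arbitrary choices, not part
# of the original script):
def check_proxy(proxy, test_url='http://www.27270.com/', timeout=5):
    handler = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(handler)
    try:
        opener.open(test_url, timeout=timeout)
        return True
    except Exception:
        return False
#proxy_ip = [p for p in getProxyIp() if check_proxy(p)]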



# pick a random proxy from the pool
def getIP():
    ip = random.choice(proxy_ip)
    return ip

def makeMyOpener(head={
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
}):
    proxy = getIP()
    print(proxy)
    cj = http.cookiejar.CookieJar()
    # route requests through the chosen proxy; note that appending
    # ('http', proxy) to the headers, as the first version did, does not
    # actually set a proxy -- that needs a ProxyHandler
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({'http': proxy}),
        urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = list(head.items())
    return opener
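
# Any single proxy can die mid-run, so a request is best retried with a
# fresh opener. A sketch (the retry count of 3 is an arbitrary assumption):
def open_with_retry(url, retries=3):
    for _ in range(retries):
        try:
            return makeMyOpener().open(url, timeout=30)
        except (HTTPError, URLError):
            continue   # a different random proxy is picked on the next pass
    raise URLError('all proxies failed for ' + url)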


# download a single image
def download(url, file_name, index):
    save_dir = base_dir + str(index) + '/'
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    path = save_dir + file_name
    try:
        with urlopen(url, timeout=30) as r:
            content = r.read()
            with open(path, 'wb') as code:
                code.write(content)
        #time.sleep(1)
    except Exception as e:
        # a bare `except: pass` here would hide every failure; at least log it
        print('failed to download %s: %s' % (url, e))
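
# urlretrieve is imported at the top but never used; it would be the shorter
# route when the default opener suffices. A sketch of the same download:
def download_simple(url, file_name, index):
    save_dir = base_dir + str(index) + '/'
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    try:
        # urlretrieve streams straight to disk, no manual read/write loop
        urlretrieve(url, save_dir + file_name)
    except (HTTPError, URLError) as e:
        print('download failed: %s (%s)' % (url, e))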

def get_url_list(index, end):
    girl_list = []
    try:
        #if end == index:
        #    print('all pages fetched')
        #    return
        oper = makeMyOpener()
        url = 'http://www.27270.com/ent/meinvtupian/list_11_%s.html' % index
        html = oper.open(url)

        # Method 1: grab every <img> on the page
        #bsObj = BeautifulSoup(html, 'lxml')
        #girl_list = bsObj.findAll('img')

        # Method 2: CSS selector
        soup = BeautifulSoup(html, 'lxml')
        girl_list = soup.select('body > div > div.MeinvTuPianBox > ul > li > a.MMPic > i > img')
        if not girl_list:
            print('all pages fetched')
            sys.exit(0)

        # Method 3: xpath via lxml (would require requests and lxml.html)
        #response = requests.get(image_detail_link).content
        #sel = lxml.html.fromstring(response)
        #girl_list = sel.xpath("//div[@class='MeinvTuPianBox']/ul/li/a[@class='MMPic']/i/img")

        mm_down = []
        mm_names = []

        # Method 4: regular expressions, omitted here

        for photo in girl_list:
            mm_link = photo.get('src')
            mm_nick = photo.get('alt')
            mm_down.append(mm_link)
            mm_names.append(mm_nick)

        for girl, name in zip(mm_down, mm_names):
            download(girl, name + '.jpg', index)
            print(girl + name)

        index = index + 1
        get_url_list(index, end)

    except HTTPError as e:
        print('HTTPError' + str(e.code))
        get_url_list(index, end)
    except URLError as e:
        print('URLError' + str(e.reason))
        get_url_list(index, end)
    #return girl_list
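
# The recursion above adds one stack frame per page (and per error), so a
# long run can hit Python's default recursion limit of about 1000 frames.
# An iterative equivalent (a sketch, not part of the original script):
def crawl_pages(start_index):
    index = start_index
    while True:
        try:
            html = makeMyOpener().open(
                'http://www.27270.com/ent/meinvtupian/list_11_%s.html' % index)
        except (HTTPError, URLError) as e:
            print('page %s failed: %s' % (index, e))
            index += 1
            continue
        soup = BeautifulSoup(html, 'lxml')
        girl_list = soup.select('body > div > div.MeinvTuPianBox > ul > li > a.MMPic > i > img')
        if not girl_list:
            print('all pages fetched')
            break
        for photo in girl_list:
            download(photo.get('src'), photo.get('alt') + '.jpg', index)
        index += 1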


if __name__ == '__main__':
    proxy_ip = getProxyIp()
    base_dir = 'E:/cache-work/python3/images1/'
    if not os.path.isdir(base_dir):
        os.makedirs(base_dir)
    # the end argument is effectively unused (the end-check above is
    # commented out); crawling starts at page 163 and runs until an
    # empty page is hit
    get_url_list(163, 100)
    """
    try:
        _thread.start_new_thread(get_url_list, (1, 35,))
        _thread.start_new_thread(get_url_list, (35, 70,))
        _thread.start_new_thread(get_url_list, (70, 110,))
        _thread.start_new_thread(get_url_list, (110, 150,))
        _thread.start_new_thread(get_url_list, (150, 500,))
    except:
        print("Error: unable to start thread")

    while 1:
        pass
    """
"""
    thread1= myThread( 1,35)
    thread2= myThread(35,70)
    thread3= myThread(70,110)
    thread4= myThread(110,150)
    thread5= myThread(150,1000)
    thread1.start()
    thread2.start()
    thread3.start()
    thread4.start()
    thread5.start()
""" 
