Python Web Scraping in Practice: Crawling WeChat Official Accounts

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException, Timeout, ConnectionError
import time
import random
import MySQLdb
import threading
import socket
import math

socket.setdefaulttimeout(60)  # Set a timeout at the socket level; later socket use in this file inherits it and needs no timeout of its own

glock = threading.Lock()  # Global lock protecting the shared URL lists

CATEGORY_URL = ['http://www.we123.com/gzh/onclick/']  # Region/category listing links
all_url = []   # (not used below)
ALL_URLS = []  # All detail-page links
proxy_list = []  # Proxy IP pool
URL = 'http://www.we123.com'  # Site root, used to build absolute links
PAGE_URL = []  # All paginated listing links

# Fetch a pool of proxy IPs
def get_ip():
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    url = 'http://http-webapi.zhimaruanjian.com'  # Zhima proxy API (a paid service; cheap and fairly stable)
    resp = requests.get(url, headers=headers)
    obj = resp.json()  # Parse the JSON proxy pool
    for ip in obj:
        arr = 'http://' + str(ip['ip']) + ':' + str(ip['port'])
        proxy_list.append(arr)
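    # Note (an assumption, not verified against the provider's docs): the loop
    # above expects the API to return a bare JSON list such as
    #   [{"ip": "1.2.3.4", "port": 4321}, {"ip": "5.6.7.8", "port": 4321}]
    # If the provider wraps the list in an envelope like {"code": 0, "data": [...]},
    # iterate obj['data'] instead.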

# Fetch the page source for a URL
def get_html(url):
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3538.400 QQBrowser/9.6.12501.400',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'
    ]
    # user_agent = random.choice(user_agent_list)  # Optionally rotate the user agent per request
    headers = {
        'User-Agent': user_agent_list[0]
    }
    # Proxies: free proxies usually die within minutes, replace them yourself
    # proxy_list  = [
    #     "http://27.192.185.62:3252",
    # ]
    # proxy_ip = random.choice(proxy_list)
    # proxies = {'http': proxy_ip}
    try:
        resp = requests.get(url, headers=headers)
        # Return the response whatever the status code (200/404/500 are all
        # left to the callers to handle)
        return resp
    except Timeout:
        print("request timed out")
        return "error"
    except ConnectionError:
        print("connection error")
        return "error"
    except RequestException:
        print("generic requests error")
        # Log the failing URL so it can be retried later
        with open('url_exception.txt', 'a+', encoding='utf-8') as f:
            f.write(str(url))
            f.write('\n')
        return "error"

# Collect the region/category listing links
def get_category_url():
    url = 'http://www.we123.com/gzh/onclick/'
    resp = get_html(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    html = soup.select('div.div-subs2 > div.divst-content > div.divst-subs > li > a')
    # Build an absolute link for every region
    for i in html:
        city = i['href'].split("/")[-1]
        if city in ('海外', '臺灣', '澳門'):  # Skip overseas, Taiwan and Macau
            continue
        url = URL + i['href']
        CATEGORY_URL.append(url)
    print(CATEGORY_URL)


# Collect all paginated listing links for one region
def get_page_url(url):
    city = url.split('/')[-1]  # Region name from the URL (not used below)
    html = get_html(url)
    if html == "error":
        print("get_page_url: connect url error")
        time.sleep(random.randint(10, 20))
        return "error"
    soup = BeautifulSoup(html.text, 'lxml')
    # Total number of accounts in this region
    all_nums = soup.select("div.page > a > b")
    if len(all_nums) == 0:
        return "error"
    else:
        all_nums = all_nums[0].get_text()
    # Total number of listing pages (30 accounts per page)
    all_pages = math.ceil(int(all_nums) / 30)
    # Build every listing-page link
    # Note: classid=45 is hard-coded here, so every region ends up querying the same list
    all_page_url = []
    for i in range(0, int(all_pages)):
        page_url = 'http://www.we123.com/e/action/ListInfo.php?page=' + str(i) + '&classid=45&line=30&tempid=10&orderby=onclick&myorder=0&totalnum=' + str(all_nums)
        all_page_url.append(page_url)
    return all_page_url
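
# Worked example of the pagination maths above: a region reporting
# all_nums = 95 accounts gives math.ceil(95 / 30) = 4 listing pages, so the
# loop builds URLs with page = 0, 1, 2, 3.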

# Pop one region link and fill PAGE_URL with its listing-page links
def get_page_urls():
    global PAGE_URL
    c_url = CATEGORY_URL.pop()
    print('get_page_urls: requesting ' + c_url)
    PAGE_URL = get_page_url(c_url)  # All listing-page links for this region

# Fetch every listing page and collect the detail-page links it contains
def get_info_urls():
    while True:
        global PAGE_URL  # Shared list of listing-page links
        glock.acquire()  # Lock
        if len(PAGE_URL) == 0:
            glock.release()  # Unlock
            print('get_info_urls: PAGE_URL is empty')
            break
        else:
            p_url = PAGE_URL.pop()
            print('get_info_urls: requesting ' + p_url)
            glock.release()  # Unlock before the (slow) HTTP request

            html = get_html(p_url)
            if html == "error":
                print("get_info_urls: connect url error")
                time.sleep(2)
                return
            soup = BeautifulSoup(html.text, 'lxml')
            info_urls = soup.select('div.gzhRight > div.gzh_list > ul > li > a')
            # Re-acquire the lock only to update the shared list
            glock.acquire()  # Lock
            for x in info_urls:
                i_url = URL + x['href']
                ALL_URLS.append(i_url)
            print("Detail links collected so far: " + str(len(ALL_URLS)))
            glock.release()  # Unlock
# Scrape the data we need from each detail page
def get_data():
    while True:
        global ALL_URLS  # Shared list of detail-page links
        glock.acquire()  # Lock
        print("Links in stock: " + str(len(ALL_URLS)))
        if len(ALL_URLS) == 0:
            glock.release()  # Unlock
            print('get_data: ALL_URLS is empty')
            break
        else:
            url = ALL_URLS.pop()
            print("Scraping: " + url)
            glock.release()  # Unlock
            time.sleep(1)  # Sleep for one second
            html = get_html(url)
            if html == "error":
                print("get_data: connect url error")
                time.sleep(random.randint(2, 4))
                return
            html.encoding = 'utf-8'  # Set the page encoding explicitly; usually optional
            soup = BeautifulSoup(html.text, 'lxml')
            # Official account name
            names = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.gzhtop_logo > h1')
            # WeChat account id
            accounts = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.gzhtop_logo > p')
            # Avatar image
            imgs = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.gzhtop_logo > img')
            # QR code of the account
            QR_codes = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_right >  img')
            # Introduction
            descs = soup.select('div.artcleLeft > div.xcxnry > div.xcxinfo')
            # Account category
            category = ''
            cate = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.xcx_p > span > a')
            if len(cate) != 0:
                category = cate[0].get_text()
            else:
                category = '綜合'
            glock.acquire()  # Lock
            for name, account, img, QR_code, desc in zip(names, accounts, imgs, QR_codes, descs):
                data = {
                    'name': name.get_text(),
                    'category': category,
                    'account': account.get_text().split(":")[-1],
                    'img': img['src'],
                    'QR_code': QR_code['src'],
                    'desc': desc.get_text()
                }
                add_data(data, url)
            glock.release()  # Unlock
# Insert one record into MySQL
def add_data(data, url):
    con = MySQLdb.connect('127.0.0.1', 'root', 'root', 'test', charset="utf8", use_unicode=True)
    cursor = con.cursor()
    insert_sql = """
        insert ignore into weixin5(w_name,category,account,img,QR_code,introduce)
        VALUES (%s,%s,%s,%s,%s,%s)
        """
    try:
        cursor.execute(insert_sql, (data['name'], data['category'], data['account'], data['img'], data['QR_code'], str(data['desc'])))
        con.commit()
        print('add_data: ' + data['name'] + '_' + data['account'] + ' inserted - ' + url)
    except MySQLdb.Error:
        # Put the URL back so it can be retried
        ALL_URLS.insert(0, url)
        print("add_data: " + url + ' insert failed')
        con.rollback()
    con.close()
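
# A minimal sketch of the weixin5 table the INSERT above expects. The column
# names come from the insert statement; the types and the unique key are
# assumptions, so adjust them to your needs and run this once before crawling.
def create_weixin5_table():
    con = MySQLdb.connect('127.0.0.1', 'root', 'root', 'test', charset="utf8", use_unicode=True)
    cursor = con.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS weixin5 (
            id INT AUTO_INCREMENT PRIMARY KEY,
            w_name VARCHAR(255),
            category VARCHAR(64),
            account VARCHAR(128) UNIQUE,  -- UNIQUE lets "insert ignore" skip duplicates
            img VARCHAR(512),
            QR_code VARCHAR(512),
            introduce TEXT
        ) DEFAULT CHARSET=utf8
    """)
    con.commit()
    con.close()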

# Convert a date string like "2017年10月01日" to a Unix timestamp
# (defined here but never called in this script)
def time_to(dt):
    timeArray = time.strptime(dt, "%Y年%m月%d日")
    timestamp = int(time.mktime(timeArray))
    return timestamp
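
# Example (the value depends on the local timezone): time_to("2018年01月01日")
# returns the Unix timestamp for 2018-01-01 00:00, e.g. 1514736000 in UTC+8.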

# Start the crawl with multiple threads
def main():
    for x in range(3):
        th = threading.Thread(target=get_info_urls)
        th.start()
        # get_info_urls()
    time.sleep(3)
    for x in range(5):
        th = threading.Thread(target=get_data)
        th.start()

if __name__ == '__main__':
    # Time the run
    t1 = time.time()
    # Kick off the crawl
    get_ip()  # Fill the proxy pool
    get_page_urls()
    time.sleep(2)
    # get_category_url()
    main()
    print(time.time() - t1)  # Note: the worker threads are not joined, so this prints almost immediately
