Python3.6 微小寶微信公衆號文章抓爬

之前同步了Python3.6 搜狗微信公衆號抓爬,但搜狗設置了微信公衆號文章反抓爬機制,限制太死,而且沒有太多時間研究他的反抓爬破解,只能換抓爬渠道了,所以寫了一個微小寶抓爬微信公衆號文章。還是利用原有的Python工程,詳情請看https://blog.csdn.net/wudaoshihun/article/details/83552027

微小寶公衆號文章抓爬採用selenium技術,所以需要安裝selenium包,具體請看https://blog.csdn.net/wudaoshihun/article/details/83592681

 

具體步驟:

1. 模擬登錄微小寶

2. 搜索要抓爬的微信公衆號

3. 獲取文章列表

 

下面是一個demo,可根據需求把微信公衆號配置化,可借鑑上述文章內容。

 

# -*- coding: utf-8 -*-
from selenium.common.exceptions import WebDriverException, NoSuchElementException

from utils import config
import datetime
import logging.config
import random
from wechatsogou import *
from utils.tools import *
from utils.alert_is_present import *
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import requests
import urllib.request
import os
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from utils.config import *
from selenium.webdriver.support.wait import WebDriverWait
# 日誌
from os import path
from wechatsogou import *
from utils.tools import *
log_file_path = path.join(path.dirname(path.abspath(__file__)), 'resources/china_stat.conf')
logging.config.fileConfig(log_file_path)
logger = logging.getLogger()
#數據庫實例
mysql = mysql('table_carwler')

# 搜索API實例
wechats = WechatSogouApi() #不使用外部Cookie

today = datetime.date.today()


"""
啓動函數
"""
def wechat_crawler_start(pid=0):
    print("啓動抓爬")

    options = define_options(config.source_env)
    # driver地址
    brower = define_driver(options,config.source_env)

    table_crawler_function(brower,pid)

    brower.close()
    print("結束抓爬")

def define_options(source_env):
    options = webdriver.ChromeOptions()
    options.binary_location = binary_location #谷歌地址
    options.add_argument('--no-sandbox')#解決DevToolsActivePort文件不存在的報錯

    options.add_argument('window-size=1920x3000') #指定瀏覽器分辨率
    options.add_argument('--disable-gpu') #谷歌文檔提到需要加上這個屬性來規避bug
    options.add_argument('--hide-scrollbars') #隱藏滾動條, 應對一些特殊頁面
    options.add_argument('blink-settings=imagesEnabled=false') #不加載圖片, 提升速度

    if(source_env == 'linux'):
        options.add_argument('--headless') #瀏覽器不提供可視化頁面. linux下如果系統不支持可視化不加這條會啓動失敗
    else:
        pass
    return options


def define_driver(options,source_env):
    if(source_env == 'linux'):
        chromedriver = chrome_driver_binary
        os.environ["webdriver.chrome.driver"] = chromedriver
        brower = webdriver.Chrome(chrome_options=options,executable_path=chromedriver)
    else:
        brower = webdriver.Chrome(chrome_options=options)
        brower.set_window_size(1920,3000)
    return brower

"""
自動實現抓爬
"""
def table_crawler_function(brower,pid=0):
    print('父節點'+str(pid))
    #indicatorList =  query_list(pid)
    if pid==0:

        # print('循環中'+str(indicator['Id']))
        login_url = 'https://account.wxb.com/page/login?from=https://data.wxb.com/rank?category=-1'
        user = '//*[@id="root"]/div/div/div[2]/div[1]/span[2]' #賬號登錄
        username = '//*[@id="email"]'
        userpass = '//*[@id="password"]'
        login = '//*[@id="root"]/div/div/div[2]/form/button'

        brower.get(login_url)

        mouse_move_xpath_click(brower,user)
        time.sleep(1)
        click_xpath_str(brower,user)
        time.sleep(1)
        brower.find_element(By.XPATH,username).send_keys("13717626181")
        brower.find_element(By.XPATH,userpass).send_keys("xiaoyao128")
        time.sleep(1)
        mouse_move_xpath_click(brower,login)
        time.sleep(1)
        indicatorList = ["1"]
        for indicator in indicatorList:

            search_xpth = '//*[@id="root"]/div/div[2]/header/div[2]/span/input'
            text = '//*[@id="query"]'
            mouse_move_xpath_click(brower,search_xpth)
            time.sleep(1)
            el = brower.find_element(By.XPATH,search_xpth);
            el.send_keys("tiandiwulianwang")
            el.send_keys(Keys.ENTER)
            time.sleep(3)
            whref = ''
            for link in brower.find_elements_by_xpath("//*[@href]"):#獲取當前頁面的href
                x = link.get_attribute('href')
                print(x)
                if '/details/postRead' in x:
                    whref = x
            print(whref)
            brower.get(whref)
            s = []
            for link1 in brower.find_elements_by_xpath("//*[@href]"):
                y = link1.get_attribute('href')
                if 'https://mp.weixin.qq.com/s?__biz' in y:
                    print(y)
                    s.append(y)

            for link2 in s:
                index_html_path = wechats.down_html(link2,'ruguo')


def mouse_move_text_click(brower, param):
    print("請求不到{},滾動條翻滾",param)
    target = brower.find_element_by_link_text(param)
    brower.execute_script("arguments[0].scrollIntoView(false);",target)
    target.click()
def mouse_move_xpath_click(brower, param):
    print("請求不到{},滾動條翻滾",param)
    wait = WebDriverWait(brower,10)
    wait.until(lambda brower: brower.find_element_by_xpath(param))
    target = brower.find_element(By.XPATH,param)
    brower.execute_script("arguments[0].scrollIntoView(false);",target)
    target.click()
def second_except_id_click(brower, param):
    logger.error("第二次點擊ID={"+param+"}")
    brower.d('window.scrollTo(0,document.body.scrollHeight)')
    brower.find_element_by_id(param).click()

def except_text_click(brower, param):
    logger.error("請求不到,滾動條根據text定位")
    target = brower.find_element_by_link_text(param)
    brower.execute_script("arguments[0].scrollIntoView(false);",target)
    target.click()
def except_xpath_click(brower, param):
    logger.error("請求不到,滾動條根據xpath定位")
    target = brower.find_element(By.XPATH,param)
    brower.execute_script("arguments[0].scrollIntoView(false);",target)
    target.click()
def except_id_click(brower, param):
    logger.error("請求不到,滾動條根據ID定位")
    target = brower.find_element(By.ID,param)
    brower.execute_script("arguments[0].scrollIntoView(false);",target)
    target.click()
"""
根據text點擊
"""
def click_text(brower, indicator):

    r = brower.find_element_by_link_text(indicator['text_name'])
    ActionChains(brower).move_to_element(r).click(r).perform()
    wait=WebDriverWait(brower,10)#等待元素加載出來


def click_scroll_id(brower, param):
    time.sleep(1)
    try:
        mouse_move_text_click(brower,param)

        brower.find_element_by_id(param).click()
    except  NoSuchElementException as e:
        second_except_id_click(brower,param)
    except WebDriverException as e1:
        second_except_id_click(brower,param)

def click_text_str(brower, param):
    wait = WebDriverWait(brower,10)
    wait.until(lambda brower: brower.find_element_by_text(param))
    r = brower.find_element_by_link_text(param)
    ActionChains(brower).move_to_element(r).click(r).perform()
    wait=WebDriverWait(brower,10)#等待元素加載出來

def click_xpath(brower, indicator):
    r = brower.find_element(By.XPATH,indicator['craw_attr'])
    ActionChains(brower).move_to_element(r).click(r).perform()
    wait=WebDriverWait(brower,10)#等待元素加載出來
    #wait.until(EC.presence_of_element_located((By.ID,id)))

def click_xpath_str(brower, param):
    wait = WebDriverWait(brower,10)
    wait.until(lambda brower: brower.find_element_by_xpath(param))
    r = brower.find_element(By.XPATH,param)
    ActionChains(brower).move_to_element(r).click(r).perform()
    wait=WebDriverWait(brower,10)#等待元素加載出來

def click_id(brower, indicator):
    r = brower.find_element(By.ID,indicator['craw_attr'])
    ActionChains(brower).move_to_element(r).click(r).perform()
    wait=WebDriverWait(brower,10)#等待元素加載出來

def click_id_str(brower, param):
    r = brower.find_element(By.ID,param)
    ActionChains(brower).move_to_element(r).click(r).perform()
    wait=WebDriverWait(brower,10)#等待元素加載出來

"""SQL封裝"""
def query_list(param):
    cur = mysql.conn.cursor()
    mysql.field('*')
    mysql.where(" p_id="+str(param)+" and is_used=0 and class_type=4")
    mysql.order_sql = " order by Id"

    indicatorList = mysql.find(0)
    return indicatorList

if __name__ == '__main__':
    # Script entry: start crawling from the root node (pid 0).
    wechat_crawler_start(0)
 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章