Python3.6 微小寶微信公衆號文章抓爬

之前同步了Python3.6 搜狗微信公衆號抓爬,但搜狗設置了微信公衆號文章反抓爬機制,限制太死,而且沒有太多時間研究他的反抓爬破解,只能換抓爬渠道了,所以寫了一個微小寶抓爬微信公衆號文章。還是利用原有的Python工程,詳情請看https://blog.csdn.net/wudaoshihun/article/details/83552027

微小寶公衆號文章抓爬採用selenium技術,所以需要安裝selenium包,具體請看https://blog.csdn.net/wudaoshihun/article/details/83592681

 

具體步驟:

1. 模擬登錄微小寶

2. 搜索要抓爬的微信公衆號

3. 獲取文章列表

 

下面是一個demo,可根據需求把微信公衆號配置化,可借鑑上述文章內容。

 

# -*- coding: utf-8 -*-
from selenium.common.exceptions import WebDriverException, NoSuchElementException

from utils import config
import datetime
import logging.config
import random
from wechatsogou import *
from utils.tools import *
from utils.alert_is_present import *
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import requests
import urllib.request
import os
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from utils.config import *
from selenium.webdriver.support.wait import WebDriverWait
# 日誌
from os import path
from wechatsogou import *
from utils.tools import *
log_file_path = path.join(path.dirname(path.abspath(__file__)), 'resources/china_stat.conf')
logging.config.fileConfig(log_file_path)
logger = logging.getLogger()
#數據庫實例
mysql = mysql('table_carwler')

# 搜索API實例
wechats = WechatSogouApi() #不使用外部Cookie

today = datetime.date.today()


"""
啓動函數
"""
def wechat_crawler_start(pid=0):
    print("啓動抓爬")

    options = define_options(config.source_env)
    # driver地址
    brower = define_driver(options,config.source_env)

    table_crawler_function(brower,pid)

    brower.close()
    print("結束抓爬")

def define_options(source_env):
    options = webdriver.ChromeOptions()
    options.binary_location = binary_location #谷歌地址
    options.add_argument('--no-sandbox')#解決DevToolsActivePort文件不存在的報錯

    options.add_argument('window-size=1920x3000') #指定瀏覽器分辨率
    options.add_argument('--disable-gpu') #谷歌文檔提到需要加上這個屬性來規避bug
    options.add_argument('--hide-scrollbars') #隱藏滾動條, 應對一些特殊頁面
    options.add_argument('blink-settings=imagesEnabled=false') #不加載圖片, 提升速度

    if(source_env == 'linux'):
        options.add_argument('--headless') #瀏覽器不提供可視化頁面. linux下如果系統不支持可視化不加這條會啓動失敗
    else:
        pass
    return options


def define_driver(options,source_env):
    if(source_env == 'linux'):
        chromedriver = chrome_driver_binary
        os.environ["webdriver.chrome.driver"] = chromedriver
        brower = webdriver.Chrome(chrome_options=options,executable_path=chromedriver)
    else:
        brower = webdriver.Chrome(chrome_options=options)
        brower.set_window_size(1920,3000)
    return brower

"""
自動實現抓爬
"""
def table_crawler_function(brower,pid=0):
    print('父節點'+str(pid))
    #indicatorList =  query_list(pid)
    if pid==0:

        # print('循環中'+str(indicator['Id']))
        login_url = 'https://account.wxb.com/page/login?from=https://data.wxb.com/rank?category=-1'
        user = '//*[@id="root"]/div/div/div[2]/div[1]/span[2]' #賬號登錄
        username = '//*[@id="email"]'
        userpass = '//*[@id="password"]'
        login = '//*[@id="root"]/div/div/div[2]/form/button'

        brower.get(login_url)

        mouse_move_xpath_click(brower,user)
        time.sleep(1)
        click_xpath_str(brower,user)
        time.sleep(1)
        brower.find_element(By.XPATH,username).send_keys("13717626181")
        brower.find_element(By.XPATH,userpass).send_keys("xiaoyao128")
        time.sleep(1)
        mouse_move_xpath_click(brower,login)
        time.sleep(1)
        indicatorList = ["1"]
        for indicator in indicatorList:

            search_xpth = '//*[@id="root"]/div/div[2]/header/div[2]/span/input'
            text = '//*[@id="query"]'
            mouse_move_xpath_click(brower,search_xpth)
            time.sleep(1)
            el = brower.find_element(By.XPATH,search_xpth);
            el.send_keys("tiandiwulianwang")
            el.send_keys(Keys.ENTER)
            time.sleep(3)
            whref = ''
            for link in brower.find_elements_by_xpath("//*[@href]"):#獲取當前頁面的href
                x = link.get_attribute('href')
                print(x)
                if '/details/postRead' in x:
                    whref = x
            print(whref)
            brower.get(whref)
            s = []
            for link1 in brower.find_elements_by_xpath("//*[@href]"):
                y = link1.get_attribute('href')
                if 'https://mp.weixin.qq.com/s?__biz' in y:
                    print(y)
                    s.append(y)

            for link2 in s:
                index_html_path = wechats.down_html(link2,'ruguo')


def mouse_move_text_click(brower, param):
    print("請求不到{},滾動條翻滾",param)
    target = brower.find_element_by_link_text(param)
    brower.execute_script("arguments[0].scrollIntoView(false);",target)
    target.click()
def mouse_move_xpath_click(brower, param):
    print("請求不到{},滾動條翻滾",param)
    wait = WebDriverWait(brower,10)
    wait.until(lambda brower: brower.find_element_by_xpath(param))
    target = brower.find_element(By.XPATH,param)
    brower.execute_script("arguments[0].scrollIntoView(false);",target)
    target.click()
def second_except_id_click(brower, param):
    logger.error("第二次點擊ID={"+param+"}")
    brower.d('window.scrollTo(0,document.body.scrollHeight)')
    brower.find_element_by_id(param).click()

def except_text_click(brower, param):
    logger.error("請求不到,滾動條根據text定位")
    target = brower.find_element_by_link_text(param)
    brower.execute_script("arguments[0].scrollIntoView(false);",target)
    target.click()
def except_xpath_click(brower, param):
    logger.error("請求不到,滾動條根據xpath定位")
    target = brower.find_element(By.XPATH,param)
    brower.execute_script("arguments[0].scrollIntoView(false);",target)
    target.click()
def except_id_click(brower, param):
    logger.error("請求不到,滾動條根據ID定位")
    target = brower.find_element(By.ID,param)
    brower.execute_script("arguments[0].scrollIntoView(false);",target)
    target.click()
"""
根據text點擊
"""
def click_text(brower, indicator):

    r = brower.find_element_by_link_text(indicator['text_name'])
    ActionChains(brower).move_to_element(r).click(r).perform()
    wait=WebDriverWait(brower,10)#等待元素加載出來


def click_scroll_id(brower, param):
    time.sleep(1)
    try:
        mouse_move_text_click(brower,param)

        brower.find_element_by_id(param).click()
    except  NoSuchElementException as e:
        second_except_id_click(brower,param)
    except WebDriverException as e1:
        second_except_id_click(brower,param)

def click_text_str(brower, param):
    wait = WebDriverWait(brower,10)
    wait.until(lambda brower: brower.find_element_by_text(param))
    r = brower.find_element_by_link_text(param)
    ActionChains(brower).move_to_element(r).click(r).perform()
    wait=WebDriverWait(brower,10)#等待元素加載出來

def click_xpath(brower, indicator):
    r = brower.find_element(By.XPATH,indicator['craw_attr'])
    ActionChains(brower).move_to_element(r).click(r).perform()
    wait=WebDriverWait(brower,10)#等待元素加載出來
    #wait.until(EC.presence_of_element_located((By.ID,id)))

def click_xpath_str(brower, param):
    wait = WebDriverWait(brower,10)
    wait.until(lambda brower: brower.find_element_by_xpath(param))
    r = brower.find_element(By.XPATH,param)
    ActionChains(brower).move_to_element(r).click(r).perform()
    wait=WebDriverWait(brower,10)#等待元素加載出來

def click_id(brower, indicator):
    r = brower.find_element(By.ID,indicator['craw_attr'])
    ActionChains(brower).move_to_element(r).click(r).perform()
    wait=WebDriverWait(brower,10)#等待元素加載出來

def click_id_str(brower, param):
    r = brower.find_element(By.ID,param)
    ActionChains(brower).move_to_element(r).click(r).perform()
    wait=WebDriverWait(brower,10)#等待元素加載出來

"""SQL封裝"""
def query_list(param):
    cur = mysql.conn.cursor()
    mysql.field('*')
    mysql.where(" p_id="+str(param)+" and is_used=0 and class_type=4")
    mysql.order_sql = " order by Id"

    indicatorList = mysql.find(0)
    return indicatorList

if __name__ == '__main__':
    # Script entry: start crawling from the root node (pid 0).
    wechat_crawler_start(0)
 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章