python+selenium模擬瀏覽器抓取智慧芽數據

一、需求

有一批專利號需要在智慧芽上找它對應的相似專利。

抓包看了下智慧芽的登錄接口,可以直接使用用戶名、密碼登錄,沒有驗證碼限制,少了很多麻煩。又看到它使用token請求,一開始想着獲取登錄後的token再批量請求接口就好了,但嘗試後發現不行:請求次數多了就會被限制,即使重新登錄換新token也不行,所以改成了使用selenium模擬瀏覽器請求來抓取數據。

二、思路

按照人工手動操作的流程:先登錄,然後打開頁面抓取數據,再保存到excel中。由於數據比較多,代碼運行過程中容易出現中斷,所以每抓取一部分就保存一部分到excel中,一直循環抓取,直到全部完成。

三、輸入輸出格式

輸入

輸出

四、代碼

代碼中涉及到的知識點還是比較多的,所以貼了上來,方便後面遇到類似的直接拿來使用。

from selenium import webdriver
import time
import pandas
import pandas as pd
from openpyxl import load_workbook
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
from collections import Counter
from collections import OrderedDict
# Workbook whose 'PN' sheet lists the patent numbers (PNs) to look up.
INPUT_FILE='PNS_1.xlsx'
# Workbook whose 'PN' sheet accumulates the (Query, Similar) result rows.
# It is both read back (to find what is already done) and rewritten.
OUT_FILE='PN_Out_1.xlsx'
# Set up the WebDriver. NOTE: this runs at import time, so merely importing
# this module starts a Chrome session that is shared by every function below.
driver = webdriver.Chrome()
# Buffer of (query PN, similar PN) tuples waiting to be flushed to OUT_FILE
save_data=[]
# Log in
def login(username='[email protected]', password='xxxxx'):
    """Sign in to the Zhihuiya account page with an e-mail/password pair.

    Uses the module-level ``driver``. The function returns silently without
    filling in the form when the browser is not on ``account.zhihuiya.com``
    after loading the page, or when the e-mail/password tab is not displayed.

    Args:
        username: Account e-mail typed into the first form field. The default
            is the placeholder value that used to be hard-coded here.
        password: Password typed into the second form field. The default is
            the placeholder value that used to be hard-coded here.

    Raises:
        NoSuchElementException: If the tab, one of the form inputs or the
            login button cannot be located. A missing post-login dialog is
            handled internally and does not raise.
    """
    # Open the Zhihuiya account (sign-in) page
    driver.get('https://account.zhihuiya.com/#/')
    time.sleep(5)
    current_url = driver.current_url
    print('當前頁面url:', current_url)
    if 'account.zhihuiya.com' not in current_url:
        # The browser left the account site (presumably a session is already
        # active - TODO confirm), so there is no form to fill in.
        return

    tab_element = driver.find_element(By.ID, 'tab-EMAIL_PASSWORD')
    if not tab_element.is_displayed():
        return
    tab_element.click()
    time.sleep(3)

    # Locate the account input box and type the account name
    account_element = driver.find_element(By.XPATH, '//*[@id="pane-EMAIL_PASSWORD"]/form/div[1]/div/input')
    account_element.click()
    account_element.clear()
    account_element.send_keys(username)
    pwd_element = driver.find_element(By.XPATH, '//*[@id="pane-EMAIL_PASSWORD"]/form/div[2]/div/div/input')
    pwd_element.send_keys(password)
    # Login button
    btn_element = driver.find_element(By.XPATH, '//*[@id="pane-EMAIL_PASSWORD"]/form/div[4]/div/button')
    btn_element.click()
    time.sleep(5)

    # After submitting, a dialog may appear (presumably a consent / terms
    # prompt - verify against the live page). Click its second button when
    # the dialog is visible; its absence is not an error.
    try:
        element = driver.find_element(By.XPATH, '/html/body/div[6]/div')
        is_visible = element.is_displayed()
        print(f"元素是否可見: {is_visible}")
        if is_visible:
            agree_element = driver.find_element(By.XPATH, '/html/body/div[6]/div/div[2]/button[2]')
            agree_element.click()
            time.sleep(5)
    except NoSuchElementException:
        print("元素不存在")

# Look up the patentId that belongs to a PN
def find_patentId(pn):
    """Search for ``pn``, read its patent id and fetch its similar patents.

    Loads the publication-search result page for ``pn`` in the shared
    ``driver``, finds the link element whose ``data-link-data`` equals ``pn``,
    reads that element's ``data-patent-id`` attribute and passes both values
    on to ``find_similar``.

    Args:
        pn: Patent number as a string.
    """
    search_url = ''.join([
        'https://analytics.zhihuiya.com/search/result/tablelist/1?sort=pdesc&limit=100&q=',
        pn,
        '&_type=query&search_mode=publication',
    ])
    driver.get(search_url)
    time.sleep(2)

    link_xpath = '//*[@data-link-type="PN"][@data-link-data="{}"]'.format(pn)
    pn_link = driver.find_element(By.XPATH, link_xpath)
    print(pn_link)

    # Read the element attribute that carries the patent id
    resolved_id = pn_link.get_attribute("data-patent-id")
    print(resolved_id)
    find_similar(pn, resolved_id)

# Find similar patents from a PN and its patentId
def find_similar(pn,patent_Id):
    """Scrape the similar-patent list for one patent into ``save_data``.

    Opens the "similar" view for ``patent_Id`` in the shared ``driver``,
    collects the text of every matching PN cell and appends one
    ``(pn, similar_pn)`` tuple per cell to the module-level ``save_data``
    buffer. Once the buffer holds at least 100 tuples it is flushed through
    ``write_diff_PNs``.

    Args:
        pn: Patent number that was queried.
        patent_Id: Patent id string belonging to ``pn``.
    """
    url = (
        'https://analytics.zhihuiya.com/patent-view/similar?_type=query&source_type=search_result&rows=100&patentId='
        + patent_Id + '&sort=pdesc&page=1&q=' + pn
    )
    print(url)
    driver.get(url)
    time.sleep(6)
    span_elements = driver.find_elements(By.XPATH, '//*[@class="PN"]/span[@class="snap-table__pn-row"]')
    print('相似專利:', span_elements)
    # ``save_data`` is only mutated in place here, so no ``global`` statement
    # is required.
    for ele in span_elements:
        # ``.text`` is fetched from the browser on every access; read it once
        # so the value that is printed is exactly the value that is stored.
        similar_pn = ele.text
        print(pn, similar_pn)
        save_data.append((pn, similar_pn))

    # Flush to Excel once 100 or more rows are buffered
    if len(save_data) >= 100:
        write_diff_PNs()
        print('寫入數據')


# Collect the PNs that are already saved in the Excel output
def find_out_PNs(sheetname,output_file):
    """Return the first-column value of every data row of a worksheet.

    Rows are read starting at row 2, i.e. the first row is skipped.

    Args:
        sheetname: Name of the worksheet to read.
        output_file: Path of the workbook to open.

    Returns:
        list: One entry per row, in sheet order, taken from column 1.
    """
    # Load the workbook
    book = load_workbook(output_file)
    worksheet = book[sheetname]
    print(worksheet)

    saved_pns = []
    for cells in worksheet.iter_rows(min_col=1, max_col=2, min_row=2):
        # Two columns are fetched per row but only the first one is kept.
        saved_pns.append(cells[0].value)

    book.close()
    return saved_pns

# Collect every PN whose similar patents need to be fetched
def find_all_PNs(sheetname,output_file):
    """Return the column-1 value of every data row of a worksheet.

    Rows are read starting at row 2, i.e. the first row is skipped.

    Args:
        sheetname: Name of the worksheet to read.
        output_file: Path of the workbook to open.

    Returns:
        list: One entry per row, in sheet order.
    """
    # Load the workbook
    book = load_workbook(output_file)
    worksheet = book[sheetname]
    print(worksheet)

    wanted_pns = []
    # Only one column is requested, so each row is a 1-tuple of cells.
    for (cell,) in worksheet.iter_rows(min_col=1, max_col=1, min_row=2):
        wanted_pns.append(cell.value)

    book.close()
    return wanted_pns

# Compute the difference between all PNs and the PNs already fetched
def find_diff_PNs():
    """Return the input PNs that have no row in the output workbook yet.

    Reads column 1 of the 'PN' sheet of ``OUT_FILE`` and of ``INPUT_FILE``,
    de-duplicates both, and prints the two counts, the PNs that exist only in
    the output ("extra") and the PNs that are still missing from it.

    Blank cells are read as ``None``; they are excluded from the returned
    list because they are not patent numbers and cannot be turned into a
    search URL.

    Returns:
        list: Distinct PNs present in the input but absent from the output.
    """
    # Sets give O(1) membership tests instead of scanning a list per lookup.
    out_pns = set(find_out_PNs('PN', OUT_FILE))
    print('已存pns個數:', len(out_pns))
    all_pns = set(find_all_PNs('PN', INPUT_FILE))
    print('所有pns個數:', len(all_pns))

    extra_pns = [x for x in out_pns if x not in all_pns]
    print('多餘pns:', extra_pns)

    diff_pns = [x for x in all_pns if x is not None and x not in out_pns]
    print('差異pns:', diff_pns)
    return diff_pns

# Merge the buffered rows into the Excel output
def write_diff_PNs():
    """Flush the ``save_data`` buffer into the 'PN' sheet of ``OUT_FILE``.

    The buffered ``(Query, Similar)`` tuples are appended to the rows already
    stored in the sheet, exact duplicate pairs are dropped (first occurrence
    kept), the sheet is replaced with the merged table and the buffer is
    reset. When the buffer is empty the workbook is left untouched.
    """
    global save_data
    if not save_data:
        # Nothing new to store: skip re-reading and rewriting the whole
        # workbook, and avoid concatenating an empty frame.
        return

    # Merge with the previously stored rows
    new_rows = pd.DataFrame(save_data, columns=['Query', 'Similar'])
    stored = pd.read_excel(OUT_FILE, sheet_name='PN')
    print('已存數據行數:', len(stored))
    merged = pd.concat([stored, new_rows], ignore_index=True)
    print('刪除前數據數:', len(merged))
    # drop_duplicates removes repeated (Query, Similar) pairs
    merged = merged.drop_duplicates(subset=['Query', 'Similar'], keep="first", ignore_index=True)
    print('刪除後數據數:', len(merged))
    with pd.ExcelWriter(OUT_FILE, engine='openpyxl', mode='a', if_sheet_exists="replace") as writer:
        merged.to_excel(writer, sheet_name='PN', index=False)
    # Reset only after a successful write so a failed write keeps the rows.
    save_data = []


if __name__ == '__main__':
    # Record the start time
    start_time = time.time()
    print(start_time)

    # A PN that still has no row in OUT_FILE after this many passes (lookup
    # keeps failing, or it yields no similar patents) is given up on, so the
    # loop below always terminates instead of retrying it forever.
    MAX_ATTEMPTS = 3
    attempts = Counter()

    pending_pns = find_diff_PNs()
    if pending_pns:
        try:
            login()
            time.sleep(10)
            while pending_pns:
                print('差異pns數量', len(pending_pns))
                for pn in pending_pns:
                    attempts[pn] += 1
                    try:
                        # Resolve the patentId and scrape the similar patents
                        find_patentId(pn)
                    except Exception as exc:
                        # One bad PN must not abort the rest of the batch;
                        # report which PN failed and why.
                        print("打印異常", pn, repr(exc))
                    time.sleep(1)
                time.sleep(10)
                print(save_data)
                try:
                    write_diff_PNs()
                except Exception as exc:
                    # Rows stay buffered in save_data and are retried later.
                    print("打印異常", repr(exc))
                # Re-read the workbooks once per pass and drop exhausted PNs
                pending_pns = [pn for pn in find_diff_PNs() if attempts[pn] < MAX_ATTEMPTS]

            if save_data:
                # Last chance to persist rows left over from a failed write;
                # an error here is deliberately allowed to surface.
                write_diff_PNs()
            time.sleep(10)
        finally:
            # Close the browser even when something unexpected is raised
            driver.quit()
    else:
        print('已無需要查找相似的專利')
        driver.quit()
    # Record the end time
    end_time = time.time()
    # Compute the elapsed time
    elapsed_time = end_time - start_time
    print(f"耗時: {elapsed_time} 秒")

    # Count how often each value repeats in the saved list
    # data_list = find_out_PNs('PN',OUT_FILE)
    # counts = Counter(data_list)
    # unique_values = list(OrderedDict.fromkeys(data_list))
    # for unique in unique_values:
    #     value = counts.get(unique)
    #     if value>20:
    #         print(str(unique) + '\t' + str(value))

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章