一、需求
有一批專利號需要在智慧芽上找它對應的相似專利。
抓包看了下智慧芽的登錄接口:可以直接使用用戶名、密碼登錄,沒有驗證碼限制,少了很多麻煩。它使用 token 請求,一開始想着獲取登錄後的 token 再批量請求接口就好了,但嘗試後發現不行:請求次數多了就會被限制,即使重新登錄換新 token 也不行,所以改成使用 selenium 模擬瀏覽器請求來抓取數據。
二、思路
按照人手動操作的流程:先登錄,然後打開頁面抓取數據,再保存到 Excel 中。由於數據比較多,代碼運行過程中容易出現中斷,所以每抓取一部分就保存一部分到 Excel 中,一直循環抓取,直到所有專利號都處理完。
三、輸入輸出格式
輸入
輸出
四、代碼
代碼中涉及到的知識點還是比較多的,所以貼了上來,方便後面遇到類似的直接拿來使用。
import os
import time
from collections import Counter
from collections import OrderedDict

import pandas
import pandas as pd
from openpyxl import load_workbook
from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By

INPUT_FILE = 'PNS_1.xlsx'
OUT_FILE = 'PN_Out_1.xlsx'

# Credentials can be supplied through the environment so they do not have to
# be committed with the script; the original placeholders remain the defaults.
LOGIN_EMAIL = os.environ.get('ZHIHUIYA_EMAIL', '[email protected]')
LOGIN_PASSWORD = os.environ.get('ZHIHUIYA_PASSWORD', 'xxxxx')

# A PN that is still missing from OUT_FILE after this many attempts is given
# up on for the current run, so one bad PN cannot keep the script looping.
MAX_ATTEMPTS_PER_PN = 3

# Set up the WebDriver.
driver = webdriver.Chrome()
# Buffer of (query PN, similar PN) tuples not yet written to OUT_FILE.
save_data = []


def login(username=LOGIN_EMAIL, password=LOGIN_PASSWORD):
    """Sign in to the Zhihuiya account page with e-mail and password.

    Args:
        username: Account e-mail typed into the login form.
        password: Account password typed into the login form.

    If the browser is not on ``account.zhihuiya.com`` after loading the page,
    nothing is typed. After submitting, a dialog that may appear is accepted
    when it is present and visible.
    """
    # Open the Zhihuiya account page.
    driver.get('https://account.zhihuiya.com/#/')
    time.sleep(5)
    current_url = driver.current_url
    print('當前頁面url:', current_url)
    if 'account.zhihuiya.com' not in current_url:
        return

    # Switch to the e-mail/password tab when it is shown.
    tab_element = driver.find_element(By.ID, 'tab-EMAIL_PASSWORD')
    if tab_element.is_displayed():
        tab_element.click()
    time.sleep(3)

    # Fill in the account input box.
    account_element = driver.find_element(
        By.XPATH, '//*[@id="pane-EMAIL_PASSWORD"]/form/div[1]/div/input')
    account_element.click()
    account_element.clear()
    account_element.send_keys(username)
    pwd_element = driver.find_element(
        By.XPATH, '//*[@id="pane-EMAIL_PASSWORD"]/form/div[2]/div/div/input')
    pwd_element.send_keys(password)

    # Login button.
    btn_element = driver.find_element(
        By.XPATH, '//*[@id="pane-EMAIL_PASSWORD"]/form/div[4]/div/button')
    btn_element.click()
    time.sleep(5)

    try:
        # Locate the dialog that may be shown after signing in.
        element = driver.find_element(By.XPATH, '/html/body/div[6]/div')
        is_visible = element.is_displayed()
        print(f"元素是否可見: {is_visible}")
        if is_visible:
            agree_element = driver.find_element(
                By.XPATH, '/html/body/div[6]/div/div[2]/button[2]')
            agree_element.click()
            time.sleep(5)
    except NoSuchElementException:
        print("元素不存在")


def find_patentId(pn):
    """Look up the patent id for ``pn`` and collect its similar patents.

    Args:
        pn: Publication number to search for.

    Returns:
        The number of similar-patent rows collected by ``find_similar``.

    Raises:
        NoSuchElementException: No result link for ``pn`` is on the page.
        ValueError: The result link has no ``data-patent-id`` attribute.
    """
    url = ('https://analytics.zhihuiya.com/search/result/tablelist/1'
           '?sort=pdesc&limit=100&q=' + pn
           + '&_type=query&search_mode=publication')
    driver.get(url)
    time.sleep(2)
    a_element = driver.find_element(
        By.XPATH, '//*[@data-link-type="PN"][@data-link-data="' + pn + '"]')
    print(a_element)
    # Read the patent id from the element's attribute.
    patent_id = a_element.get_attribute("data-patent-id")
    print(patent_id)
    if not patent_id:
        # Fail with a clear message instead of a TypeError on concatenation.
        raise ValueError(f'no data-patent-id found for {pn}')
    return find_similar(pn, patent_id)


def find_similar(pn, patent_Id):
    """Open the similar-patents page and buffer every listed PN.

    Args:
        pn: Publication number used as the query.
        patent_Id: Patent id belonging to ``pn``.

    Returns:
        The number of ``(pn, similar PN)`` tuples appended to ``save_data``.
    """
    url = ('https://analytics.zhihuiya.com/patent-view/similar'
           '?_type=query&source_type=search_result&rows=100&patentId='
           + patent_Id + '&sort=pdesc&page=1&q=' + pn)
    print(url)
    driver.get(url)
    time.sleep(6)
    span_elements = driver.find_elements(
        By.XPATH, '//*[@class="PN"]/span[@class="snap-table__pn-row"]')
    print('相似專利:', span_elements)
    for ele in span_elements:
        similar_pn = ele.text
        print(pn, similar_pn)
        save_data.append((pn, similar_pn))
    # Flush to Excel once at least one hundred rows are buffered.
    if len(save_data) >= 100:
        write_diff_PNs()
        print('寫入數據')
    return len(span_elements)


def _first_column_values(sheetname, path):
    """Return the first-column values of ``sheetname``, skipping the header."""
    workbook = load_workbook(path)
    try:
        sheet = workbook[sheetname]
        print(sheet)
        return [row[0].value
                for row in sheet.iter_rows(min_col=1, max_col=1, min_row=2)]
    finally:
        # Close the workbook even when the sheet lookup fails.
        workbook.close()


def find_out_PNs(sheetname, output_file):
    """Return the query PNs already saved in the output workbook.

    Args:
        sheetname: Name of the sheet to read.
        output_file: Path of the output workbook.

    Returns:
        First-column values from row 2 onward (duplicates included), or an
        empty list when ``output_file`` does not exist yet.
    """
    if not os.path.exists(output_file):
        # First run: nothing has been saved so far.
        return []
    return _first_column_values(sheetname, output_file)


def find_all_PNs(sheetname, output_file):
    """Return every PN listed in the input workbook.

    Args:
        sheetname: Name of the sheet to read.
        output_file: Path of the workbook holding the PNs to process.

    Returns:
        First-column values from row 2 onward.
    """
    return _first_column_values(sheetname, output_file)


def find_diff_PNs():
    """Return the input PNs that are not yet present in OUT_FILE.

    Blank cells are ignored. The result has no duplicates and keeps the
    order in which the PNs appear in INPUT_FILE.
    """
    out_pns = {pn for pn in find_out_PNs('PN', OUT_FILE) if pn is not None}
    print('已存pns個數:', len(out_pns))
    # dict.fromkeys removes duplicates while preserving the input order.
    all_pns = list(dict.fromkeys(
        pn for pn in find_all_PNs('PN', INPUT_FILE) if pn is not None))
    print('所有pns個數:', len(all_pns))
    all_pn_set = set(all_pns)
    extra_pns = [pn for pn in out_pns if pn not in all_pn_set]
    print('多餘pns:', extra_pns)
    diff_pns = [pn for pn in all_pns if pn not in out_pns]
    print('差異pns:', diff_pns)
    return diff_pns


def write_diff_PNs():
    """Merge the buffered rows into OUT_FILE and clear the buffer.

    Rows are appended to the existing 'PN' sheet, duplicate
    (Query, Similar) pairs are dropped, and the sheet is rewritten. The
    workbook is created when it does not exist yet. Nothing is done when the
    buffer is empty.
    """
    global save_data
    if not save_data:
        return
    new_rows = pd.DataFrame(save_data, columns=['Query', 'Similar'])
    if os.path.exists(OUT_FILE):
        stored = pd.read_excel(OUT_FILE, sheet_name='PN')
        print('已存數據行數:', len(stored))
        merged = pd.concat([stored, new_rows], ignore_index=True)
        writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        # Append mode requires an existing file, so create it instead.
        merged = new_rows
        writer_options = {'mode': 'w'}
    print('刪除前數據數:', len(merged))
    merged = merged.drop_duplicates(
        subset=['Query', 'Similar'], keep="first", ignore_index=True)
    print('刪除後數據數:', len(merged))
    with pd.ExcelWriter(OUT_FILE, engine='openpyxl',
                        **writer_options) as writer:
        merged.to_excel(writer, sheet_name='PN', index=False)
    save_data = []


def _print_frequent_pns(min_count=20):
    """Print each saved query PN that occurs more than ``min_count`` times."""
    data_list = find_out_PNs('PN', OUT_FILE)
    counts = Counter(data_list)
    for unique in OrderedDict.fromkeys(data_list):
        value = counts.get(unique)
        if value > min_count:
            print(str(unique) + '\t' + str(value))


def main():
    """Fetch similar patents for every pending PN, saving as it goes."""
    # Record the start time.
    start_time = time.time()
    print(start_time)
    try:
        if find_diff_PNs():
            login()
            time.sleep(10)
            attempts = Counter()
            while True:
                # A PN stays pending until it shows up in OUT_FILE; stop
                # retrying it once it has used up its attempts.
                pending = [pn for pn in find_diff_PNs()
                           if attempts[pn] < MAX_ATTEMPTS_PER_PN]
                if not pending:
                    break
                print('差異pns數量', len(pending))
                for pn in pending:
                    attempts[pn] += 1
                    try:
                        find_patentId(pn)
                    except Exception as exc:
                        # Keep going with the next PN; this one is retried
                        # in a later round.
                        print("打印異常", pn, repr(exc))
                        time.sleep(10)
                    else:
                        time.sleep(1)
                time.sleep(10)
                print(save_data)
                try:
                    write_diff_PNs()
                except Exception as exc:
                    print("打印異常", repr(exc))
                    time.sleep(10)
            given_up = [pn for pn, count in attempts.items()
                        if count >= MAX_ATTEMPTS_PER_PN]
            if given_up:
                print('PNs attempted', MAX_ATTEMPTS_PER_PN,
                      'times (may still be missing):', given_up)
        else:
            print('已無需要查找相似的專利')
    finally:
        # Always close the browser, even on an unexpected error or Ctrl-C.
        driver.quit()
    # Record the end time and report the elapsed time.
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"耗時: {elapsed_time} 秒")


if __name__ == '__main__':
    main()