Crawler: downloading patent PDFs via the hyperlinks attached to patent numbers in a spreadsheet

A colleague at work wrote this for me. I had never touched web crawling before, so I'm archiving the code here to learn from it.

The first script reads the patent numbers and their corresponding hyperlinks out of the spreadsheet cells and writes them to a new CSV file.



import pandas as pd
from openpyxl import load_workbook

filename = '/data/datasets/LLMS/20231208 湖南中烟专利检索结果及全文(去噪前)/加热卷烟烟支结构专利风险评估项目/烟支结构专利检索结果(电脑连接外网状态下点击“公开号”可打开专利全文链接).XLSX'

# pandas' read_excel only returns cell values, so openpyxl is used to get at the hyperlinks
workbook = load_workbook(filename)
sheet = workbook[workbook.sheetnames[0]]  # the data sits in the first sheet

ids = []
links = []
for row in sheet.iter_rows():
    for cell in row:
        if cell.hyperlink:
            ids.append(row[1].value)             # patent number in column B
            links.append(cell.hyperlink.target)  # full-text URL behind the hyperlink

# Build a DataFrame and save it as a CSV file
df = pd.DataFrame({'ID': ids, 'Link': links})
df.to_csv('output.csv', index=False)
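
A side note on the double loop above: it scans every cell of every row looking for hyperlinks. openpyxl's iter_rows also accepts min_row/min_col/max_col, so if the links are known to sit in a single column the scan can be restricted to it. A minimal sketch, assuming the hyperlinks live in column C, the patent numbers in column B, and row 1 is a header (all assumptions to be adjusted to the actual sheet):

from openpyxl import load_workbook

filename = 'patents.xlsx'  # placeholder; substitute the XLSX path from the script above
wb = load_workbook(filename)
ws = wb[wb.sheetnames[0]]

ids, links = [], []
# min_col=3 / max_col=3 restricts iteration to column C; min_row=2 skips the header row
for (cell,) in ws.iter_rows(min_row=2, min_col=3, max_col=3):
    if cell.hyperlink:
        ids.append(ws.cell(row=cell.row, column=2).value)  # patent number from column B
        links.append(cell.hyperlink.target)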

The next block is the crawler itself. My colleague told me that disguising the request as a real browser is a very effective way to get past anti-scraping measures, and beyond that it's just a matter of sleeping for a moment to give the page time to respond.

import os
import re
import time

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options


data = pd.read_csv('/data/python projects/image_tool/notebooks_dev/llms/专利检索/links.csv')
path = '/data/datasets/LLMS/20231208 湖南中烟专利检索结果及全文(去噪前)/加热卷烟烟支结构专利风险评估项目/files'

# Attach to a real Chrome instance so the site sees an ordinary browser rather than an automated one.
# Start that browser first in a terminal; the remote-debugging port must match debuggerAddress below:
# !chrome --remote-debugging-port=9222 --user-data-dir="/data0/home/aimall/others/google"
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
# chromedriver must match the installed Chrome version; download it from
# https://sites.google.com/chromium.org/driver/downloads/version-selection
service = Service("chromedriver_linux64/chromedriver-linux64/chromedriver")
driver = webdriver.Chrome(service=service, options=chrome_options)

pattern = r'https?://[^\s"]+'

if __name__ == '__main__':
    for _, row in data.iterrows():
        pdf_name = str(row.iloc[0]) + '.pdf'          # patent number becomes the file name
        url = row.iloc[1].replace('abst', 'pdf')      # abstract-page URL -> full-text PDF page
        driver.get(url)
        time.sleep(3)  # wait for the page to finish loading
        page_source = driver.page_source

        # Pick out the CDN link that serves the actual PDF file
        pdf_url = None
        for candidate in re.findall(pattern, page_source):
            if candidate.startswith('https://patsnap-pdf.cdn.zhihuiya.com'):
                pdf_url = candidate

        if pdf_url is not None:
            pdf_url = pdf_url.replace('&amp;', '&')  # the URL comes from HTML source, so unescape entities
            response = requests.get(pdf_url)
            with open(os.path.join(path, pdf_name), "wb") as file:
                file.write(response.content)
        time.sleep(2)
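
Two things I would probably tighten up after reading this (a sketch only, not how the script above actually runs): the fixed time.sleep(3) can be replaced by an explicit WebDriverWait that polls until the PDF CDN link actually appears in the page source, and the plain requests.get download can carry a browser-like User-Agent header, in the same browser-disguise spirit. The driver, url and pdf_url names below refer to the loop in the script above; the 15-second timeout and the UA string are just placeholders.

from selenium.webdriver.support.ui import WebDriverWait
import requests

driver.get(url)
# Poll the page source until the PDF CDN link shows up, instead of sleeping a fixed 3 seconds
WebDriverWait(driver, 15).until(
    lambda d: 'patsnap-pdf.cdn.zhihuiya.com' in d.page_source
)
page_source = driver.page_source

headers = {
    # any recent desktop Chrome UA string works; this one is purely illustrative
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
}
response = requests.get(pdf_url, headers=headers, timeout=60)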

 
