selenium+mysql 爬取LEI官網數據

"""Scrape LEI records from the leichina.org search page into MySQL.

The script drives Chrome through Selenium: it opens the search form inside
the ``frame2`` iframe, asks the operator to type the captcha shown in the
browser, then walks every result page. For each result row it opens the
detail page to read the English company name and inserts one row into the
``lei`` table.
"""

import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import pymysql

START_URL = "https://www.leichina.org/cei/2935720/2935943/index.html"

# Raw strings keep the backslash that escapes ':' for the CSS engine without
# relying on Python's deprecated handling of the invalid "\:" escape.
RESULT_TABLE_SELECTOR = r"#resultForm\:cdmLegalEntityListData"
RESULT_TBODY_SELECTOR = r"#resultForm\:cdmLegalEntityListData\:tb"
CAPTCHA_INPUT_SELECTOR = r"#searchForm\:vCode"
ENGLISH_NAME_SELECTOR = (
    r"#j_id3\:j_id6 > table > tbody > tr > td > table > tbody > "
    "tr:nth-child(2) > td.txtLeft"
)
BACK_BUTTON_SELECTOR = (
    "#j_id3 > table > tbody > tr > td > table > tbody > tr:nth-child(1) > td > "
    "table > tbody > tr > td > table:nth-child(15) > tbody > tr:nth-child(2) > "
    "td > table > tbody > tr > td:nth-child(2) > a > img"
)
NEXT_PAGE_SELECTOR = (
    "#resultForm > table > tbody > tr:nth-child(1) > td > table > tbody > tr > "
    "td > table:nth-child(2) > tbody > tr:nth-child(2) > td > a:nth-child(9)"
)

INSERT_SQL = "INSERT INTO lei (lei, company_cn_name, status, address, company_en_name) VALUES (%s, %s, %s, %s, %s)"

# Fixed pauses (seconds) kept from the original script; they are used in
# addition to the explicit WebDriverWait conditions.
PAGE_SETTLE_SECONDS = 10
DETAIL_SETTLE_SECONDS = 5
NEXT_PAGE_SETTLE_SECONDS = 20


def _wait_for_result_table(driver):
    """Block until the result table element is present in the DOM."""
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, RESULT_TABLE_SELECTOR)))


def _result_rows(driver):
    """Return the ``<tr>`` elements currently inside the result table body."""
    table = driver.find_element(By.CSS_SELECTOR, RESULT_TBODY_SELECTOR)
    return table.find_elements(By.TAG_NAME, "tr")


def _open_search_results(driver):
    """Load the site, submit the search form and wait for the first results.

    The captcha is solved by the operator: the script blocks on ``input()``
    until the code is typed into the console.
    """
    driver.get(START_URL)
    time.sleep(PAGE_SETTLE_SECONDS)
    driver.maximize_window()

    # Wait for the page to finish loading.
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

    # The search form lives inside an iframe; switch context into it.
    iframe = driver.find_element(By.ID, "frame2")
    driver.switch_to.frame(iframe)

    # Click the "more" button.
    driver.find_element(By.ID, "moreBtn").click()

    # Wait for the drop-down, then pick its second option (index 1).
    select_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#btn2 > select")))
    Select(select_element).select_by_index(1)

    # Click the refresh icon to get a fresh captcha, then wait for its image.
    driver.find_element(By.ID, "changepic").click()
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#imgcode")))

    # Wait for the operator to type the captcha manually.
    user_input = input("請打開驗證碼圖片URL,在瀏覽器中查看驗證碼,並輸入驗證碼:")
    driver.find_element(By.CSS_SELECTOR, CAPTCHA_INPUT_SELECTOR).send_keys(user_input)

    # Submit the query and wait for the result table.
    driver.find_element(By.ID, "searchForm:j_id36").click()
    time.sleep(PAGE_SETTLE_SECONDS)
    _wait_for_result_table(driver)


def _scrape_current_page(driver, cursor, db):
    """Insert every row of the currently displayed result page into MySQL.

    Each row is committed individually, so rows already scraped survive a
    later failure.
    """
    time.sleep(PAGE_SETTLE_SECONDS)
    _wait_for_result_table(driver)

    # Only used to get the row count: the last page may have fewer than 10 rows.
    row_count = len(_result_rows(driver))

    for i in range(row_count):
        # Re-locate the rows on every iteration: the script navigates to the
        # detail page and back, so element handles from a previous iteration
        # are not reused.
        cells = _result_rows(driver)[i].find_elements(By.TAG_NAME, "td")
        lei = cells[0].text
        company_cn_name = cells[1].text
        status = cells[2].text
        address = cells[3].text

        # Open the detail page through the link in the fifth cell.
        cells[4].find_element(By.TAG_NAME, "a").click()
        time.sleep(DETAIL_SETTLE_SECONDS)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # Read the English company name from the detail page.
        company_en_name = driver.find_element(By.CSS_SELECTOR, ENGLISH_NAME_SELECTOR).text

        # Parameterized insert; values are passed separately from the SQL text.
        cursor.execute(INSERT_SQL, (lei, company_cn_name, status, address, company_en_name))

        # Click the back button to return to the result list.
        driver.find_element(By.CSS_SELECTOR, BACK_BUTTON_SELECTOR).click()
        time.sleep(DETAIL_SETTLE_SECONDS)

        # Commit the transaction for this row.
        db.commit()
        print(f"已經爬取第{i+1}條...")


def _go_to_next_page(driver):
    """Click the next-page link and give the new page time to load."""
    driver.find_element(By.CSS_SELECTOR, NEXT_PAGE_SELECTOR).click()
    time.sleep(NEXT_PAGE_SETTLE_SECONDS)


# Path to the ChromeDriver executable.
webdriver_path = 'C:/chromedriver/chromedriver.exe'

# Service object that launches ChromeDriver.
service = Service(executable_path=webdriver_path)

# Number of result pages to walk (hard-coded in the original script).
total_pages = 10692

# Connect to the database.
# NOTE(review): the password is a masked placeholder; supply real credentials
# from configuration rather than committing them to source.
db = pymysql.connect(host='127.0.0.1', user='root', password='******', db='lei_db', charset='utf8mb4')
try:
    cursor = db.cursor()
    try:
        driver = webdriver.Chrome(service=service)
        try:
            _open_search_results(driver)

            for page in range(1, total_pages + 1):
                print(f"正在處理第{page}頁...")
                _scrape_current_page(driver, cursor, db)

                # Only advance while there is a following page; the original
                # `page <= total_pages` test was always true and clicked
                # "next" after the final page as well.
                if page < total_pages:
                    _go_to_next_page(driver)
        finally:
            # Always shut the browser down, even when scraping fails.
            driver.quit()
    finally:
        cursor.close()
finally:
    # Close the database connection.
    db.close()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章