- Experiment 1: Scraping stock data
from bs4 import BeautifulSoup
import bs4, csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()
browser.get('http://data.eastmoney.com/notices/')
wait = WebDriverWait(browser, 10)
#browser.find_element_by_css_selector('#dt_1').click()
table_emergence = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#dt_1')))  # wait until the notices table has rendered
################### Info extraction ######################
info = []
def get_info(html):
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find(name='table', attrs={'id': 'dt_1'})
    trs = table.find('tbody').children
    for tr in trs:
        if isinstance(tr, bs4.element.Tag):  # skip NavigableString nodes between rows
            tds = tr.find_all('td')
            code = tds[0].a.string
            name = tds[1].a.string
            title = tds[3].a.string
            title_type = tds[4].span.string
            pub_date = tds[5].span.string  # renamed from `time` to avoid shadowing the time module
            sub_info = [code, name, title, title_type, pub_date]
            info.append(sub_info)
############# Pagination ######################
def next_page(page_number):
    try:
        wait = WebDriverWait(browser, 20)
        inputs = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#PageContgopage')))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#PageCont > a.btn_link')))
        inputs.clear()
        inputs.send_keys(page_number)
        submit.click()
        # Wait until the active-page indicator shows the requested page number
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#PageCont > span.at'), str(page_number)))
    except TimeoutException:
        next_page(page_number)  # retry on timeout (note: unbounded recursion)
####################### Save data ##################################
def save_data(data):
    with open('股票数据.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # columns: code, name, notice title, notice type, notice date
        writer.writerow(['代码', '名称', '公告标题', '公告类型', '公告日期'])
        for a in data:
            print(a)
            writer.writerow(a)
################### Crawl multiple pages in a loop ##############################
for i in range(0, 16):
    get_info(browser.page_source)  # scrape the current page...
    next_page(i + 2)               # ...then jump to the next one
    time.sleep(2)
save_data(info)
browser.close()
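
Note that next_page calls itself on every TimeoutException with no base case, so a page that never loads recurses without bound (eventually raising RecursionError). A bounded variant could look like the sketch below; it reuses the same selectors and the browser global, and next_page_bounded and max_retries are hypothetical names, not part of the original experiment.
# A minimal sketch of a bounded retry: gives up and returns False
# after max_retries timeouts instead of recursing forever.
def next_page_bounded(page_number, max_retries=3):
    for attempt in range(max_retries):
        try:
            wait = WebDriverWait(browser, 20)
            inputs = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#PageContgopage')))
            submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#PageCont > a.btn_link')))
            inputs.clear()
            inputs.send_keys(page_number)
            submit.click()
            wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#PageCont > span.at'), str(page_number)))
            return True
        except TimeoutException:
            continue  # try again, up to max_retries attempts
    return False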
- Experiment 2: Scraping earthquake catalog data
from bs4 import BeautifulSoup
import csv, numpy
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()
browser.get('http://www.ceic.ac.cn/history')
wait = WebDriverWait(browser, 5)
browser.find_element(By.CLASS_NAME, 'check').click()  # find_element_by_class_name was removed in Selenium 4
time.sleep(2)
################# Info extraction ######################
info = []
def get_info(html):
    soup = BeautifulSoup(html, 'lxml')
    tds = soup.find_all('td')  # renamed so the loop variable does not shadow the list
    for td in tds:
        info.append(td.text)
##################### Pagination ######################
def next_page():
    try:
        wait = WebDriverWait(browser, 10)
        nextpage = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.page > [alt=下一页]')))
        nextpage.click()
    except TimeoutException:
        next_page()  # retry on timeout (note: unbounded recursion)
####################### Save data ###################################
def save_data(data):
    with open('地震台信息.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # columns: magnitude (M), origin time (UTC+8), latitude (°), longitude (°), depth (km), reference location
        writer.writerow(['震级(M)', '发震时刻(UTC+8)', '纬度(°)', '经度(°)', '深度(千米)', '参考位置'])
        for a in data:
            print(a)
            writer.writerow(a)
################### Crawl multiple pages in a loop ##############################
for i in range(0, 15):
    html = browser.page_source
    get_info(html)
    next_page()
    time.sleep(2)
# Each record spans six <td> cells; regroup the flat cell list into rows of six
info = numpy.array(info).reshape(len(info) // 6, 6)
save_data(info)
browser.close()
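
The numpy reshape above only regroups the flat list of cell texts into rows of six; if numpy is not wanted as a dependency, a plain list comprehension does the same (a sketch; `rows` is a hypothetical name):
# Group the flat list of <td> texts into rows of six fields each, without numpy
rows = [info[i:i + 6] for i in range(0, len(info), 6)]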
- Experiment 3: Scraping JD.com data
from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()
browser.get('http://www.jd.com')
wait = WebDriverWait(browser, 5)
# Locate the search box and button, then search for "python"
search_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[aria-label=搜索]')))  # renamed from `input` to avoid shadowing the builtin
submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label=搜索]')))
search_input.clear()
search_input.send_keys('python')
submit.click()
time.sleep(2)
# Scroll to the bottom of the page so lazy-loaded list items render
js = "document.documentElement.scrollTop=10000"
browser.execute_script(js)
time.sleep(2)
################# Info extraction ######################
books = []
def get_info(html):
    soup = BeautifulSoup(html, 'lxml')
    tag_img = soup.select('#J_goodsList > ul > li > div > div.p-img')
    tag_price = soup.select('#J_goodsList > ul > li > div > div.p-price')
    tag_name = soup.select('#J_goodsList > ul > li > div > div.p-name > a > em')
    tag_publish = soup.select('#J_goodsList > ul > li > div > div.p-shopnum')
    tag_comment = soup.select('#J_goodsList > ul > li > div > div.p-commit')
    #advertising = soup.select('#J_goodsList > ul > li > div > div.p-market')
    for i in range(0, len(tag_img)):
        temp = []
        temp.append(tag_name[i].text)
        temp.append(tag_price[i].text.strip().replace('\n', ''))
        temp.append(tag_img[i].a.img)  # flaw: appends the <img> tag object itself, not its URL
        temp.append(tag_comment[i].text.replace('\n', ''))
        temp.append(tag_publish[i].text.replace('\n', ''))
        books.append(temp)
############# Pagination ######################
def next_page():
    try:
        wait = WebDriverWait(browser, 10)
        nextpage = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.pn-next > em')))
        nextpage.click()
        time.sleep(2)
        js = "document.documentElement.scrollTop=10000"  # scroll again so the new page fully loads
        browser.execute_script(js)
    except TimeoutException:
        next_page()  # retry on timeout (note: unbounded recursion)
####################### Save data ###################################
def save_data(data):
    with open('京东-python 图书信息.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # columns: name, price, cover, comment count, publisher
        writer.writerow(['名称', '价格', '封面', '评论数', '出版社'])
        for a in data:
            print(a)
            writer.writerow(a)
################### Crawl multiple pages in a loop ##############################
for i in range(0, 168):
    html = browser.page_source
    get_info(html)
    next_page()
    time.sleep(5)
save_data(books)
browser.quit()
The code above cannot filter out advertisements and fails on some special listings (such as multi-book bundles); the corrected code follows:
from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()
browser.get('http://www.jd.com')
wait = WebDriverWait(browser, 5)
# Locate the search box and button, then search for "python"
search_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[aria-label=搜索]')))  # renamed from `input` to avoid shadowing the builtin
submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label=搜索]')))
search_input.clear()
search_input.send_keys('python')
submit.click()
time.sleep(2)
# Scroll to the bottom of the page so lazy-loaded list items render
js = "document.documentElement.scrollTop=10000"
browser.execute_script(js)
time.sleep(2)
################# Info extraction ######################
books = []  # one record per book
def get_info(html):
    # Grab the container div of every listing on the page
    tag_books = BeautifulSoup(html, 'lxml').select('#J_goodsList > ul > li > div')
    for tag_book in tag_books:
        soup = BeautifulSoup(str(tag_book), 'lxml')  # re-parse a single listing
        tag_advertising = soup.select('span.p-promo-flag')  # look for the ad flag
        if len(tag_advertising) > 0:  # skip sponsored listings
            continue
        # Extract the individual fields of one book
        tag_flag = soup.select('div > div.gl-i-tab-content')
        tag_img = soup.select('div.p-img')
        tag_price = soup.select('div.p-price')
        tag_name = soup.select('div.p-name > a > em')
        tag_publish = soup.select('div.p-bookdetails > span.p-bi-store')
        tag_comment = soup.select('div.p-commit')
        temp_book = []  # holds the fields of the current book
        # Case 1: a plain listing without bundle tabs
        if len(tag_flag) == 0:
            temp_book.append(tag_name[0].text)
            temp_book.append(tag_price[0].text.strip().replace('\n', ''))
            temp_src = tag_img[0].a.get('src')
            if temp_src is None:
                temp_href = tag_img[0].a.get('href')
                if not temp_href.startswith('h'):  # protocol-relative URL: prepend the scheme
                    temp_book.append('https:' + temp_href)
                else:  # bug fix: the original appended nothing when the href was already absolute
                    temp_book.append(temp_href)
            else:
                temp_book.append(temp_src)
            # bug fix: comment count before publisher, matching the CSV header order
            temp_book.append(tag_comment[0].text.replace('\n', ''))
            if len(tag_publish) == 1:
                temp_book.append(tag_publish[0].text.replace('\n', ''))
            else:
                temp_book.append('无')
            books.append(temp_book)
        # Case 2: a listing with bundle tabs (several books in one div)
        else:
            for i in range(0, len(tag_name)):
                temp_books = []
                temp_books.append(tag_name[i].text)
                temp_books.append(tag_price[i].text.strip().replace('\n', '')[0:6])
                temp_src = tag_img[i].a.get('src')
                if temp_src is None:
                    temp_href = tag_img[i].a.get('href')
                    if not temp_href.startswith('h'):  # bug fix: the original tested temp_href[i] instead of the first character
                        temp_books.append('https:' + temp_href)
                    else:
                        temp_books.append(temp_href)
                else:
                    temp_books.append(temp_src)
                # bug fix: comment count before publisher, matching the CSV header order
                if tag_comment[i].text.replace('\n', '') == '':
                    temp_books.append('无')
                else:
                    temp_books.append(tag_comment[i].text.replace('\n', ''))
                if len(tag_publish) == 1:
                    temp_books.append(tag_publish[0].text.replace('\n', ''))
                else:
                    temp_books.append('无')
                books.append(temp_books)
############# Pagination ######################
def next_page():
    try:
        wait = WebDriverWait(browser, 10)
        nextpage = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.pn-next > em')))
        nextpage.click()
        time.sleep(2)
        js = "document.documentElement.scrollTop=10000"  # scroll again so the new page fully loads
        browser.execute_script(js)
    except TimeoutException:
        next_page()  # retry on timeout (note: unbounded recursion)
####################### Save data ###################################
def save_data(data):
    with open('京东-python 图书信息.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # columns: name, price, cover, comment count, publisher
        writer.writerow(['名称', '价格', '封面', '评论数', '出版社'])
        for a in data:
            print(a)
            writer.writerow(a)
################### Crawl multiple pages in a loop ##############################
html = browser.page_source
get_info(html)  # scrape the first results page before paginating
for i in range(0, 16):
    next_page()
    time.sleep(5)
    html = browser.page_source
    get_info(html)
save_data(books)
browser.close()
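
As a quick sanity check (a sketch, not part of the original experiment), the CSV written by save_data can be read back with the standard library; `rows` is a hypothetical name:
# Re-open the CSV produced above and count the data rows
import csv
with open('京东-python 图书信息.csv', encoding='utf-8') as f:
    rows = list(csv.reader(f))
print(len(rows) - 1, 'book records written')  # subtract the header row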