Scraping Dynamic Web Pages in Python with Selenium

  • Experiment 1: scraping stock data
Target site: http://data.eastmoney.com/notices/hsa.html
Task: scrape the "代碼" (code), "名稱" (name), "公告標題" (announcement title), "公告類型" (announcement type), and "公告日期" (announcement date) fields from pages 1 through 15 and save them to the file 股票數據.csv.
from bs4 import BeautifulSoup
import bs4,csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser = webdriver.Chrome()
browser.get('http://data.eastmoney.com/notices/')
wait = WebDriverWait(browser, 10)
# wait until the announcement table (#dt_1) has been rendered by the page's JavaScript
table_emergence = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#dt_1')))


################### Extract information ######################
info = []
def get_info(html):
	soup = BeautifulSoup(html, 'lxml')
	table = soup.find(name='table', attrs={'id': 'dt_1'})
	trs = table.find('tbody').children
	for tr in trs:
		if isinstance(tr, bs4.element.Tag):  # skip the whitespace NavigableStrings between rows
			tds = tr.find_all('td')
			code = tds[0].a.string
			name = tds[1].a.string
			title = tds[3].a.string
			title_type = tds[4].span.string
			date = tds[5].span.string  # named 'date' so the time module is not shadowed
			info.append([code, name, title, title_type, date])

############# Page navigation ######################
def next_page(page_number):
	try:
		wait = WebDriverWait(browser, 20)
		inputs = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#PageContgopage')))
		submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#PageCont > a.btn_link')))
		inputs.clear()
		inputs.send_keys(page_number)
		submit.click()
		# confirm the switch: the highlighted page number must show the target page
		wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#PageCont > span.at'), str(page_number)))
	except TimeoutException:
		next_page(page_number)
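
One caveat: the TimeoutException handler calls next_page recursively with no base case, so a persistently unresponsive site would recurse until Python's recursion limit. A bounded variant is a safer sketch (the retries parameter is my addition, not part of the original code):

def next_page_bounded(page_number, retries=3):
	# same navigation as next_page, but give up after a fixed number of attempts
	for attempt in range(retries):
		try:
			wait = WebDriverWait(browser, 20)
			inputs = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#PageContgopage')))
			submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#PageCont > a.btn_link')))
			inputs.clear()
			inputs.send_keys(page_number)
			submit.click()
			wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#PageCont > span.at'), str(page_number)))
			return
		except TimeoutException:
			continue
	raise TimeoutException('page switch failed after %d attempts' % retries)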

###################### Save data ##################################
def save_data(data):
	with open('股票數據.csv', 'w', newline='', encoding='utf-8') as f:
		writer = csv.writer(f)
		# code, name, announcement title, announcement type, announcement date
		writer.writerow(['代碼', '名稱', '公告標題', '公告類型', '公告日期'])
		for a in data:
			print(a)
			writer.writerow(a)
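
Note: Excel misreads BOM-less UTF-8, so the Chinese headers may show as mojibake there. If Excel is the target, only the open call needs to change:

with open('股票數據.csv', 'w', newline='', encoding='utf-8-sig') as f:  # the BOM tells Excel this is UTF-8
	...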


# scrape pages 1-15; stop paging after the 15th
for i in range(15):
	get_info(browser.page_source)
	if i < 14:
		next_page(i + 2)  # page numbers are 1-based, so the next page is i + 2
		time.sleep(2)
save_data(info)
browser.close()
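
A quick read-back confirms the file is intact (a sketch; assumes pandas is installed, which the scraper itself does not need):

import pandas as pd
df = pd.read_csv('股票數據.csv')
print(df.shape)   # expect roughly 15 pages' worth of rows x 5 columns
print(df.head())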
  • Experiment 2: scraping earthquake data
Target site: http://www.ceic.ac.cn/history
Task: scrape the magnitude, origin time, latitude, longitude, depth, and reference-location fields from pages 1 through 15 and save them to the file 地震臺信息.csv. (Hint: the query button must be clicked first to reach the results page.)
from bs4 import BeautifulSoup
import csv, numpy
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


browser = webdriver.Chrome()
browser.get('http://www.ceic.ac.cn/history')
wait = WebDriverWait(browser, 5)
# the results table only appears after the query button is clicked
browser.find_element(By.CLASS_NAME, 'check').click()  # find_element_by_class_name was removed in Selenium 4
time.sleep(2)


################# Extract information ######################
info = []
def get_info(html):
	soup = BeautifulSoup(html, 'lxml')
	tds = soup.find_all('td')
	for td in tds:
		info.append(td.text)  # flat list of cells; reshaped into rows of 6 below

##################### Page navigation ######################
def next_page():
	try:
		wait = WebDriverWait(browser, 10)
		nextpage = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.page > [alt="下一頁"]')))
		nextpage.click()
	except TimeoutException:
		next_page()  # unbounded retry, as in experiment 1

####################### Save data ###################################
def save_data(data):
	with open('地震臺信息.csv', 'w', newline='', encoding='utf-8') as f:
		writer = csv.writer(f)
		# magnitude, origin time (UTC+8), latitude, longitude, depth (km), reference location
		writer.writerow(['震級(M)','發震時刻(UTC+8)','緯度(°)','經度(°)','深度(千米)','參考位置'])
		for a in data:
			print(a)
			writer.writerow(a)

################### Loop over pages ##############################
for i in range(15):
	get_info(browser.page_source)
	if i < 14:  # no need to page past the 15th results page
		next_page()
		time.sleep(2)
info = numpy.array(info).reshape(-1, 6)  # the table has 6 cells per row
save_data(info)

browser.close()
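
numpy is imported only for that reshape. A plain list comprehension does the same grouping without the dependency (equivalent to reshape(-1, 6)):

rows = [info[i:i + 6] for i in range(0, len(info), 6)]  # group the flat cell list into 6-field rows
save_data(rows)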


  • Experiment 3: scraping JD.com data
Target site: https://search.jd.com/Search?keyword=python
Task: scrape the title, price, cover-image link, review count, and publisher of the books on pages 1 through 15 and save them to the file 京東-python 圖書信息.csv.
from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


browser = webdriver.Chrome()
browser.get('http://www.jd.com')
wait = WebDriverWait(browser, 5)
# locate the search box and button, then search for 'python'
search_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[aria-label="搜索"]')))
submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label="搜索"]')))
search_input.clear()
search_input.send_keys('python')
submit.click()
time.sleep(2)
# scroll to the bottom so the lazily loaded items render
js = "document.documentElement.scrollTop=10000"
browser.execute_script(js)
time.sleep(2)
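
Jumping straight to scrollTop=10000 can outrun JD's lazy loader and leave some items unrendered. A gentler sketch that scrolls in steps (the step size and pause are guesses to tune, not values from the original):

def scroll_page(steps=10, pause=0.5):
    # scroll down in increments so each batch of lazy-loaded items can render
    for step in range(1, steps + 1):
        browser.execute_script(f"document.documentElement.scrollTop={step * 1000}")
        time.sleep(pause)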

################# Extract information ######################
books = []
def get_info(html):
    soup = BeautifulSoup(html, 'lxml')
    tag_img = soup.select('#J_goodsList > ul > li > div > div.p-img')
    tag_price = soup.select('#J_goodsList > ul > li > div > div.p-price')
    tag_name = soup.select('#J_goodsList > ul > li > div > div.p-name > a > em')
    tag_publish = soup.select('#J_goodsList > ul > li > div > div.p-shopnum')
    tag_comment = soup.select('#J_goodsList > ul > li > div > div.p-commit')
    # note: promoted (ad) listings are not filtered here; the corrected version below handles them

    for i in range(len(tag_img)):
        temp = []
        temp.append(tag_name[i].text)
        temp.append(tag_price[i].text.strip().replace('\n', ''))
        temp.append(tag_img[i].a.img.get('src'))  # take the link itself, not the whole <img> tag
        temp.append(tag_comment[i].text.replace('\n', ''))
        temp.append(tag_publish[i].text.replace('\n', ''))
        books.append(temp)

############# Page navigation ######################
def next_page():
    try:
        wait = WebDriverWait(browser, 10)
        nextpage = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.pn-next > em')))
        nextpage.click()
        time.sleep(2)
        # scroll again so the new page's lazy-loaded items render
        js = "document.documentElement.scrollTop=10000"
        browser.execute_script(js)
    except TimeoutException:
        next_page()  # unbounded retry, as in experiment 1

####################### Save data ###################################
def save_data(data):
    with open('京東-python 圖書信息.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # title, price, cover link, review count, publisher
        writer.writerow(['名稱','價格','封面','評論數','出版社'])
        for a in data:
            print(a)
            writer.writerow(a)

################### Loop over pages ##############################
for i in range(15):  # the task asks for pages 1-15
    html = browser.page_source
    get_info(html)
    if i < 14:
        next_page()
        time.sleep(5)
save_data(books)
browser.quit()

The code above cannot filter out advertisements and fails on some special listings (book bundles); the corrected version follows:

from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser = webdriver.Chrome()
browser.get('http://www.jd.com')
wait = WebDriverWait(browser, 5)
# locate the search box and button, then search for 'python'
search_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[aria-label="搜索"]')))
submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label="搜索"]')))
search_input.clear()
search_input.send_keys('python')
submit.click()
time.sleep(2)
# scroll to the bottom so the lazily loaded items render
js = "document.documentElement.scrollTop=10000"
browser.execute_script(js)
time.sleep(2)

################# Extract information ######################
books = []  # accumulates one row per book
def get_info(html):
    tag_books = BeautifulSoup(html, 'lxml').select('#J_goodsList > ul > li > div')  # one div per listing
    for tag_book in tag_books:
        soup = BeautifulSoup(str(tag_book), 'lxml')  # parse this single listing on its own
        tag_advertising = soup.select('span.p-promo-flag')  # the promo flag marks an ad
        if len(tag_advertising) == 1:  # skip advertisements
            continue
        # pull out the individual fields of the listing
        tag_flag = soup.select('div > div.gl-i-tab-content')  # present only on bundle listings
        tag_img = soup.select('div.p-img')
        tag_price = soup.select('div.p-price')
        tag_name = soup.select('div.p-name > a > em')
        tag_publish = soup.select('div.p-bookdetails > span.p-bi-store')
        tag_comment = soup.select('div.p-commit')
        temp_book = []  # fields of a single book
        # plain listing without bundle tabs
        if len(tag_flag) == 0:
            temp_book.append(tag_name[0].text)
            temp_book.append(tag_price[0].text.strip().replace('\n', ''))

            temp_src = tag_img[0].a.get('src')
            if temp_src is None:
                temp_href = tag_img[0].a.get('href')
                if temp_href[0] != 'h':  # protocol-relative link: prepend the scheme
                    temp_book.append('https:' + temp_href)
                else:
                    temp_book.append(temp_href)
            else:
                temp_book.append(temp_src)

            # review count before publisher, matching the CSV header order
            temp_book.append(tag_comment[0].text.replace('\n', ''))

            if len(tag_publish) == 1:
                temp_book.append(tag_publish[0].text.replace('\n', ''))
            else:
                temp_book.append('無')
            books.append(temp_book)
        # bundle listing: one div holds several books
        else:
            for i in range(len(tag_name)):
                temp_books = []
                temp_books.append(tag_name[i].text)
                temp_books.append(tag_price[i].text.strip().replace('\n', '')[0:6])

                temp_src = tag_img[i].a.get('src')
                if temp_src is None:
                    temp_href = tag_img[i].a.get('href')
                    if temp_href[0] != 'h':
                        temp_books.append('https:' + temp_href)
                    else:
                        temp_books.append(temp_href)
                else:
                    temp_books.append(temp_src)

                comment_text = tag_comment[i].text.replace('\n', '')
                temp_books.append(comment_text if comment_text else '無')

                if len(tag_publish) == 1:
                    temp_books.append(tag_publish[0].text.replace('\n', ''))
                else:
                    temp_books.append('無')
                books.append(temp_books)

############# Page navigation ######################
def next_page():
    try:
        wait = WebDriverWait(browser, 10)
        nextpage = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.pn-next > em')))
        nextpage.click()
        time.sleep(2)
        # scroll again so the new page's lazy-loaded items render
        js = "document.documentElement.scrollTop=10000"
        browser.execute_script(js)
    except TimeoutException:
        next_page()  # unbounded retry, as in experiment 1

####################### Save data ###################################
def save_data(data):
    with open('京東-python 圖書信息.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # title, price, cover link, review count, publisher
        writer.writerow(['名稱','價格','封面','評論數','出版社'])
        for a in data:
            print(a)
            writer.writerow(a)

################### Loop over pages ##############################
get_info(browser.page_source)  # page 1
for i in range(14):  # then pages 2-15
    next_page()
    time.sleep(5)
    get_info(browser.page_source)

save_data(books)

browser.close()
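
JD may occasionally repeat a listing across result pages, so identical rows can be collected twice. If that matters, a small deduplication pass before saving works (a sketch; it keeps the first occurrence of each identical row):

seen = set()
unique_books = []
for book in books:
    key = tuple(book)  # rows are lists, so convert to a hashable tuple
    if key not in seen:
        seen.add(key)
        unique_books.append(book)
save_data(unique_books)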

 
