- 實驗一 股票數據爬取
from bs4 import BeautifulSoup
import bs4,csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Launch Chrome and open the Eastmoney announcements listing page.
browser = webdriver.Chrome()
browser.get('http://data.eastmoney.com/notices/')
wait = WebDriverWait(browser, 10)
# Legacy Selenium 3 call, kept commented out for reference:
#browser.find_element_by_css_selector('#dt_1').click()
# Block until the announcement table (#dt_1) is present in the DOM.
table_emergence = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#dt_1')))
################### information extraction ######################
info = []  # accumulated [code, name, title, type, date] records
def get_info(html):
    """Parse the announcement table (id='dt_1') on one page and append a
    [code, name, title, type, date] record per row to the global `info` list.

    `html`: full page source as returned by `browser.page_source`.
    """
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find(name='table', attrs={'id': 'dt_1'})
    trs = table.find('tbody').children
    for tr in trs:
        # tbody children include whitespace NavigableStrings; keep only tags.
        if isinstance(tr, bs4.element.Tag):
            tds = tr.find_all('td')
            code = tds[0].a.string
            name = tds[1].a.string
            title = tds[3].a.string
            title_type = tds[4].span.string
            # Renamed from `time`, which shadowed the imported time module.
            pub_date = tds[5].span.string
            info.append([code, name, title, title_type, pub_date])
############# page turning ######################
def next_page(page_number, retries=3):
    """Jump to page `page_number` via the jump box and wait until the
    active-page marker shows it.

    `retries` bounds re-attempts on TimeoutException.  The original recursed
    unconditionally on every timeout, risking RecursionError if the site
    never responds; after the retries are exhausted the timeout propagates.
    """
    try:
        wait = WebDriverWait(browser, 20)
        inputs = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#PageContgopage')))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#PageCont > a.btn_link')))
        inputs.clear()
        inputs.send_keys(page_number)
        submit.click()
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#PageCont > span.at'), str(page_number)))
    except TimeoutException:
        if retries > 0:
            next_page(page_number, retries - 1)
        else:
            raise
###################### save data ##################################
def save_data(data):
    """Write the scraped announcement rows to 股票數據.csv (UTF-8),
    echoing each row to stdout as it is written."""
    header = ['代碼', '名稱', '公告標題', '公告類型', '公告日期']
    with open('股票數據.csv', 'w', newline='', encoding='utf-8') as csv_file:
        sheet = csv.writer(csv_file)
        sheet.writerow(header)
        for record in data:
            print(record)
            sheet.writerow(record)
# Scrape 16 pages: parse the page currently shown, then navigate onward
# (next_page is called with 2..17, so pages 1-16 get parsed) and pause
# so the new page can render before the next parse.
for i in range(0, 16):
    get_info(browser.page_source)
    next_page(i+2)
    time.sleep(2)
save_data(info)
browser.close()
- 實驗二 地震臺數據爬取
from bs4 import BeautifulSoup
import bs4,csv,numpy
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Launch Chrome and open the CEIC historical earthquake query page.
browser = webdriver.Chrome()
browser.get('http://www.ceic.ac.cn/history')
wait = WebDriverWait(browser, 5)
# find_element_by_class_name was removed in Selenium 4; use the By-based
# API (already imported above) and wait until the element is clickable.
wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'check'))).click()
time.sleep(2)
################# information extraction ######################
info = []  # flat list of every <td> cell's text, page after page
def get_info(html):
    """Append the text of every table cell on the page to `info`."""
    document = BeautifulSoup(html, 'lxml')
    for cell in document.find_all('td'):
        info.append(cell.text)
##################### page turning ######################
def next_page(retries=3):
    """Click the "next page" arrow.

    `retries` bounds re-attempts on TimeoutException; the original recursed
    without bound on every timeout.  After the retries are exhausted the
    timeout propagates to the caller.
    """
    try:
        wait = WebDriverWait(browser, 10)
        nextpage = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.page > [alt=下一頁]')))
        nextpage.click()
    except TimeoutException:
        if retries > 0:
            next_page(retries - 1)
        else:
            raise
####################### save data ###################################
def save_data(data):
    """Write the reshaped earthquake records to 地震臺信息.csv (UTF-8),
    echoing each row to stdout."""
    header = ['震級(M)','發震時刻(UTC+8)','緯度(°)','經度(°)','深度(千米)','參考位置']
    with open('地震臺信息.csv', 'w', newline='', encoding='utf-8') as csv_file:
        sheet = csv.writer(csv_file)
        sheet.writerow(header)
        for record in data:
            print(record)
            sheet.writerow(record)
################### scrape multiple pages in a loop ##############################
# 15 pages: parse the current table, click "next", pause for rendering.
for i in range(0, 15):
    html = browser.page_source
    get_info(html)
    next_page()
    time.sleep(2)
# Each quake record spans six consecutive <td> cells; fold the flat list
# into an (n, 6) table.  `len(info) // 6` replaces the C-style (int)() cast.
info = numpy.array(info).reshape(len(info) // 6, 6)
save_data(info)
browser.close()
- 實驗三 京東數據爬取
from bs4 import BeautifulSoup
import bs4,csv,numpy
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Launch Chrome, open JD, and search for "python".
browser = webdriver.Chrome()
browser.get('http://www.jd.com')
wait = WebDriverWait(browser, 5)
# `input` shadowed the builtin; renamed to a descriptive identifier.
search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[aria-label=搜索]')))
search_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label=搜索]')))
search_box.clear()
search_box.send_keys('python')
search_button.click()
time.sleep(2)
# Scroll to the bottom so lazily loaded results render.
browser.execute_script("document.documentElement.scrollTop=10000")
time.sleep(2)
################# information extraction ######################
books = []  # one [name, price, cover, comments, publisher] list per book
def get_info(html):
    """Collect book data from the current JD result page into `books`.

    NOTE(review): relies on the five parallel `select` lists lining up
    index-by-index; promoted/ad entries can break that alignment (the
    corrected version later in this file filters them out) — do not
    reuse this variant as-is.
    """
    soup = BeautifulSoup(html, 'lxml')
    tag_img = soup.select('#J_goodsList > ul > li > div > div.p-img')
    tag_price = soup.select('#J_goodsList > ul > li > div > div.p-price')
    tag_name = soup.select('#J_goodsList > ul > li > div > div.p-name > a > em')
    tag_publish = soup.select('#J_goodsList > ul > li > div > div.p-shopnum')
    tag_comment = soup.select('#J_goodsList > ul > li > div > div.p-commit')
    #advertising = soup.select('# J_goodsList > ul > li > div > div.p-market')
    for i in range(0,len(tag_img)):
        temp = []
        temp.append(tag_name[i].text)
        temp.append(tag_price[i].text.strip().replace('\n',''))
        # NOTE(review): this appends the whole <img> Tag object, not a URL.
        temp.append(tag_img[i].a.img)
        temp.append(tag_comment[i].text.replace('\n', ''))
        temp.append(tag_publish[i].text.replace('\n',''))
        books.append(temp)
############# page turning ######################
def next_page(retries=3):
    """Advance to the next JD result page and scroll to the bottom so
    lazily loaded items render.

    `retries` bounds re-attempts on TimeoutException; the original recursed
    without bound.  After the retries are exhausted the timeout propagates.
    """
    try:
        wait = WebDriverWait(browser, 10)
        nextpage = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.pn-next > em')))
        nextpage.click()
        time.sleep(2)
        browser.execute_script("document.documentElement.scrollTop=10000")
    except TimeoutException:
        if retries > 0:
            next_page(retries - 1)
        else:
            raise
####################### save data ###################################
def save_data(data):
    """Write the scraped book rows to 京東-python 圖書信息.csv (UTF-8),
    echoing each row to stdout."""
    header = ['名稱','價格','封面','評論數','出版社']
    with open('京東-python 圖書信息.csv', 'w', newline='', encoding='utf-8') as csv_file:
        sheet = csv.writer(csv_file)
        sheet.writerow(header)
        for record in data:
            print(record)
            sheet.writerow(record)
################### scrape multiple pages in a loop ##############################
# Walk up to 168 result pages: parse the current page, click "next",
# pause so the new page can render before the next parse.
for i in range(0,168):
    html = browser.page_source
    get_info(html)
    next_page()
    time.sleep(5)
save_data(books)
browser.quit()
上面的代碼不能過濾廣告,不能爬取一些特殊的書籍,以下是修正後的代碼:
from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Launch Chrome, open JD, and search for "python".
browser = webdriver.Chrome()
browser.get('http://www.jd.com')
wait = WebDriverWait(browser, 5)
# Locate the search box and button, then run the search.
# (`input` shadowed the builtin; renamed to a descriptive identifier.)
search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[aria-label=搜索]')))
search_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label=搜索]')))
search_box.clear()
search_box.send_keys('python')
search_button.click()
time.sleep(2)
# Scroll the vertical scrollbar to the bottom so lazy results render.
browser.execute_script("document.documentElement.scrollTop=10000")
time.sleep(2)
################# information extraction ######################
books = []  # one [name, price, cover, comments, publisher] list per book

def _cover_url(a_tag):
    """Best-effort cover URL from a product <a>: prefer its `src`,
    fall back to `href`, prefixing 'https:' when the href is
    protocol-relative (does not start with 'h')."""
    src = a_tag.get('src')
    if src is not None:
        return src
    href = a_tag.get('href')
    if href is None:
        return '無'
    # Original bug: when href already started with 'h' nothing was
    # appended at all, shifting every later column of the row.
    return href if href.startswith('h') else 'https:' + href

def get_info(html):
    """Extract book records from a JD search-result page into `books`,
    skipping promoted (ad) items and handling multi-variant listings.

    Fixes over the previous version:
    - cover URL is always appended (no silent column shift);
    - `temp_href[i]` tested the i-th *character*; first character intended;
    - rows now match save_data's header order
      (名稱, 價格, 封面, 評論數, 出版社) — comment count before publisher.
    """
    tag_books = BeautifulSoup(html, 'lxml').select('#J_goodsList > ul > li > div')
    for tag_book in tag_books:
        # Re-parse this one product card so selectors are scoped to it.
        soup = BeautifulSoup(str(tag_book), 'lxml')
        if len(soup.select('span.p-promo-flag')) == 1:  # filter ads
            continue
        tag_flag = soup.select('div > div.gl-i-tab-content')
        tag_img = soup.select('div.p-img')
        tag_price = soup.select('div.p-price')
        tag_name = soup.select('div.p-name > a > em')
        tag_publish = soup.select('div.p-bookdetails > span.p-bi-store')
        tag_comment = soup.select('div.p-commit')
        publisher = tag_publish[0].text.replace('\n', '') if len(tag_publish) == 1 else '無'
        if len(tag_flag) == 0:
            # Plain single listing (no bundled variants).
            books.append([
                tag_name[0].text,
                tag_price[0].text.strip().replace('\n', ''),
                _cover_url(tag_img[0].a),
                tag_comment[0].text.replace('\n', ''),
                publisher,
            ])
        else:
            # One card holding several bundled variants.
            for i in range(len(tag_name)):
                comment = tag_comment[i].text.replace('\n', '')
                books.append([
                    tag_name[i].text,
                    tag_price[i].text.strip().replace('\n', '')[0:6],
                    _cover_url(tag_img[i].a),
                    comment if comment != '' else '無',
                    publisher,
                ])
############# page turning ######################
def next_page(retries=3):
    """Advance to the next JD result page and scroll to the bottom so
    lazily loaded items render.

    `retries` bounds re-attempts on TimeoutException; the original recursed
    without bound.  After the retries are exhausted the timeout propagates.
    """
    try:
        wait = WebDriverWait(browser, 10)
        nextpage = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.pn-next > em')))
        nextpage.click()
        time.sleep(2)
        browser.execute_script("document.documentElement.scrollTop=10000")
    except TimeoutException:
        if retries > 0:
            next_page(retries - 1)
        else:
            raise
####################### save data ###################################
def save_data(data):
    """Write the scraped book rows to 京東-python 圖書信息.csv (UTF-8),
    echoing each row to stdout as it is written."""
    with open('京東-python 圖書信息.csv', 'w', newline='', encoding='utf-8') as csv_file:
        sheet = csv.writer(csv_file)
        sheet.writerow(['名稱','價格','封面','評論數','出版社'])
        for record in data:
            print(record)
            sheet.writerow(record)
################### scrape multiple pages in a loop ##############################
# Parse the first page, then advance through 16 more pages, parsing each
# after a 5-second pause so the lazily loaded content can render.
html = browser.page_source
get_info(html)
for i in range(0,16):
    next_page()
    time.sleep(5)
    html = browser.page_source
    get_info(html)
save_data(books)
browser.close()