- Experiment 1: Scraping stock data
from bs4 import BeautifulSoup
import bs4, csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()
browser.get('http://data.eastmoney.com/notices/')
wait = WebDriverWait(browser, 10)
#browser.find_element_by_css_selector('#dt_1').click()
table_emergence = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#dt_1')))  # wait until the notices table has rendered
################### Info extraction ######################
info = []
def get_info(html):
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find(name='table', attrs={'id': 'dt_1'})
    trs = table.find('tbody').children
    for tr in trs:
        if isinstance(tr, bs4.element.Tag):  # skip NavigableString nodes between rows
            tds = tr.find_all('td')
            code = tds[0].a.string
            name = tds[1].a.string
            title = tds[3].a.string
            title_type = tds[4].span.string
            pub_date = tds[5].span.string  # renamed from `time` to avoid shadowing the time module
            sub_info = [code, name, title, title_type, pub_date]
            info.append(sub_info)
############# Pagination ######################
def next_page(page_number):
    try:
        wait = WebDriverWait(browser, 20)
        inputs = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#PageContgopage')))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#PageCont > a.btn_link')))
        inputs.clear()
        inputs.send_keys(page_number)
        submit.click()
        # Wait until the active-page indicator shows the requested page number
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#PageCont > span.at'), str(page_number)))
    except TimeoutException:
        next_page(page_number)  # retry on timeout (note: unbounded recursion)
####################### Save data ##################################
def save_data(data):
    with open('股票数据.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # columns: code, name, notice title, notice type, notice date
        writer.writerow(['代码', '名称', '公告标题', '公告类型', '公告日期'])
        for a in data:
            print(a)
            writer.writerow(a)
################### Crawl multiple pages in a loop ##############################
for i in range(0, 16):
    get_info(browser.page_source)  # scrape the current page...
    next_page(i + 2)               # ...then jump to the next one
    time.sleep(2)
save_data(info)
browser.close()
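
Note that next_page calls itself on every TimeoutException with no base case, so a page that never loads recurses without bound (eventually raising RecursionError). A bounded variant could look like the sketch below; it reuses the same selectors and the browser global, and next_page_bounded and max_retries are hypothetical names, not part of the original experiment.
# A minimal sketch of a bounded retry: gives up and returns False
# after max_retries timeouts instead of recursing forever.
def next_page_bounded(page_number, max_retries=3):
    for attempt in range(max_retries):
        try:
            wait = WebDriverWait(browser, 20)
            inputs = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#PageContgopage')))
            submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#PageCont > a.btn_link')))
            inputs.clear()
            inputs.send_keys(page_number)
            submit.click()
            wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#PageCont > span.at'), str(page_number)))
            return True
        except TimeoutException:
            continue  # try again, up to max_retries attempts
    return False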
- Experiment 2: Scraping earthquake catalog data
from bs4 import BeautifulSoup
import csv, numpy
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()
browser.get('http://www.ceic.ac.cn/history')
wait = WebDriverWait(browser, 5)
browser.find_element(By.CLASS_NAME, 'check').click()  # find_element_by_class_name was removed in Selenium 4
time.sleep(2)
################# Info extraction ######################
info = []
def get_info(html):
    soup = BeautifulSoup(html, 'lxml')
    tds = soup.find_all('td')  # renamed so the loop variable does not shadow the list
    for td in tds:
        info.append(td.text)
##################### Pagination ######################
def next_page():
    try:
        wait = WebDriverWait(browser, 10)
        nextpage = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.page > [alt=下一页]')))
        nextpage.click()
    except TimeoutException:
        next_page()  # retry on timeout (note: unbounded recursion)
####################### Save data ###################################
def save_data(data):
    with open('地震台信息.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # columns: magnitude (M), origin time (UTC+8), latitude (°), longitude (°), depth (km), reference location
        writer.writerow(['震级(M)', '发震时刻(UTC+8)', '纬度(°)', '经度(°)', '深度(千米)', '参考位置'])
        for a in data:
            print(a)
            writer.writerow(a)
################### Crawl multiple pages in a loop ##############################
for i in range(0, 15):
    html = browser.page_source
    get_info(html)
    next_page()
    time.sleep(2)
# Each record spans six <td> cells; regroup the flat cell list into rows of six
info = numpy.array(info).reshape(len(info) // 6, 6)
save_data(info)
browser.close()
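
The numpy reshape above only regroups the flat list of cell texts into rows of six; if numpy is not wanted as a dependency, a plain list comprehension does the same (a sketch; `rows` is a hypothetical name):
# Group the flat list of <td> texts into rows of six fields each, without numpy
rows = [info[i:i + 6] for i in range(0, len(info), 6)]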
- Experiment 3: Scraping JD.com data
from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()
browser.get('http://www.jd.com')
wait = WebDriverWait(browser, 5)
# Locate the search box and button, then search for "python"
search_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[aria-label=搜索]')))  # renamed from `input` to avoid shadowing the builtin
submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label=搜索]')))
search_input.clear()
search_input.send_keys('python')
submit.click()
time.sleep(2)
# Scroll to the bottom of the page so lazy-loaded list items render
js = "document.documentElement.scrollTop=10000"
browser.execute_script(js)
time.sleep(2)
################# Info extraction ######################
books = []
def get_info(html):
    soup = BeautifulSoup(html, 'lxml')
    tag_img = soup.select('#J_goodsList > ul > li > div > div.p-img')
    tag_price = soup.select('#J_goodsList > ul > li > div > div.p-price')
    tag_name = soup.select('#J_goodsList > ul > li > div > div.p-name > a > em')
    tag_publish = soup.select('#J_goodsList > ul > li > div > div.p-shopnum')
    tag_comment = soup.select('#J_goodsList > ul > li > div > div.p-commit')
    #advertising = soup.select('#J_goodsList > ul > li > div > div.p-market')
    for i in range(0, len(tag_img)):
        temp = []
        temp.append(tag_name[i].text)
        temp.append(tag_price[i].text.strip().replace('\n', ''))
        temp.append(tag_img[i].a.img)  # flaw: appends the <img> tag object itself, not its URL
        temp.append(tag_comment[i].text.replace('\n', ''))
        temp.append(tag_publish[i].text.replace('\n', ''))
        books.append(temp)
############# Pagination ######################
def next_page():
    try:
        wait = WebDriverWait(browser, 10)
        nextpage = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.pn-next > em')))
        nextpage.click()
        time.sleep(2)
        js = "document.documentElement.scrollTop=10000"  # scroll again so the new page fully loads
        browser.execute_script(js)
    except TimeoutException:
        next_page()  # retry on timeout (note: unbounded recursion)
####################### Save data ###################################
def save_data(data):
    with open('京东-python 图书信息.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # columns: name, price, cover, comment count, publisher
        writer.writerow(['名称', '价格', '封面', '评论数', '出版社'])
        for a in data:
            print(a)
            writer.writerow(a)
################### Crawl multiple pages in a loop ##############################
for i in range(0, 168):
    html = browser.page_source
    get_info(html)
    next_page()
    time.sleep(5)
save_data(books)
browser.quit()
The code above cannot filter out advertisements and fails on some special listings (such as multi-book bundles); the corrected code follows:
from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()
browser.get('http://www.jd.com')
wait = WebDriverWait(browser, 5)
# Locate the search box and button, then search for "python"
search_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[aria-label=搜索]')))  # renamed from `input` to avoid shadowing the builtin
submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label=搜索]')))
search_input.clear()
search_input.send_keys('python')
submit.click()
time.sleep(2)
# Scroll to the bottom of the page so lazy-loaded list items render
js = "document.documentElement.scrollTop=10000"
browser.execute_script(js)
time.sleep(2)
################# Info extraction ######################
books = []  # one record per book
def get_info(html):
    # Grab the container div of every listing on the page
    tag_books = BeautifulSoup(html, 'lxml').select('#J_goodsList > ul > li > div')
    for tag_book in tag_books:
        soup = BeautifulSoup(str(tag_book), 'lxml')  # re-parse a single listing
        tag_advertising = soup.select('span.p-promo-flag')  # look for the ad flag
        if len(tag_advertising) > 0:  # skip sponsored listings
            continue
        # Extract the individual fields of one book
        tag_flag = soup.select('div > div.gl-i-tab-content')
        tag_img = soup.select('div.p-img')
        tag_price = soup.select('div.p-price')
        tag_name = soup.select('div.p-name > a > em')
        tag_publish = soup.select('div.p-bookdetails > span.p-bi-store')
        tag_comment = soup.select('div.p-commit')
        temp_book = []  # holds the fields of the current book
        # Case 1: a plain listing without bundle tabs
        if len(tag_flag) == 0:
            temp_book.append(tag_name[0].text)
            temp_book.append(tag_price[0].text.strip().replace('\n', ''))
            temp_src = tag_img[0].a.get('src')
            if temp_src is None:
                temp_href = tag_img[0].a.get('href')
                if not temp_href.startswith('h'):  # protocol-relative URL: prepend the scheme
                    temp_book.append('https:' + temp_href)
                else:  # bug fix: the original appended nothing when the href was already absolute
                    temp_book.append(temp_href)
            else:
                temp_book.append(temp_src)
            # bug fix: comment count before publisher, matching the CSV header order
            temp_book.append(tag_comment[0].text.replace('\n', ''))
            if len(tag_publish) == 1:
                temp_book.append(tag_publish[0].text.replace('\n', ''))
            else:
                temp_book.append('无')
            books.append(temp_book)
        # Case 2: a listing with bundle tabs (several books in one div)
        else:
            for i in range(0, len(tag_name)):
                temp_books = []
                temp_books.append(tag_name[i].text)
                temp_books.append(tag_price[i].text.strip().replace('\n', '')[0:6])
                temp_src = tag_img[i].a.get('src')
                if temp_src is None:
                    temp_href = tag_img[i].a.get('href')
                    if not temp_href.startswith('h'):  # bug fix: the original tested temp_href[i] instead of the first character
                        temp_books.append('https:' + temp_href)
                    else:
                        temp_books.append(temp_href)
                else:
                    temp_books.append(temp_src)
                # bug fix: comment count before publisher, matching the CSV header order
                if tag_comment[i].text.replace('\n', '') == '':
                    temp_books.append('无')
                else:
                    temp_books.append(tag_comment[i].text.replace('\n', ''))
                if len(tag_publish) == 1:
                    temp_books.append(tag_publish[0].text.replace('\n', ''))
                else:
                    temp_books.append('无')
                books.append(temp_books)
############# Pagination ######################
def next_page():
    try:
        wait = WebDriverWait(browser, 10)
        nextpage = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.pn-next > em')))
        nextpage.click()
        time.sleep(2)
        js = "document.documentElement.scrollTop=10000"  # scroll again so the new page fully loads
        browser.execute_script(js)
    except TimeoutException:
        next_page()  # retry on timeout (note: unbounded recursion)
####################### Save data ###################################
def save_data(data):
    with open('京东-python 图书信息.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # columns: name, price, cover, comment count, publisher
        writer.writerow(['名称', '价格', '封面', '评论数', '出版社'])
        for a in data:
            print(a)
            writer.writerow(a)
################### Crawl multiple pages in a loop ##############################
html = browser.page_source
get_info(html)  # scrape the first results page before paginating
for i in range(0, 16):
    next_page()
    time.sleep(5)
    html = browser.page_source
    get_info(html)
save_data(books)
browser.close()
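
As a quick sanity check (a sketch, not part of the original experiment), the CSV written by save_data can be read back with the standard library; `rows` is a hypothetical name:
# Re-open the CSV produced above and count the data rows
import csv
with open('京东-python 图书信息.csv', encoding='utf-8') as f:
    rows = list(csv.reader(f))
print(len(rows) - 1, 'book records written')  # subtract the header row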