環境配置
安裝Python3.7.3,beautifulsoup4, requests
請求打印html頁面內容
import requests

# Fetch the East Money homepage; switch to the charset detected from the
# page body (apparent_encoding) so Chinese text decodes correctly.
url = 'http://www.eastmoney.com/'
response = requests.get(url)
response.encoding = response.apparent_encoding
html = response.text
執行print(html)就能打印網頁內容
網頁解析
解析網頁源碼使用 BeautifulSoup 模塊。以提取東方財富網首頁的消息爲例:右鍵點擊對應的元素,選擇「檢查」,就可以看到網頁的源代碼了。
我們發現對應的元素都被<div class="nlist">選定,相應的我們可以把相應的代碼篩選出來。
from bs4 import BeautifulSoup

# Parse the page with the lxml parser and grab the first news-list container.
# NOTE: find_all() returns a ResultSet, so we take element [0] — the later
# call `nlist.find_all('a')` only works on a single Tag, and the variable
# must be named `nlist` (the original `nmlist` was a typo that caused a
# NameError in the follow-up snippet).
bf = BeautifulSoup(html, 'lxml')
nlist = bf.find_all(class_='nlist')[0]
可以發現消息的標題和鏈接都由 <a> 標籤限定,用 find_all 方法即可獲取:
# Print the title and the hyperlink of every anchor inside the news list.
for link in nlist.find_all('a'):
    print(link.string, link.get('href'))
存儲CSV
import csv

# Open the file with newline='' — required by the csv module, otherwise
# every row is followed by a blank line on Windows — and use a context
# manager so the file is always closed, even on error.
with open('test.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
完整代碼如下
# -*- coding: utf-8 -*-
# @Time : 2019/4/8 17:40
# @Author : linjingtu
# @Email : [email protected]
# @File : test.py
# @Software: PyCharm
"""Scrape the news headlines from the East Money homepage into a CSV file."""
import csv

import requests
import lxml  # noqa: F401 -- fail fast if the parser backend is missing
from bs4 import BeautifulSoup


def main(csv_path='F:\\test.csv'):
    """Download the homepage, extract every news link and save it as CSV.

    :param csv_path: destination CSV file (defaults to the original path).
    :raises RuntimeError: if the expected news container is not in the page.
    """
    url = 'http://www.eastmoney.com/'
    req = requests.get(url)
    # Use the charset detected from the payload instead of the HTTP header,
    # so the Chinese headlines decode correctly.
    req.encoding = req.apparent_encoding

    bf = BeautifulSoup(req.text, 'lxml')
    # The news block lives in <div class="nlist">; find() returns the first
    # match or None (the original find_all(...)[0] raised a bare IndexError
    # whenever the page layout changed).
    nlist = bf.find(class_='nlist')
    if nlist is None:
        raise RuntimeError('news container <div class="nlist"> not found')

    # newline='' is required by the csv module (avoids blank rows on
    # Windows); the context manager guarantees the file is closed.
    with open(csv_path, 'w', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp)
        for each in nlist.find_all('a'):
            writer.writerow([each.string, each.get('href')])


if __name__ == '__main__':
    main()