親愛的朋友們,我已經好久沒有更新文章了
我一直以爲自己在學習新的東西,學習更加深奧的東西 卻沒有注重積累 有點眼高手低了
這幾天忙着做期末作業 需要用到python 自己寫了個爬蟲
不太擅長,但是最後自我感覺良好
介紹一下我用的編輯器 VS code 微軟主推的跨平臺編輯器
使用了很久(感覺啥文件都能兼容)問題總是有的,譬如插件拓展、無法識別文件
源碼地址https://github.com/liyu19981212/python-
也可以複製下面的
用到了requests BeautifulSoup、re、pandas庫
requests庫用來請求網頁數據
BeautifulSoup用來進行網頁解析
re爲正則表達式,因爲從網頁獲取的文本可能會有格式問題
pandas是用來存儲數據的
我爬取的網站是駕考寶典(後天我要考科目四了)
學會對url進行分析,每一個界面是如何變化的
第一頁
第二頁
寫個循環就可以把所有的頁面遍歷了
jinan改成其他地名就是另一套數據了,自行發揮
裏面涉及到庫的使用都很簡單
百度教程很多,希望深入一下,畢竟拿來主義還是不靠譜的
給大家貼一下我的源碼
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
#代碼框架
def getText(url, d, pages):
    """Scrape driving-school listings from jiakaobaodian.com.

    Parameters:
        url: base site url, e.g. "http://www.jiakaobaodian.com/".
        d: dict mapping a url path segment (city key) to its display name.
        pages: number of listing pages to fetch per city key.

    Returns:
        (name, address, price, score, stu_num) — five parallel lists of
        cleaned strings — or None when any exception occurs (it is only
        printed, not re-raised).
    """
    try:
        name = []
        address = []
        price = []
        score = []
        stu_num = []
        for k, v in d.items():
            for p in range(1, pages + 1):
                # page p of city k, e.g. .../jinan/school/2f.html
                new_url = url + k + '/school' + '/' + str(p) + 'f.html'
                headers = {'User-Agent': 'Mozilla/5.0 Chrome/46.0.2490.80 '}
                print(new_url)
                r = requests.get(new_url, headers=headers, timeout=30)
                # BUG FIX: the original wrote `r.raise_for_status` without
                # parentheses, so the status check never actually ran.
                r.raise_for_status()  # raise HTTPError when status != 200
                r.encoding = "utf-8"
                soup = BeautifulSoup(r.text, 'lxml')
                # One <li class="clearfix"> per driving school.
                # (The original passed the set {'class','clearfix'} and also
                # had a dead soup.find('div', ...) that was immediately
                # overwritten; a proper attrs dict is used here.)
                allinfo = soup.find_all('li', {'class': 'clearfix'})
                for info in allinfo:
                    # school name
                    name1 = info.find_all('a', {'class': 'title'})[0].get_text()
                    name1 = name1.replace('\n', '').replace(' ', '')
                    name.append(name1)
                    print(name1)
                    # school address
                    address1 = info.find_all('p', {'class': 'field'})[0].get_text()
                    address1 = address1.replace('\n', '').replace(' ', '')
                    address.append(address1)
                    print(address1)
                    # student count
                    stu_num1 = info.find_all('span', {'class': 'student'})[0].get_text()
                    stu_num1 = stu_num1.replace('\n', '').replace(' ', '')
                    stu_num.append(stu_num1)
                    print(stu_num1)
                    # price
                    price1 = info.find_all('span', {'class': 'price'})[0].get_text()
                    # BUG FIX: the original used `==` (a no-op comparison),
                    # so the raw, unstripped text was stored.
                    price1 = price1.replace('\n', '').replace(' ', '')
                    price.append(price1)
                    print(price1)
                    # rating
                    score1 = info.find_all('span', {'class': 'score'})[0].get_text()
                    # BUG FIX: same `==` typo as for price above.
                    score1 = score1.replace('\n', '').replace(' ', '')
                    score.append(score1)
                    print(score1)
        return name, address, price, score, stu_num
    except Exception as e:
        # best-effort scrape: report the error and return None
        print(e)
#存儲數據
def save_data(name, address, price, score, stu_num):
    """Write the five parallel scrape-result lists to result.csv.

    The file is encoded as utf-8_sig (BOM) so Excel opens it correctly.
    """
    columns = {
        'name': name,
        'address': address,
        'price': price,
        'score': score,
        'stu_num': stu_num,
    }
    frame = pd.DataFrame()
    for label, values in columns.items():
        frame[label] = values
    frame.to_csv('result.csv', encoding='utf-8_sig')
#主任務
def run():
    """Entry point: scrape 4 pages of Jinan listings and save to CSV."""
    base_url = "http://www.jiakaobaodian.com/"
    # url path segment -> human-readable city name
    cities = {'jinan': '濟南市'}
    name, address, price, score, stu_num = getText(base_url, cities, 4)
    save_data(name, address, price, score, stu_num)
# Execute the scraper when run as a script.
if __name__ == '__main__':
    run()
# fillUnivList(html)  # leftover call from an earlier version, kept commented out
內容不多,希望大家多多支持
現在是北京時間2019年12月24日
之前做完了後面的數據可視化和聚類分析但是沒有時間上傳
再不弄以後就沒了
給大家分享一下後面的部分
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.cluster import KMeans #導入K均值聚類算法
import matplotlib.pyplot as plt
#從網頁獲取數據
def getText(url, d, pages):
    """Scrape driving-school info for every Jinan district in *d*.

    Parameters:
        url: base site url, e.g. "http://www.jiakaobaodian.com/".
        d: dict mapping a district url segment (e.g. 'q_lixia') to its
           display name (e.g. '歷下區').
        pages: number of listing pages to fetch per district.

    Returns:
        (area, name, address, price, score, stu_num) — six parallel lists
        with one entry per school — or None when any exception occurs
        (it is only printed, not re-raised).
    """
    try:
        area = []      # district display name, one entry per school
        name = []      # school names
        address = []   # school addresses
        price = []     # prices (text; currency mark / '面議' stripped)
        score = []     # ratings (text; '分' suffix stripped)
        stu_num = []   # student counts (text; '名學員' suffix stripped)
        for k, v in d.items():
            for p in range(1, pages + 1):
                # district page p, e.g. .../jinan/school/q_lixia/2f.html
                new_url = url + 'jinan/school/' + k + '/' + str(p) + 'f.html'
                headers = {'User-Agent': 'Mozilla/5.0 Chrome/46.0.2490.80 '}
                print(new_url)
                r = requests.get(new_url, headers=headers, timeout=30)
                # BUG FIX: the original wrote `r.raise_for_status` without
                # parentheses, so the status check never actually ran.
                r.raise_for_status()  # raise HTTPError when status != 200
                r.encoding = "utf-8"
                soup = BeautifulSoup(r.text, 'lxml')  # parse with lxml
                # One <li class="clearfix"> per driving school.
                # (The original passed the set {'class','clearfix'} and had
                # a dead soup.find('div', ...) immediately overwritten; a
                # proper attrs dict is used here.)
                allinfo = soup.find_all('li', {'class': 'clearfix'})
                for info in allinfo:
                    # school name (get_text strips the markup)
                    name1 = info.find_all('a', {'class': 'title'})[0].get_text()
                    name1 = name1.replace('\n', '').replace(' ', '')
                    name.append(name1)
                    print(name1)
                    # school address
                    address1 = info.find_all('p', {'class': 'field'})[0].get_text()
                    address1 = address1.replace('\n', '').replace(' ', '')
                    address.append(address1)
                    print(address1)
                    # student count, numeric text only
                    stu_num1 = info.find_all('span', {'class': 'student'})[0].get_text()
                    stu_num1 = stu_num1.replace('\n', '').replace(' ', '').replace('名學員', '')
                    stu_num.append(stu_num1)
                    print(stu_num1)
                    # price; '面議' (negotiable) becomes an empty string and
                    # is later dropped by clean()'s notnull filter
                    price1 = info.find_all('span', {'class': 'price'})[0].get_text()
                    price1 = price1.replace('¥', '').replace(' ', '').replace('面議', '')
                    price.append(price1)
                    print(price1)
                    # rating, numeric text only
                    score1 = info.find_all('span', {'class': 'score'})[0].get_text()
                    score1 = score1.replace('\n', '').replace(' ', '').replace('分', '')
                    score.append(score1)
                    print(score1)
                    # record which district this school belongs to
                    area.append(v)
        return area, name, address, price, score, stu_num
    except Exception as e:
        # best-effort scrape: report the error and return None
        print(e)
#存儲數據
def save_data(area, name, address, price, score, stu_num):
    """Write the six parallel scrape-result lists to result.csv.

    Encoded as utf-8_sig (BOM) so Excel displays Chinese text correctly.
    """
    columns = {
        'area': area,
        'name': name,
        'address': address,
        'price': price,
        'score': score,
        'stu_num': stu_num,
    }
    frame = pd.DataFrame()
    for label, values in columns.items():
        frame[label] = values
    frame.to_csv('result.csv', encoding='utf-8_sig')
#數據處理
def clean():
    """Clean result.csv and write the survivors to data_cleaned.csv.

    Drops rows whose price or score is missing, then drops rows where
    price, score AND stu_num are all zero (the author's stated rule is
    "or": a row survives if at least one of the three is nonzero).
    """
    raw_path = './result.csv'          # raw scrape, header row included
    out_path = './data_cleaned.csv'    # destination for cleaned rows
    frame = pd.read_csv(raw_path, encoding='utf-8')
    # keep only rows where both price and score are present
    frame = frame[frame['price'].notnull() & frame['score'].notnull()]
    # keep a row as long as at least one metric is nonzero
    has_signal = (frame['price'] != 0) | (frame['score'] != 0) | (frame['stu_num'] != 0)
    frame = frame[has_signal]
    frame.to_csv(out_path, encoding='utf-8_sig')
#導出結果
#可視化
def _bar_chart(labels, heights, title, xlabel, ylabel):
    """Show one bar chart configured for Chinese (CJK) axis/title text."""
    plt.rcParams['font.family'] = ['sans-serif']
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render CJK
    plt.bar(labels, heights, color='SkyBlue')
    plt.title(title)
    plt.grid(True)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()


def plot():
    """Plot per-district averages (score, price, student count).

    Reads data_cleaned.csv and shows three bar charts, one metric each.
    (The original repeated the plotting code three times and kept an
    unused local `d`; both cleaned up here.)
    """
    data = pd.read_csv('data_cleaned.csv', encoding='utf-8_sig')
    area = data['area'].drop_duplicates()
    avgp = []  # mean price per district
    avgs = []  # mean score per district
    avgn = []  # mean student count per district
    for district in area:
        subset = data[data['area'] == district]
        # averages rounded to two decimals where displayed as numbers
        avgp.append(round(float(subset['price'].mean()), 2))
        avgs.append(round(float(subset['score'].mean()), 2))
        avgn.append(subset['stu_num'].mean())
    # mean score per district
    _bar_chart(area, avgs, '濟南市各個區駕校平均評分(滿分5分)', '區名', '評分')
    # mean price per district
    _bar_chart(area, avgp, '濟南市各個區駕校平均價格', '區名', '元')
    # mean student count per district
    _bar_chart(area, avgn, '濟南市各個區駕校平均學員數量', '區名', '人')
#K-means聚類分析
def Kmeans():
    """Cluster schools by (price, score, stu_num) with k-means, k=3.

    Prints the cluster-centre summary table, writes per-row cluster
    labels to ./fenlei.xlsx, and shows a density (kde) plot per feature.
    """
    inputfile = './data_cleaned.csv'   # data to cluster
    outputfile = './fenlei.xlsx'       # per-sample cluster labels
    k = 3                              # number of clusters
    data = pd.read_csv(inputfile)
    data = data[['price', 'score', 'stu_num']]
    # COMPAT FIX: the original passed n_jobs=4, but that argument was
    # deprecated in scikit-learn 0.23 and removed in 1.0, so the call
    # raised TypeError on current installs.
    kmodel = KMeans(n_clusters=k)
    kmodel.fit(data)  # train the model
    # number of samples in each cluster
    counts = pd.Series(kmodel.labels_).value_counts()
    # cluster centres, one row per cluster
    centers = pd.DataFrame(kmodel.cluster_centers_)
    # horizontal concat (axis=1): centre coordinates + cluster size
    summary = pd.concat([centers, counts], axis=1)
    summary.columns = list(data.columns) + [u'類別數目']
    print("聚類表結果:")
    print(summary)
    # per-sample cluster label alongside the original features
    labelled = pd.concat([data, pd.Series(kmodel.labels_, index=data.index)], axis=1)
    labelled.columns = list(data.columns) + [u'聚類類別']
    labelled.to_excel(outputfile)  # save the classification result
    print("聚類圖結果:")
    # one kde subplot per feature column (three here, matching k by chance)
    axes = data.plot(kind='kde', linewidth=2, subplots=True, sharex=False)
    for i in range(k):
        axes[i].set_ylabel('density')
    plt.legend()
#主任務
def run():
    """Full pipeline: scrape -> store -> clean -> plot -> cluster."""
    base_url = "http://www.jiakaobaodian.com/"
    # district url segment -> district display name (Jinan districts);
    # edit this mapping to scrape a different set of areas
    districts = {'q_lixia':'歷下區','q_shizhong':'市中區','q_huaiyin':'槐蔭區','q_tianqiao':'天橋區','q_licheng0':'歷城區','q_changqing0':'長清區','q_pingyin':'平陰縣','q_jiyang0':'濟陽縣','q_shanghe':'商河縣','q_zhangqiu':'章丘市'}
    # fetch 4 pages per district
    area, name, address, price, score, stu_num = getText(base_url, districts, 4)
    save_data(area, name, address, price, score, stu_num)
    clean()
    plot()
    Kmeans()
# Execute the full pipeline when run as a script.
if __name__ == '__main__':
    run()
# fillUnivList(html)  # leftover call from an earlier version, kept commented out
運行結果如下:
希望有用
有緣關注一下