實現目標:爬取某地歷史天氣情況(以深圳2019年爲例)
需要的庫:requests,bs4,pandas
PS:requests 和 bs4 庫很小,大概150K左右
第一步:
找目標url;
第二步:獲取網頁源代碼(需先 import requests)
url = 'http://www.tianqihoubao.com/lishi/shenzhen/month/201901.html'
# A timeout keeps the script from waiting forever on a stalled connection.
req = requests.get(url, timeout=10)
# Stop on an HTTP error instead of parsing an error page as weather data.
req.raise_for_status()
html = req.text
第三步:數據提取,提取自己需要的內容(需先 from bs4 import BeautifulSoup)
soup = BeautifulSoup(html, 'html.parser')
tr_list = soup.find_all('tr')
datas, condition, temp = [], [], []
# The first <tr> is skipped; presumably it is the table header -- verify
# against the page markup.
for data in tr_list[1:]:
    sub_data = data.text.split()
    if not sub_data:
        # A row without any text has no date token; skip it instead of
        # failing with IndexError on sub_data[0].
        continue
    # Replace the 年/月 markers with '/' and drop 日. This step is optional
    # and depends on the date format you want.
    a = sub_data[0].replace('年', '/')
    b = a.replace('月', '/')
    c = b.replace('日', '')
    datas.append(c)
    # Tokens 1-2 are glued into the weather text and tokens 3-5 into the
    # temperature text (positions presumably follow the page's column
    # layout -- verify if the site changes).
    condition.append(''.join(sub_data[1:3]))
    temp.append(''.join(sub_data[3:6]))
打印效果如下:
第四步:保存數據(需先 import pandas as pd)
# Assemble the three parallel lists into one table, one column per list.
_data = pd.DataFrame({
    '日期': datas,
    '天氣狀況': condition,
    '溫度': temp,
})
# index=False keeps the row index out of the CSV file.
_data.to_csv('深圳2019.01天氣記錄.csv', index=False, encoding='utf-8')
效果如下:
如下是一個月和多個月的整體代碼。
import requests
from bs4 import BeautifulSoup
import pandas as pd
'''====================================深圳2019.1歷史天氣數據======================================'''
# Target URL: the January 2019 history page for Shenzhen.
url = 'http://www.tianqihoubao.com/lishi/shenzhen/month/201901.html'

# Download the page source. The timeout keeps the script from hanging on a
# stalled connection, and raise_for_status() stops on HTTP errors instead of
# letting an error page be parsed as weather data.
req = requests.get(url, timeout=10)
req.raise_for_status()
html = req.text

# Extract one record per table row.
soup = BeautifulSoup(html, 'html.parser')
tr_list = soup.find_all('tr')
datas, condition, temp = [], [], []
# The first <tr> is skipped; presumably it is the table header -- verify
# against the page markup.
for data in tr_list[1:]:
    sub_data = data.text.split()
    if not sub_data:
        # A row without any text has no date token; skip it instead of
        # failing with IndexError on sub_data[0].
        continue
    # Turn the 年/月 markers into '/' separators and drop 日.
    a = sub_data[0].replace('年', '/')
    b = a.replace('月', '/')
    c = b.replace('日', '')
    datas.append(c)
    # Tokens 1-2 form the weather text, tokens 3-5 the temperature text
    # (positions presumably follow the page's column layout -- verify).
    condition.append(''.join(sub_data[1:3]))
    temp.append(''.join(sub_data[3:6]))

# Save the data: build the table, then write it out without the row index.
_data = pd.DataFrame()
_data['日期'] = datas
_data['天氣狀況'] = condition
_data['溫度'] = temp
_data.to_csv('深圳2019.01天氣記錄.csv', index=False, encoding='utf-8')
'''====================================深圳2019 1——4月的歷史天氣數據======================================'''
# 獲取url
def get_data(url, timeout=10):
    """Download one monthly history page and return its rows as a table.

    Args:
        url: Address of the monthly weather-history page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A pandas DataFrame with the columns '日期', '天氣狀況' and '溫度',
        holding one row for every non-empty ``<tr>`` after the first one.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    req = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were data.
    req.raise_for_status()
    html = req.text

    # Data extraction.
    soup = BeautifulSoup(html, 'html.parser')
    tr_list = soup.find_all('tr')
    datas, condition, temp = [], [], []
    # The first <tr> is skipped; presumably it is the table header --
    # verify against the page markup.
    for data in tr_list[1:]:
        sub_data = data.text.split()
        if not sub_data:
            # A row without any text has no date token; skip it instead of
            # failing with IndexError on sub_data[0].
            continue
        # Turn the 年/月 markers into '/' separators and drop 日.
        date_text = (
            sub_data[0].replace('年', '/').replace('月', '/').replace('日', '')
        )
        datas.append(date_text)
        # Tokens 1-2 form the weather text, tokens 3-5 the temperature text
        # (positions presumably follow the page's column layout -- verify).
        condition.append(''.join(sub_data[1:3]))
        temp.append(''.join(sub_data[3:6]))

    # Collect the parallel lists into one table, one column per list.
    _data = pd.DataFrame()
    _data['日期'] = datas
    _data['天氣狀況'] = condition
    _data['溫度'] = temp
    return _data
# One history page per month, January through April 2019.
month_urls = (
    'http://www.tianqihoubao.com/lishi/shenzhen/month/201901.html',
    'http://www.tianqihoubao.com/lishi/shenzhen/month/201902.html',
    'http://www.tianqihoubao.com/lishi/shenzhen/month/201903.html',
    'http://www.tianqihoubao.com/lishi/shenzhen/month/201904.html',
)
# Fetch the pages in order; each call yields that month's table.
data_01, data_02, data_03, data_04 = map(get_data, month_urls)
# Stack the monthly tables and renumber the rows from zero.
monthly_tables = [data_01, data_02, data_03, data_04]
data = pd.concat(monthly_tables).reset_index(drop=True)
data.to_csv('深圳2019.01-04月天氣記錄.csv', index=False, encoding='utf-8')