繼續把其他博主的程序拿來魔改一番,原博主博文鏈接如下,不過博文中只有代碼,沒別的了。
原博主代碼地址
具體代碼如下,自己添加的部分很混亂,還好能用。原博主保存的是txt格式,以字典形式保存的。因爲我要把數據導進matlab中用,所以篩出自己需要的導出csv格式了。其實不用這麼麻煩,可惜本人就一年多前學過一段時間python,後來又丟了,數據急着用,就先這樣了。等忙完再自己重寫一個吧(希望不食言)。這個博主爬取的網站氣象信息比較全,有溼度和降水,剛好需要,降水數據真的太難找了。
用了最笨的方法,最差的代碼風格 _(:з」∠)
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 13 11:48:58 2020
@author: ZAN
"""
import requests
import pandas as pd
from bs4 import BeautifulSoup
from collections import defaultdict
from dateutil.relativedelta import relativedelta
from datetime import datetime
import numpy as np
class weather_data:
    """Scraper for day/night weather records on tianqi.911cha.com.

    For each month it produces a dict mapping every date string to
    ``{"Day": {...}, "Night": {...}}``, where the inner dicts hold the ten
    table cells of that half-day.
    """

    # Names for the ten <td> cells of one table row, in page order:
    # [time, icon, weather, temperature, humidity, wind force, wind scale,
    #  precipitation, apparent temperature, cloud amount]
    _FIELDS = ['time', 'image', 'weather', 'temperature', 'humidity',
               'wind_force', 'wind_scale', 'precipitation',
               'sendible_temperature', 'cloud_amount']

    def __init__(self, city, start_year, end_year, start_month=1, end_month=12):
        """
        :param city: full pinyin name of the city to scrape
        :param start_year: first year to scrape
        :param end_year: last year to scrape
        :param start_month: first month to scrape (default: January)
        :param end_month: last month to scrape (default: December)
        """
        self.city = city
        self.start_time = datetime.strptime(f"{start_year}-{start_month}", '%Y-%m')
        self.end_time = datetime.strptime(f"{end_year}-{end_month}", '%Y-%m')

    def _get_original_html(self, when=None):
        """Download the monthly page selected by *when*.

        :param when: datetime whose year/month pick the page
                     (defaults to ``self.start_time``)
        :return: decoded UTF-8 HTML text
        """
        when = self.start_time if when is None else when
        url = f"https://tianqi.911cha.com/{self.city}/{when.year}-{when.month}.html"
        print(url)
        # Ordinary browser User-Agent so the site serves the normal page.
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}
        response = requests.get(url, headers=header)
        return response.content.decode("utf-8")

    def _parse_data(self):
        """Parse one month of HTML (``self.html``) into {date: {Day, Night}}.

        Table layout on the page: row 0 is the header; after that, odd rows
        are the daytime half (and carry the date inside an <a> tag) and even
        rows are the matching night half.
        """
        soup = BeautifulSoup(self.html, "html.parser")
        data = defaultdict(dict)
        date = None
        for n, tr in enumerate(soup.find_all("tr")):
            if n == 0:  # header row
                continue
            cells = {key: td.get_text()
                     for key, td in zip(self._FIELDS, tr.find_all("td"))}
            if n % 2 != 0:
                date = tr.find("a").get_text()
                # BUGFIX: the "Day" keys used to be prefixed with the year
                # ("2016-time", ...), inconsistent with the "Night" keys.
                data[date]["Day"] = cells
            elif date is not None:  # guard: skip a night row with no day row
                data[date]["Night"] = cells
        return data

    def main(self):
        """Scrape every month from start_time through end_time inclusive.

        :return: list with one parsed-month dict per month
        """
        data = []
        # BUGFIX: advance a local cursor instead of mutating self.start_time,
        # so main() can be called more than once on the same instance.
        cursor = self.start_time
        while cursor <= self.end_time:
            self.html = self._get_original_html(cursor)
            data.append(self._parse_data())
            cursor += relativedelta(months=1)
        return data
# Raw per-month dicts from the scrape; kept at module level so the data can
# still be inspected interactively after the script finishes.
result = []


def flatten_months(months):
    """Flatten scraped months into parallel (dates, rows) lists.

    :param months: list of ``{date: {"Day": {...}, "Night": {...}}}`` dicts
                   as returned by ``weather_data.main()``
    :return: ``(dates, rows)`` — ``dates`` is the flat list of date strings;
             ``rows`` holds one 10-field value list per half-day, the Day row
             immediately followed by its Night row, with a '-' cell (the
             site's "no precipitation" marker) mapped to '0' for MATLAB.
    """
    dates = []
    rows = []
    for month in months:
        for date, halves in month.items():
            dates.append(date)
            for half in halves.values():  # insertion order: Day, then Night
                rows.append(['0' if v == '-' else v for v in half.values()])
    return dates, rows


if __name__ == "__main__":
    T = weather_data(city="enshi", start_year=2016, end_year=2017,
                     start_month=1, end_month=10)
    months = T.main()
    result.extend(months)
    # Keep the raw scrape as repr text, one month dict per chunk.
    with open('weather_dict.txt', 'w', encoding='UTF-8') as f:
        for month in months:
            f.write(str(month))

    dates, rows = flatten_months(months)
    data_key = pd.DataFrame(dates)   # one date string per calendar day
    data_val = pd.DataFrame(rows)    # columns 0..9 follow the field order
    # Strip unit suffixes so MATLAB reads plain numbers.
    temp = data_val[3].str.strip('℃')   # temperature
    humd = data_val[4].str.strip('%')   # humidity
    rain = data_val[7].str.strip('mm')  # precipitation
    weather = pd.DataFrame([temp, humd, rain]).T
    # Even row indices are daytime halves, odd indices the nighttime halves.
    day = weather[weather.index % 2 == 0].reset_index(drop=True)
    night = weather[weather.index % 2 == 1].reset_index(drop=True)
    fin = pd.concat([data_key, night, day], axis=1)
    fin.to_csv('恩施氣象.csv', encoding="utf_8_sig")
題外話,大理寺日誌真的好看,安利!!!