python小象學院: JSON文件 /網絡爬蟲/Pandas-----空氣質量描述

 

博客推薦:

 http://www.pianshen.com/article/7251318624/#70_AQI_312

 功能1.0 AQI計算

""""
    auther:Susan
    function:AQI Calculation
    version:v1.0
    data:2019/4/27
"""

def cal_linear(iaqi_lo, iaqi_hi, bp_lo, bp_hi, cp):
    """Linearly interpolate an IAQI value.

    Maps a concentration ``cp`` lying in the breakpoint interval
    [bp_lo, bp_hi] onto the matching IAQI segment [iaqi_lo, iaqi_hi].
    """
    fraction = (cp - bp_lo) / (bp_hi - bp_lo)
    return iaqi_lo + fraction * (iaqi_hi - iaqi_lo)

def cal_pm_iaqi(pm_val):
    """Calculate the IAQI for a PM2.5 concentration (ug/m^3).

    Uses the piecewise-linear 24h PM2.5 breakpoint table from the
    Chinese HJ 633-2012 AQI standard.  Returns None when pm_val is
    outside [0, 500).

    Fixes: the original assigned ``iaqi`` but never returned it, so
    every call yielded None; the table also stopped at 150 and is now
    extended to the full standard range.
    """
    # (bp_lo, bp_hi, iaqi_lo, iaqi_hi) segments
    segments = [
        (0, 35, 0, 50),
        (35, 75, 50, 100),
        (75, 115, 100, 150),
        (115, 150, 150, 200),
        (150, 250, 200, 300),
        (250, 350, 300, 400),
        (350, 500, 400, 500),
    ]
    for bp_lo, bp_hi, iaqi_lo, iaqi_hi in segments:
        if bp_lo <= pm_val < bp_hi:
            return cal_linear(iaqi_lo, iaqi_hi, bp_lo, bp_hi, pm_val)
    # Negative or >= 500: no IAQI defined
    return None


def cal_co_iapi(co_val):
    """Calculate the IAQI for a CO concentration (mg/m^3).

    Returns None when co_val is outside the covered range [0, 15).

    Fixes: the original assigned ``iaqi`` but never returned it, and
    the breakpoints passed to cal_linear (2/4 and 5/14) did not match
    the branch conditions (3..5 and 5..15), producing discontinuous
    results at the segment edges.
    """
    if 0 <= co_val < 3:
        return cal_linear(0, 50, 0, 3, co_val)
    elif 3 <= co_val < 5:
        # breakpoints now match the branch condition (was 2, 4)
        return cal_linear(50, 100, 3, 5, co_val)
    elif 5 <= co_val < 15:
        # upper breakpoint now matches the branch bound (was 14)
        return cal_linear(100, 150, 5, 15, co_val)
    # Out of range: no IAQI defined
    return None


def cal_api(param_list):
    """Compute the overall AQI for one city.

    param_list holds [pm2.5 value, co value]; the AQI is the maximum
    of the per-pollutant IAQI values.
    """
    pm_val = param_list[0]
    co_val = param_list[1]

    # One IAQI per pollutant; the worst one defines the AQI
    iaqi_values = [cal_pm_iaqi(pm_val), cal_co_iapi(co_val)]
    return max(iaqi_values)

def main():
    """Prompt for PM2.5 and CO readings and print the resulting AQI."""
    print('Please enter this information,and separate by spaces.')
    input_str = input('(1)PM2.5: (2)CO:')
    # Two space-separated numbers are expected
    parts = input_str.split(' ')
    param_list = [float(parts[0]), float(parts[1])]

    # Delegate the actual AQI computation
    aqi_val = cal_api(param_list)

    print('Air quality index value: {} '.format(aqi_val))

if __name__ == '__main__':
    main()

功能2.0 JSON讀取 

# -*- coding:utf-8 -*-
""""
    auther:Susan
    function:JSON reading
    version:v2.0
    data:2019/4/27
"""

import json

def process_json_file(filepath):
    """Load a JSON file and return the parsed city list.

    Fix: the original left the file handle open; a ``with`` block now
    guarantees it is closed even if json.load raises.
    """
    with open(filepath, mode='r', encoding='utf-8') as f:
        return json.load(f)


def main():
    """Read a city JSON file and save the five lowest-AQI cities."""
    filepath = input('Please input a json filemane:')
    city_list = process_json_file(filepath)
    # Ascending sort: smallest AQI (best air) first
    city_list.sort(key=lambda entry: entry['aqi'])
    top5_list = city_list[:5]

    out_file = open('top5_aqi.json', mode='w', encoding='utf-8')
    # ensure_ascii=False keeps the Chinese city names readable
    json.dump(top5_list, out_file, ensure_ascii=False)
    out_file.close()
    print(city_list)
if __name__ == '__main__':
    main()

 

功能3.0 CSV讀取 

# -*- coding:utf-8 -*-
""""
    auther:Susan
    function:CSV reading
    version:v3.0
    data:2019/4/27
"""

import json
import csv

def process_json_file(filepath):
    """Load a JSON file and return the parsed city list.

    Fix: the original left the file handle open; a ``with`` block now
    guarantees it is closed even if json.load raises.
    """
    with open(filepath, mode='r', encoding='utf-8') as f:
        return json.load(f)


def main():
    """Read a city JSON file and export every record to aqi1.csv."""
    filepath = input('Please input a json filemane:')
    city_list = process_json_file(filepath)
    city_list.sort(key=lambda city: city['aqi'])
    # NOTE(review): the original computed an unused top-5 slice here; removed.

    # First row: column names taken from the first record's keys
    lines = [list(city_list[0].keys())]
    for city in city_list:
        lines.append(list(city.values()))

    # newline='' stops the csv module emitting blank lines on Windows
    with open('aqi1.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(lines)

if __name__ == '__main__':
    main()

newline=''

新行不加入任何字符,不指定則會在新行末尾加入空行


根據輸入的文件判斷是JSON格式還是CSV格式,並進行相應的操作

功能4.0 判斷文件格式

# -*- coding:utf-8 -*-
""""
    auther:Susan
    function:Judge file format
    version:v4.0
    data:2019/5/8
    CSV:comma separated values
"""

import json
import csv
import os

def process_json_file(filepath):
    """Load a JSON file and print the parsed city list.

    Fix: removed the commented-out older implementation that was left
    inside the function body.
    """
    with open(filepath, mode='r', encoding='utf-8') as f:
        city_list = json.load(f)
    print(city_list)

def process_csv_file(filepath):
    """Print every row of a CSV file as one comma-joined line."""
    with open(filepath, mode='r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            # Re-join the parsed fields with commas for display
            print(','.join(row))

def main():
    """Dispatch to the JSON or CSV handler based on the file extension."""
    filepath = input('Please input a filemane:')

    # splitext keeps the dot: 'a.json' -> ('a', '.json')
    _, file_ext = os.path.splitext(filepath)

    handlers = {'.json': process_json_file, '.csv': process_csv_file}
    handler = handlers.get(file_ext)
    if handler is None:
        print('Unsupported file format!')
    else:
        handler(filepath)


if __name__ == '__main__':
    main()

 

功能5.0 利用爬蟲做實時計算  

# -*- coding:utf-8 -*-
""""
    auther:Susan
    function:Use crawlers for real-time calculations
    version:v5.0
    data:2019/5/8
    Access the webpage through the crawler and display it to the user
"""

import requests


def get_html_text(url):
    """Fetch *url* and return the response body as text.

    Also prints the HTTP status code for quick debugging.
    """

    response = requests.get(url, timeout=30)
    print(response.status_code)
    return response.text

def main():
    """Scrape pm25.in for one city and print the AQI value."""
    city_pinyin = input('Please enter the city pinyin:')
    url = 'http://pm25.in/' + city_pinyin
    url_text = get_html_text(url)
    # HTML fragment that immediately precedes the AQI value on the page
    aqi_div = '''    <div class="span12 data">
        <div class="span1">
          <div class="value">
            '''
    begin_index = url_text.find(aqi_div) + len(aqi_div)
    # assumes the AQI value is exactly the next two characters — TODO confirm
    aqi_val = url_text[begin_index:begin_index + 2]
    print('Air quality:{}'.format(aqi_val))

if __name__ == '__main__':
    main()

 

• 爲了能有效地提取並利用網絡信息並工作提高效率,出現了網絡爬蟲
• 利用網絡爬蟲實時獲取城市的空氣質量
• 高效地解析和處理HTML,beautifulsoup4

功能6.0 利用網絡爬蟲實時獲取城市的空氣質量

# -*- coding:utf-8 -*-
""""
    auther:Susan
    function:Parse HTML content more efficiently by BeautifilSoup
    version:v8.0
    data:2019/5/8

"""

import requests
from bs4 import BeautifulSoup

def get_city_aqi(city_pinyin):
    """Scrape pm25.in and return [(caption, value), ...] for one city."""
    url = 'http://pm25.in/' + city_pinyin
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')
    div_list = soup.find_all('div', {'class': 'span1'})

    city_aqi = []
    # Only the first 8 'span1' blocks hold pollutant readings
    for idx in range(8):
        block = div_list[idx]
        caption = block.find('div', {'class': 'caption'}).text.strip()
        value = block.find('div', {'class': 'value'}).text.strip()
        city_aqi.append((caption, value))

    return city_aqi

def main():
    """Prompt for a city pinyin and print its AQI readings."""
    city_pinyin = input('Please enter the city pinyin:')
    readings = get_city_aqi(city_pinyin)
    print('Air quality:{}'.format(readings))

if __name__ == '__main__':
    main()

Python strip() 方法用於移除字符串頭尾指定的字符(默認爲空格或換行符)或字符序列。

注意:該方法只能刪除開頭或是結尾的字符,不能刪除中間部分的字符。


功能7.0 利用beautifulsoup4獲取所有城市的空氣質量

# -*- coding:utf-8 -*-
""""
    auther:Susan
    function:BeautifilSoup
    version:v7.0
    data:2019/5/10

"""

import requests
from bs4 import BeautifulSoup

def get_city_aqi(city_pinyin):
    """Return the (caption, value) pollutant pairs scraped for one city."""
    response = requests.get('http://pm25.in/' + city_pinyin, timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')
    div_list = soup.find_all('div', {'class': 'span1'})

    readings = []
    # The page's first 8 'span1' blocks carry the pollutant readings
    for idx in range(8):
        block = div_list[idx]
        caption = block.find('div', {'class': 'caption'}).text.strip()
        value = block.find('div', {'class': 'value'}).text.strip()
        readings.append((caption, value))

    return readings

def get_all_cities():
    """Return [(city_name, city_pinyin), ...] scraped from pm25.in."""
    response = requests.get('http://pm25.in/', timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')

    # The second 'bottom' div holds the full city link list
    city_div = soup.find_all('div', {'class': 'bottom'})[1]
    cities = []
    for link in city_div.find_all('a'):
        # href looks like '/beijing'; drop the leading slash for the pinyin
        cities.append((link.text, link['href'][1:]))
    return cities
def main():
    """Print the AQI readings for every city listed on pm25.in."""
    for city_name, city_pinyin in get_all_cities():
        city_aqi = get_city_aqi(city_pinyin)
        print(city_name, city_aqi)

if __name__ == '__main__':
    main()

功能8.0 將獲取的所有城市空氣質量保存成CSV數據文件

# -*- coding:utf-8 -*-
""""
    auther:Susan
    function:
    1.Get the website of all cities AQI:BeautifilSoup
    2.Real-time AQI preservation
    version:v6.0
    data:2019/5/8

"""

import requests
from bs4 import BeautifulSoup
import csv

def get_city_aqi(city_pinyin):
    """Return the 8 pollutant reading values (strings) for one city."""
    response = requests.get('http://pm25.in/' + city_pinyin, timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')
    div_list = soup.find_all('div', {'class': 'span1'})

    values = []
    for idx in range(8):
        block = div_list[idx]
        # The caption is parsed (and validated to exist) but not returned
        caption = block.find('div', {'class': 'caption'}).text.strip()
        value = block.find('div', {'class': 'value'}).text.strip()
        values.append(value)
    return values

def get_all_cities():
    """Return [(city_name, city_pinyin), ...] for every city on pm25.in."""
    response = requests.get('http://pm25.in/', timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')

    # The second 'bottom' div contains the complete list of city links
    city_div = soup.find_all('div', {'class': 'bottom'})[1]
    # href looks like '/beijing'; strip the leading slash for the pinyin
    return [(a.text, a['href'][1:]) for a in city_div.find_all('a')]
def main():
    """Scrape AQI data for every city and save it to China_city_aqi.csv.

    Fix: the ozone column header was misspelled '03/8h' (digit zero,
    digit three) instead of 'O3/8h'.
    """
    city_list = get_all_cities()
    header = ['city', 'AQI', 'PM2.5/1h', 'PM10/1h', 'CO/1h', 'NO2/1h', 'O3/8h', 'SO2/1h']
    with open('China_city_aqi.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for i, (city_name, city_pinyin) in enumerate(city_list):
            # Progress report every 10 cities
            if (i + 1) % 10 == 0:
                print('Several records have been processed:{},a total of several records:{}.'.format(i + 1, len(city_list)))
            city_aqi = get_city_aqi(city_pinyin)
            writer.writerow([city_name] + city_aqi)

if __name__ == '__main__':
    main()

 什麼是Pandas

Pandas的數據結構

Pandas的數據操作

Pandas統計計算和描述

功能9.0 簡單的數據處理和分析

結構化數據:CSV,JSON

非結構化數據:視頻,圖片,聲音

aqi_data.sort_values(by=['AQI'])  # 默認從小到大
aqi_data.sort_values(by=['AQI'], ascending=False)  # 從大到小

# -*- coding:utf-8 -*-
""""
    auther:Susan
    function:
    1.Get the website of all cities AQI:BeautifilSoup
    2.Real-time AQI preservation
    version:v10.0
    data:2019/5/8
    note:python2.7
"""

import pandas as pd

def main():
    """Load China_city_aqi.csv, print basic AQI statistics, and save
    the ten best / worst cities to CSV files.

    Fixes: the original printed ``aqi_data['AQI'].max`` without calling
    it (showing the bound-method repr instead of the value) and used
    ``.max`` for the 'AQI min' line as well.
    """
    aqi_data = pd.read_csv('China_city_aqi.csv')

    print('Basic Information:')
    print(aqi_data.info())

    print('Data preview:')
    print(aqi_data.head())

    # Basic statistics
    print('AQI max:', aqi_data['AQI'].max())
    print('AQI min:', aqi_data['AQI'].min())
    print('AQI mean:', aqi_data['AQI'].mean())

    # Ten cities with the lowest AQI (best air quality)
    top10_cities = aqi_data.sort_values(by=['AQI']).head(10)
    print('Ten cities with the best air quality:')
    print(top10_cities)

    # Ten cities with the highest AQI (worst air quality)
    bottom_cities = aqi_data.sort_values(by=['AQI']).tail(10)
    print('Ten cities with the worst air quality:')
    print(bottom_cities)

    # Save as CSV
    top10_cities.to_csv('top10_aqi.csv')
    bottom_cities.to_csv('bottom10_aqi.csv')


if __name__ == '__main__':
    main()

功能10.0 數據清洗和可視化 

• 數據清洗;利用Pandas進行數據可視化

數據獲取(網絡爬蟲)--->數據清洗(只保留AQI>0的數據)

plot(kind, x, y, title, figsize)   #kind指定繪製圖像類型 

https://blog.csdn.net/claroja/article/details/73872066  plot屬性設置

https://www.jianshu.com/p/33f843a7cef5  plot教程

https://blog.csdn.net/qq_37904945/article/details/79818719   無法顯示中文字體的問題

終端輸入(系統中的中文字體所在的位置):fc-list  :lang=zh     
在python用絕對路徑來引用字體:

import matplotlib.pyplot as plt
import matplotlib as mpl
zhfont= mpl.font_manager.FontProperties(fname='/usr/share/fonts/truetype/arphic/ukai.ttc')
plt.plot([1, 2, 3])
plt.xlabel('x軸標籤', fontproperties=zhfont)
plt.ylabel('y軸標籤',fontproperties=zhfont)
plt.show()

 

# -*- coding:utf-8 -*-
""""
    auther:Susan
    function:
    1.Get the website of all cities AQI:BeautifilSoup
    2.Real-time AQI preservation
    3.Plot Top5
    version:v10.0
    data:2019/5/8
    note:python2.7
"""

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# plt.rcParams['font.sans-serif'] = ['SimHei']
# plt.rcParams['axes.unicode_minus'] = False
# Linux useless to display as Chinese

def main():
    """Clean the scraped AQI data (keep AQI > 0) and plot the 50 cities
    with the best air quality.

    Fixes: ``.max`` was printed without calling it and reused for the
    'AQI min' line; the second ``plt.xlabel`` call should have been
    ``plt.ylabel``; the plot used x='City' but the scraper writes a
    lowercase 'city' header.
    """
    aqi_data = pd.read_csv('China_city_aqi.csv')

    print('Basic Information:')
    print(aqi_data.info())

    print('Data preview:')
    print(aqi_data.head())

    # Keep only rows with a positive AQI reading
    clean_aqi_data = aqi_data[aqi_data['AQI'] > 0]

    # Basic statistics
    print('AQI max:', clean_aqi_data['AQI'].max())
    print('AQI min:', clean_aqi_data['AQI'].min())
    print('AQI mean:', clean_aqi_data['AQI'].mean())

    # NOTE(review): font path is machine-specific — confirm it exists locally
    font = mpl.font_manager.FontProperties(fname='/usr/share/fonts/opentype/noto/NotoSansCJK-Bold.ttc')
    top50_cities = clean_aqi_data.sort_values(by=['AQI']).head(50)
    top50_cities.plot(kind='bar', x='city', y='AQI',
                      title='Fifty cities with the best air quality',
                      figsize=(20, 10))
    plt.xticks(fontproperties=font)
    plt.xlabel(u"城市", fontproperties=font)
    plt.ylabel(u"空氣質量", fontproperties=font)
    plt.savefig('Top50_api.png')
    plt.show()
    top50_cities.to_csv('top50_aqi.csv')

if __name__ == '__main__':
    main()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章