EDA探索性數據分析 -- 聯合國糧農組織: 水資源

一. 認識數據

1.1 讀取數據

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# import cycler
mpl_update = {'font.size': 16, 'xtick.labelsize': 14, 'ytick.labelsize': 14, 'figure.figsize': [12, 8],
#             'axes.prop_cycle': cycler('color',['#0055A7', '#2C3E4F', '#26C5ED', '#00cc66', '#D34100', '#FF9700','#091D32']),
             'axes.labelsize': 20, 'axes.labelcolor': '#677385', 'axes.titlesize': 20,
             'lines.color': '#0055A7', 'lines.linewidth': 3, 'text.color': '#677385'}
mpl.rcParams.update(mpl_update)

data = pd.read_csv('aquastat.csv.gzip', compression='gzip')
print(data.shape)
# data.info()
data.head()

(143280, 7)
在這裏插入圖片描述

print(data.variable.nunique())
print(data.variable_full.nunique())
data[['variable', 'variable_full']].drop_duplicates()

60
60
在這裏插入圖片描述

print(data.country.nunique())
countries = data.country.unique()

print(data.year_measured.nunique())
print(data.time_period.nunique())
time_periods = data.time_period.unique()
time_periods

199
56
12
在這裏插入圖片描述

1.2 缺失值狀況

data[data.variable == 'total_area'].value.isnull().sum()

220

import missingno as msno
msno.matrix(data, figsize = (16, 6))

在這裏插入圖片描述

二. 切片分析

  • 橫截面:一個時期內所有國家
  • 時間序列:一個國家隨着時間的推移
  • 面板數據:所有國家隨着時間的推移(作爲數據給出)
  • 地理空間:所有地理上相互聯繫的國家

2.1 time slicing

def time_slice(df, time_period):
    """Return a country-by-variable table for one time period.

    Rows of ``df`` whose ``time_period`` column equals ``time_period`` are
    pivoted so that each country is a row and each variable a column, with
    cells taken from the ``value`` column.  The column axis of the result is
    named after the selected period.
    """
    in_period = df.loc[df['time_period'] == time_period]
    wide = in_period.pivot(index='country', columns='variable', values='value')
    wide.columns.name = time_period
    return wide

time_slice(data, time_periods[0]).head()

在這裏插入圖片描述

2.2 country slicing

def country_slice(df, country):
    """Return a variable-by-time-period table for one country.

    Rows of ``df`` whose ``country`` column equals ``country`` are pivoted so
    that each variable is a row and each time period a column, with cells
    taken from the ``value`` column.  The row index of the result is named
    after the selected country.
    """
    one_country = df.loc[df['country'] == country]
    wide = one_country.pivot(index='variable', columns='time_period', values='value')
    wide.index.name = country
    return wide

country_slice(data, countries[20]).head()

在這裏插入圖片描述

2.3 variable slicing

def variable_slice(df, variable):
    """Return a country-by-time-period table for one variable.

    Rows of ``df`` whose ``variable`` column equals ``variable`` are pivoted
    so that each country is a row and each time period a column, with cells
    taken from the ``value`` column.  The row index of the result is named
    after the selected variable.
    """
    one_variable = df.loc[df['variable'] == variable]
    wide = one_variable.pivot(index='country', columns='time_period', values='value')
    wide.index.name = variable
    return wide

variable_slice(data, 'rural_pop').head()

在這裏插入圖片描述

2.4 country and variable slicing

def time_series(df, country, variable):
    """Return the yearly values of one variable for one country.

    Parameters
    ----------
    df : DataFrame with at least the columns ``country``, ``variable``,
        ``year_measured`` and ``value``.
    country, variable : values to match in the columns of the same name.

    Returns
    -------
    A single-column DataFrame indexed by ``year_measured`` (cast to int)
    whose only column is named after ``variable``.

    Rows that have a missing entry in *any* column of ``df`` are dropped
    before the two columns of interest are selected.  ``df`` itself is never
    modified.
    """
    selected = df[(df['country'] == country) & (df['variable'] == variable)]
    # Take an explicit copy: assigning into a frame obtained by chained
    # selection is what triggers pandas' SettingWithCopyWarning.
    series = selected.dropna()[['year_measured', 'value']].copy()

    series['year_measured'] = series['year_measured'].astype(int)
    series = series.set_index('year_measured')
    series.columns = [variable]
    return series

time_series(data, 'China', 'total_pop')

在這裏插入圖片描述

2.5 region slicing

simple_regions = {'World | Asia':'Asia',
    'Americas | Central America and Caribbean | Central America': 'Central America',
    'Americas | Central America and Caribbean | Greater Antilles': 'Central America',
    'Americas | Central America and Caribbean | Lesser Antilles and Bahamas': 'Central America',
    'Americas | Northern America | Northern America': 'North America',
    'Americas | Northern America | Mexico': 'North America',
    'Americas | Southern America | Guyana':'South America',
    'Americas | Southern America | Andean':'South America',
    'Americas | Southern America | Brazil':'South America',
    'Americas | Southern America | Southern America':'South America', 
    'World | Africa':'Africa',
    'World | Europe':'Europe', 
    'World | Oceania':'Oceania'}

data.region = data.region.apply(lambda x: simple_regions[x])
print(data.region.nunique())
data.region.unique()

在這裏插入圖片描述

三. 缺失值

3.1 查看缺失值

recent = time_slice(data, '2013-2017')
msno.matrix(recent, labels=True)

在這裏插入圖片描述

3.2 剔除缺失值多的特徵

3.2.1 可開發水資源總量

( Total exploitable water resources )

msno.matrix(variable_slice(data, 'exploitable_total'), inline=False, sort='descending')
plt.xlabel('Time period'); plt.ylabel('Country')
plt.title('Missing total exploitable water resources data across countries and time periods')

在這裏插入圖片描述

data = data.loc[~data.variable.str.contains('exploitable'), :]

3.2.2 降水指數

national_rainfall_index 全國降水指數(NRI)(毫米/年)

msno.matrix(variable_slice(data, 'national_rainfall_index'), inline=False, sort='descending')
plt.xlabel('Time period'); plt.ylabel('Country')
plt.title('Missing national rainfall index data across countries and time periods')

在這裏插入圖片描述

data = data.loc[~(data.variable == 'national_rainfall_index')]

3.2.3 Central America

#指數完整性
central_america = data[data.region == 'Central America']
msno.matrix(msno.nullity_sort(time_slice(central_america, '2013-2017'), sort='descending').T)

在這裏插入圖片描述

# 抽查巴哈馬缺少哪些數據以獲得更多的瞭解
msno.nullity_filter(country_slice(data, 'Bahamas').T, filter = 'bottom', p = 0.1)

在這裏插入圖片描述

四. 單特徵

4.1 Folium可視化

geo = r'world.json'

import folium

# 1 where the 2013-2017 agricultural contribution to GDP is reported, 0 where it is missing.
null_data = recent['agg_to_gdp'].notnull() * 1
# Named ``null_map`` rather than ``map`` so the builtin ``map`` stays usable in later cells.
null_map = folium.Map(location = [48, -102], zoom_start = 2)
null_map.choropleth(geo_data = geo, data = null_data, columns = ['country', 'agg_to_gdp'],
                    key_on = 'feature.properties.name', reset = True, fill_color = 'GnBu',
                    fill_opacity = 0.8, line_opacity = 0.2, legend_name = 'Missing agricultural contribution to GDP data 2013-2017')
null_map

在這裏插入圖片描述

#模板

def plot_null_map(df, time_period, variable, legend_name = None, geo = r'world.json'):
    """Draw a world choropleth showing where ``variable`` has data.

    Parameters
    ----------
    df : long-format frame accepted by ``time_slice``.
    time_period : period passed to ``time_slice``.
    variable : column of the sliced table to inspect.
    legend_name : legend caption; falls back to ``variable`` when empty.
    geo : GeoJSON source handed to folium as ``geo_data``.  Defaults to the
        ``world.json`` file that was previously hard-coded.

    Returns
    -------
    The ``folium.Map`` with the choropleth layer added.  Each country is
    coloured by 1 (value present) or 0 (value missing); countries are matched
    to GeoJSON features through ``feature.properties.name``.
    """
    ts = time_slice(df, time_period).reset_index()
    # Turn the booleans from notnull() into 0/1 so they can be colour-mapped.
    ts[variable] = ts[variable].notnull() * 1
    # Local is called ``world_map`` to avoid shadowing the builtin ``map``.
    world_map = folium.Map(location = [39, 118], zoom_start = 2)
    world_map.choropleth(geo_data = geo, data = ts, columns = ['country', variable],
                         key_on = 'feature.properties.name', reset = True, fill_color = 'GnBu',
                         fill_opacity = 1, line_opacity = 0.2,
                         legend_name = legend_name if legend_name else variable)

    return world_map

plot_null_map(data, '2013-2017', 'number_undernourished', 'Number undernourished is missing')

4.2 Heatmap

fig, ax = plt.subplots(figsize = (14, 12))
sns.heatmap(data.groupby(['time_period', 'variable']).value.count().unstack().T, ax = ax)
plt.xticks(rotation = 45)
plt.title('Number of countries with data reported for each variable over time')

在這裏插入圖片描述

4.3 Pivottablejs 與 pandas_profiling

# Interactive pivot table of the 2013-2017 slice (the module is spelled ``pivottablejs``).
import pivottablejs
pivottablejs.pivot_ui(time_slice(data, '2013-2017'))
# Automatic profiling report of the same slice.
import pandas_profiling
pandas_profiling.ProfileReport(time_slice(data, '2013-2017'))

五. 數據對數變換

For numerical data, look at:

  • Location: 均值,中位數,眾數,四分位數
  • Spread: 標準差、方差、全距、四分位距
  • Shape: 偏度、峯度
recent[['total_pop', 'urban_pop', 'rural_pop']].describe().astype(int)

在這裏插入圖片描述

recent.sort_values('rural_pop')[['total_pop', 'urban_pop', 'rural_pop']].head()

在這裏插入圖片描述

time_series(data, 'Qatar', 'total_pop').join(time_series(data, 'Qatar', 'urban_pop')).join(time_series(data, 'Qatar', 'rural_pop'))

在這裏插入圖片描述

5.1 峯度與偏度

# Import the submodule explicitly: a bare ``import scipy`` does not guarantee
# that ``scipy.stats`` has been loaded.
import scipy.stats
recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.skew)

在這裏插入圖片描述

recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.kurtosis)

在這裏插入圖片描述

sns.set_style('darkgrid')
plt.figure(figsize = (8, 6))
plt.hist(recent.total_pop.values, bins = 50)
plt.xlabel('Total population')
plt.ylabel('Number of countries')
plt.title('Distribution of population of countries 2013-2017')

在這裏插入圖片描述

5.2 Log transform

print(recent[['total_pop']].apply(np.log).apply(scipy.stats.skew))
recent[['total_pop']].apply(np.log).apply(scipy.stats.kurtosis)

在這裏插入圖片描述

# 模板

def plot_hist(df, variable, bins = 30, xlabel = None, by = None,
              ylabel = None, title = None,logx = False, ax = None):
    """Plot a histogram of one column, optionally on a logarithmic x axis.

    Parameters
    ----------
    df : DataFrame holding the data; it is never modified.
    variable : name of the column to plot.
    bins : forwarded to ``ax.hist``.  With ``logx`` it must be an integer and
        is used as the number of log-spaced bin edges.
    xlabel, ylabel, title : optional axis labels and title.
    by : accepted for interface compatibility; not used by this function.
    logx : when true, use log-spaced bins and a log-scaled x axis.  If the
        column contains values <= 0, the plotted values are shifted so that
        the minimum becomes 1 and a warning with the shift is printed.
    ax : axes to draw on; a new 8x6 figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize = (8, 6))

    # Work on a local Series so the caller's column is never overwritten.
    values = df[variable]
    if logx:
        lowest = values.min()
        if lowest <= 0:
            # Compute the shift before applying it so the message reports the
            # amount actually added.
            shift = 1 - lowest
            values = values - lowest + 1
            print('Warning: data <= 0 exists, data transformed by %0.2g before plotting' % shift)

        bins = np.logspace(np.log10(values.min()), np.log10(values.max()), bins)
        ax.set_xscale('log')
    ax.hist(values.dropna().values, bins = bins)

    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    return ax

plot_hist(recent, 'total_pop', logx = True, xlabel = 'Log of total population',
         ylabel='Number of countries', title='Distribution of total population of countries 2013-2017')

在這裏插入圖片描述

  • Normalization
recent['population_density'] = recent.total_pop.divide(recent.total_area)
recent['population_density']

在這裏插入圖片描述

六. 數據分析維度

6.1 One country 人口

# 古巴
plt.plot(time_series(data, 'Cuba', 'total_pop'))
plt.xlabel('Year')
plt.ylabel('Population')
plt.title('Cuba population over time')

在這裏插入圖片描述

6.2 One region 人口

with sns.color_palette(sns.diverging_palette(220, 280, s = 85, l = 25, n = 20)):
    # Countries of the region, ordered by their 1998-2002 total population.
    central_america = time_slice(data[data.region == 'Central America'], '1998-2002').sort_values('total_pop').index.tolist()
    for country in central_america:
        plt.plot(time_series(data, country, 'total_pop'), label = country, lw = 1.5)
    # Labels do not depend on the country, so set them once after the loop.
    plt.xlabel('Year')
    plt.ylabel('Population')
    plt.title('Central American populations over time')
    plt.legend(loc = 2, prop = {'size': 9})

在這裏插入圖片描述

6.2.1 增長率

with sns.color_palette(sns.diverging_palette(220, 280, s = 85, l = 25, n = 20)):
    for country in central_america:
        ts = time_series(data, country, 'total_pop')
        ts['norm_pop'] = ts.total_pop / ts.total_pop.min() * 100
        plt.plot(ts['norm_pop'], label = country, lw = 1.5)
        plt.xlabel('Year')
        plt.ylabel('Percent increase in population')
        plt.title('Percent increase in population from 1960 in Central American countries')
    plt.legend(loc = 'best', prop = {'size': 9})

在這裏插入圖片描述

6.2.2 增長率Heatmap

central_america_pop = variable_slice(data[data.region == 'Central America'], 'total_pop')
central_america_norm_pop = central_america_pop.div(central_america_pop.min(axis = 1), axis = 0)
# pandas.DataFrame.div 數據除以常數
central_america_norm_pop = central_america_norm_pop.loc[central_america]

fig, ax = plt.subplots(figsize = (12, 10))
sns.heatmap(central_america_norm_pop, ax=ax, cmap=sns.light_palette((214, 90, 60),
                                                                   input='husl', as_cmap=True))
plt.xticks(rotation = 60)
plt.xlabel('Time period')
plt.ylabel('Country, ordered by population in 1960')
plt.title('Percent increase in population from 1960')

在這裏插入圖片描述

6.3 可再生水資源

total renewable water resources

plot_hist(recent, 'total_renewable', bins=40, xlabel='Total renewable water resources ($10^9 m^3/yr$)',
         ylabel='Number of countries', title='Distribution of total renewable water resources, 2013-2017')

在這裏插入圖片描述

plot_hist(recent, 'total_renewable', bins=40, ylabel='Number of countries',
          xlabel='Total renewable water resources ($10^9 m^3/yr$)', logx = True,
          title='Distribution of total renewable water resources, 2013-2017')

在這裏插入圖片描述

# Country-by-period table of total renewable water resources for the region.
central_america_renew = variable_slice(data[data.region=='Central America'], 'total_renewable')

fig,ax = plt.subplots(figsize = (8, 6))
sns.heatmap(central_america_renew, ax=ax, cmap=sns.light_palette((214,90,60), input='husl', as_cmap=True))
plt.xticks(rotation=45)
plt.xlabel('Time period')
# Rows keep the order returned by variable_slice; no extra sorting is applied.
plt.ylabel('Country', fontsize = 14)
plt.title('Total renewable water resources in Central American countries over time')

在這裏插入圖片描述

6.4 Ipywidgets查看每一個單變量

def two_hist(df, variable, bins = 50, ylabel = 'Number of countries', title = None):
    """Build a figure with a linear and a log-x histogram of one column.

    The left panel calls ``plot_hist`` on a linear axis, the right panel calls
    it with ``logx=True``.  When ``title`` is given it is used for both
    panels; otherwise the panels are titled with the variable name and
    ``'Log of <variable>'`` respectively.

    Returns the figure.  It is closed in pyplot before being returned
    (presumably so a notebook does not render it a second time - confirm).
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (18, 8))
    log_label = 'Log of ' + variable
    plot_hist(df, variable, bins = bins, xlabel = variable, ylabel = ylabel,
              ax = ax1, title = title if title else variable)
    plot_hist(df, variable, bins = bins, xlabel = log_label, ylabel = ylabel,
              logx = True, ax = ax2, title = title if title else log_label)
    plt.close(fig)
    return fig

import ipywidgets as widgets

def hist_over_var(df, variables, bins = 50, ylabel = 'Number of countries', title = None):
    """Show an interactive pair of histograms with a dropdown to pick the column.

    Parameters
    ----------
    df : DataFrame forwarded unchanged to ``two_hist``.
    variables : non-empty sequence (list, Index, array) of column names
        offered in the dropdown; the first entry is selected initially.
    bins, ylabel, title : forwarded unchanged to ``two_hist``.
    """
    options = list(variables)
    variable_slider = widgets.Dropdown(options = options, value = options[0],
                                      description = 'Variable:', disabled = False)
    # Keyword names must match two_hist's parameters, since interact calls it
    # with exactly these keywords.
    widgets.interact(two_hist, df = widgets.fixed(df), variable = variable_slider,
                    ylabel = widgets.fixed(ylabel), title = widgets.fixed(title), bins = widgets.fixed(bins))
    
hist_over_var(recent, recent.columns, bins = 40)

在這裏插入圖片描述

七. 變量關係可視化展示

7.1 Scatter plots

# seasonal_variability 季節變化(WRI)

plt.scatter(recent.seasonal_variability, recent.gdp_per_capita)
plt.xlabel('Seasonal variability')
plt.ylabel('GDP per capita ($USD/person)')

在這裏插入圖片描述

# 模板

def plot_scatter(df, x, y, xlabel = None, ylabel = None, title = None,
                logx = False, logy = False, by = None, ax = None):
    """Scatter-plot column ``y`` against column ``x``, optionally grouped.

    Parameters
    ----------
    df : DataFrame holding the data.
    x, y : names of the columns plotted on the horizontal / vertical axis.
    xlabel, ylabel : axis labels; default to the column names.
    title : optional axes title.
    logx, logy : switch the corresponding axis to a log scale.
    by : optional ``groupby`` key; each group is drawn in its own colour
        with a legend entry.
    ax : axes to draw on; a new 8x6 figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize = (8, 6))

    colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']
    if by:
        for j, (name, group) in enumerate(df.groupby(by)):
            # Wrap around the colour cycle so more groups than colours does
            # not raise IndexError.
            ax.scatter(group[x], group[y], color = colors[j % len(colors)], label = name)
        ax.legend()
    else:
        ax.scatter(df[x], df[y], color = colors[0])
    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')
    if title:
        ax.set_title(title)

    ax.set_xlabel(xlabel if xlabel else x)
    ax.set_ylabel(ylabel if ylabel else y)
    return ax
    
plot_scatter(recent, 'total_renewable', 'gdp_per_capita')

在這裏插入圖片描述

7.2 Joint plot

svr = [recent.seasonal_variability.min(), recent.seasonal_variability.max()]
gdpr = [recent.gdp_per_capita.min(), recent.gdp_per_capita.max()]
gdpbins = np.logspace(*np.log10(gdpr), 25)

# sns.set(style = 'ticks', color_codes = True)

g = sns.JointGrid(x='seasonal_variability', y='gdp_per_capita', data=recent, ylim=gdpr)
g.ax_marg_x.hist(recent.seasonal_variability, range = svr)
g.ax_marg_y.hist(recent.gdp_per_capita, range=gdpr, bins=gdpbins, orientation='horizontal')
g.plot_joint(plt.hexbin, gridsize = 40)
ax = g.ax_joint
# ax.set_yscale('log')
g.fig.set_figheight(8)
g.fig.set_figwidth(10)

在這裏插入圖片描述

7.3 Correlation

recent_corr = recent.corr().loc['gdp_per_capita'].drop(['gdp', 'gdp_per_capita'])

def conditional_bar(series, bar_colors = None, color_labels = None, figsize = (14, 24),
                   xlabel = None, by = None, ylabel = None, title = None):
    """Draw a horizontal bar chart of ``series`` with per-bar colours.

    Parameters
    ----------
    series : pandas Series; its values are the bar lengths and its index
        supplies the y tick labels.
    bar_colors : colour(s) passed to ``barh``; when empty, the first colour
        of the current matplotlib colour cycle is used.
    color_labels : optional mapping ``colour -> legend text``; one square
        legend marker is added per entry.
    figsize : size of the new figure.
    xlabel, ylabel, title : optional texts; empty strings are used when omitted.
    by : accepted but not used by this function.

    Returns
    -------
    The figure, after ``plt.close()`` has been called.
    """
    fig, ax = plt.subplots(figsize=figsize)
    positions = range(len(series))

    if not bar_colors:
        cycle_colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']
        bar_colors = cycle_colors[0]

    plt.barh(positions, series.values, color=bar_colors)
    plt.xlabel(xlabel if xlabel else '')
    plt.ylabel(ylabel if ylabel else '')
    plt.yticks(positions, series.index.tolist())
    plt.title(title if title else '')
    plt.ylim([-1, len(series)])

    if color_labels:
        # Plot empty series so each colour gets a legend handle without
        # drawing anything on the chart.
        for swatch, text in color_labels.items():
            plt.plot([], linestyle='', marker='s', c=swatch, label=text)
        handles, texts = ax.get_legend_handles_labels()
        n_entries = len(color_labels)
        ax.legend(handles[-n_entries:], texts[-n_entries:], loc='upper right')

    plt.close()
    return fig

bar_colors = ['#0055A7' if x else '#2C3E4F' for x in list(recent_corr.values < 0)]
color_labels = {'#0055A7': 'Negative correlation', '#2C3E4F': 'Positive correlation'}

conditional_bar(recent_corr.apply(np.abs), bar_colors, color_labels, xlabel = '|Correlation|',
               title = 'Magnitude of correlation with GDP per capita, 2013-2017')

在這裏插入圖片描述

7.4 標籤 binned

7.4.1 binned之前

plot_hist(recent, 'gdp_per_capita', xlabel = 'GDP per capita ($)', ylabel = 'Number of countries',
         title = 'Distribution of GDP per capita, 2013-2017')

plot_hist(recent, 'gdp_per_capita', xlabel = 'GDP per capita ($)',logx = True, bins = 30,
        ylabel = 'Number of countries', title = 'Distribution of GDP per capita, 2013-2017')

在這裏插入圖片描述在這裏插入圖片描述

7.4.2 binned之後

capita_bins = ['Very low', 'Low', 'Medium', 'High', 'Very high']
recent['gdp_bin'] = pd.qcut(recent.gdp_per_capita, 5, capita_bins)
bin_ranges = pd.qcut(recent.gdp_per_capita, 5).unique()

def plot_hist(df, variable, bins = None, xlabel = None, by = None,
             ylabel = None, title = None, logx = False, ax = None):
    """Plot a histogram of one column, optionally split by a grouping column.

    Parameters
    ----------
    df : DataFrame holding the data; it is never modified.
    variable : name of the column to plot.
    bins : forwarded to ``ax.hist``.  With ``logx`` it is the number of
        log-spaced bin edges and defaults to 30 when omitted.
    xlabel, ylabel, title : optional axis labels and title.
    by : optional column name; one labelled histogram is drawn per group
        and a legend is added.  For a categorical column the groups are its
        categories, otherwise its distinct non-missing values.
    logx : when true, use log-spaced bins and a log-scaled x axis.  If the
        column contains values <= 0, the plotted values are shifted so that
        the minimum becomes 1 and a warning with the shift is printed.
    ax : axes to draw on; a new 8x6 figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize = (8, 6))

    # Local Series: any shift below must not leak into the caller's frame.
    values = df[variable]
    if logx:
        lowest = values.min()
        if lowest <= 0:
            print('Warning: data <= 0 exists, data transformed by %0.2g before plotting' % (1 - lowest))
            values = values - lowest + 1
        # np.logspace needs an integer count, so fall back to 30 edges.
        n_edges = 30 if bins is None else bins
        bins = np.logspace(np.log10(values.min()), np.log10(values.max()), n_edges)
        ax.set_xscale('log')

    if by:
        groups = df[by].dropna().unique()
        if isinstance(groups, pd.Categorical):
            cats = groups.categories.tolist()
        else:
            cats = groups.tolist()
        for cat in cats:
            to_plot = values[df[by] == cat].dropna()
            ax.hist(to_plot, bins = bins, label = str(cat))
        ax.legend()
    else:
        ax.hist(values.dropna().values, bins = bins)

    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    return ax

plot_hist(recent, 'gdp_per_capita', xlabel = 'GDP per capita ($)', logx = True,
         ylabel = 'Number of countries', bins = 25, by = 'gdp_bin',
         title = 'Distribution of log GDP per capita, 2013-2017')

在這裏插入圖片描述

7.4.3 Box plot

recent[['gdp_bin','total_pop_access_drinking']].boxplot(by = 'gdp_bin')
plt.title('Distribution of percent of total population with access to drinking water across gdp per capita categories')
plt.xlabel('GDP per capita quintile')
# The plotted variable is the access share named in the title, not the population count.
plt.ylabel('Total population with access to drinking water (%)')

在這裏插入圖片描述

def mult_boxplots(df, variable, category, xlabel = None, ylabel = None, title = None, ylim = None):
    """Draw box plots of ``variable`` grouped by ``category``.

    The two columns are selected from ``df`` and handed to the pandas
    ``boxplot`` method with ``by=category``.  Any of ``xlabel``, ``ylabel``,
    ``title`` and ``ylim`` that is truthy is then applied through pyplot, in
    that order.  Nothing is returned.
    """
    df[[variable, category]].boxplot(by=category)

    decorations = (
        (xlabel, plt.xlabel),
        (ylabel, plt.ylabel),
        (title, plt.title),
        (ylim, plt.ylim),
    )
    for setting, apply_setting in decorations:
        if setting:
            apply_setting(setting)
        
mult_boxplots(recent, 'flood_occurence', 'gdp_bin', xlabel = 'GDP per capita quintile')

在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章