EDA探索性數據分析 -- 聯合國糧農組織: 水資源

一. 認識數據

1.1 讀取數據

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Notebook-wide matplotlib defaults: font sizes, figure size, label/title colours and line style.
# import cycler
mpl_update = {'font.size': 16, 'xtick.labelsize': 14, 'ytick.labelsize': 14, 'figure.figsize': [12, 8],
#             'axes.prop_cycle': cycler('color',['#0055A7', '#2C3E4F', '#26C5ED', '#00cc66', '#D34100', '#FF9700','#091D32']),
             'axes.labelsize': 20, 'axes.labelcolor': '#677385', 'axes.titlesize': 20,
             'lines.color': '#0055A7', 'lines.linewidth': 3, 'text.color': '#677385'}
mpl.rcParams.update(mpl_update)

# Load the gzip-compressed CSV; compression is given explicitly because the
# file name ends in '.gzip' rather than '.gz'.
data = pd.read_csv('aquastat.csv.gzip', compression='gzip')
print(data.shape)
# data.info()
data.head()

(143280, 7)

print(data.variable.nunique())
print(data.variable_full.nunique())
data[['variable', 'variable_full']].drop_duplicates()

60
60

print(data.country.nunique())
countries = data.country.unique()

print(data.year_measured.nunique())
print(data.time_period.nunique())
time_periods = data.time_period.unique()
time_periods

199
56
12

1.2 缺失值狀況

data[data.variable == 'total_area'].value.isnull().sum()

220

import missingno as msno
msno.matrix(data, figsize = (16, 6))

二. 切片分析

橫截面：一個時期內所有國家
時間序列：一個國家隨着時間的推移
面板數據：所有國家隨着時間的推移（作爲數據給出）
地理空間：所有地理上相互聯繫的國家

2.1 time slicing

def time_slice(df, time_period):
    """Return a country-by-variable table for a single time period.

    Rows of ``df`` whose ``time_period`` column equals ``time_period`` are
    pivoted so that ``country`` becomes the index, ``variable`` the columns
    and ``value`` the cell contents. The columns axis is named after the
    selected period.
    """
    in_period = df.time_period == time_period
    wide = df[in_period].pivot(index='country', columns='variable', values='value')
    wide.columns.name = time_period
    return wide

time_slice(data, time_periods[0]).head()

2.2 country slicing

def country_slice(df, country):
    """Return a variable-by-time-period table for a single country.

    Rows of ``df`` whose ``country`` column equals ``country`` are pivoted so
    that ``variable`` becomes the index, ``time_period`` the columns and
    ``value`` the cell contents. The index is named after the country.
    """
    for_country = df.country == country
    wide = df[for_country].pivot(index='variable', columns='time_period', values='value')
    wide.index.name = country
    return wide

country_slice(data, countries[20]).head()

2.3 variable slicing

def variable_slice(df, variable):
    """Return a country-by-time-period table for a single variable.

    Rows of ``df`` whose ``variable`` column equals ``variable`` are pivoted
    so that ``country`` becomes the index, ``time_period`` the columns and
    ``value`` the cell contents. The index is named after the variable.
    """
    for_variable = df.variable == variable
    wide = df[for_variable].pivot(index='country', columns='time_period', values='value')
    wide.index.name = variable
    return wide

variable_slice(data, 'rural_pop').head()

2.4 country and variable slicing

def time_series(df, country, variable):
    """Return one variable of one country as a year-indexed, single-column frame.

    Rows matching both ``country`` and ``variable`` are selected, rows with a
    missing value in any column are dropped, and the result keeps only the
    measured value, indexed by ``year_measured`` cast to ``int``. The single
    column is named after ``variable``.
    """
    wanted = (df.country == country) & (df.variable == variable)
    observed = df[wanted].dropna()

    result = observed[['year_measured', 'value']].copy()
    result['year_measured'] = result['year_measured'].astype(int)
    result = result.set_index('year_measured')
    result.columns = [variable]
    return result

time_series(data, 'China', 'total_pop')

2.5 region slicing

# Collapse the hierarchical region labels into seven broad regions.
simple_regions = {
    'World | Asia': 'Asia',
    'Americas | Central America and Caribbean | Central America': 'Central America',
    'Americas | Central America and Caribbean | Greater Antilles': 'Central America',
    'Americas | Central America and Caribbean | Lesser Antilles and Bahamas': 'Central America',
    'Americas | Northern America | Northern America': 'North America',
    'Americas | Northern America | Mexico': 'North America',
    'Americas | Southern America | Guyana': 'South America',
    'Americas | Southern America | Andean': 'South America',
    'Americas | Southern America | Brazil': 'South America',
    'Americas | Southern America | Southern America': 'South America',
    'World | Africa': 'Africa',
    'World | Europe': 'Europe',
    'World | Oceania': 'Oceania',
}

# Every label must be a key of the mapping: an unknown label raises KeyError.
# This also means the cell cannot be run twice on the same `data`.
data.region = [simple_regions[label] for label in data.region]
print(data.region.nunique())
data.region.unique()

三. 缺失值

3.1 查看缺失值

recent = time_slice(data, '2013-2017')
msno.matrix(recent, labels=True)

3.2 剔除缺失值多的特徵

3.2.1 水資源總量

( Total exploitable water resources )

# Nullity matrix of total exploitable water resources (countries x time periods).
# NOTE(review): `inline` may not be accepted by newer missingno releases - confirm against the installed version.
msno.matrix(
    variable_slice(data, 'exploitable_total'),
    inline=False,
    sort='descending',
)
plt.xlabel('Time period')
plt.ylabel('Country')
plt.title('Missing total exploitable water resources data across countries and time periods')

# Drop every variable whose name contains 'exploitable'.
data = data[~data.variable.str.contains('exploitable')]

3.2.2 降水指數

national_rainfall_index 全國降水指數（NRI）（毫米/年)

# Nullity matrix of the national rainfall index (countries x time periods).
# NOTE(review): `inline` may not be accepted by newer missingno releases - confirm against the installed version.
msno.matrix(
    variable_slice(data, 'national_rainfall_index'),
    inline=False,
    sort='descending',
)
plt.xlabel('Time period')
plt.ylabel('Country')
plt.title('Missing national rainfall index data across countries and time periods')

# Remove the rainfall index rows from the working data set.
data = data[data.variable != 'national_rainfall_index']

3.2.3 Central America

# Completeness of the indicators for Central America in 2013-2017.
# Rows are sorted by nullity and the table is transposed before plotting.
central_america = data[data.region == 'Central America']
msno.matrix(msno.nullity_sort(time_slice(central_america, '2013-2017'), sort='descending').T)

# Spot-check which data are missing for the Bahamas to get a better understanding
msno.nullity_filter(country_slice(data, 'Bahamas').T, filter = 'bottom', p = 0.1)

四. 單特徵

4.1 Folium可視化

geo = r'world.json'

import folium
# 1 where agg_to_gdp was reported in 2013-2017, 0 where it is missing
# (multiplying the boolean mask by 1 turns it into 0/1 integers).
null_data = recent['agg_to_gdp'].notnull() * 1
# Named `world_map` rather than `map` so the builtin `map` is not shadowed
# for every later cell of the notebook.
world_map = folium.Map(location = [48, -102], zoom_start = 2)
# NOTE(review): Map.choropleth is the older folium API; newer releases use
# folium.Choropleth(...).add_to(world_map) - confirm against the installed version.
world_map.choropleth(geo_data = geo, data = null_data, columns = ['country', 'agg_to_gdp'],
              key_on = 'feature.properties.name', reset = True, fill_color = 'GnBu',
              fill_opacity = 0.8, line_opacity = 0.2, legend_name = 'Missing agricultural contribution to GDP data 2013-2017')
world_map

#模板

def plot_null_map(df, time_period, variable, legend_name = None):
    """Return a folium choropleth showing where ``variable`` is reported.

    The data for ``time_period`` is obtained with ``time_slice``; ``variable``
    is replaced by 1 where a value is present and 0 where it is missing, and
    that indicator is drawn per country on the geometry in ``world.json``.
    ``legend_name`` falls back to the variable name when not given.
    """
    geo = r'world.json'

    presence = time_slice(df, time_period).reset_index()
    presence[variable] = presence[variable].notnull() * 1

    # NOTE(review): Map.choropleth is the older folium API; newer releases use
    # folium.Choropleth - confirm against the installed version.
    choropleth_map = folium.Map(location = [39, 118], zoom_start = 2)
    choropleth_map.choropleth(
        geo_data = geo,
        data = presence,
        columns = ['country', variable],
        key_on = 'feature.properties.name',
        reset = True,
        fill_color = 'GnBu',
        fill_opacity = 1,
        line_opacity = 0.2,
        legend_name = legend_name or variable,
    )

    return choropleth_map

plot_null_map(data, '2013-2017', 'number_undernourished', 'Number undernourished is missing')

4.2 Heatmap

fig, ax = plt.subplots(figsize = (14, 12))
sns.heatmap(data.groupby(['time_period', 'variable']).value.count().unstack().T, ax = ax)
plt.xticks(rotation = 45)
plt.title('Number of countries with data reported for each variable over time')

4.3 Pivottablejs 與 pandas_profiling

# Interactive pivot table of the 2013-2017 slice.
# The module name is `pivottablejs`; the previous `import pivotablejs` left
# the name used on the next line undefined.
import pivottablejs
pivottablejs.pivot_ui(time_slice(data, '2013-2017'))

# Automatic profiling report of the same slice (closing parenthesis restored).
import pandas_profiling
pandas_profiling.ProfileReport(time_slice(data, '2013-2017'))

五. 數據對數變換

For numerical data, look at:

Location: 均值、中位數、衆數、四分位數
Spread: 標準差、方差、極差、四分位距
Shape: 偏度、峯度

recent[['total_pop', 'urban_pop', 'rural_pop']].describe().astype(int)

recent.sort_values('rural_pop')[['total_pop', 'urban_pop', 'rural_pop']].head()

time_series(data, 'Qatar', 'total_pop').join(time_series(data, 'Qatar', 'urban_pop')).join(time_series(data, 'Qatar', 'rural_pop'))

5.1 峯度與偏度

# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# make `scipy.stats` available as an attribute on every SciPy version.
import scipy.stats
# Skewness of each population column in the 2013-2017 slice.
recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.skew)

# Kurtosis of the same columns.
recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.kurtosis)

sns.set_style('darkgrid')
plt.figure(figsize = (8, 6))
plt.hist(recent.total_pop.values, bins = 50)
plt.xlabel('Total population')
plt.ylabel('Number of countries')
plt.title('Distribution of population of countries 2013-2017')

5.2 Log transform

print(recent[['total_pop']].apply(np.log).apply(scipy.stats.skew))
recent[['total_pop']].apply(np.log).apply(scipy.stats.kurtosis)

# 模板

def plot_hist(df, variable, bins = 30, xlabel = None, by = None,
              ylabel = None, title = None,logx = False, ax = None):
    """Draw a histogram of ``df[variable]`` and return the axes used.

    Parameters
    ----------
    df : DataFrame holding the column to plot. It is never modified.
    variable : name of the column to plot; missing values are ignored.
    bins : passed to ``ax.hist``. With ``logx=True`` it is the number of
        logarithmically spaced bin edges to generate.
    xlabel, ylabel, title : optional axis labels and title.
    by : accepted but not used by this implementation.
    logx : when true, use log-spaced bins and a log-scaled x axis. If the
        data contain values <= 0 they are shifted so the minimum becomes 1
        and a message reporting the shift is printed.
    ax : axes to draw on; a new 8x6 figure is created when omitted.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize = (8, 6))

    # Work on a local Series so the caller's DataFrame is left untouched.
    values = df[variable].dropna()

    if logx:
        lowest = values.min()
        if lowest <= 0:
            # Compute the shift before applying it, so the message reports
            # the amount that was actually added.
            shift = 1 - lowest
            values = values + shift
            print('Warning: data <= 0 exists, data transformed by %0.2g before plotting' % shift)

        bins = np.logspace(np.log10(values.min()), np.log10(values.max()), bins)
        ax.set_xscale('log')

    ax.hist(values.values, bins = bins)

    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    return ax

# Log-scaled histogram of total population; the period in the title was mistyped as '2013-2-17'.
plot_hist(recent, 'total_pop', logx = True, xlabel = 'Log of total population',
         ylabel='Number of countries', title='Distribution of total population of countries 2013-2017')

Normalization

recent['population_density'] = recent.total_pop.divide(recent.total_area)
recent['population_density']

六. 數據分析維度

6.1 One country 人口

# 古巴
plt.plot(time_series(data, 'Cuba', 'total_pop'))
plt.xlabel('Year')
plt.ylabel('Population')
plt.title('Cuba population over time')

6.2 One region 人口

with sns.color_palette(sns.diverging_palette(220, 280, s = 85, l = 25, n = 20)):
    # Central American countries, ordered by their total population in the 1998-2002 slice.
    central_america = time_slice(data[data.region == 'Central America'], '1998-2002').sort_values('total_pop').index.tolist()
    for country in central_america:
        plt.plot(time_series(data, country, 'total_pop'), label = country, lw = 1.5)
    # Labels do not depend on the country, so they are set once after the loop.
    # The title names the region that is actually plotted.
    plt.xlabel('Year')
    plt.ylabel('Population')
    plt.title('Central American populations over time')
    plt.legend(loc = 2, prop = {'size': 9})

6.2.1 增長率

with sns.color_palette(sns.diverging_palette(220, 280, s = 85, l = 25, n = 20)):
    for country in central_america:
        ts = time_series(data, country, 'total_pop')
        # Population as a percentage of the country's smallest recorded value (minimum = 100).
        ts['norm_pop'] = ts.total_pop / ts.total_pop.min() * 100
        plt.plot(ts['norm_pop'], label = country, lw = 1.5)
    # Labels do not depend on the country, so they are set once after the loop.
    # NOTE(review): the baseline is each series' minimum, which is only the 1960
    # value if the population never fell below it - confirm the wording.
    plt.xlabel('Year')
    plt.ylabel('Percent increase in population')
    plt.title('Percent increase in population from 1960 in Central American countries')
    plt.legend(loc = 'best', prop = {'size': 9})

6.2.2 增長率Heatmap

central_america_pop = variable_slice(data[data.region == 'Central America'], 'total_pop')
# Divide each country's row by that row's minimum, giving a ratio (1.0 = minimum).
central_america_norm_pop = central_america_pop.div(central_america_pop.min(axis = 1), axis = 0)
# Reorder the rows to follow `central_america`, the country list sorted by
# total population in the 1998-2002 slice.
central_america_norm_pop = central_america_norm_pop.loc[central_america]

fig, ax = plt.subplots(figsize = (12, 10))
sns.heatmap(central_america_norm_pop, ax=ax, cmap=sns.light_palette((214, 90, 60),
                                                                   input='husl', as_cmap=True))
plt.xticks(rotation = 60)
plt.xlabel('Time period')
# The row order comes from the 1998-2002 slice, and the plotted values are
# ratios rather than percentages; the labels now say so.
plt.ylabel('Country, ordered by population in 1998-2002')
plt.title("Population relative to each country's minimum")

6.3 可再生水資源

total renewable water resources

plot_hist(recent, 'total_renewable', bins=40, xlabel='Total renewable water resources ($10^9 m^3/yr$)',
         ylabel='Number of countries', title='Distribution of total renewable water resources, 2013-2017')

plot_hist(recent, 'total_renewable', bins=40, ylabel='Number of countries',
          xlabel='Total renewable water resources ($10^9 m^3/yr$)', logx = True,
          title='Distribution of total renewable water resources, 2013-2017')

# Total renewable water resources per Central American country and time period.
central_america_renew = variable_slice(data[data.region=='Central America'], 'total_renewable')

fig,ax = plt.subplots(figsize = (8, 6))
sns.heatmap(central_america_renew, ax=ax, cmap=sns.light_palette((214,90,60), input='husl', as_cmap=True))
plt.xticks(rotation=45)
plt.xlabel('Time period')
# The rows keep the order produced by variable_slice; no ordering by 1960
# values is applied here, and the values are not population figures, so the
# previous copy-pasted labels were misleading.
plt.ylabel('Country', fontsize = 14)
plt.title('Total renewable water resources in Central America over time')

6.4 Ipywidgets查看每一個單變量

def two_hist(df, variable, bins = 50, ylabel = 'Number of countries', title = None):
    """Return a figure with a linear and a log-scaled histogram of one variable.

    The left panel is drawn by ``plot_hist`` on a linear x axis and the right
    panel with ``logx=True``. ``title``, when given, is used for both panels;
    otherwise the panels are titled with the variable name and
    ``'Log of <variable>'``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (18, 8))
    ax1 = plot_hist(df, variable, bins = bins, xlabel = variable, ylabel = ylabel,
                   ax = ax1, title = title if title else variable)
    # 'Log of ' needs its trailing space, otherwise the label reads e.g. 'Log oftotal_pop'.
    log_label = 'Log of ' + variable
    ax2 = plot_hist(df, variable, bins = bins, xlabel = log_label,
                   ylabel = ylabel, logx = True, ax = ax2,
                   title = title if title else log_label)
    # Close the current pyplot figure; the Figure object itself is still returned.
    plt.close()
    return fig

import ipywidgets as widgets

def hist_over_var(df, variables, bins = 50, ylabel = 'Number of countries', title = None):
    """Show an interactive ``two_hist`` with a dropdown to choose the variable.

    ``variables`` is the collection of column names offered in the dropdown;
    its first element is preselected. ``df``, ``bins``, ``ylabel`` and
    ``title`` are forwarded unchanged to ``two_hist``.
    """
    options = list(variables)
    # The stray, misspelt `botton_style` keyword was dropped from the Dropdown call.
    variable_slider = widgets.Dropdown(options = options, value = options[0],
                                      description = 'Variable:', disabled = False)
    # The keyword must be `ylabel` to match two_hist's signature; the previous
    # `y_label` never reached that parameter.
    widgets.interact(two_hist, df = widgets.fixed(df), variable = variable_slider,
                    ylabel = widgets.fixed(ylabel), title = widgets.fixed(title), bins = widgets.fixed(bins))
    
hist_over_var(recent, recent.columns, bins = 40)

七. 變量關係可視化展示

7.1 Scatter plots

# seasonal_variability 季節變化（WRI）

plt.scatter(recent.seasonal_variability, recent.gdp_per_capita)
plt.xlabel('Seasonal variability')
plt.ylabel('GDP per capita ($USD/person)')

# 模板

def plot_scatter(df, x, y, xlabel = None, ylabel = None, title = None,
                logx = False, logy = False, by = None, ax = None):
    """Scatter ``df[y]`` against ``df[x]`` and return the axes used.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    x, y : column names for the horizontal and vertical axes.
    xlabel, ylabel : axis labels; default to the column names.
    title : optional axes title.
    logx, logy : use a log scale on the respective axis.
    by : optional column (or columns) to group by; each group is drawn in
        its own colour and listed in a legend.
    ax : axes to draw on; a new 8x6 figure is created when omitted.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize = (8, 6))

    colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']
    if by:
        for j, (name, group) in enumerate(df.groupby(by)):
            # Wrap around the colour cycle so more groups than colours
            # no longer raises IndexError.
            ax.scatter(group[x], group[y], color = colors[j % len(colors)], label = name)
        ax.legend()
    else:
        ax.scatter(df[x], df[y], color = colors[0])

    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')
    if title:
        ax.set_title(title)

    ax.set_xlabel(xlabel if xlabel else x)
    ax.set_ylabel(ylabel if ylabel else y)
    return ax
    
plot_scatter(recent, 'total_renewable', 'gdp_per_capita')

7.2 Joint plot

# Observed [min, max] of both variables, reused for the histogram ranges and the y-axis limits.
svr = [recent.seasonal_variability.min(), recent.seasonal_variability.max()]
gdpr = [recent.gdp_per_capita.min(), recent.gdp_per_capita.max()]
# 25 logarithmically spaced bin edges spanning the GDP-per-capita range.
gdpbins = np.logspace(*np.log10(gdpr), 25)

# sns.set(style = 'ticks', color_codes = True)

# The grid is assembled by hand so that each marginal histogram gets its own range and bins.
g = sns.JointGrid(x='seasonal_variability', y='gdp_per_capita', data=recent, ylim=gdpr)
g.ax_marg_x.hist(recent.seasonal_variability, range = svr)
g.ax_marg_y.hist(recent.gdp_per_capita, range=gdpr, bins=gdpbins, orientation='horizontal')
# Central panel: hexagonal binning of the two variables.
g.plot_joint(plt.hexbin, gridsize = 40)
ax = g.ax_joint
# ax.set_yscale('log')
g.fig.set_figheight(8)
g.fig.set_figwidth(10)

7.3 Correlation

recent_corr = recent.corr().loc['gdp_per_capita'].drop(['gdp', 'gdp_per_capita'])

def conditional_bar(series, bar_colors = None, color_labels = None, figsize = (14, 24),
                   xlabel = None, by = None, ylabel = None, title = None):
    """Return a horizontal bar chart of ``series`` with per-bar colours.

    One bar is drawn per entry of ``series``, labelled with its index value.
    ``bar_colors`` is passed to ``barh``; when empty or omitted, the first
    colour of the current property cycle is used. ``color_labels`` maps a
    colour to a legend label and, when given, produces a legend in the upper
    right corner. ``by`` is accepted but not used.
    """
    fig, ax = plt.subplots(figsize = figsize)
    if not bar_colors:
        bar_colors = mpl.rcParams['axes.prop_cycle'].by_key()['color'][0]

    n_bars = len(series)
    positions = range(n_bars)
    plt.barh(positions, series.values, color = bar_colors)
    plt.xlabel(xlabel if xlabel else '')
    plt.ylabel(ylabel if ylabel else '')
    plt.yticks(positions, series.index.tolist())
    plt.title(title if title else '')
    plt.ylim([-1, n_bars])

    if color_labels:
        # Empty square-marker artists exist only to provide legend entries.
        for swatch_color, swatch_label in color_labels.items():
            plt.plot([], linestyle = '', marker = 's', c = swatch_color, label = swatch_label)
        handles, texts = ax.get_legend_handles_labels()
        n_swatches = len(color_labels)
        ax.legend(handles[-n_swatches:], texts[-n_swatches:], loc = 'upper right')

    # Close the current pyplot figure; the Figure object itself is still returned.
    plt.close()
    return fig

bar_colors = ['#0055A7' if x else '#2C3E4F' for x in list(recent_corr.values < 0)]
color_labels = {'#0055A7': 'Negative correlation', '#2C3E4F': 'Positive correlation'}

conditional_bar(recent_corr.apply(np.abs), bar_colors, color_labels, xlabel = '|Correlation|',
               title = 'Magnitude of correlation with GDP per capita, 2013-2017')

7.4 標籤 binned

7.4.1 binned之前

plot_hist(recent, 'gdp_per_capita', xlabel = 'GDP per capita ($)', ylabel = 'Number of countries',
         title = 'Distribution of GDP per capita, 2013-2017')

plot_hist(recent, 'gdp_per_capita', xlabel = 'GDP per capita ($)',logx = True, bins = 30,
        ylabel = 'Number of countries', title = 'Distribution of GDP per capita, 2013-2017')

7.4.2 binned之後

capita_bins = ['Very low', 'Low', 'Medium', 'High', 'Very high']
recent['gdp_bin'] = pd.qcut(recent.gdp_per_capita, 5, capita_bins)
bin_ranges = pd.qcut(recent.gdp_per_capita, 5).unique()

def plot_hist(df, variable, bins = None, xlabel = None, by = None,
             ylabel = None, title = None, logx = False, ax = None):
    """Draw a histogram of ``df[variable]``, optionally one per group.

    Parameters
    ----------
    df : DataFrame holding the column to plot.
    variable : name of the column to plot; missing values are ignored.
    bins : passed to ``ax.hist``. With ``logx=True`` it is the number of
        logarithmically spaced bin edges to generate (30 when omitted).
    xlabel, ylabel, title : optional axis labels and title.
    by : optional column name; one histogram is drawn per distinct value.
        For a categorical column the groups follow the category order.
    logx : when true, use log-spaced bins and a log-scaled x axis.
    ax : axes to draw on; a new 8x6 figure is created when omitted.

    Returns the axes that were drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize = (8, 6))
    if logx:
        # np.logspace needs an integer count; fall back to 30 edges when
        # the caller did not specify `bins`.
        n_edges = 30 if bins is None else bins
        bins = np.logspace(np.log10(df[variable].min()), np.log10(df[variable].max()), n_edges)
        ax.set_xscale('log')

    if by:
        groups = df[by].unique()
        if isinstance(groups, pd.Categorical):
            cats = groups.categories.tolist()
        else:
            # `.tolist` must be called; the bare attribute is a method
            # object and cannot be iterated.
            cats = groups.tolist()
        for cat in cats:
            to_plot = df[df[by] == cat][variable].dropna()
            ax.hist(to_plot, bins = bins)
    else:
        ax.hist(df[variable].dropna().values, bins = bins)

    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    return ax

plot_hist(recent, 'gdp_per_capita', xlabel = 'GDP per capita ($)', logx = True,
         ylabel = 'Number of countries', bins = 25, by = 'gdp_bin',
         title = 'Distribution of log GDP per capita, 2013-2017')

7.4.3 Box plot

# Box plot of drinking-water access for each GDP-per-capita bin.
recent[['gdp_bin','total_pop_access_drinking']].boxplot(by = 'gdp_bin')
plt.title('Distribution of percent of total population with access to drinking water across gdp per capita categories')
plt.xlabel('GDP per capita quintile')
# The y axis shows the access percentage named in the title, not the
# population size the previous label claimed.
plt.ylabel('Percent of total population with access to drinking water')

def mult_boxplots(df, variable, category, xlabel = None, ylabel = None, title = None, ylim = None):
    """Draw box plots of ``variable`` grouped by ``category``.

    The plot is produced by the DataFrame ``boxplot`` method on the two
    selected columns. ``xlabel``, ``ylabel``, ``title`` and ``ylim`` are
    applied through pyplot only when they are given (truthy).
    """
    df[[variable, category]].boxplot(by = category)

    # Pair each pyplot setter with its optional argument, applied in order.
    optional_settings = (
        (plt.xlabel, xlabel),
        (plt.ylabel, ylabel),
        (plt.title, title),
        (plt.ylim, ylim),
    )
    for apply_setting, setting in optional_settings:
        if setting:
            apply_setting(setting)
        
mult_boxplots(recent, 'flood_occurence', 'gdp_bin', xlabel = 'GDP per capita quintile')

EDA探索性數據分析 -- 聯合國糧農組織: 水資源

目錄:

一. 認識數據

1.1 讀取數據

1.2 缺失值狀況

二. 切片分析

2.1 time slicing

2.2 country slicing

2.3 variable slicing

2.4 country and variable slicing

2.5 region slicing

三. 缺失值

3.1 查看缺失值

3.2 剔除缺失值多的特徵

3.2.1 水資源總量

3.2.2 降水指數

3.2.3 Central America

四. 單特徵

4.1 Folium可視化

4.2 Heatmap

4.3 Pivottablejs 與 pandas_profiling

五. 數據對數變換

5.1 峯度與偏度

5.2 Log transform

六. 數據分析維度

6.1 One country 人口

6.2 One region 人口

6.2.1 增長率

6.2.2 增長率Heatmap

6.3 可再生水資源

6.4 Ipywidgets查看每一個單變量

七. 變量關係可視化展示

7.1 Scatter plots

7.2 Joint plot

7.3 Correlation

7.4 標籤 binned

7.4.1 binned之前

7.4.2 binned之後

7.4.3 Box plot