目錄:
一. 認識數據
1.1 讀取數據
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# import cycler
# Notebook-wide matplotlib overrides (font sizes, default figure size, label/line/text colours).
mpl_update = {'font.size': 16, 'xtick.labelsize': 14, 'ytick.labelsize': 14, 'figure.figsize': [12, 8],
# 'axes.prop_cycle': cycler('color',['#0055A7', '#2C3E4F', '#26C5ED', '#00cc66', '#D34100', '#FF9700','#091D32']),
'axes.labelsize': 20, 'axes.labelcolor': '#677385', 'axes.titlesize': 20,
'lines.color': '#0055A7', 'lines.linewidth': 3, 'text.color': '#677385'}
# Apply the overrides to the global rcParams so every later figure picks them up.
mpl.rcParams.update(mpl_update)
# Load the gzip-compressed CSV into a DataFrame.
data = pd.read_csv('aquastat.csv.gzip', compression='gzip')
# Show (rows, columns), then preview the first rows.
print(data.shape)
# data.info()
data.head()
(143280, 7)
print(data.variable.nunique())
print(data.variable_full.nunique())
data[['variable', 'variable_full']].drop_duplicates()
60
60
print(data.country.nunique())
countries = data.country.unique()
print(data.year_measured.nunique())
print(data.time_period.nunique())
time_periods = data.time_period.unique()
time_periods
199
56
12
1.2 缺失值狀況
data[data.variable == 'total_area'].value.isnull().sum()
220
import missingno as msno
msno.matrix(data, figsize = (16, 6))
二. 切片分析
- 橫截面:一個時期內所有國家
- 時間序列:一個國家隨着時間的推移
- 面板數據:所有國家隨着時間的推移(作爲數據給出)
- 地理空間:所有地理上相互聯繫的國家
2.1 time slicing
def time_slice(df, time_period):
    """Return a country-by-variable table for one time period.

    Rows of ``df`` whose ``time_period`` column equals ``time_period`` are
    pivoted so that ``country`` becomes the index, ``variable`` the columns
    and ``value`` the cell contents. The columns axis is named after the
    requested period.
    """
    in_period = df.loc[df['time_period'] == time_period]
    wide = in_period.pivot(index='country', columns='variable', values='value')
    # Label the columns axis with the period so the table is self-describing.
    return wide.rename_axis(time_period, axis='columns')
time_slice(data, time_periods[0]).head()
2.2 country slicing
def country_slice(df, country):
    """Return a variable-by-time-period table for one country.

    Rows of ``df`` whose ``country`` column equals ``country`` are pivoted so
    that ``variable`` becomes the index, ``time_period`` the columns and
    ``value`` the cell contents. The index axis is named after the country.
    """
    one_country = df.loc[df['country'] == country]
    wide = one_country.pivot(index='variable', columns='time_period', values='value')
    # Replace the index name ('variable') with the country being shown.
    return wide.rename_axis(country, axis='index')
country_slice(data, countries[20]).head()
2.3 variable slicing
def variable_slice(df, variable):
    """Return a country-by-time-period table for one variable.

    Rows of ``df`` whose ``variable`` column equals ``variable`` are pivoted
    so that ``country`` becomes the index, ``time_period`` the columns and
    ``value`` the cell contents. The index axis is named after the variable.
    """
    one_variable = df.loc[df['variable'] == variable]
    wide = one_variable.pivot(index='country', columns='time_period', values='value')
    # Replace the index name ('country') with the variable being shown.
    return wide.rename_axis(variable, axis='index')
variable_slice(data, 'rural_pop').head()
2.4 country and variable slicing
def time_series(df, country, variable):
    """Return the yearly values of one variable for one country.

    Parameters
    ----------
    df : DataFrame with ``country``, ``variable``, ``year_measured`` and
        ``value`` columns.
    country : value matched against ``df.country``.
    variable : value matched against ``df.variable``; also used as the name
        of the single output column.

    Returns
    -------
    DataFrame indexed by integer ``year_measured`` with one column named
    ``variable``. Observations missing either the year or the value are
    dropped.
    """
    selected = (df.country == country) & (df.variable == variable)
    # Keep only the two needed columns *before* dropping NaNs, so a missing
    # entry in an unrelated column cannot discard a valid observation.
    series = df.loc[selected, ['year_measured', 'value']].dropna()
    # dropna() returned a new frame, so this assignment never touches ``df``.
    series['year_measured'] = series['year_measured'].astype(int)
    series = series.set_index('year_measured')
    series.columns = [variable]
    return series
time_series(data, 'China', 'total_pop')
2.5 region slicing
# Collapse the pipe-delimited region labels into a small set of region names.
simple_regions = {'World | Asia':'Asia',
                  'Americas | Central America and Caribbean | Central America': 'Central America',
                  'Americas | Central America and Caribbean | Greater Antilles': 'Central America',
                  'Americas | Central America and Caribbean | Lesser Antilles and Bahamas': 'Central America',
                  'Americas | Northern America | Northern America': 'North America',
                  'Americas | Northern America | Mexico': 'North America',
                  'Americas | Southern America | Guyana':'South America',
                  'Americas | Southern America | Andean':'South America',
                  'Americas | Southern America | Brazil':'South America',
                  'Americas | Southern America | Southern America':'South America',
                  'World | Africa':'Africa',
                  'World | Europe':'Europe',
                  'World | Oceania':'Oceania'}
# Fall back to the existing label when it is not a key of the mapping: the
# values are already simplified when this cell is re-run, and a plain
# ``simple_regions[x]`` lookup would then raise KeyError.
data['region'] = data['region'].map(lambda label: simple_regions.get(label, label))
print(data.region.nunique())
data.region.unique()
三. 缺失值
3.1 查看缺失值
recent = time_slice(data, '2013-2017')
msno.matrix(recent, labels=True)
3.2 剔除缺失值多的特徵
3.2.1 水資源總量
( Total exploitable water resources )
msno.matrix(variable_slice(data, 'exploitable_total'), inline=False, sort='descending')
plt.xlabel('Time period'); plt.ylabel('Country')
plt.title('Missing total exploitable water resources data across countries and time periods')
data = data.loc[~data.variable.str.contains('exploitable'), :]
3.2.2 降水指數
national_rainfall_index 全國降水指數(NRI)(毫米/年)
msno.matrix(variable_slice(data, 'national_rainfall_index'), inline=False, sort='descending')
plt.xlabel('Time period'); plt.ylabel('Country')
plt.title('Missing national rainfall index data across countries and time periods')
data = data.loc[~(data.variable == 'national_rainfall_index')]
3.2.3 Central America
# Completeness of each indicator for the Central America region
central_america = data[data.region == 'Central America']
msno.matrix(msno.nullity_sort(time_slice(central_america, '2013-2017'), sort='descending').T)
# Spot-check which data the Bahamas is missing to get a better understanding
msno.nullity_filter(country_slice(data, 'Bahamas').T, filter = 'bottom', p = 0.1)
四. 單特徵
4.1 Folium 可視化
# Path to the GeoJSON file that is passed to the choropleth as geo_data.
geo = r'world.json'
import folium
null_data = recent['agg_to_gdp'].notnull() * 1 # turn the booleans into 0/1 values
# NOTE(review): the name `map` shadows the Python builtin for the rest of the session.
map = folium.Map(location = [48, -102], zoom_start = 2)
# Colour each country by whether agg_to_gdp is present (1) or missing (0).
map.choropleth(geo_data = geo, data = null_data, columns = ['country', 'agg_to_gdp'],
key_on = 'feature.properties.name', reset = True, fill_color = 'GnBu',
fill_opacity = 0.8, line_opacity = 0.2, legend_name = 'Missing agricultural contribution to GDP data 2013-2017')
map
#模板
def plot_null_map(df, time_period, variable, legend_name = None):
    """Build a folium choropleth showing where ``variable`` has data.

    The table for ``time_period`` is taken from ``time_slice``; ``variable``
    is replaced by 1 where a value is present and 0 where it is missing, and
    that indicator is drawn per country. ``legend_name`` falls back to
    ``variable`` when it is not given. Returns the folium map.
    """
    world_geo = r'world.json'
    period_table = time_slice(df, time_period).reset_index().copy()
    # Presence indicator: 1 = value reported, 0 = missing.
    has_value = period_table[variable].notnull()
    period_table[variable] = has_value * 1
    legend = legend_name if legend_name else variable
    null_map = folium.Map(location=[39, 118], zoom_start=2)
    null_map.choropleth(
        geo_data=world_geo,
        data=period_table,
        columns=['country', variable],
        key_on='feature.properties.name',
        reset=True,
        fill_color='GnBu',
        fill_opacity=1,
        line_opacity=0.2,
        legend_name=legend,
    )
    return null_map
plot_null_map(data, '2013-2017', 'number_undernourished', 'Number undernourished is missing')
4.2 Heatmap
fig, ax = plt.subplots(figsize = (14, 12))
# Count the non-null values per (time_period, variable); after unstack + transpose
# the rows are variables and the columns are time periods.
sns.heatmap(data.groupby(['time_period', 'variable']).value.count().unstack().T, ax = ax)
plt.xticks(rotation = 45)
plt.title('Number of countries with data reported for each variable over time')
4.3 Pivottablejs 與 pandas_profiling
# Interactive pivot table over the 2013-2017 slice.
# The module is spelled ``pivottablejs``; the earlier ``pivotablejs`` import
# could not provide the name used on the next line.
import pivottablejs
pivottablejs.pivot_ui(time_slice(data, '2013-2017'))
# Automatic profiling report for the same slice (closing parenthesis restored).
import pandas_profiling
pandas_profiling.ProfileReport(time_slice(data, '2013-2017'))
五. 數據對數變換
For numerical data, look at:
- Location: 均值,中位數,眾數,四分位數
- Spread: 標準差、方差、全距、四分位距
- Shape: 偏度、峯度
recent[['total_pop', 'urban_pop', 'rural_pop']].describe().astype(int)
recent.sort_values('rural_pop')[['total_pop', 'urban_pop', 'rural_pop']].head()
time_series(data, 'Qatar', 'total_pop').join(time_series(data, 'Qatar', 'urban_pop')).join(time_series(data, 'Qatar', 'rural_pop'))
5.1 峯度與偏度
# Import the submodule explicitly: a bare ``import scipy`` does not guarantee
# that ``scipy.stats`` is loaded as an attribute.
import scipy.stats
# Skewness and kurtosis of each population column.
recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.skew)
recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.kurtosis)
sns.set_style('darkgrid')
plt.figure(figsize = (8, 6))
plt.hist(recent.total_pop.values, bins = 50)
plt.xlabel('Total population')
plt.ylabel('Number of countries')
plt.title('Distribution of population of countries 2013-2017')
5.2 Log transform
print(recent[['total_pop']].apply(np.log).apply(scipy.stats.skew))
recent[['total_pop']].apply(np.log).apply(scipy.stats.kurtosis)
# 模板
def plot_hist(df, variable, bins = 30, xlabel = None, by = None,
              ylabel = None, title = None,logx = False, ax = None):
    """Draw a histogram of ``df[variable]`` and return the axes.

    Parameters
    ----------
    df : DataFrame holding the column to plot. It is never modified.
    variable : name of the column to plot; NaNs are ignored.
    bins : passed to ``ax.hist``. With ``logx=True`` it is the number of
        logarithmically spaced bin edges.
    xlabel, ylabel, title : optional axis labels / title.
    by : accepted for interface compatibility; unused here.
    logx : plot on a log-scaled x axis. If the data contains values <= 0 they
        are shifted so the minimum becomes 1, and a warning is printed.
    ax : axes to draw on; a new 8x6 figure is created when omitted.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize = (8, 6))
    values = df[variable].dropna()
    if logx:
        lowest = values.min()
        if lowest <= 0:
            # Shift a local copy only; compute the offset before applying it so
            # the warning reports the real shift and the caller's frame is
            # left untouched.
            shift = 1 - lowest
            values = values + shift
            print('Warning: data <= 0 exists, data transformed by %0.2g before plotting' % shift)
        bins = np.logspace(np.log10(values.min()), np.log10(values.max()), bins)
        ax.set_xscale('log')
    ax.hist(values.values, bins = bins)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    return ax
# Histogram of total population on a log-scaled x axis (title period corrected to 2013-2017).
plot_hist(recent, 'total_pop', logx = True, xlabel = 'Log of total population',
          ylabel='Number of countries', title='Distribution of total population of countries 2013-2017')
- Normalization
recent['population_density'] = recent.total_pop.divide(recent.total_area)
recent['population_density']
六. 數據分析維度
6.1 One country 人口
# 古巴
plt.plot(time_series(data, 'Cuba', 'total_pop'))
plt.xlabel('Year')
plt.ylabel('Population')
plt.title('Cuba population over time')
6.2 One region 人口
with sns.color_palette(sns.diverging_palette(220, 280, s = 85, l = 25, n = 20)):
    # Central American countries sorted by total_pop in the 1998-2002 slice.
    central_america = time_slice(data[data.region == 'Central America'], '1998-2002').sort_values('total_pop').index.tolist()
    for country in central_america:
        plt.plot(time_series(data, country, 'total_pop'), label = country, lw = 1.5)
    # Labels are set once, after all lines are drawn.
    plt.xlabel('Year')
    plt.ylabel('Population')
    # The plotted region is Central America, so the title names that region.
    plt.title('Central American populations over time')
    plt.legend(loc = 2, prop = {'size': 9})
6.2.1 增長率
with sns.color_palette(sns.diverging_palette(220, 280, s = 85, l = 25, n = 20)):
    for country in central_america:
        ts = time_series(data, country, 'total_pop')
        # Population expressed relative to the country's own minimum (minimum = 100).
        smallest = ts.total_pop.min()
        ts['norm_pop'] = ts.total_pop.div(smallest).mul(100)
        plt.plot(ts['norm_pop'], label = country, lw = 1.5)
    plt.xlabel('Year')
    plt.ylabel('Percent increase in population')
    plt.title('Percent increase in population from 1960 in Central American countries')
    plt.legend(loc = 'best', prop = {'size': 9})
6.2.2 增長率Heatmap
# Country-by-time-period table of total_pop for the Central America region.
central_america_pop = variable_slice(data[data.region == 'Central America'], 'total_pop')
central_america_norm_pop = central_america_pop.div(central_america_pop.min(axis = 1), axis = 0)
# pandas.DataFrame.div with axis=0: each row (country) is divided by its own minimum
# NOTE(review): `central_america` is expected to be a list of country names here;
# its row order, not a 1960 ranking computed in this cell, decides the heatmap order.
central_america_norm_pop = central_america_norm_pop.loc[central_america]
fig, ax = plt.subplots(figsize = (12, 10))
sns.heatmap(central_america_norm_pop, ax=ax, cmap=sns.light_palette((214, 90, 60),
input='husl', as_cmap=True))
plt.xticks(rotation = 60)
plt.xlabel('Time period')
plt.ylabel('Country, ordered by population in 1960')
plt.title('Percent increase in population from 1960')
6.3 可再生水資源
total renewable water resources
plot_hist(recent, 'total_renewable', bins=40, xlabel='Total renewable water resources ($10^9 m^3/yr$)',
ylabel='Number of countries', title='Distribution of total renewable water resources, 2013-2017')
plot_hist(recent, 'total_renewable', bins=40, ylabel='Number of countries',
xlabel='Total renewable water resources ($10^9 m^3/yr$)', logx = True,
title='Distribution of total renewable water resources, 2013-2017')
# Country-by-time-period table of total renewable water resources for Central America.
central_america_renew = variable_slice(data[data.region=='Central America'], 'total_renewable')
fig,ax = plt.subplots(figsize = (8, 6))
sns.heatmap(central_america_renew, ax=ax, cmap=sns.light_palette((214,90,60), input='husl', as_cmap=True))
plt.xticks(rotation=45)
plt.xlabel('Time period')
# Rows are shown in the order the pivot returns them; no ranking is applied,
# so the label no longer claims one.
plt.ylabel('Country', fontsize = 14)
# The heatmap shows the raw values per period, not a population increase.
plt.title('Total renewable water resources in Central America over time')
6.4 Ipywidgets查看每一個單變量
def two_hist(df, variable, bins = 50, ylabel = 'Number of countries', title = None):
    """Return a figure with a linear and a log-x histogram of ``variable``.

    The left panel is drawn by ``plot_hist`` on a linear axis, the right panel
    by ``plot_hist`` with ``logx=True``. ``title`` is used for both panels when
    given; otherwise the panels are titled with the variable name and
    'Log of <variable>' respectively. The figure is closed before being
    returned.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (18, 8))
    # Space after 'of' so the label reads 'Log of total_pop', not 'Log oftotal_pop'.
    log_label = 'Log of ' + variable
    plot_hist(df, variable, bins = bins, xlabel = variable, ylabel = ylabel,
              ax = ax1, title = title if title else variable)
    plot_hist(df, variable, bins = bins, xlabel = log_label, ylabel = ylabel,
              logx = True, ax = ax2, title = title if title else log_label)
    # Close the figure that was just created; the Figure object is still returned.
    plt.close(fig)
    return fig
import ipywidgets as widgets
def hist_over_var(df, variables, bins = 50, ylabel = 'Number of countries', title = None):
    """Show an interactive dropdown that redraws ``two_hist`` per variable.

    Parameters
    ----------
    df : DataFrame forwarded unchanged to ``two_hist``.
    variables : sequence of column names offered in the dropdown; the first
        one is selected initially.
    bins, ylabel, title : forwarded unchanged to ``two_hist``.
    """
    options = list(variables)
    variable_picker = widgets.Dropdown(options = options, value = options[0],
                                       description = 'Variable:', disabled = False)
    # interact() forwards each keyword to two_hist by name, so the keyword must
    # be ``ylabel`` (two_hist has no ``y_label`` parameter).
    widgets.interact(two_hist, df = widgets.fixed(df), variable = variable_picker,
                     ylabel = widgets.fixed(ylabel), title = widgets.fixed(title),
                     bins = widgets.fixed(bins))
hist_over_var(recent, recent.columns, bins = 40)
七. 變量關係可視化展示
7.1 Scatter plots
# seasonal_variability 季節變化(WRI)
plt.scatter(recent.seasonal_variability, recent.gdp_per_capita)
plt.xlabel('Seasonal variability')
plt.ylabel('GDP per capita ($USD/person)')
# 模板
def plot_scatter(df, x, y, xlabel = None, ylabel = None, title = None,
                 logx = False, logy = False, by = None, ax = None):
    """Scatter ``df[y]`` against ``df[x]`` and return the axes.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    x, y : column names for the horizontal / vertical axis.
    xlabel, ylabel : axis labels; default to the column names.
    title : optional axes title.
    logx, logy : switch the corresponding axis to a log scale.
    by : optional grouping key passed to ``df.groupby``; each group is drawn
        in its own colour and listed in a legend.
    ax : axes to draw on; a new 8x6 figure is created when omitted.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize = (8, 6))
    colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']
    if by is not None:
        for j, (name, group) in enumerate(df.groupby(by)):
            # Wrap around the colour cycle so more groups than colours
            # cannot raise IndexError.
            ax.scatter(group[x], group[y], color = colors[j % len(colors)], label = name)
        ax.legend()
    else:
        ax.scatter(df[x], df[y], color = colors[0])
    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')
    if title:
        ax.set_title(title)
    ax.set_xlabel(xlabel if xlabel else x)
    ax.set_ylabel(ylabel if ylabel else y)
    return ax
plot_scatter(recent, 'total_renewable', 'gdp_per_capita')
7.2 Joint plot
# Observed [min, max] of each variable; reused for the axis limit and histogram ranges.
svr = [recent.seasonal_variability.min(), recent.seasonal_variability.max()]
gdpr = [recent.gdp_per_capita.min(), recent.gdp_per_capita.max()]
# 25 logarithmically spaced bin edges spanning the GDP-per-capita range.
gdpbins = np.logspace(*np.log10(gdpr), 25)
# sns.set(style = 'ticks', color_codes = True)
g = sns.JointGrid(x='seasonal_variability', y='gdp_per_capita', data=recent, ylim=gdpr)
# Marginal histograms drawn by hand so the y margin can use the log-spaced bins.
g.ax_marg_x.hist(recent.seasonal_variability, range = svr)
g.ax_marg_y.hist(recent.gdp_per_capita, range=gdpr, bins=gdpbins, orientation='horizontal')
# Hexbin plot in the joint axes.
g.plot_joint(plt.hexbin, gridsize = 40)
ax = g.ax_joint
# ax.set_yscale('log')
g.fig.set_figheight(8)
g.fig.set_figwidth(10)
7.3 Correlation
recent_corr = recent.corr().loc['gdp_per_capita'].drop(['gdp', 'gdp_per_capita'])
def conditional_bar(series, bar_colors = None, color_labels = None, figsize = (14, 24),
                    xlabel = None, by = None, ylabel = None, title = None):
    """Return a horizontal bar chart of ``series`` with optional colour legend.

    One bar is drawn per entry of ``series``, labelled with its index value.
    ``bar_colors`` sets the bar colour(s); when falsy, the first colour of the
    current matplotlib colour cycle is used. ``color_labels`` maps a colour to
    the legend text that explains it. ``by`` is accepted but unused. The
    figure is closed before being returned.
    """
    fig, ax = plt.subplots(figsize = figsize)
    if not bar_colors:
        bar_colors = mpl.rcParams['axes.prop_cycle'].by_key()['color'][0]
    positions = range(len(series))
    ax.barh(positions, series.values, color = bar_colors)
    ax.set_xlabel(xlabel if xlabel else '')
    ax.set_ylabel(ylabel if ylabel else '')
    plt.yticks(positions, series.index.tolist())
    ax.set_title(title if title else '')
    ax.set_ylim([-1, len(series)])
    if color_labels:
        # Empty square-marker plots exist only to create one legend entry per colour.
        for swatch, text in color_labels.items():
            ax.plot([], linestyle = '', marker = 's', c = swatch, label = text)
        handles, texts = ax.get_legend_handles_labels()
        n_entries = len(color_labels)
        ax.legend(handles[-n_entries:], texts[-n_entries:], loc = 'upper right')
    plt.close()
    return fig
bar_colors = ['#0055A7' if x else '#2C3E4F' for x in list(recent_corr.values < 0)]
color_labels = {'#0055A7': 'Negative correlation', '#2C3E4F': 'Positive correlation'}
conditional_bar(recent_corr.apply(np.abs), bar_colors, color_labels, xlabel = '|Correlation|',
title = 'Magnitude of correlation with GDP per capita, 2013-2017')
7.4 標籤 binned
7.4.1 binned之前
plot_hist(recent, 'gdp_per_capita', xlabel = 'GDP per capita ($)', ylabel = 'Number of countries',
title = 'Distribution of GDP per capita, 2013-2017')
plot_hist(recent, 'gdp_per_capita', xlabel = 'GDP per capita ($)',logx = True, bins = 30,
ylabel = 'Number of countries', title = 'Distribution of GDP per capita, 2013-2017')
7.4.2 binned之後
capita_bins = ['Very low', 'Low', 'Medium', 'High', 'Very high']
recent['gdp_bin'] = pd.qcut(recent.gdp_per_capita, 5, capita_bins)
bin_ranges = pd.qcut(recent.gdp_per_capita, 5).unique()
def plot_hist(df, variable, bins = None, xlabel = None, by = None,
              ylabel = None, title = None, logx = False, ax = None):
    """Draw a histogram of ``df[variable]``, optionally one per group.

    Parameters
    ----------
    df : DataFrame holding the column(s) to plot.
    variable : name of the column to plot; NaNs are ignored.
    bins : passed to ``ax.hist``. With ``logx=True`` it is the number of
        logarithmically spaced bin edges (10 when omitted).
    xlabel, ylabel, title : optional axis labels / title.
    by : optional column name; one histogram is drawn per distinct value
        (per category for a categorical column), all on the same axes.
    logx : plot on a log-scaled x axis with log-spaced bins.
    ax : axes to draw on; a new 8x6 figure is created when omitted.

    Returns the axes that were drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize = (8, 6))
    if logx:
        # np.logspace needs an integer count; fall back to 10 when the caller
        # left ``bins`` at its None default.
        n_edges = 10 if bins is None else bins
        bins = np.logspace(np.log10(df[variable].min()), np.log10(df[variable].max()), n_edges)
        ax.set_xscale('log')
    if by is not None:
        levels = df[by].unique()
        if isinstance(levels, pd.Categorical):
            cats = levels.categories.tolist()
        else:
            # tolist must be *called*; the bare attribute is a bound method
            # and cannot be iterated.
            cats = levels.tolist()
        for cat in cats:
            to_plot = df[df[by] == cat][variable].dropna()
            ax.hist(to_plot, bins = bins)
    else:
        ax.hist(df[variable].dropna().values, bins = bins)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    return ax
plot_hist(recent, 'gdp_per_capita', xlabel = 'GDP per capita ($)', logx = True,
ylabel = 'Number of countries', bins = 25, by = 'gdp_bin',
title = 'Distribution of log GDP per capita, 2013-2017')
7.4.3 Box plot
# Box plot of total_pop_access_drinking for each GDP-per-capita bin.
recent[['gdp_bin','total_pop_access_drinking']].boxplot(by = 'gdp_bin')
plt.title('Distribution of percent of total population with access to drinking water across gdp per capita categories')
plt.xlabel('GDP per capita quintile')
# The y axis shows the drinking-water access share named in the title, not population counts.
plt.ylabel('Percent of total population with access to drinking water')
def mult_boxplots(df, variable, category, xlabel = None, ylabel = None, title = None, ylim = None):
    """Draw box plots of ``variable`` grouped by ``category``.

    After plotting, each of ``xlabel``, ``ylabel``, ``title`` and ``ylim`` is
    applied through the matching pyplot function when it is truthy. Nothing is
    returned.
    """
    df[[variable, category]].boxplot(by = category)
    # (pyplot setter, requested value) pairs, applied in a fixed order.
    decorations = (
        (plt.xlabel, xlabel),
        (plt.ylabel, ylabel),
        (plt.title, title),
        (plt.ylim, ylim),
    )
    for apply_setting, setting in decorations:
        if setting:
            apply_setting(setting)
mult_boxplots(recent, 'flood_occurence', 'gdp_bin', xlabel = 'GDP per capita quintile')