# Import libraries
import seaborn as sns
import pandas as pd
import numpy as np

# Jupyter notebook: show the output of every expression in a cell,
# not only the last one
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'
1. 基礎熱圖繪製 Basic Heatmap plot
普通熱圖 Basic Heatmap
相關矩陣熱圖 Correlation matrix
相關矩陣半熱圖 a half heatmap of correlation matrix
多數據熱力圖製作 Basic Heatmap of long format data
# Basic heatmap
# Create a (fake) dataset: a 5-row by 5-column matrix of random values
df = pd.DataFrame(
    np.random.random((5, 5)),
    columns=list("abcde"),
)
# Display the data
df
# Default heatmap: just a visualization of this square matrix
p1 = sns.heatmap(df)
a
b
c
d
e
0
0.260319
0.749665
0.534837
0.077599
0.645868
1
0.455260
0.088954
0.876201
0.468024
0.679460
2
0.422090
0.029897
0.652491
0.492516
0.112680
3
0.016669
0.979161
0.274547
0.093439
0.965549
4
0.039159
0.851814
0.794167
0.796855
0.109723
# Correlation matrix heatmap
# A common task is to check whether some variables are correlated: compute
# the correlation between every pair of variables and plot it as a heatmap
# to spot which variables are related to each other.
# Create a (fake) dataset: 100 rows, 5 columns
df = pd.DataFrame(
    np.random.random((100, 5)),
    columns=list("abcde"),
)
df.head()
# Calculate the correlation between each pair of variables
corr_matrix = df.corr()
# Display the correlation coefficients
corr_matrix
# Plot it; cmap selects the colour palette
sns.heatmap(corr_matrix, cmap='PuOr')
a
b
c
d
e
0
0.447492
0.083233
0.054378
0.528246
0.839064
1
0.966619
0.718003
0.584444
0.454353
0.319515
2
0.165938
0.500661
0.221050
0.304151
0.470321
3
0.012819
0.206002
0.317296
0.998902
0.546637
4
0.168106
0.935917
0.081234
0.652118
0.988459
a
b
c
d
e
a
1.000000
0.062998
0.219805
0.095833
0.160799
b
0.062998
1.000000
0.173022
0.040480
-0.101984
c
0.219805
0.173022
1.000000
-0.049702
-0.066863
d
0.095833
0.040480
-0.049702
1.000000
0.179716
e
0.160799
-0.101984
-0.066863
0.179716
1.000000
<matplotlib.axes._subplots.AxesSubplot at 0x17a4cc715c0>
# Half heatmap of a correlation matrix
# Create a (fake) dataset
df = pd.DataFrame(np.random.random((100, 5)), columns=["a", "b", "c", "d", "e"])
# Calculate the correlation between each pair of variables
corr_matrix = df.corr()
# It can be useful to plot only half of the matrix: start from an all-False
# mask with the same shape as corr_matrix
mask = np.zeros_like(corr_matrix, dtype=bool)
# np.triu_indices_from(mask) returns the indices of the upper triangle
indices = np.triu_indices_from(mask)
# Display the indices
indices
# Flag the upper triangle (diagonal included) as masked
mask[indices] = True
with sns.axes_style("white"):
    # Cells where mask is True are not drawn; square=True makes every cell square
    p2 = sns.heatmap(corr_matrix, mask=mask, square=True)
# Axis display: remove X or Y labels
# xticklabels / yticklabels control the axis tick labels, cbar controls
# whether the colour bar is drawn
sns.heatmap(df, cbar=False, yticklabels=False);
# Hide a few axis labels to avoid overlapping:
# with an integer xticklabels, only every n-th label is shown
sns.heatmap(df, xticklabels=3);
# Coordinate range of the colour bar, set through vmin / vmax
sns.heatmap(
    df,
    vmin=0,
    vmax=0.5,
);
3. 熱圖上使用標準化 Use normalization on heatmap
列的規範化 Column normalization
行的規範化 Row normalization
# Column normalization
# Sometimes one column holds values far larger than the others, which pushes
# the heatmap colours towards two extremes; the columns then need normalizing.
# Create a dataframe where the average value of the second column is higher
df = pd.DataFrame(np.random.randn(10, 10) * 4 + 3)
# Shift the column labelled 1 (the second column) well above the others
df[1] = df[1] + 40
# Heatmap without normalization: we just observe that one column has higher
# values than the others
sns.heatmap(df, cmap='viridis');
# Now normalize by column: subtract each column's mean, divide by its std
col_mean = df.mean()
col_std = df.std()
df_norm_col = (df - col_mean) / col_std
sns.heatmap(df_norm_col, cmap='viridis');
# Row normalization
# The same principle as column normalization applies to rows.
# Create a dataframe where the average value of one row is higher
df = pd.DataFrame(np.random.randn(10, 10) * 4 + 3)
# Shift the row at position 2 (the third row) well above the others
df.iloc[2] = df.iloc[2] + 40
# Heatmap without normalization: we just observe that one row has higher
# values than the others
sns.heatmap(df, cmap='viridis');
# Normalize by row: 1) subtract each row's mean, 2) divide by each row's
# standard deviation
df_norm_row = df.sub(df.mean(axis=1), axis=0).div(df.std(axis=1), axis=0)
# And see the result
sns.heatmap(df_norm_row, cmap='viridis');
4. 樹狀圖與熱圖 Dendrogram with heatmap
基礎樹狀圖與熱圖繪製 Dendrogram with heat map and coloured leaves
樹形圖與熱圖規範化 normalize of Dendrogram with heatmap
樹形圖與熱圖距離參數設定 distance of Dendrogram with heatmap
樹形圖與熱圖聚類方法參數設定 cluster method of Dendrogram with heatmap
# Basic dendrogram with heatmap and coloured leaves
from matplotlib import pyplot as plt
import pandas as pd

# The mtcars dataset describes the performance of several cars through a set
# of numeric variables.
# Download the mtcars dataset
url = 'https://python-graph-gallery.com/wp-content/uploads/mtcars.csv'
df = pd.read_csv(url)
# Index by car model: columns are the performance variables, rows are the models
df = df.set_index('model')
df.head()
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
model
Mazda RX4
21.0
6
160.0
110
3.90
2.620
16.46
0
1
4
4
Mazda RX4 Wag
21.0
6
160.0
110
3.90
2.875
17.02
0
1
4
4
Datsun 710
22.8
4
108.0
93
3.85
2.320
18.61
1
1
4
1
Hornet 4 Drive
21.4
6
258.0
110
3.08
3.215
19.44
1
0
3
1
Hornet Sportabout
18.7
8
360.0
175
3.15
3.440
17.02
0
0
3
2
# Prepare a vector of colours mapped to the 'cyl' column:
# each distinct cylinder count (6, 4, 8) gets its own colour
my_palette = {
    cyl: colour
    for cyl, colour in zip(df.cyl.unique(), ["orange", "yellow", "brown"])
}
my_palette
# Colour of each car according to its cylinder count
row_colors = df.cyl.map(my_palette)
row_colors
# metric: distance measure for the data; method: linkage method for clustering
# standard_scale: dimension to standardize (0: rows, 1: columns), i.e. for each
# row or column subtract the minimum and divide by the maximum
sns.clustermap(
    df,
    metric="correlation",
    method="single",
    cmap="Blues",
    standard_scale=1,
    row_colors=row_colors,
)
{6: 'orange', 4: 'yellow', 8: 'brown'}
model
Mazda RX4 orange
Mazda RX4 Wag orange
Datsun 710 yellow
Hornet 4 Drive orange
Hornet Sportabout brown
Valiant orange
Duster 360 brown
Merc 240D yellow
Merc 230 yellow
Merc 280 orange
Merc 280C orange
Merc 450SE brown
Merc 450SL brown
Merc 450SLC brown
Cadillac Fleetwood brown
Lincoln Continental brown
Chrysler Imperial brown
Fiat 128 yellow
Honda Civic yellow
Toyota Corolla yellow
Toyota Corona yellow
Dodge Challenger brown
AMC Javelin brown
Camaro Z28 brown
Pontiac Firebird brown
Fiat X1-9 yellow
Porsche 914-2 yellow
Lotus Europa yellow
Ford Pantera L brown
Ferrari Dino orange
Maserati Bora brown
Volvo 142E yellow
Name: cyl, dtype: object
# Normalizing a dendrogram with heatmap
# Standardize or normalize every column in the figure
# standard_scale=1: rescale each column (subtract its minimum, divide by its maximum)
sns.clustermap(data=df, standard_scale=1)
# z_score=1: z-score each column (subtract its mean, divide by its standard deviation)
sns.clustermap(data=df, z_score=1)
# Distance parameter of a dendrogram with heatmap
# Correlation-based distance (similarity)
sns.clustermap(
    df,
    standard_scale=1,
    metric="correlation",
)
# Euclidean distance
sns.clustermap(
    df,
    standard_scale=1,
    metric="euclidean",
)
# Clustering-method parameter of a dendrogram with heatmap
# Single-linkage algorithm
sns.clustermap(
    df,
    method="single",
    metric="euclidean",
    standard_scale=1,
)
# Ward clustering (the method this tutorial recommends)
sns.clustermap(
    df,
    method="ward",
    metric="euclidean",
    standard_scale=1,
)
# Outlier handling
# Create an outlier in the dataset by overwriting the cell at row position 15,
# column position 5
df.iloc[15, 5] = 1000
# robust=True: use the outlier-tolerant setting
# NOTE(review): presumably this affects how the colour range is computed — confirm in the seaborn docs
sns.clustermap(data=df, robust=True)
# robust=False: do not use it, the outlier is not ignored
sns.clustermap(data=df, robust=False)