參考:https://blog.csdn.net/weixin_41666747/article/details/103359961
案例說明:數據集包括20個樣本數據,5個數據特徵(品牌,熱量,含鈉量,酒精量,成本),在聚類時只使用後4個數值特徵。
數據(beer_data.txt):
name calories sodium alcohol cost
Budweiser 144 15 4.7 0.43
Schlitz 151 19 4.9 0.43
Lowenbrau 157 15 0.9 0.48
Kronenbourg 170 7 5.2 0.73
Heineken 152 11 5.0 0.77
Old_Milwaukee 145 23 4.6 0.28
Augsberger 175 24 5.5 0.40
Srohs_Bohemian_Style 149 27 4.7 0.42
Miller_Lite 99 10 4.3 0.43
Budweiser_Light 113 8 3.7 0.40
Coors 140 18 4.6 0.44
Coors_Light 102 15 4.1 0.46
Michelob_Light 135 11 4.2 0.50
Becks 150 19 4.7 0.76
Kirin 149 6 5.0 0.79
Pabst_Extra_Light 68 15 2.3 0.38
Hamms 139 19 4.4 0.43
Heilemans_Old_Style 144 24 4.9 0.43
Olympia_Goled_Light 72 6 2.9 0.46
Schlitz_Light 97 7 4.2 0.47
導入包和數據
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import pandas as pd
# Read the space-separated beer table into a DataFrame; the file's first row
# (name calories sodium alcohol cost) supplies the column names.
beer = pd.read_csv("./beer_data.txt", delimiter=" ")
beer.head()  # preview the first rows (only rendered in a notebook/REPL)
選取特徵訓練:
# Select the four numeric features by name rather than by position, so that
# re-running this cell after the "cluster2"/"cluster3" columns have been added
# to `beer` does not feed earlier cluster labels back in as features.
X = beer[["calories", "sodium", "alcohol", "cost"]]

# K-Means clustering.
# random_state pins the centroid initialisation so the labels are reproducible
# between runs; n_init=10 is spelled out because the library default for it
# changed across scikit-learn releases.
km2 = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)  # k = 2
km3 = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)  # k = 3
print("當k=2時聚類結果:", km2.labels_)
print("當k=3時聚類結果:", km3.labels_)
# Output recorded in the original write-up (cluster ids are arbitrary, so the
# numbering may come out permuted with a different seed or library version):
#   k=2: [0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 1 1]
#   k=3: [0 0 0 0 0 0 0 0 2 2 0 2 0 0 0 1 0 0 1 2]

# Attach each model's labels to the table as new columns.
beer["cluster2"] = km2.labels_
beer["cluster3"] = km3.labels_
# Sort rows by the k=3 cluster id (ascending by default). The sorted frame is
# not assigned, so this only displays something in a notebook/REPL.
beer.sort_values("cluster3")
結果分析:
結果展示
# Per-cluster mean of every numeric column under the k=3 labelling; the
# calories/alcohol means are later plotted as the cluster centres.
# numeric_only=True skips the string "name" column, which pandas >= 2.0 would
# otherwise try to average and fail on with a TypeError.
centers = beer.groupby("cluster3").mean(numeric_only=True).reset_index()
print(centers)
# Visualise the clustering result (k=3).
# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting path
# was deprecated in pandas 0.20 and later removed, so importing from it fails.
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.size'] = 14
colors = np.array(['red', 'green', 'blue', 'yellow'])
# Map every row's k=3 cluster id to a colour name once and reuse it for both
# figures; indexing with a plain ndarray avoids relying on Series-as-index.
point_colors = colors[beer["cluster3"].to_numpy()]

# Figure 1: calories vs. alcohol, one colour per cluster, with the per-cluster
# means from `centers` overlaid as large black '+' markers.
plt.scatter(beer["calories"], beer["alcohol"], c=point_colors)
plt.scatter(centers.calories, centers.alcohol, linewidths=3, marker='+', s=300, c='black')
plt.xlabel("Calories")
plt.ylabel("Alcohol")
plt.show()

# Figure 2: pairwise scatter plots of all four features, same colouring.
scatter_matrix(
    beer[["calories", "sodium", "alcohol", "cost"]],
    s=100,
    alpha=1,
    c=point_colors,
    figsize=(10, 10),
)
plt.suptitle("With 3 centroids initialized")
plt.show()