20200221_2_國家非文化遺產聚類分析

這個需求主要是用 k-means 聚類算法對非物質文化遺產文本做聚類,再比較一個採用 k-means++ 初始化的優化版本;兩者都可以直接調用 scikit-learn 庫實現。

from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline 
import matplotlib.pyplot as plt
import pandas as pd
import jieba
#進行文件的讀取
test=pd.read_excel('data/國家非文化遺產1.xlsx')
#強制轉化爲str類型
test["內容"]=test["內容"].astype('str')
cuttxt = lambda x: " ".join(jieba.lcut(x)) # 這裏不做任何清理工作,以保留情感詞
#調用apply函數進行修改
test["clean"] = test["內容"].apply(cuttxt) 
#調用停用詞
stpwrdpath ="data/停用詞.txt"
with open(stpwrdpath, 'rb') as fp:
    stopword = fp.read().decode('utf-8')  # 提用詞提取
#將停用詞表轉換爲list  
stpwrdlst = stopword.splitlines()
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer 
vectorizer=CountVectorizer(stop_words=stpwrdlst)#該類會將文本中的詞語轉換爲詞頻矩陣,矩陣元素a[i][j] 表示j詞在i類文本下的詞頻  
transformer=TfidfTransformer()#該類會統計每個詞語的tf-idf權值
tfidf=transformer.fit_transform(vectorizer.fit_transform(test["clean"]))#第一個fit_transform是計算tf-idf,第二個fit_transform是將文本轉爲詞頻矩陣
weight=tfidf.toarray()#將tf-idf矩陣抽取出來,元素a[i][j]表示j詞在i類文本中的tf-idf權重 
word=vectorizer.get_feature_names()#獲取詞袋模型中的所有詞
from sklearn.cluster import KMeans

# Elbow method: fit K-means for k = 1..10 and record the within-cluster
# sum of squares (WCSS, `inertia_`); the "elbow" of the curve suggests k.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(weight)
    wcss.append(kmeans.inertia_)

# Fix: configure a CJK-capable font BEFORE drawing, otherwise the Chinese
# title renders as empty boxes — exactly the "Glyph ... missing from
# current font" RuntimeWarnings seen in the original run.
plt.rcParams['font.sans-serif'] = ['FangSong']
plt.rcParams['axes.unicode_minus'] = False
plt.plot(range(1, 11), wcss)
plt.title('K-means算法')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')  # fix: y-axis label was missing (present in the k-means++ twin block)
plt.show()
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:211: RuntimeWarning: Glyph 31639 missing from current font.
  font.set_text(s, 0.0, flags=flags)
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:211: RuntimeWarning: Glyph 27861 missing from current font.
  font.set_text(s, 0.0, flags=flags)
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:180: RuntimeWarning: Glyph 31639 missing from current font.
  font.set_text(s, 0, flags=flags)
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:180: RuntimeWarning: Glyph 27861 missing from current font.
  font.set_text(s, 0, flags=flags)

(圖:K-means 肘部法則曲線,橫軸爲聚類數 k,縱軸爲 WCSS)

# Fit K-means with k = 2 (chosen from the elbow curve above) and collect
# the per-document cluster labels into a one-column DataFrame.
kmeans = KMeans(n_clusters=2, random_state=42)
y = kmeans.fit_predict(weight)
test1 = pd.DataFrame({'類別': y})
# test1.head()
類別
0 1
1 1
2 1
3 1
4 1
# Count documents per cluster and show the distribution as a bar chart.
test1["類別"].value_counts()
housetype = test1["類別"].value_counts()
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong']  # CJK-capable default font
mpl.rcParams['axes.unicode_minus'] = False      # render '-' correctly when saving
# One axes on one figure.
asd, sdf = plt.subplots(1, 1, dpi=100)
# Top 10 categories. Fix: the original passed x='housetype', y='size' —
# those kwargs address DataFrame columns and are meaningless for a Series
# bar plot, so they are dropped.
housetype.head(10).plot(kind='bar', title='類別數量分佈', ax=sdf)
plt.legend(['數量'])
plt.show()

(圖:k=2 時各類別文檔數量分佈長條圖)

from sklearn.cluster import KMeans

# Elbow method again, this time explicitly requesting k-means++
# initialisation (note: this is also scikit-learn's default `init`).
wcss = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, init='k-means++', random_state=42)
    km.fit(weight)
    wcss.append(km.inertia_)

plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

(圖:k-means++ 的肘部法則曲線,橫軸爲聚類數 k,縱軸爲 WCSS)

# Final model: k = 2 with explicit k-means++ seeding; label every document
# and keep the labels in a one-column DataFrame.
kmeans = KMeans(n_clusters=2, init='k-means++', random_state=42)
y = kmeans.fit_predict(weight)
test1 = pd.DataFrame({'類別': y})
# test1.head()
類別
0 1
1 1
2 1
3 1
4 1
test1["類別"].value_counts()
1    2692
0     458
Name: 類別, dtype: int64
# Bar chart of documents per cluster for the k-means++ labelling.
housetype = test1["類別"].value_counts()
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong']  # CJK-capable default font
mpl.rcParams['axes.unicode_minus'] = False      # render '-' correctly when saving
# One axes on one figure.
asd, sdf = plt.subplots(1, 1, dpi=100)
# Top 10 categories. Fix: the original passed x='housetype', y='size' —
# those kwargs address DataFrame columns and are meaningless for a Series
# bar plot, so they are dropped.
housetype.head(10).plot(kind='bar', title='類別數量分佈', ax=sdf)
plt.legend(['數量'])
plt.show()

(圖:k-means++ 結果下各類別文檔數量分佈長條圖)

# Visualise the two clusters by projecting the documents onto the first
# two TF-IDF dimensions.
# NOTE(review): with a vocabulary of thousands of dimensions this 2-D
# slice is only a rough illustration — a PCA/t-SNE projection would be a
# more faithful picture of cluster separation.
plt.scatter(weight[y == 0, 0], weight[y == 0, 1], s=100, c='red', label='0')
plt.scatter(weight[y == 1, 0], weight[y == 1, 1], s=100, c='blue', label='1')
# Fix: the original title and axis labels ("Clusters of customers",
# "Annual Income (k$)", "Spending Score (1-100)") were copied from a
# customer-segmentation tutorial and did not describe this data.
plt.title('K-means clusters (k=2) on TF-IDF features')
plt.xlabel('TF-IDF feature 0')
plt.ylabel('TF-IDF feature 1')
plt.legend()
plt.show()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章