20200221_2_國家非文化遺產聚類分析

這個需求主要是用 k-means 聚類算法對非物質文化遺產文本做聚類,再比較一個採用 k-means++ 初始化的優化版本;兩者都可以直接調用 scikit-learn 庫實現。

from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline 
import matplotlib.pyplot as plt
import pandas as pd
import jieba
#進行文件的讀取
test=pd.read_excel('data/國家非文化遺產1.xlsx')
#強制轉化爲str類型
test["內容"]=test["內容"].astype('str')
cuttxt = lambda x: " ".join(jieba.lcut(x)) # 這裏不做任何清理工作,以保留情感詞
#調用apply函數進行修改
test["clean"] = test["內容"].apply(cuttxt) 
#調用停用詞
stpwrdpath ="data/停用詞.txt"
with open(stpwrdpath, 'rb') as fp:
    stopword = fp.read().decode('utf-8')  # 提用詞提取
#將停用詞表轉換爲list  
stpwrdlst = stopword.splitlines()
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer 
vectorizer=CountVectorizer(stop_words=stpwrdlst)#該類會將文本中的詞語轉換爲詞頻矩陣,矩陣元素a[i][j] 表示j詞在i類文本下的詞頻  
transformer=TfidfTransformer()#該類會統計每個詞語的tf-idf權值
tfidf=transformer.fit_transform(vectorizer.fit_transform(test["clean"]))#第一個fit_transform是計算tf-idf,第二個fit_transform是將文本轉爲詞頻矩陣
weight=tfidf.toarray()#將tf-idf矩陣抽取出來,元素a[i][j]表示j詞在i類文本中的tf-idf權重 
word=vectorizer.get_feature_names()#獲取詞袋模型中的所有詞
from sklearn.cluster import KMeans

# Elbow method: fit K-means for k = 1..10 and record the within-cluster
# sum of squares (WCSS, `inertia_`); the "elbow" of the curve suggests k.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(weight)
    wcss.append(kmeans.inertia_)

# Fix: configure a CJK-capable font BEFORE drawing, otherwise the Chinese
# title renders as empty boxes — exactly the "Glyph ... missing from
# current font" RuntimeWarnings seen in the original run.
plt.rcParams['font.sans-serif'] = ['FangSong']
plt.rcParams['axes.unicode_minus'] = False
plt.plot(range(1, 11), wcss)
plt.title('K-means算法')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')  # fix: y-axis label was missing (present in the k-means++ twin block)
plt.show()
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:211: RuntimeWarning: Glyph 31639 missing from current font.
  font.set_text(s, 0.0, flags=flags)
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:211: RuntimeWarning: Glyph 27861 missing from current font.
  font.set_text(s, 0.0, flags=flags)
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:180: RuntimeWarning: Glyph 31639 missing from current font.
  font.set_text(s, 0, flags=flags)
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:180: RuntimeWarning: Glyph 27861 missing from current font.
  font.set_text(s, 0, flags=flags)

(圖:K-means 肘部法則曲線,橫軸爲聚類數 k,縱軸爲 WCSS)

# Fit K-means with k = 2 (chosen from the elbow curve above) and collect
# the per-document cluster labels into a one-column DataFrame.
kmeans = KMeans(n_clusters=2, random_state=42)
y = kmeans.fit_predict(weight)
test1 = pd.DataFrame({'類別': y})
# test1.head()
類別
0 1
1 1
2 1
3 1
4 1
# Count documents per cluster and show the distribution as a bar chart.
test1["類別"].value_counts()
housetype = test1["類別"].value_counts()
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong']  # CJK-capable default font
mpl.rcParams['axes.unicode_minus'] = False      # render '-' correctly when saving
# One axes on one figure.
asd, sdf = plt.subplots(1, 1, dpi=100)
# Top 10 categories. Fix: the original passed x='housetype', y='size' —
# those kwargs address DataFrame columns and are meaningless for a Series
# bar plot, so they are dropped.
housetype.head(10).plot(kind='bar', title='類別數量分佈', ax=sdf)
plt.legend(['數量'])
plt.show()

(圖:k=2 時各類別文檔數量分佈長條圖)

from sklearn.cluster import KMeans

# Elbow method again, this time explicitly requesting k-means++
# initialisation (note: this is also scikit-learn's default `init`).
wcss = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, init='k-means++', random_state=42)
    km.fit(weight)
    wcss.append(km.inertia_)

plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

(圖:k-means++ 的肘部法則曲線,橫軸爲聚類數 k,縱軸爲 WCSS)

# Final model: k = 2 with explicit k-means++ seeding; label every document
# and keep the labels in a one-column DataFrame.
kmeans = KMeans(n_clusters=2, init='k-means++', random_state=42)
y = kmeans.fit_predict(weight)
test1 = pd.DataFrame({'類別': y})
# test1.head()
類別
0 1
1 1
2 1
3 1
4 1
test1["類別"].value_counts()
1    2692
0     458
Name: 類別, dtype: int64
# Bar chart of documents per cluster for the k-means++ labelling.
housetype = test1["類別"].value_counts()
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong']  # CJK-capable default font
mpl.rcParams['axes.unicode_minus'] = False      # render '-' correctly when saving
# One axes on one figure.
asd, sdf = plt.subplots(1, 1, dpi=100)
# Top 10 categories. Fix: the original passed x='housetype', y='size' —
# those kwargs address DataFrame columns and are meaningless for a Series
# bar plot, so they are dropped.
housetype.head(10).plot(kind='bar', title='類別數量分佈', ax=sdf)
plt.legend(['數量'])
plt.show()

(圖:k-means++ 結果下各類別文檔數量分佈長條圖)

# Visualise the two clusters by projecting the documents onto the first
# two TF-IDF dimensions.
# NOTE(review): with a vocabulary of thousands of dimensions this 2-D
# slice is only a rough illustration — a PCA/t-SNE projection would be a
# more faithful picture of cluster separation.
plt.scatter(weight[y == 0, 0], weight[y == 0, 1], s=100, c='red', label='0')
plt.scatter(weight[y == 1, 0], weight[y == 1, 1], s=100, c='blue', label='1')
# Fix: the original title and axis labels ("Clusters of customers",
# "Annual Income (k$)", "Spending Score (1-100)") were copied from a
# customer-segmentation tutorial and did not describe this data.
plt.title('K-means clusters (k=2) on TF-IDF features')
plt.xlabel('TF-IDF feature 0')
plt.ylabel('TF-IDF feature 1')
plt.legend()
plt.show()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章