這個需求主要是實現 k-means 聚類算法,再加上其優化版本 k-means++ 算法;兩者都可以直接調用 scikit-learn 庫完成。
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import jieba
# --- Load the corpus, tokenize it with jieba and build a TF-IDF matrix ---
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

test = pd.read_excel('data/國家非文化遺產1.xlsx')
# Force the text column to str so that jieba never receives NaN / numbers.
test["內容"] = test["內容"].astype('str')


def cuttxt(x):
    """Segment *x* with jieba and return the tokens joined by single spaces."""
    return " ".join(jieba.lcut(x))


# CountVectorizer splits on whitespace/word boundaries, so the Chinese text
# must be pre-segmented into space-separated tokens first.
test["clean"] = test["內容"].apply(cuttxt)

stpwrdpath = "data/停用詞.txt"
# 'utf-8-sig' decodes plain UTF-8 identically but also strips a leading BOM,
# which would otherwise be glued to the first stop word.
with open(stpwrdpath, 'rb') as fp:
    stopword = fp.read().decode('utf-8-sig')
# One stop word per line; drop surrounding whitespace and blank lines so that
# every entry can actually match a token.
stpwrdlst = [line.strip() for line in stopword.splitlines() if line.strip()]

vectorizer = CountVectorizer(stop_words=stpwrdlst)
transformer = TfidfTransformer()
# Raw term counts -> TF-IDF weights (sparse matrix, one row per document).
tfidf = transformer.fit_transform(vectorizer.fit_transform(test["clean"]))
# Dense copy of the TF-IDF matrix; the clustering and plotting code below
# indexes it as a regular 2-D array.
weight = tfidf.toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on old installations. Kept as a list, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    word = list(vectorizer.get_feature_names_out())
else:
    word = vectorizer.get_feature_names()
# --- Elbow curve for plain K-means (random centroid initialisation) ---
from sklearn.cluster import KMeans

# Configure a CJK-capable font BEFORE drawing: the title below contains
# Chinese characters and otherwise triggers "Glyph ... missing from current
# font" warnings and renders as empty boxes.
plt.rcParams['font.sans-serif'] = ['FangSong']
plt.rcParams['axes.unicode_minus'] = False

wcss = []
for i in range(1, 11):
    # KMeans defaults to init='k-means++'; request random initialisation
    # explicitly so this run really is the plain K-means baseline that the
    # k-means++ experiment further down is compared against.
    kmeans = KMeans(n_clusters=i, init='random', random_state=42)
    kmeans.fit(weight)
    # inertia_ is the within-cluster sum of squares for this value of k.
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title('K-means算法')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:211: RuntimeWarning: Glyph 31639 missing from current font.
font.set_text(s, 0.0, flags=flags)
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:211: RuntimeWarning: Glyph 27861 missing from current font.
font.set_text(s, 0.0, flags=flags)
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:180: RuntimeWarning: Glyph 31639 missing from current font.
font.set_text(s, 0, flags=flags)
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:180: RuntimeWarning: Glyph 27861 missing from current font.
font.set_text(s, 0, flags=flags)
# --- Fit the plain K-means model with two clusters and count the members ---
# init='random' is spelled out because the KMeans default is 'k-means++',
# which would make this baseline identical to the k-means++ run below.
# NOTE(review): k=2 is presumably read off the elbow plot - confirm.
kmeans = KMeans(n_clusters=2, init='random', random_state=42)
# y holds one cluster label per row of `weight`.
y = kmeans.fit_predict(weight)
data = {
    '類別': y
}
test1 = pd.DataFrame(data)
# Bare expression: only has a visible effect when it is the last statement of
# a notebook cell (it displays the counts); otherwise it is a no-op.
test1["類別"].value_counts()
# Number of documents assigned to each cluster, largest cluster first.
housetype = test1["類別"].value_counts()
# --- Bar chart of cluster sizes for the plain K-means result ---
# Same module object that `from pylab import mpl` provided, without going
# through the discouraged pylab namespace.
import matplotlib as mpl

# Use a font that contains CJK glyphs so the Chinese title/legend render.
mpl.rcParams['font.sans-serif'] = ['FangSong']
# Keep the minus sign as ASCII '-' so it is not lost with the CJK font.
mpl.rcParams['axes.unicode_minus'] = False

fig, ax = plt.subplots(1, 1, dpi=100)
# `housetype` is a Series (cluster label -> count). Series.plot ignores the
# x=/y= keywords, so the former x='housetype', y='size' arguments were dead
# and have been dropped; the chart is unchanged.
housetype.head(10).plot(kind='bar', title='類別數量分佈', ax=ax)
plt.legend(['數量'])
plt.show()
# --- Elbow curve for K-means with k-means++ centroid initialisation ---
from sklearn.cluster import KMeans

# Within-cluster sum of squares (inertia_) for k = 1..10.
wcss = [
    KMeans(n_clusters=i, init='k-means++', random_state=42).fit(weight).inertia_
    for i in range(1, 11)
]

plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
# --- Final k-means++ model: two clusters over the TF-IDF matrix ---
kmeans = KMeans(
    n_clusters=2,
    init='k-means++',
    random_state=42,
)
# One cluster label per row of `weight`.
y = kmeans.fit_predict(weight)

# Single-column frame holding the label assigned to every document.
data = {'類別': y}
test1 = pd.DataFrame(data)

# Bare expression: displays the per-cluster counts when it ends a notebook cell.
test1["類別"].value_counts()
1 2692
0 458
Name: 類別, dtype: int64
# --- Bar chart of cluster sizes for the k-means++ result ---
# Same module object that `from pylab import mpl` provided, without going
# through the discouraged pylab namespace.
import matplotlib as mpl

# Number of documents assigned to each cluster, largest cluster first.
housetype = test1["類別"].value_counts()

# Use a font that contains CJK glyphs so the Chinese title/legend render.
mpl.rcParams['font.sans-serif'] = ['FangSong']
# Keep the minus sign as ASCII '-' so it is not lost with the CJK font.
mpl.rcParams['axes.unicode_minus'] = False

fig, ax = plt.subplots(1, 1, dpi=100)
# Series.plot ignores the x=/y= keywords, so the former x='housetype',
# y='size' arguments were dead and have been dropped; the chart is unchanged.
housetype.head(10).plot(kind='bar', title='類別數量分佈', ax=ax)
plt.legend(['數量'])
plt.show()
# --- Scatter plot of the two clusters ---
# The axes are simply columns 0 and 1 of the TF-IDF matrix. The previous
# title/axis labels ("customers", "Annual Income", "Spending Score") were
# left over from an unrelated example and misdescribed the data.
# NOTE(review): two raw TF-IDF columns out of the whole vocabulary carry very
# little information; a 2-D projection (e.g. TruncatedSVD / PCA) would
# probably be a more meaningful visualisation - confirm with the author.
for cluster_id, colour in ((0, 'red'), (1, 'blue')):
    members = weight[y == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour,
                label=str(cluster_id))
plt.title('Clusters of documents (first two TF-IDF features)')
plt.xlabel('TF-IDF weight of feature 0')
plt.ylabel('TF-IDF weight of feature 1')
plt.legend()
plt.show()