# 實習期間老大給了一個任務:給運維歷史數據分類,需要先做聚類,然後把每一類總結出來。目前只實現了文本聚類。
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 18 15:53:56 2018
@author: zs
"""
import re
import time
import jieba
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# Load the data set; returns the text to cluster and the matching row IDs.
def loadDataSet(filename):
    """Read a UTF-8 CSV file and split out its text and ID columns.

    Args:
        filename: Path of the CSV file, handed to ``pandas.read_csv``.

    Returns:
        A ``(data, dataID)`` tuple of column vectors, each of shape
        ``(rows, 1)``: ``data`` holds the last CSV column and ``dataID``
        holds the first CSV column.
    """
    table = pd.read_csv(filename, encoding='utf-8').values
    rowCount = table.shape[0]
    textColumn = table[:, -1].reshape((rowCount, 1))
    idColumn = table[:, 0].reshape((rowCount, 1))
    return textColumn, idColumn
# Convert a 2-D numpy array into a flat Python list.
def ndarrayToList(dataArr):
    """Flatten a two-dimensional array into a list, row by row.

    Args:
        dataArr: Array whose ``shape`` unpacks into exactly two values.

    Returns:
        A list with every element of ``dataArr`` in row-major order. The
        elements are taken by indexing, so they are not converted to
        native Python types.
    """
    rowCount, colCount = dataArr.shape
    return [dataArr[r, c] for r in range(rowCount) for c in range(colCount)]
# Strip punctuation and special symbols from a piece of text.
#
# The pattern is a raw string so that ``\s``, ``\!`` and ``\,`` reach the
# regex engine without relying on Python's invalid-escape fallback (which
# emits a SyntaxWarning on recent interpreters). It is compiled once because
# removeStr is called for every document.
#
# NOTE(review): inside the second character class, ``/-~`` is a range that
# spans every ASCII digit and letter, so those are removed too -- confirm
# that this is intended. The first alternative only matches when the run of
# characters is followed by ``:``.
_REMOVE_STR_PATTERN = re.compile(
    r"[\s+\!\,$^*()+\"\']+:|[+——!,,《》“”〔【】;:。?、�./-~@#¥……&*()]+")


def removeStr(listData):
    """Join ``listData`` into one string and delete unwanted characters.

    Args:
        listData: A string, or any iterable of strings; the pieces are
            concatenated without a separator before cleaning.

    Returns:
        The concatenated text with every match of the removal pattern
        deleted.
    """
    return _REMOVE_STR_PATTERN.sub("", "".join(listData))
# Build the stop-word list.
def stopwordslist(filePath):
    """Read a UTF-8 stop-word file, one entry per line.

    Args:
        filePath: Path of the stop-word file.

    Returns:
        A list with one item per line of the file, each stripped of leading
        and trailing whitespace. Blank lines yield empty strings.
    """
    # The context manager closes the handle; the original left it open.
    with open(filePath, 'r', encoding='utf-8') as stopFile:
        return [line.strip() for line in stopFile]
# Save the segmented corpus to a file.
def saveFile(filename, lines=None):
    """Append tokenised documents to ``filename``, one document per line.

    Args:
        filename: Path of the output file. It is opened in append mode, so
            existing content is kept.
        lines: Iterable of token sequences. Each sequence is written as its
            tokens joined by single spaces. When omitted, the module-level
            ``dataSplit`` is used, as in the original code.
    """
    if lines is None:
        # Backward-compatible default: the script entry point stores the
        # segmentation result in the global name ``dataSplit``.
        lines = dataSplit
    # Explicit UTF-8 keeps the output independent of the platform default
    # encoding and matches the encoding used for the other files.
    with open(filename, 'a', encoding='utf-8') as outFile:
        outFile.writelines(' '.join(tokens) + '\n' for tokens in lines)
# Segment every document into words and drop the stop words.
def wordSplit(data):
    """Clean, segment and filter each entry of ``data``.

    Every element of ``data`` is passed through ``removeStr``, cut into
    tokens with ``jieba.cut``, and filtered against the stop words loaded
    from ``./data/stopwords.txt``. Tab and single-space tokens are dropped
    as well.

    Args:
        data: Two-dimensional array of text, flattened with
            ``ndarrayToList``.

    Returns:
        A list with one list of kept tokens per element of ``data``.
    """
    # A set gives constant-time lookups; membership is tested once per token.
    stopwords = set(stopwordslist('./data/stopwords.txt'))
    blanks = ('\t', ' ')
    wordList = []
    for rawText in ndarrayToList(data):
        cleaned = removeStr(rawText)  # remove special symbols
        wordList.append([
            token for token in jieba.cut(cleaned)  # word segmentation
            if token not in stopwords and token not in blanks
        ])
    return wordList
# Compute the TF-IDF weights.
def TFIDF(wordList):
    """Turn tokenised documents into a dense TF-IDF matrix.

    Args:
        wordList: Iterable of token sequences, one per document. It is not
            modified (the original overwrote each entry with its joined
            string).

    Returns:
        A ``(word, weight)`` tuple. ``word`` is the list of vocabulary terms
        found by ``CountVectorizer``. ``weight`` is the dense array from
        ``tfidf.toarray()``, where ``weight[i][j]`` is the TF-IDF weight of
        term ``j`` in document ``i``.
    """
    # One space-separated string per document, as CountVectorizer expects.
    corpus = [" ".join(tokens) for tokens in wordList]
    # NOTE(review): CountVectorizer is used with its default settings, so
    # its default token pattern decides which tokens are kept -- check
    # whether single-character words should survive.
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    # Step 1: text -> term-frequency matrix; step 2: frequencies -> TF-IDF.
    termCounts = vectorizer.fit_transform(corpus)
    tfidf = transformer.fit_transform(termCounts)
    # get_feature_names() is gone from current scikit-learn releases; prefer
    # its replacement and fall back for old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        word = vectorizer.get_feature_names_out().tolist()
    else:
        word = vectorizer.get_feature_names()
    weight = tfidf.toarray()
    return word, weight
# Reduce the dimensionality of the generated TF-IDF matrix with PCA.
#
# Author's note: the weight matrix is very sparse, so PCA is used for the
# reduction (why not SVD?) -- SVD is said to suit dense matrices.
# NOTE(review): scikit-learn documents TruncatedSVD as the transformer that
# works on sparse input directly -- confirm this rationale.
def matrixPCA(weight,dimension):
    """Project ``weight`` onto ``dimension`` principal components.

    Args:
        weight: Matrix to reduce; its ``shape`` is printed before reduction.
        dimension: Value passed to ``PCA`` as ``n_components``.

    Returns:
        The matrix returned by ``PCA.fit_transform``.
    """
    reducer = PCA(n_components=dimension)
    reduced = reducer.fit_transform(weight)
    print("降維之前的權重維度:", weight.shape)
    print("降維之後的權重維度:", reduced.shape)
    return reduced
# Hierarchical clustering with BIRCH.
# Author's note: k-means suits low-dimensional data and is slow.
def birch(matrix,k):
    """Cluster the rows of ``matrix`` with BIRCH.

    Args:
        matrix: Feature matrix handed to ``Birch.fit_predict``.
        k: Number of clusters, passed as ``n_clusters``.

    Returns:
        The cluster labels returned by ``fit_predict``.
    """
    return Birch(n_clusters=k).fit_predict(matrix)
# Compute the silhouette coefficients.
def Silhouette(matrix, y):
    """Score a clustering and print the mean silhouette coefficient.

    Args:
        matrix: Feature matrix that was clustered.
        y: Cluster label of each row of ``matrix``.

    Returns:
        A ``(silhouette_avg, sample_silhouette_values)`` tuple: the result
        of ``silhouette_score`` and the per-sample result of
        ``silhouette_samples``.
    """
    meanScore = silhouette_score(matrix, y)        # average over all samples
    perSample = silhouette_samples(matrix, y)      # one value per sample
    print(meanScore)
    return meanScore, perSample
# Plot the silhouette diagram.
def Draw(silhouette_avg, sample_silhouette_values, y, k):
    """Draw one horizontal silhouette band per cluster and show the figure.

    Args:
        silhouette_avg: Mean silhouette coefficient, drawn as a dashed red
            vertical line.
        sample_silhouette_values: Silhouette coefficient of every sample.
        y: Cluster label of every sample, aligned with
            ``sample_silhouette_values``.
        k: Number of clusters; labels ``0 .. k-1`` are plotted.
    """
    fig, ax1 = plt.subplots(1)
    fig.set_size_inches(18, 7)
    # Silhouette coefficients lie in [-1, 1]; only this window is shown.
    ax1.set_xlim([-0.2, 0.5])
    y_lower = 10
    for i in range(k):  # one band per cluster
        # Boolean indexing copies, so sorting does not touch the input.
        cluster_values = np.sort(sample_silhouette_values[y == i])
        size_cluster_i = cluster_values.shape[0]
        y_upper = y_lower + size_cluster_i
        # cm.spectral no longer exists in matplotlib; nipy_spectral is the
        # same colormap under its current name.
        color = cm.nipy_spectral(float(i) / k)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          cluster_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)
        # Label the band with its cluster number.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        # Leave a gap of 10 rows before the next band.
        y_lower = y_upper + 10
    # Vertical dashed line at the average silhouette coefficient.
    ax1.axvline(x=silhouette_avg, color='red', linestyle="--")
    plt.show()
# Save the clustering result.
def saveResult(data, y, k=12, outDir='./result1'):
    """Append the documents of each cluster to its own CSV file.

    Cluster ``i`` is written to ``<outDir>/result<i>.csv``, one document per
    line, in the order the documents appear in ``data``. The files are
    opened in append mode.

    Args:
        data: Sequence of rows; each row is an iterable of strings that is
            concatenated to form the output line.
        y: Array of cluster labels, one per row of ``data``.
        k: Number of clusters; files are written for labels ``0 .. k-1``.
            Defaults to 12, the value the original hard-coded.
        outDir: Existing directory that receives the files. Defaults to the
            original hard-coded ``./result1``.

    Raises:
        ValueError: If ``y`` does not hold exactly one label per row.
    """
    labels = np.asarray(y).reshape(-1)
    if len(labels) != len(data):
        raise ValueError('y must contain exactly one label per row of data')
    for cluster in range(k):
        filename = outDir + '/result' + str(cluster) + '.csv'  # file name
        with open(filename, 'a', encoding='utf8') as outFile:
            # Iterate over the actual rows instead of a fixed count of
            # 13001, which broke on any other data-set size.
            outFile.writelines(
                ''.join(row) + '\n'
                for row, label in zip(data, labels)
                if label == cluster
            )
if __name__ == "__main__":
    # Start the timer. time.clock() was removed in Python 3.8;
    # time.perf_counter() is the replacement for measuring elapsed time.
    start = time.perf_counter()
    k = 12  # cluster into 12 groups
    jieba.load_userdict('./data/user_dict.txt')  # extra segmentation dictionary
    data, dataId = loadDataSet('./data/new_gongdan.csv')
    # Kept as a module-level name: saveFile() reads ``dataSplit`` as a global.
    dataSplit = wordSplit(data)
    print('分詞完成')
    saveFile('./data/new_gongdan_split.csv')  # save the segmentation result
    word, weight = TFIDF(dataSplit)  # build the TF-IDF matrix
    # Author's note: reducing the matrix first, e.g.
    # matrixPCA(weight, dimension=1000), clustered worse than the raw
    # matrix, so the weights are used unreduced.
    weightPCA = weight
    y = birch(weightPCA, k)
    silhouette_avg, sample_silhouette_values = Silhouette(weightPCA, y)  # silhouette coefficients
    Draw(silhouette_avg, sample_silhouette_values, y, k)
    saveResult(data, y)  # one CSV file per cluster
    elapsed = time.perf_counter() - start
    print('Time use', elapsed)