Chinese Text Clustering with sklearn

During my internship, my lead gave me a task: categorize historical operations (O&M) ticket data. The plan is to cluster the records first and then summarize each cluster; so far only the text clustering part is implemented.

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 18 15:53:56 2018

@author: zs

"""
import re
import time
import jieba
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import matplotlib.cm as cm


# Load the dataset; return the texts to cluster and their corresponding IDs
def loadDataSet(filename):
    dataset = pd.read_csv(filename, encoding='utf-8')
    m, n = dataset.shape               # number of rows and columns
    data = dataset.values[:, -1]       # last column holds the text
    dataID = dataset.values[:, 0]      # first column holds the ID
    return data.reshape((m, 1)), dataID.reshape((m, 1))

# Flatten a numpy array into a Python list
def ndarrayToList(dataArr):
    dataList = []
    m, n = dataArr.shape
    for i in range(m):
        for j in range(n):
            dataList.append(dataArr[i, j])
    return dataList

# Join the strings and strip special symbols / punctuation
def removeStr(listData):
    strData = "".join(listData)
    removeStrData = re.sub(r"[\s+\!\,$^*()+\"\']+:|[+——!,,《》“”〔【】;:。?、./-~@#¥……&*()]+", "", strData)
    return removeStrData

# Build the stop-word list from a file, one word per line
def stopwordslist(filePath):
    with open(filePath, 'r', encoding='utf-8') as fr:
        stopword = [line.strip() for line in fr]
    return stopword

# Save the segmented corpus, one document per line
def saveFile(filename, dataSplit):
    with open(filename, 'a', encoding='utf-8') as fr:
        for line in dataSplit:
            strLine = ' '.join(line)
            fr.write(strLine)
            fr.write('\n')

# Segment each document with jieba and remove stop words
def wordSplit(data):
    stopword = stopwordslist('./data/stopwords.txt')  # load the stop-word list
    word = ndarrayToList(data)
    m = len(word)
    wordList = []
    for i in range(m):
        rowListRemoveStr = removeStr(word[i])    # strip special symbols
        rowList = [eachWord for eachWord in jieba.cut(rowListRemoveStr)]  # segment with jieba
        removeStopwordList = []
        for eachword in rowList:
            if eachword not in stopword and eachword != '\t' and eachword != ' ':
                removeStopwordList.append(eachword)
        wordList.append(removeStopwordList)
    return wordList

# Compute the tf-idf matrix
def TFIDF(wordList):
    corpus = []   # space-joined documents
    for i in range(len(wordList)):
        wordList[i] = " ".join(wordList[i])
        corpus.append(wordList[i])
    # CountVectorizer turns the corpus into a term-count matrix;
    # element a[i][j] is the count of word j in document i
    vectorizer = CountVectorizer()
    # TfidfTransformer converts the count matrix into tf-idf weights
    transformer = TfidfTransformer()
    # the inner fit_transform builds the count matrix,
    # the outer one converts it to tf-idf
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # all words in the bag-of-words vocabulary
    # (use get_feature_names() on sklearn < 1.0; it was removed in 1.2)
    word = vectorizer.get_feature_names_out()
    # densify the tf-idf matrix; w[i][j] is the tf-idf weight of word j in document i
    weight = tfidf.toarray()

    return word, weight
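
# The two-step CountVectorizer + TfidfTransformer pipeline above can be collapsed
# into a single TfidfVectorizer. A minimal equivalent sketch (the name TFIDF_v2
# is hypothetical; this variant is not called in main):
def TFIDF_v2(wordList):
    from sklearn.feature_extraction.text import TfidfVectorizer
    corpus = [" ".join(doc) for doc in wordList]   # space-joined documents
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(corpus)       # count + tf-idf in one step
    return vectorizer.get_feature_names_out(), tfidf.toarray()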

# Reduce the dimensionality of the tf-idf matrix with PCA
'''
The tf-idf weight matrix is very sparse, but it has already been densified via
toarray(), so sklearn's PCA can be applied directly. (PCA does not accept sparse
input; for a truly sparse matrix, TruncatedSVD is the usual choice.)
'''
def matrixPCA(weight, dimension):
    pca = PCA(n_components=dimension)       # initialize PCA
    pcaMatrix = pca.fit_transform(weight)   # reduced-dimension data
    print("shape before reduction:", weight.shape)
    print("shape after reduction:", pcaMatrix.shape)
    return pcaMatrix
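
# Alternative sketch (the name matrixSVD is hypothetical; not called in main):
# TruncatedSVD (LSA) accepts the sparse tf-idf matrix directly, avoiding the
# dense toarray() copy that PCA requires.
def matrixSVD(tfidf, dimension):
    from sklearn.decomposition import TruncatedSVD
    svd = TruncatedSVD(n_components=dimension)
    return svd.fit_transform(tfidf)   # works on scipy sparse input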

# BIRCH (hierarchical clustering); k-means suits low-dimensional data
# and was slower on this matrix
def birch(matrix, k):
    clusterer = Birch(n_clusters=k)     # number of clusters
    y = clusterer.fit_predict(matrix)   # cluster labels
    return y
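
# For comparison (a hypothetical sketch, not called in main): the same interface
# with k-means, which as noted above suits lower-dimensional data.
def kmeans(matrix, k):
    from sklearn.cluster import KMeans
    clusterer = KMeans(n_clusters=k, n_init=10, random_state=0)
    return clusterer.fit_predict(matrix)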

# Compute silhouette coefficients
def Silhouette(matrix, y):
    silhouette_avg = silhouette_score(matrix, y)               # mean silhouette score
    sample_silhouette_values = silhouette_samples(matrix, y)   # per-sample values
    print(silhouette_avg)
    return silhouette_avg, sample_silhouette_values
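
# k = 12 below is fixed by hand; a rough way to sanity-check that choice
# (a sketch, not called in main) is to scan candidate values of k and compare
# the mean silhouette score:
def scanK(matrix, kRange):
    for k in kRange:
        labels = Birch(n_clusters=k).fit_predict(matrix)
        print(k, silhouette_score(matrix, labels))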

# Plot the per-sample silhouette values, one band per cluster
def Draw(silhouette_avg, sample_silhouette_values, y, k):
    fig, ax1 = plt.subplots(1)
    fig.set_size_inches(18, 7)
    # silhouette values lie in [-1, 1]; narrow the x-axis to the observed range
    ax1.set_xlim([-0.2, 0.5])
    # the extra (k + 1) * 10 makes the bands easier to tell apart
    #ax1.set_ylim([0, len(X) + (k + 1) * 10])
    y_lower = 10

    for i in range(k):  # iterate over the clusters
        ith_cluster_silhouette_values = sample_silhouette_values[y == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        # pick a color for this cluster (cm.spectral was removed in matplotlib 2.2)
        color = cm.nipy_spectral(float(i) / k)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)
        # label each band with its cluster id
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        # y_lower for the next cluster, with a 10-sample gap
        y_lower = y_upper + 10
    # vertical dashed line at the mean silhouette score
    ax1.axvline(x=silhouette_avg, color='red', linestyle="--")
    plt.show()

# Save the clustering results, one CSV file per cluster
def saveResult(data, y, k):
    y = y.reshape((len(data), 1))
    for i in range(k):
        filename = './result1/result' + str(i) + '.csv'   # output file name
        with open(filename, 'a', encoding='utf8') as fr:
            for j in range(len(data)):
                if y[j] == i:
                    strLine = ''.join(data[j])
                    fr.write(strLine)
                    fr.write('\n')
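
# Equivalent sketch using pandas (the name saveResultPandas is hypothetical;
# not called in main): group rows by cluster label and write each group to
# its own CSV file.
def saveResultPandas(data, y, outDir='./result1'):
    df = pd.DataFrame({'text': [''.join(row) for row in data],
                       'label': y.ravel()})
    for label, group in df.groupby('label'):
        group['text'].to_csv(outDir + '/result' + str(label) + '.csv',
                             index=False, header=False)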

if __name__ == "__main__":

    # start time (time.clock() was removed in Python 3.8)
    start = time.perf_counter()
    k = 12  # number of clusters
    jieba.load_userdict('./data/user_dict.txt')  # load a custom dictionary for jieba
    data, dataId = loadDataSet('./data/new_gongdan.csv')
    dataSplit = wordSplit(data)
    print('word segmentation done')
    saveFile('./data/new_gongdan_split.csv', dataSplit)  # save the segmented corpus
    word, weight = TFIDF(dataSplit)  # build the tf-idf matrix
    weightPCA = weight

    # dimensionality reduction actually gave worse results than the raw matrix
    #weightPCA = matrixPCA(weight, dimension = 1000)
    y = birch(weightPCA, k)
    silhouette_avg, sample_silhouette_values = Silhouette(weightPCA, y)  # silhouette scores

    Draw(silhouette_avg, sample_silhouette_values, y, k)
    saveResult(data, y, k)  # save the results, one CSV file per cluster

    elapsed = (time.perf_counter() - start)
    print('Time use', elapsed)

 
