Text Classification with jieba and Naive Bayes

#Stole my boyfriend's code from when he was younger; it's all teaching material for me now. Grateful face
#The corpus to classify is a set of folders; each folder is named after its class and contains many individual documents
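# A hypothetical layout (the class-folder names are invented for illustration):
#
#   文檔/
#       體育/
#           0001.txt
#           0002.txt
#       財經/
#           0011.txt
#           ...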

#coding: utf-8
from __future__ import print_function, unicode_literals
import os
import time
import random
import jieba

import numpy as np
from collections import defaultdict
import sklearn
from sklearn.naive_bayes import MultinomialNB

# Mainly used to load the stop word list
def MakeWordsSet(words_file):
    words_set = set()
    with open(words_file, 'r') as fp:
        for line in fp:
            word = line.strip()
            if len(word) > 0:  # sets deduplicate automatically
                words_set.add(word)
    return words_set

def TextProcessing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []

    # Outer loop over classes (one folder per class)
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        # Print the folder being processed and the current time
        print('path = ', new_folder_path, time.asctime(time.localtime(time.time())))
        files = os.listdir(new_folder_path)
        # Inner loop over the documents within a class
        for file in files:
            with open(os.path.join(new_folder_path, file), 'r') as fp:
                raw = fp.read()
            word_cut = jieba.cut(raw, cut_all=False)  # precise mode; returns an iterable generator
            word_list = list(word_cut)  # turn the generator into a list; each word is a unicode string
            data_list.append(word_list)
            class_list.append(folder)

    # Split into training and test sets
    data_class_list = list(zip(data_list, class_list))
    # random.shuffle has no return value; it shuffles data_class_list in place
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1  # split point; test_size=0.2 gives roughly train:test = 4:1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)

    # Count word frequencies into all_words_dict
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict:
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1
    # Sort by frequency in descending order; sorted() accepts any iterable, not just lists
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list = list(zip(*all_words_tuple_list))[0]
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list
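
# Side note (not in the original post): the frequency-counting loop above can be
# written more compactly with collections.Counter, for example:
#
#   from collections import Counter
#   all_words_dict = Counter(w for word_list in train_data_list for w in word_list)
#   all_words_list = [w for w, c in all_words_dict.most_common()]  # already sorted by count, descending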


def words_dict(all_words_list, deleteN, stopwords_set=set()):
    # Select feature words, skipping the deleteN most frequent entries
    feature_words = []
    n = 1
    for t in range(deleteN, len(all_words_list)):
        if n > 1500:  # cap feature_words at 1500 dimensions
            break
        # keep words that are not pure digits, not stop words, and 2-4 characters long
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
            n += 1
    return feature_words
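
# Note: all_words_list is sorted by descending frequency, so skipping the first
# deleteN entries drops the most frequent words, which behave like stopwords even
# when they are missing from stopwords.txt. A typical call (values from __main__ below):
#
#   feature_words = words_dict(all_words_list, 20, stopwords_set)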

def TextFeatures(train_data_list, test_data_list, feature_words):
    # Note the use of a nested function (a Python closure); worth reading up on if unfamiliar
    def text_features(text, feature_words):
        text_words = set(text)
        # binary bag-of-words: 1 if the feature word appears in the document, else 0
        features = [1 if word in text_words else 0 for word in feature_words]
        return features

    train_feature_list = [text_features(text, feature_words) for text in train_data_list]
    test_feature_list = [text_features(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list
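
# A tiny worked example of the binary features (tokens invented for illustration):
#
#   feature_words = ['足球', '比賽', '經濟']
#   text          = ['今天', '足球', '比賽', '精彩']
#   text_features(text, feature_words)  ->  [1, 1, 0]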

def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
    ## sklearn classifier
    ## Once this code makes sense, many other sklearn estimators can be tried in its place
    classifier = MultinomialNB().fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)
    return test_accuracy
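
# As the comment above suggests, other sklearn classifiers share the same
# fit/score interface. Since the features here are binary 0/1, BernoulliNB is a
# natural one to try; a minimal sketch of mine (not benchmarked in this post):
def TextClassifierBernoulli(train_feature_list, test_feature_list, train_class_list, test_class_list):
    from sklearn.naive_bayes import BernoulliNB
    classifier = BernoulliNB().fit(train_feature_list, train_class_list)
    return classifier.score(test_feature_list, test_class_list)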


if __name__ == '__main__':

    print('STARTING TIME : ', time.asctime(time.localtime(time.time())))

    # Text preprocessing
    folder_path = './文檔'
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)

    # Build stopwords_set
    stopwords_file = './stopwords.txt'
    stopwords_set = MakeWordsSet(stopwords_file)

    ## Feature extraction and classification
    feature_words = words_dict(all_words_list, 20, stopwords_set)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    print('accuracy : ', test_accuracy * 100, '%')
    print('ENDING TIME : ', time.asctime(time.localtime(time.time())))
    print('finished')


I later switched the keyword extraction over to jieba's analyse module, still keeping 1500 dimensions. Accuracy improved by a little over 1%, but the run took much longer, which suggests that jieba's built-in TF-IDF keyword extraction is fairly expensive in time.

feature_words = jieba.analyse.extract_tags(all_words, topK=1500)
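
For reference, the full signature is jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=()), so the 1500 above is topK, the number of keywords to return. jieba can also filter stop words during extraction by pointing it at the same stop word file; a minimal sketch:

import jieba.analyse

jieba.analyse.set_stop_words('./stopwords.txt')  # filter stop words inside extract_tags
feature_words = jieba.analyse.extract_tags(all_words, topK=1500)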
The modified code follows:

#coding: utf-8
from __future__ import print_function, unicode_literals
import os
import time
import random
import jieba
import jieba.analyse
import numpy as np
from collections import defaultdict
import sklearn
from sklearn.naive_bayes import MultinomialNB

# Mainly used to load the stop word list
def MakeWordsSet(words_file):
    words_set = set()
    with open(words_file, 'r') as fp:
        for line in fp:
            word = line.strip()
            if len(word) > 0:  # sets deduplicate automatically
                words_set.add(word)
    return words_set

def TextProcessing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []

    # Outer loop over classes (one folder per class)
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        # Print the folder being processed and the current time
        print('path = ', new_folder_path, time.asctime(time.localtime(time.time())))
        files = os.listdir(new_folder_path)
        # Inner loop over the documents within a class
        for file in files:
            with open(os.path.join(new_folder_path, file), 'r') as fp:
                raw = fp.read()
            word_cut = jieba.cut(raw, cut_all=False)  # precise mode; returns an iterable generator
            word_list = list(word_cut)  # turn the generator into a list; each word is a unicode string
            data_list.append(word_list)
            class_list.append(folder)

    # Split into training and test sets
    data_class_list = list(zip(data_list, class_list))
    # random.shuffle has no return value; it shuffles data_class_list in place
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1  # split point; test_size=0.2 gives roughly train:test = 4:1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)

    # Concatenate all training-set words into a single string for jieba.analyse
    all_words = ''.join(word for word_list in train_data_list for word in word_list)
    return all_words, train_data_list, test_data_list, train_class_list, test_class_list


def words_dict(all_words_list, deleteN, stopwords_set=set()):
    # Select feature words (no longer called in this version; kept for comparison)
    feature_words = []
    n = 1
    for t in range(deleteN, len(all_words_list)):
        if n > 2000:  # cap feature_words at 2000 dimensions
            break
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
            n += 1
    return feature_words

def TextFeatures(train_data_list, test_data_list, feature_words):
    # Note the use of a nested function (a Python closure)
    def text_features(text, feature_words):
        text_words = set(text)
        # binary bag-of-words: 1 if the feature word appears in the document, else 0
        features = [1 if word in text_words else 0 for word in feature_words]
        return features

    train_feature_list = [text_features(text, feature_words) for text in train_data_list]
    test_feature_list = [text_features(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list

def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
    ## sklearn classifier
    ## Once this code makes sense, many other sklearn estimators can be tried in its place
    classifier = MultinomialNB().fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)
    return test_accuracy


if __name__ == '__main__':

    print('STARTING TIME : ', time.asctime(time.localtime(time.time())))

    # Text preprocessing
    folder_path = './文檔'
    all_words, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)

    # stopwords_set is not used in this version
    #stopwords_file = './stopwords.txt'
    #stopwords_set = MakeWordsSet(stopwords_file)

    ## Feature extraction and classification
    #feature_words = words_dict(all_words_list, 20, stopwords_set)
    feature_words = jieba.analyse.extract_tags(all_words, topK=1500)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    print('accuracy : ', test_accuracy * 100, '%')
    print('ENDING TIME : ', time.asctime(time.localtime(time.time())))
    print('finished')


