#coding: utf-8
# Borrowed my boyfriend's code from when he was younger; now it's all mine to use for teaching. Grateful face.
# The documents to classify live in multiple folders: each folder is named after its class and contains many individual documents.
from __future__ import print_function, unicode_literals
import io
import os
import time
import random
import jieba
from sklearn.naive_bayes import MultinomialNB

# Mainly used to load the stop-word set
def MakeWordsSet(words_file):
    words_set = set()
    with io.open(words_file, 'r', encoding='utf-8') as fp:  # assumes a UTF-8 stop-word file
        for line in fp:
            word = line.strip()
            if len(word) > 0:  # skip blank lines; the set itself de-duplicates
                words_set.add(word)
    return words_set
def TextProcessing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []
    # Outer loop over the classes (one folder per class)
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        # Log the folder being processed and the current time
        print("path =", new_folder_path, time.asctime(time.localtime(time.time())))
        files = os.listdir(new_folder_path)
        # Inner loop over the documents of this class
        for file in files:
            with io.open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as fp:  # assumes UTF-8 documents
                raw = fp.read()
            word_cut = jieba.cut(raw, cut_all=False)  # accurate mode; returns an iterable generator
            word_list = list(word_cut)  # turn the generator into a list; each word is a unicode string
            data_list.append(word_list)
            class_list.append(folder)
    # Split into training and test sets
    data_class_list = list(zip(data_list, class_list))
    # random.shuffle returns None; it shuffles data_class_list in place
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1  # split position, roughly train:test = 4:1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)
    # Count word frequencies into all_words_dict
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict:
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1
    # Sort the (word, count) pairs by frequency in descending order via the key function
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list = list(zip(*all_words_tuple_list))[0]
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list
# Select the feature words, skipping the deleteN most frequent ones
def words_dict(all_words_list, deleteN, stopwords_set=set()):
    feature_words = []
    n = 1
    for t in range(deleteN, len(all_words_list)):
        if n > 1500:  # cap feature_words at 1500 dimensions
            break
        # keep words that are not pure digits, not stop words, and 2-4 characters long
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
            n += 1
    return feature_words
def TextFeatures(train_data_list, test_data_list, feature_words):
    # Note the nested Python function here;
    # look up inner functions / closures if the pattern is unfamiliar
    def text_features(text, feature_words):
        text_words = set(text)
        # binary bag-of-words: 1 if the feature word occurs in the document, otherwise 0
        features = [1 if word in text_words else 0 for word in feature_words]
        return features
    train_feature_list = [text_features(text, feature_words) for text in train_data_list]
    test_feature_list = [text_features(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list
def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
    ## sklearn classifier
    ## Once you understand this code, you can try swapping many other sklearn estimators in here
    classifier = MultinomialNB().fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)
    return test_accuracy
if __name__ == '__main__':
    print('STARTING TIME : ', time.asctime(time.localtime(time.time())))
    # Text preprocessing
    folder_path = './文檔'
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
    # Build stopwords_set
    stopwords_file = './stopwords.txt'
    stopwords_set = MakeWordsSet(stopwords_file)
    ## Feature extraction and classification
    feature_words = words_dict(all_words_list, 20, stopwords_set)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    print('accuracy : ', test_accuracy * 100, '%')
    print('ENDING TIME : ', time.asctime(time.localtime(time.time())))
    print("finished")
I changed the keyword extraction to use the function in jieba's analyse module, still with 1500 dimensions. Accuracy improved by a bit over 1%, but the run took much longer, which shows that extracting keywords with jieba's built-in tf-idf still has a high time cost. The one-line change was:
feature_words = jieba.analyse.extract_tags(all_words, topK=1500)
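For reference, extract_tags also takes keyword arguments; here is a small sketch (not in the original) for inspecting the tf-idf weights behind the selected keywords, assuming all_words is the concatenated training text built below:

import jieba.analyse

# withWeight=True makes extract_tags return (keyword, tf-idf weight) pairs
tags = jieba.analyse.extract_tags(all_words, topK=1500, withWeight=True)
for word, weight in tags[:10]:  # peek at the ten highest-weighted feature words
    print(word, weight)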
The modified version follows:
#coding: utf-8
from __future__ import print_function, unicode_literals
import io
import os
import time
import random
import jieba
import jieba.analyse
from sklearn.naive_bayes import MultinomialNB

# Mainly used to load the stop-word set
def MakeWordsSet(words_file):
    words_set = set()
    with io.open(words_file, 'r', encoding='utf-8') as fp:  # assumes a UTF-8 stop-word file
        for line in fp:
            word = line.strip()
            if len(word) > 0:  # skip blank lines; the set itself de-duplicates
                words_set.add(word)
    return words_set
def TextProcessing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []
    # Outer loop over the classes (one folder per class)
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        # Log the folder being processed and the current time
        print("path =", new_folder_path, time.asctime(time.localtime(time.time())))
        files = os.listdir(new_folder_path)
        # Inner loop over the documents of this class
        for file in files:
            with io.open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as fp:  # assumes UTF-8 documents
                raw = fp.read()
            word_cut = jieba.cut(raw, cut_all=False)  # accurate mode; returns an iterable generator
            word_list = list(word_cut)  # turn the generator into a list; each word is a unicode string
            data_list.append(word_list)
            class_list.append(folder)
    # Split into training and test sets
    data_class_list = list(zip(data_list, class_list))
    # random.shuffle returns None; it shuffles data_class_list in place
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1  # split position, roughly train:test = 4:1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)
    # Concatenate all training words into the single string all_words
    # (jieba.analyse.extract_tags re-segments it internally)
    #all_words_dict = {}
    all_words = ''.join(word for word_list in train_data_list for word in word_list)
    # The frequency sort from the original version is no longer needed:
    #all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    #all_words_list = list(zip(*all_words_tuple_list))[0]
    return all_words, train_data_list, test_data_list, train_class_list, test_class_list
# Select feature words by frequency (not called in this modified version;
# the feature words now come from jieba.analyse.extract_tags in main)
def words_dict(all_words_list, deleteN, stopwords_set=set()):
    feature_words = []
    n = 1
    for t in range(deleteN, len(all_words_list)):
        if n > 2000:  # cap feature_words at 2000 dimensions
            break
        # keep words that are not pure digits, not stop words, and 2-4 characters long
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
            n += 1
    return feature_words
def TextFeatures(train_data_list, test_data_list, feature_words):
    # Note the nested Python function here;
    # look up inner functions / closures if the pattern is unfamiliar
    def text_features(text, feature_words):
        text_words = set(text)
        # binary bag-of-words: 1 if the feature word occurs in the document, otherwise 0
        features = [1 if word in text_words else 0 for word in feature_words]
        return features
    train_feature_list = [text_features(text, feature_words) for text in train_data_list]
    test_feature_list = [text_features(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list
def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
    ## sklearn classifier
    ## Once you understand this code, you can try swapping many other sklearn estimators in here
    classifier = MultinomialNB().fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)
    return test_accuracy
if __name__ == '__main__':
    print('STARTING TIME : ', time.asctime(time.localtime(time.time())))
    # Text preprocessing
    folder_path = './文檔'
    all_words, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
    # Build stopwords_set (not needed in this version)
    #stopwords_file = './stopwords.txt'
    #stopwords_set = MakeWordsSet(stopwords_file)
    ## Feature extraction and classification
    #feature_words = words_dict(all_words_list, 20, stopwords_set)
    feature_words = jieba.analyse.extract_tags(all_words, topK=1500)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    print('accuracy : ', test_accuracy * 100, '%')
    print('ENDING TIME : ', time.asctime(time.localtime(time.time())))
    print("finished")