from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import jieba
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import os
def getListFiles(path):
    """Recursively collect the paths of all ``.txt`` files under *path*.

    Parameters
    ----------
    path : str
        Root directory to walk.

    Returns
    -------
    list[str]
        Full paths of every file whose name ends in ``.txt``, in
        ``os.walk`` order.
    """
    ret = []
    for root, _dirs, files in os.walk(path):
        for name in files:
            # Match the ".txt" extension specifically; the original
            # endswith("txt") would also match names like "notes_txt".
            if name.endswith(".txt"):
                ret.append(os.path.join(root, name))
    return ret
def load_data(path):
    """Load labelled text samples from every .txt file under *path*.

    Each line of each file is expected to look like ``label<TAB>text``
    (GBK-encoded). Malformed lines without a tab are skipped.

    Parameters
    ----------
    path : str
        Root directory to search for .txt files.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(data_x, data_y)`` — texts and their corresponding labels,
        in file/line order.
    """
    data_x = []
    data_y = []
    for file_path in getListFiles(path):
        # 'with' guarantees the handle is closed; the original opened
        # every file and never closed it.
        with open(file_path, 'r', encoding='gbk') as f:
            for line in f:
                parts = line.split('\t')
                if len(parts) < 2:
                    # Skip lines with no tab instead of raising IndexError.
                    continue
                data_x.append(parts[1])
                data_y.append(parts[0])
    return data_x, data_y
def func(data_x, data_y):
    """Train and evaluate a Multinomial Naive Bayes text classifier.

    Splits the corpus into train/test sets, segments each document with
    jieba, builds TF-IDF features from the training split only, fits a
    ``MultinomialNB`` model, and reports its accuracy on the test split.

    Parameters
    ----------
    data_x : list[str]
        Raw (unsegmented) Chinese documents.
    data_y : list[str]
        Label for each document.

    Returns
    -------
    tuple[float, numpy.ndarray]
        ``(score, result)`` — test-set accuracy and the predicted labels.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        data_x, data_y, random_state=1)

    # jieba segments Chinese text; join tokens with spaces so the
    # vectorizer can split on whitespace.
    train_docs = [' '.join(jieba.cut(doc)) for doc in x_train]
    test_docs = [' '.join(jieba.cut(doc)) for doc in x_test]

    # Fit the TF-IDF vocabulary once on the training documents; the
    # original called fit() and then fit_transform() on the same data,
    # fitting the vectorizer twice for no benefit.
    vec = TfidfVectorizer()
    train_features = vec.fit_transform(train_docs)
    test_features = vec.transform(test_docs)

    classifier = MultinomialNB()
    classifier.fit(train_features, y_train)

    # The original computed the score into a misspelled variable
    # ('scoure') and never used it; report it alongside the predictions.
    score = classifier.score(test_features, y_test)
    result = classifier.predict(test_features)
    print(score)
    print(result)
    return score, result
if __name__ == '__main__':
    # Script entry point: load the labelled corpus from the fixed
    # directory, then train and evaluate the classifier on it.
    corpus_texts, corpus_labels = load_data(r'C:\Users\Administrator\Desktop\語料\result\會議')
    func(corpus_texts, corpus_labels)