貝葉斯,SVM分類

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import jieba
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import os
def getListFiles(path):
    """Return the paths of all files under *path* whose names end with "txt".

    The directory tree is traversed with os.walk, so files in nested
    sub-directories are included. Each returned entry is the containing
    directory joined with the file name.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
        # Suffix check only: no dot is required before "txt".
        if name.endswith("txt")
    ]
def load_data(path):
    """Load tab-separated samples from every txt file found under *path*.

    Files are located with getListFiles and decoded as GBK. Each line is
    split on tab characters: the first field is used as the label and the
    second field as the text. The text keeps its trailing newline when it is
    the last field on the line.

    Args:
        path: Root directory that is searched recursively for txt files.

    Returns:
        A tuple ``(data_x, data_y)`` of two lists: the texts and their
        labels, in the order the files and lines were read.

    Raises:
        IndexError: If a non-blank line contains no tab separator.
    """
    data_x = []
    data_y = []
    for file_path in getListFiles(path):
        # The context manager closes the handle even if a line fails to parse.
        with open(file_path, 'r', encoding='gbk') as f:
            for line in f:
                fields = line.split('\t')
                # Blank lines (e.g. a trailing empty line) carry no sample.
                if len(fields) < 2 and not line.strip():
                    continue
                data_x.append(fields[1])
                data_y.append(fields[0])
    return data_x, data_y
def func(data_x,data_y):
    """Train a Naive Bayes text classifier on TF-IDF features and evaluate it.

    The samples are split into train and test sets (``random_state=1``),
    segmented with jieba, and vectorized with a TfidfVectorizer fitted on the
    training documents only. A MultinomialNB classifier is fitted on the
    training matrix and applied to the test matrix.

    Prints, in order: the fitted vocabulary, the training TF-IDF matrix, the
    predicted labels for the test set, and the score returned by
    ``classifier.score`` on the test set.

    Args:
        data_x: Sequence of raw text samples.
        data_y: Sequence of labels, aligned with ``data_x``.

    Returns:
        None.
    """
    x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, random_state=1)

    # Join the jieba tokens with spaces so each document is a single
    # space-delimited string for the vectorizer.
    train_docs = [' '.join(jieba.cut(text)) for text in x_train]
    test_docs = [' '.join(jieba.cut(text)) for text in x_test]

    # Fit the vectorizer once on the training documents and keep the
    # resulting matrix instead of re-fitting and re-transforming.
    vec = TfidfVectorizer()
    train_matrix = vec.fit_transform(train_docs)
    print(vec.vocabulary_)
    print(train_matrix)

    # Transform the test documents once; both scoring and prediction reuse it.
    test_matrix = vec.transform(test_docs)

    # Naive Bayes classifier; svm.SVC() can be substituted here to use an SVM.
    classifier = MultinomialNB()
    classifier.fit(train_matrix, y_train)

    score = classifier.score(test_matrix, y_test)
    result = classifier.predict(test_matrix)
    print(result)
    # The score was previously computed but never reported.
    print(score)
if __name__ == '__main__':
    # Script entry point: load the labelled corpus from the hard-coded
    # directory and run the train/evaluate routine on it.
    corpus_dir = r'C:\Users\Administrator\Desktop\語料\result\會議'
    func(*load_data(corpus_dir))

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章