訓練賽-汽車行業用戶觀點主題及情感識別

賽題通道(進入)

閒聊:
這是我第三次參加大數據比賽,也是第一次接觸大數據比賽的自然語言處理,下面把現在的代碼寫成博客保存一下,代碼還在不斷優化中。。。

正題:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import jieba      #利用結巴分詞功能進行有效的分詞
import re          #正則表達式相關的庫
from random import shuffle
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

# Load the raw training data.
df = pd.read_csv('train.csv')
# sub_list holds the unique sentiment-topic labels (10 classes in this contest).
sub_list = list(df['subject'].unique())
# Map each topic to its own, independent token list.
# BUG FIX: the original used dict(zip(sub_list, [[]]*10)), which creates ten
# references to ONE shared list (and hard-codes the class count); a dict
# comprehension gives every key a distinct list and adapts to the data.
sub_word_dict = {sub: [] for sub in sub_list}

# Hand-curated stop-word file: a single whitespace-separated line.
# (The author notes its effect is limited.)  Explicit UTF-8 so the Chinese
# words decode the same regardless of the platform's locale encoding.
with open('stop_word3.txt', 'r', encoding='utf-8') as fp:
    stop_word = fp.readline().split()

# Matches any character OUTSIDE the CJK Unified Ideographs range; compiled
# once at module level instead of on every call.
_NON_CHINESE = re.compile(u"[^\u4e00-\u9fa5]")

def Translate(text):
    """Return only the Chinese (CJK) characters of *text*.

    The original took a parameter named ``str`` (shadowing the builtin) and
    rebuilt ``''.join(pat.split(s))``, which is equivalent to deleting every
    non-CJK character -- ``sub`` expresses that directly.
    """
    return _NON_CHINESE.sub('', text)

# For every topic label: gather its reviews, strip non-Chinese characters,
# and segment the merged text with jieba (precise mode, cut_all=False).
for subject in sub_list:
    merged_text = ''.join(list(df[df['subject'] == subject]['content']))
    tokens = jieba.cut(Translate(merged_text), cut_all = False)
    sub_word_dict[subject] = list(tokens)

def Jieba_feature(sub_word_dict, number, min_freq=17, max_freq=1506):
    """Select the *number* most informative words by chi-square score.

    For every word, its chi-square association with each topic is summed
    across topics (nltk's ``BigramAssocMeasures.chi_sq``); the top *number*
    words become the feature vocabulary.

    Parameters
    ----------
    sub_word_dict : dict mapping topic label -> list of segmented words
    number : how many top-scoring words to keep
    min_freq, max_freq : frequency band for candidate words; words outside
        it keep a score of 0 (the original hard-coded 17 and 1506 -- they
        are now parameters with the same defaults, so callers are unchanged).

    Returns
    -------
    dict mapping each selected word to True (a feature template).
    """
    # Global word frequencies across all topics.
    word_fd = {}
    # Per-topic word frequencies.
    con_word_fd = ConditionalFreqDist()
    # Total token count per topic.
    con_word_count = {}

    for sub, words in sub_word_dict.items():
        for word in words:
            word_fd[word] = word_fd.get(word, 0) + 1
            con_word_fd[sub][word] += 1
        # The original wrote con_word_count.get(sub, temp_num), which on a
        # first (and only) visit is just an assignment.
        con_word_count[sub] = con_word_fd[sub].N()
    total_word_count = sum(con_word_count.values())

    # Chi-square score per word.  (The original re-sorted word_fd by
    # frequency here purely for a debug print; the scoring does not need it.)
    word_scores = {}
    for word, freq in word_fd.items():
        word_scores[word] = 0
        # Very rare and very common words carry little discriminative
        # information -- leave their score at 0.
        if freq < min_freq or freq > max_freq:
            continue
        for sub in sub_word_dict:
            word_scores[word] += BigramAssocMeasures.chi_sq(
                con_word_fd[sub][word],
                (freq, con_word_fd[sub].N()),
                total_word_count)

    # Keep the *number* highest-scoring words as boolean feature keys.
    best_vals = sorted(word_scores.items(),
                       key=lambda item: item[1],
                       reverse=True)[:number]
    return {word: True for word, _score in best_vals}

# Tokenise the annotated sentiment words (an alternative vocabulary; the
# toggle below is left commented out).  str() handles NaN entries the same
# way the original element-by-element loop did ('nan').
temp_str = ''.join(str(value) for value in df['sentiment_word'])
temp_word = set(jieba.cut(Translate(temp_str), cut_all = False))

# Feature vocabulary: the 200 words with the highest chi-square scores.
word_list = list(set(Jieba_feature(sub_word_dict, 200)))
#word_list = list(temp_word)

def GetData(df, word_list, sub_list):
    """Convert the dataframe into [feature-dict, subject-label] pairs.

    Each row's content is stripped to Chinese characters, segmented with
    jieba, and filtered against the feature vocabulary.

    Parameters
    ----------
    df : dataframe with 'content' and 'subject' columns (RangeIndex assumed,
        since rows are addressed by positional label).
    word_list : iterable of feature words.
    sub_list : unused; kept so existing callers' signatures still match.

    Returns
    -------
    list of [dict, subject] pairs, one per dataframe row.
    """
    # Hoist the vocabulary into a set: O(1) membership per token instead of
    # an O(n) list scan inside the segmentation loop.
    vocab = set(word_list)
    train_list = []
    for index in range(df.shape[0]):
        fen_ci = jieba.cut(Translate(df['content'][index]), cut_all = False)
        # NOTE: the feature value is the *string* 'True', exactly as in the
        # original -- downstream vectorization depends on it; do not change
        # it to the boolean True without re-validating the pipeline.
        word_vec = {word: 'True' for word in fen_ci if word in vocab}
        train_list.append([word_vec, df['subject'][index]])
    return train_list

def GetValueData(data, df):
    """Return a copy of *data* whose labels are replaced by sentiment_value.

    BUG FIX: the original used ``data.copy()``, a *shallow* copy -- the inner
    ``[features, label]`` lists were shared, so assigning ``data2[i][1]``
    silently overwrote the labels of the caller's ``data`` as well.  Copying
    each row gives an independent result.

    Parameters
    ----------
    data : list of [feature-dict, label] pairs (as produced by GetData).
    df : dataframe providing the 'sentiment_value' column, aligned by row.
    """
    data2 = [row[:] for row in data]
    for i in range(df.shape[0]):
        data2[i][1] = df['sentiment_value'][i]
    return data2

# Build the (features, subject) training pairs for the classifiers below.
data = GetData(df, word_list, sub_list)
# The task has TWO targets (subject and sentiment value); to train on
# sentiment_value instead, swap the labels in place:
#for i in range(df.shape[0]):
#    data[i][1] = df['sentiment_value'][i]

import sklearn
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV,RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report



def Score(classifier, data, train_size=6947):
    """Train *classifier* on a shuffled split of *data*, return test accuracy.

    WARNING: ``shuffle`` mutates the caller's ``data`` list in place, so
    successive calls evaluate different splits.

    Parameters
    ----------
    classifier : an (unfitted) scikit-learn estimator.
    data : list of [feature-dict, label] pairs.
    train_size : rows used for training; the remainder is the test set.
        Default 6947 matches the original hard-coded split.
    """
    shuffle(data)
    train_data = data[:train_size]
    test_data = data[train_size:]
    test_x, test_y = zip(*test_data)
    # nltk's wrapper adapts feature-dict data to a sklearn estimator.
    model = SklearnClassifier(classifier)
    model.train(train_data)
    pred = np.array(model.classify_many(test_x))
    test_y = np.array(test_y)
    #print(classification_report(pred, test_y))
    return sum(pred == test_y) / len(test_y)

# XGBoost configuration for the subject task (its evaluation is currently
# commented out below); mostly library defaults except depth/learning rate.
bst_subject = XGBClassifier(
    max_depth=3,
    learning_rate=0.36,
    n_estimators=100,
    silent=0,
    objective='multi:softmax',
    booster='gbtree',
    n_jobs=1,
    nthread=None,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=0,
)


# Compare several classifiers on the subject-prediction task.
# NOTE: each Score() call reshuffles `data` in place, so every model is
# evaluated on a different random split -- the accuracies are only roughly
# comparable.
print('--------------------------subject-----------------------')
print('BernoulliNB`s accuracy is %f'  %Score(BernoulliNB(), data))
print('MultinomiaNB`s accuracy is %f'  %Score(MultinomialNB(), data))
#print('XGBClassifier1s accuracy is %f' %Score(bst_subject, data))
print('RidgeClassifier`s accuracy is %f' %Score(RidgeClassifier(), data))
print('LogisticRegression`s accuracy is %f' %Score(LogisticRegressionCV(), data))
print('LogisticRegression`s accuracy is  %f' %Score(LogisticRegression(), data))
#print('SVC`s accuracy is %f'  %Score(SVC(), data))
print('LinearSVC`s accuracy is %f'  %Score(LinearSVC(), data))

# Observed so far: logistic regression gives the highest accuracy at this stage.
--------------------------subject-----------------------
BernoulliNBs accuracy is 0.718667 MultinomiaNBs accuracy is 0.707000
RidgeClassifiers accuracy is 0.739333 LogisticRegressions accuracy is 0.739000
LogisticRegressions accuracy is 0.721333 LinearSVCs accuracy is 0.720333

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章