賽題通道(進入)
閒聊:
這是我第三次參加大數據比賽,也是第一次接觸大數據比賽的自然語言處理,下面把現在的代碼寫成博客保存一下,代碼還在不斷優化中。。。
正題:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import jieba #利用結巴分詞功能進行有效的分詞
import re #正則表達式相關的庫
from random import shuffle
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
# df holds the raw training data read from file
df = pd.read_csv('train.csv')
# sub_list: the sentiment subject classes (10 in this dataset)
sub_list = list(df['subject'].unique())
# sub_word_dict maps each subject to the segmented words of its texts.
# BUG FIX: the original used dict(zip(sub_list, [[]]*10)), which makes every
# key share ONE list object (and hard-codes the class count); a dict
# comprehension gives each subject its own list.
sub_word_dict = {sub: [] for sub in sub_list}
# Hand-curated stop-word file (author notes it does not work well).
# BUG FIX: readline() only read the FIRST line; read() collects words from
# every line.  NOTE(review): no encoding is given — relies on the platform
# default; confirm the file's encoding (likely utf-8 or gbk).
with open('stop_word3.txt', 'r') as fp:
    stop_word = fp.read().split()
#提取字符串中的中文
def Translate(str):
pat = re.compile(u"[^\\u4e00-\\u9fa5]")
result = ''.join(pat.split(str))
return result
# For every subject class: gather its texts, keep the Chinese characters,
# segment with jieba (precise mode) and store the word list in the dict.
for item in sub_list:
    rows = list(df[df['subject'] == item]['content'])
    joined = ''.join(rows)
    sub_word_dict[item] = list(jieba.cut(Translate(joined), cut_all=False))
#
# Score each word with a chi-square statistic and keep the best `number`.
def Jieba_feature(sub_word_dict, number, min_count=17, max_count=1506):
    """Select the `number` most informative words by chi-square score.

    sub_word_dict maps a subject label to the list of segmented words of
    all documents in that class.  Words whose corpus-wide frequency falls
    outside [min_count, max_count] get score 0 (too rare / too common to
    discriminate); the cutoffs were hard-coded as 17 and 1506 in the
    original and are now defaulted parameters.
    Returns {word: True} — the feature shape nltk classifiers expect.
    """
    word_fd = {}                          # corpus-wide word frequencies
    con_word_fd = ConditionalFreqDist()   # per-subject word frequencies
    con_word_count = {}                   # total word count per subject
    for sub in sub_word_dict:
        for word in sub_word_dict[sub]:
            word_fd[word] = word_fd.get(word, 0) + 1
            con_word_fd[sub][word] += 1
        con_word_count[sub] = con_word_fd[sub].N()
    total_word_count = sum(con_word_count.values())
    # Sort by frequency (descending) so that score ties later keep the
    # same ordering the original produced.
    word_fd = dict(sorted(word_fd.items(), key=lambda kv: kv[1], reverse=True))
    word_scores = {}  # word -> summed chi-square information score
    for word, freq in word_fd.items():
        word_scores[word] = 0
        if freq < min_count or freq > max_count:
            continue
        for sub in sub_word_dict:
            # chi_sq(n_ii, (n_ix, n_xi), n_xx): association of `word`
            # with subject `sub` against the whole corpus.
            word_scores[word] += BigramAssocMeasures.chi_sq(
                con_word_fd[sub][word],
                (freq, con_word_fd[sub].N()),
                total_word_count)
    # Rank by information score and keep the top `number` words.
    best_vals = sorted(word_scores.items(),
                       key=lambda item: item[1],
                       reverse=True)[:number]
    return {word: True for word, _ in best_vals}
# Build the candidate vocabulary: segment the sentiment-word column, then
# take the top-200 chi-square feature words as the final word list.
temp_list = [str(entry) for entry in df['sentiment_word']]
temp_str = ''.join(temp_list)
temp_word = set(jieba.cut(Translate(temp_str), cut_all=False))
word_list = list(set(Jieba_feature(sub_word_dict, 200)))
# Convert the dataframe into [[feature_dict, label], ...] training pairs.
def GetData(df, word_list, sub_list):
    """Build nltk-style training pairs from *df*.

    Each row's content is segmented; words that appear in *word_list*
    become features.  *sub_list* is unused but kept for interface
    compatibility.  Returns [[{word: 'True', ...}, subject], ...].
    """
    vocab = set(word_list)  # set membership is O(1) vs O(n) list scans
    train_list = []
    for index in range(df.shape[0]):
        content = df['content'][index]
        subject = df['subject'][index]
        word_vec = {}
        for word in jieba.cut(Translate(content), cut_all=False):
            if word in vocab:
                # NOTE(review): the string 'True' (not the bool True) is
                # deliberately preserved from the original encoding.
                word_vec[word] = 'True'
        train_list.append([word_vec, subject])
    return train_list
def GetValueData(data, df):
    """Return a copy of *data* with labels replaced by df['sentiment_value'].

    BUG FIX: the original used data.copy(), a SHALLOW copy — the inner
    [features, label] lists were shared, so assigning data2[i][1] silently
    overwrote the caller's labels too.  Copying each row keeps *data*
    untouched.
    """
    data2 = [list(row) for row in data]
    for i in range(df.shape[0]):
        data2[i][1] = df['sentiment_value'][i]
    return data2
# Build the trainable dataset: [[feature_dict, subject], ...]
data = GetData(df, word_list, sub_list)
# The task predicts two targets (a sentiment word/subject and a sentiment
# value); uncomment below to switch the label to the sentiment value.
#for i in range(df.shape[0]):
# data[i][1] = df['sentiment_value'][i]
import sklearn
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV,RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
def Score(classifier, data, train_size=6947):
    """Shuffle *data*, train *classifier* on the first *train_size* rows
    and return accuracy on the remainder.

    NOTE: shuffle mutates *data* in place (as the original did), so each
    call evaluates on a different split.  The split index 6947 (the
    original hard-coded training-set size) is now a defaulted parameter.
    """
    shuffle(data)
    train_data = data[:train_size]
    test_data = data[train_size:]
    test_x, test_y = zip(*test_data)
    wrapped = SklearnClassifier(classifier)  # nltk wrapper over sklearn
    wrapped.train(train_data)
    pred = np.array(wrapped.classify_many(test_x))
    test_y = np.array(test_y)
    # Fraction of correct predictions.
    return sum(pred == test_y) / len(test_y)
# XGBoost classifier for the subject task; every argument except
# learning_rate=0.36 (and silent=0, which enables log output) matches the
# library defaults spelled out explicitly.
bst_subject = XGBClassifier(max_depth=3,
learning_rate=0.36,
n_estimators=100,
silent=0,
objective='multi:softmax',
booster='gbtree',
n_jobs=1,
nthread=None,
gamma=0,
min_child_weight=1,
max_delta_step=0,
subsample=1,
colsample_bytree=1,
colsample_bylevel=1,
reg_alpha=0,
reg_lambda=1,
scale_pos_weight=1,
base_score=0.5,
random_state=0)
print('--------------------------subject-----------------------')
# Compare several classifiers on the subject task.  Note that Score
# reshuffles the data on every call, so each model sees a different split.
print('BernoulliNB`s accuracy is %f' %Score(BernoulliNB(), data))
print('MultinomiaNB`s accuracy is %f' %Score(MultinomialNB(), data))
#print('XGBClassifier`s accuracy is %f' %Score(bst_subject, data))
print('RidgeClassifier`s accuracy is %f' %Score(RidgeClassifier(), data))
# BUG FIX: this line printed the plain LogisticRegression label while
# actually scoring LogisticRegressionCV.
print('LogisticRegressionCV`s accuracy is %f' %Score(LogisticRegressionCV(), data))
print('LogisticRegression`s accuracy is %f' %Score(LogisticRegression(), data))
#print('SVC`s accuracy is %f' %Score(SVC(), data))
print('LinearSVC`s accuracy is %f' %Score(LinearSVC(), data))
# In testing, logistic regression gave the highest accuracy at this stage.
--------------------------subject-----------------------
BernoulliNB`s accuracy is 0.718667
MultinomiaNB`s accuracy is 0.707000
RidgeClassifier`s accuracy is 0.739333
LogisticRegression`s accuracy is 0.739000
LogisticRegression`s accuracy is 0.721333
LinearSVC`s accuracy is 0.720333