【Natural Language Processing】Comment Classification in a Community Question Answering System

1. Task Requirements and Environment

        This experiment addresses subtask A of the English portion of SemEval-2015 Task 3. Each question in the community question answering system comes with data such as its posting date, the author's ID, and at least one comment; we need to classify each comment as Good, Bad, or Potential (potentially useful) according to its relevance to the question.

        Environment: Anaconda2-4.3.1 (Python 2.7), gensim, NLTK, and a few other commonly used modules.

2. Text Cleaning

        Since the given text contains stop words and many words appear in several inflected forms, we used NLTK to preprocess the text as follows:

        ① Lemmatization: reduce each word in the text to its base form;

        ② Stemming: extract the stem or root form of each word;

        ③ Stop-word removal: remove the stop words contained in the text.

# -*- coding: utf-8 -*-

import re
import os
from os.path import isfile, join
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup as BS


def nltkProcess(sentence):
    if not sentence:
        print("Empty sentence in nltkProcess()!")
        exit()
        
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)

    # Lower-case the tokens and drop English stop words.
    stops = set(stopwords.words('english'))
    noStopwords = [w.lower() for w in tokens if w.lower() not in stops]

    # Lemmatize each token with WordNet.
    lemmatizer = WordNetLemmatizer()
    lmtzr = [lemmatizer.lemmatize(w) for w in noStopwords]

    # Porter stemmer: strip the inflectional endings of English words.
    stemmer = PorterStemmer()
    stem = [stemmer.stem(w) for w in lmtzr]
    return stem


def pro_question(question, outputPath):
    if not question:
        print("Empty question in question()!")
        exit()
    if isfile(outputPath):
        print("Invalid output file path! Directory expected!")
        exit()

    qid = question[ 'qid' ]
    qcategory = question[ 'qcategory' ]
    quserid = question[ 'quserid' ]
    qtype = question[ 'qtype' ]
    qgold_yn=question['qgold_yn']

    qsubject = str(question.qsubject.get_text())
    qbody = str(question.qbody.get_text())
    if qsubject not in qbody:
        qbody = qsubject + " " + qbody
    qbody = nltkProcess(qbody)

    os.mkdir(os.path.join(outputPath,qid))
    output = os.path.join(outputPath,qid)
    result = open(join(output, qid), "w+")

    for word in qbody:
        result.write(word + " ")
    result.close()
    print("qid:" + qid + "done!")
    return qid, qcategory, quserid, qtype,qgold_yn

def pro_comment(question, qid, outputPath, hasUrl, prefix):
    if not question:
        print("Empty question in comment()!")
        exit()
    if isfile(outputPath):
        print("Invalid output file path! Directory expected!")
        exit()

    cid = []
    cuserid = []
    cgold=[]
    cgold_yn=[]
    result1 = open(hasUrl, "a+")
    result2 = open(prefix, "a+")
    commentSet = [a for a in question.find_all('comment')]
    for each in commentSet:
        cid.append(each['cid'])
        cuserid.append(each['cuserid'])
        cgold.append(each[ 'cgold' ])
        cgold_yn.append(each[ 'cgold_yn' ])
        
        csubject = str(each.csubject.get_text())
        cbody = str(each.cbody.get_text())
        if csubject not in cbody:
            cbody = csubject + " " + cbody

        urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cbody)
        if len(urls):
            result1.write(each['cid'] + "\t" + "1" + "\n")
            result2.write(each['cid'] + "\n")
        else:
            result1.write(each['cid'] + "\t" + "0" + "\n")
            result2.write(each['cid'] + "\n")
        cbody = nltkProcess(cbody)

        output = os.path.join(outputPath, qid)
        result = open(join(output, each['cid']), "a+")


        for word in cbody:
            result.write(word + " ")
        result.close()
    return cid, cuserid,cgold,cgold_yn

def process(inputFile, outputPath, qcInfo, hasUrl, prefix):
    result = open(qcInfo, "w+")
    with open(inputFile, "r") as fileContent:
        content = BS(fileContent.read(), "lxml")
        categorySet = {}
        questionSet = [a for a in content.find_all('question')]
        for each in questionSet:
            if each['qtype']!='YES_NO':
                continue
            qid, qcategory, quserid, qtype,qgold_yn = pro_question(each, outputPath)
            if qcategory not in categorySet:
                categorySet[qcategory] = 1
            else:
                categorySet[qcategory] += 1

            cid, cuserid,cgold,cgold_yn = pro_comment(each, qid, outputPath, hasUrl, prefix)
            
            #write qcInfo file
            result.write(qid + "\t" + qcategory + "\t" + quserid + "\t" + qtype + "\t"+qgold_yn+"\n")
            if not cid:
                print("cid empty!")
                exit()
            for i in range(len(cid)):
                result.write(cid[i] + "\t" + cuserid[i] + "\t"+cgold[i]+"\t"+cgold_yn[i]+"\n")
    result.close()
            

if __name__ == '__main__':
    input_file='../SemEval_2015_Task_3/method1/data/dataset/CQA-QL-devel.xml'
    output_path='../SemEval_2015_Task_3/method1/data/pre/dev'
    qcInfo_file='../SemEval_2015_Task_3/method1/data/qcInfo/qcInfo.dev'
    hasUrl_file='../SemEval_2015_Task_3/method1/data/url/hasUrl_dev.txt'
    prefix_file='../SemEval_2015_Task_3/method1/data/prefix/prefix_dev.txt'
    process(input_file,output_path,qcInfo_file,hasUrl_file,prefix_file)

3. Feature Extraction

        In this section we design features based on standard machine-learning practice, according to the needs of the two subtasks: for subtask A, whether a comment was posted by the question author; for subtask B, the counts of Yes-likely, No-likely, and Unsure-likely words; and, shared by both tasks, features such as topic models. The details are given below.

3.1  Bag-of-words model

        The basic idea of the bag-of-words model is to ignore word order, grammar, and syntax and treat the text simply as a collection of words, each of which is independent; the text is ultimately represented as a vector.

        Here the words of a question and of a comment form one bag, so the dimensionality of the question and comment vectors is the total number of distinct words in that question and comment. Using each word's index in the bag, both the question and the comment can be represented as vectors, where the value of each dimension is the number of times the corresponding word occurs in that text.

Corresponding code: bogOfWords.py

#-*- coding:utf-8 -*-

import os
import sys
import copy

class BOW:
    def __init__(self, s1, s2):
        if not (s1 and s2):
            print("Empty s1 or s2 in Class BOW __init__()!")
            exit()         
        
        self.allBag = {}
        list1 = s1.strip().split(" ")
        list2 = s2.strip().split(" ")
        
        for w in list1:
            if w not in self.allBag:
                self.allBag[w] = 0               
        for w in list2:
            if w not in self.allBag:
                self.allBag[w] = 0
        
        self.bag1 = copy.deepcopy(self.allBag)
        self.bag2 = copy.deepcopy(self.allBag)
        for w in list1:
            self.bag1[w] += 1
        for w in list2:
            self.bag2[w] += 1
        #print self.bag1
        #print self.bag2
        
    def getVector(self):
        vector1 = []
        vector2 = []
        for word in self.allBag:
            vector1.append(self.bag1[word])
            vector2.append(self.bag2[word])
        return vector1,vector2
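
        As an illustration, here is a minimal usage sketch; the two cleaned strings below are invented for this example, and each returned vector has one dimension per distinct word in the shared bag:

if __name__ == '__main__':
    # Hypothetical cleaned question / comment strings, purely for illustration.
    q = "renew work visa qatar"
    c = "go immigration office renew visa"
    bow = BOW(q, c)
    v1, v2 = bow.getVector()
    print(v1)  # word counts of the question over the shared vocabulary
    print(v2)  # word counts of the comment over the same vocabulary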

3.2   Vector-based similarity

        This is similar to the previous subsection, except that we use Google's word2vec to obtain a vector for each word in the question and the comment, sum and average them, and then take the cosine of the two averaged vectors as the feature value. After downloading the pretrained vector file, this can be implemented with gensim.
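
        A rough sketch of this feature is shown below, assuming the pretrained Google News vectors (GoogleNews-vectors-negative300.bin) have been downloaded; the file path and the example sentences are assumptions for this sketch, and words missing from the model's vocabulary are simply skipped:

import numpy as np
from gensim.models import KeyedVectors

# Path to the downloaded pretrained model (an assumption for this sketch).
w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

def avg_vector(words):
    # Average the word2vec vectors of all in-vocabulary words.
    vecs = [w2v[w] for w in words if w in w2v]
    if not vecs:
        return np.zeros(w2v.vector_size)
    return np.mean(vecs, axis=0)

def cos_sim(v1, v2):
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0

q_vec = avg_vector("renew work visa qatar".split())
c_vec = avg_vector("go immigration office renew visa".split())
print(cos_sim(q_vec, c_vec))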

3.3  Topic model

        A topic model is, as the name suggests, a way of modeling the topics hidden in text. A topic is a concept representing a set of related words; formally it is a conditional probability distribution over the vocabulary: the more closely a word is related to the topic, the higher its conditional probability, and vice versa.

        Here we build a model over each question together with all of its comments, setting the number of topics to the number of comments on that question. As before, we take the cosine between each comment and the question as the feature value.

Corresponding code: get_TopicModel.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import io
import os
from nltk.tokenize import RegexpTokenizer
from gensim import corpora
from gensim import models
from os import listdir
from os.path import isdir, join

def GetFilesInFolder(ParentFolder):
    # Return the names of all files under the given directory.
    filenameList = []
    for filename in os.listdir(ParentFolder):
        filenameList.append(filename)
    return filenameList

def get():
    result1 = open('../SemEval_2015_Task_3/method1/data/topicModel/DevTopics.txt', 'w')
    result2 = open('../SemEval_2015_Task_3/method1/data/topicModel/DevLSI.txt', "w+")
    ParentFolder = '../SemEval_2015_Task_3/method1/data/pre/dev'
    files = [ f for f in listdir(ParentFolder) if isdir(join(ParentFolder, f)) ]
    for k,directory in enumerate(files):
        path = os.path.join(ParentFolder, directory)
        filenameList = GetFilesInFolder(path)
        sentences = [ ]
        for fileName in filenameList:
            # io.open accepts the encoding argument under both Python 2 and 3,
            # and the file handle is closed once the lines are read.
            with io.open(join(path, fileName), "r", encoding='utf-8') as f:
                sentences.append(f.readlines())

        words = [ ]
        for doc in sentences:
            tokenizer = RegexpTokenizer(r'\w+')
            a = list(tokenizer.tokenize(str(doc)))
            words.append(a)
        dic = corpora.Dictionary(words)
        corpus = [ dic.doc2bow(text) for text in words ]

        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[ corpus ]

        lsi = models.LsiModel(corpus_tfidf, id2word=dic, num_topics=len(filenameList))
        corpus_lsi = lsi[ corpus_tfidf ]

        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=len(filenameList))
        corpus_lda = lda[ corpus_tfidf ]

        for (Q, LSI, LDA) in zip(filenameList, corpus_lsi, corpus_lda):
            result1.write(Q + '\t')
            result2.write(Q + '\t')
            if len(LDA) == 0:
                result1.write('0' + '\t')
            else:
                for topic in LDA:
                    result1.write(str(topic[1]) + '\t')
            if len(LSI) == 0:
                result2.write('0' + '\t')
            else:
                for topic in LSI:
                    result2.write(str(topic[1]) + '\t')
            result1.write('\n')
            result2.write('\n')
        print(k)
    result1.close()
    result2.close()



if __name__ == '__main__':
    get()

3.4  TF-IDF

        TF-IDF (term frequency-inverse document frequency) is a common weighting technique in information retrieval and text mining. It evaluates how important a word is to one document in a collection or corpus: a word's importance increases in proportion to the number of times it appears in the document, but decreases with the frequency at which it appears across the corpus.

        TF (term frequency) is the frequency of a term within a document: (number of occurrences of the term in the document) / (total number of terms in the document); the larger this value, the more important the term.

        IDF (inverse document frequency): within a document collection, the fewer documents a term appears in, the better it distinguishes them from the other documents. It is computed as log((total number of documents / number of documents containing the term) + 0.01).

        The TF-IDF weight is simply the product of the two values (tf * idf), which we use as a feature. For a concrete implementation see: 【Natural Language Processing】TF-IDF and its Python implementation.
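
        A minimal sketch of this weighting scheme (not the implementation from the linked article) could look like the following, with tf = count / document length and idf = log(N / df + 0.01) as defined above; the toy documents are made up for illustration:

import math

def tf_idf(docs):
    # docs: a list of token lists; returns one {word: tf*idf} dict per document.
    n_docs = len(docs)
    df = {}
    for doc in docs:
        for w in set(doc):
            df[w] = df.get(w, 0) + 1
    weights = []
    for doc in docs:
        scores = {}
        for w in set(doc):
            tf = doc.count(w) / float(len(doc))
            idf = math.log(float(n_docs) / df[w] + 0.01)
            scores[w] = tf * idf
        weights.append(scores)
    return weights

# Toy example with two tiny "documents".
print(tf_idf([["renew", "visa", "visa"], ["visa", "office"]]))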

3.5  Whether the comment was posted by the question author

        This feature indicates whether a comment was posted by the asker: 1 if so, 0 otherwise. It is simple enough that the original script is not included (the same applies to the next feature); a one-line sketch is given below for reference.
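
        A minimal sketch, assuming the quserid of the question and the cuserid of each comment have already been read back from the qcInfo file written during preprocessing:

def asker_feature(quserid, cuserid):
    # 1 if the comment was posted by the question author, 0 otherwise.
    return 1 if cuserid == quserid else 0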

3.6  Whether the comment contains a hyperlink

        1 if it does, 0 otherwise (see the URL check written to the hasUrl file in pro_comment above).

3.7  Final features

        After the processing above we obtain, for each feature, a pair of vectors corresponding to the question and the comment. In general, the more similar a comment is to its question, the smaller the angle between the two vectors, so we take the cosine of the two vectors as the feature value.
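
        For example, a small helper consistent with the vector pairs returned above (such as BOW.getVector()) is sketched below; the same function applies to the topic-model and averaged word2vec vectors:

import math

def cosine(v1, v2):
    # Cosine of the angle between two equal-length feature vectors.
    dot = sum(a * b for a, b in zip(v1, v2))
    norm = math.sqrt(sum(a * a for a in v1)) * math.sqrt(sum(b * b for b in v2))
    return dot / norm if norm else 0.0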

4. Classification Models

        Three machine-learning classifiers were used in this experiment on subtask A; the algorithms and parameters are as follows:

        1. DecisionTreeClassifier(max_depth=3)

        2. SVM(kernel='rbf', gamma=0.001, C=100)

        3. RandomForestClassifier(n_estimators=100, random_state=10)

Corresponding code: Classification.py

# -*- coding: utf-8 -*-

import numpy as np
import os,sys
from sklearn import tree
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

class Classification():
    def __init__(self, classifier, trainFile, devFile):
        f = open(trainFile)
        data = np.loadtxt(f)
        
        # select columns 1 through end
        X_train = data[:, 1:]  
        y_train = data[:, 0] 
        f.close()
            
        f = open(devFile)
        pre = np.loadtxt(f)
        X_test = pre[:, 1:]
        
        # min_max_scaler = MinMaxScaler()
        # X_train = min_max_scaler.fit_transform(X_train)
        # X_test = min_max_scaler.fit_transform(X_test)
        
        # Impute NaN / infinite values in the test features;
        # otherwise clf.predict() will raise an error.
        X_test = Imputer().fit_transform(X_test) 
        if np.isnan(X_test).any():
            print("nan in X_test!")
            exit()
            
        self.y_test = pre[:, 0] 
        f.close()
        
        if classifier == 'tree':
            #max_depth = 4 is best
            # max_depth = np.arange(1, 10)
            # clf = GridSearchCV(tree.DecisionTreeClassifier(), param_grid = {'max_depth': max_depth})
            clf = tree.DecisionTreeClassifier(max_depth = 3)
            
        elif classifier == 'knn':
            #n_neighbors = 9 is best
            #but default is 5, better than 9?
            # n_neighbors = np.arange(1, 10)  
            # clf = GridSearchCV(KNeighborsClassifier(), param_grid = {'n_neighbors': n_neighbors})
            clf = KNeighborsClassifier(n_neighbors = 15)
            
        elif classifier == 'svm':
            #{'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
            # param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}]
            # clf = GridSearchCV(svm.SVC(), param_grid)
            clf = svm.SVC(kernel='rbf', gamma = 0.01, C = 200)
            
        elif classifier == 'gbdt':
            # max_depth = np.arange(1, 10)
            # n_estimators = [10, 100, 1000]
            # learning_rate = [0.1, 0.2, 0.3, 0.4, 0.5]
            # clf = GridSearchCV(GradientBoostingClassifier(), param_grid = {'max_depth': max_depth, 'n_estimators': n_estimators, 'learning_rate': learning_rate})
            clf = GradientBoostingClassifier(n_estimators = 100, max_depth = 5)
            
        elif classifier == 'essemble':
            #{'n_estimators': 10, 'max_depth': 6}
            # max_depth = np.arange(1, 10)
            # n_estimators = [10, 100, 1000]
            # clf = GridSearchCV(RandomForestClassifier(), param_grid = {'max_depth': max_depth, 'n_estimators': n_estimators})
            clf = RandomForestClassifier(n_estimators = 1000, random_state=15325)
            
        elif classifier == 'nb':
            clf = MultinomialNB()
            print(clf)
        else:
            print("Invalid classifier in Class Classification __init__()!")
            exit()
        
        clf.fit(X_train, y_train) 
        #print clf.best_params_
        
        self.y_pred = clf.predict(X_test)
        
        '''
        #test usage!
        self.calculate_result(self.y_test, self.y_pred)
        print( "ACC:  %f " %accuracy_score(y_test,y_pred))
        '''
        
    def getPreResult(self, outputFile):
        fout = open(outputFile,"w+")
        for each in self.y_pred:
            eachint = int(each)
            fout.write(str(eachint)+"\n")
        fout.close()        
   
    #this function is used for testing!
    def calculate_result(self):  
        m_precision = metrics.precision_score(self.y_test, self.y_pred)
        m_recall = metrics.recall_score(self.y_test, self.y_pred) 
        print('precision:{0:.3f}'.format(m_precision))
        print('recall:{0:0.3f}'.format(m_recall))
        print('f1-score:{0:.3f}'.format(metrics.f1_score(self.y_test, self.y_pred)))


if __name__ == "__main__":
    if len(sys.argv) < 5:
        print("sys.argv[1]: classifier")
        print("sys.argv[2]: trainFile")
        print("sys.argv[3]: devFile")
        print("sys.argv[4]: outputFile")
        exit()

    cfInstance = Classification(sys.argv[ 1 ], sys.argv[ 2 ], sys.argv[ 3 ])
    cfInstance.getPreResult(sys.argv[ 4 ])
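
        As the __main__ block and the np.loadtxt calls above suggest, the script is invoked as, for example, python Classification.py tree trainFile devFile outputFile, where the first column of the train and dev feature files holds the class label and the remaining columns hold the features described in Section 3.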

5. Results and Analysis

Model                       Accuracy
RandomForestClassifier      61.33%
Support Vector Machine      62.06%
DecisionTreeClassifier      62.43%

        The best result is 62.43%. Analysis of the experiment suggests the following conclusions. Only a small number of text features were extracted, in particular TF-IDF and bag-of-words style features, which do capture the characteristics of the text reasonably well. Many more features could still be added: for example, starting from lexical cues such as the number or proportion of words like "thank" and "yes"/"no" in a comment, since such words reflect the sentiment of the whole sentence and are therefore worth extracting as features. In addition, only a few classic machine-learning algorithms were used and no careful hyperparameter tuning was done; careful tuning or trying other algorithms could leave further room for improvement.

       Note: Quite a lot of time has passed and I have not bothered to tidy things up, so the code is rather messy; it is intended only as a guide to the overall approach.







