【Natural Language Processing】Comment Classification in Community Question Answering Systems

1. Task Requirements and Environment

        This experiment covers Subtask A of the English part of SemEval-2015 Task 3. Each question in the community question answering system comes with data such as the posting date, the author's ID, and at least one comment; based on how relevant each comment is to its question, we need to classify the comment as Good, Bad, or Potential (potentially useful).

         Environment: Anaconda2-4.3.1 (Python 2.7), gensim, nltk, and some other commonly used modules.

2. Text Cleaning

        Since the provided text contains stop words and multiple inflected forms of the same word, we used NLTK to process it as follows:

        ① Lemmatization: reduce the words in the text to their base forms;

        ② Stemming: extract the stem or root form of each word;

        ③ Stop word removal: remove the stop words contained in the text.

# -*- coding: utf-8 -*-

import re
import os
from os.path import isfile, join
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup as BS


def nltkProcess(sentence):
    if not sentence:
        print("Empty sentence in nltkProcess()!")
        exit()
        
    tokenizer = RegexpTokenizer(r'\w+') 
    tokens = tokenizer.tokenize(sentence) 
    stopSet = set(stopwords.words('english'))
    noStopwords = [w.lower() for w in tokens if w.lower() not in stopSet]
    lmtzr = []
    for w in noStopwords:
        lmtzr.append(WordNetLemmatizer().lemmatize(w))
    #print lmtzr

    # Porter stemmer: strip the inflectional endings of English words
    stem = []
    for w in lmtzr:
        stem.append(PorterStemmer().stem(w))
    #print stem
    return stem


def pro_question(question, outputPath):
    if not question:
        print("Empty question in question()!")
        exit()
    if isfile(outputPath):
        print("Invalid output file path! Directory expected!")
        exit()

    qid = question[ 'qid' ]
    qcategory = question[ 'qcategory' ]
    quserid = question[ 'quserid' ]
    qtype = question[ 'qtype' ]
    qgold_yn=question['qgold_yn']

    qsubject = str(question.qsubject.get_text())
    qbody = str(question.qbody.get_text())
    if qsubject not in qbody:
        qbody = qsubject + " " + qbody
    qbody = nltkProcess(qbody)

    output = os.path.join(outputPath, qid)
    os.mkdir(output)
    result = open(join(output, qid), "w+")

    for word in qbody:
        result.write(word + " ")
    result.close()
    print("qid:" + qid + "done!")
    return qid, qcategory, quserid, qtype,qgold_yn

def pro_comment(question, qid, outputPath, hasUrl, prefix):
    if not question:
        print("Empty question in comment()!")
        exit()
    if isfile(outputPath):
        print("Invalid output file path! Directory expected!")
        exit()

    cid = []
    cuserid = []
    cgold=[]
    cgold_yn=[]
    result1 = open(hasUrl, "a+")
    result2 = open(prefix, "a+")
    commentSet = [a for a in question.find_all('comment')]
    for each in commentSet:
        cid.append(each['cid'])
        cuserid.append(each['cuserid'])
        cgold.append(each[ 'cgold' ])
        cgold_yn.append(each[ 'cgold_yn' ])
        
        csubject = str(each.csubject.get_text())
        cbody = str(each.cbody.get_text())
        if csubject not in cbody:
            cbody = csubject + " " + cbody

        urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cbody)
        if len(urls):
            result1.write(each['cid'] + "\t" + "1" + "\n")
            result2.write(each['cid'] + "\n")
        else:
            result1.write(each['cid'] + "\t" + "0" + "\n")
            result2.write(each['cid'] + "\n")
        cbody = nltkProcess(cbody)

        output = os.path.join(outputPath, qid)
        result = open(join(output, each['cid']), "a+")


        for word in cbody:
            result.write(word + " ")
        result.close()
    result1.close()
    result2.close()
    return cid, cuserid, cgold, cgold_yn

def process(inputFile, outputPath, qcInfo, hasUrl, prefix):
    result = open(qcInfo, "w+")
    with open(inputFile, "r") as fileContent:
        content = BS(fileContent.read(), "lxml")
        categorySet = {}
        questionSet = [a for a in content.find_all('question')]
        for each in questionSet:
            if each['qtype']!='YES_NO':
                continue
            qid, qcategory, quserid, qtype,qgold_yn = pro_question(each, outputPath)
            if qcategory not in categorySet:
                categorySet[qcategory] = 1
            else:
                categorySet[qcategory] += 1

            cid, cuserid,cgold,cgold_yn = pro_comment(each, qid, outputPath, hasUrl, prefix)
            
            #write qcInfo file
            result.write(qid + "\t" + qcategory + "\t" + quserid + "\t" + qtype + "\t"+qgold_yn+"\n")
            if not cid:
                print("cid empty!")
                exit()
            for i in range(len(cid)):
                result.write(cid[i] + "\t" + cuserid[i] + "\t"+cgold[i]+"\t"+cgold_yn[i]+"\n")
    result.close()
            

if __name__ == '__main__':
    input_file='../SemEval_2015_Task_3/method1/data/dataset/CQA-QL-devel.xml'
    output_path='../SemEval_2015_Task_3/method1/data/pre/dev'
    qcInfo_file='../SemEval_2015_Task_3/method1/data/qcInfo/qcInfo.dev'
    hasUrl_file='../SemEval_2015_Task_3/method1/data/url/hasUrl_dev.txt'
    prefix_file='../SemEval_2015_Task_3/method1/data/prefix/prefix_dev.txt'
    process(input_file,output_path,qcInfo_file,hasUrl_file,prefix_file)

3. Feature Extraction

        In this section, following standard machine learning practice, we design features for the two subtasks: for Subtask A, whether a comment was posted by the questioner; for Subtask B, the numbers of Yes-likely, No-likely, and Unsure-likely words; and features shared by both subtasks, such as the topic model. They are described in detail below.

3.1   Bag-of-Words Model

        The basic idea of the bag-of-words model is to ignore word order, grammar, and syntax and treat the text simply as a collection of words, each of which is independent of the others; the text is ultimately represented as a vector.

        Here we treat the words of each question and its comment as one bag of words, so the dimensionality of the question and comment vectors equals the total number of distinct words in that question and comment. Using the indices of the bag of words, both the question and the comment can be represented as vectors, where the value of each dimension is the number of times the corresponding word appears in that text.

Corresponding code: bogOfWords.py

#-*- coding:utf-8 -*-

import os
import sys
import copy

class BOW:
    def __init__(self, s1, s2):
        if not (s1 and s2):
            print("Empty s1 or s2 in Class BOW __init__()!")
            exit()         
        
        self.allBag = {}
        list1 = s1.strip().split(" ")
        list2 = s2.strip().split(" ")
        
        for w in list1:
            if w not in self.allBag:
                self.allBag[w] = 0               
        for w in list2:
            if w not in self.allBag:
                self.allBag[w] = 0
        
        self.bag1 = copy.deepcopy(self.allBag)
        self.bag2 = copy.deepcopy(self.allBag)
        for w in list1:
            self.bag1[w] += 1
        for w in list2:
            self.bag2[w] += 1
        #print self.bag1
        #print self.bag2
        
    def getVector(self):
        vector1 = []
        vector2 = []
        for word in self.allBag:
            vector1.append(self.bag1[word])
            vector2.append(self.bag2[word])
        return vector1,vector2
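
        For illustration, a brief usage sketch of the class above (the example strings are made up):

q_text = "renew work visa qatar"
c_text = "renew visa immigration office"
bow = BOW(q_text, c_text)
q_vec, c_vec = bow.getVector()
print(q_vec)
print(c_vec)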

3.2   Vector-based similarity

        This is similar to the previous subsection, except that we use Google's word2vec to obtain vectors for the words in the question and in the comment, sum them and take the average, and finally compute the cosine similarity as the feature value. After downloading the pre-trained vector file, this can be implemented with gensim.
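
        A minimal sketch of this feature with gensim, assuming the pre-trained GoogleNews-vectors-negative300.bin file has already been downloaded (the path and function names below are illustrative, not the original script):

import numpy as np
from gensim.models import KeyedVectors

# pre-trained Google News vectors (path is illustrative; download separately)
w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

def avg_vector(words):
    # average the word2vec vectors of the words that are in the vocabulary
    vecs = [w2v[w] for w in words if w in w2v]
    if not vecs:
        return np.zeros(w2v.vector_size)
    return np.mean(vecs, axis=0)

def w2v_cosine(question_words, comment_words):
    # cosine similarity between the averaged question and comment vectors
    v1, v2 = avg_vector(question_words), avg_vector(comment_words)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0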

3.3  Topic Model

        A topic model, as the name suggests, models the topics hidden in text. A topic is a concept that stands for a set of related words; it is a conditional probability distribution over the vocabulary: the more closely a word is related to the topic, the larger its conditional probability, and the less closely, the smaller.

        Here we build a model over each question and all of its comments, setting the number of topics to the number of comments on that question. As before, we finally compute the cosine similarity between each comment and the question as the feature value.

Corresponding code: get_TopicModel.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import io
import os
from nltk.tokenize import RegexpTokenizer
from gensim import corpora
from gensim import models
from os import listdir
from os.path import isdir, join

def GetFilesInFolder(ParentFolder):
    # return the names of all entries in the given folder
    return os.listdir(ParentFolder)

def get():
    result1 = open('../SemEval_2015_Task_3/method1/data/topicModel/DevTopics.txt', 'w')
    result2 = open('../SemEval_2015_Task_3/method1/data/topicModel/DevLSI.txt', "w+")
    ParentFolder = '../SemEval_2015_Task_3/method1/data/pre/dev'
    files = [ f for f in listdir(ParentFolder) if isdir(join(ParentFolder, f)) ]
    for k,directory in enumerate(files):
        path = os.path.join(ParentFolder, directory)
        filenameList = GetFilesInFolder(path)
        sentences = [ ]
        for fileName in filenameList:
            with io.open(join(path, fileName), "r", encoding='utf-8') as f:
                sentences.append(f.readlines())

        words = [ ]
        for doc in sentences:
            tokenizer = RegexpTokenizer(r'\w+')
            a = list(tokenizer.tokenize(str(doc)))
            words.append(a)
        dic = corpora.Dictionary(words)
        corpus = [ dic.doc2bow(text) for text in words ]

        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[ corpus ]

        lsi = models.LsiModel(corpus_tfidf, id2word=dic, num_topics=len(filenameList))
        corpus_lsi = lsi[ corpus_tfidf ]

        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=len(filenameList))
        corpus_lda = lda[ corpus_tfidf ]

        for (Q, LSI, LDA) in zip(filenameList, corpus_lsi, corpus_lda):
            result1.write(Q + '\t')
            result2.write(Q + '\t')
            a=len(LDA)
            b=len(LSI)
            if a==0:
                result1.write('0' + '\t')
            else:
                for i in range(len(LDA)):
                    result1.write(str(LDA[ i ][ 1 ]) + '\t')
            if b==0:
                result2.write('0' + '\t')
            else:
                for i in range(len(LSI)):
                    result2.write(str(LSI[ i ][ 1 ]) + '\t')
            result1.write('\n')
            result2.write('\n')
        print(k)
    result1.close()
    result2.close()



if __name__ == '__main__':
    get()

3.4  TF-IDF

        TF-IDF (term frequency-inverse document frequency) is a weighting technique widely used in information retrieval and text mining. It measures how important a term is to one document in a collection or corpus: the importance of a term increases with the number of times it appears in the document, but decreases with its frequency across the corpus.

        TF (term frequency) is the frequency with which a term appears in a document: (number of occurrences of the term in the document) / (total number of terms in the document). The larger this value, the more important the term.

        IDF (inverse document frequency): in a document collection, the fewer documents a term appears in, the better it distinguishes a document from the others. It is computed as log((total number of documents / number of documents containing the term) + 0.01).

        The TF-IDF value is the product of the two quantities above (tf * idf) and is used as the feature value. For a concrete implementation, see: 【Natural Language Processing】TF-IDF and Its Python Implementation.
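
        As a rough illustration (not the exact script from the linked post), a minimal sketch of the TF-IDF weighting described above, where each document is a list of preprocessed tokens:

import math

def tf_idf(documents):
    # documents: list of token lists; returns one {term: weight} dict per document
    n_docs = len(documents)
    df = {}
    for doc in documents:
        for term in set(doc):
            df[term] = df.get(term, 0) + 1               # document frequency

    weights = []
    for doc in documents:
        doc_weights = {}
        for term in set(doc):
            tf = doc.count(term) / float(len(doc))            # term frequency
            idf = math.log(n_docs / float(df[term]) + 0.01)   # inverse document frequency
            doc_weights[term] = tf * idf
        weights.append(doc_weights)
    return weights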

3.5  Whether the Comment Was Posted by the Questioner

        This feature indicates whether a comment was posted by the person who asked the question: 1 if it was, 0 otherwise. It is simple enough that the code is not shown here; the same applies below.

3.6  Whether the Comment Contains a Hyperlink

        1 if it does, 0 otherwise.

3.7 Final Feature

        After the processing above, we obtain two vectors, one for the question and one for the comment. In general, the more similar a comment is to the question, the smaller the angle between the two vectors, so we take the cosine of the two vectors as the feature value.
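
        For example, the cosine similarity between the two count vectors returned by BOW.getVector() can be computed with a small helper like this (a sketch, not the original feature script):

import math

def cosine(v1, v2):
    # cosine of the angle between two equally sized feature vectors
    dot = sum(a * b for a, b in zip(v1, v2))
    norm1 = math.sqrt(sum(a * a for a in v1))
    norm2 = math.sqrt(sum(b * b for b in v2))
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0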

4. Classification Models

        For Subtask A, a total of three machine learning classifiers were used; the algorithms and parameters are as follows:

        1.DecisionTreeClassifier (max_depth= 3)

        2.SVM (kernel='rbf', gamma = 0.001,C = 100)

        3.RandomForestClassifier(n_estimators= 100, random_state=10)

Corresponding code: Classification.py

# -*- coding: utf-8 -*-

import numpy as np
import os,sys
from sklearn import tree
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

class Classification():
    def __init__(self, classifier, trainFile, devFile):
        f = open(trainFile)
        data = np.loadtxt(f)
        
        # select columns 1 through end
        X_train = data[:, 1:]  
        y_train = data[:, 0] 
        f.close()
            
        f = open(devFile)
        pre = np.loadtxt(f)
        X_test = pre[:, 1:]
        
        # min_max_scaler = MinMaxScaler()
        # X_train = min_max_scaler.fit_transform(X_train)
        # X_test = min_max_scaler.fit_transform(X_test)
        
        # impute missing values if there exist NaN or infinite entries,
        # otherwise clf.predict() will raise an error
        X_test = Imputer().fit_transform(X_test) 
        if np.isnan(X_test).any():
            print("nan in X_test!")
            exit()
            
        self.y_test = pre[:, 0] 
        f.close()
        
        if classifier == 'tree':
            #max_depth = 4 is best
            # max_depth = np.arange(1, 10)
            # clf = GridSearchCV(tree.DecisionTreeClassifier(), param_grid = {'max_depth': max_depth})
            clf = tree.DecisionTreeClassifier(max_depth = 3)
            
        elif classifier == 'knn':
            #n_neighbors = 9 is best
            #but default is 5, better than 9?
            # n_neighbors = np.arange(1, 10)  
            # clf = GridSearchCV(KNeighborsClassifier(), param_grid = {'n_neighbors': n_neighbors})
            clf = KNeighborsClassifier(n_neighbors = 15)
            
        elif classifier == 'svm':
            #{'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
            # param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}]
            # clf = GridSearchCV(svm.SVC(), param_grid)
            clf = svm.SVC(kernel='rbf', gamma = 0.01, C = 200)
            
        elif classifier == 'gbdt':
            # max_depth = np.arange(1, 10)
            # n_estimators = [10, 100, 1000]
            # learning_rate = [0.1, 0.2, 0.3, 0.4, 0.5]
            # clf = GridSearchCV(GradientBoostingClassifier(), param_grid = {'max_depth': max_depth, 'n_estimators': n_estimators, 'learning_rate': learning_rate})
            clf = GradientBoostingClassifier(n_estimators = 100, max_depth = 5)
            
        elif classifier == 'essemble':
            #{'n_estimators': 10, 'max_depth': 6}
            # max_depth = np.arange(1, 10)
            # n_estimators = [10, 100, 1000]
            # clf = GridSearchCV(RandomForestClassifier(), param_grid = {'max_depth': max_depth, 'n_estimators': n_estimators})
            clf = RandomForestClassifier(n_estimators = 1000, random_state=15325)
            
        elif classifier == 'nb':
            clf = MultinomialNB()
            print(clf)
        else:
            print("Invalid classifier in Class Classification __init__()!")
            exit()
        
        clf.fit(X_train, y_train) 
        #print clf.best_params_
        
        self.y_pred = clf.predict(X_test)
        
        '''
        #test usage!
        self.calculate_result(self.y_test, self.y_pred)
        print( "ACC:  %f " %accuracy_score(y_test,y_pred))
        '''
        
    def getPreResult(self, outputFile):
        fout = open(outputFile,"w+")
        for each in self.y_pred:
            eachint = int(each)
            fout.write(str(eachint)+"\n")
        fout.close()        
   
    #this function is used for testing!
    def calculate_result(self):  
        # use macro-averaging because this is a multi-class task
        m_precision = metrics.precision_score(self.y_test, self.y_pred, average='macro')
        m_recall = metrics.recall_score(self.y_test, self.y_pred, average='macro')
        print('precision: {0:.3f}'.format(m_precision))
        print('recall:    {0:.3f}'.format(m_recall))
        print('f1-score:  {0:.3f}'.format(metrics.f1_score(self.y_test, self.y_pred, average='macro')))


if __name__ == "__main__":
    if len(sys.argv) < 5:
        print("sys.argv[1]: classifier")
        print("sys.argv[2]: trainFile")
        print("sys.argv[3]: devFile")
        print("sys.argv[4]: outputFile")
        exit()

    cfInstance = Classification(sys.argv[ 1 ], sys.argv[ 2 ], sys.argv[ 3 ])
    cfInstance.getPreResult(sys.argv[ 4 ])
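
        The script is run from the command line, for example (the feature file names are only illustrative):

python Classification.py tree trainFeatures.txt devFeatures.txt result_dev.txt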

5. Experimental Results and Analysis

Model                      Accuracy
RandomForestClassifier     61.33%
Support Vector Machine     62.06%
DecisionTreeClassifier     62.43%

        The best result was 62.43%. From this we draw the following conclusions. Only a small number of text features were extracted, chiefly TF-IDF and bag-of-words features, which reflect the characteristics of the text reasonably well. More features could still be added: for example, part-of-speech based features, or the number or proportion of words such as "thank" and "yes"/"no" in a comment, since such words are good indicators of the sentiment of the whole sentence and are therefore worth extracting as features. In addition, only a few classic machine learning algorithms were used and little parameter tuning was done; careful tuning, or trying other algorithms, might still improve the results.

       Note: since quite a lot of time has passed and I never got around to tidying things up, the code is rather messy; it is meant only as a guide to the overall approach.







