Task:
Given a text corpus, for example a collection of unlabeled news articles, together with a number of texts that have already been labeled, find the articles in the corpus that are similar to the labeled texts.
Tools used: Python 2, gensim, NLTK, NumPy
References:
Processing flowchart:
Project source code:
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 11 14:01:04 2017

@author: kelvin-li
"""

# Read the 9722 documents into memory and store them in a list
doc = []
f = open(r"C:\Users\kelvin-li\*****.txt")  # raw string keeps the backslashes literal
for line in f.readlines():
    f_split = line.split(':::')
    path = f_split[6]
    docpath = path.replace(r'C:\Users\***', r'C:\Users\kelvin-li\***')
    docpath = docpath.replace('\n', '')  # strip the trailing newline from the path
    doc_file = open(docpath)
    doc.append(doc_file.read())
    doc_file.close()
f.close()

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import corpora, models, similarities

# Lowercase the text; note that punctuation is not yet separated from the words
texts_lower = [[word for word in document.lower().split()] for document in doc]

from nltk.tokenize import word_tokenize
# For each document: decode as UTF-8, tokenize, then lowercase every token
texts_tokenized = [[word.lower() for word in word_tokenize(document.decode('utf-8'))] for document in doc]

# Remove stop words, using NLTK's built-in English stop-word list
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')
# Filter the stop words out of every document
texts_filtered_stopwords = [[word for word in document if not word in english_stopwords] for document in texts_tokenized]

# Stop words are gone, but punctuation tokens remain;
# define a list of punctuation marks and use it to filter them out
english_punctuations = [',', '.', ':', ';', '?', '!', '(', ')', '[', ']', '@', '&', '#', '%', '$', '{', '}', '--', '-']
texts_filtered = [[word for word in document if not word in english_punctuations] for document in texts_filtered_stopwords]

# Next, stem the English words; stemming reduces different inflections and
# suffixes to a common stem. NLTK offers both the Lancaster Stemmer and the
# Porter Stemmer; comparison shows Lancaster strips too many trailing e's,
# so Porter is used here.
from nltk.stem.porter import PorterStemmer
st = PorterStemmer()
#from nltk.stem.lancaster import LancasterStemmer
#st = LancasterStemmer()
texts_stemmed = [[st.stem(word) for word in document] for document in texts_filtered]

'''
# Drop low-frequency words that occur only once in the whole corpus
all_stems = sum(texts_stemmed, [])
stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]
'''

# Build a dictionary that maps each word to an id and records how often
# the word occurs in the corpus
dictionary = corpora.Dictionary(texts_stemmed)
# Convert every document in the corpus into (word id, count) pairs
corpus = [dictionary.doc2bow(text) for text in texts_stemmed]

# Define a 20-dimensional LSI space
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=20)
# Transform the corpus into LSI space and index it
index = similarities.MatrixSimilarity(lsi[corpus])

# The documents to compare
'''
# Randomly pick 5 documents to compare against the others
import random
text_num = random.sample(xrange(10), 5)
'''
# Compare the 1000 documents listed in the index file
text_num = []
num_file = open(r"C:\Users\kelvin-li\Desktop\MMSED-Text\Text_Index1_10.txt")
for line in num_file.readlines():
    line = line.strip('\n')
    text_num.append(line)
num_file.close()
# Convert the strings in the list to 0-based integer indices
text_num = [int(i) - 1 for i in text_num]

sims = []
for count in text_num:
    compare_text = dictionary.doc2bow(texts_stemmed[count])
    sims.append(index[lsi[compare_text]])  # perform a similarity query against the corpus

'''
# Compare the first 10 documents with one another
count = 0
texts_lsi = []
sims = []
while (count < 10):
    compare_text = dictionary.doc2bow(texts_stemmed[count])
    texts_lsi.append(lsi[compare_text])
    sims.append(index[texts_lsi[count]])  # perform a similarity query against the corpus
    count += 1
'''

# Use numpy to handle the similarity matrix
import numpy as np
sim_matrix = np.array(sims).transpose()
np.savetxt('***.txt', sim_matrix, delimiter=',', fmt='%10.8f')

# Read the data back in
#t = np.loadtxt(r"C:\Users\kelvin-li\***.txt", delimiter=',')
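A note on what the numbers in sims mean: gensim's MatrixSimilarity scores each indexed document by cosine similarity against the query vector, so every entry lies in [-1, 1] and larger means more similar. The following is a minimal sketch of that same computation in plain numpy; the two topic vectors are made up purely for illustration:

import numpy as np

# Two documents represented as vectors in a shared 4-topic space
# (hypothetical values, for illustration only)
a = np.array([0.8, 0.1, 0.0, 0.3])
b = np.array([0.7, 0.2, 0.1, 0.2])

# Cosine similarity: dot product of the vectors divided by the
# product of their lengths; 1.0 means identical direction
cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(cos_sim)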
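The script stops at writing the full similarity matrix to disk; to answer the original task, each row still has to be ranked so that the most similar corpus articles can be read off. The sketch below shows that last step, and also how a newly labeled document that is not part of the corpus can be queried. It assumes the dictionary, st, english_stopwords, english_punctuations, lsi, and index objects built above are still in scope; new_text and top_n are illustrative placeholders:

from nltk.tokenize import word_tokenize

new_text = "..."  # a newly labeled article (placeholder)

# Run the new document through the same preprocessing pipeline:
# tokenize, lowercase, drop stop words and punctuation, stem
tokens = [word.lower() for word in word_tokenize(new_text)]
tokens = [w for w in tokens if w not in english_stopwords and w not in english_punctuations]
tokens = [st.stem(w) for w in tokens]

# Map into the dictionary's bag-of-words space (words never seen during
# training are silently dropped by doc2bow), then into the LSI space
query_bow = dictionary.doc2bow(tokens)
query_sims = index[lsi[query_bow]]  # one similarity score per corpus document

# Rank all corpus documents by similarity and print the best matches
top_n = 10
ranked = sorted(enumerate(query_sims), key=lambda item: -item[1])
for doc_id, score in ranked[:top_n]:
    print("%d\t%.6f" % (doc_id, score))

In practice dictionary, lsi, and index can be persisted with their save() methods and reloaded later with the matching load() class methods, so new labeled texts can be queried without rebuilding the whole pipeline.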