# Semantic similarity measures how alike two sentences are: vectorize both
# sentences, then compute the cosine distance between the two vectors.
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 21 20:18:38 2019
@author: lcl
"""
from sklearn.feature_extraction.text import CountVectorizer
import math
import jieba
from setting import logger
#創建停用詞list
def stop_word_list(path):
    """Load a stop-word list from a UTF-8 text file, one word per line.

    Each line is stripped of surrounding whitespace; blank lines become
    empty strings, matching the original behavior.

    Fix: the original opened the file without ever closing it (resource
    leak); a ``with`` block guarantees the handle is released.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Iterating the file directly avoids materializing .readlines() first.
        return [line.strip() for line in f]
#預處理文本
def preprocess(text, stopwords_path="data/stop_words.txt"):
    """Tokenize Chinese *text* with jieba and remove stop words.

    Args:
        text: the sentence to tokenize; must be a str.
        stopwords_path: path to the stop-word file (one word per line).
            Defaults to the original hard-coded "data/stop_words.txt",
            so existing callers are unaffected.

    Returns:
        The kept tokens joined by single spaces, with a trailing space
        (preserved for compatibility with the original implementation);
        an empty string when every token is filtered out.

    Raises:
        TypeError: if *text* is not a str.
    """
    if not isinstance(text, str):
        raise TypeError('text should be str')
    # set(): O(1) membership per token instead of O(n) list scans.
    stopwords = set(stop_word_list(stopwords_path))
    kept = [word for word in jieba.cut(text.strip())
            if word not in stopwords and word != '\t']
    # join instead of repeated += (which is quadratic in the worst case);
    # each token keeps its trailing space, as the original produced.
    return "".join(word + " " for word in kept)
def norm_vector_nonzero(ori_vec):
    """L2-normalize a sparse vector given as (index, value) pairs.

    Vectors whose Euclidean norm is below 1e-6 (including the empty
    vector) are returned unchanged, avoiding division by (near) zero.
    """
    norm = math.sqrt(sum(float(value) ** 2 for _, value in ori_vec))
    if norm < 1e-6:
        return ori_vec
    return [(index, float(value) / norm) for index, value in ori_vec]
def cosine_distance_nonzero(feat_vec1, feat_vec2, norm=True):
    """Dot product of two sparse vectors given as (index, value) pairs.

    Both vectors must have their index components sorted ascending — the
    merge walk below relies on it. With norm=True (default) each vector
    is first L2-normalized, so the result is the cosine similarity.

    Fix: replaced the unidiomatic ``True == norm`` with a truth test, so
    any truthy value enables normalization (backward compatible for the
    bool callers this file uses).
    """
    if norm:
        feat_vec1 = norm_vector_nonzero(feat_vec1)
        feat_vec2 = norm_vector_nonzero(feat_vec2)
    dist = 0
    idx1 = idx2 = 0
    # Sorted-merge walk: advance whichever side currently has the
    # smaller index; multiply values only when the indices match.
    while idx1 < len(feat_vec1) and idx2 < len(feat_vec2):
        pos1, val1 = feat_vec1[idx1]
        pos2, val2 = feat_vec2[idx2]
        if pos1 == pos2:
            dist += float(val1) * float(val2)
            idx1 += 1
            idx2 += 1
        elif pos1 > pos2:
            idx2 += 1
        else:
            idx1 += 1
    return dist
def sparse_convert(matrix):
    """Convert a scipy sparse matrix into per-row lists of (column, value)
    pairs, keeping only strictly positive entries.

    The pairs come out in ascending column order, which the sorted-merge
    in cosine_distance_nonzero depends on.
    """
    dense = matrix.toarray()
    return [
        [(col, val) for col, val in enumerate(row) if val > 0]
        for row in dense
    ]
def texts_similarity(train, test, T):
    """For each training sentence, check whether any test sentence has a
    cosine similarity of at least *T* with it.

    Returns:
        (True, flags) where flags maps each train index to "1"/"0" when
        at least one pair crossed the threshold, otherwise (False, {}).
    """
    logger.info("test:%s" % test)
    logger.info("train:%s" % train)
    texts_test = [preprocess(sentence) for sentence in test]
    texts_train = [preprocess(sentence) for sentence in train]
    # Fit one shared vocabulary over train + test so their count vectors
    # live in the same space.
    all_texts = texts_train + texts_test
    vectorizer = CountVectorizer(analyzer='word', token_pattern=u"(?u)\\b\\w+\\b")
    pairs = sparse_convert(vectorizer.fit_transform(all_texts))
    train_vecs = pairs[:len(texts_train)]
    test_vecs = pairs[len(texts_train):]
    flags = {}
    for i, train_vec in enumerate(train_vecs):
        flags[i] = "0"
        for j, test_vec in enumerate(test_vecs):
            dis = cosine_distance_nonzero(train_vec, test_vec)
            if dis >= T:
                flags[i] = "1"
                logger.info("dis:%s" % str(dis))
                logger.info("similarity texts:%s---%s" % (str(train[i]), str(test[j])))
                # One match is enough for this training sentence.
                break
    if "1" in flags.values():
        return True, flags
    return False, {}
if __name__ == "__main__":
    # Manual smoke test: does any test sentence match a training sentence?
    test = ["你說什麼啊", "你說啥"]
    train = ["你到底說啥", "你說什麼", "您好呀"]
    print(texts_similarity(train, test, T=0.8))