語義相似度的計算

語義相似度用來衡量兩個句子在語義上的接近程度:先將兩個句子分詞並向量化(本文使用詞頻向量),再計算兩個向量的餘弦相似度,數值越接近 1 表示兩個句子越相似。

# -*- coding: utf-8 -*-
"""
Created on Thu Feb 21 20:18:38 2019

@author: lcl
"""
from sklearn.feature_extraction.text import CountVectorizer
import math
import jieba
from setting import logger
# Build the stop-word list
def stop_word_list(path):
    """Load stop words from a UTF-8 text file, one entry per line.

    Surrounding whitespace (including the trailing newline) is stripped
    from every line. Blank lines are kept as empty strings, so the result
    has exactly one entry per line of the file.

    Args:
        path: Path of the stop-word file to read.

    Returns:
        list[str]: The stripped lines, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed, even when
    # reading or decoding fails part-way through.
    with open(path, 'r', encoding='utf-8') as stop_file:
        return [line.strip() for line in stop_file]
# Preprocess a text
def preprocess(text, stopwords=None):
    """Segment *text* with jieba and drop stop words.

    Args:
        text: The string to segment. Leading and trailing whitespace is
            stripped before segmentation.
        stopwords: Optional iterable of words to drop. When omitted, the
            words are loaded from ``data/stop_words.txt`` through
            ``stop_word_list`` (the previous, and still default, behavior).
            Passing a preloaded collection avoids re-reading the file on
            every call.

    Returns:
        str: The kept tokens in their original order, each followed by a
        single space (a non-empty result therefore ends with a space).

    Raises:
        TypeError: If *text* is not a ``str``.
    """
    if not isinstance(text, str):
        raise TypeError('text should be str')
    if stopwords is None:
        stopwords = stop_word_list("data/stop_words.txt")
    # A set gives constant-time membership tests; the tab token is always
    # dropped, so it is folded into the same exclusion set.
    excluded = set(stopwords)
    excluded.add('\t')
    return "".join(
        word + " "
        for word in jieba.cut(text.strip())
        if word not in excluded
    )

def norm_vector_nonzero(ori_vec):
    """Scale a sparse vector to unit Euclidean length.

    Args:
        ori_vec: Sequence of ``(index, value)`` pairs holding the stored
            entries of the vector.

    Returns:
        A new list of ``(index, value / length)`` pairs, where ``length``
        is the Euclidean norm of the values. When that norm is below
        ``1e-6`` the input object itself is returned unchanged, because a
        (near-)zero vector cannot be scaled meaningfully.
    """
    squares = (math.pow(float(weight), 2) for _, weight in ori_vec)
    length = math.sqrt(sum(squares))
    if length < 1e-6:
        return ori_vec
    return [(position, float(weight) / length) for position, weight in ori_vec]

def cosine_distance_nonzero(feat_vec1, feat_vec2, norm=True):
    """Return the dot product of two sparse vectors.

    Despite the name, the result is a similarity, not a distance: with
    ``norm`` enabled both vectors are first scaled to unit length by
    ``norm_vector_nonzero``, so for two non-zero vectors the value is
    their cosine similarity (larger means more alike).

    Args:
        feat_vec1: Sequence of ``(index, value)`` pairs. Must be sorted by
            ascending index, because the two sequences are walked in
            lock-step like a merge.
        feat_vec2: Second vector, same layout and ordering requirement.
        norm: When truthy, normalise both vectors before multiplying.

    Returns:
        float: Sum of ``value1 * value2`` over the indices present in both
        vectors; ``0.0`` when they share no index.
    """
    if norm:
        feat_vec1 = norm_vector_nonzero(feat_vec1)
        feat_vec2 = norm_vector_nonzero(feat_vec2)
    # Start from a float so the return type does not depend on whether
    # any index happened to match.
    dist = 0.0
    pos1 = 0
    pos2 = 0
    len1 = len(feat_vec1)
    len2 = len(feat_vec2)
    while pos1 < len1 and pos2 < len2:
        entry1 = feat_vec1[pos1]
        entry2 = feat_vec2[pos2]
        if entry1[0] == entry2[0]:
            dist += float(entry1[1]) * float(entry2[1])
            pos1 += 1
            pos2 += 1
        elif entry1[0] > entry2[0]:
            # The second vector is behind: advance it to catch up.
            pos2 += 1
        else:
            pos1 += 1
    return dist

def sparse_convert(matrix):
    """Turn a matrix into per-row lists of its positive entries.

    Args:
        matrix: Object exposing ``toarray()``, which must return an
            iterable of rows; each row is an indexable sequence of numbers.

    Returns:
        list: One list per row containing ``(column_index, value)`` pairs
        for every value greater than zero, in ascending column order.
    """
    dense_rows = matrix.toarray()
    return [
        [(column, value) for column, value in enumerate(row) if value > 0]
        for row in dense_rows
    ]

def texts_similarity(train,test,T):
    """Report which training texts are similar to at least one test text.

    Every text is segmented with ``preprocess``, all texts are turned into
    word-count vectors over one shared vocabulary, and each train/test
    pair is scored with ``cosine_distance_nonzero``.

    Args:
        train: Indexable sequence of training strings.
        test: Indexable sequence of test strings.
        T: Similarity threshold; a pair counts as similar when its score
            is greater than or equal to ``T``.

    Returns:
        tuple: ``(True, result_map)`` when at least one training text
        matched, where ``result_map`` maps every training index to ``"1"``
        (matched some test text) or ``"0"`` (matched none). ``(False, {})``
        when nothing matched or nothing could be vectorised.

    Raises:
        TypeError: Propagated from ``preprocess`` when an item is not a
            ``str``.
    """
    # Wrap the argument in a one-element tuple: a bare tuple on the right
    # of ``%`` would be unpacked as several format arguments.
    logger.info("test:%s" % (test,))
    logger.info("train:%s" % (train,))
    texts_test = [preprocess(tex) for tex in test]
    texts_train = [preprocess(tex) for tex in train]

    # Fit train and test together so both share one vocabulary and
    # therefore the same column indices.
    all_texts = texts_train + texts_test
    count_vector = CountVectorizer(analyzer='word',token_pattern=u"(?u)\\b\\w+\\b")
    try:
        vector_matrix = count_vector.fit_transform(all_texts)
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary (no text
        # left a token after preprocessing); nothing can match then.
        return False,{}

    all_matrix = sparse_convert(vector_matrix)
    matrix_train = all_matrix[:len(texts_train)]
    matrix_test = all_matrix[len(texts_train):]

    result_map = {}
    for i, train_vec in enumerate(matrix_train):
        result_map[i] = "0"
        for j, test_vec in enumerate(matrix_test):
            dis = cosine_distance_nonzero(train_vec, test_vec)
            if dis >= T:
                result_map[i] = "1"
                logger.info("dis:%s"%str(dis))
                logger.info("similarity texts:%s---%s"%(str(train[i]),str(test[j])))
                # One match is enough for this training text.
                break

    if "1" in result_map.values():
        return True,result_map
    return False,{}

if __name__ == "__main__":
    # Small demo: compare three reference sentences against two queries
    # and print the (matched, result_map) tuple.
    reference_texts = ["你到底說啥","你說什麼","您好呀"]
    query_texts = ["你說什麼啊","你說啥"]
    print(texts_similarity(reference_texts, query_texts, T=0.8))


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章