之前寫了一篇文章實現計算句子相似度:https://blog.csdn.net/u013421629/article/details/85046362
在github上看到一個封裝好的計算句子相似度工具CHlikelihood,底層實現原理跟我之前寫的是一模一樣啊,計算結果也是一樣。正所謂英雄所見略同啊!
看看他的官方網站:https://github.com/ZhanPwBibiBibi/CHlikelihood
linux 安裝方法
pip install Chlikelihood
windows 安裝方法:
windows pip 方式安裝不成功,需要下載源碼,
python setup.py install
注意需要修改下源碼:
在文件打開方式中加上 encoding='utf-8' 參數,不然安裝會報錯。
安裝成功之後,即可拿來即用,非常方便。
linux 下:
window10下:
調用函數:
# -*- coding: utf-8 -*-
from CHlikelihood.likelihood import Likelihood
import re
# 去除html標籤
def filter_html(html):
    """Strip all HTML tags from *html* and return the trimmed plain text.

    :param html: raw HTML string
    :return: the text with every ``<...>`` tag removed and surrounding
             whitespace stripped
    """
    tag_pattern = re.compile(r'<[^>]+>', re.S)
    plain = tag_pattern.sub('', html)
    return plain.strip()
# 句子相似度
def sentence_similary(s1, s2):
    """Compute the similarity score between two sentences.

    :param s1: first sentence
    :param s2: second sentence
    :return: cosine-based similarity as produced by ``Likelihood.likelihood``
    """
    scorer = Likelihood()
    return scorer.likelihood(s1, s2)
if __name__ == '__main__':
    # Demo: score two near-identical Chinese sentences.
    first = "很高興見到你"
    second = "我也很高興見到你"
    score = sentence_similary(first, second)
    print(score)
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\xiaohu\AppData\Local\Temp\jieba.cache
0.8164965809277261
Loading model cost 0.911 seconds.
Prefix dict has been built succesfully.
Process finished with exit code 0
底層實現:
import math
from collections import Counter

import jieba
class Likelihood:
    """Cosine-similarity scorer for two Chinese sentences.

    Each sentence is segmented with jieba, turned into a term-frequency
    vector over the union vocabulary of both sentences, and the cosine of
    the angle between the two vectors is returned as the similarity.
    """

    # Punctuation tokens dropped from the segmentation when
    # ``punctuation=False`` (the default).  A frozenset gives O(1)
    # membership tests and removes the duplicates the original list had.
    _PUNCTUATION = frozenset([
        '。', ',', '、', '?', '!', ';', ':', '“', '”', '‘', '’', '「', '」',
        '『', '』', '(', ')', '[', ']', '〔', '〕', '【', '】', '——', '—',
        '……', '…', '-', '~', '·', '《', '》', '〈', '〉', '﹏﹏', '___', '.',
    ])

    def word2vec(self, word1, word2):
        """Segment both sentences and cache their term-frequency vectors.

        Stores the vectors on ``self.seg_vec_1`` / ``self.seg_vec_2``;
        both are indexed by the same union vocabulary so they always have
        equal length.  Reads ``self.punctuation`` to decide whether
        punctuation tokens are kept.

        :param word1: first sentence
        :param word2: second sentence
        """
        # ``is False`` kept deliberately: any other value (including None)
        # falls through to the no-filtering branch, as in the original API.
        if self.punctuation is False:
            seg_list_1 = [w for w in jieba.cut(word1, cut_all=False)
                          if w not in self._PUNCTUATION]
            seg_list_2 = [w for w in jieba.cut(word2, cut_all=False)
                          if w not in self._PUNCTUATION]
        else:
            seg_list_1 = list(jieba.cut(word1, cut_all=False))
            seg_list_2 = list(jieba.cut(word2, cut_all=False))

        # Counter makes vector construction O(n + m) instead of the
        # original nested-loop O(n * m) per-word counting.
        total_seg_list = list(set(seg_list_1 + seg_list_2))
        freq_1 = Counter(seg_list_1)
        freq_2 = Counter(seg_list_2)
        self.seg_vec_1 = [freq_1[w] for w in total_seg_list]
        self.seg_vec_2 = [freq_2[w] for w in total_seg_list]

    def cos_dist(self):
        """Return the cosine similarity of the two cached vectors.

        :return: a float in [0, 1], or ``None`` when the vectors have
                 mismatched lengths or either vector is all zeros (the
                 cosine is undefined in those cases).
        """
        v1, v2 = self.seg_vec_1, self.seg_vec_2
        if len(v1) != len(v2):
            return None
        dot = sum(a * b for a, b in zip(v1, v2))
        norm = math.sqrt(sum(a * a for a in v1) * sum(b * b for b in v2))
        if norm == 0.0:
            return None
        return dot / norm

    def likelihood(self, word1, word2, punctuation=False):
        """Compute the similarity between two sentences.

        :param word1: first sentence
        :param word2: second sentence
        :param punctuation: when ``False`` (default), punctuation tokens
                            are removed before vectorizing
        :return: cosine similarity, or ``None`` when undefined
        """
        self.word1 = word1
        self.word2 = word2
        self.punctuation = punctuation
        self.word2vec(self.word1, self.word2)
        return self.cos_dist()
if __name__ == '__main__':
    # Demo run: keep punctuation tokens while scoring.
    scorer = Likelihood()
    result = scorer.likelihood('很高興見到你', '我也很高興見到你', punctuation=True)
    print(result)
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\xiaohu\AppData\Local\Temp\jieba.cache
0.8164965809277261
Loading model cost 0.920 seconds.
Prefix dict has been built succesfully.
Process finished with exit code 0