相似文檔檢測
Mission
data.csv中包含了一個新聞標題列表,試用近似檢測方法,基於Jaccard相似度檢測相似文章,並將結果保存到csv文件中,不同組的相似文章之間用空行隔開。
Work
思路:
- 每兩個詞作爲一段來計算,末尾不足兩個詞的部分截掉
- Jaccard相似係數大於0.5則認爲兩個新聞標題相似
- 利用並查集將相似的合併在一起
Code
import pandas as pd
import nltk
import numpy as np
class Jaccard:
    """Jaccard similarity between two texts, computed over word shingles.

    Every sentence is tokenised with NLTK and cut into consecutive,
    non-overlapping groups of ``_len`` words; a trailing group that has
    fewer than ``_len`` words is dropped.
    """

    def __init__(self, _len):  # _len: step size used to cut each sentence
        self._len = _len

    def cut2list(self, paragraph):
        """Split *paragraph* into shingles of ``self._len`` words each.

        Args:
            paragraph: Text to split. Empty, whitespace-only or non-string
                input (for example a missing value) yields no shingles.

        Returns:
            A list of strings, one per shingle, in reading order. The words
            of a shingle are joined with a single space. Duplicates are kept.
        """
        # Nothing to tokenise; also avoids handing a non-string to NLTK.
        if not isinstance(paragraph, str) or not paragraph.strip():
            return []

        step = self._len
        word_set = []
        for sent in nltk.sent_tokenize(paragraph):
            tokens = nltk.word_tokenize(sent)  # words of this sentence
            # The upper bound stops before any window that would run past
            # the end of the sentence, so an incomplete tail is discarded.
            for start in range(0, len(tokens) - step + 1, step):
                # Join with a space so that ("ab", "c") and ("a", "bc")
                # do not collapse into the same shingle "abc".
                word_set.append(" ".join(tokens[start:start + step]))
        return word_set

    def jaccard(self, str1, str2):
        """Return the Jaccard coefficient of the shingle sets of two texts.

        Args:
            str1: First text.
            str2: Second text.

        Returns:
            ``|A & B| / |A | B|`` as a float in ``[0, 1]``. When neither text
            produces a shingle the union is empty and ``0.0`` is returned
            instead of dividing by zero.
        """
        first = set(self.cut2list(str1))  # de-duplicate
        second = set(self.cut2list(str2))
        union_size = len(first | second)
        if union_size == 0:
            # Two texts without any shingle give no evidence of similarity.
            return 0.0
        return len(first & second) / union_size
class Merge:
    """Disjoint-set (union-find) structure over the integers ``0 .. _len - 1``.

    ``pre[i]`` holds the parent of element ``i``; an element whose parent is
    itself is the representative (root) of its set.
    """

    def __init__(self, _len):  # _len: number of elements
        self._len = _len
        # Every element starts out as the root of its own singleton set.
        self.pre = list(range(_len))

    def find(self, x):
        """Return the root of the set containing *x*, compressing the path.

        Implemented iteratively so that a long parent chain cannot exhaust
        the interpreter's recursion limit.
        """
        root = x
        while self.pre[root] != root:
            root = self.pre[root]
        # Second pass: point every node on the walked path straight at root.
        while self.pre[x] != root:
            parent = self.pre[x]
            self.pre[x] = root
            x = parent
        return root

    def merge(self, x, y):
        """Union the sets that contain *x* and *y*.

        The link is made between the two roots. Linking the direct parents
        instead can re-parent a node in the middle of a chain and detach
        elements that had already been merged.
        """
        root_x, root_y = self.find(x), self.find(y)
        if root_x != root_y:
            self.pre[root_x] = root_y
if __name__ == '__main__':
    # Load the headlines; the 'title' column is the text that gets compared.
    data = pd.read_csv("data.csv", sep='\\t', engine='python')
    jaccard = Jaccard(2)
    num = data.shape[0]  # number of headlines
    titles = data['title'].tolist()  # fetched once instead of per pair

    # Pairwise coefficient matrix. The coefficient is symmetric, so each
    # pair is scored once and mirrored; the diagonal stays 0.
    jaccard_coefficient = [[0] * num for _ in range(num)]
    for i in range(num):
        for j in range(i + 1, num):
            score = jaccard.jaccard(titles[i], titles[j])
            jaccard_coefficient[i][j] = score
            jaccard_coefficient[j][i] = score
    print(np.array(jaccard_coefficient).shape)
    print(jaccard_coefficient)

    # Put every pair whose coefficient exceeds 0.5 into the same set.
    merge = Merge(num)
    for i in range(num):
        for j in range(i + 1, num):
            if jaccard_coefficient[i][j] > 0.5:
                merge.merge(i, j)

    # Bucket each headline under the ROOT of its set. Using the direct
    # parent (merge.pre[i]) would scatter one set over several buckets
    # whenever a parent chain is longer than one hop.
    buckets = [[] for _ in range(num)]
    for i in range(num):
        buckets[merge.find(i)].append(i)
    classify = [bucket for bucket in buckets if bucket]  # drop empty buckets

    # Assemble the output: the rows of each group followed by a separator row.
    columns = ["title", 'publisher', 'E']
    null_row = pd.DataFrame({'title': ['na'], 'publisher': ['na'], 'E': ['na']})  # separator row
    pieces = []
    for group in classify:
        pieces.append(data.iloc[group])
        pieces.append(null_row)

    if pieces:
        # A single concat replaces the repeated DataFrame.append calls;
        # DataFrame.append no longer exists in pandas 2.x.
        new_data = pd.concat(pieces)
        # Keep the three named columns first, then any other input columns.
        extra = [col for col in new_data.columns if col not in columns]
        new_data = new_data[columns + extra]
    else:
        new_data = pd.DataFrame(columns=columns)

    print(new_data)
    new_data.to_csv('result.csv', sep=',')