词典分词
切分算法
词典导入
# -*- coding:utf-8 -*-
# Author:AG
# Date: 2020-07-5
from pyhanlp import *
def load_dictionary():
    """Load HanLP's core dictionary and return its entries as a Python set.

    The dictionary file is the one configured at
    ``HanLP.Config.CoreDictionaryPath``; it is read through HanLP's Java
    ``IOUtil.loadDictionary`` helper and only the keys (the words) are kept.

    Returns:
        set: The key set of the loaded dictionary.
    """
    io_util = JClass('com.hankcs.hanlp.corpus.io.IOUtil')
    entries = io_util.loadDictionary([HanLP.Config.CoreDictionaryPath])
    return set(entries.keySet())
完全切分
完全切分事实上还不能称之为中文分词,它没有体现有意义的词语序列
# -*- encoding: utf-8 -*-
"""
@File : fully_seg.py
@Contact : [email protected]
@License : (C)Copyright 2019-2020, CodingPark
@Modify Time @Author @Version @Description
------------ ------- -------- -----------
2020-07-05 21:29 AG 1.0 None
"""
from dictionarySeg import utility
def fully_segment(dic, text):
    """Return every substring of ``text`` that is contained in ``dic``.

    Substrings are enumerated by start position first and by increasing
    length second, and matches are reported in that order. Overlapping
    matches are all kept, so the result is not a partition of ``text``.

    Args:
        dic: Container of known words; only ``in`` membership is used.
        text: The string to scan.

    Returns:
        list: All dictionary words found in ``text``.
    """
    size = len(text)
    return [
        text[start:stop]
        for start in range(size)
        for stop in range(start + 1, size + 1)
        if text[start:stop] in dic
    ]
# Demo: load the dictionary once, then print every dictionary word that
# occurs anywhere inside the sample sentence.
dic = utility.load_dictionary()
text = '我爱智能信息处理研究所'
print(fully_segment(dic, text))
结果展示
正向最长匹配
# -*- encoding: utf-8 -*-
"""
@File : forward_seg.py
@Contact : [email protected]
@License : (C)Copyright 2019-2020, CodingPark
@Modify Time @Author @Version @Description
------------ ------- -------- -----------
2020-07-05 21:51 AG 1.0 None
"""
from dictionarySeg import utility
def forward_seg(dic, text):
    """Segment ``text`` left to right, taking the longest dictionary word each time.

    At every position the longest substring starting there that is in
    ``dic`` is emitted; when no multi-character substring matches, the
    single character at that position is emitted instead.

    Args:
        dic: Container of known words; only ``in`` membership is used.
        text: The string to segment.

    Returns:
        list: The segments in order; joined together they reproduce ``text``.
    """
    segments = []
    pos, total = 0, len(text)
    while pos < total:
        # Fallback when nothing longer matches: the single character itself.
        best = text[pos]
        # Candidates grow by one character per step, so any later hit is
        # necessarily longer than the current best and simply replaces it.
        for stop in range(pos + 2, total + 1):
            candidate = text[pos:stop]
            if candidate in dic:
                best = candidate
        segments.append(best)
        pos += len(best)
    return segments
# Demo: segment two sample phrases with forward longest matching.
# NOTE(review): these statements are not behind an `if __name__ == '__main__'`
# guard, so they also run whenever this module is imported (other scripts in
# this write-up do `from dictionarySeg import forward_seg`) — confirm that
# loading the dictionary and printing on import is intended.
dic = utility.load_dictionary()
text1 = '项目的研究'
text2 = '研究生命起源'
print()  # blank line before the results
print(forward_seg(dic, text1))
print(forward_seg(dic, text2))
结果展示
逆向最长匹配
# -*- encoding: utf-8 -*-
"""
@File : backward_seg.py
@Contact : [email protected]
@License : (C)Copyright 2019-2020, CodingPark
@Modify Time @Author @Version @Description
------------ ------- -------- -----------
2020-07-06 12:56 AG 1.0 None
"""
from dictionarySeg import utility
def backward_seg(dic, text):
    """Segment ``text`` right to left, taking the longest dictionary word each time.

    Working from the end of the string, the longest substring ending at the
    current position that is in ``dic`` is taken; when no multi-character
    substring matches, the single character at that position is taken.

    Args:
        dic: Container of known words; only ``in`` membership is used.
        text: The string to segment.

    Returns:
        list: The segments in reading order; joined together they
        reproduce ``text``.
    """
    collected = []  # segments in discovery order, i.e. last word first
    stop = len(text)  # exclusive end of the part that is still unsegmented
    while stop > 0:
        # Start positions run left to right, so the first hit is the longest
        # word ending at `stop`; the search must end there, otherwise a
        # shorter match would overwrite it.
        match = next(
            (text[start:stop] for start in range(stop - 1) if text[start:stop] in dic),
            text[stop - 1],
        )
        collected.append(match)
        stop -= len(match)
    # Words were found back to front; flip them into reading order.
    collected.reverse()
    return collected
# Demo: segment a sample phrase with backward longest matching.
# NOTE(review): these statements are not behind an `if __name__ == '__main__'`
# guard, so they also run whenever this module is imported (other scripts in
# this write-up do `from dictionarySeg import backward_seg`) — confirm that
# is intended.
dic = utility.load_dictionary()
text = '研究生命起源'
print()  # blank line before the result
print(backward_seg(dic, text))
结果展示
双向最长匹配
# -*- encoding: utf-8 -*-
"""
@File : bidirectional_seg.py
@Contact : [email protected]
@License : (C)Copyright 2019-2020, CodingPark
@Modify Time @Author @Version @Description
------------ ------- -------- -----------
2020-07-06 13:16 AG 1.0 其实双向,不怎么样
"""
from dictionarySeg import utility
from dictionarySeg import forward_seg
from dictionarySeg import backward_seg
def count_single_char(word_list: list) -> int:
    """Count the words in ``word_list`` that are exactly one character long.

    Args:
        word_list: Segmentation result, a list of strings.

    Returns:
        The number of single-character entries.
    """
    # Compare the word's length, not the word itself: a string never equals
    # the integer 1, so `word == 1` would always yield a count of zero.
    return sum(1 for word in word_list if len(word) == 1)
def bidirectional_seg(dic, text):
    """Run forward and backward longest matching and return the preferred result.

    Selection rules, applied in order:
      1. The segmentation with fewer words wins.
      2. With equal word counts, the forward result wins only if
         ``count_single_char`` reports strictly fewer single-character
         words for it.
      3. Otherwise the backward result is returned.

    Args:
        dic: Container of known words, passed through to both segmenters.
        text: The string to segment.

    Returns:
        list: Either the forward or the backward segmentation of ``text``.
    """
    forward = forward_seg.forward_seg(dic, text)
    backward = backward_seg.backward_seg(dic, text)

    if len(forward) != len(backward):
        return forward if len(forward) < len(backward) else backward

    if count_single_char(forward) < count_single_char(backward):
        return forward
    return backward
# Demo: segment a sample phrase with bidirectional longest matching.
dic = utility.load_dictionary()
text = '商品和服务'
print(bidirectional_seg(dic, text))
函数参数中的“:”是参数的类型建议符(告诉程序员希望传入的实参的类型)
函数后面跟着的“->”是函数返回值的类型建议符(用来说明该函数返回的值是什么类型)
举个例子:
def isValid(s: 'str') -> 'bool':
return s
这里“参数: 注解内容”和“-> 注解内容”的写法,是为了标注参数和返回值的类型,使代码更具可读性
和 def isValid(s):
return s
效果上其实没有区别
速度测评
# -*- encoding: utf-8 -*-
"""
@File : speedEvl.py
@Contact : [email protected]
@License : (C)Copyright 2019-2020, CodingPark
@Modify Time @Author @Version @Description
------------ ------- -------- -----------
2020-07-06 13:55 AG 1.0 None
"""
import time
from dictionarySeg import utility
from dictionarySeg import forward_seg
from dictionarySeg import backward_seg
from dictionarySeg import bidirectional_seg
def evaluate_speed(segment, dic, text, pressure=None):
    """Benchmark a segmenter and print its throughput in 万字/秒.

    ``segment(dic, text)`` is called ``pressure`` times and the number of
    characters processed per second, divided by 10000, is printed.

    Args:
        segment: Callable invoked as ``segment(dic, text)``.
        dic: Dictionary object handed through to ``segment``.
        text: Sample text segmented on every iteration.
        pressure: Number of iterations. When omitted, a module-level
            ``pressure`` is used if one exists (the script's ``__main__``
            section defines it); otherwise 10000 is used.
    """
    if pressure is None:
        # Backward compatible with callers that only set the global, while no
        # longer raising NameError when the function is imported and called
        # from another module where that global was never defined.
        pressure = globals().get('pressure', 10000)

    # perf_counter is monotonic and high resolution, which suits interval
    # timing better than the wall clock returned by time.time().
    start_time = time.perf_counter()
    for _ in range(pressure):
        segment(dic, text)
    elapsed_time = time.perf_counter() - start_time

    processed = len(text) * pressure / 10000
    # Guard against a zero interval on very small workloads.
    speed = processed / elapsed_time if elapsed_time > 0 else float('inf')
    print('%.2f 万字/秒' % speed)
if __name__ == '__main__':
    # Number of repetitions per benchmark; evaluate_speed reads this global.
    pressure = 10000
    text = "江西鄱阳湖干枯,中国最大淡水湖变成大草原"
    dic = utility.load_dictionary()

    print('\n由于JPype调用开销巨大,以下速度显著慢于原生Java')

    # Each entry pairs the banner printed before a run with the segmenter
    # that is benchmarked: forward, backward, then bidirectional matching.
    benchmarks = (
        ('\n-----前向最大匹配时速-----', forward_seg.forward_seg),
        ('\n-----后向最大匹配时速-----', backward_seg.backward_seg),
        ('\n-----双向最大匹配时速-----', bidirectional_seg.bidirectional_seg),
    )
    for banner, segmenter in benchmarks:
        print(banner)
        evaluate_speed(segmenter, dic, text)

    print('\n我认为 后向最大匹配 在速度与准确率上均很出色')
结果展示