# -*- coding:utf-8 -*-
import os
import sys
# Python 2 only: make UTF-8 the interpreter-wide default codec used for
# implicit str <-> unicode conversions. When the default already is UTF-8
# the branch below is skipped entirely.
default_encoding = 'utf-8'
if not sys.getdefaultencoding() == default_encoding:
    # reload() brings back sys.setdefaultencoding, which the interpreter
    # removes from the sys module during start-up.
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Widest slice, in characters, that the matchers hand to the lookup helpers.
window_size = 4

# Dictionary words; starts empty and is filled by maximum_match().
dicts = []
# Forward maximum matching
def forward_maximum_match(input_word):
    """Segment ``input_word`` by scanning from left to right.

    At each position a window of at most ``window_size`` characters is
    handed to ``get_forward_match_word``; the word it returns is emitted
    and the scan resumes immediately after that word.

    :param input_word: text to segment.
    :return: list of words in reading order; ``[]`` for empty input
        (previously an empty input produced the bogus result ``[""]``).
    """
    result = []
    position = 0
    total = len(input_word)
    while position < total:
        # Slicing clamps at the end of the text, so a short tail needs no
        # special case.
        window = input_word[position:position + window_size]
        matched_word, _ = get_forward_match_word(window)
        result.append(matched_word)
        # Advance by what was consumed instead of rebuilding the remaining
        # text through string concatenation on every step.
        position += len(matched_word)
    return result
# Reverse maximum matching
def reverse_maximum_match(input_word):
    """Segment ``input_word`` by scanning from right to left.

    At each step the last (at most) ``window_size`` characters of the
    still-unsegmented text are handed to ``get_reverse_match_word``; the
    word it returns is emitted and the scan continues to the left of it.

    :param input_word: text to segment.
    :return: list of words in reading order; ``[]`` for empty input
        (previously an empty input raised ``IndexError`` inside
        ``get_reverse_match_word``).
    """
    result = []
    end = len(input_word)
    while end > 0:
        # max() keeps the window start from going negative when fewer than
        # window_size characters are left.
        window = input_word[max(0, end - window_size):end]
        _, matched_word = get_reverse_match_word(window)
        result.append(matched_word)
        end -= len(matched_word)
    # Words were collected last-to-first; restore reading order.
    result.reverse()
    return result
# Bidirectional maximum matching
# Compares the forward and the reverse maximum-matching results:
#   * different number of words: take the result with fewer words;
#   * same number of words: identical results may return either one,
#     differing results return the one with fewer single-character words.
def two_way_maximum_match(input_word):
    """Return the preferred one of the forward and reverse segmentations.

    When both the word count and the single-character word count are
    equal, the forward result is returned.

    :param input_word: text to segment.
    :return: the list produced by either ``forward_maximum_match`` or
        ``reverse_maximum_match``.
    """
    forward_words = forward_maximum_match(input_word)
    reverse_words = reverse_maximum_match(input_word)

    # Criterion 1: fewer words wins.
    if len(forward_words) < len(reverse_words):
        return forward_words
    if len(forward_words) > len(reverse_words):
        return reverse_words

    # Criterion 2: fewer single-character words wins, forward on a tie.
    forward_singles = sum(1 for word in forward_words if len(word) == 1)
    reverse_singles = sum(1 for word in reverse_words if len(word) == 1)
    if reverse_singles < forward_singles:
        return reverse_words
    return forward_words
def get_forward_match_word(input_word):
    """Split off the longest dictionary word that starts ``input_word``.

    Prefixes are tried from the full text down to two characters. If none
    is found in ``dicts``, the first character alone is split off.

    :param input_word: window of text to match.
    :return: tuple ``(matched_prefix, remaining_text)``.
    """
    length = len(input_word)
    while length > 1:
        candidate = input_word[:length]
        if candidate in dicts:
            return candidate, input_word[length:]
        length -= 1
    # No multi-character prefix is a dictionary word.
    return input_word[:1], input_word[1:]
def get_reverse_match_word(input_word):
    """Split off the longest dictionary word that ends ``input_word``.

    Suffixes are tried from the full text down to two characters. If none
    is found in ``dicts``, the last character alone is split off.

    :param input_word: window of text to match.
    :return: tuple ``(remaining_text, matched_suffix)``; ``("", "")`` for
        empty input (previously this raised ``IndexError``).
    """
    # The one-character suffix is not looked up: the fallback below returns
    # exactly that split anyway, which also mirrors get_forward_match_word.
    for start in range(len(input_word) - 1):
        suffix = input_word[start:]
        if suffix in dicts:
            return input_word[:start], suffix
    # Slices rather than an index, so empty input does not raise.
    return input_word[:-1], input_word[-1:]
def maximum_match(input_word):
    """Load the dictionary and print three segmentations of ``input_word``.

    Words from ``static/dic/main.dic`` are added to the module-level
    ``dicts`` list, then the forward, reverse and two-way segmentations are
    printed, one word per line, each under its own banner line.

    Only words not already present are added to ``dicts``. Previously every
    call appended the whole file again, so the list kept growing with
    duplicates and each lookup became slower.

    :param input_word: text to segment.
    :raises IOError: if the dictionary file cannot be opened (``OSError``
        on Python 3).
    """
    # Local import: io.open decodes explicitly on Python 2 and 3 alike.
    import io

    # NOTE: the path is resolved against the parent of the current working
    # directory, so the result depends on where the script is started from.
    dic_file = os.path.join(os.path.dirname(os.getcwd()), 'static', 'dic', 'main.dic')
    known_words = set(dicts)
    # Decode as UTF-8 rather than relying on the platform or interpreter
    # default; assumes main.dic is UTF-8, consistent with default_encoding.
    with io.open(dic_file, 'r', encoding='utf-8') as df:
        for line in df:
            word = line.strip()
            # Blank lines can never match a slice, so they are skipped.
            if word and word not in known_words:
                known_words.add(word)
                dicts.append(word)

    print('----------forward----------')
    for f_result in forward_maximum_match(input_word):
        print(f_result)
    print('----------reverse----------')
    for r_result in reverse_maximum_match(input_word):
        print(r_result)
    print('----------two-way----------')
    for tw_result in two_way_maximum_match(input_word):
        print(tw_result)
if __name__ == '__main__':
    # Demo run: segment a few sample sentences and print the results.
    for sample_sentence in (
        u'中國移動聯合華爲完成獨立5G網絡下視頻通話',
        u'大齡程序員該如何規劃未來的人生',
        u'民族從此站起來了',
    ):
        maximum_match(sample_sentence)