Python學習筆記-文本最大匹配分詞

# -*- coding:utf-8 -*-

import io
import os
import sys

# Python 2 only: make UTF-8 the interpreter-wide default encoding so that
# implicit str <-> unicode conversions use UTF-8 instead of ASCII.
# NOTE(review): presumably needed so unicode input text can be compared with
# dictionary entries loaded as byte strings -- confirm before removing.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    # sys.setdefaultencoding is removed from the sys module at start-up;
    # reloading sys makes it available again.
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Maximum number of characters examined in one matching step; read by
# forward_maximum_match and reverse_maximum_match.
window_size = 4

# Dictionary words used for matching. Populated by maximum_match and queried
# by get_forward_match_word / get_reverse_match_word.
dicts = []


# Forward maximum matching
def forward_maximum_match(input_word):
    """Segment ``input_word`` from left to right using forward maximum matching.

    At each step the first ``window_size`` characters of the remaining text
    (or all of it, if shorter) are passed to ``get_forward_match_word``. The
    token it returns is emitted, and the characters of the window it did not
    consume are put back in front of the rest of the text.

    Args:
        input_word: The text to segment.

    Returns:
        A list of tokens in their original order. An empty input yields an
        empty list.
    """
    result = []
    # Looping on the remaining text (rather than ``while True`` with a
    # trailing break) keeps an empty input from producing a spurious "" token.
    while input_word:
        # Slicing past the end is safe, so no special case is needed when
        # fewer than ``window_size`` characters remain.
        window, rest = input_word[:window_size], input_word[window_size:]
        matched, unmatched = get_forward_match_word(window)
        result.append(matched)
        input_word = unmatched + rest
    return result


# Reverse maximum matching
def reverse_maximum_match(input_word):
    """Segment ``input_word`` from right to left using reverse maximum matching.

    At each step the last ``window_size`` characters of the remaining text
    (or all of it, if shorter) are passed to ``get_reverse_match_word``. The
    token it returns is emitted, and the characters of the window it did not
    consume are appended back to the text that precedes the window.

    Args:
        input_word: The text to segment.

    Returns:
        A list of tokens in their original (left-to-right) order. An empty
        input yields an empty list instead of raising.
    """
    result = []
    # Looping on the remaining text means an empty input never reaches the
    # helper, so it returns [] just like the forward variant.
    while input_word:
        # Clamp at 0 so a short remainder is taken whole as the window.
        split = max(len(input_word) - window_size, 0)
        head, window = input_word[:split], input_word[split:]
        unmatched, matched = get_reverse_match_word(window)
        result.append(matched)
        input_word = head + unmatched
    # Tokens were collected right-to-left; restore reading order.
    result.reverse()
    return result


# Bidirectional maximum matching
# Compare the forward and the reverse maximum-matching results:
# - different token counts: take the result with fewer tokens;
# - same token count: take the result with fewer single-character tokens
#   (the forward result on a tie, which also covers identical results).
def two_way_maximum_match(input_word):
    """Return the better of the forward and reverse segmentations.

    Args:
        input_word: The text to segment.

    Returns:
        The token list produced by ``forward_maximum_match`` or by
        ``reverse_maximum_match``, chosen by the rules described above.
    """
    candidates = (forward_maximum_match(input_word),
                  reverse_maximum_match(input_word))
    forward_tokens, reverse_tokens = candidates

    if len(forward_tokens) != len(reverse_tokens):
        # Lengths differ, so the minimum is unique.
        return min(candidates, key=len)

    forward_singles = sum(1 for token in forward_tokens if len(token) == 1)
    reverse_singles = sum(1 for token in reverse_tokens if len(token) == 1)
    if reverse_singles < forward_singles:
        return reverse_tokens
    return forward_tokens


def get_forward_match_word(input_word):
    """Split off the longest dictionary word that starts ``input_word``.

    Prefixes are tried from the whole string down to two characters. When
    none of them is in ``dicts`` the first character is taken on its own.

    Args:
        input_word: The window of text to match.

    Returns:
        A ``(matched, remainder)`` tuple.
    """
    end = len(input_word)
    while end > 1:
        if input_word[:end] in dicts:
            return input_word[:end], input_word[end:]
        end -= 1
    # No multi-character prefix matched: fall back to a single character.
    return input_word[:1], input_word[1:]


def get_reverse_match_word(input_word):
    """Split off the longest dictionary word that ends ``input_word``.

    Suffixes are tried from the whole string down to two characters. When
    none of them is in ``dicts`` the last character is taken on its own.

    Args:
        input_word: The window of text to match.

    Returns:
        A ``(remainder, matched)`` tuple. For an empty ``input_word`` both
        parts are empty.
    """
    # Stop before the single-character suffix: it is returned by the
    # fallback below whether or not it is in the dictionary, so looking it
    # up would be wasted work (this mirrors get_forward_match_word).
    for i in range(len(input_word) - 1):
        suffix = input_word[i:]
        if suffix in dicts:
            return input_word[:i], suffix
    # Slice (not index) the last character so an empty string yields ""
    # instead of raising IndexError.
    return input_word[:-1], input_word[-1:]


def maximum_match(input_word):
    """Load the dictionary and print three segmentations of ``input_word``.

    The dictionary is read from ``static/dic/main.dic`` under the parent of
    the current working directory, one word per line, and stored in the
    module-level ``dicts`` list. The forward, reverse and two-way
    maximum-matching results are then printed, one token per line, each
    under its own header line.

    Args:
        input_word: The text to segment.

    Raises:
        IOError / OSError: If the dictionary file cannot be opened.
    """
    dic_file = os.path.join(os.path.dirname(os.getcwd()), 'static', 'dic', 'main.dic')
    # io.open accepts an explicit encoding on both Python 2 and 3, so the
    # entries are decoded to text here rather than implicitly on comparison.
    with io.open(dic_file, 'r', encoding='utf-8') as df:
        words = [line.strip() for line in df]
    # Replace the contents in place instead of appending: repeated calls
    # used to grow ``dicts`` with duplicate copies of the whole dictionary.
    dicts[:] = [word for word in words if word]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('----------forward----------')
    for f_result in forward_maximum_match(input_word):
        print(f_result)
    print('----------reverse----------')
    for r_result in reverse_maximum_match(input_word):
        print(r_result)
    print('----------two-way----------')
    for tw_result in two_way_maximum_match(input_word):
        print(tw_result)

# Demo: segment a few sample sentences when the file is run as a script.
if __name__ == '__main__':
    for sample in (
            u'中國移動聯合華爲完成獨立5G網絡下視頻通話',
            u'大齡程序員該如何規劃未來的人生',
            u'民族從此站起來了',
    ):
        maximum_match(sample)

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章