學習筆記-文本最大匹配分詞

數組結構詞典掃描匹配

# -*- coding:utf-8 -*-

import os
import sys

# Force the interpreter-wide default string encoding to UTF-8 when it is
# currently something else.
# NOTE(review): this is a Python 2-only idiom. `reload` is a builtin only on
# Python 2, and `sys.setdefaultencoding` is hidden at startup there, which is
# why `sys` is reloaded first -- the order of the two calls matters. On
# Python 3 the default encoding is already 'utf-8', so the branch is not
# expected to run.
# Presumably needed so byte strings read from the dictionary file compare
# equal to the unicode input text -- confirm before removing.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)


class MaximumMatching(object):
    """Dictionary-based maximum-matching word segmenter.

    The dictionary is loaded once into a hash set of text (unicode) strings,
    so every lookup is O(1) instead of a linear scan over a list.

    Attributes:
        window_size: maximum number of characters examined per match attempt.
        dicts: set of dictionary words, decoded from UTF-8.
    """

    def __init__(self, window_size, dic_file=None):
        """Load the dictionary and remember the matching window.

        Args:
            window_size: maximum candidate word length; must be >= 1.
            dic_file: optional path to a UTF-8 dictionary with one word per
                line. Defaults to ``static/dic/main.dic`` under the parent of
                the current working directory.

        Raises:
            ValueError: if ``window_size`` is smaller than 1 (the matching
                loops could never consume any input and would spin forever).
            IOError/OSError: if the dictionary file cannot be opened.
            UnicodeDecodeError: if the dictionary is not valid UTF-8.
        """
        if window_size < 1:
            raise ValueError('window_size must be a positive integer')
        self.window_size = window_size
        if dic_file is None:
            dic_file = os.path.join(os.path.dirname(os.getcwd()), 'static', 'dic', 'main.dic')
        self.dicts = set()
        # Read bytes and decode explicitly: the entries must be text so that
        # they hash and compare equal to the unicode input being segmented.
        with open(dic_file, 'rb') as df:
            for raw_line in df:
                word = raw_line.strip().decode('utf-8')
                if word:
                    self.dicts.add(word)

    @staticmethod
    def _to_text(value):
        """Decode UTF-8 byte strings so slicing operates on characters."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    # Forward maximum matching
    def forward_maximum_matching(self, input_word):
        """Segment ``input_word`` greedily from left to right.

        Returns:
            The list of segments in reading order; ``[]`` for empty input.
        """
        remaining = self._to_text(input_word)
        result = []
        while remaining:
            # Slicing clamps automatically when fewer than window_size
            # characters are left.
            window = remaining[:self.window_size]
            untouched = remaining[self.window_size:]
            matched, leftover = self.get_forward_matching_word(window)
            result.append(matched)
            remaining = leftover + untouched
        return result

    # Reverse maximum matching
    def reverse_maximum_matching(self, input_word):
        """Segment ``input_word`` greedily from right to left.

        Returns:
            The list of segments in reading order; ``[]`` for empty input.
        """
        remaining = self._to_text(input_word)
        result = []
        while remaining:
            split_at = max(len(remaining) - self.window_size, 0)
            untouched, window = remaining[:split_at], remaining[split_at:]
            leftover, matched = self.get_reverse_matching_word(window)
            result.append(matched)
            remaining = untouched + leftover
        # Segments were collected back to front.
        result.reverse()
        return result

    # Two-way maximum matching:
    # compare the forward and the reverse segmentation.
    # If the segment counts differ, take the one with fewer segments.
    # If the counts are equal, take the one with fewer single-character
    # segments (the forward result on a tie).
    def two_way_maximum_matching(self, input_word):
        """Return the better of the forward and reverse segmentations."""
        forward_result = self.forward_maximum_matching(input_word)
        reverse_result = self.reverse_maximum_matching(input_word)
        if len(forward_result) != len(reverse_result):
            return min(forward_result, reverse_result, key=len)
        forward_singles = sum(1 for word in forward_result if len(word) == 1)
        reverse_singles = sum(1 for word in reverse_result if len(word) == 1)
        return forward_result if forward_singles <= reverse_singles else reverse_result

    def get_forward_matching_word(self, input_word):
        """Split off the longest dictionary word that prefixes ``input_word``.

        Returns:
            ``(matched, rest)``. When no prefix of two or more characters is
            in the dictionary, the first character alone is the match.
        """
        input_word = self._to_text(input_word)
        for end in range(len(input_word), 1, -1):
            prefix = input_word[:end]
            if prefix in self.dicts:
                return prefix, input_word[end:]
        return input_word[:1], input_word[1:]

    def get_reverse_matching_word(self, input_word):
        """Split off the longest dictionary word that suffixes ``input_word``.

        Returns:
            ``(rest, matched)``. When no suffix of two or more characters is
            in the dictionary, the last character alone is the match. Empty
            input yields two empty strings instead of raising ``IndexError``.
        """
        input_word = self._to_text(input_word)
        for start in range(len(input_word) - 1):
            suffix = input_word[start:]
            if suffix in self.dicts:
                return input_word[:start], suffix
        # Slice (not index) so that empty input does not raise.
        return input_word[:-1], input_word[-1:]


def do_maximum_matching(maximum_matching_obj, input_word):
    """Print the forward, reverse and two-way segmentations of ``input_word``.

    Each section starts with a banner line, followed by one segment per line.

    Args:
        maximum_matching_obj: object providing ``forward_maximum_matching``,
            ``reverse_maximum_matching`` and ``two_way_maximum_matching``.
        input_word: the text to segment.
    """
    sections = (
        ('----------forward----------', 'forward_maximum_matching'),
        ('----------reverse----------', 'reverse_maximum_matching'),
        ('----------two-way----------', 'two_way_maximum_matching'),
    )
    for banner, method_name in sections:
        # The banner goes out before the segmentation runs.
        print(banner)
        segment = getattr(maximum_matching_obj, method_name)
        for token in segment(input_word):
            print(token)


if __name__ == '__main__':
    # Demo run: segment three sample sentences with a 4-character window.
    maximum_matching = MaximumMatching(4)
    for sample_sentence in (
            u'中國移動聯合華爲完成獨立5G網絡下視頻通話',
            u'大齡程序員該如何規劃未來的人生',
            u'民族從此站起來了',
    ):
        do_maximum_matching(maximum_matching, sample_sentence)

 

Trie樹結構詞典掃描匹配

# -*- coding:utf-8 -*-

import os
import sys

# Force the interpreter-wide default string encoding to UTF-8 when it is
# currently something else.
# NOTE(review): this is a Python 2-only idiom. `reload` is a builtin only on
# Python 2, and `sys.setdefaultencoding` is hidden at startup there, which is
# why `sys` is reloaded first -- the order of the two calls matters. On
# Python 3 the default encoding is already 'utf-8', so the branch is not
# expected to run.
# Presumably needed so the `unicode(word)` conversions further down can
# decode byte strings read from the dictionary files -- confirm before
# removing.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)


class MaximumMatching(object):
    """Maximum-matching word segmenter backed by a Trie dictionary.

    Attributes:
        window_size: maximum number of characters examined per match attempt.
        trie: the ``Trie`` holding every dictionary word.
    """

    # Dictionary files, as path components below the parent of the current
    # working directory.
    DICTIONARY_FILES = (
        ('static', 'dic', 'main.dic'),
        ('static', 'ad', 'administrative_division_1.dic'),
        ('static', 'ad', 'administrative_division_2.dic'),
        ('static', 'ad', 'administrative_division_3.dic'),
        ('static', 'ad', 'administrative_division_4.dic'),
        ('static', 'ad', 'administrative_division_5.dic'),
    )

    def __init__(self, window_size):
        """Validate the window, then build the Trie from the dictionaries.

        Args:
            window_size: maximum candidate word length; must be >= 1.

        Raises:
            ValueError: if ``window_size`` is smaller than 1 (the matching
                loops could never consume any input and would spin forever).
        """
        if window_size < 1:
            raise ValueError('window_size must be a positive integer')
        self.window_size = window_size
        self.trie = Trie()
        self.build_trie()

    @staticmethod
    def _to_text(value):
        """Decode UTF-8 byte strings so slicing operates on characters."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    # Forward maximum matching
    def forward_maximum_matching(self, input_word):
        """Segment ``input_word`` greedily from left to right.

        Returns:
            The list of segments in reading order; ``[]`` for empty input.
        """
        remaining = self._to_text(input_word)
        result = []
        while remaining:
            # Slicing clamps automatically when fewer than window_size
            # characters are left.
            window = remaining[:self.window_size]
            untouched = remaining[self.window_size:]
            matched, leftover = self.get_forward_matching_word(window)
            result.append(matched)
            remaining = leftover + untouched
        return result

    # Reverse maximum matching
    def reverse_maximum_matching(self, input_word):
        """Segment ``input_word`` greedily from right to left.

        Returns:
            The list of segments in reading order; ``[]`` for empty input.
        """
        remaining = self._to_text(input_word)
        result = []
        while remaining:
            split_at = max(len(remaining) - self.window_size, 0)
            untouched, window = remaining[:split_at], remaining[split_at:]
            leftover, matched = self.get_reverse_matching_word(window)
            result.append(matched)
            remaining = untouched + leftover
        # Segments were collected back to front.
        result.reverse()
        return result

    # Two-way maximum matching:
    # compare the forward and the reverse segmentation.
    # If the segment counts differ, take the one with fewer segments.
    # If the counts are equal, take the one with fewer single-character
    # segments (the forward result on a tie).
    def two_way_maximum_matching(self, input_word):
        """Return the better of the forward and reverse segmentations."""
        forward_result = self.forward_maximum_matching(input_word)
        reverse_result = self.reverse_maximum_matching(input_word)
        if len(forward_result) != len(reverse_result):
            return min(forward_result, reverse_result, key=len)
        forward_singles = sum(1 for word in forward_result if len(word) == 1)
        reverse_singles = sum(1 for word in reverse_result if len(word) == 1)
        return forward_result if forward_singles <= reverse_singles else reverse_result

    def get_forward_matching_word(self, input_word):
        """Split off the longest dictionary word that prefixes ``input_word``.

        Returns:
            ``(matched, rest)``. When no prefix of two or more characters is
            in the Trie, the first character alone is the match.
        """
        input_word = self._to_text(input_word)
        for end in range(len(input_word), 1, -1):
            prefix = input_word[:end]
            if self.trie.search(prefix):
                return prefix, input_word[end:]
        return input_word[:1], input_word[1:]

    def get_reverse_matching_word(self, input_word):
        """Split off the longest dictionary word that suffixes ``input_word``.

        Returns:
            ``(rest, matched)``. When no suffix of two or more characters is
            in the Trie, the last character alone is the match. Empty input
            yields two empty strings instead of raising ``IndexError``.
        """
        input_word = self._to_text(input_word)
        for start in range(len(input_word) - 1):
            suffix = input_word[start:]
            if self.trie.search(suffix):
                return input_word[:start], suffix
        # Slice (not index) so that empty input does not raise.
        return input_word[:-1], input_word[-1:]

    def build_trie(self):
        """Insert the words of every file in ``DICTIONARY_FILES`` into the Trie."""
        prefix = os.path.dirname(os.getcwd())
        for parts in self.DICTIONARY_FILES:
            self.build_trie_by_path(os.path.join(prefix, *parts))

    def build_trie_by_path(self, dic_file):
        """Insert every non-blank line of the UTF-8 file ``dic_file``.

        Raises:
            IOError/OSError: if the file cannot be opened.
            UnicodeDecodeError: if the file is not valid UTF-8.
        """
        # Read bytes and decode explicitly so the Trie is keyed by characters
        # regardless of the interpreter's default encoding.
        with open(dic_file, 'rb') as df:
            for raw_line in df:
                word = raw_line.strip().decode('utf-8')
                if word:
                    self.trie.insert(word)


class Trie(object):
    """Character Trie built from nested dicts.

    Each node is a dict mapping one character to its child node; a node that
    completes a word additionally holds ``end_flag`` as a key.
    """

    def __init__(self):
        self.root = {}
        # Non-string sentinel key, so it can never collide with a character.
        self.end_flag = -1

    @staticmethod
    def _to_text(word):
        """Return ``word`` as text, decoding byte strings as UTF-8.

        Decoding explicitly keeps the Trie independent of the interpreter's
        default encoding.
        """
        text_type = type(u'')
        if isinstance(word, bytes):
            return word.decode('utf-8')
        if not isinstance(word, text_type):
            return text_type(word)
        return word

    def insert(self, word):
        """Add ``word`` to the Trie, one character per level."""
        node = self.root
        for char in self._to_text(word):
            node = node.setdefault(char, {})
        node[self.end_flag] = True

    def search(self, word):
        """Return True if exactly ``word`` (not just a prefix) was inserted."""
        node = self.root
        for char in self._to_text(word):
            if char not in node:
                return False
            node = node[char]
        return self.end_flag in node

    def print_trie(self):
        """Print every character of the Trie, indented by depth."""
        self.print_item(self.root)

    def print_item(self, current, indent=0):
        """Recursively print the characters below node ``current``.

        Args:
            current: the node (dict) whose children are printed.
            indent: depth of ``current``; one ``-`` is printed per level.
        """
        if not current:
            return
        prefix = '-' * indent
        for key, child in current.items():
            if key == self.end_flag:
                # The end-of-word marker is not a character.
                continue
            print(prefix + key)
            self.print_item(child, indent + 1)


def do_maximum_matching(maximum_matching_obj, input_word):
    """Print the forward, reverse and two-way segmentations of ``input_word``.

    Each section starts with a banner line, followed by one segment per line.

    Args:
        maximum_matching_obj: object providing ``forward_maximum_matching``,
            ``reverse_maximum_matching`` and ``two_way_maximum_matching``.
        input_word: the text to segment.
    """
    def show(banner, method_name):
        # The banner goes out before the segmentation runs.
        print(banner)
        segmenter = getattr(maximum_matching_obj, method_name)
        for piece in segmenter(input_word):
            print(piece)

    show('----------forward----------', 'forward_maximum_matching')
    show('----------reverse----------', 'reverse_maximum_matching')
    show('----------two-way----------', 'two_way_maximum_matching')


if __name__ == '__main__':
    # Demo run: segment three sample sentences with a 4-character window.
    maximum_matching = MaximumMatching(4)
    for sample_sentence in (
            u'中國移動聯合華爲完成獨立5G網絡下視頻通話',
            u'大齡程序員該如何規劃未來的人生',
            u'民族從此站起來了',
    ):
        do_maximum_matching(maximum_matching, sample_sentence)

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章