學習筆記-行政區劃匹配缺失補全

簡單實現，存在缺陷：同名的行政區劃在上下級對照表中會互相覆蓋，且每一級只取第一個匹配到的名稱，因此補全結果不一定正確。

具體代碼如下：

# -*- coding:utf-8 -*-

import os
import sys
import enum
from maximum_matching_2 import MaximumMatching

# Python 2 only: make UTF-8 the interpreter-wide default codec. The bare
# unicode(...) conversions applied to dictionary-file contents elsewhere in
# this file pass no explicit encoding and therefore decode with this default.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    # sys.setdefaultencoding is removed from the sys namespace at interpreter
    # start-up; reloading the module makes it callable again.
    reload(sys)
    sys.setdefaultencoding(default_encoding)


class AdministrativeDivisionMatching(object):
    """Complete the missing higher-level administrative divisions of an address.

    The address is segmented into words, every word that names a known
    division (province, city, county, town or village) is recorded, and the
    levels above the lowest recognised division that were not mentioned are
    filled in from a child -> parent lookup table.

    Known limitation: the lookup table is keyed by name only, so divisions
    that share a name overwrite one another, and only the first match of each
    level is used.
    """

    def __init__(self, base_dir=None):
        """Load the per-level dictionaries and the hierarchy table.

        :param base_dir: directory that contains ``static/ad``. Defaults to
            the parent of the current working directory.
        :raises IOError: if one of the dictionary files cannot be opened.
        """
        if base_dir is None:
            base_dir = os.path.dirname(os.getcwd())
        ad_dir = os.path.join(base_dir, 'static', 'ad')

        # For every level: a list of all names (abbreviated and full) and a
        # dict mapping each abbreviated name to its full name.
        self.provinces, self.province_abbr_full = self._load_level(ad_dir, 1)
        self.cities, self.city_abbr_full = self._load_level(ad_dir, 2)
        self.counties, self.county_abbr_full = self._load_level(ad_dir, 3)
        self.towns, self.town_abbr_full = self._load_level(ad_dir, 4)
        self.villages, self.village_abbr_full = self._load_level(ad_dir, 5)

        # name -> name of the division one level above it. Each line of the
        # file is a comma-separated chain in which every field is followed by
        # its parent; every adjacent pair is recorded. Blank lines yield no
        # pair and are therefore ignored instead of raising IndexError.
        self.ads = {}
        ad_file = os.path.join(ad_dir, 'administrative_division.dic')
        with open(ad_file, 'r') as f:
            for line in f:
                words = [unicode(w) for w in line.strip().split(',')]
                for child, parent in zip(words, words[1:]):
                    self.ads[child] = parent

        # Hash-based membership sets, checked from the highest level down, so
        # each word lookup is O(1) instead of a scan over the name lists.
        self._levels = (
            (MatchingType.PROVINCE, frozenset(self.provinces),
             self.province_abbr_full),
            (MatchingType.CITY, frozenset(self.cities),
             self.city_abbr_full),
            (MatchingType.COUNTY, frozenset(self.counties),
             self.county_abbr_full),
            (MatchingType.TOWN, frozenset(self.towns),
             self.town_abbr_full),
            (MatchingType.VILLAGE, frozenset(self.villages),
             self.village_abbr_full),
        )

        # NOTE(review): 8 is presumably the maximum word length used by the
        # segmenter -- confirm against maximum_matching_2.
        self.maximum_matching = MaximumMatching(8)

    @staticmethod
    def _load_level(ad_dir, level):
        """Return ``(names, abbr_to_full)`` read from the level's .dic file."""
        names = []
        abbr_full = {}
        dic_path = os.path.join(
            ad_dir, 'administrative_division_%d.dic' % level)
        AdministrativeDivisionMatching.init_ad_params(
            dic_path, names, abbr_full)
        return names, abbr_full

    @staticmethod
    def init_ad_params(dic_path, ad_array, ad_dict):
        """Read a dictionary file of alternating abbreviated/full name lines.

        :param dic_path: path of the dictionary file.
        :param ad_array: list that receives every name, abbreviated and full.
        :param ad_dict: dict that receives ``abbreviated -> full`` entries.
        """
        with open(dic_path, 'r') as df:
            lines = df.readlines()
        # Stop before a trailing unpaired line (e.g. a final blank line) so it
        # is ignored instead of raising IndexError.
        for i in range(0, len(lines) - 1, 2):
            abbr = unicode(lines[i].strip())
            full = unicode(lines[i + 1].strip())
            ad_array.append(abbr)
            ad_array.append(full)
            ad_dict[abbr] = full

    def address_missing_completion(self, address):
        """Return the full division chain for ``address``, highest level first.

        :param address: unicode address text.
        :returns: list of full division names from province level down to the
            lowest level recognised in ``address``; an empty list when no
            division is recognised.
        :raises KeyError: if a missing level has to be derived from a name
            that is absent from the hierarchy table.
        """
        matching = Matching()
        words = self.maximum_matching.two_way_maximum_matching(address)
        # Kept from the original implementation: echo the segmentation result.
        print(' '.join(words))
        for word in words:
            for matching_type, names, abbr_full in self._levels:
                if word in names:
                    # Normalise an abbreviated name to its full form.
                    matching.add_word(matching_type, abbr_full.get(word, word))
                    break

        matcher_items = matching.matcher_items
        if not matcher_items:
            # Nothing recognised; previously this fell through to keys[0] and
            # raised IndexError.
            return []

        full_result = []
        # Walk from the lowest recognised level up to the province level,
        # deriving every level that was not mentioned from the one below it.
        for level in range(max(matcher_items), 0, -1):
            if level not in matcher_items:
                child = matcher_items[level + 1][0]
                matcher_items[level] = [self.ads[child]]
            full_result.append(matcher_items[level][0])
        full_result.reverse()
        return full_result


@enum.unique
class MatchingType(enum.Enum):
    """Administrative division levels.

    Values increase from the highest level (province) to the lowest
    (village).
    """

    PROVINCE = 1
    CITY = 2
    COUNTY = 3
    TOWN = 4
    VILLAGE = 5


class Matching(object):
    """Collect matched division names, grouped by the value of their level."""

    def __init__(self):
        # level value -> list of matched names, in the order they were added
        self.matcher_items = {}

    def add_word(self, matching_type, word):
        """Append ``word`` to the list kept for ``matching_type.value``.

        :param matching_type: object whose ``value`` attribute is the key.
        :param word: name to record under that key.
        """
        bucket = self.matcher_items.setdefault(matching_type.value, [])
        bucket.append(word)


if __name__ == '__main__':
    # Demo: complete a handful of sample addresses and print each completed
    # chain as a space-separated line.
    ad_matching = AdministrativeDivisionMatching()
    sample_addresses = (
        u'北京市',
        u'成都市',
        u'資中縣',
        u'鰲頭鎮',
        u'上方山村',
        u'四川省資中縣',
        u'萊蕪市上方山村',
    )
    for sample_address in sample_addresses:
        completed = ad_matching.address_missing_completion(sample_address)
        print(' '.join(completed))

結果如下（每次調用輸出兩行：第一行是分詞結果，第二行是補全後的完整行政區劃）：

北京市
北京市
成都市
四川省 成都市
資中縣
四川省 內江市 資中縣
鰲頭鎮
廣東省 茂名市 茂南區 鰲頭鎮
上方山村
山東省 萊蕪市 萊城區 苗山鎮 上方山村
四川省 資中縣
四川省 內江市 資中縣
萊蕪市 上方山村
山東省 萊蕪市 萊城區 苗山鎮 上方山村

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

學習筆記-行政區劃匹配缺失補全

Python學習筆記-WXPY語音信息識別

推薦系統筆記-01-文章畫像

Python學習筆記-WXPY初識

Activiti學習筆記-整合SpringBoot與簡單使用

ElasticSearch學習筆記-ngram、中文拼音、簡繁體搜索記錄

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結