中文信息處理--分句

中文信息處理--分句


    工欲善其事,必先利其器。本文介紹中文信息處理中的一個基礎工具:中文分句。

    按這幾個標點“ 。!?…!?”將中文進行分句時,一般會遇到一些問題:比如成對的標點《》“”‘’{}()()【】"",如果其內部包含上述分句標點,就會把完整的一句話錯誤地拆分成幾個句子。

    以下是使用Python(Python 2)實現的中文分句程序,可以處理單個文件或者文件夾,但文件編碼需爲UTF-8:

   

# coding=utf-8
# python

import sys;
import os;

# Sentence-terminating marks: FindTok() reports a character as a sentence
# boundary when it appears in this string. The literal is decoded from UTF-8
# so that it is compared character by character rather than byte by byte.
cutlist="。!?…!?".decode('utf-8')
# Paired punctuation, laid out as opener, closer, opener, closer, ...
# ConstPunctPair() reads the characters two at a time from this string.
punct_pair_str = "《》“”‘’{}()()【】\"\"".decode('utf-8')
# Lookup table filled by ConstPunctPair(): the second character of each pair
# (the closer) maps to the first (the opener). It starts empty and is built
# the first time AddPunct() sees a character from punct_pair_str.
punct_pair_hm = {}

# Running total of sentences written by handle_file() across all files.
sent_count = 0

def FindTok(char):
    """Tell whether `char` is a sentence-ending mark.

    Returns True when `char` occurs in the module-level `cutlist` string,
    otherwise False.
    """
    return char in cutlist

def CutSent(cut_str):
    """Split `cut_str` into a list of sentences.

    A sentence ends at a terminator (see FindTok) that is not enclosed in
    paired punctuation such as quotes or brackets (tracked via AddPunct).
    Consecutive terminators, e.g. the two-character ellipsis or "?!", stay
    attached to the sentence they close instead of being emitted as separate
    punctuation-only sentences.

    Args:
        cut_str: the text to split, iterated character by character.

    Returns:
        A list of sentence strings in input order. Trailing text without a
        terminator is returned as the last element. Empty input gives [].
    """
    sent_list = []
    sent = []
    # Stack of paired punctuation that is currently open.
    punct_pair = []
    # True once the current sentence has seen a terminator outside any pair;
    # the sentence is flushed as soon as a non-terminator follows.
    pending_cut = False

    for ch in cut_str:
        is_tok = FindTok(ch)
        if pending_cut and not is_tok:
            sent_list.append(''.join(sent))
            sent = []
            pending_cut = False

        AddPunct(punct_pair, ch)
        sent.append(ch)

        # Only cut when no paired punctuation is still open, so that a quoted
        # or bracketed passage is never broken in the middle.
        if is_tok and not punct_pair:
            pending_cut = True

    if sent:
        sent_list.append(''.join(sent))

    return sent_list

def ConstPunctPair():
    """Populate `punct_pair_hm` from `punct_pair_str`.

    `punct_pair_str` is read two characters at a time; for each pair the
    second character (closer) becomes a key whose value is the first
    character (opener).
    """
    pos = 0
    total = len(punct_pair_str)
    while pos < total:
        opener = punct_pair_str[pos]
        closer = punct_pair_str[pos + 1]
        punct_pair_hm[closer] = opener
        pos += 2


def AddPunct(punct_pair, ch):
    """Update the stack of open paired punctuation with character `ch`.

    Args:
        punct_pair: list used as a stack of unmatched paired punctuation;
            it is modified in place.
        ch: the character being scanned.

    Returns:
        The same `punct_pair` list.

    Characters outside `punct_pair_str` leave the stack untouched. An opener
    is pushed. A closer removes the nearest matching opener (searching from
    the top of the stack); if there is none, the closer itself is pushed,
    which is also how a symmetric mark such as '"' opens a pair.
    """
    if ch not in punct_pair_str:
        return punct_pair

    # Build the closer -> opener table on first use.
    if not punct_pair_hm:
        ConstPunctPair()

    if ch in punct_pair_hm:
        opener = punct_pair_hm[ch]
        for pos in reversed(range(len(punct_pair))):
            if punct_pair[pos] == opener:
                punct_pair.pop(pos)
                return punct_pair

    # Either an opener, or a closer with nothing to close.
    punct_pair.append(ch)
    return punct_pair

def handle_file(input_path, output_path, multi_line=False):
    """Split a UTF-8 text file into sentences, writing one sentence per line.

    Args:
        input_path: path of the UTF-8 encoded file to read.
        output_path: path of the file to write (overwritten if it exists).
        multi_line: when True, all input lines are joined into one string
            before splitting, so a sentence may span several input lines.
            When False, each input line is split on its own.

    Side effects:
        Increments the module-level `sent_count` once per sentence written.

    Raises:
        IOError: if either file cannot be opened.
        UnicodeDecodeError: if the input is not valid UTF-8.
    """
    global sent_count

    with open(input_path) as fpr:
        with open(output_path, 'w') as fpw:
            # Strip only the line terminator. Slicing off the last byte
            # unconditionally would truncate a multi-byte UTF-8 character
            # when the final line has no trailing newline, and would leave a
            # stray "\r" on CRLF files.
            lines = (raw_line.rstrip('\r\n').decode('utf-8') for raw_line in fpr)

            if multi_line:
                chunks = [''.join(lines)]
            else:
                chunks = lines

            for chunk in chunks:
                for sent in CutSent(chunk):
                    sent_count += 1
                    fpw.write(sent.encode('utf-8')+"\n")
    
def handle_dir(input_path, output_path, multi_line=False):
    """Process every file under `input_path`, mirroring the tree in `output_path`.

    Args:
        input_path: directory whose entries are processed; sub-directories
            are handled recursively.
        output_path: directory that receives the results. It is created,
            together with any missing parent directories, if it does not
            exist yet.
        multi_line: passed through unchanged to handle_file.
    """
    # makedirs (rather than mkdir) so that an output path whose parent does
    # not exist yet can still be used.
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    for file_name in os.listdir(input_path):
        src = os.path.join(input_path, file_name)
        dst = os.path.join(output_path, file_name)
        if os.path.isdir(src):
            handle_dir(src, dst, multi_line)
        else:
            handle_file(src, dst, multi_line)


def handle(input_path, output_path, multi_line=False):
    """Dispatch to handle_dir or handle_file depending on `input_path`.

    A directory is processed with handle_dir; anything else is passed to
    handle_file. `output_path` and `multi_line` are forwarded unchanged.
    """
    worker = handle_dir if os.path.isdir(input_path) else handle_file
    worker(input_path, output_path, multi_line)

if __name__ == "__main__":
    # Exactly two arguments are expected: the input path and the output path.
    # Anything else prints the usage line instead of running.
    if len(sys.argv) == 3:
        handle(sys.argv[1], sys.argv[2], False)
    else:
        print("python %s input_path, output_path" % sys.argv[0])


#cutlist="[。,,!!《》<>\"'::?\?、、|“”‘’;]{}(){}【】();~-_——+=*&……#@`·\n\r".decode('utf-8')

 或源代碼見:

  https://github.com/beifeng600/nlp_storeroom/tree/master/tools/%E5%88%86%E5%8F%A5


   參考:

    Python 中文處理問題--分句,

   http://m.blog.csdn.net/blog/yhc13429826359/4141471


 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章