搜狗 scel詞包解析

原創

2019-06-10 19:56

前言

在許多基礎任務中，構建詞庫是不可避免的一步。爲了不必從零開始構建，可以直接利用已有的詞庫。下面的代碼用於解析搜狗 scel 詞庫；代碼寫得比較早，當時用的是 Python 2。希望能對大家有所幫助。


#!/usr/bin/python 
#  -*-  coding:  utf-8  -*- 

import struct 
import  sys 
import  binascii   
import  pdb
import os
import chardet
# A Sogou .scel dictionary stores its text as Unicode code units, two bytes
# per character (a Chinese character or an English letter).
# Offset of the pinyin table inside the file.
startPy  =  0x1540;

# Offset of the Chinese phrase table.
startChinese  =  0x2628;
# Global pinyin table: index -> pinyin string.
GPy_Table  ={}
# Parse results:
# a list of (frequency, pinyin, Chinese phrase) tuples.
GTable  =  []


def byte2str(data):
    """Decode raw two-byte-per-character text from a .scel file.

    Works on both Python 2 (``str`` input) and Python 3 (``bytes`` input).

    Args:
        data: Byte string holding little-endian 16-bit code units.

    Returns:
        The decoded unicode text with every carriage return replaced by a
        line feed.  All other characters, including spaces and NUL padding,
        are kept.  A dangling odd byte at the end is ignored, and code units
        that do not form valid UTF-16 come back as U+FFFD.
    """
    # Decode explicitly as little-endian: struct's bare 'H' format follows
    # the host byte order and would scramble the text on big-endian machines.
    even_length = len(data) - len(data) % 2
    text = data[:even_length].decode('utf-16-le', 'replace')
    return text.replace(u'\r', u'\n')
# Load the pinyin table.
def  getPyTable(data):
    """Fill the global ``GPy_Table`` from the pinyin section of a .scel file.

    The section starts with the 4-byte marker ``9D 01 00 00`` followed by
    records laid out as ``index (uint16) | size (uint16) | text``, where
    ``size`` is the byte length of the text that ``byte2str`` decodes.

    Args:
        data: Raw bytes of the pinyin section.

    Returns:
        None.  When the marker is missing nothing is parsed.
    """
    # Bytes literal: on Python 3 a bytes slice never equals a text string,
    # so a plain literal would reject every file.
    if data[0:4] != b"\x9D\x01\x00\x00":
        return None
    data = data[4:]
    pos = 0
    length = len(data)
    # Stop as soon as a full 4-byte record header no longer fits, instead of
    # reading past the end of the buffer.
    while pos + 4 <= length:
        # '<' pins little-endian; a bare 'H' would follow the host order.
        index, size = struct.unpack_from('<HH', data, pos)
        pos += 4
        GPy_Table[index] = byte2str(data[pos:pos + size])
        pos += size
 
# Build the pinyin of one phrase.
def  getWordPy(data):
    """Translate a phrase's pinyin index list into its pinyin string.

    Args:
        data: Raw bytes holding consecutive little-endian uint16 indexes into
            the global ``GPy_Table``.

    Returns:
        The ``GPy_Table`` entries for all indexes, concatenated in order.

    Raises:
        KeyError: If an index is not present in ``GPy_Table``.
    """
    index_count = len(data) // 2  # a dangling odd byte cannot form an index
    # One unpack for the whole list; '<' pins little-endian byte order.
    indexes = struct.unpack_from('<%dH' % index_count, data)
    return u''.join(GPy_Table[index] for index in indexes)
 
# Look up one phrase through the pinyin table.
def  getWord(data):
    """Resolve a list of uint16 indexes through the global ``GPy_Table``.

    NOTE: despite its name this function does not decode phrase text; it
    performs exactly the same pinyin-table lookup as ``getWordPy`` and is
    kept only so existing callers keep working.

    Args:
        data: Raw bytes holding consecutive uint16 indexes.

    Returns:
        The concatenated ``GPy_Table`` entries, as returned by ``getWordPy``.
    """
    # Delegate rather than carry a second copy of the same loop.
    return getWordPy(data)
 
# Read the Chinese phrase table.
def  getChinese(data):
    """Parse the phrase section and append every entry to the global ``GTable``.

    The section is a sequence of groups of phrases that share one pinyin:

        same (uint16)           number of phrases in the group
        py_table_len (uint16)   byte length of the pinyin index list
        pinyin index list       resolved through ``getWordPy``
        then ``same`` times:
            c_len (uint16)      byte length of the phrase text
            phrase text         decoded by ``byte2str``
            ext_len (uint16)    byte length of the extension data
            extension data      its first uint16 is stored as the frequency

    Args:
        data: Raw bytes of the phrase section.

    Returns:
        None.  Results are appended to ``GTable`` as
        ``(count, pinyin, word)`` tuples.

    Raises:
        struct.error: If the data ends in the middle of a phrase record.
        KeyError: If a pinyin index is missing from ``GPy_Table``.
    """
    pos = 0
    length = len(data)

    # A group header needs 4 bytes; stop when it no longer fits.
    while pos + 4 <= length:
        # '<' pins little-endian; a bare 'H' would follow the host order.
        same, py_table_len = struct.unpack_from('<HH', data, pos)
        pos += 4
        py = getWordPy(data[pos:pos + py_table_len])
        pos += py_table_len

        for _ in range(same):
            c_len = struct.unpack_from('<H', data, pos)[0]
            pos += 2
            word = byte2str(data[pos:pos + c_len])
            pos += c_len

            ext_len = struct.unpack_from('<H', data, pos)[0]
            pos += 2
            # Only read the frequency when the extension block can hold it;
            # otherwise the read would consume bytes of the next record.
            if ext_len >= 2:
                count = struct.unpack_from('<H', data, pos)[0]
            else:
                count = 0

            GTable.append((count, py, word))

            # Skip the whole extension block to reach the next phrase.
            pos += ext_len
 
 
def  deal(file_name):
    """Parse one Sogou .scel file into the global ``GPy_Table`` and ``GTable``.

    Prints a separator line and the four metadata fields of the file, then
    loads the pinyin table (``getPyTable``) and the phrase table
    (``getChinese``).

    Args:
        file_name: Path of the .scel file to read.

    Returns:
        The dictionary name decoded from bytes 0x130-0x338 of the file.
    """
    print('-' * 60)
    # 'with' closes the handle even when reading fails.
    with open(file_name, 'rb') as f:
        data = f.read()

    # Bytes literal: on Python 3 a bytes slice never equals a text string.
    if data[0:12] != b"\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00":
        # Warn only; parsing is still attempted so a batch run keeps going.
        print("確認你選擇的是搜狗(.scel)詞庫?")

    name = byte2str(data[0x130:0x338])

    # Every print receives one pre-joined unicode string so the output is
    # the same whether print is a statement (Python 2) or a function
    # (Python 3).
    print(u"詞庫名：" + u" " + name)
    print(u"詞庫類型：" + u" " + byte2str(data[0x338:0x540]))
    print(u"描述信息：" + u" " + byte2str(data[0x540:0xd40]))
    print(u"詞庫示例：" + u" " + byte2str(data[0xd40:startPy]))

    getPyTable(data[startPy:startChinese])
    getChinese(data[startChinese:])
    return name
     
if  __name__  ==  '__main__':
    import io  # local import: only the script entry writes encoded text files

    # Directory that is searched (recursively) for .scel dictionaries;
    # change it to point at your own dictionary folder.
    rootdir = u"D:\\dic"

    for rt, dirs, files in os.walk(rootdir):
        for f in files:
            # Skip everything that is not a dictionary, notably the
            # '*sougou.txt' files produced by earlier runs.
            if not f.lower().endswith(u'.scel'):
                continue

            # Start every file from a clean slate so a failed or partial
            # parse can never leak entries into the next output file.
            del GTable[:]
            GPy_Table.clear()

            # Join with the directory currently being walked (rt), not the
            # root, so files inside sub-directories are found as well.
            src_path = os.path.join(rt, f)
            try:
                dic_name = deal(src_path)   # parse the dictionary
            except Exception as exc:
                # Report and move on instead of writing stale results.
                print(u"skip %s: %r" % (src_path, exc))
                continue
            print(dic_name)

            # The output is written next to the source file as UTF-8; change
            # the encoding argument if another encoding is needed.
            with io.open(src_path + u'sougou.txt', 'w', encoding='utf-8') as savef:
                savef.write(dic_name)   # dictionary name first
                # GTable holds (frequency, pinyin, phrase) tuples in file
                # order (unsorted); adapt the line format here if needed.
                for count, py, word in GTable:
                    savef.write(u'{%s}' % count + py + u'  ' + word + u'\n')

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

搜狗 scel詞包解析

前言

最近鄰逼近搜索

Bert編碼訓練NER實體偏移的問題

hnswlib庫在windows系統中的安裝

大量字符串快速匹配-字典樹匹配

keras自定義層對自定義參數進行保存

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結