mmseg算法是對最大匹配算法的擴展。簡單來說,mmseg每次匹配時,總會多向後匹配兩個單詞,然後選擇這三個單詞的總體匹配最優的。
mmseg 主要做了以下幾方面的擴展:
假設對字符串C1C2...Cn進行分割
-
匹配時,從小到大,逐個匹配字典中以C1開頭的詞
-
每次連續匹配三個詞語(three-word chunk ),並列出所有可能的分割
-
選擇最匹配的three-word chunk(依次運用以下規則,一旦可以選出唯一結果則返回):
a.三個單詞的總長度最大
b.單詞平均長度最大
c.單詞長度的方差最小
d.單詞的詞頻總和最大
-
選取three-word chunk中的第一個單詞,然後重複1-4這四個步驟
代碼如下(結合上一篇一起看 http://hi.baidu.com/bithigher/item/cbd098c52123df0a5050584d ):
由於選用的語料庫中沒有單個漢字的信息,所以分詞效果還不是非常理想,下面代碼中的函數 mmseg和上一篇中的函數 maxmatch以及maxmatch_back是一樣的,都可以作爲參數傳給 solve函數
git 地址: https://github.com/BitHigher/hfseg
# -*- coding: UTF-8 -*-
import re
import sys
d = {}  # global word-frequency dictionary: word (unicode) -> frequency (int), filled by init()
def init(filename="SogouLabDic.dic"):
    """Load the word-frequency dictionary into the module-level dict ``d``.

    Each line of the file is tab-separated: word, frequency, ...
    Words are assumed to be GBK-encoded byte strings (Sogou lab corpus).
    """
    # 'with' guarantees the file is closed (the original leaked the handle).
    with open(filename, 'r') as f:
        for line in f:
            word, freq = line.split('\t')[0:2]
            try:
                # Python 2 byte string -> unicode via GBK.
                d[word.decode("gbk")] = int(freq)
            except (UnicodeDecodeError, AttributeError):
                # Not valid GBK, or already text (Python 3 str has no .decode);
                # keep the word as-is.  Narrowed from the original bare except.
                d[word] = int(freq)
def maxmatch(s):
    """Forward maximum matching: scan ``s`` left to right, greedily taking
    the longest dictionary word (up to ``maxlen`` chars) at each position.

    Returns a dict mapping each matched multi-character word to its count.
    Single characters (no dictionary match) are skipped, not recorded.
    """
    maxlen = 5  # longest word length to try
    l = len(s)
    p = 0
    result = {}
    while p < l:
        length = min(maxlen, l - p)
        wlen = 1
        # Try the longest candidate first; stop at the first hit.
        for i in range(length, 0, -1):
            if s[p:p+i] in d:  # 'in' replaces Py2-only dict.has_key
                wlen = i
                break
        if wlen > 1:
            result.setdefault(s[p:p+wlen], 0)
            result[s[p:p+wlen]] += 1
        p += wlen
    return result
def maxmatch_back(s):
    """Backward maximum matching: scan ``s`` right to left, greedily taking
    the longest dictionary word (up to ``maxlen`` chars) ending at each position.

    Returns a dict mapping each matched multi-character word to its count.
    Single characters (no dictionary match) are skipped, not recorded.
    """
    maxlen = 5  # longest word length to try
    l = len(s)
    result = {}
    while l > 0:
        length = min(maxlen, l)
        wlen = 1
        # Try the longest candidate ending at position l first.
        for i in range(length, 0, -1):
            if s[l-i:l] in d:  # 'in' replaces Py2-only dict.has_key
                wlen = i
                break
        if wlen > 1:
            result.setdefault(s[l-wlen:l], 0)
            result[s[l-wlen:l]] += 1
        l -= wlen
    return result
def one_word(s, start, rest=3):
    """Extend every partial chunk in ``start`` by one more word, ``rest`` times.

    A chunk is a list of split positions into ``s`` (first element is the
    chunk's start offset).  For each chunk, every dictionary word beginning at
    its end position spawns a new candidate; if none matches, a single
    character is taken.  Recurses until ``rest`` words have been appended.

    Returns the list of all candidate chunks.
    """
    result = []
    maxlen = 5  # longest word length to try
    l = len(s)
    for former in start:
        p = former[-1]  # end position of this partial chunk
        if p >= l:
            # Chunk already spans the whole string: keep it and move on.
            # (The original 'break' dropped all remaining candidates here.)
            result.append(former)
            continue
        length = min(maxlen, l - p)
        num = 0
        for i in range(1, length + 1):
            if s[p:p+i] in d:  # 'in' replaces Py2-only dict.has_key
                result.append(former + [p + i])
                num += 1
        if num == 0:
            # No dictionary word starts here: fall back to a 1-char word.
            result.append(former + [p + 1])
    if rest > 1:
        return one_word(s, result, rest - 1)
    return result
def three_word_chunk(s, start):
    """Pick the best three-word chunk of ``s`` beginning at offset ``start``.

    Enumerates all three-word chunks via one_word(), then applies the MMSEG
    ambiguity-resolution rules in order, returning as soon as one candidate
    remains:
      a. maximum total length of the three words
      b. maximum average word length
      c. minimum variance of word lengths
      d. maximum sum of word frequencies

    Returns the winning chunk (a list of split positions).
    """
    chunks = one_word(s, [[start]], 3)

    # Rule a: maximum total length (last position minus first).
    longest = 0
    lset = []
    for i in range(len(chunks)):
        cur = chunks[i][-1] - chunks[i][0]
        if cur > longest:
            longest = cur
            lset = [i]
        elif cur == longest:
            lset.append(i)
    if len(lset) == 1:
        return chunks[lset[0]]

    # Rule b: maximum average word length.  All survivors share the same
    # total length, so this favors the chunk with the fewest words.
    longavg = 0
    lavg = []
    for i in lset:
        cur = longest / float(len(chunks[i]) - 1)
        if cur > longavg:
            longavg = cur
            lavg = [i]
        elif cur == longavg:
            lavg.append(i)
    lset = lavg
    if len(lset) == 1:
        return chunks[lset[0]]

    # Rule c: minimum variance of word lengths around the mean (longavg).
    # float('inf') replaces Py2-only sys.maxint.
    mindk = float('inf')
    dkset = []
    for i in lset:
        cur = 0
        for j in range(1, len(chunks[i])):
            wordlen = chunks[i][j] - chunks[i][j-1]
            cur += (wordlen - longavg) ** 2
        if cur < mindk:
            mindk = cur
            dkset = [i]
        elif cur == mindk:
            dkset.append(i)
    lset = dkset
    if len(lset) == 1:
        return chunks[lset[0]]

    # Rule d: maximum sum of word frequencies.
    # Bug fix: the original indexed chunks[i] with i in range(len(lset)),
    # summing frequencies of the WRONG candidates; it must use the surviving
    # candidate indices (lset), as rules a-c do.
    maxFre = 0
    fset = []
    for i in lset:
        cur = 0
        for j in range(1, len(chunks[i])):
            key = s[chunks[i][j-1]:chunks[i][j]]
            if key in d:  # 'in' replaces Py2-only dict.has_key
                cur += d[key]
        if cur > maxFre:
            maxFre = cur
            fset = [i]
        elif cur == maxFre:
            fset.append(i)
    lset = fset
    # Still tied: fall back to the first survivor, as the original did.
    return chunks[lset[0]]
def mmseg(s):
    """MMSEG segmentation: at each position, look ahead two extra words via
    three_word_chunk() and commit only the first word of the best chunk.

    Returns a dict mapping each matched multi-character word to its count
    (single characters are skipped, matching maxmatch's behavior).
    """
    l = len(s)
    p = 0
    result = {}
    while p < l:
        chunk = three_word_chunk(s, p)
        # A chunk with fewer than 2 positions contains no word boundary.
        if len(chunk) < 2:
            break
        # Record only multi-character words, like maxmatch.
        if chunk[1] - chunk[0] > 1:
            result.setdefault(s[chunk[0]:chunk[1]], 0)
            result[s[chunk[0]:chunk[1]]] += 1
        p = chunk[1]  # advance past the first word only
    return result
def solve(s, segment=maxmatch):
    """Segment ``s`` with the given strategy (maxmatch, maxmatch_back or mmseg).

    Decodes UTF-8 byte-string input to text first; already-decoded text is
    passed through unchanged (the original's unconditional decode fails on
    Python 3 str and on Python 2 unicode input).
    """
    if isinstance(s, bytes):
        s = s.decode("utf8")
    return segment(s)