AllenNLP中常使用spacy對英文進行分詞,但是spacy不能對中文分詞。因此我想嘗試加一箇中文分詞的word_splitter。前不久加了一個THUNLPSplitter,今天把jieba也加進去。
測試代碼:(pos_tags指是否標註詞性,only_tokens指最終是否只保留字符,去掉詞性等屬性,user_dict指用戶自定義詞典,是一個UTF-8的txt文件路徑,一行是一個詞)
from allennlp.data.tokenizers.word_splitter import JIEBASplitter
from allennlp.data.tokenizers.token import show_token

# Plain segmentation: token text only, no POS tags.
tokenizer_plain = JIEBASplitter(pos_tags=False)
print(tokenizer_plain.split_words("武漢市長江大橋"))

# Segmentation with POS tagging, keeping all token attributes.
tokenizer_pos = JIEBASplitter(pos_tags=True, only_tokens=False)
for tok in tokenizer_pos.split_words("武漢市長江大橋"):
    print(show_token(tok))

# Segmentation with a user dictionary (UTF-8 txt file, one word per line).
tokenizer_dict = JIEBASplitter(pos_tags=False, user_dict='F:\\test\\userdict.txt')
print(tokenizer_dict.split_words("中美合拍,文體兩開花。皮皮蝦我們走"))
結果如下:
用戶詞典是一個UTF-8的txt文件,一行有一個自定義的詞。
完整代碼如下:
import os

import jieba
import jieba.posseg as poss
@WordSplitter.register('jieba')
class JIEBASplitter(WordSplitter):
    """
    A ``WordSplitter`` that uses jieba's tokenizer to split Chinese sentences.

    Parameters
    ----------
    pos_tags : ``bool``, optional (default = ``False``)
        If ``True``, segment with part-of-speech tagging (``jieba.posseg``).
    only_tokens : ``bool``, optional (default = ``True``)
        If ``True``, keep only the token text and drop the POS tag.
    user_dict : ``str``, optional (default = ``None``)
        Path to a UTF-8 text file with one custom word per line, loaded
        into jieba's user dictionary.
    """
    def __init__(self,
                 pos_tags: bool = False,
                 only_tokens: bool = True,
                 user_dict: str = None) -> None:
        self._pos_tags = pos_tags
        self._only_tokens = only_tokens
        # Load the user dictionary only when the path exists; a missing
        # file is silently ignored (best-effort, matching original behavior).
        if user_dict and os.path.exists(user_dict):
            jieba.load_userdict(user_dict)

    def _sanitize(self, tokens) -> List[Token]:
        """
        Convert jieba output to allennlp ``Token`` objects.

        ``tokens`` is an iterable of strings (plain ``jieba.cut``) or of
        ``(word, flag)`` pairs (``jieba.posseg.cut``), depending on
        ``self._pos_tags``.
        """
        if self._pos_tags:
            if self._only_tokens:
                return [Token(text) for text, _pos in tokens]
            # jieba provides no idx/lemma/dep/ent information, so only the
            # text and POS slots are populated (the rest stay None).
            return [Token(text, None, None, pos) for text, pos in tokens]
        return [Token(text) for text in tokens]

    @overrides
    def batch_split_words(self, sentences: List[str]) -> List[List[Token]]:
        # Bug fix: the original appended generator objects
        # (``self._sanitize(tokens) for tokens in ...``) instead of token
        # lists; delegate to ``split_words`` per sentence instead.
        return [self.split_words(sentence) for sentence in sentences]

    @overrides
    def split_words(self, sentence: str) -> List[Token]:
        if self._pos_tags:
            return self._sanitize(poss.cut(sentence))
        return self._sanitize(jieba.cut(sentence))
過幾天試着用這個測試一下中文分類