spaCy 3.0 創建新模型

import random
import re
from pathlib import Path
import spacy
from spacy.util import minibatch
from spacy.training import Example
from spacy.tokenizer import Tokenizer


# Hand-annotated CPU product titles.
# Each item is (text, {"entities": [(start_char, end_char, label), ...]});
# offsets are character positions into the text, with the end exclusive.
# The trailing comment on each span shows the substring it covers.
TRAIN_DATA = [
    (
        "AMD Athlon 320GE YD32GEC6FHBOX",
        {
            "entities": [
                (0, 3, "廠家"),        # manufacturer: "AMD"
                (4, 10, "家族"),       # family: "Athlon"
                (11, 16, "系列"),      # series: "320GE"
                (17, 30, "產品編號"),  # product number: "YD32GEC6FHBOX"
            ],
        },
    ),
    (
        # Note: this text deliberately keeps the trailing space of the original.
        "Intel Core i5-4670K - Core i5 4th Gen Haswell Quad-Core 3.4 GHz "
        "LGA 1150 84W Intel HD Graphics Desktop Processor - BX80646I54670K ",
        {
            "entities": [
                (0, 5, "廠家"),          # manufacturer: "Intel"
                (6, 10, "家族"),         # family: "Core"
                (11, 19, "型號"),        # model: "i5-4670K"
                (56, 63, "頻率"),        # frequency: "3.4 GHz"
                (64, 72, "插槽"),        # socket: "LGA 1150"
                (73, 76, "功耗"),        # power draw: "84W"
                (95, 102, "平臺"),       # platform: "Desktop"
                (115, 129, "產品編號"),  # product number: "BX80646I54670K"
            ],
        },
    ),
]


# Tokenizer rule set handed to spaCy's Tokenizer by custom_tokenizer().

# Exact strings with a fixed tokenization: the ":)" emoticon is one token.
special_cases = {
    ":)": [{"ORTH": ":)"}],
}

# An opening bracket, parenthesis or quote at the very start of a chunk.
prefix_re = re.compile(r'''^[\[\("']''')

# A closing bracket, parenthesis or quote at the very end of a chunk.
suffix_re = re.compile(r'''[\]\)"']$''')

# A hyphen or tilde anywhere inside a chunk (e.g. the "-" in "i5-4670K").
infix_re = re.compile(r"[-~]")

# Text beginning with an http:// or https:// scheme.
simple_url_re = re.compile(r"^https?://")

def custom_tokenizer(nlp):
    """Return a Tokenizer for *nlp* driven by this module's rule constants.

    The tokenizer shares ``nlp.vocab`` and is configured with the
    module-level ``special_cases`` table and the prefix, suffix, infix and
    URL regular expressions defined alongside it.
    """
    options = {
        "rules": special_cases,
        "prefix_search": prefix_re.search,
        "suffix_search": suffix_re.search,
        "infix_finditer": infix_re.finditer,
        "url_match": simple_url_re.match,
    }
    return Tokenizer(nlp.vocab, **options)

# --- Run configuration -------------------------------------------------------
model = None                    # not read anywhere in the visible script
output_dir = Path("../ner2")    # where the trained pipeline is written
iter_number = 100               # number of passes over the training examples

# --- Pipeline assembly -------------------------------------------------------
# Start from an empty English pipeline, swap in the rule-based tokenizer, then
# take the tok2vec and ner components from the en_core_web_sm pipeline.
source_nlp = spacy.load("en_core_web_sm")
nlp = spacy.blank("en")
nlp.tokenizer = custom_tokenizer(nlp)
nlp.add_pipe("tok2vec", source=source_nlp)
# add_pipe returns the component it just added, so keep the handle directly.
ner = nlp.add_pipe("ner", source=source_nlp)

# Register every entity label found in TRAIN_DATA with the NER component and
# pair each text with its annotations as a spaCy Example for nlp.update().
examples = []
for text, annotations in TRAIN_DATA:
    # Default to an empty sequence so a text whose annotation dict has no
    # 'entities' key is skipped here instead of raising TypeError on None.
    for ent in annotations.get('entities', ()):
        ner.add_label(ent[2])  # ent is (start_char, end_char, label)
    # make_doc only tokenizes the text; no pipeline components are run.
    examples.append(Example.from_dict(nlp.make_doc(text), annotations))

# Training: iter_number passes over the examples, reshuffled before each pass.
# No optimizer is passed to nlp.update, so spaCy supplies its own.
for epoch in range(iter_number):
    random.shuffle(examples)
    losses = {}  # per-component losses, accumulated across this pass's batches
    for chunk in minibatch(examples, size=10):
        nlp.update(chunk, drop=0.1, losses=losses)
    print("Losses", losses)


# Persist the trained pipeline; setting output_dir to None skips saving.
if output_dir is not None:
    output_dir = Path(output_dir)  # also accepts a plain string path
    # parents=True creates any missing intermediate directories, and
    # exist_ok=True replaces the racy "check exists, then mkdir" sequence.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)


# Smoke test: one title that is also in TRAIN_DATA and one that is not.
PREDICTION_DATA = [
    "AMD Athlon 320GE YD32GEC6FHBOX",
    "AMD Athlon Gold PRO 3150G",
]

for text in PREDICTION_DATA:
    doc = nlp(text)
    # One (text, label, start_char, end_char) tuple per predicted entity.
    found = [
        (ent.text, ent.label_, ent.start_char, ent.end_char)
        for ent in doc.ents
    ]
    print("Entities", found)

 

發表評論
所有評論
還沒有人評論，想成為第一個評論的人麼？請在上方評論欄輸入並且點擊發布。
相關文章