NLTK 同義詞替換、單詞拼寫校正,製作僞原創文章

一.基於貝葉斯單詞拼寫校正

# -*- coding: utf-8 -*-
# @Time    : 2019/11/26 10:13
# @Author  :
# @FileName: word_check.py

import os
import re
import collections

# 下載詞庫big.txt文件到本地
# import requests
# url = "http://norvig.com/big.txt"
# response = requests.get(url=url)
# with open("big.txt","w",encoding="utf-8") as f:
#     f.write(response.text)

def words(text):
    """Tokenise *text* into lowercase alphabetic words.

    The input is lowercased first so the resulting tokens can serve as
    keys of the frequency model produced by ``train``.
    """
    lowered = text.lower()
    return re.findall('[a-z]+', lowered)


def train(features):
    """Build a smoothed word-frequency model from an iterable of words.

    Every key has an implicit base count of 1 (simple add-one smoothing),
    so looking up an unseen word through the returned defaultdict yields
    a non-zero count.
    """
    counts = collections.defaultdict(lambda: 1)
    for feature in features:
        counts[feature] = counts[feature] + 1
    return counts


# Directory containing this module (kept under the original name for
# backward compatibility with any external readers of ``filepath``).
filepath = os.path.dirname(__file__)

# Train the frequency model on Norvig's big.txt corpus, located next to
# this file.  os.path.abspath guards against ``__file__`` being a bare
# filename, in which case dirname() is "" and the original
# '%s/big.txt' pattern wrongly resolved to '/big.txt'.  The context
# manager closes the handle deterministically (the original leaked it).
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'big.txt')) as _corpus:
    NWORDS = train(words(_corpus.read()))

# Candidate letters used when generating one-edit variants of a word.
alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    """Return the set of all strings exactly one edit away from *word*.

    An edit is a single deletion, adjacent transposition, letter
    replacement, or letter insertion (lowercase a-z only).
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for i in range(len(word) + 1):
        head, tail = word[:i], word[i:]
        if tail:
            results.add(head + tail[1:])                      # delete
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])  # transpose
        for ch in letters:
            if tail:
                results.add(head + ch + tail[1:])             # replace
            results.add(head + ch + tail)                     # insert
    return results


def known_edits2(word):
    """Return dictionary words that are exactly two edits from *word*."""
    results = set()
    for once in edits1(word):
        for twice in edits1(once):
            if twice in NWORDS:
                results.add(twice)
    return results


def known(words):
    """Return the subset of *words* present in the frequency model."""
    return {w for w in words if w in NWORDS}


def correct(word):
    """Return the most probable spelling correction for *word*.

    Preference order: the word itself if known, then known words one
    edit away, then known words two edits away, and finally the word
    unchanged.  Ties are broken by corpus frequency.
    """
    candidates = known([word])
    if not candidates:
        candidates = known(edits1(word))
    if not candidates:
        candidates = known_edits2(word)
    if not candidates:
        candidates = [word]
    return max(candidates, key=NWORDS.get)


if __name__ == '__main__':
    # Demo: correct a proper noun that is unlikely to be in the corpus.
    print(correct("Hammett"))

二.同義詞替換生成僞原創文章

# -*- coding: utf-8 -*-
# @Time    : 2019/11/26 10:13
# @Author  :

import re

import nltk
import inflect
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

from word_check import correct

# Contraction-expansion rules applied before tokenisation.  Each entry is
# a (regex, replacement) pair; both the straight apostrophe (') and the
# curly apostrophe (’) variants are covered.  Raw strings replace the
# original's invalid escape sequences ("\’", "\g<1>" in non-raw strings
# raise DeprecationWarning today and will become a SyntaxError) while
# compiling to exactly the same patterns.
replacement_patterns = [
    (r"won't", "will not"),
    (r"won’t", "will not"),
    (r"can't", "cannot"),
    (r"can’t", "cannot"),
    (r"i'm", "i am"),
    (r"i’m", "i am"),
    (r"ain't", "is not"),
    (r"ain’t", "is not"),
    (r"(\w+)'ll", r"\g<1> will"),
    (r"(\w+)’ll", r"\g<1> will"),
    (r"(\w+)n't", r"\g<1> not"),
    (r"(\w+)n’t", r"\g<1> not"),
    (r"(\w+)'ve", r"\g<1> have"),
    (r"(\w+)’ve", r"\g<1> have"),
    (r"(\w+)'s", r"\g<1> is"),
    (r"(\w+)’s", r"\g<1> is"),
    (r"(\w+)'re", r"\g<1> are"),
    (r"(\w+)’re", r"\g<1> are"),
    (r"(\w+)'d", r"\g<1> would"),
    (r"(\w+)’d", r"\g<1> would"),
]


class RegexpReplacer(object):
    """Expands English contractions using a table of regex rules."""

    def __init__(self, patterns=replacement_patterns):
        """Pre-compile each (regex, replacement) pair once up front."""
        self.patterns = []
        for regex, repl in patterns:
            self.patterns.append((re.compile(regex), repl))

    def rep(self, text):
        """Return *text* with every contraction rule applied in order."""
        result = text
        for pattern, repl in self.patterns:
            result = pattern.sub(repl, result)
        return result


def generate_the_article(text):
    """Generate a "pseudo-original" rewrite of *text*.

    Expands contractions, then walks each sentence replacing selected
    nouns/verbs/adjectives with WordNet synonyms, re-inflecting present
    participles and plurals and spell-checking the substituted forms.

    NOTE(review): reads the module-level inflect engine ``p``, which is
    only initialised by ``main()`` — confirm callers set it up first.

    :param text: English source text.
    :return: the rewritten text as a single space-joined string.
    """
    replacer = RegexpReplacer()
    text = replacer.rep(text)

    # Split the article into sentences.
    setences = nltk.sent_tokenize(text)
    all_world = []
    for setn in setences:

        # Tokenise and part-of-speech tag the sentence.
        tokens = nltk.word_tokenize(setn)
        pos_tags = nltk.pos_tag(tokens)
        new_world = []

        for word, pos in pos_tags:
            # Only these POS tags are candidates for synonym substitution.
            pos_list = ["VBG", "JJ", "NN", "NNS", "MD", "VB", "VBD"]
            if pos in pos_list:
                word_list = []

                # Look up synonym sets, constrained to the WordNet part
                # of speech matching the Penn Treebank tag.
                if pos in ['NN', 'NNS', 'NNP', 'NNPS']:
                    word_set = wn.synsets(word, pos='n')
                elif pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', "MD"]:
                    word_set = wn.synsets(word, pos='v')
                elif pos in ['RB', 'RBR', 'RBS']:
                    word_set = wn.synsets(word, pos='r')
                elif pos in ['JJ', 'JJR', 'JJS']:
                    word_set = wn.synsets(word, pos='a')
                else:
                    word_set = wn.synsets(word)
                # NOTE(review): accesses the private Synset attribute
                # ``_lemma_names``; the public API is ``lemma_names()``.
                w_list = list(i._lemma_names for i in word_set)
                for w in w_list:
                    word_list.extend(w)
                # Rebind word_set to the flat set of lemma names (if any).
                if word_list:
                    word_set = set(word_list)
                if word_set:
                    # Drop the original word so we pick a true synonym.
                    word_set.discard(word)
                    if word_set:
                        word_set.discard(word)  # redundant second discard; no-op
                        if word.istitle():
                            # Preserve the capitalisation of title-case words;
                            # multi-word lemmas use '_' separators in WordNet.
                            nw = word_set.pop().replace("_", " ").capitalize()
                        else:
                            p_word = word_set.pop().replace("_", " ")
                            if p_word.lower() == word.lower():
                                nw = word
                            else:
                                nw = p_word
                        if pos in ["VBG"]:

                            # Re-inflect the synonym as a present participle.
                            nw = p.present_participle(nw)
                            # Spell-check the generated form.
                            nw = correct(nw)

                        elif pos in ["NNS", "NNPS"]:
                            if not nw.endswith("s"):
                                # Re-inflect the synonym as a plural noun.
                                nw = p.plural_noun(nw)
                                # Spell-check the generated form.
                                nw = correct(nw)
                    else:
                        nw = word
                else:
                    nw = word
                new_world.append(nw)
            else:
                new_world.append(word)
            if new_world:
                # Glue trailing punctuation (ASCII and full-width forms)
                # back onto the previous token so the final join does not
                # leave a space before it.
                if new_world[-1] in [',', ':', ',', '.', ';', '。', ';', '-', '—', '?', '?', '!', '!', ']', '】', '}',
                                     '}', ')',
                                     ')', '|']:
                    if len(new_world) > 1:
                        point = new_world.pop(-1)
                        new_world[-1] = new_world[-1] + point

        all_world.extend(new_world)

    # Re-join the tokens; strip the space the join inserts after opening
    # brackets (both ASCII and full-width variants).
    return " ".join(all_world).replace("( ", "(").replace("( ", "(").replace('[ ', "[").replace('【 ', '【', ).replace(
        '{ ', '{', ).replace('{ ', '{')


def main():
    """Rewrite a sample news article and print the result."""
    # ``p`` (the inflect engine) and ``lemmatizer`` are published as
    # module globals because generate_the_article reads ``p`` directly.
    # NOTE(review): ``lemmatizer`` is assigned but never used in this file.
    global p, lemmatizer
    text = """NORMANDY PARK, Wash. — There was just one problem with the brand-new, wide-open layout of Kay and Bob Comiskey's home: it was really open.
"We remodelled and didn't have money for furniture," says Kay. "We lived for three years with almost nothing. We had one sectional we'd move from room to room. The UPS driver asked if we ballroom-danced."
They did not. But the Comiskeys are promenading on air these days, now that Bjarko5/8Serra Architects (who created those wide-open spaces) connected them with Amy May, of MAY Designs (who creatively filled those wide-open spaces).
The Comiskeys love their family-friendly neighbourhood and their delightful midcentury home (originally an old, dark A-frame), which served admirably for years as a flexible, active hub for their three kids and their coming-and-going lifestyle. But once the nest emptied, May says, "They wanted a new way to enjoy the house, such as creating intimate gathering spaces with dedicated furniture."
May didn't have to look far for decor inspiration: sand and saltwater shimmer forever, just outside a west-facing wall of windows.
"The clients wanted to maintain a neutral palette that acted as a backdrop and setting for the natural beauty of the Puget Sound," May says.
And now, a beautiful blend of natural simplicity and industrial touches artfully flows through the reimagined first floor, in driftwood, coral and beachy glass; colour-popping art pieces; and all-new, fantastically functional furniture whose only movement is the occasional swivel of a purposely placed chair.
•In the warmly welcoming living room, May softened the existing two-storey, black-clad fireplace with a giant, artsy, battery-operated clock that hangs from the 20-foot ceiling. 
"It casts interesting shadows and helps break up the mass of the black background," she says. 
"""
    p = inflect.engine()
    lemmatizer = WordNetLemmatizer()
    text_nw = generate_the_article(text=text)
    print(text_nw)

    # Optional: grammar correction via a third-party service.
    # print(grammar_check(text_nw))


if __name__ == '__main__':
    # Entry point: rewrite the embedded sample article and print it.
    main()

# 詞性分析: https://www.jianshu.com/p/418cbdbf5e20
# 同義詞替換: https://blog.csdn.net/jining11/article/details/89458865
# 詞性還原: https://pypi.org/project/inflect/

# 單詞拼寫校驗 https://blog.csdn.net/Pwiling/article/details/50573650

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章