python實現拼寫檢查(帶過程舉例)

python實現拼寫檢查

附件地址https://gitee.com/wenlong850606/spelling_check_python

參考:會開花的樹《21行Python寫出拼寫檢查器》 https://blog.csdn.net/u013830811/article/details/46539919

import re
import collections

封裝函數

def words(text):
    """Return the list of lowercase alphabetic runs found in ``text``.

    The text is lowercased first and every maximal run of the letters
    ``a``-``z`` becomes one list entry, in order of appearance. Anything
    else acts as a separator, so ``"don't"`` is split into ``"don"``
    and ``"t"``.
    """
    return [match.group() for match in re.finditer('[a-z]+', text.lower())]
def train(features):
    """Count how often each item of ``features`` occurs, with smoothing.

    The returned ``defaultdict`` starts every key at 1, so an item seen
    ``k`` times is stored as ``k + 1`` and looking up an unseen item
    yields 1 (and, as with any ``defaultdict``, inserts it).
    """
    counts = collections.defaultdict(lambda: 1)
    for token in features:
        counts[token] = counts[token] + 1
    return counts

精華部分一:“一次編輯”函數,只對單詞中的一個字符進行(增、刪、改、換)處理

切割,對第2部分進行增、刪、改、換操作

alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits_once(word):
    """Return the set of strings exactly one edit step away from ``word``.

    An edit step is one of: inserting a letter, deleting a letter,
    replacing a letter, or swapping two adjacent letters. Replacing a
    letter with itself is included, so a non-empty ``word`` is always a
    member of its own result.
    """
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insert: put a letter between head and tail (valid at every cut).
        for letter in alphabet:
            variants.add(head + letter + tail)
        if not tail:
            # The remaining edits all act on tail[0], which does not exist here.
            continue
        rest = tail[1:]
        # Delete: drop tail[0].
        variants.add(head + rest)
        # Replace: substitute tail[0] with each letter.
        for letter in alphabet:
            variants.add(head + letter + rest)
        # Transpose: swap tail[0] and tail[1] when both exist.
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])
    return variants

“一次編輯”函數舉例:

對單詞 'word' 進行一次編輯處理

splits = [('word'[:i], 'word'[i:]) for i in range(len('word') + 1)]
splits
[('', 'word'), ('w', 'ord'), ('wo', 'rd'), ('wor', 'd'), ('word', '')]
edits_once('word')
{'aord',
 'aword',
 'bord',
 'bword',
 'cord',
 'cword',
 'dord',
 'dword',
 'eord',
 'eword',
 'ford',
 'fword',
 'gord',
 'gword',
 'hord',
 'hword',
 'iord',
 'iword',
 'jord',
 'jword',
 'kord',
 'kword',
 'lord',
 'lword',
 'mord',
 'mword',
 'nord',
 'nword',
 'oord',
 'ord',
 'oword',
 'owrd',
 'pord',
 'pword',
 'qord',
 'qword',
 'rord',
 'rword',
 'sord',
 'sword',
 'tord',
 'tword',
 'uord',
 'uword',
 'vord',
 'vword',
 'waord',
 'ward',
 'wbord',
 'wbrd',
 'wcord',
 'wcrd',
 'wdord',
 'wdrd',
 'weord',
 'werd',
 'wford',
 'wfrd',
 'wgord',
 'wgrd',
 'whord',
 'whrd',
 'wiord',
 'wird',
 'wjord',
 'wjrd',
 'wkord',
 'wkrd',
 'wlord',
 'wlrd',
 'wmord',
 'wmrd',
 'wnord',
 'wnrd',
 'woad',
 'woard',
 'wobd',
 'wobrd',
 'wocd',
 'wocrd',
 'wod',
 'wodd',
 'wodr',
 'wodrd',
 'woed',
 'woerd',
 'wofd',
 'wofrd',
 'wogd',
 'wogrd',
 'wohd',
 'wohrd',
 'woid',
 'woird',
 'wojd',
 'wojrd',
 'wokd',
 'wokrd',
 'wold',
 'wolrd',
 'womd',
 'womrd',
 'wond',
 'wonrd',
 'wood',
 'woord',
 'wopd',
 'woprd',
 'woqd',
 'woqrd',
 'wor',
 'wora',
 'worad',
 'worb',
 'worbd',
 'worc',
 'worcd',
 'word',
 'worda',
 'wordb',
 'wordc',
 'wordd',
 'worde',
 'wordf',
 'wordg',
 'wordh',
 'wordi',
 'wordj',
 'wordk',
 'wordl',
 'wordm',
 'wordn',
 'wordo',
 'wordp',
 'wordq',
 'wordr',
 'words',
 'wordt',
 'wordu',
 'wordv',
 'wordw',
 'wordx',
 'wordy',
 'wordz',
 'wore',
 'wored',
 'worf',
 'worfd',
 'worg',
 'worgd',
 'worh',
 'worhd',
 'wori',
 'worid',
 'worj',
 'worjd',
 'work',
 'workd',
 'worl',
 'world',
 'worm',
 'wormd',
 'worn',
 'wornd',
 'woro',
 'worod',
 'worp',
 'worpd',
 'worq',
 'worqd',
 'worr',
 'worrd',
 'wors',
 'worsd',
 'wort',
 'wortd',
 'woru',
 'worud',
 'worv',
 'worvd',
 'worw',
 'worwd',
 'worx',
 'worxd',
 'wory',
 'woryd',
 'worz',
 'worzd',
 'wosd',
 'wosrd',
 'wotd',
 'wotrd',
 'woud',
 'wourd',
 'wovd',
 'wovrd',
 'wowd',
 'wowrd',
 'woxd',
 'woxrd',
 'woyd',
 'woyrd',
 'wozd',
 'wozrd',
 'wpord',
 'wprd',
 'wqord',
 'wqrd',
 'wrd',
 'wrod',
 'wrord',
 'wrrd',
 'wsord',
 'wsrd',
 'wtord',
 'wtrd',
 'wuord',
 'wurd',
 'wvord',
 'wvrd',
 'wword',
 'wwrd',
 'wxord',
 'wxrd',
 'wyord',
 'wyrd',
 'wzord',
 'wzrd',
 'xord',
 'xword',
 'yord',
 'yword',
 'zord',
 'zword'}

驗證

print('ord' in edits_once('word'))
True

主流程

第一步,由《big.txt》生成訓練集

各個單詞出現的頻次

# Build the word-frequency model from the big.txt corpus. The with-block
# closes the file as soon as its text has been read instead of leaving the
# handle open until garbage collection.
with open('big.txt') as corpus:
    NWORDS = train(words(corpus.read()))
NWORDS
defaultdict(<function __main__.train.<locals>.<lambda>()>,
            {'the': 80031,
             'project': 289,
             'gutenberg': 264,
             'ebook': 88,
             'of': 40026,
             'adventures': 18,
             'sherlock': 102,
             'holmes': 468,
             'by': 6739,
             'sir': 178,
             'arthur': 35,
             'conan': 5,
             'doyle': 6,
             'in': 22048,
             'our': 1067,
             'series': 129,
             'copyright': 70,
             'laws': 234,
             'are': 3631,
             'changing': 45,
             'all': 4145,
             'over': 1283,
             'world': 363,
             'be': 6156,
             'sure': 124,
             'to': 28767,
             'check': 39,
             'for': 6940,
             'your': 1280,
             'country': 424,
             'before': 1364,
             'downloading': 6,
             'or': 5353,
             'redistributing': 8,
             'this': 4064,
             'any': 1205,
             'other': 1503,
             'header': 8,
             'should': 1298,
             'first': 1178,
             'thing': 304,
             'seen': 445,
             'when': 2924,
             'viewing': 8,
             'file': 22,
             'please': 173,
             'do': 1504,
             'not': 6626,
             'remove': 54,
             'it': 10682,
             'change': 151,
             'edit': 5,
             'without': 1016,
             'written': 118,
             'permission': 53,
             'read': 219,
             'legal': 53,
             'small': 528,
             'print': 48,
             'and': 38313,
             'information': 74,
             'about': 1498,
             'at': 6792,
             'bottom': 43,
             'included': 44,
             'is': 9775,
             'important': 286,
             'specific': 38,
             'rights': 169,
             'restrictions': 24,
             'how': 1316,
             'may': 2552,
             'used': 277,
             'you': 5623,
             'can': 1096,
             'also': 779,
             'find': 295,
             'out': 1988,
             'make': 505,
             'a': 21156,
             'donation': 11,
             'get': 469,
             'involved': 108,
             'welcome': 19,
             'free': 422,
             'plain': 109,
             'vanilla': 7,
             'electronic': 59,
             'texts': 8,
             'ebooks': 55,
             'readable': 14,
             'both': 530,
             'humans': 3,
             'computers': 8,
             'since': 261,
             'these': 1232,
             'were': 4290,
             'prepared': 139,
             'thousands': 94,
             'volunteers': 23,
             'title': 40,
             'author': 30,
             'release': 29,
             'date': 49,
             'march': 136,
             'most': 909,
             'recently': 31,
             'updated': 5,
             'november': 42,
             'edition': 22,
             'language': 62,
             'english': 212,
             'character': 175,
             'set': 325,
             'encoding': 6,
             'ascii': 12,
             'start': 68,
             'additional': 31,
             'editing': 7,
             'jose': 2,
             'menendez': 2,
             'contents': 51,
             'i': 7683,
             'scandal': 20,
             'bohemia': 16,
             'ii': 78,
             'red': 289,
             'headed': 38,
             'league': 54,
             'iii': 92,
             'case': 439,
             'identity': 12,
             'iv': 56,
             'boscombe': 17,
             'valley': 79,
             'mystery': 40,
             'v': 52,
             'five': 280,
             'orange': 24,
             'pips': 13,
             'vi': 38,
             'man': 1653,
             'with': 9741,
             'twisted': 22,
             'lip': 57,
             'vii': 35,
             'adventure': 35,
             'blue': 144,
             'carbuncle': 18,
             'viii': 40,
             'speckled': 6,
             'band': 55,
             'ix': 29,
             'engineer': 13,
             's': 5632,
             'thumb': 52,
             'x': 137,
             'noble': 49,
             'bachelor': 19,
             'xi': 29,
             'beryl': 5,
             'coronet': 30,
             'xii': 29,
             'copper': 27,
             'beeches': 13,
             'she': 3947,
             'always': 609,
             'woman': 326,
             'have': 3494,
             'seldom': 77,
             'heard': 637,
             'him': 5231,
             'mention': 47,
             'her': 5285,
             'under': 964,
             'name': 263,
             'his': 10035,
             'eyes': 940,
             'eclipses': 3,
             'predominates': 4,
             'whole': 745,
             'sex': 12,
             'was': 11411,
             'that': 12513,
             'he': 12402,
             'felt': 698,
             'emotion': 37,
             'akin': 15,
             'love': 485,
             'irene': 19,
             'adler': 17,
             'emotions': 11,
             'one': 3372,
             'particularly': 175,
             'abhorrent': 2,
             'cold': 258,
             'precise': 14,
             'but': 5654,
             'admirably': 8,
             'balanced': 7,
             'mind': 342,
             'take': 617,
             'perfect': 40,
             'reasoning': 42,
             'observing': 22,
             'machine': 40,
             'has': 1604,
             'as': 8065,
             'lover': 27,
             'would': 1954,
             'placed': 183,
             'himself': 1159,
             'false': 65,
             'position': 433,
             'never': 594,
             'spoke': 219,
             'softer': 11,
             'passions': 30,
             'save': 111,
             'gibe': 3,
             'sneer': 7,
             'they': 3939,
             'admirable': 15,
             'things': 322,
             'observer': 14,
             'excellent': 63,
             'drawing': 241,
             'veil': 17,
             'from': 5710,
             'men': 1146,
             'motives': 15,
             'actions': 78,
             'trained': 24,
             'reasoner': 7,
             'admit': 66,
             'such': 1437,
             'intrusions': 2,
             'into': 2125,
             'own': 786,
             'delicate': 55,
             'finely': 12,
             'adjusted': 17,
             'temperament': 6,
             'introduce': 24,
             'distracting': 2,
             'factor': 42,
             'which': 4843,
             'might': 537,
             'throw': 49,
             'doubt': 153,
             'upon': 1112,
             'mental': 38,
             'results': 230,
             'grit': 2,
             'sensitive': 36,
             'instrument': 36,
             'crack': 21,
             'high': 291,
             'power': 549,
             'lenses': 2,
             'more': 1998,
             'disturbing': 10,
             'than': 1207,
             'strong': 169,
             'nature': 171,
             'yet': 489,
             'there': 2973,
             'late': 166,
             'dubious': 2,
             'questionable': 4,
             'memory': 56,
             'had': 7384,
             'little': 1002,
             'lately': 23,
             'my': 2250,
             'marriage': 97,
             'drifted': 6,
             'us': 685,
             'away': 839,
             'each': 412,
             'complete': 146,
             'happiness': 144,
             'home': 296,
             'centred': 3,
             'interests': 119,
             'rise': 241,
             'up': 2285,
             'around': 272,
             'who': 3051,
             'finds': 24,
             'master': 142,
             'establishment': 41,
             'sufficient': 76,
             'absorb': 5,
             'attention': 192,
             'while': 769,
             'loathed': 2,
             'every': 651,
             'form': 508,
             'society': 170,
             'bohemian': 9,
             'soul': 169,
             'remained': 232,
             'lodgings': 12,
             'baker': 50,
             'street': 181,
             'buried': 22,
             'among': 452,
             'old': 1181,
             'books': 60,
             'alternating': 3,
             'week': 96,
             'between': 655,
             'cocaine': 5,
             'ambition': 14,
             'drowsiness': 5,
             'drug': 22,
             'fierce': 13,
             'energy': 46,
             'keen': 33,
             'still': 923,
             'ever': 275,
             'deeply': 78,
             'attracted': 37,
             'study': 145,
             'crime': 62,
             'occupied': 117,
             'immense': 78,
             'faculties': 9,
             'extraordinary': 75,
             'powers': 150,
             'observation': 40,
             'following': 209,
             'those': 1202,
             'clues': 4,
             'clearing': 30,
             'mysteries': 10,
             'been': 2600,
             'abandoned': 73,
             'hopeless': 18,
             'official': 92,
             'police': 95,
             'time': 1530,
             'some': 1537,
             'vague': 40,
             'account': 178,
             'doings': 12,
             'summons': 12,
             'odessa': 4,
             'trepoff': 2,
             'murder': 31,
             'singular': 37,
             'tragedy': 10,
             'atkinson': 2,
             'brothers': 51,
             'trincomalee': 2,
             'finally': 157,
             'mission': 35,
             'accomplished': 40,
             'so': 3018,
             'delicately': 4,
             'successfully': 26,
             'reigning': 4,
             'family': 211,
             'holland': 13,
             'beyond': 226,
             'signs': 99,
             'activity': 132,
             'however': 431,
             'merely': 190,
             'shared': 26,
             'readers': 12,
             'daily': 45,
             'press': 82,
             'knew': 497,
             'former': 178,
             'friend': 284,
             'companion': 82,
             'night': 386,
             'on': 6644,
             'twentieth': 20,
             'returning': 69,
             'journey': 70,
             'patient': 384,
             'now': 1698,
             'returned': 195,
             'civil': 178,
             'practice': 96,
             'way': 860,
             'led': 197,
             'me': 1921,
             'through': 816,
             'passed': 368,
             'well': 1199,
             'remembered': 121,
             'door': 499,
             'must': 956,
             'associated': 197,
             'wooing': 3,
             'dark': 182,
             'incidents': 15,
             'scarlet': 23,
             'seized': 115,
             'desire': 97,
             'see': 1102,
             'again': 867,
             'know': 1049,
             'employing': 8,
             'rooms': 87,
             'brilliantly': 6,
             'lit': 75,
             'even': 947,
             'looked': 761,
             'saw': 600,
             'tall': 75,
             'spare': 28,
             'figure': 104,
             'pass': 155,
             'twice': 85,
             'silhouette': 2,
             'against': 661,
             'blind': 24,
             'pacing': 27,
             'room': 961,
             'swiftly': 39,
             'eagerly': 40,
             'head': 726,
             'sunk': 28,
             'chest': 82,
             'hands': 456,
             'clasped': 12,
             'behind': 402,
             'mood': 52,
             'habit': 56,
             'attitude': 73,
             'manner': 136,
             'told': 491,
             'their': 2956,
             'story': 134,
             'work': 383,
             'risen': 31,
             'created': 63,
             'dreams': 17,
             'hot': 120,
             'scent': 18,
             'new': 1212,
             'problem': 77,
             'rang': 30,
             'bell': 66,
             'shown': 114,
             'chamber': 36,
             'formerly': 78,
             'part': 705,
             'effusive': 3,
             'glad': 151,
             'think': 558,
             'hardly': 174,
             'word': 299,
             'spoken': 93,
             'kindly': 87,
             'eye': 111,
             'waved': 30,
             'an': 3424,
             'armchair': 50,
             'threw': 97,
             'across': 223,
             'cigars': 8,
             'indicated': 89,
             'spirit': 168,
             'gasogene': 2,
             'corner': 129,
             'then': 1559,
             'stood': 384,
             'fire': 275,
             'introspective': 4,
             'fashion': 50,
             'wedlock': 2,
             'suits': 9,
             'remarked': 170,
             'watson': 84,
             'put': 436,
             'seven': 133,
             'half': 319,
             'pounds': 27,
             'answered': 227,
             'indeed': 140,
             'thought': 903,
             'just': 768,
             'trifle': 12,
             'fancy': 51,
             'observe': 38,
             'did': 1876,
             'tell': 493,
             'intended': 59,
             'go': 906,
             'harness': 28,
             'deduce': 15,
             'getting': 93,
             'yourself': 163,
             'very': 1341,
             'wet': 61,
             'clumsy': 9,
             'careless': 15,
             'servant': 47,
             'girl': 167,
             'dear': 450,
             'said': 3465,
             'too': 549,
             'much': 672,
             'certainly': 120,
             'burned': 78,
             'lived': 114,
             'few': 459,
             'centuries': 13,
             'ago': 109,
             'true': 206,
             'walk': 76,
             'thursday': 8,
             'came': 980,
             'dreadful': 69,
             'mess': 11,
             'changed': 135,
             'clothes': 63,
             't': 1319,
             'imagine': 97,
             'mary': 706,
             'jane': 3,
             'incorrigible': 3,
             'wife': 368,
             'given': 365,
             'notice': 99,
             'fail': 41,
             'chuckled': 8,
             'rubbed': 33,
             'long': 992,
             'nervous': 55,
             'together': 261,
             'simplicity': 31,
             'itself': 274,
             'inside': 44,
             'left': 835,
             'shoe': 12,
             'where': 978,
             'firelight': 3,
             'strikes': 20,
             'leather': 36,
             'scored': 5,
             'six': 177,
             'almost': 326,
             'parallel': 18,
             'cuts': 6,
             'obviously': 39,
             'caused': 103,
             'someone': 161,
             'carelessly': 15,
             'scraped': 22,
             'round': 557,
             'edges': 71,
             'sole': 71,
             'order': 405,
             'crusted': 3,
             'mud': 37,
             'hence': 33,
             'double': 50,
             'deduction': 13,
             'vile': 17,
             'weather': 43,
             'malignant': 89,
             'boot': 23,
             'slitting': 3,
             'specimen': 15,
             'london': 77,
             'slavey': 2,
             'if': 2373,
             'gentleman': 100,
             'walks': 11,
             'smelling': 6,
             'iodoform': 44,
             'black': 236,
             'mark': 39,
             'nitrate': 8,
             'silver': 129,
             'right': 711,
             'forefinger': 8,
             'bulge': 3,
             'side': 512,
             'top': 43,
             'hat': 106,
             'show': 214,
             'secreted': 3,
             'stethoscope': 3,
             'dull': 75,
             'pronounce': 10,
             'active': 97,
             'member': 51,
             'medical': 23,
             'profession': 23,
             'could': 1701,
             'help': 231,
             'laughing': 116,
             'ease': 45,
             'explained': 61,
             'process': 220,
             'hear': 184,
             'give': 524,
             'reasons': 65,
             'appears': 109,
             'ridiculously': 2,
             'simple': 140,
             'easily': 115,
             'myself': 228,
             'though': 651,
             'successive': 18,
             'instance': 51,
             'am': 747,
             'baffled': 9,
             'until': 326,
             'explain': 124,
             'believe': 184,
             'good': 745,
             'yours': 47,
             'quite': 503,
             'lighting': 17,
             'cigarette': 7,
             'throwing': 47,
             'down': 1129,
             'distinction': 20,
             'clear': 234,
             'example': 287,
             'frequently': 219,
             'steps': 189,
             'lead': 138,
             'hall': 84,
             'often': 444,
             'hundreds': 49,
             'times': 237,
             'many': 610,
             'don': 582,
             'observed': 132,
             'point': 224,
             'seventeen': 11,
             'because': 631,
             'interested': 66,
             'problems': 79,
             'enough': 176,
             'chronicle': 8,
             'two': 1139,
             'trifling': 13,
             'experiences': 12,
             'sheet': 30,
             'thick': 78,
             'pink': 28,
             'tinted': 10,
             'notepaper': 3,
             'lying': 119,
             'open': 326,
             'table': 297,
             'last': 566,
             'post': 118,
             'aloud': 29,
             'note': 116,
             'undated': 2,
             'either': 294,
             'signature': 10,
             'address': 77,
             'will': 1578,
             'call': 198,
             'quarter': 47,
             'eight': 129,
             'o': 258,
             'clock': 121,
             'desires': 23,
             'consult': 20,
             'matter': 366,
             'deepest': 16,
             'moment': 488,
             'recent': 55,
             'services': 39,
             'royal': 112,
             'houses': 118,
             'europe': 154,
             'safely': 12,
             'trusted': 17,
             'matters': 137,
             'importance': 118,
             'exaggerated': 29,
             'we': 1907,
             'quarters': 73,
             'received': 281,
             'hour': 158,
             'amiss': 7,
             'visitor': 75,
             'wear': 31,
             'mask': 13,
             'what': 3012,
             'means': 254,
             'no': 2349,
             'data': 18,
             'capital': 145,
             'mistake': 40,
             'theorise': 2,
             'insensibly': 3,
             'begins': 48,
             'twist': 15,
             'facts': 73,
             'suit': 26,
             'theories': 22,
             'instead': 138,
             'carefully': 73,
             'examined': 50,
             'writing': 70,
             'paper': 178,
             'wrote': 150,
             'presumably': 9,
             'endeavouring': 9,
             'imitate': 8,
             'processes': 36,
             'bought': 56,
             'crown': 62,
             'packet': 12,
             'peculiarly': 15,
             'stiff': 21,
             'peculiar': 85,
             'hold': 115,
             'light': 279,
             'large': 484,
             'e': 137,
             'g': 56,
             'p': 67,
             'woven': 6,
             'texture': 7,
             'asked': 778,
             'maker': 5,
             'monogram': 5,
             'rather': 220,
             'stands': 20,
             'gesellschaft': 2,
             'german': 197,
             'company': 193,
             'customary': 20,
             'contraction': 62,
             'like': 1081,
             'co': 31,
             'course': 390,
             'papier': 2,
             'eg': 2,
             'let': 507,
             'glance': 92,
             'continental': 47,
             'gazetteer': 2,
             'took': 574,
             'heavy': 140,
             'brown': 72,
             'volume': 31,
             'shelves': 4,
             'eglow': 2,
             'eglonitz': 2,
             'here': 692,
             'egria': 2,
             'speaking': 186,
             'far': 409,
             'carlsbad': 2,
             'remarkable': 78,
             'being': 919,
             'scene': 50,
             'death': 331,
             'wallenstein': 2,
             'its': 1636,
             'numerous': 51,
             'glass': 117,
             'factories': 30,
             'mills': 40,
             'ha': 76,
             'boy': 170,
             'sparkled': 6,
             'sent': 320,
             'great': 793,
             'triumphant': 17,
             'cloud': 31,
             'made': 1008,
             'precisely': 25,
             'construction': 26,
             'sentence': 27,
             'frenchman': 103,
             'russian': 462,
             'uncourteous': 2,
             'verbs': 2,
             'only': 1874,
             'remains': 74,
             'therefore': 187,
             'discover': 29,
             'wanted': 214,
             'writes': 21,
             'prefers': 3,
             'wearing': 88,
             'showing': 105,
             'face': 1126,
             'comes': 92,
             'mistaken': 60,
             'resolve': 15,
             'doubts': 40,
             'sharp': 84,
             'sound': 220,
             'horses': 263,
             'hoofs': 25,
             'grating': 11,
             'wheels': 48,
             'curb': 5,
             'followed': 330,
             'pull': 24,
             'whistled': 14,
             'pair': 41,
             'yes': 689,
             'continued': 292,
             'glancing': 99,
             'window': 187,
             'nice': 54,
             'brougham': 5,
             'beauties': 3,
             'hundred': 230,
             'fifty': 95,
             'guineas': 4,
             'apiece': 8,
             'money': 327,
             'nothing': 647,
             'else': 202,
             'better': 267,
             'bit': 64,
             'doctor': 184,
             'stay': 75,
             'lost': 225,
             'boswell': 2,
             'promises': 16,
             'interesting': 72,
             'pity': 76,
             'miss': 113,
             'client': 34,
             'want': 324,
             'sit': 90,
             'best': 269,
             'slow': 66,
             'step': 140,
             'stairs': 32,
             'passage': 111,
             'paused': 80,
             'immediately': 183,
             'outside': 111,
             'loud': 65,
             'authoritative': 3,
             'tap': 11,
             'come': 935,
             'entered': 283,
             'less': 368,
             'feet': 180,
             'inches': 17,
             'height': 37,
             'limbs': 68,
             'hercules': 5,
             'dress': 139,
             'rich': 93,
             'richness': 3,
             'england': 312,
             'bad': 156,
             'taste': 24,
             'bands': 28,
             'astrakhan': 2,
             'slashed': 4,
             'sleeves': 31,
             'fronts': 2,
             'breasted': 2,
             'coat': 173,
             'deep': 216,
             'cloak': 63,
             'thrown': 93,
             'shoulders': 126,
             'lined': 33,
             'flame': 16,
             'coloured': 22,
             'silk': 51,
             'secured': 49,
             'neck': 204,
             'brooch': 2,
             'consisted': 39,
             'single': 174,
             'flaming': 9,
             'boots': 92,
             'extended': 76,
             'halfway': 20,
             'calves': 4,
             'trimmed': 9,
             'tops': 4,
             'fur': 39,
             'completed': 26,
             'impression': 68,
             'barbaric': 3,
             'opulence': 4,
             'suggested': 70,
             'appearance': 136,
             'carried': 283,
             'broad': 93,
             'brimmed': 5,
             'hand': 835,
             'wore': 59,
             'upper': 131,
             'extending': 36,
             'past': 224,
             'cheekbones': 5,
             'vizard': 2,
             'apparently': 69,
             'raised': 213,
             'lower': 197,
             'appeared': 198,
             'hanging': 43,
             'straight': 125,
             'chin': 31,
             'suggestive': 12,
             'resolution': 58,
             'pushed': 82,
             'length': 64,
             'obstinacy': 8,
             'harsh': 23,
             'voice': 463,
             'strongly': 42,
             'marked': 139,
             'accent': 19,
             'uncertain': 31,
             'pray': 80,
             'seat': 171,
             'colleague': 8,
             'dr': 49,
             'occasionally': 90,
             'cases': 454,
             'whom': 490,
             'honour': 17,
             'count': 749,
             'von': 12,
             'kramm': 3,
             'nobleman': 12,
             'understand': 413,
             'discretion': 14,
             'trust': 69,
             'extreme': 73,
             'prefer': 22,
             'communicate': 16,
             'alone': 338,
             'rose': 244,
             'caught': 91,
             'wrist': 69,
             'back': 747,
             'chair': 136,
             'none': 111,
             'say': 756,
             'anything': 380,
             'shrugged': 36,
             'begin': 98,
             'binding': 19,
             'absolute': 57,
             'secrecy': 19,
             'years': 572,
             'end': 466,
             'present': 330,
             'weight': 71,
             'influence': 139,
             'european': 100,
             'history': 440,
             'promise': 68,
             'excuse': 54,
             'strange': 221,
             'august': 71,
             'person': 186,
             'employs': 3,
             'wishes': 43,
             'agent': 26,
             'unknown': 88,
             'confess': 37,
             'once': 570,
             'called': 451,
             'exactly': 48,
             'aware': 53,
             'dryly': 6,
             'circumstances': 108,
             'delicacy': 12,
             'precaution': 10,
             'taken': 439,
             'quench': 4,
             'grow': 75,
             'seriously': 64,
             'compromise': 72,
             'families': 46,
             'speak': 256,
             'plainly': 40,
             'implicates': 6,
             'house': 662,
             'ormstein': 3,
             'hereditary': 15,
             'kings': 28,
             'murmured': 19,
             'settling': 17,
             'closing': 36,
             'glanced': 177,
             ...})

經過兩次編輯後,能出現在訓練集中的情形,概率不高,放在短路原則靠後

def known_edits_twice(word):
    """Return the vocabulary words reachable from ``word`` in two edit steps.

    Every one-edit variant of ``word`` is edited once more, and only the
    results that are present in ``NWORDS`` are kept.
    """
    reachable = set()
    for first_pass in edits_once(word):
        for second_pass in edits_once(first_pass):
            if second_pass in NWORDS:
                reachable.add(second_pass)
    return reachable

舉例

known_edits_twice('word')
{'award',
 'bird',
 'board',
 'bold',
 'bond',
 'bore',
 'bored',
 'born',
 'bory',
 'card',
 'chord',
 'cod',
 'cold',
 'cond',
 'cor',
 'cord',
 'cords',
 'core',
 'cork',
 'corn',
 'dodd',
 'dora',
 'dorr',
 'fold',
 'fond',
 'food',
 'for',
 'ford',
 'fords',
 'fore',
 'fork',
 'form',
 'fort',
 'gird',
 'god',
 'gold',
 'good',
 'gory',
 'hard',
 'herd',
 'hoard',
 'hold',
 'hood',
 'horde',
 'horn',
 'hors',
 'load',
 'lord',
 'lords',
 'lore',
 'loud',
 'mold',
 'mood',
 'more',
 'mori',
 'mort',
 'nod',
 'nor',
 'norm',
 'odd',
 'old',
 'or',
 'orb',
 'ore',
 'org',
 'owed',
 'pond',
 'pork',
 'port',
 'rd',
 'road',
 'rod',
 'sird',
 'sold',
 'sore',
 'sort',
 'sword',
 'swords',
 'swore',
 'sworn',
 'told',
 'tore',
 'torn',
 'tory',
 'trod',
 'void',
 'wad',
 'wand',
 'war',
 'ward',
 'wards',
 'ware',
 'warm',
 'warn',
 'warp',
 'wars',
 'wart',
 'wary',
 'wead',
 'wed',
 'weed',
 'weird',
 'weld',
 'were',
 'wert',
 'wid',
 'wild',
 'wind',
 'wire',
 'wired',
 'wirt',
 'wiry',
 'wo',
 'woe',
 'woes',
 'woke',
 'wolf',
 'womb',
 'won',
 'wont',
 'wood',
 'woods',
 'woof',
 'wool',
 'woot',
 'word',
 'worded',
 'words',
 'wordy',
 'wore',
 'work',
 'worked',
 'works',
 'world',
 'worlds',
 'worm',
 'worms',
 'worn',
 'worry',
 'worse',
 'worst',
 'worth',
 'would',
 'wound',
 'wove',
 'wry',
 'yard',
 'yore',
 'york'}

字符輸入正確,且在訓練集裏面的情形,概率最高,放在短路原則最前面

def known(words):
    """Return the subset of ``words`` that appears in ``NWORDS``, as a set."""
    return {candidate for candidate in words if candidate in NWORDS}

精華部分二:概率思想(短路原則)

經過0次變換的(輸入正確),概率最高

如果輸入錯誤,只差一個(增、刪、改、換)的概率次之

如果輸入錯誤,差兩個的概率更低

如果前面三種情況都沒有出現,說明該單詞及其兩次編輯以內的變體都不在訓練集中,“原樣”輸出

def correct(word):
    """Return the most frequent known spelling close to ``word``.

    Candidates are tried in this order, stopping at the first non-empty set:
    the word itself, known words one edit away, known words two edits away.
    Among the candidates the one with the highest count in ``NWORDS`` wins.
    If no candidate is known at all, ``word`` is returned unchanged.

    The lookup is done on the lowercased input because the vocabulary is
    built from lowercased text; without this every capital letter would be
    counted as a spelling mistake.
    """
    lowered = word.lower()
    candidates = (
        known([lowered])
        or known(edits_once(lowered))
        or known_edits_twice(lowered)
    )
    if not candidates:
        # Nothing within two edits is in the vocabulary: give the input back as-is.
        return word
    return max(candidates, key=NWORDS.get)

測試

correct('WordS')
'words'

總結:

整個代碼充分體現了python的靈活性和簡潔的特點,到了“可讀性差”的地步。

下面的代碼遍歷1到100的數:能被3整除時打印fizz,能被5整除時打印buzz,既能被3整除又能被5整除時打印fizzbuzz,其它的數只打印序號而不附加任何單詞,也充分體現了python的“簡單暴力”:

# Print "<i>: " for every i from 1 to 100, followed by fizz and/or buzz.
# Slicing trick: i % 3 * 4 is 0 when i is a multiple of 3 (the slice keeps the
# whole word) and at least 4 otherwise (the slice starts past the end and is
# empty); the same holds for 5 and 'buzz'.
for i in range(1, 101):
    fizz = 'fizz'[i % 3 * 4:]
    buzz = 'buzz'[i % 5 * 4:]
    print(f'{i}: ', fizz + buzz)
1:  
2:  
3:  fizz
4:  
5:  buzz
6:  fizz
7:  
8:  
9:  fizz
10:  buzz
11:  
12:  fizz
13:  
14:  
15:  fizzbuzz
16:  
17:  
18:  fizz
19:  
20:  buzz
21:  fizz
22:  
23:  
24:  fizz
25:  buzz
26:  
27:  fizz
28:  
29:  
30:  fizzbuzz
31:  
32:  
33:  fizz
34:  
35:  buzz
36:  fizz
37:  
38:  
39:  fizz
40:  buzz
41:  
42:  fizz
43:  
44:  
45:  fizzbuzz
46:  
47:  
48:  fizz
49:  
50:  buzz
51:  fizz
52:  
53:  
54:  fizz
55:  buzz
56:  
57:  fizz
58:  
59:  
60:  fizzbuzz
61:  
62:  
63:  fizz
64:  
65:  buzz
66:  fizz
67:  
68:  
69:  fizz
70:  buzz
71:  
72:  fizz
73:  
74:  
75:  fizzbuzz
76:  
77:  
78:  fizz
79:  
80:  buzz
81:  fizz
82:  
83:  
84:  fizz
85:  buzz
86:  
87:  fizz
88:  
89:  
90:  fizzbuzz
91:  
92:  
93:  fizz
94:  
95:  buzz
96:  fizz
97:  
98:  
99:  fizz
100:  buzz

發佈了18 篇原創文章 · 獲贊 2 · 訪問量 633
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章