


python WikiExtractor.py -b 50M -o extracted_dir zhwiki-latest-pages-articles.xml.bz2


-rw-rw-r-- 1 zwx zwx 50M Jan  2 17:32 wiki_00
-rw-rw-r-- 1 zwx zwx 50M Jan  2 17:32 wiki_01
-rw-rw-r-- 1 zwx zwx 50M Jan  2 17:32 wiki_02
-rw-rw-r-- 1 zwx zwx 50M Jan  2 17:33 wiki_03
-rw-rw-r-- 1 zwx zwx 50M Jan  2 17:34 wiki_04
-rw-rw-r-- 1 zwx zwx 50M Jan  2 17:34 wiki_05
-rw-rw-r-- 1 zwx zwx 50M Jan  2 17:35 wiki_06
-rw-rw-r-- 1 zwx zwx 50M Jan  2 17:36 wiki_07
-rw-rw-r-- 1 zwx zwx 50M Jan  2 17:36 wiki_08
-rw-rw-r-- 1 zwx zwx 50M Jan  2 17:37 wiki_09
-rw-rw-r-- 1 zwx zwx 50M Jan  2 17:38 wiki_10
-rw-rw-r-- 1 zwx zwx 50M Jan  2 17:39 wiki_11
-rw-rw-r-- 1 zwx zwx 50M Jan  2 17:40 wiki_12


<doc id="25278" url="https://zh.wikipedia.org/wiki?curid=25278" title="飛越瘋人院">





opencc -i ori_file -o final_file -c t2s.json


# Convert every file in an input directory from Traditional to Simplified
# Chinese with opencc, writing <name>.txt into an output directory.
#
# Usage: bash run_t2s.sh <input_dir> <output_dir>
# Note: uses bash-only [[ ]] tests, so run it with bash rather than plain sh.
source ~/.bashrc

if [[ $# -eq 2 ]]; then
    input=$1
    output=$2

    # The input must be an existing directory; stop early otherwise.
    if [[ ! -d "$input" ]]; then
        echo "$input is not a directory!"
        exit 1
    fi

    # Create the output directory on demand.
    if [[ ! -d "$output" ]]; then
        mkdir -p "$output"
    fi

    # Glob instead of parsing `ls` so names with spaces survive intact.
    for path in "$input"/*; do
        # An empty directory leaves the pattern unexpanded; skip that case.
        [[ -e "$path" ]] || continue
        file=$(basename "$path")
        echo "$input/$file -> $output/$file.txt"
        opencc -i "$input/$file" -o "$output/$file.txt" -c t2s.json
    done
else
    echo "parameter error!"
fi


bash run_t2s.sh ori_root/AA des_root


ori_root/AA/wiki_00 -> des_root/wiki_00.txt
ori_root/AA/wiki_01 -> des_root/wiki_01.txt
ori_root/AA/wiki_02 -> des_root/wiki_02.txt
ori_root/AA/wiki_03 -> des_root/wiki_03.txt


import jieba
import jieba.analyse
import codecs
import sys
import re
import os

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Patterns for lines that consist solely of a <doc ...> opening tag or a
# </doc> closing tag.
wikiDocBegin = re.compile(r'^<doc.*>$')
wikiDocEnd = re.compile(r'^</doc>$')


def segment(ori_file, seg_file, wiki=True):
    """Segment a UTF-8 text file with jieba and write the result.

    Args:
        ori_file: path of the raw text file to read.
        seg_file: path of the segmented text file to write; an existing
            file is truncated.
        wiki: whether the input is wiki data. If true, lines that are a
            ``<doc ...>`` or ``</doc>`` tag are replaced by an empty line
            instead of being segmented.
    """
    # Both files are managed by the `with` statement so that the input
    # handle is closed too, not only the output one.
    with codecs.open(ori_file, 'r', encoding='utf-8') as fr, \
            codecs.open(seg_file, 'w+', encoding='utf-8') as fw:
        for line in fr:
            # Lines that are just a newline are dropped.
            if line == '\n':
                continue

            if wiki and (wikiDocBegin.match(line) or wikiDocEnd.match(line)):
                line_seg = '\n'
            else:
                # Stop words could be filtered here, e.g.:
                # for word in jieba.cut(line):
                #     if word in stopwords:
                #         ...
                # NOTE(review): `line` still carries its trailing newline;
                # presumably jieba returns it as the final token, which is
                # why no newline is appended before writing - confirm.
                line_seg = ' '.join(jieba.cut(line))
            fw.write(line_seg)

def corpus_segment(ori_root, seg_root):
    """Segment every file in ``ori_root`` into a same-named file in ``seg_root``.

    Args:
        ori_root: directory containing the raw text files.
        seg_root: directory that receives the segmented files; it must
            already exist.

    If either directory does not exist, nothing is processed and a message
    is printed instead.
    """
    if os.path.exists(ori_root) and os.path.exists(seg_root):
        for filename in os.listdir(ori_root):
            filepath = os.path.join(ori_root, filename)
            savepath = os.path.join(seg_root, filename)
            print('%s -> %s' % (filepath, savepath))
            segment(filepath, savepath)
    else:
        # Only reached when a directory is missing; without this `else`
        # the message would follow every successful run as well.
        print("%s or %s is not exists!" % (ori_root, seg_root))
corpus_segment('des_root', 'seg_root')  # seg_root is the directory where the segmented files are stored


《 美國 恐怖 故事 》 ( ) 是 美國 恐怖 電視劇 系列 。 第一季 《 凶宅 》 , 該 系列 於 2011 年 10 月 5 日 在 美國 有線電視 FX 首播 。 第二季 片名 爲 《 美國 恐怖 故事 : 瘋人院 》 。 2013 年 10 月 10 日 第三季 《 美國 恐怖 故事 : 女巫 集會 》 開播 。 第四季 《 美國 恐怖 故事 : 畸形 秀 》 2014 年 10 月 8 日 開播 。 第五 季   《 美國 恐怖 故事 : 旅館 》   2015 年 10 月 7 日 開播 。 第六 季 《 美國 恐怖 故事 : 羅亞 諾克 》 2016 年 9 月 14 日 開播 。 第七 季 《 美國 恐怖 故事 : 異教 》 2017 年 9 月 5 日 開播 。 第八 季 《 美國 恐怖 故事 : 啓示錄 》 2018 年 9 月 12 日 開播 。 第九 季 《 美國 恐怖 故事 : 1984 》 2019 年 9 月 18 日 開播 。


from gensim import utils
import gensim.models
import gensim
import codecs
import sys

class MyCorpus:
    """Iterable that streams tokenised lines from every file in a directory.

    Each call to ``__iter__`` lists the directory again and re-reads the
    files, so the object can be iterated more than once.

    Args:
        seg_root: directory containing the UTF-8 text files to read.
    """

    def __init__(self, seg_root):
        self.seg_root = seg_root

    def __iter__(self):
        """Yield one token list per line of each file in ``seg_root``."""
        for filename in os.listdir(self.seg_root):
            filepath = os.path.join(self.seg_root, filename)
            # Close each file once it is exhausted; the corpus is iterated
            # several times, so unclosed handles would accumulate.
            with codecs.open(filepath, 'r', encoding='utf-8') as lines:
                for line in lines:
                    # NOTE(review): per the gensim documentation,
                    # simple_preprocess lowercases and, by default, drops
                    # tokens shorter than 2 or longer than 15 characters,
                    # so single-character words are discarded - confirm
                    # this is intended for the segmented Chinese text.
                    yield utils.simple_preprocess(line)

sentences = MyCorpus('../../datasets/zhwiki-20191201-segment')
# Train word2vec on the corpus: words seen fewer than 10 times are ignored,
# vectors have 128 dimensions, and 4 worker threads are used.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.0 renamed it to
# `vector_size` - adjust if the installed gensim is upgraded.
model = gensim.models.Word2Vec(sentences=sentences, min_count=10, size=128, workers=4)

# 保存模型
2020-01-02 19:44:41,370 : INFO : collecting all words and their counts
2020-01-02 19:44:41,373 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-01-02 19:44:42,171 : INFO : PROGRESS: at sentence #10000, processed 222440 words, keeping 47328 word types
2020-01-02 19:44:43,002 : INFO : PROGRESS: at sentence #20000, processed 448130 words, keeping 75947 word types
2020-01-02 19:44:43,799 : INFO : PROGRESS: at sentence #30000, processed 663495 words, keeping 98422 word types
2020-01-02 19:44:44,505 : INFO : PROGRESS: at sentence #40000, processed 853474 words, keeping 116492 word types
2020-01-02 19:44:45,233 : INFO : PROGRESS: at sentence #50000, processed 1061075 words, keeping 134479 word types
2020-01-02 19:44:46,048 : INFO : PROGRESS: at sentence #60000, processed 1284120 words, keeping 150575 word types
2020-01-02 19:44:46,791 : INFO : PROGRESS: at sentence #70000, processed 1493683 words, keeping 165951 word types
2020-01-02 19:44:47,655 : INFO : PROGRESS: at sentence #80000, processed 1703929 words, keeping 178880 word types
2020-01-02 19:44:48,550 : INFO : PROGRESS: at sentence #90000, processed 1912052 words, keeping 191417 word types
2020-01-02 19:47:13,804 : INFO : PROGRESS: at sentence #2230000, processed 41002249 words, keeping 1232408 word types
2020-01-02 19:47:14,239 : INFO : collected 1234088 word types from a corpus of 41101789 raw words and 2236138 sentences
2020-01-02 19:47:14,240 : INFO : Loading a fresh vocabulary
2020-01-02 19:47:15,952 : INFO : effective_min_count=10 retains 195164 unique words (15% of original 1234088, drops 1038924)
2020-01-02 19:47:15,953 : INFO : effective_min_count=10 leaves 38937017 word corpus (94% of original 41101789, drops 2164772)
2020-01-02 19:47:16,810 : INFO : deleting the raw counts dictionary of 1234088 items
2020-01-02 19:47:16,868 : INFO : sample=0.001 downsamples 4 most-common words
2020-01-02 19:47:16,869 : INFO : downsampling leaves estimated 38877937 word corpus (99.8% of prior 38937017)
2020-01-02 19:47:17,980 : INFO : estimated required memory for 195164 words and 128 dimensions: 297429936 bytes
2020-01-02 19:47:17,981 : INFO : resetting layer weights
2020-01-02 19:48:17,349 : INFO : training model with 4 workers on 195164 vocabulary and 128 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-01-02 19:48:18,398 : INFO : EPOCH 1 - PROGRESS: at 0.36% examples, 162992 words/s, in_qsize 0, out_qsize 0
2020-01-02 19:48:19,399 : INFO : EPOCH 1 - PROGRESS: at 0.80% examples, 188924 words/s, in_qsize 0, out_qsize 0
2020-01-02 19:48:20,430 : INFO : EPOCH 1 - PROGRESS: at 1.25% examples, 189589 words/s, in_qsize 0, out_qsize 0
2020-01-02 20:13:14,215 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-01-02 20:13:14,217 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-01-02 20:13:14,218 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-01-02 20:13:14,244 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-02 20:13:14,246 : INFO : EPOCH - 5 : training on 84461422 raw words (79079705 effective words) took 388.4s, 203605 effective words/s
2020-01-02 20:13:14,247 : WARNING : EPOCH - 5 : supplied example count (4783645) did not equal expected count (2236138)
2020-01-02 20:13:14,248 : WARNING : EPOCH - 5 : supplied raw word count (84461422) did not equal expected count (41101789)
2020-01-02 20:13:14,249 : INFO : training on a 325405977 raw words (305531780 effective words) took 1496.9s, 204110 effective words/s
2020-01-02 20:13:14,251 : INFO : saving Word2Vec object under zhwiki.model, separately None
2020-01-02 20:13:14,252 : INFO : storing np array 'vectors' to zhwiki.model.wv.vectors.npy
2020-01-02 20:13:14,466 : INFO : not storing attribute vectors_norm
2020-01-02 20:13:14,468 : INFO : storing np array 'syn1neg' to zhwiki.model.trainables.syn1neg.npy
2020-01-02 20:13:14,673 : INFO : not storing attribute cum_table
2020-01-02 20:13:15,453 : INFO : saved zhwiki.model
# 加載模型
array([ 2.53410757e-01, -5.01059532e-01,  3.24785173e-01, -2.87997127e-01,
       -2.03630328e-01, -4.41740513e-01, -9.80056226e-01, -3.18926215e-01,
        2.69603521e-01,  4.15005870e-02, -3.44983906e-01, -9.29946065e-01,
        1.17542446e-01,  6.26461327e-01,  2.55293489e-01, -5.00528038e-01,
       -1.31913185e+00, -3.49614114e-01,  8.75532031e-01, -9.50986221e-02,
       -7.91940093e-01,  5.96264660e-01, -1.03322893e-01, -7.24020064e-01,
        3.32150936e-01, -5.02196193e-01, -5.40378332e-01, -4.38142538e-01,
       -2.99931973e-01,  6.05336547e-01,  2.25469723e-01,  1.24695599e+00,
        7.90903568e-01, -2.20353305e-01,  3.88635784e-01, -7.72054732e-01,
       -2.29143873e-01,  5.93681633e-01, -3.20815593e-01, -9.59340408e-02,
       -7.45754242e-01, -2.82496344e-02, -7.66685843e-01, -1.11329404e-03,
        6.21623039e-01,  6.16037071e-01,  9.83787656e-01, -2.17081606e-01,
        2.30760559e-01, -7.36923099e-01, -2.57025719e-01,  4.26793098e-02,
       -3.00876021e-01, -1.11014020e+00,  6.52286649e-01,  1.22128703e-01,
        1.25344169e+00, -6.48789406e-01, -6.11470565e-02,  5.13327718e-01,
       -4.11103815e-01, -5.15032470e-01,  5.64131439e-01, -9.04708505e-01,
        1.31055459e-01, -4.61531520e-01,  1.66795403e-01, -2.88387984e-01,
       -3.16922873e-01,  4.83584367e-02,  7.84826279e-01, -3.43771875e-01,
       -5.57598472e-01, -4.08290237e-01,  4.23400253e-01,  6.28027320e-02,
       -2.75126398e-02,  3.92360210e-01, -3.07065427e-01, -1.40871659e-01,
       -8.15511942e-01,  1.11995924e+00, -5.61312139e-01,  6.45037889e-01,
        6.06502593e-01,  6.34985268e-01, -2.82812864e-01,  4.19008672e-01,
       -2.59597808e-01,  3.67016762e-01, -3.60184878e-01,  9.87740993e-01,
        9.83297706e-01, -7.29820848e-01, -3.91828895e-01,  3.56867343e-01,
       -9.34051692e-01, -5.09055316e-01,  3.45538296e-02,  5.19557238e-01,
        7.74595933e-03, -8.79977345e-02, -4.12295192e-01, -8.62251520e-02,
        8.03641453e-02,  2.12872773e-01, -3.37330550e-01,  5.11872172e-01,
       -5.07763445e-01,  8.02263737e-01, -1.25437587e-01,  8.14941991e-03,
        3.82363439e-01,  4.61312026e-01, -4.81690168e-01, -5.06968517e-03,
       -4.44162220e-01, -2.96849832e-02, -2.33183742e-01, -1.61927864e-01,
       -7.59666190e-02,  5.23146868e-01,  5.90918779e-01,  1.07205339e-01,
       -7.55360901e-01,  3.98152739e-01,  2.35744521e-01,  5.28471589e-01],
# 相似詞
/home/zwx/.virtualenvs/py3tf2/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  """Entry point for launching an IPython kernel.
2020-01-02 20:17:16,817 : INFO : precomputing L2-norms of word weight vectors

[('銀行業', 0.7874805927276611),
 ('金融業', 0.7728987336158752),
 ('金融市場', 0.7700618505477905),
 ('金融服務', 0.7571905851364136),
 ('保險業', 0.7285447716712952),
 ('信貸', 0.7182446718215942),
 ('金融機構', 0.712306022644043),
 ('國際金融', 0.6994981169700623),
 ('風險管理', 0.6932019591331482),
 ('國際貿易', 0.6882727146148682)]
# Load the model previously saved as 'zhwiki.model'
new_model = gensim.models.Word2Vec.load('zhwiki.model')
2020-01-02 20:19:59,994 : INFO : loading Word2Vec object from zhwiki.model
2020-01-02 20:20:01,533 : INFO : loading wv recursively from zhwiki.model.wv.* with mmap=None
2020-01-02 20:20:01,535 : INFO : loading vectors from zhwiki.model.wv.vectors.npy with mmap=None
2020-01-02 20:20:01,820 : INFO : setting ignored attribute vectors_norm to None
2020-01-02 20:20:01,822 : INFO : loading vocabulary recursively from zhwiki.model.vocabulary.* with mmap=None
2020-01-02 20:20:01,823 : INFO : loading trainables recursively from zhwiki.model.trainables.* with mmap=None
2020-01-02 20:20:01,824 : INFO : loading syn1neg from zhwiki.model.trainables.syn1neg.npy with mmap=None
2020-01-02 20:20:02,104 : INFO : setting ignored attribute cum_table to None
2020-01-02 20:20:02,106 : INFO : loaded zhwiki.model
/home/zwx/.virtualenvs/py3tf2/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  """Entry point for launching an IPython kernel.

[('友情', 0.7878501415252686),
 ('親情', 0.767195463180542),
 ('甜蜜', 0.7553292512893677),
 ('浪漫', 0.7542786598205566),
 ('感情', 0.7500328421592712),
 ('愛戀', 0.7450888752937317),
 ('愛情故事', 0.7412598133087158),
 ('初戀', 0.73231440782547),
 ('激情', 0.7304527163505554),
 ('男女之間', 0.7273457050323486)]
