中文維基百科word2vec訓練及其代碼

參考文章:中英文維基百科語料上的Word2Vec實驗

數據來自:https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2

繁體轉爲簡體: 

opencc -i zhwiki.txt -o zhwiki.txt.simple -c zht2zhs.ini
先把文件拆分爲多個文件:
split  -l  30000  ../zhwiki.txt.simple seg
多進程分詞:
$ cat multi_cut.py
import jieba
from multiprocessing import Pool,cpu_count



def cut(name):
    """Segment one corpus shard with jieba and write the result under ``out/``.

    Reads ``seg/<name>`` line by line. Each line is stripped, split on single
    spaces, and every piece is tokenized with ``jieba.lcut`` (``cut_all=False``).
    The tokens are re-joined with single spaces and written to ``out/<name>``,
    one output line per input line.

    Args:
        name: File name of the shard. It is read from ``seg/`` and the result
            is written to ``out/`` under the same name.
    """
    print(name)
    # Both handles are managed by ``with`` so the output file is flushed and
    # closed even if reading or tokenizing fails part-way through the shard.
    with open('out/' + name, 'w') as out, open('seg/' + name, 'r') as src:
        for line in src:
            pieces = line.strip().split(" ")
            segmented = [
                " ".join(jieba.lcut(piece, cut_all=False)) for piece in pieces
            ]
            out.write(' '.join(segmented) + "\n")

if __name__ == '__main__':
    # Names of the shard files to process; cut() reads each one from seg/.
    files = "segaa  segab  segac  segad  segae  segaf  segag  segah  segai  segaj  segak".split()
    print(files)
    # Leave one core free, but never request zero workers: Pool(0) raises
    # ValueError, which is what cpu_count() - 1 yields on a single-core host.
    workers = max(1, cpu_count() - 1)
    pool = Pool(workers)
    try:
        pool.map(cut, files)
    finally:
        # Release the worker processes whether or not a shard failed.
        pool.close()
        pool.join()


訓練前從維基百科 dump 抽取文本的代碼(process_wiki.py)來自互聯網:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Pan Yang ([email protected])
# Copyright 2017

from __future__ import print_function

import logging
import os.path
import six
import sys

from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    # Script entry point: dump every article yielded by WikiCorpus.get_texts()
    # into a plain-text file, one article per line with tokens separated by
    # single spaces.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # Exactly two positional arguments are required: input dump, output file.
    if len(sys.argv) != 3:
        print("Using: python process_wiki.py enwiki.xxx.xml.bz2 wiki.en.text")
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    i = 0  # stays 0 if the corpus yields no articles

    # ``with`` guarantees the output file is closed even if parsing the dump
    # raises half-way through.
    with open(outp, 'w') as output:
        # NOTE(review): the ``lemmatize`` keyword is believed to have been
        # removed in gensim 4.x - confirm against the installed version.
        wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
        for i, text in enumerate(wiki.get_texts(), 1):
            # The original PY2/PY3 branches executed the same statement, so a
            # single write covers both interpreters.
            output.write(' '.join(text) + '\n')
            if i % 10000 == 0:
                logger.info("Saved %d articles", i)

    logger.info("Finished Saved %d articles", i)





發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章