jieba並行分詞

jieba並行分詞每次都要重新寫,這次記下來。

# coding:utf-8
import codecs
from multiprocessing import Pool
import jieba


# Input corpus: read_data() opens this path as UTF-8 and reads it line by line.
fin = "news.txt"
# Output path: parallel_seg() writes one space-joined segmented line per input line.
fout = "news.seg"

def read_data(path=None, chunk_size=100000):
    """Yield the lines of a UTF-8 text file in lists of ``chunk_size`` lines.

    Every line has leading and trailing whitespace stripped; blank lines are
    kept (as empty strings) so the output stays aligned with the input.

    Args:
        path: File to read. Defaults to the module-level ``fin``.
        chunk_size: Maximum number of lines per yielded list.

    Yields:
        Lists of stripped lines. Every list holds exactly ``chunk_size``
        lines except the last one, which holds whatever is left over and
        may be as short as a single line. An empty file yields nothing.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 (raised when
            iteration starts, because this is a generator).
    """
    if path is None:
        path = fin
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # The context manager closes the file once the generator is exhausted
    # or discarded; the previous version never closed it.
    with codecs.open(path, "r", "utf-8") as reader:
        batch = []
        for line in reader:
            batch.append(line.strip())
            if len(batch) == chunk_size:
                yield batch
                batch = []
        # Flush the trailing partial chunk. Without this the last
        # (line_count % chunk_size) lines were silently dropped.
        if batch:
            yield batch

def seg(texts):
    """Segment every text with jieba and return the space-joined results.

    Args:
        texts: Iterable of strings to segment.

    Returns:
        A list with one string per input text, in the same order, where the
        pieces produced by ``jieba.cut`` are joined with single spaces.
    """
    return [" ".join(jieba.cut(sentence)) for sentence in texts]


def parallel_seg(cpus=10):
    """Segment the corpus with a pool of worker processes.

    Reads the input in chunks via ``read_data()``, splits each chunk into
    roughly ``cpus`` slices, runs ``seg`` on the slices in parallel and
    writes the segmented lines to ``fout`` (UTF-8, one line per input line,
    in input order). Progress is printed after every chunk.

    Args:
        cpus: Number of worker processes, and the target number of slices
            per chunk.
    """
    done = 0  # running total of lines written, reported as progress
    # One pool for the whole run instead of starting and tearing down a new
    # set of worker processes for every chunk.
    pool = Pool(cpus)
    try:
        with codecs.open(fout, "w", "utf-8") as writer:
            for chunk in read_data():
                # Ceiling division, clamped to 1: a chunk with fewer lines
                # than workers used to give step == 0, which makes range()
                # raise ValueError.
                step = max(1, -(-len(chunk) // cpus))
                slices = [chunk[i:i + step]
                          for i in range(0, len(chunk), step)]
                # pool.map returns results in the order of its inputs, so
                # the output lines keep the order of the input lines.
                for segmented in pool.map(seg, slices):
                    writer.writelines(line + "\n" for line in segmented)
                # Count actual lines; multiplying the current chunk length
                # by the chunk index is wrong once chunk sizes differ.
                done += len(chunk)
                # Single-argument call form prints the same text under both
                # Python 2 and Python 3.
                print("finished samples: %d" % done)
    finally:
        # Always release the worker processes, even if a chunk failed.
        pool.close()
        pool.join()


# Run the segmentation job only when this file is executed as a script.
# NOTE(review): keep this guard -- multiprocessing may import this module
# again inside worker processes on platforms that spawn instead of fork;
# see the multiprocessing programming guidelines.
if __name__ == "__main__":
    parallel_seg()

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章