分词、去停用词

#https://github.com/xgli/jieba

import os
import jieba

# Root directory of the raw (not yet segmented) corpus.
# The value is a blank placeholder; set it before running.
corpus_path =r' '
# Root directory under which the segmented corpus is written.
seg_path = r' '
# Path to the stop-word list file.
stop_list_Path = r' '

def stopwordsList(stop_list_Path):
    """Load the stop-word list from a UTF-8 text file.

    Args:
        stop_list_Path: Path to a text file, read line by line.

    Returns:
        A list with one entry per line of the file, in file order, each
        stripped of leading/trailing whitespace (blank lines become ``''``).
    """
    # The context manager closes the handle even if reading fails; the
    # previous version opened the file and never closed it.
    with open(stop_list_Path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]

def readfile(filepath):
    """Return the entire content of a GB2312-encoded text file as one string.

    Bytes that cannot be decoded as GB2312 are silently dropped
    (``errors='ignore'``). The whole text is returned in one piece so the
    caller can run word segmentation over it.

    Args:
        filepath: Path of the file to read.

    Returns:
        The decoded file content.
    """
    # ``with`` guarantees the handle is closed even when ``read()`` raises.
    with open(filepath, 'r', encoding='gb2312', errors='ignore') as f:
        return f.read()
    
def savefile(seg_path, content):
    """Write ``content`` to ``seg_path`` as UTF-8, replacing any existing file.

    Args:
        seg_path: Destination file path.
        content: Text to write.
    """
    # ``with`` flushes and closes the handle even when ``write()`` raises.
    with open(seg_path, 'w', encoding='utf-8') as f:
        f.write(content)

def tikenizer_and_removeStoplist(corpus_path, stop_list_Path, output_path=None):
    """Segment every file of a category-structured corpus and drop stop words.

    Each sub-directory of ``corpus_path`` is treated as one category. Every
    file inside a category is read with ``readfile``, has its newlines
    removed, is tokenized with ``jieba.cut`` and filtered against the
    stop-word list. The surviving tokens are joined with single spaces and
    written with ``savefile`` to ``<output root>/<category>/<file name>``.

    Args:
        corpus_path: Root directory of the raw corpus.
        stop_list_Path: Path to the stop-word file, loaded via
            ``stopwordsList``.
        output_path: Root directory for the segmented corpus. When omitted,
            the module-level ``seg_path`` is used, as before.
    """
    out_root = seg_path if output_path is None else output_path

    # Load the stop words once instead of once per document, and keep them
    # in a set so each membership test is O(1).
    stopwords = set(stopwordsList(stop_list_Path))

    for cate in os.listdir(corpus_path):
        # os.path.join keeps the paths portable (the old code hard-coded
        # Windows separators and doubled one before the file name).
        cate_dir = os.path.join(corpus_path, cate)
        if not os.path.isdir(cate_dir):
            # Stray files next to the category folders are not categories.
            continue
        seg_cate_dir = os.path.join(out_root, cate)
        os.makedirs(seg_cate_dir, exist_ok=True)

        for filename in os.listdir(cate_dir):
            content = readfile(os.path.join(cate_dir, filename))
            # Drop line breaks and surrounding whitespace before segmenting.
            content = content.replace("\n", '').strip()
            # Keep tokens that are neither whitespace-only (tabs, spaces)
            # nor stop words.
            words = [
                word for word in jieba.cut(content)
                if word.strip() and word not in stopwords
            ]
            # Join the token list itself: joining the already-concatenated
            # string put a space between every single character.
            savefile(os.path.join(seg_cate_dir, filename), ' '.join(words))
    print("分词结束")

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

分词、去停用词

分词、去停用词

如何使用 JS 判断用户是否处于活跃状态

Mono 支持LoongArch架构

lightdb秒级增加列和删除列（not null带默认值）

lightdb数据库超时相关控制参数

通过HPA+CronHPA组合应对业务复杂弹性伸缩场景

❤️‍🔥 Solon Cloud Event 新的事务特性与应用

lightdb mysql 8.0兼容之不可见主键

使用 JS 实现在浏览器控制台打印图片 console.image()

基于Ubuntu-22.04安装K8s-v1.28.2实验（四）使用域名访问网站应用

分類問題集錦及練習

中餐館過程僞代碼及python實現

Day1——Data PreProcessing

gensim word2vec

IDEA初上手的一天

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結