Python——n-gram實現
目標:給定文本以及切分長度n,將文本切分爲長度爲n的子文本(n-gram),並以列表或集合的形式輸出。
例子:
輸入:哈哈
切分長度:2
列表輸出:['哈哈']
集合輸出:{('哈', '哈')}
輸入:哈哈哈哈
切分長度:3
列表輸出:['哈哈哈', '哈哈哈']
集合輸出:{('哈', '哈', '哈')}
輸入:唧唧復唧唧
切分長度:3
列表輸出:['唧唧復', '唧復唧', '復唧唧']
集合輸出:{('唧', '唧', '復'), ('復', '唧', '唧'), ('唧', '復', '唧')}
輸入:君不見黃河之水天上來
切分長度:3
列表輸出:['君不見', '不見黃', '見黃河', '黃河之', '河之水', '之水天', '水天上', '天上來']
集合輸出:{('河', '之', '水'), ('君', '不', '見'), ('不', '見', '黃'), ('黃', '河', '之'), ('見', '黃', '河'), ('天', '上', '來'), ('水', '天', '上'), ('之', '水', '天')}
代碼:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@Time :2019/12/7
@Name :Zhang Wei
@Contact :[email protected]
@File :demo.py
@Software :Pycharm
"""
# Read a delimited text file
def load_csv(path, encoding='utf-8', sep="\t"):
    """Read a text file and return its lines, optionally split into fields.

    Args:
        path: Path of the file to read.
        encoding: Text encoding passed to ``open``.
        sep: Field separator. When ``None`` the lines are not split.

    Returns:
        A list with one entry per line, newline characters removed. Each
        entry is a list of fields when ``sep`` is given, otherwise the
        plain line string.
    """
    with open(path, 'r', encoding=encoding) as handle:
        rows = [raw.replace("\n", "") for raw in handle]
    if sep is None:
        return rows
    return [row.split(sep) for row in rows]
# Build the ordered list of n-grams
def create_ngram_list(input_list, ngram_num):
    """Return every n-gram of ``input_list`` in order of appearance.

    Args:
        input_list: A string, or another sliceable sequence such as a list.
        ngram_num: Window length n.

    Returns:
        A list with one entry per window. When every element of the input
        is a ``str`` (always true for a string input) each window is joined
        into a single string. Otherwise each window is returned as a list,
        so sequences of non-string items no longer raise ``TypeError``.
        When the input is no longer than ``ngram_num`` the result is a
        one-element list holding the input itself.
    """
    if len(input_list) <= ngram_num:
        return [input_list]
    # zip over the sequence shifted by 0..n-1 yields the sliding windows.
    windows = zip(*[input_list[i:] for i in range(ngram_num)])
    if all(isinstance(item, str) for item in input_list):
        return ["".join(window) for window in windows]
    # "".join only accepts strings; keep other element types as sub-lists.
    return [list(window) for window in windows]
# Build the set of distinct n-grams
def create_ngram_set(input_list, ngram_num):
    """Return the distinct n-grams of ``input_list`` as a set of tuples.

    Args:
        input_list: A string, or another sliceable sequence such as a list.
        ngram_num: Window length n.

    Returns:
        A set of tuples, one per distinct window. When the input is no
        longer than ``ngram_num`` the set holds a single tuple made of the
        whole input.
    """
    if len(input_list) > ngram_num:
        # zip over the sequence shifted by 0..n-1 yields the sliding windows.
        shifted = [input_list[offset:] for offset in range(ngram_num)]
        return set(zip(*shifted))
    return {tuple(input_list)}
# Randomly generate an entity string made of n sampled words
def get_entity(words_list, n):
    """Concatenate ``n`` items drawn at random, with replacement.

    Args:
        words_list: Non-empty sequence of strings to sample from.
        n: Number of items to draw.

    Returns:
        The drawn items joined into one string; ``""`` when ``n`` is not
        positive.

    Raises:
        ValueError: If ``words_list`` is empty while ``n`` is positive.
    """
    import random  # function-local import, as in the original block

    if n > 0 and len(words_list) == 0:
        raise ValueError("words_list must not be empty")
    # join avoids repeated string concatenation in a loop.
    return "".join(random.choice(words_list) for _ in range(n))
if __name__ == "__main__":
    # Interactive demo: repeats until stdin is closed or Ctrl-C is pressed.
    while True:
        # demo 1: n-gram splitting
        try:
            text = input("demo 1:\n輸入:")
            ngram_num = int(input("切分長度:"))
        except ValueError:
            # A non-integer length used to abort the whole demo; ask again.
            print("切分長度必須是整數,請重新輸入。\n")
            continue
        except (EOFError, KeyboardInterrupt):
            # Leave the loop quietly instead of ending with a traceback.
            break
        print("\n列表輸出:{0}".format(create_ngram_list(text, ngram_num)))  # list form
        print("集合輸出:{0}\n".format(create_ngram_set(text, ngram_num)))  # set form
        # demo 2: randomly generate a text of length n
        print("demo 2:{0}\n".format(get_entity(
            words_list=["我", "是", "四", "川", "人", "我", "愛", "喫", "火", "鍋"],
            n=3)))
        # sample output: 是川四
補充:函數create_ngram_set的輸入還可以是列表,如:
print("集合輸出:{0}\n".format(create_ngram_set([1, 4, 9, 4, 1, 4], 3))) # 集合形式
# 集合輸出:{(4, 1, 4), (4, 9, 4), (1, 4, 9), (9, 4, 1)}
但是對同樣的輸入(元素不是字符串的列表),create_ngram_list會報錯(TypeError),因爲其中的"".join()只接受字符串元素;如果希望以列表形式輸出,需要對create_ngram_list進行修改。