Natural Language Processing: Building a Multi-Class Classification Model

1. Introduction

This task comes from the 2020 China College Student Insurance Digital Technology Competition.
(figure: competition overview)
The training set looks like this:
(figure: preview of the training set)
Each row provides two fields, char and word, which both encode the same sentence as sequences of numeric token ids; the goal is to predict the label column from them.
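A quick way to get a feel for the layout (an illustrative snippet; it only assumes the data/train.csv path used below):

# Peek at one training row: char and word are strings of numeric token ids
import pandas as pd

sample = pd.read_csv("data/train.csv").iloc[0]
print(sample["char"])   # e.g. "['109', '57', '56', '52']"
print(sample["word"])
print(sample["label"])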

2. Data Processing Approaches

There are two ways to process this data.
The first is the common NLP text-processing route: train word2vec on the token sequences to generate word vectors, which is convenient because the gensim API can be called directly.

Read the data

# Read the data
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 5)

file = pd.read_csv(r"data/train.csv")
print(len(file))
print(file.columns.values)
104344
['id' 'category' 'char' 'word' 'label']

Extract char and word and write them out as txt files

# Convert the word and char columns of the csv into plain-text files
# print(char_file[0:10]) # ["['109', '57', '56', '52']" "['109']  ["23,43"]]" -- raw column format
def csv_2_txt(words, save_file):
    import re
    print(len(words))
    with open(save_file, "w") as fd:
        for i, c in enumerate(words):
            all_numb_char_ = re.findall(r'\d+-\d+|\d+|[1-9]\d*', str(c)) # extract all numeric tokens (including ranges like 12-34)
            for j in all_numb_char_:
                fd.write(j + " ")
            if i % 10000 == 0:
                print(i)                # progress log
                print(all_numb_char_)

csv_2_txt(file["char"].values, "swap/char_all.txt")
csv_2_txt(file["word"].values, "swap/word_all.txt")

Train a word2vec model on the txt files above

# Train word vectors
def get_vector(txt_file, save_file):
    import gensim
    from gensim.models import word2vec
    sentences = word2vec.Text8Corpus(txt_file) # load the corpus
    model = word2vec.Word2Vec(sentences,
                            sg=1, # skip-gram, which handles rare words better; the default sg=0 is CBOW
                            size=128, # dimensionality of the word vectors; larger values cost memory and compute, typical values are 100-200
                            window=5,  # maximum distance between the current and the predicted word within a sentence
                            min_count=1, # ignore tokens that appear fewer times than this
                            negative=3, sample=0.001, # sample: threshold above which frequent words are randomly downsampled
                            hs=1, # use hierarchical softmax
                            workers=4) # number of worker threads; only effective with Cython installed

    model.save(save_file)  # save the trained model
get_vector("swap/word_all.txt", 'swap/word_all.pkl')
get_vector("swap/char_all.txt", 'swap/char_all.pkl')  # also train char vectors, since swap/char_all.pkl is loaded below
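As a quick sanity check (not part of the original pipeline), the saved model can be reloaded and queried; the token id '57' is just an example taken from the char data shown above:

# Inspect the trained embeddings
from gensim.models.word2vec import Word2Vec

w2v = Word2Vec.load('swap/char_all.pkl')
print(w2v.wv['57'].shape)                  # a 128-dimensional vector (size=128 above)
print(w2v.wv.most_similar('57', topn=5))   # nearest tokens in the embedding space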

Use the trained models to build a vector for each sentence

# Build a vector for every sentence from its word vectors
def numextract(lists):
    # flatten the nested per-word vectors into one flat list of floats
    vector = []
    for arr in lists:
        vector.extend(np.asarray(arr).flatten().tolist())
    return vector

# load the trained word2vec models
from gensim.models.word2vec import Word2Vec
model_char = Word2Vec.load('swap/char_all.pkl')
model_word = Word2Vec.load('swap/word_all.pkl')
def get_generate_vector_from_word2vec(model, ind="word"):
    import re
    import numpy as np
    vectors = []
    numbers = []
    num = []
    for sentence_char in file[ind].values:
        vector = [] # word vectors of the current sentence
        a = re.findall(r'\d+-\d+|\d+|[1-9]\d*', sentence_char) # token ids of this sentence
        num.append(len(a))
        vector.append(model[a])
        numbers.append(a)
        # print(vector)
        # print(numextract(vector))
        vectors.append(numextract(vector))
    # print(vectors)
    # print(numbers)
    # print(num)
    np.save(f"swap/{ind}_vectors.npy", vectors)
    np.save(f"swap/{ind}_numbers.npy", numbers)

# get_generate_vector_from_word2vec(model_char, "char")
# b = np.load("filename.npy") # how to load the saved arrays back

This way of turning text into vectors shows up in many NLP pipelines, since whatever model you train ultimately needs numeric input.
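Because every sentence has a different number of tokens, the flattened vectors also vary in length. A minimal sketch of one way to use them downstream, padding or truncating each one into a fixed-size feature matrix (the file name and max_len here are assumptions, not from the post):

# Sketch: zero-pad or truncate each flattened sentence vector to a fixed length
import numpy as np

def to_fixed_length(vectors, max_len=1280):
    out = np.zeros((len(vectors), max_len), dtype=np.float32)
    for i, v in enumerate(vectors):
        v = np.asarray(v, dtype=np.float32)[:max_len]
        out[i, :len(v)] = v
    return out

# word_vectors = np.load("swap/word_vectors.npy", allow_pickle=True)
# X = to_fixed_length(word_vectors)  # shape: (n_sentences, 1280), ready for any classifier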

  • The second approach works with the raw token sequences directly, then relies on Keras's preprocessing utilities to build the vocabulary and integer sequences.

Use a regular expression (re) to turn the word and char columns of the csv into lists, one sentence per entry

# Pull the word and char columns out of the csv and build one line of text per sentence
import pandas as pd
import numpy as np
import re
csv_data = pd.read_csv("data/public_test.csv")
# csv_data = pd.read_csv("data/train.csv")
csv_data.head(2)

char_line = []
word_line = []
for i in range(len(csv_data)):
    csv_data_char = " ".join(re.findall(r'\d+-\d+|\d+|[1-9]\d*', csv_data["char"].values[i]))
    csv_data_word = " ".join(re.findall(r'\d+-\d+|\d+|[1-9]\d*', csv_data["word"].values[i]))
    char_line.append(csv_data_char) # e.g. "25 56 896 25-56 25 36"
    word_line.append(csv_data_word)
# print(char_line)
# print(word_line)

# np.save("char_line_list.npy", char_line)
# np.save("word_line_list.npy", word_line)
np.save("char_line_list_public.npy", char_line)
np.save("word_line_list_public.npy", word_line)
array(['109 57 56 52', '109',
       '54 55 56 52 57 58 59 60 ....'],
  dtype='<U1143')

Convert the lists into integer sequences with Keras's preprocessing utilities

# Build the vocabulary and the training sequences
# imports
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers.merge import concatenate
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D
from keras.layers import LSTM, GRU, TimeDistributed, Bidirectional
from keras.utils.np_utils import to_categorical
from keras import initializers
from keras import backend as K
from keras.engine.topology import Layer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

def encode_1hot(label):
    from sklearn.preprocessing import OneHotEncoder
    listUniq = list(set(label))
    print(listUniq, len(listUniq)) # all label values, e.g. ['XV', 'XXIV', 'LII', 'XVIII', 'L', 'IV', 'XLVIII', 'XXVI', ...]
    label_onehot_ = []
    for i in label:
        label_onehot_.append(listUniq.index(i))
    # print(label_onehot_)
    labels = np.array(label_onehot_).reshape(len(label_onehot_), -1)
    enc = OneHotEncoder()
    enc.fit(labels)
    tempdata = np.array(enc.transform(labels).toarray())
    # print(tempdata[0:4])
    # print('number of categories:', enc.n_values_)
    return tempdata, len(listUniq)


char_line = np.load("swap/char_line_list.npy")
word_line = np.load("swap/word_line_list.npy")
csv_data = pd.read_csv("swap/new_train.csv")

# Tokenize and build the token-to-id dictionary
tokenizer = Tokenizer(filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=" ")
tokenizer.fit_on_texts(char_line)
vocab = tokenizer.word_index
# print(vocab) # {'57': 1, '56': 2, '58': 3, '60': 4, '29': 5, '30': 6, '64': 7, '7': 8, '28': 9, '24': 10, '20': 11, ...}
print("vocabulary size:", len(vocab))


# Replace every token with its id from the dictionary
word_ids = tokenizer.texts_to_sequences(char_line)
print("number of sequences:", len(word_ids))
# print("word_ids:", word_ids[0:10]) # [[27, 1, 2, 14], [27], [49, 66, 2, 14, 1, 3, 75, 4, 232, 4, 3, 78, 52, ...], ...]

# pad every sequence to a fixed length of 100
sequences = pad_sequences(word_ids, maxlen=100)
print(sequences[2]) # [  0   0   0 ...   0  49  66   2  14   1   3  75   4 232   4  ...]


y = csv_data["label"].values
y_1hot,specise=encode_1hot(y)

3. Models

Two fully connected layers: about the simplest neural network possible.

seed = 236
from keras import optimizers
model = Sequential()
model.add(Dense(256, activation='relu', input_dim=100)) # input_dim=100 because the sequences above were padded to maxlen=100
model.add(Dropout(0.5))
# Dropout fights overfitting and improves generalization by randomly zeroing intermediate features, reducing redundancy between them

model.add(Dense(specise, activation='softmax'))
model.compile(loss='categorical_crossentropy', # multi-class loss
              optimizer='adam',
              metrics=['accuracy']) # track accuracy during training


# print a summary of the model
model.summary()
# model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
sgd = optimizers.SGD(lr=0.01,  # custom optimizer settings
                    decay=1e-6,
                    momentum=0.9,
                    nesterov=True)
model.compile(loss='mean_squared_error', # note: this second compile overrides the one above
                optimizer=sgd,
                metrics=['accuracy'])
# Train the model
# 10-fold cross-validation: ten rounds, each holding out one tenth of the data for evaluation
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for j, (train_index, test_index) in enumerate(kf.split(y, y)):
    model.fit(sequences[train_index],y_1hot[train_index],epochs=50,batch_size=64,verbose=1)
    test_loss,test_acc = model.evaluate(sequences[test_index],y_1hot[test_index])
    print("test_loss:", test_loss, "accuracy:", test_acc)
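One caveat: the same model object keeps training across folds, so each fold's held-out split has already influenced the weights from earlier folds. A stricter variant (a sketch using a hypothetical build_dense_model helper that wraps the layers defined above) rebuilds the model inside the loop:

# Sketch: rebuild a fresh model per fold so every held-out split is truly unseen
def build_dense_model(input_dim, n_classes):
    m = Sequential()
    m.add(Dense(256, activation='relu', input_dim=input_dim))
    m.add(Dropout(0.5))
    m.add(Dense(n_classes, activation='softmax'))
    m.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return m

for j, (train_index, test_index) in enumerate(kf.split(y, y)):
    fold_model = build_dense_model(100, specise)
    fold_model.fit(sequences[train_index], y_1hot[train_index], epochs=50, batch_size=64, verbose=1)
    print(fold_model.evaluate(sequences[test_index], y_1hot[test_index]))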

(figure: training output of the dense model)

Next, a 1D convolutional model (stacked Conv1D + pooling blocks)

seed = 236
from keras import optimizers
from keras.layers.normalization import BatchNormalization
model = Sequential()

model.add(Embedding(len(vocab)+1, 100, input_length=100))
model.add(Convolution1D(256, 3, padding="same"))
model.add(MaxPool1D(3,3,padding="same"))
model.add(Convolution1D(128, 3, padding="same"))
model.add(MaxPool1D(3,3,padding="same"))
model.add(Convolution1D(64, 3, padding="same"))
model.add(Flatten())
model.add(Dropout(0.1))
model.add(BatchNormalization()) # batch normalization layer
model.add(Dense(256,activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(specise,activation="softmax"))
# print a summary of the model
model.summary()
# model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='mean_squared_error', optimizer=sgd,metrics=['accuracy'])
# Train the model
# 10-fold cross-validation
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for j, (train_index, test_index) in enumerate(kf.split(y, y)):  
    model.fit(sequences[train_index],y_1hot[train_index],epochs=50,batch_size=64,verbose=0)
    test_loss,test_acc = model.evaluate(sequences[test_index],y_1hot[test_index])
    print("test_loss:", test_loss, "accuracy:", test_acc)

The final model. Since the competition is scored with F1, we define an F1 function and use it as the training metric for every fold.

# imports
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers.merge import concatenate
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D
from keras.layers import LSTM, GRU, TimeDistributed, Bidirectional
from keras.utils.np_utils import to_categorical
from keras import initializers
from keras import backend as K
from keras.engine.topology import Layer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))



def encode_1hot(label):
    from sklearn.preprocessing import OneHotEncoder
    listUniq = list(set(label))
    np.save("data/listUniq.npy", listUniq) # save the label order so predicted indices can be mapped back to label names
    print(listUniq, len(listUniq))
    label_onehot_ = []
    for i in label:
        label_onehot_.append(listUniq.index(i))
    # print(label_onehot_)
    labels = np.array(label_onehot_).reshape(len(label_onehot_), -1)
    enc = OneHotEncoder()
    enc.fit(labels)
    tempdata = np.array(enc.transform(labels).toarray())
    # print(tempdata[0:4])
    # print('number of categories:', enc.n_values_)
    return tempdata, len(listUniq)


char_line = np.load("swap/char_line_list.npy")
word_line = np.load("swap/word_line_list.npy")

char_line_public = np.load("cnn/char_line_list_public.npy")
word_line_public = np.load("cnn/word_line_list_public.npy")

csv_data = pd.read_csv("swap/new_train.csv")
csv_data_test = pd.read_csv("data/public_test.csv")

# Training set: concatenate the char and word tokens of each sentence
train_and_test = np.array([c + " " + w for c, w in zip(char_line, word_line)])
print("train_and_test len = :", len(train_and_test))
# Public (test) set
need_to_predict = np.array([c + " " + w for c, w in zip(char_line_public, word_line_public)])
print("need_to_predict len = :", len(need_to_predict))

# Fit the tokenizer on the training and the public data together
all_data = np.concatenate([train_and_test, need_to_predict])
print("all_data len = :", len(all_data))


# Tokenize and build the token-to-id dictionary
tokenizer = Tokenizer(filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=" ")
tokenizer.fit_on_texts(all_data)
vocab = tokenizer.word_index
print("vocabulary size:", len(vocab))


# Replace every token with its id from the dictionary
word_ids = tokenizer.texts_to_sequences(train_and_test)
print("word_id len:",len(word_ids))
 
word_ids_predict = tokenizer.texts_to_sequences(need_to_predict)
print("word_ids_predict len:",len(word_ids_predict))    
    
# pad every sequence to a fixed length of 128
sequences = pad_sequences(word_ids, maxlen=128)
sequences_predict = pad_sequences(word_ids_predict, maxlen=128)


y = csv_data["label"].values
y_1hot,specise=encode_1hot(y)


seed = 256
from keras.models import load_model
from keras import optimizers
from keras.layers.normalization import BatchNormalization
model = Sequential()

model.add(Embedding(len(vocab)+1, 128, input_length=128))
model.add(Convolution1D(128, 3, padding="same"))
model.add(MaxPool1D(3,3,padding="same"))
model.add(Convolution1D(128, 3, padding="same"))
model.add(MaxPool1D(3,3,padding="same"))
model.add(Convolution1D(64, 3, padding="same"))
model.add(MaxPool1D(3,3,padding="same"))
model.add(Convolution1D(64, 3, padding="same"))
model.add(Flatten())
model.add(Dropout(0.1))
model.add(BatchNormalization()) # batch normalization layer
model.add(Dense(256,activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(specise,activation="softmax"))
# print a summary of the model
model.summary()
# model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

# model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['acc',f1_m,precision_m, recall_m])

# model.compile(loss='mean_squared_error', optimizer="adam", metrics=["accuracy", 'acc',f1_m,precision_m, recall_m])
# multi-class classification uses categorical_crossentropy
# (binary classification would normally use binary_crossentropy)
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=[f1_m])
# Train the model
# 10-fold cross-validation
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for j, (train_index, test_index) in enumerate(kf.split(y, y)): 
    print(test_index[0:20])
    model.fit(sequences[train_index],y_1hot[train_index],epochs=20,batch_size=64,verbose=1)
    # test_loss,test_acc = model.evaluate(sequences[test_index],y_1hot[test_index])
    one = model.evaluate(sequences[test_index],y_1hot[test_index])
    # print("result == === ==:",test_acc, test_f1_m, test_precision_m, test_recall_m )
    print("result == === ==:",one)
    predict = model.predict(sequences_predict)
    predict=np.argmax(predict,axis=1)
    pd.DataFrame({"predict":predict}).to_csv(f"data/result_{j}.csv")
    # save the model for this fold
    print(f"model save model/model_{j}.h5")
    model.save(f"model/model_{j}.h5")
#     predict = model.predict(sequences[test_index])
#     predict=np.argmax(predict,axis=1)
#     print(predict)
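Each fold writes its own data/result_{j}.csv; the post-processing script below converts one of them, but a simple majority vote over the folds (a sketch, not part of the original code) is an easy way to combine them:

# Sketch: majority vote over the per-fold prediction files written above
import numpy as np
import pandas as pd

fold_preds = [pd.read_csv(f"data/result_{j}.csv")["predict"].values for j in range(10)]
stacked = np.stack(fold_preds, axis=1)                        # shape: (n_samples, n_folds)
majority = np.array([np.bincount(row).argmax() for row in stacked])
pd.DataFrame({"predict": majority}).to_csv("data/result_vote.csv", index=False)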
    

(figure: training output of the final model)
In the end, the model from each fold is used to predict labels for the public test set and the predictions are saved to csv. Because the labels went through one-hot encoding, the predicted indices have to be decoded back to label names; the code below turns the csv files into the submission format the competition requires.

Post-process the model output into the final submission

import numpy as np
import pandas as pd
import sys

csv_file = sys.argv[1]


listUniq = np.load("listUniq.npy")  # the saved label order, e.g. ['XI' 'XLIV' 'XXXVIII' 'XLI' 'LV' 'I' 'XXXVII']
print(listUniq)
print(len(listUniq))

# Keep only the rows whose category is 0 and map predicted indices back to label names
# load the data
data = pd.read_csv("public_test.csv")
predict_result = pd.read_csv(csv_file)["predict"].values # predicted class indices, e.g. [23, 23, 43, 2, ...]

data.head(2)
print(data["id"].values[1])

id_ = []
predict = []
for i, num in enumerate(data["category"].values): # select the rows with category == 0
    if num == 0:
        id_.append(data["id"].values[i])
        predict.append(listUniq[predict_result[i]]) # replace the predicted index with its label name
pd.DataFrame({"id": id_, "predict": predict}).to_csv("result_xxyl.csv", index=False, sep="\t", header=False)
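The script takes the fold result to convert as a command-line argument, e.g. python make_submission.py data/result_0.csv (the script file name here is illustrative), and writes result_xxyl.csv.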

result_xxyl.csv:
(figure: preview of the submission file)
