NLP實踐-Task1

對cnews數據做一些數據處理

import jieba
import pandas as pd
import tensorflow as tf
from collections import Counter
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer


# Read the stop-word list
def read_stopword(filename, encoding='utf-8'):
    """Load stop words from a text file containing one word per line.

    Args:
        filename: Path of the stop-word file.
        encoding: Text encoding of the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.

    Returns:
        A list with one entry per line of the file, in file order, with the
        line terminator removed. Blank lines are kept as empty strings.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, 'r', encoding=encoding) as fp:
        return [line.replace('\n', '') for line in fp]


# Tokenize the documents and drop stop words
def cut_data(data, stopword):
    """Replace each document in ``data['content']`` with its token list.

    Every entry of the ``content`` column is segmented with ``jieba.cut`` and
    tokens that appear in ``stopword`` are removed; the order of the
    remaining tokens is preserved.

    Args:
        data: Mapping/DataFrame with a ``content`` column of raw strings.
            The column is overwritten in place.
        stopword: Iterable of tokens to discard.

    Returns:
        The same ``data`` object, with ``content`` holding lists of tokens.
    """
    # Hash the stop words once so each token costs a single O(1) lookup,
    # instead of repeatedly scanning and shrinking the token list.
    stop = set(stopword)
    data['content'] = [
        [token for token in jieba.cut(content) if token not in stop]
        for content in data['content']
    ]
    return data


# Collect every token of every document
def word_list(data):
    """Return all tokens from ``data['content']`` as one flat list.

    Tokens appear in document order and duplicates are kept.
    """
    return [token for tokens in data['content'] for token in tokens]


# Extract features
def feature(train_data, test_data, val_data):
    """Train a Word2Vec model on all three splits and embed every document.

    The ``content`` columns (lists of tokens) of the three frames are
    concatenated and used as the training corpus. Because ``min_count=1``
    every token seen in the corpus receives a vector.

    Args:
        train_data: Frame whose ``content`` column holds token lists.
        test_data: Same layout as ``train_data``.
        val_data: Same layout as ``train_data``.

    Returns:
        A tuple ``(train_fea, test_fea, val_fea)``; each element is the
        result of looking up every document's tokens in the trained model.
    """
    content = pd.concat(
        [train_data['content'], test_data['content'], val_data['content']],
        ignore_index=True,
    )
    # NOTE: `size` / `iter` are the keyword names used by the gensim release
    # this file was written against; they are left untouched.
    model = Word2Vec(content, size=100, min_count=1, window=10, iter=10)

    # Look vectors up through the KeyedVectors object; indexing the model
    # itself (`model[x]`) is deprecated.
    vectors = model.wv

    def embed(tokens):
        return vectors[tokens]

    train_fea = train_data['content'].apply(embed)
    test_fea = test_data['content'].apply(embed)
    val_fea = val_data['content'].apply(embed)
    return train_fea, test_fea, val_fea


if __name__ == '__main__':
    # Each file is tab-separated and has no header row.
    columns = ['title', 'content']
    train_data = pd.read_csv('./data/task1/cnews/cnews.train.txt', sep='\t', names=columns)  # (50000, 2)
    test_data = pd.read_csv('./data/task1/cnews/cnews.test.txt', sep='\t', names=columns)  # (10000, 2)
    val_data = pd.read_csv('./data/task1/cnews/cnews.val.txt', sep='\t', names=columns)  # (5000, 2)

    # Work on the first 50 rows of each split only.
    train_data, test_data, val_data = (
        frame.head(50) for frame in (train_data, test_data, val_data)
    )

    # Tokenize every split and strip stop words.
    stopword = read_stopword('./data/stopword.txt')
    train_data, test_data, val_data = (
        cut_data(frame, stopword) for frame in (train_data, test_data, val_data)
    )

    train_fea, test_fea, val_fea = feature(train_data, test_data, val_data)
    print(train_fea)

    # Unique tokens across the three splits.
    all_word = []
    for frame in (train_data, test_data, val_data):
        all_word.extend(word_list(frame))
    all_word = list(set(all_word))

使用 PyTorch 對 cnews 數據進行訓練

import os
import csv
import jieba
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models import Word2Vec
import torch
import torch.nn as nn
from torch.optim import Adam
import torch.autograd as autograd

# Size of the classifier's output layer (passed to textCNN through `config`).
class_num = 10
# Number of rows per batch (passed to DataLoader through `config`).
batch_size = 256
# Number of token positions kept per document when building the index matrix.
maxlen = 100
# Dimensionality of the Word2Vec vectors and of the embedding matrix rows.
word2vec_size = 100

# Tab-separated data files for the training, validation and test splits.
train_dir = './data/cnews/cnews.train.txt'
valid_dir = './data/cnews/cnews.val.txt'
test_dir = './data/cnews/cnews.test.txt'
# Location where the trained Word2Vec model is saved to / loaded from.
word2vec_dir = './word2vec/word2vec.hdf5'
# User dictionary handed to jieba.load_userdict.
userdict_dir = './dict/userdict.txt'
# Stop-word list, read as a single-column file.
stopword_dir = './dict/stopword.txt'


def cut_word(x, stop_word):
    """Segment ``x`` with jieba and drop stop words and 1-character tokens.

    Args:
        x: Text to segment.
        stop_word: Collection of tokens to discard. Lists and other
            sequences are converted to a set once per call so that each
            membership test is O(1) instead of a linear scan.

    Returns:
        List of the remaining tokens in their original order.
    """
    if not isinstance(stop_word, (set, frozenset, dict)):
        stop_word = set(stop_word)
    return [
        word for word in jieba.cut(x)
        if len(word) != 1 and word not in stop_word
    ]


def get_word_vocab(content):
    """Return the sorted list of distinct tokens found in ``content``.

    Args:
        content: Iterable of token sequences (one per document).

    Returns:
        A list of the unique tokens in ascending order. Sorting makes the
        result, and therefore any word -> index mapping built from it,
        reproducible across runs; plain set iteration order can change from
        one interpreter run to the next because string hashing is randomised.
    """
    vocab = set()
    for sentence in content:
        vocab.update(sentence)
    return sorted(vocab)


def get_x(content, word_index, max_len=None):
    """Convert tokenised documents into a zero-padded matrix of word ids.

    Args:
        content: Sequence of token sequences (one per document).
        word_index: Mapping from token to integer id. Every token that falls
            inside the kept prefix must be present.
        max_len: Number of token positions per row. Defaults to the
            module-level ``maxlen``.

    Returns:
        An integer array of shape ``(len(content), max_len)``. Documents
        longer than ``max_len`` are truncated; shorter ones are padded on the
        right with 0.

    Raises:
        KeyError: If a kept token is missing from ``word_index``.
    """
    if max_len is None:
        max_len = maxlen
    # np.array((rows, cols)) would create a 2-element vector holding the two
    # sizes; np.zeros allocates the intended (rows, cols) matrix.
    X = np.zeros((len(content), max_len), dtype=np.int64)
    for i, tokens in enumerate(content):
        ids = [word_index[token] for token in tokens[:max_len]]
        if ids:
            X[i, :len(ids)] = ids
    return X


def get_label_vector(label):
    """Build a one-hot vector for every distinct label.

    Args:
        label: Iterable of label values (duplicates allowed).

    Returns:
        Dict mapping each distinct label to a list of ints whose length is
        the number of distinct labels, with a single 1. The position of the
        1 is the label's rank in sorted order, so the encoding is the same
        on every run rather than depending on set iteration order.
    """
    classes = sorted(set(label))
    return {
        name: [1 if pos == hot else 0 for pos in range(len(classes))]
        for hot, name in enumerate(classes)
    }


print('read data')
# The training frame is read from `train_dir`. It was previously read from
# `valid_dir`, which left `train_dir` unused and trained on the validation
# split.
train = pd.read_csv(train_dir, delimiter='\t', index_col=None, names=['label', 'content'])
test = pd.read_csv(test_dir, delimiter='\t', index_col=None, names=['label', 'content'])

print(train.shape)
print(test.shape)

print('cut word')
jieba.load_userdict(userdict_dir)
# QUOTE_NONE keeps quote characters in the file as literal stop words.
# A set gives O(1) membership tests while tokenising.
stop_word = set(
    pd.read_csv(stopword_dir, quoting=csv.QUOTE_NONE, index_col=None, names=['word'])['word'].tolist()
)
train['content'] = train['content'].apply(lambda x: cut_word(x, stop_word))
test['content'] = test['content'].apply(lambda x: cut_word(x, stop_word))
content = pd.concat([train['content'], test['content']], axis=0, ignore_index=True)

print('word vocab')
word_vocab = get_word_vocab(content)
# Ids start at 1; row 0 of the embedding matrix stays all-zero for padding.
word_index = {word: i for i, word in enumerate(word_vocab, start=1)}
index_word = {i: word for word, i in word_index.items()}

print('word2vec')
# Reuse a previously saved model when one exists, otherwise train and save.
if not os.path.exists(word2vec_dir):
    model = Word2Vec(content, size=word2vec_size, seed=2019, min_count=5, window=10, iter=10, workers=8)
    model.save(word2vec_dir)
else:
    model = Word2Vec.load(word2vec_dir)

embedding_matrix = np.zeros((len(word_index) + 1, word2vec_size))
# Membership tests and vector lookups go through `model.wv`; doing either
# on the model object itself is deprecated. Words the model has no vector
# for (e.g. below min_count) keep an all-zero row.
word_vectors = model.wv
for word, i in word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]


print('label')
# The label encoding is derived from the training labels and applied to
# both frames.
label_vector = get_label_vector(train['label'])
y_train = train['label'].map(label_vector)
y_test = test['label'].map(label_vector)


class DataLoader():
    """Holds one data split and yields it in batches.

    Attributes are copied from ``config`` in ``__init__``; the batch
    generators slice ``self.data`` positionally.
    """

    def __init__(self, data, config, w2v_model):
        """Store the data frame, batching settings and lookup tables.

        Args:
            data: Frame to iterate over (must support ``sample``,
                ``reset_index`` and ``iloc``).
            config: Dict providing ``batch_size``, ``maxlen``,
                ``label_vector``, ``word_index`` and the embedding matrix
                under either ``embedding`` or ``embedding_matrix``.
            w2v_model: Word2Vec model kept for reference.
        """
        self.data = data
        self.batch_size = config['batch_size']
        self.maxlen = config['maxlen']
        self.label_vector = config['label_vector']
        self.word_index = config['word_index']
        # The config assembled in this file stores the matrix under
        # 'embedding_matrix'; accept both spellings instead of raising.
        if 'embedding' in config:
            self.embedding = config['embedding']
        else:
            self.embedding = config.get('embedding_matrix')
        self.w2v_model = w2v_model

    def data_to_matrix(self, content):
        """Convert token lists into a zero-padded ``(n, maxlen)`` id matrix.

        Args:
            content: Sequence of token sequences.

        Returns:
            Integer array with one row per document; rows are truncated to
            ``self.maxlen`` tokens and right-padded with 0.

        Raises:
            KeyError: If a kept token is missing from ``self.word_index``.
        """
        # np.zeros allocates the matrix; np.array((n, m)) would only build a
        # 2-element vector holding the two sizes.
        X = np.zeros((len(content), self.maxlen), dtype=np.int64)
        for i, tokens in enumerate(content):
            ids = [self.word_index[token] for token in tokens[:self.maxlen]]
            if ids:
                X[i, :len(ids)] = ids
        return X

    def _iter_batches(self, data, batch_size):
        """Yield consecutive positional slices of ``data`` of ``batch_size`` rows."""
        # iloc slices by position, so this also works when the frame's index
        # is not 0..n-1; the last batch may be shorter.
        for start in tqdm(range(0, len(data), batch_size)):
            yield data.iloc[start:start + batch_size, :]

    def train_batch_data(self, batch_size=None, is_shuffle=True):
        """Yield batches of ``self.data``, reshuffled on every call by default.

        Args:
            batch_size: Rows per batch; defaults to ``self.batch_size``.
            is_shuffle: Shuffle the rows before batching when true.
        """
        data = self.data
        if is_shuffle:
            data = data.sample(frac=1).reset_index(drop=True)
        size = self.batch_size if batch_size is None else batch_size
        yield from self._iter_batches(data, size)

    def test_batch_data(self, batch_size=None):
        """Yield batches of ``self.data`` in their stored order.

        Args:
            batch_size: Rows per batch; defaults to ``self.batch_size``.
        """
        size = self.batch_size if batch_size is None else batch_size
        yield from self._iter_batches(self.data, size)


class textCNN(nn.Module):
    """Four-stage 2-D CNN classifier over an embedded token sequence.

    The input ids are embedded, reshaped to a single-channel
    ``(seq_len, embedding_dim)`` image, passed through four
    conv/ReLU/max-pool stages and classified by one linear layer.
    """

    def __init__(self, config):
        """Build the layers.

        Args:
            config: Dict with ``vocab_size``, ``embedding_dim`` and
                ``class_num``. Optional keys: ``embedding_matrix`` (float
                tensor of pre-trained vectors) and ``maxlen`` (sequence
                length; falls back to the module-level ``maxlen``).
        """
        super(textCNN, self).__init__()
        vocab_size = config['vocab_size']
        embedding_dim = config['embedding_dim']
        class_num = config['class_num']
        embedding_matrix = config.get('embedding_matrix')

        self.seq_len = config['maxlen'] if 'maxlen' in config else maxlen
        self.embedding_dim = embedding_dim

        if embedding_matrix is None:
            self.embeding = nn.Embedding(vocab_size, embedding_dim)
        else:
            # from_pretrained takes the table size from the matrix itself, so
            # a vocab_size that differs from the matrix row count no longer
            # breaks construction. freeze=False keeps the weights trainable,
            # as they were with the private `_weight` argument.
            self.embeding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2))
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2))
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2))
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2))

        # The convolutions preserve height/width (kernel 5, padding 2) and
        # each of the four poolings floor-halves them, i.e. // 16 overall.
        # A fixed 512 inputs only fits a 32x32 input, not 100x100.
        flat_features = 128 * (self.seq_len // 16) * (embedding_dim // 16)
        self.out = nn.Linear(flat_features, class_num)

    def forward(self, x):
        """Return class scores of shape ``(batch, class_num)`` for id tensor ``x``."""
        x = self.embeding(x)
        x = x.view(x.size(0), 1, self.seq_len, self.embedding_dim)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = x.view(x.size(0), -1)  # flatten (batch, out_channel, w, h) to (batch, out_channel*w*h)
        output = self.out(x)
        return output


# Settings shared by the data loaders and the network.
config = {
    # The embedding matrix has len(word_index) + 1 rows (row 0 is padding),
    # so the embedding table must be the same size; len(word_vocab) was one
    # row short.
    'vocab_size': len(word_index) + 1,
    'class_num': class_num,
    'batch_size': batch_size,
    'maxlen': maxlen,
    'label_vector': label_vector,
    'word_index': word_index,
    'learning_rate': 1e-3,
    'embedding_dim': word2vec_size,
    # float32 tensor built from the NumPy embedding matrix
    'embedding_matrix': torch.tensor(embedding_matrix, dtype=torch.float32),
}


class Model():
    """Training / validation / prediction wrapper around a textCNN.

    NOTE(review): several names used below are not defined anywhere in the
    visible source (`calculate_qauc`, `self.mx_qauc`, and `query` / `title` /
    `fea` / `label` inside `fit`), and the network is called with three
    arguments although textCNN.forward accepts a single `x`. The class looks
    like it was adapted from another project and not finished — confirm
    before relying on it.
    """

    def __init__(self, train_wide_deep_loader, valid_wide_deep_loader, test_wide_deep_loader, config):
        """Store the three loaders and build the network, loss and optimizer.

        `config` must contain `learning_rate` plus whatever textCNN reads.
        """
        self.train_loader = train_wide_deep_loader
        self.valid_loader = valid_wide_deep_loader
        self.test_loader = test_wide_deep_loader
        self.model = textCNN(config=config)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = Adam(self.model.parameters(), lr=config['learning_rate'])

    def verification(self):
        """Score the validation loader and save the model when the metric improves."""
        res = []
        # Each batch is unpacked into four values here.
        for query, title, fea, label in self.valid_loader.train_batch_data(is_shuffle=True):
            out = self.model(query, title, fea)
            # keep column 1 of every output row
            res.extend([item.detach().numpy()[1] for item in list(out)])

        res = pd.DataFrame(res, columns=['pred'])
        # NOTE(review): expects the validation frame to have `query_id` and
        # `label` columns — confirm against the data actually loaded.
        valid_ans = pd.concat([self.valid_loader.data.loc[:, ['query_id', 'label']], res], axis=1)

        # NOTE(review): `calculate_qauc` is not defined or imported in the
        # visible source.
        qauc = calculate_qauc(valid_ans)
        print('qauc is:')
        print(qauc)
        # NOTE(review): `self.mx_qauc` is read here but never initialised in
        # `__init__`.
        if qauc > self.mx_qauc:
            self.mx_qauc = qauc
            torch.save(self.model, './wnd/model/model.pkl')

    def fit(self, epoch):
        """Run `epoch` passes over the training loader, validating after each pass."""
        for i in range(epoch):
            # NOTE(review): the loop variable is `X_train`, but the body uses
            # `query`, `title`, `fea` and `label`, which are not bound here.
            for X_train in self.train_loader.train_batch_data():
                out = self.model(query, title, fea)  # forward pass: predicted values
                self.optimizer.zero_grad()  # reset the gradients to zero
                loss = self.criterion(out, autograd.Variable(label.long()))  # loss function
                loss.backward()  # backward pass: compute gradients
                self.optimizer.step()  # update all parameters

            self.verification()

    def restore(self):
        """Replace the network with the one saved at './wnd/model/model.pkl'."""
        self.model = torch.load('./wnd/model/model.pkl')

    def predict(self):
        """Score the test loader and write the predictions to './nn_res.csv'."""
        res = []
        # Each test batch is unpacked into three values here.
        for query, title, fea in self.test_loader.test_batch_data():
            out = self.model(query, title, fea)
            # keep column 1 of every output row
            res.extend([item.detach().numpy()[1] for item in list(out)])

        res = pd.DataFrame(res, columns=['pred'])
        # written without header or index, comma-separated
        res.to_csv('./nn_res.csv', header=None, index=None, sep=',')

# Train for one epoch, then write predictions for the test split.
# NOTE(review): `train_loader`, `valid_loader` and `test_loader` are not
# defined anywhere in the visible source — they must be created before this
# point. This assignment also rebinds `model`, which earlier in the script
# refers to the Word2Vec model.
model = Model(train_loader, valid_loader, test_loader, config)
model.fit(1)
# Alternative: load a previously saved network instead of training.
# model = Model(train_loader, valid_loader, test_loader, config)
# model.restore()
model.predict()

評價指標:https://blog.csdn.net/zh11403070219/article/details/82026338

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章