
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from gensim.models.word2vec import Word2Vec
import tqdm
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential as Sequential
import nltk
from nltk.corpus import stopwords

# set config 


# 1: w2v + 1-layer LSTM: 0.77 (validation data)

# 2: w2v + 2-layer LSTM: 0.77

# 3: w2v + 4-layer LSTM: 0.77

# 4: w2v + 1-layer bi-LSTM: 0.76

# 5: w2v + 4-layer bi-LSTM: 0.77

# 6: w2v + 4-layer bi-LSTM + dropout (0.5, between layers): 0.77

# 7: w2v + 4-layer bi-LSTM + dropout (0.5, between layers): 0.77

# 8: w2v + 8-layer bi-LSTM + dropout (0.5, between layers): 0.76

# 9: w2v + 3-layer bi-LSTM + dropout (0.4, before the input gate): 0.76

# 10: w2v + 3-layer bi-LSTM + dropout (0.3, before the input gate): 0.76

# 11: w2v + 3-layer bi-LSTM + dropout (0.3, before the input gate) + batch = 32 (doubled batch size): 0.76

# 12: w2v + 3-layer bi-LSTM + dropout (0.1, before the input gate) + batch = 64: 0.76

# Summary: bidirectional LSTMs did not help in this case.

# Load the competition CSV files (train / test / sample submission) from path_home.
path_home = "/home/lowry/pro/kaggle_tweets/kaggle_tweets_emotion"
data_train, data_test, data_submit = [
    pd.read_csv(os.path.join(path_home, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
]
id keyword location text target
0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... 1
1 4 NaN NaN Forest fire near La Ronge Sask. Canada 1
2 5 NaN NaN All residents asked to 'shelter in place' are ... 1
3 6 NaN NaN 13,000 people receive #wildfires evacuation or... 1
4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... 1
# Data cleaning

# English stop-word list from NLTK; cleanword() below filters tokens against it.
stopwords_english = stopwords.words("english")

import re
# Patterns used by cleanword(), compiled once at import time instead of per call.
_URL_RE = re.compile(r"http\S*")
_MENTION_RE = re.compile(r"@\S*")
_DIGITS_RE = re.compile(r"\d+")
# U+0089 opens the mojibake sequences in the tweets; the marker plus up to four
# following non-space characters are dropped (same 5-character window as before).
_MOJIBAKE_RE = re.compile(r"\x89\S{0,4}")
# Every punctuation character (and newline) that is turned into a blank.
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in "\n,?.[]!:-#|();=></"})


def cleanword(s, stop_words=None):
    """Normalise one tweet so it can be split into word tokens on single spaces.

    Steps, in order: lowercase; drop stop words (tokens are taken by splitting
    on " "); blank out URLs (``http...``), @mentions, digit runs and U+0089
    mojibake sequences; turn the listed punctuation characters into blanks;
    collapse runs of spaces and strip the ends.

    Args:
        s: Raw tweet text (a ``str``).
        stop_words: Optional iterable of lowercase words to remove. When
            omitted, the module-level ``stopwords_english`` list is used.

    Returns:
        The cleaned text with words separated by exactly one space. May be the
        empty string when nothing survives cleaning.
    """
    stop = frozenset(stopwords_english if stop_words is None else stop_words)

    s = s.lower()
    # Stop words are removed before punctuation stripping, so a token such as
    # "the," is not treated as the stop word "the".
    s = " ".join(word for word in s.split(" ") if word not in stop)

    # re.sub replaces every match in place. The earlier findall + str.replace
    # approach could leave digits behind (e.g. "1 12" -> "2") because replacing
    # one match altered the text of later matches.
    for pattern in (_URL_RE, _MENTION_RE, _DIGITS_RE, _MOJIBAKE_RE):
        s = pattern.sub(" ", s)

    s = s.translate(_PUNCT_TO_SPACE)

    # Collapse consecutive spaces: split(" ") yields "" (not " ") between
    # adjacent spaces, so empty tokens are what must be filtered out.
    return " ".join(word for word in s.split(" ") if word).strip()
# Clean the tweet text of both splits with the same routine.
data_test['text'] = data_test['text'].apply(cleanword)
data_train['text'] = data_train['text'].apply(cleanword)

# Per-tweet word counts (split on single spaces), kept on copies so the
# original frames do not gain an extra column.
data_train_copy = data_train.copy()
data_train_copy["text_length"] = data_train_copy["text"].apply(lambda s: len(s.split(" ")))
# Use the test frame here (not the training frame) so the second percentile
# really describes test.csv.
data_test_copy = data_test.copy()
data_test_copy["text_length"] = data_test_copy["text"].apply(lambda s: len(s.split(" ")))
# The 95th percentile of the word counts is printed for each split.
print("train_text_words_length(95%) = ",np.percentile(data_train_copy["text_length"].tolist(),95))
print("test_text_words_length(95%) = ",np.percentile(data_test_copy["text_length"].tolist(),95))
train_text_words_length(95%) =  22.0
test_text_words_length(95%) =  22.0
# print(data_train_copy)
# print(data_train_copy.info(memory_usage=True))
#train word2vec model 

# sentences = []
# for line in data_train['text'].values:
#     sentences.append(list(line.split(" ")))
# for line in data_test['text'].values:
#     sentences.append(list(line.split(" ")))
# print(len(sentences))
# path_model_w2v = os.path.join(path_home,"w2v_model.model")
# model_w2v = Word2Vec(
#         sentences=sentences,
#         size=200,#維度
#         alpha=0.025, #默認
#         window=5, #默認
#         min_count=2,#2,3
#         sample=0.001,#
#         seed=2018, #
#         workers=11, #線程
#         min_alpha=0.0001, 
#         sg=0, #cbow
#         hs=0, #負採樣
#         negative=5,#負採樣個數
#         ns_exponent=0.75, 
#         cbow_mean=1,#求和再取平均
#         iter=10 #10到15
#         )
# model_w2v.save(path_model_w2v)

# test: 
# model_w2v = Word2Vec.load(path_model_w2v)
# print(model_w2v)
# model_w2v.wv["our"].shape
# for word , wordInfo in model_w2v.wv.vocab.items():
#     print("word = ",word)
#     print("wordInfo = ",wordInfo )
#     break

# Build the word2vec lookup tables and the embedding matrix.
path_model_w2v = os.path.join(path_home, "w2v_model.model")
model_w2v = Word2Vec.load(path_model_w2v)

# Every word stored in the trained model's vocabulary.
vocab_list = list(model_w2v.wv.vocab)

# word -> integer token; index 0 is reserved (mapped to " ") and is the value
# tokenizer() pads with.
word_index = {" ": 0}
# word -> embedding vector
word_vector = {}

# One row per token: (vocabulary size + 1) rows by vector_size columns.
# Row 0 is never assigned below, so it stays all zeros.
embedding_matrix = np.zeros((len(vocab_list) + 1, model_w2v.vector_size))

for i, word in enumerate(vocab_list):
    token = i + 1                     # shift by one to keep 0 free for padding
    vector = model_w2v.wv[word]
    word_index[word] = token          # word  -> token
    word_vector[word] = vector        # word  -> vector
    embedding_matrix[token] = vector  # token -> vector
(8010, 200)
def tokenizer(texts, word_index, sequence_length=None):
    """Encode sentences as fixed-length rows of integer tokens.

    Each sentence is split on single spaces, every word is looked up in
    ``word_index``, and the resulting list is truncated or right-padded with
    0 to exactly ``sequence_length`` entries.

    Args:
        texts: Iterable of sentences (``str``).
        word_index: Mapping from word to integer token.
        sequence_length: Row length. When omitted, the module-level
            ``SEQUENCE_LENGTH`` constant is used.

    Returns:
        Integer ``numpy.ndarray`` of shape ``(len(texts), sequence_length)``.
    """
    if sequence_length is None:
        sequence_length = SEQUENCE_LENGTH

    data = []
    for sentence in texts:
        # Words missing from word_index map to 0, the same value used for
        # padding, instead of raising KeyError.
        tokens = [word_index.get(word, 0) for word in sentence.split(" ")]
        tokens = tokens[:sequence_length]
        tokens += [0] * (sequence_length - len(tokens))
        # Collect the encoded row; without this the result is always empty.
        data.append(tokens)

    # The reshape keeps the result 2-D even when texts is empty.
    return np.array(data, dtype=int).reshape(len(data), sequence_length)

# Encode the cleaned text of both splits into padded token matrices
# (train first, then test).
X_train_tokenizer, X_test_tokenizer = [
    tokenizer(frame["text"].values, word_index)
    for frame in (data_train, data_test)
]

# Hold out part of the labelled data for validation, using
# train_test_split's default split settings.
labels = data_train["target"].values
x_train, x_valid, y_train, y_valid = train_test_split(X_train_tokenizer, labels)
(5709, 22)
    # NOTE(review): this fragment is incomplete as it stands. The enclosing
    # `def` line and the call that should receive the keyword arguments below
    # (presumably `model.add(keras.layers.Embedding(` - TODO confirm) are
    # missing, and the `for` loop has no active body.
    model = keras.Sequential()
            input_dim = len(embedding_matrix),
            output_dim = WORD_SIZE,
            weights=[embedding_matrix],     # pretrained word-vector weights
            input_length = SEQUENCE_LENGTH,
            trainable = False                # whether the word vectors are updated during training

    for i in range(3):
#         model.add(keras.layers.LSTM(64,activation='tanh',return_sequences=True,dropout=0.1))
#         model.add(keras.layers.Bidirectional(keras.layers.LSTM(64,activation='tanh',return_sequences=True,dropout=0.1)))
#         model.add(keras.layers.Dropout(0.5))
#         model.add(keras.layers.BatchNormalization())

    return model
Model: "sequential_21"
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, 22, 200)           1602000   
lstm_102 (LSTM)              (None, 22, 64)            67840     
lstm_103 (LSTM)              (None, 22, 64)            33024     
lstm_104 (LSTM)              (None, 22, 64)            33024     
lstm_105 (LSTM)              (None, 64)                33024     
dense_21 (Dense)             (None, 1)                 65        
Total params: 1,768,977
Trainable params: 166,977
Non-trainable params: 1,602,000
history = model.fit(
    epochs = 60,
    batch_size= BATCH_SIZE,
Train on 5709 samples, validate on 1904 samples
Epoch 1/60
5709/5709 [==============================] - 10s 2ms/sample - loss: 0.6199 - accuracy: 0.6614 - val_loss: 0.5872 - val_accuracy: 0.6949
Epoch 2/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.6039 - accuracy: 0.6744 - val_loss: 0.6035 - val_accuracy: 0.6702
Epoch 3/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5763 - accuracy: 0.7043 - val_loss: 0.5729 - val_accuracy: 0.7048
Epoch 4/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.5697 - accuracy: 0.7084 - val_loss: 0.5840 - val_accuracy: 0.6843
Epoch 5/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5733 - accuracy: 0.7057 - val_loss: 0.5598 - val_accuracy: 0.7159
Epoch 6/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5620 - accuracy: 0.7183 - val_loss: 0.5613 - val_accuracy: 0.7164
Epoch 7/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5538 - accuracy: 0.7208 - val_loss: 0.5454 - val_accuracy: 0.7227
Epoch 8/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.5453 - accuracy: 0.7334 - val_loss: 0.5382 - val_accuracy: 0.7327
Epoch 9/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.5509 - accuracy: 0.7255 - val_loss: 0.5407 - val_accuracy: 0.7311
Epoch 10/60
5709/5709 [==============================] - 1s 252us/sample - loss: 0.5544 - accuracy: 0.7194 - val_loss: 0.5543 - val_accuracy: 0.7337
Epoch 11/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5406 - accuracy: 0.7274 - val_loss: 0.5313 - val_accuracy: 0.7390
Epoch 12/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5377 - accuracy: 0.7325 - val_loss: 0.5391 - val_accuracy: 0.7321
Epoch 13/60
5709/5709 [==============================] - 1s 250us/sample - loss: 0.5265 - accuracy: 0.7465 - val_loss: 0.5458 - val_accuracy: 0.7321
Epoch 14/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5335 - accuracy: 0.7352 - val_loss: 0.5556 - val_accuracy: 0.7269
Epoch 15/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5299 - accuracy: 0.7437 - val_loss: 0.5301 - val_accuracy: 0.7405
Epoch 16/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5247 - accuracy: 0.7462 - val_loss: 0.5677 - val_accuracy: 0.7274
Epoch 17/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5229 - accuracy: 0.7485 - val_loss: 0.5201 - val_accuracy: 0.7426
Epoch 18/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.5315 - accuracy: 0.7359 - val_loss: 0.5246 - val_accuracy: 0.7463
Epoch 19/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5240 - accuracy: 0.7437 - val_loss: 0.5345 - val_accuracy: 0.7316
Epoch 20/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5256 - accuracy: 0.7399 - val_loss: 0.5279 - val_accuracy: 0.7468
Epoch 21/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5160 - accuracy: 0.7551 - val_loss: 0.5166 - val_accuracy: 0.7521
Epoch 22/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5224 - accuracy: 0.7462 - val_loss: 0.5210 - val_accuracy: 0.7521
Epoch 23/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5236 - accuracy: 0.7444 - val_loss: 0.5187 - val_accuracy: 0.7500
Epoch 24/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5285 - accuracy: 0.7409 - val_loss: 0.5789 - val_accuracy: 0.7447
Epoch 25/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.5236 - accuracy: 0.7493 - val_loss: 0.5223 - val_accuracy: 0.7484
Epoch 26/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5126 - accuracy: 0.7511 - val_loss: 0.5415 - val_accuracy: 0.7337
Epoch 27/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5133 - accuracy: 0.7499 - val_loss: 0.5087 - val_accuracy: 0.7558
Epoch 28/60
5709/5709 [==============================] - 1s 250us/sample - loss: 0.5091 - accuracy: 0.7502 - val_loss: 0.5145 - val_accuracy: 0.7489
Epoch 29/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5125 - accuracy: 0.7493 - val_loss: 0.5141 - val_accuracy: 0.7505
Epoch 30/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5104 - accuracy: 0.7514 - val_loss: 0.5357 - val_accuracy: 0.7285
Epoch 31/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.5051 - accuracy: 0.7500 - val_loss: 0.5242 - val_accuracy: 0.7384
Epoch 32/60
5709/5709 [==============================] - 1s 250us/sample - loss: 0.5008 - accuracy: 0.7588 - val_loss: 0.5083 - val_accuracy: 0.7526
Epoch 33/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.5096 - accuracy: 0.7527 - val_loss: 0.5223 - val_accuracy: 0.7463
Epoch 34/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.4977 - accuracy: 0.7581 - val_loss: 0.5096 - val_accuracy: 0.7563
Epoch 35/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5061 - accuracy: 0.7555 - val_loss: 0.5132 - val_accuracy: 0.7574
Epoch 36/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5076 - accuracy: 0.7544 - val_loss: 0.5090 - val_accuracy: 0.7558
Epoch 37/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.4993 - accuracy: 0.7613 - val_loss: 0.5265 - val_accuracy: 0.7468
Epoch 38/60
5709/5709 [==============================] - 1s 244us/sample - loss: 0.4971 - accuracy: 0.7630 - val_loss: 0.5162 - val_accuracy: 0.7579
Epoch 39/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4979 - accuracy: 0.7623 - val_loss: 0.5178 - val_accuracy: 0.7426
Epoch 40/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.4977 - accuracy: 0.7586 - val_loss: 0.5048 - val_accuracy: 0.7537
Epoch 41/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.4993 - accuracy: 0.7642 - val_loss: 0.5071 - val_accuracy: 0.7516
Epoch 42/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4915 - accuracy: 0.7628 - val_loss: 0.5114 - val_accuracy: 0.7600
Epoch 43/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.4857 - accuracy: 0.7705 - val_loss: 0.5226 - val_accuracy: 0.7558
Epoch 44/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4879 - accuracy: 0.7676 - val_loss: 0.5074 - val_accuracy: 0.7647
Epoch 45/60
5709/5709 [==============================] - 1s 244us/sample - loss: 0.4837 - accuracy: 0.7709 - val_loss: 0.5122 - val_accuracy: 0.7574
Epoch 46/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4837 - accuracy: 0.7681 - val_loss: 0.5023 - val_accuracy: 0.7558
Epoch 47/60
5709/5709 [==============================] - 1s 244us/sample - loss: 0.4820 - accuracy: 0.7688 - val_loss: 0.5541 - val_accuracy: 0.7279
Epoch 48/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4844 - accuracy: 0.7665 - val_loss: 0.5059 - val_accuracy: 0.7521
Epoch 49/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4790 - accuracy: 0.7739 - val_loss: 0.5055 - val_accuracy: 0.7621
Epoch 50/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.4820 - accuracy: 0.7716 - val_loss: 0.5064 - val_accuracy: 0.7579
Epoch 51/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.4840 - accuracy: 0.7672 - val_loss: 0.5160 - val_accuracy: 0.7521
Epoch 52/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4745 - accuracy: 0.7733 - val_loss: 0.5221 - val_accuracy: 0.7584
Epoch 53/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4749 - accuracy: 0.7758 - val_loss: 0.5186 - val_accuracy: 0.7474
Epoch 54/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4699 - accuracy: 0.7775 - val_loss: 0.5061 - val_accuracy: 0.7568
Epoch 55/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4683 - accuracy: 0.7767 - val_loss: 0.5178 - val_accuracy: 0.7432
Epoch 56/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4728 - accuracy: 0.7756 - val_loss: 0.5245 - val_accuracy: 0.7516
Epoch 57/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4718 - accuracy: 0.7730 - val_loss: 0.5179 - val_accuracy: 0.7495
Epoch 58/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4681 - accuracy: 0.7781 - val_loss: 0.5209 - val_accuracy: 0.7547
Epoch 59/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4612 - accuracy: 0.7847 - val_loss: 0.5193 - val_accuracy: 0.7521
Epoch 60/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4562 - accuracy: 0.7886 - val_loss: 0.5204 - val_accuracy: 0.7563
def draw(history):
    """Load the metrics recorded in ``history.history`` into a DataFrame.

    Args:
        history: Object exposing a ``history`` attribute that
            ``pd.DataFrame`` accepts (presumably the value returned by
            ``model.fit`` - TODO confirm).

    Returns:
        None.

    NOTE(review): the visible body only builds the frame; nothing is plotted
    or returned here. Confirm whether plotting code is missing.
    """
    frame = pd.DataFrame(history.history)



