# After some time spent organizing my notes, I am writing up the demos used in
# my project as a blog post, to offer some ideas to those of you studying NLP.
# All of the code runs; if you run into a problem, please leave a comment.
# -*- coding: utf-8 -*-
# @Time : 2019/7/3 9:05
# @Author : hejipei
# @File : keras_sentiment.py
""" """
# Recommended blogs and GitHub repositories:
# https://github.com/ShawnyXiao/TextClassification-Keras/tree/master/model
# http://www.tensorflownews.com/2018/05/10/keras_gru/
# https://my.oschina.net/u/3800567/blog/2965731
# http://www.voidcn.com/article/p-alhbnusv-bon.html
# https://blog.csdn.net/shu15121856/article/category/8840507
import numpy as np
import pandas as pd
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense,Embedding,Dropout,Flatten
from keras.layers import LSTM,SimpleRNN,Bidirectional,GRU
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb
from keras.callbacks import EarlyStopping
def input_data(num_words=None, seq_len=None):
    """Load the IMDB sentiment dataset and pad every review to a fixed length.

    Args:
        num_words: vocabulary-size cap passed to ``imdb.load_data``;
            defaults to the module-level ``max_features``.
        seq_len: target length for ``sequence.pad_sequences``; defaults to
            the module-level ``maxlen``.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` where each review is a
        sequence of ``seq_len`` word indices.
    """
    # Fall back to the globals defined under __main__ so that existing
    # zero-argument callers keep working unchanged.
    if num_words is None:
        num_words = max_features
    if seq_len is None:
        seq_len = maxlen
    print('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))
    print('Pad sequences(samples x time)')
    # Pad (or truncate) every review to a uniform length for batching.
    x_train = sequence.pad_sequences(x_train, maxlen=seq_len)
    x_test = sequence.pad_sequences(x_test, maxlen=seq_len)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    return x_train, y_train, x_test, y_test
def LSTM_model():
    """Build and compile a single-layer LSTM binary sentiment classifier."""
    print('Build LSTM model...')
    # Embedding can only be the first layer: it maps each of the
    # max_features word indices to an embed_size-dimensional vector.
    stack = [
        Embedding(max_features, embed_size, input_length=maxlen),
        Dropout(0.2),
        LSTM(32),
        Dense(256, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(stack)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
def BILSTM_model():
    """Build and compile a bidirectional LSTM binary sentiment classifier."""
    print('Build BILSTM model...')
    stack = [
        Embedding(max_features, embed_size, input_length=maxlen),
        Dropout(0.5),
        # Keep per-timestep outputs so both directions can be flattened below.
        Bidirectional(LSTM(32, return_sequences=True), merge_mode='concat'),
        Dropout(0.5),
        Flatten(),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(stack)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
def RNN_model():
    """Build and compile a plain SimpleRNN binary sentiment classifier."""
    print('Build RNN model...')
    stack = [
        Embedding(max_features, embed_size, input_length=maxlen),
        Dropout(0.5),
        SimpleRNN(16),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(stack)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
def DRNN_model():
    """Build and compile a bidirectional SimpleRNN binary sentiment classifier.

    The bidirectional layer returns the full output sequence, which is
    flattened and fed to a single sigmoid unit.
    """
    # Fix: the original printed 'Build DBRNN_ model...', which is
    # DBRNN_model's banner and made the training logs ambiguous.
    print('Build DRNN model...')
    model = Sequential()
    model.add(Embedding(max_features, embed_size, input_length=maxlen))
    model.add(Dropout(0.5))
    model.add(Bidirectional(SimpleRNN(16, return_sequences=True), merge_mode='concat'))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
def DBRNN_model():
    """Build and compile a stacked classifier: bidirectional SimpleRNN
    followed by a second (unidirectional) SimpleRNN layer."""
    print('Build DBRNN_ model...')
    stack = [
        Embedding(max_features, embed_size, input_length=maxlen),
        Dropout(0.5),
        # return_sequences=True feeds per-timestep outputs into the next RNN.
        Bidirectional(SimpleRNN(16, return_sequences=True), merge_mode='concat'),
        SimpleRNN(8),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(stack)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
def GRU_model():
    """Build and compile a single-layer GRU binary sentiment classifier."""
    print('Build GRU model...')
    stack = [
        Embedding(max_features, embed_size, input_length=maxlen),
        Dropout(0.2),
        GRU(32),
        Dense(256, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(stack)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
def BIGRU_model():
    """Build and compile a bidirectional GRU binary sentiment classifier."""
    print('Build BIGRU model...')
    stack = [
        Embedding(max_features, embed_size, input_length=maxlen),
        Dropout(0.5),
        # Keep per-timestep outputs so both directions can be flattened below.
        Bidirectional(GRU(32, return_sequences=True), merge_mode='concat'),
        Dropout(0.5),
        Flatten(),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(stack)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# 構建模型
def Fast_text_model():
    """Build and compile a fastText-style classifier: embeddings averaged
    over the sequence, then a single sigmoid output unit."""
    print('Build Fast_text model...')
    stack = [
        Embedding(max_features, embed_size, input_length=maxlen),
        GlobalAveragePooling1D(),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(stack)
    model.summary()
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
def fit_evaluate(model, x_train, y_train, x_test, y_test, monitor='val_acc', patience=5):
    """Train a compiled model with early stopping and print its test metrics.

    Args:
        model: a compiled Keras model.
        x_train, y_train: training sequences and binary labels.
        x_test, y_test: held-out sequences and labels, used both for early
            stopping and for the final evaluation.
        monitor: quantity watched by ``EarlyStopping``. NOTE: Keras >= 2.3
            renamed this metric to ``'val_accuracy'``; pass that value when
            running on a newer Keras.
        patience: epochs without improvement before training is stopped.
    """
    # NOTE(review): the test set doubles as the validation set here, so the
    # printed accuracy is an optimistic estimate -- a separate validation
    # split would be sounder methodology.
    early_stopping = EarlyStopping(monitor=monitor, patience=patience)
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=[early_stopping],
              validation_data=(x_test, y_test))
    score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
if __name__ == "__main__":
    # Workflow:
    #   1. input_data    -- load the preprocessed dataset
    #   2. xxx_model     -- build and compile one of the models
    #   3. fit_evaluate  -- train the model and print its test accuracy
    max_features = 25000  # vocabulary size
    maxlen = 400          # maximum sequence length
    batch_size = 32       # mini-batch size
    embed_size = 50       # word-embedding dimension
    epochs = 10           # training epochs

    x_train, y_train, x_test, y_test = input_data()

    # Accuracies observed on previous runs:
    #   Fast_text 0.889   | LSTM  0.868    | BILSTM 0.88936 | RNN   0.65
    #   DRNN      0.8718  | DBRNN 0.75036  | GRU    0.86364 | BIGRU 0.8748
    builders = (
        Fast_text_model,
        LSTM_model,
        BILSTM_model,
        RNN_model,
        DRNN_model,
        DBRNN_model,
        GRU_model,
        BIGRU_model,
    )
    for build in builders:
        model = build()
        fit_evaluate(model, x_train, y_train, x_test, y_test)
    # Print the architecture of the last model trained (BIGRU).
    model.summary()
#
#
# from keras.preprocessing import sequence
# from keras.models import Sequential
# from keras.layers import Dense,Embedding
# from keras.layers import LSTM
# from keras.datasets import imdb
#
# max_features = 20000
# maxlen = 80
# batch_size = 32
#
# print('Loading data...')
# (x_train,y_train),(x_test,y_test) = imdb.load_data(num_words= max_features )
# print(len(x_train),'train sequences')
# print(len(x_test),'test sequences')
# print('Pad sequences(samples x time)')
# x_train = sequence .pad_sequences(x_train ,maxlen= maxlen )
# x_test = sequence .pad_sequences(x_test ,maxlen= maxlen )
#
# print('x_train shape:',x_train .shape )
# print('x_test shape:',x_test .shape )
#
# print('Build model...')
# model = Sequential()
# model.add(Embedding (max_features ,128))#嵌入層將正整數下標轉換爲固定大小的向量。只能作爲模型的第一層
# model.add(LSTM (128,dropout= 0.2,recurrent_dropout= 0.2))
# model.add(Dense(1,activation= 'sigmoid'))
# model.compile(loss= 'binary_crossentropy',optimizer= 'adam',metrics= ['accuracy'])
#
# print('Train...')
#
# model.fit(x_train ,y_train ,batch_size= batch_size ,epochs= 5,validation_data= (x_test ,y_test ))
#
# score,acc = model.evaluate(x_test ,y_test ,batch_size= batch_size )
# print('Test score:',score)
# print('Test accuracy:', acc)