Keras text classification

1. The Reuters dataset (built into Keras, downloadable)

from keras.datasets import reuters
import numpy as np
from keras import models
from keras import layers
import copy
# from keras.utils.np_utils import to_categorical
# 8,982 training examples and 2,246 test examples, 46 target classes
(train_data,train_labels),(test_data,test_labels)=reuters.load_data(num_words=10000)
# Compute the hit rate of random guessing (shuffled labels): about 18.4%
# (shuffling keeps the class frequencies, so the expected hit rate is sum(p_i^2), well above 1/46 ≈ 2.2%)
copy_labels=copy.copy(test_labels)
np.random.shuffle(copy_labels)
arr=np.array(test_labels)==np.array(copy_labels)
print(float(np.sum(arr)/len(arr)))
# Decode a newswire back into readable text
word_index = reuters.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decoded_newswire = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
print(decoded_newswire)

def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results
# Keras's built-in to_categorical does the same thing (see the alternative noted below)
def prehandle_labels(labels,dimension=46):
	results=np.zeros((len(labels),dimension))
	for i,label in enumerate(labels):
		results[i,label]=1.
	return results
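# Equivalent one-liner using the built-in helper (the commented import at the top);
# a small optional alternative, not part of the original script:
# from keras.utils.np_utils import to_categorical
# y_train = to_categorical(train_labels)
# y_test = to_categorical(test_labels)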
#Preprocess the data and the labels
x_train=vectorize_sequences(train_data)
x_test=vectorize_sequences(test_data)
y_train=prehandle_labels(train_labels)
y_test=prehandle_labels(test_labels)

#Hold out part of the training data as a validation set
x_val=x_train[:1000]
y_val=y_train[:1000]
part_x_train=x_train[1000:]
part_y_train=y_train[1000:]
model=models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(64, activation='relu'))
#softmax outputs a probability distribution over the 46 classes
model.add(layers.Dense(46, activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])
#overfitting starts after about 9 epochs, so train for 9
history=model.fit(part_x_train,part_y_train,epochs=9,batch_size=512,validation_data=(x_val,y_val))
results=model.evaluate(x_test,y_test)
#[0.9810771037719128, 0.7880676759212865], about 80% accuracy, much better than random guessing
print(results)
#Inspect the predictions
predict=model.predict(x_test)
print(predict[:20])
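#Optional, not in the original: each row of predict is a probability distribution over the 46 classes,
#so argmax gives the predicted class index
predicted_classes=np.argmax(predict,axis=1)
print(predicted_classes[:20])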



2. The IMDB dataset (downloadable via Keras)

2.1 The baseline approach

from keras.datasets import imdb
import numpy as np
from keras import models
from keras import layers
import matplotlib.pyplot as plt
#Keep only the 10,000 most frequent words
#The training data are movie reviews with words already mapped to integers; label 0 means negative, 1 means positive
(train_data,train_labels),(test_data,test_labels)=imdb.load_data(num_words=10000)
# print(train_data[0])
# print(train_data.shape)
# print(train_labels[0])
#The largest word index found across all reviews; prints 9999
# print(max([max(seq) for seq in train_data]))
#Decode an integer sequence back into words
# word_index = imdb.get_word_index()
# reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# #indices 0, 1 and 2 are reserved for "padding", "start of sequence" and "unknown", hence the i - 3 offset
# decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
# print(decoded_review)

#Preprocess the data: expand each sequence into a 10,000-dimensional vector; for train_data the result has shape (25000, 10000)
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        #sequence is the list of word indices for one review in train_data
        #e.g. if the indices are 1, 2, 3, 4 the resulting vector starts 0, 1, 1, 1, 1, ...
        #i.e. for each index x, position x (0-based) is set to 1
        results[i, sequence] = 1.
    return results
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
# print(x_train[0]) #[0 1 1... 0 0 0]
#Convert the labels to float vectors
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')
#Build the model
model=models.Sequential()
#bias terms are built in; together with the nonlinear activation they matter because
#stacking purely linear transforms collapses into a single linear transform, and a multi-layer network would gain nothing
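#(Illustrative note, not in the original: composing two affine layers,
# y = W2@(W1@x + b1) + b2 = (W2@W1)@x + (W2@b1 + b2), is still one affine map;
# the relu in between is what prevents this collapse and lets extra layers add capacity.)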
model.add(layers.Dense(16,activation='relu',input_shape=(10000,)))
model.add(layers.Dense(16,activation='relu'))
# sigmoid activation for the binary classification output
model.add(layers.Dense(1,activation='sigmoid'))
# Loss function
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
# Hold out part of the training set as a validation set
x_val = x_train[:10000]
partial_x_train = x_train[10000:]
y_val = y_train[:10000]
partial_y_train = y_train[10000:]
history=model.fit(partial_x_train,partial_y_train,epochs=20,batch_size=512,validation_data=(x_val, y_val))
loss,accuracy=model.evaluate(x_test,y_test)
print('accuracy:',accuracy)
# ~99.9% training accuracy, ~87% validation accuracy, ~85% test accuracy
# Plot the training curves
history_dict=history.history
train_loss=history_dict['loss']
validation_loss=history_dict['val_loss']
epochs=range(1,len(train_loss)+1)
plt.plot(epochs,train_loss,'bo',label='Training loss')
plt.plot(epochs,validation_loss,'r',label='Validation loss')
plt.title('training and validation loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()
#Clear the previous figure and plot again
plt.clf()
train_acc = history_dict['acc']
validation_acc = history_dict['val_acc']
plt.plot(epochs, train_acc, 'bo', label='Training acc')
plt.plot(epochs, validation_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


#Retrain: epochs should not be too large or the model overfits; about 3-4 is enough
#The run above used a 10,000-sample validation split to pick the epoch count; now we can train on the whole training set
model=models.Sequential()
model.add(layers.Dense(16,activation='relu',input_shape=(10000,)))
model.add(layers.Dense(16,activation='relu'))
model.add(layers.Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])
#epochs=4, using the entire training set
model.fit(x_train,y_train,epochs=4,batch_size=512)
#Test loss and accuracy on the test set: [0.294587216424942, 0.8832]
results=model.evaluate(x_test,y_test)
print(results)
#Per-sample predictions on the test set (values between 0 and 1)
predict=model.predict(x_test)
print(predict)
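#Optional, not in the original: threshold the probabilities at 0.5 to get hard 0/1 labels
predicted_labels=(predict>0.5).astype('int32')
print(predicted_labels[:10])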

2.2 Three tweaks to the baseline, and their test results

#Three ways of tweaking the model
from keras.datasets import imdb
import numpy as np
from keras import models
from keras import layers
(train_data,train_labels),(test_data,test_labels)=imdb.load_data(num_words=10000)
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')

model=models.Sequential()
model.add(layers.Dense(16,activation='relu',input_shape=(10000,)))
model.add(layers.Dense(16,activation='relu'))
#3. Use the tanh activation: test accuracy only ~84%, and validation accuracy is also worse
# model.add(layers.Dense(16,activation='tanh',input_shape=(10000,)))
# model.add(layers.Dense(16,activation='tanh'))
#1. Add an extra layer and change the second layer's output to 32 units, everything else unchanged; the result barely changes
# model.add(layers.Dense(16,activation='relu'))
model.add(layers.Dense(1,activation='sigmoid'))
#2. Use mean squared error (mse) as the loss: the validation loss becomes much lower, but neither accuracy changes much
# model.compile(optimizer='rmsprop',loss='mse',metrics=['accuracy'])
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(x_train,y_train,epochs=4,batch_size=512)
#Baseline test loss and accuracy: [0.294587216424942, 0.8832]
#1,[0.308044932346344, 0.882]
#2,[0.086122335729599, 0.88292]
#3,[0.32080198724746706, 0.87768]
results=model.evaluate(x_test,y_test)
print(results)

2.3 Adding L1 or L2 regularization to the baseline

Judging from the results, using L1 and L2 together works best; L2 alone already performs very well, while L1 alone is even slightly worse than the baseline.

#Add L1 or L2 regularization
from keras.datasets import imdb
import numpy as np
from keras import models
from keras import layers
from keras import regularizers
(train_data,train_labels),(test_data,test_labels)=imdb.load_data(num_words=10000)
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')
#Shuffle the training set
index=[i for i in range(len(x_train))]
np.random.shuffle(index)
x_train=x_train[index]
y_train=y_train[index]
#Hold out a validation set
x_val = x_train[:10000]
partial_x_train = x_train[10000:]
y_val = y_train[:10000]
partial_y_train = y_train[10000:]
#Baseline model
print('start training model_1')
model=models.Sequential()
model.add(layers.Dense(16,activation='relu',input_shape=(10000,)))
model.add(layers.Dense(16,activation='relu'))
model.add(layers.Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])
history=model.fit(partial_x_train,partial_y_train,epochs=20,batch_size=512,validation_data=(x_val,y_val),verbose=0)
#With L2 regularization
print('start training model_2')
model2=models.Sequential()
model2.add(layers.Dense(16,kernel_regularizer=regularizers.l2(0.001),activation='relu',input_shape=(10000,)))
model2.add(layers.Dense(16,kernel_regularizer=regularizers.l2(0.001),activation='relu'))
model2.add(layers.Dense(1,activation='sigmoid'))
model2.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])
history2=model2.fit(partial_x_train,partial_y_train,epochs=20,batch_size=512,validation_data=(x_val,y_val),verbose=0)
#With L1 regularization
print('start training model_3')
model3=models.Sequential()
model3.add(layers.Dense(16,kernel_regularizer=regularizers.l1(0.001),activation='relu',input_shape=(10000,)))
model3.add(layers.Dense(16,kernel_regularizer=regularizers.l1(0.001),activation='relu'))
model3.add(layers.Dense(1,activation='sigmoid'))
model3.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])
history3=model3.fit(partial_x_train,partial_y_train,epochs=20,batch_size=512,validation_data=(x_val,y_val),verbose=0)
#With L1 and L2 used together
print('start training model_4')
model4=models.Sequential()
model4.add(layers.Dense(16,kernel_regularizer=regularizers.l1_l2(l1=0.001,l2=0.001),activation='relu',input_shape=(10000,)))
model4.add(layers.Dense(16,kernel_regularizer=regularizers.l1_l2(l1=0.001,l2=0.001),activation='relu'))
model4.add(layers.Dense(1,activation='sigmoid'))
model4.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])
#after 40 epochs the overall validation loss is still falling
#after 48 epochs it starts to rise overall
history4=model4.fit(partial_x_train,partial_y_train,epochs=20,batch_size=512,validation_data=(x_val,y_val),verbose=0)
#Plotting
import matplotlib.pyplot as plt
#Note: the 'val_loss' key only exists if the model was trained with a validation set
val_loss_1=history.history['val_loss']
val_loss_2=history2.history['val_loss']
val_loss_3=history3.history['val_loss']
val_loss_4=history4.history['val_loss']
epochs=range(1,len(val_loss_4)+1)
plt.plot(epochs,val_loss_1,'r',label='val_loss_1')
plt.plot(epochs,val_loss_2,'bo',label='val_loss_L2')
plt.plot(epochs,val_loss_3,'y+',label='val_loss_L1')
plt.plot(epochs,val_loss_4,'k^',label='val_loss_L1&L2')
plt.title('validation loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()
results=model.evaluate(x_test,y_test)
results_2=model2.evaluate(x_test,y_test)
results_3=model3.evaluate(x_test,y_test)
results_4=model4.evaluate(x_test,y_test)
# [0.7685215129828453, 0.85004] 
# [0.4729495596218109, 0.86188] 
# [0.5593033714866639, 0.84552] 
# [0.5237347905826568, 0.87284]
print(results,results_2,results_3,results_4)

Here is the validation-loss plot; from the figure, L2 alone appears to be best.

2.4 Adding dropout to the baseline

#Effect of Dropout layers: the validation loss is somewhat lower; the run took 556 s
from keras.datasets import imdb
import numpy as np
from keras import models
from keras import layers
import matplotlib.pyplot as plt

(train_data,train_labels),(test_data,test_labels)=imdb.load_data(num_words=10000)

def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')
#Shuffle the training set
index=[i for i in range(len(x_train))]
np.random.shuffle(index)
x_train=x_train[index]
y_train=y_train[index]
#Hold out a validation set
x_val = x_train[:10000]
partial_x_train = x_train[10000:]
y_val = y_train[:10000]
partial_y_train = y_train[10000:]
#model 1
print('starting model_1...')
model=models.Sequential()
model.add(layers.Dense(16,activation='relu',input_shape=(10000,)))
model.add(layers.Dense(16,activation='relu'))
model.add(layers.Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history=model.fit(partial_x_train,partial_y_train,epochs=20,batch_size=512,validation_data=(x_val, y_val))
#model 2
print('starting model_2...')
model2=models.Sequential()
model2.add(layers.Dense(16,activation='relu',input_shape=(10000,)))
#Add a Dropout layer after each hidden layer
model2.add(layers.Dropout(0.5))
model2.add(layers.Dense(16,activation='relu'))
model2.add(layers.Dropout(0.5))
model2.add(layers.Dense(1,activation='sigmoid'))
model2.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history2=model2.fit(partial_x_train,partial_y_train,epochs=20,batch_size=512,validation_data=(x_val, y_val))
#Evaluate both models on the test set
result1=model.evaluate(x_test,y_test)
result2=model2.evaluate(x_test,y_test)
print(result1,result2)
#Plot the validation losses
history_dict=history.history
val_loss_1=history_dict['val_loss']
val_loss_2=history2.history['val_loss']
epochs=range(1,len(val_loss_1)+1)
plt.plot(epochs,val_loss_1,'b^',label='val_loss_1')
plt.plot(epochs,val_loss_2,'ro',label='val_loss_2')
plt.title('validation loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()

The validation loss is shown below: overfitting appears later and the loss rises more slowly, so dropout does help somewhat.

3. Still the IMDB dataset, using other techniques

3.1 Word embeddings (Embedding)

An Embedding layer is effectively a fully connected layer applied to one-hot inputs, but a direct dense layer is inefficient; a table lookup is much faster. For details see this article:

Embedding剖析
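As a quick sketch of that equivalence (my own illustration, with made-up names such as embedding_matrix and word_id, not code from the referenced article): multiplying a one-hot vector by a dense layer's weight matrix returns exactly one row of that matrix, which is all an Embedding lookup does.

import numpy as np
vocab_size,embed_dim=1000,64
embedding_matrix=np.random.rand(vocab_size,embed_dim)  #stands in for the Embedding layer's weights
word_id=42
one_hot=np.zeros(vocab_size)
one_hot[word_id]=1.
dense_view=one_hot@embedding_matrix         #dense-layer view: full matrix product
lookup_view=embedding_matrix[word_id]       #lookup view: just pick row 42
print(np.allclose(dense_view,lookup_view))  #True -- same vector, far less work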

#Word embeddings
from keras.datasets import imdb
from keras import preprocessing
from keras.models import Sequential
from keras.layers import Flatten,Dense,Embedding
import matplotlib.pyplot as plt
#vocabulary size 1000 (1 + max word index), each vector 64-dimensional
# embedding_layer=Embedding(1000,64)
max_feature=10000
#keep only 20 words per review (pad_sequences keeps the last maxlen words by default)
max_len=20
(x_train,y_train),(x_test,y_test)=imdb.load_data(num_words=max_feature)
#Pad/truncate the data to a uniform length
x_train=preprocessing.sequence.pad_sequences(x_train,maxlen=max_len)
x_test=preprocessing.sequence.pad_sequences(x_test,maxlen=max_len)
model=Sequential()
#The 3 arguments: consider only the 10,000 most frequent words,
# map each word to a vector of length 8, and use only max_len words per review for training/testing
model.add(Embedding(10000,8,input_length=max_len))
model.add(Flatten())
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
#validation_split is a simpler way to hold out a validation set
#with only 20 words per review the accuracy reaches about 75%
history=model.fit(x_train,y_train,epochs=10,batch_size=32,validation_split=0.2,verbose=2)
h=history.history
acc=h['acc']
val_acc=h['val_acc']
loss=h['loss']
val_loss=h['val_loss']
epochs=range(1,len(acc)+1)
plt.plot(epochs,acc,'ro',label='train_acc')
plt.plot(epochs,val_acc,'b^',label='val_acc')
plt.title('acc')
plt.legend()
plt.figure()
plt.plot(epochs,loss,'ro',label='train_loss')
plt.plot(epochs,val_loss,'b^',label='val_loss')
plt.title('loss')
plt.legend()
plt.figure()
plt.show()

3.2 Using the raw IMDB dataset (text not yet converted to integer sequences) and building our own hot-word dictionary from a downloaded lookup table

Two downloads are needed here: aclImdb, the raw dataset, and GloVe, a table of 400,000 word vectors. When we want to process raw text data, we have to work through all of these steps from scratch. GloVe was presumably trained on a huge corpus, so its word-to-vector mapping generalizes well.

The test results are not great, though.

#Use pretrained word embeddings, i.e. the 400,000-word vector table
#Vocabulary: 10,000 hot words
#Each text: use 100 words for training and evaluation
#Use the raw IMDB data, i.e. the files under the aclImdb folder
#Use only 200 training samples and 10,000 validation samples
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
import json
import matplotlib.pyplot as plt
from keras.models import model_from_json
imdb_dir='D:/Data/DeepLearningWithPython/IMDB/aclImdb'
train_dir=os.path.join(imdb_dir,'train')
#Collect the texts and labels
labels=[]
texts=[]
for label_type in ['neg','pos']:
	dir_name=os.path.join(train_dir,label_type)
	for fname in os.listdir(dir_name):
		#files open with the system encoding by default; force utf-8
		f=open(os.path.join(dir_name,fname),mode='r',encoding='utf-8')
		texts.append(f.read())
		f.close()
		if label_type=='neg':
			labels.append(0)
		else:
			labels.append(1)

maxlen = 100
training_samples = 200
validation_samples = 10000
max_words = 10000
#Steps
#1. Define the Tokenizer
#Consider only the 10,000 most common words
tokenizer = Tokenizer(num_words=max_words)
#2. Build the word index
#fit_on_texts selects the vocabulary and assigns each word an index
tokenizer.fit_on_texts(texts)
#3. Convert texts to integer sequences
sequences = tokenizer.texts_to_sequences(texts)
#Alternatively in one step: for string samples this outputs a matrix of shape (samples, 10000)
# one_hot_results=tokenizer.texts_to_matrix(samples,mode='binary')
#word -> index mapping (covers every word seen; only indices below max_words are used later)
word_index = tokenizer.word_index
# print('Found %s unique tokens.' % len(word_index))
#Truncate or pad each sequence to length 100
data = pad_sequences(sequences, maxlen=maxlen)
#Convert the label list to an array
labels = np.asarray(labels)
#(25000, 100): 25,000 samples, 100 words each
# print('Shape of data tensor:', data.shape)
#(25000, )
# print('Shape of label tensor:', labels.shape)
# Shuffle the data and split off training/validation sets
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

#4. Load the pretrained table of 400,000 words and their vectors; it will be used as a lookup table
glove_dir = 'D:/Data/DeepLearningWithPython/glove'
#400,000 words with their vectors, each vector of length 100
embeddings_index = {}
#'100d' means this file maps each word or token to a 100-dimensional vector
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'),encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
# print('Found %s word vectors.' % len(embeddings_index))

embedding_dim = 100
#5. Look up each word of word_index in the pretrained word-vector dictionary and collect the results in embedding_matrix
#shape: 10,000 hot words, each mapped to a 100-dimensional vector
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        #table lookup
        embedding_vector = embeddings_index.get(word)
        #words not found in GloVe keep an all-zero vector
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
#6. Build the model
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
#7. Set the embedding layer's weights to the word-vector matrix (the hot-word dictionary we built)
# and freeze it so it is not trained (the dictionary is fixed and only used for lookup)
model.layers[0].set_weights([embedding_matrix])
#freeze
model.layers[0].trainable = False
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))

#Plotting
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'ro', label='Training acc')
plt.plot(epochs, val_acc, 'b^', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'ro', label='Training loss')
plt.plot(epochs, val_loss, 'b^', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

#Test data
test_dir = os.path.join(imdb_dir, 'test')
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname),encoding='utf-8')
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)
result=model.evaluate(x_test,y_test)
# [0.8080911725044251, 0.561]
print(result)

3.3 Using an RNN (recurrent neural network): SimpleRNN

Test accuracy is 80%, lower than the previous best of 87% with L1&L2 regularization, because earlier each review was encoded as a 10,000-dimensional multi-hot vector built from all of its words.

#RNN (recurrent neural network): SimpleRNN
#Vocabulary: 10,000 hot words
#Keep 500 words per review
#SimpleRNN is not well suited to long sequences
#In practice it cannot learn long-range dependencies; deep/unrolled networks suffer from vanishing gradients
#One fix for vanishing gradients: keep information in a separate carry track for later use
#LSTM: Long Short-Term Memory
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense,Embedding,SimpleRNN
import matplotlib.pyplot as plt
max_features=10000
maxlen=500
# batch_size=32
(x_train,y_train),(x_test,y_test)=imdb.load_data(num_words=max_features)
x_train=sequence.pad_sequences(x_train,maxlen=maxlen)
x_test=sequence.pad_sequences(x_test,maxlen=maxlen)
model=Sequential()
#input_length is omitted here; it is only required when the Embedding is followed by a Flatten or Dense layer
model.add(Embedding(max_features,32))
#32-dimensional output
model.add(SimpleRNN(32))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history=model.fit(x_train,y_train,epochs=10,batch_size=128,validation_split=0.2,verbose=2)
h=history.history
acc = h['acc']
val_acc = h['val_acc']
loss = h['loss']
val_loss = h['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'ro', label='Training acc')
plt.plot(epochs, val_acc, 'b^', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'ro', label='Training loss')
plt.plot(epochs, val_loss, 'b^', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
#[0.6611378932380676, 0.804], lower than the 88% from chapter 3 because only 500 words per review are used
result=model.evaluate(x_test,y_test,verbose=2)
print(result)

3.4 Replacing SimpleRNN with LSTM (Long Short-Term Memory), another kind of RNN layer

Test accuracy rises to 86%, close to the previous best.

#RNN: LSTM
#LSTM: Long Short-Term Memory; used for question answering, machine translation, etc.
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM
import matplotlib.pyplot as plt
max_features=10000
maxlen=500
(x_train,y_train),(x_test,y_test)=imdb.load_data(num_words=max_features)
x_train=sequence.pad_sequences(x_train,maxlen=maxlen)
x_test=sequence.pad_sequences(x_test,maxlen=maxlen)
model=Sequential()
model.add(Embedding(max_features,32))
#32-dimensional output
model.add(LSTM(32))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history=model.fit(x_train,y_train,epochs=10,batch_size=128,validation_split=0.2,verbose=2)
h=history.history
acc = h['acc']
val_acc = h['val_acc']
loss = h['loss']
val_loss = h['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'ro', label='Training acc')
plt.plot(epochs, val_acc, 'b^', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'ro', label='Training loss')
plt.plot(epochs, val_loss, 'b^', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
#[0.46161370715618133, 0.86164], about 6 points higher than SimpleRNN
result=model.evaluate(x_test,y_test,verbose=2)
print(result)

3.5 Using another RNN variant: GRU

GRU works on a similar principle to LSTM but is slightly simpler; accuracy drops a little, but training is faster.

For an explanation of how LSTM and GRU work, see the reference article: LSTM與GRU

#GRU instead of LSTM
#GRU uses only 2 gates, a reset gate and an update gate; it takes a bit less time
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense,Embedding,GRU
import matplotlib.pyplot as plt
max_features=10000
maxlen=500
(x_train,y_train),(x_test,y_test)=imdb.load_data(num_words=max_features)
x_train=sequence.pad_sequences(x_train,maxlen=maxlen)
x_test=sequence.pad_sequences(x_test,maxlen=maxlen)
model=Sequential()
model.add(Embedding(max_features,32))
model.add(GRU(32))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history=model.fit(x_train,y_train,epochs=10,batch_size=128,validation_split=0.2,verbose=2)
h=history.history
acc = h['acc']
val_acc = h['val_acc']
loss = h['loss']
val_loss = h['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'ro', label='Training acc')
plt.plot(epochs, val_acc, 'b^', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'ro', label='Training loss')
plt.plot(epochs, val_loss, 'b^', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
#[0.4026906166577339, 0.85212], about the same as LSTM
result=model.evaluate(x_test,y_test,verbose=2)
print(result)

3.6 Reversing the sequences and testing

Accuracy is still 83%, suggesting that sentiment analysis may depend mainly on which words occur and hardly at all on their order.

#Reverse the training and test sequences; the test shows the task is not very order-sensitive
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras import layers
from keras.models import Sequential
import matplotlib.pyplot as plt
max_features=10000
maxlen=500
(x_train,y_train),(x_test,y_test)=imdb.load_data(num_words=max_features)
#1. Reverse the sequences
x_train=[x[::-1] for x in x_train]
x_test=[x[::-1] for x in x_test]
x_train=sequence.pad_sequences(x_train,maxlen=maxlen)
x_test=sequence.pad_sequences(x_test,maxlen=maxlen)
model=Sequential()
# input_dim: vocabulary size, max input index + 1, i.e. the dimension of the one-hot code
#output_dim: the dimension of the vector each word is mapped to
#Embedding is essentially a dictionary acting as a fully connected layer:
# it replaces the one-hot-times-weight-matrix product with a table lookup for efficiency
model.add(layers.Embedding(input_dim=max_features,output_dim=32))
model.add(layers.GRU(32))
model.add(layers.Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history=model.fit(x_train,y_train,batch_size=128,epochs=10,validation_split=0.2,verbose=2)
h=history.history
acc = h['acc']
val_acc = h['val_acc']
loss = h['loss']
val_loss = h['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'r.:', label='Training acc')
plt.plot(epochs, val_acc, 'b.:', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'r.:', label='Training loss')
plt.plot(epochs, val_loss, 'b.:', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
#[0.47393704013347626, 0.83376], not far from the 85% of the forward-order GRU
result=model.evaluate(x_test,y_test,verbose=2)
print(result)


3.7 Using a bidirectional RNN (Bidirectional)

That is, the data are processed in both the forward and the reversed direction at the same time. Judging from the result, it is not even as good as the forward-only model.

#Bidirectional RNNs
#For text tasks like sentiment analysis the loss is similar in either direction, but looking from both directions can capture more patterns
#i.e. train on the forward and the reversed sequence at the same time
from keras.layers import Bidirectional,Dense,Embedding,GRU
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.datasets import imdb
import sys
sys.path.append('..')
#This is a small helper module I wrote for saving and loading run data
from utils.acc_loss_from_txt import data_to_text
# import matplotlib.pyplot as plt
max_features=10000
maxlen=500
(x_train,y_train),(x_test,y_test)=imdb.load_data(num_words=max_features)
x_train=sequence.pad_sequences(x_train,maxlen=maxlen)
x_test=sequence.pad_sequences(x_test,maxlen=maxlen)
model=Sequential()
model.add(Embedding(max_features,32))
#1. Use a bidirectional RNN
model.add(Bidirectional(GRU(32)))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history=model.fit(x_train,y_train,batch_size=128,epochs=10,validation_split=0.2,verbose=2)
h=history.history
acc = h['acc']
val_acc = h['val_acc']
loss = h['loss']
val_loss = h['val_loss']
path='./imdb_7.txt'
data_to_text(path,'acc',acc)
data_to_text(path,'val_acc',val_acc)
data_to_text(path,'loss',loss)
data_to_text(path,'val_loss',val_loss)
# [0.4243735435771942, 0.83652], slightly better than the reversed model but still below the forward one?
result=model.evaluate(x_test,y_test,verbose=2)
# print(result)
data_to_text(path,'test result',result)


Attached below is the helper module I wrote for saving a model's accuracy/loss history and reading it back to display as plots; a short usage example follows the module.

import matplotlib.pyplot as plt
#Smooth a curve with an exponential moving average
def smooth_curve(points, factor=0.8):
  smoothed_points = []
  for point in points:
    if smoothed_points:
      previous = smoothed_points[-1]
      smoothed_points.append(previous * factor + point * (1 - factor))
    else:
      smoothed_points.append(point)
  return smoothed_points

#Read data back from a text file
def data_from_text(file,smooth=False):
  """
    # Arguments
      file: the file to read the data from
      smooth: whether or not to smooth the curves
  """
  f=open(file,encoding='utf-8')
  lines=f.read().split('\n')
  f.close()
  names=[]
  result=[]
  for line in lines:
    #skip empty lines
    if line:
      data=line.split(':')
      names.append(data[0])
      a=[float(x) for x in data[1][1:-1].split(',')]
      if smooth:
        a=smooth_curve(a)
      result.append(a)
  return names,result
#Append data to a text file
def data_to_text(file,desc,data):
  """
  # Arguments
      file: the file to append the data to
      desc: a short description string for the data
      data: the data to store (a list)
  """
  f=open(file,'a+')
  f.write(desc+':'+str(data)+'\n')
  f.close()
#Read the data and show it as plots in one step
def data_to_graph(file,smooth=False):
  names,data=data_from_text(file,smooth)
  length=len(data)
  #if a test-result entry is included, the count is odd
  if length%2==1:
    length-=1
  epoches=range(1,len(data[0])+1)
  for i in range(0,length,2):
    plt.plot(epoches,data[i],'r.--',label=names[i])
    plt.plot(epoches,data[i+1],'b.--',label=names[i+1])
    plt.legend()
    if i==length-2:
      plt.show()
    else:
      plt.figure()
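A minimal usage sketch (my own addition; the file name './imdb_7.txt' is just an example and must already have been written by data_to_text in a previous run):

names,curves=data_from_text('./imdb_7.txt',smooth=True)  #e.g. names == ['acc','val_acc','loss','val_loss','test result']
data_to_graph('./imdb_7.txt',smooth=True)                #plots the acc/val_acc and loss/val_loss pairs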

3.8 Using 1D convolution and pooling layers

Analogous to the 2D convolution/pooling layers used on images, text can be processed with 1D convolution/pooling layers. The advantage is very fast training (ten-odd seconds per epoch); the drawback is that they only capture local sequence patterns, so they perform poorly on order-sensitive text.

#1D convolution/pooling layers: training on text is very fast and accuracy is acceptable
#By epoch 6 validation accuracy is close to 84%, test accuracy 82%; GRU reached 85.2%, so there is still a gap
from keras.models import Sequential
from keras.layers import Conv1D,Dense,Embedding,MaxPool1D,GlobalMaxPool1D
from keras.optimizers import RMSprop
from keras.preprocessing import sequence
from keras.datasets import imdb
import sys
sys.path.append('..')
from utils.acc_loss_from_txt import data_to_text,data_to_graph
max_features=10000
maxlen=500
(x_train,y_train),(x_test,y_test)=imdb.load_data(num_words=max_features)
x_train=sequence.pad_sequences(x_train,maxlen=maxlen)
x_test=sequence.pad_sequences(x_test,maxlen=maxlen)
model=Sequential()
model.add(Embedding(max_features,128,input_length=maxlen))
#1D convolution layer
model.add(Conv1D(32,7,activation='relu'))
#1D max-pooling layer
model.add(MaxPool1D(5))
model.add(Conv1D(32,7,activation='relu'))
#global max-pooling layer; a Flatten layer would also work
model.add(GlobalMaxPool1D())
#sigmoid added so the output is a probability, as binary_crossentropy expects
model.add(Dense(1,activation='sigmoid'))
#the default learning rate is 0.001; here we use 1e-4
model.compile(optimizer=RMSprop(lr=1e-4),loss='binary_crossentropy',metrics=['acc'])
history=model.fit(x_train,y_train,batch_size=128,epochs=10,validation_split=0.2,verbose=2)
h=history.history
loss=h['loss']
val_loss=h['val_loss']
acc=h['acc']
val_acc=h['val_acc']
path='./imdb_8.txt'
data_to_text(path,'loss',loss)
data_to_text(path,'val_loss',val_loss)
data_to_text(path,'acc',acc)
data_to_text(path,'val_acc',val_acc)
result=model.evaluate(x_test,y_test,verbose=2)
data_to_text(path,'test result',result)
data_to_graph(path)

3.9 Preprocessing with 1D convolution layers, then training with a GRU

Here a validation set was used first; it showed overfitting starting around epoch 6, so the model is then retrained on the full training set for 6 epochs and evaluated on the test set (some of the earlier scripts evaluated an overfit model directly, which is less rigorous). Oddly, the accuracy is still lower than the previous approach. A sketch of that validation pass is appended after the script below.

#Preprocess with 1D convolution/pooling layers, then train with a GRU
#Validation accuracy at epoch 6 was 79%; retraining on the full training set for 6 epochs gives 79% test accuracy
from keras.models import Sequential
from keras.layers import Conv1D,Dense,Embedding,MaxPool1D,GRU
from keras.optimizers import RMSprop
from keras.preprocessing import sequence
from keras.datasets import imdb
import time
import sys
sys.path.append('..')
from utils.acc_loss_from_txt import data_to_text,data_to_graph
max_features=10000
maxlen=500
(x_train,y_train),(x_test,y_test)=imdb.load_data(num_words=max_features)
x_train=sequence.pad_sequences(x_train,maxlen=maxlen)
x_test=sequence.pad_sequences(x_test,maxlen=maxlen)
model=Sequential()
model.add(Embedding(max_features,128,input_length=maxlen))
model.add(Conv1D(32,7,activation='relu'))
model.add(MaxPool1D(5))
model.add(Conv1D(32,7,activation='relu'))
#1. Use a GRU after the 1D convolution layers
model.add(GRU(32,dropout=0.2,recurrent_dropout=0.5))
#sigmoid added so the output is a probability, as binary_crossentropy expects
model.add(Dense(1,activation='sigmoid'))
#the default learning rate is 0.001; here we use 1e-4
model.compile(optimizer=RMSprop(lr=1e-4),loss='binary_crossentropy',metrics=['acc'])
t1=time.time()
history=model.fit(x_train,y_train,batch_size=128,epochs=6,verbose=2)
h=history.history
loss=h['loss']
acc=h['acc']
path='./imdb_9.txt'
data_to_text(path,'loss',loss)
data_to_text(path,'acc',acc)
result=model.evaluate(x_test,y_test,verbose=2)
data_to_text(path,'test_result',result)
t2=time.time()
data_to_text(path,'cost_mins',(t2-t1)/60.)
data_to_graph(path)
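
For reference, here is a minimal sketch (my own addition, not code from the post) of the validation pass described above that picked epochs=6. It reuses the imports and variables from the script; the name probe is made up:

probe=Sequential()
probe.add(Embedding(max_features,128,input_length=maxlen))
probe.add(Conv1D(32,7,activation='relu'))
probe.add(MaxPool1D(5))
probe.add(Conv1D(32,7,activation='relu'))
probe.add(GRU(32,dropout=0.2,recurrent_dropout=0.5))
probe.add(Dense(1,activation='sigmoid'))
probe.compile(optimizer=RMSprop(lr=1e-4),loss='binary_crossentropy',metrics=['acc'])
#train with a 20% validation split and read off the epoch with the lowest validation loss
probe_history=probe.fit(x_train,y_train,batch_size=128,epochs=20,validation_split=0.2,verbose=2)
val_loss=probe_history.history['val_loss']
print('lowest validation loss at epoch',val_loss.index(min(val_loss))+1)  #the post reports roughly epoch 6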

 
