TensorFlow exercise 1: text classification with a neural network

TensorFlow is used in many deep-learning applications such as speech recognition and image recognition, and it runs on everything from a phone to thousands of servers. A while ago I ran some sentiment-classification experiments and used a neural network to classify the data; the results were decent, reaching 80+% accuracy.
Dataset: a Chinese review corpus. Good Chinese corpora are hard to come by, so thanks to the author!
pos data: data/pos.xls
neg data: data/neg.xls

Data processing:

import random
import pandas as pd

def loadfile():
    neg = pd.read_excel('data/neg.xls', header=None)
    pos = pd.read_excel('data/pos.xls', header=None)  # load the training corpora
    pos['mark'] = 1
    neg['mark'] = 0  # label the corpora: positive = 1, negative = 0
    pn = pd.concat([pos, neg], ignore_index=True)  # merge the two corpora
    #neglen = len(neg)
    #poslen = len(pos)  # count the samples
    #print(pn[:10], pn[-10:-1])

    print(len(pn[0].values), len(pn['mark'].values))

    with open('data/data.txt', 'w', encoding='utf-8') as f:
        for x in pn[0].values:
            f.write(str(x) + '\n')
    with open('data/label.txt', 'w', encoding='utf-8') as f:
        for x in pn['mark'].values:
            f.write(str(x) + '\n')

loadfile()  # load and merge the data

-------------------------------------------------

#word segmentation and stopword removal
import jieba
import numpy as np

with open('data/stopwords', 'r', encoding='utf-8') as f:
    stopwords = []
    for line in f.readlines():
        stopwords.append(line.strip())

def split_word():
    # segment every review with jieba and write out the space-joined tokens
    with open('data/data.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
        #lines = random.sample(lines, len(lines))  # shuffle the order
        lines_1 = []
        for line in lines:
            line = ' '.join(jieba.cut(line.strip()))
            lines_1.append(line)
        with open('data/split_data.txt', 'w', encoding='utf-8') as f1:
            for line in lines_1:
                f1.write(line + '\n')

split_word()  # must run before the stopword-removal step below

# remove stopwords from the segmented text
with open('data/split_data.txt', 'r', encoding='utf-8') as f:
    line_list = []
    for line in f.readlines():
        line = line.strip().split(' ')
        line_1 = []
        for word in line:
            if word not in stopwords:
                line_1.append(word)
        line_list.append(line_1)
    with open('data_clean.txt', 'w', encoding='utf-8') as f1:
        for line in line_list:
            f1.write(" ".join(line) + "\n")
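For reference, this is what a single sentence looks like after the same jieba segmentation and stopword filtering. The snippet is a standalone illustration with a made-up example sentence and a tiny inline stopword set, not part of the original pipeline:

# standalone illustration of the segmentation + stopword-filtering step
import jieba

sentence = "這個手機的屏幕很不錯,就是電池不太好"   # made-up example review
stopwords = {"的", "了", ",", "就是"}            # tiny stand-in for data/stopwords
tokens = [w for w in jieba.cut(sentence) if w not in stopwords]
print(" ".join(tokens))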

Stopword list (data/stopwords):

"
..
>>

/
...

8
二
<
@
]
、
,
“
”
。
-
&
《
》
…
?
^
_
(
)
#
啊
此
這
呢
哦
僅
*
+
=
0
1
2
3
4
5
6
7
8
9
@
$
【
】
[
]
矣
兮
~
>
<
{
}
了
個
呵
的
」
「
&#
;
%
.
.
:
—
TWILIGHT
,
\
;
.....

Building the vocabulary:

#coding=utf-8
import numpy as np
import random
import os
from io import open
import datetime
"""
***yuchuli
"""
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"  # 對話結束
UNK = "__UNK__"  # 標記未出現在詞彙表中的字符
START_VOCABULART = [PAD, GO, EOS, UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

dataset_path_1='data_clean.txt'
#dataset_path_2="data/sentiment_XS_test.txt"

def set_dataset_path(path):
    dataset_path=path

if not os.path.exists(dataset_path_1):
    print('training dataset is null')
    exit()

# gen_vocabulary_file: build the vocabulary file
def gen_vocabulary_file(input_file, output_file,vocab_size,input_file2=None):
    f = open(input_file, encoding='utf-8')
    train_set_x = []
    #train_set_y = []
    #test_set_x = []
    #test_set_y = []
    for line in f.readlines():
        x = line.strip()
        train_set_x.append(x)
        #train_set_y.append(y)
    f.close()

    #train_set_x = train_set_x[1:]
    vocabulary = {}

    counter = 0
    for line in train_set_x:
        counter += 1
        # print line
        tokens = line.strip().split(' ')  # note: this step once had an issue where the output was not Chinese characters
        #print(tokens)
        for word in tokens:
            if word in vocabulary:  # already in the vocabulary: increment its count
                vocabulary[word] += 1
            else:  # otherwise initialize the count to 1
                vocabulary[word] = 1
    vocabulary_list = START_VOCABULART + sorted(vocabulary, key=vocabulary.get, reverse=True)
    # print(vocabulary)
    # keep only the vocab_size most frequent tokens
    if len(vocabulary_list) > vocab_size:
        vocabulary_list = vocabulary_list[:vocab_size]

    print(input_file, " vocabulary size:", len(vocabulary_list))
    with open(output_file, "w",encoding='utf-8') as ff:
        for word in vocabulary_list:
            ff.write(word + '\n')



print ("vocabulary start convert...:")
gen_vocabulary_file(dataset_path_1,"train_set_vocabulary",20000)

Converting sentences to IDs:

#coding=utf-8
import numpy as np
import random
import os
from io import open
import datetime
"""
***yuchuli
"""
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"  # 對話結束
UNK = "__UNK__"  # 標記未出現在詞彙表中的字符
START_VOCABULART = [PAD, GO, EOS, UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

dataset_path_1='data_clean.txt'
#dataset_path_2="data/sentiment_XS_test.txt"

#  convert each tokenized sentence into a vector of word IDs
def convert_to_vector(input_file, vocabulary_file, output_file):
    starttime = datetime.datetime.now()
    tmp_vocab = []
    with open(vocabulary_file, "r", encoding='utf-8') as f:
        tmp_vocab.extend(f.readlines())  # read the vocabulary into tmp_vocab
    tmp_vocab = [line.strip() for line in tmp_vocab]  # strip trailing whitespace

    vocab = dict([(x, y) for (y, x) in enumerate(tmp_vocab)])
    # e.g. {'碩': 3142, 'v': 577, 'I': 4789, '\ue796': 4515, '拖': 1333, '疤': 2201 ...}
    # each word maps to its line index in the vocabulary file
    output_f = open(output_file, 'w', encoding='utf-8')  # output file for the encoded sentences
    train_set_x=[]
    train_set_y=[]
    with open(input_file, encoding='utf-8') as f:
        for line in f:
            x = line.strip().split(' ')
            #print (x)
            train_set_x.append(x)
    #train_set_x = train_set_x[1:]



    for line in train_set_x:
        line_vec = []
        for word in line:
            # use vocab[word] if the word is in the vocabulary, otherwise UNK_ID (3)
            line_vec.append(vocab.get(word, UNK_ID))
        #print(line_vec)
        # write the IDs space-separated, one sentence per line
        output_f.write(" ".join([str(num) for num in line_vec]) + "\n")
    output_f.close()
    endtime = datetime.datetime.now()
    print("elapsed time: %d seconds" % ((endtime - starttime).seconds))

convert_to_vector(dataset_path_1,vocabulary_file="train_set_vocabulary",output_file="train_set_encode")
#convert_to_vector(dataset_path_2,vocabulary_file="train_set_vocabulary",output_file="test_set_encode")
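As a quick sanity check (not part of the original scripts), the word-to-ID mapping can be inverted by reading the vocabulary file back in, since a word's ID is simply its line index. The snippet below assumes the two output files above have already been generated:

# sanity check: decode the first encoded sentence back into words
with open("train_set_vocabulary", encoding="utf-8") as f:
    id2word = [line.strip() for line in f]          # line index == word ID

with open("train_set_encode", encoding="utf-8") as f:
    first = f.readline().strip().split(" ")         # first encoded sentence
print([id2word[int(i)] for i in first])             # should match the first line of data_clean.txt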

I manually held out 10% of the data as a test set; a sketch of one possible way to do the split follows.
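This is only a minimal sketch, not the author's original code: it assumes the encoded data is in train_set_encode and the labels in data/label.txt, and it writes the four files that data_process.py reads (data/train_set_encode, data/train_label.txt, data/test_set_encode, data/test_label.txt):

# hypothetical 90/10 train/test split (illustrative, not from the original post)
import random

with open("train_set_encode", encoding="utf-8") as f:
    encoded = f.read().splitlines()
with open("data/label.txt", encoding="utf-8") as f:
    labels = f.read().splitlines()

pairs = list(zip(encoded, labels))
random.seed(1)
random.shuffle(pairs)                 # mix positive and negative samples
split = int(len(pairs) * 0.9)         # 90% train, 10% test

def dump(subset, x_path, y_path):
    with open(x_path, "w", encoding="utf-8") as fx, open(y_path, "w", encoding="utf-8") as fy:
        for x, y in subset:
            fx.write(x + "\n")
            fy.write(y + "\n")

dump(pairs[:split], "data/train_set_encode", "data/train_label.txt")
dump(pairs[split:], "data/test_set_encode", "data/test_label.txt")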
Next, build the classification model.
MLP model: mlp_model.py

#coding=utf-8
import tensorflow as tf
import numpy as np
class MLP_Model(object):


    def __init__(self,config,is_training=True):

        self.keep_prob=config.keep_prob
        self.batch_size=tf.Variable(0,dtype=tf.int32,trainable=False)
        self.is_training =is_training
        num_step=config.num_step
        self.input_data=tf.placeholder(tf.int32,[None,num_step])
        self.target = tf.placeholder(tf.int64,[None])
        #self.mask_x = tf.placeholder(tf.float32,[num_step,None])

        #emotion_embed_dim = config.emotion_embed_dim
        class_num=config.class_num
        hidden_neural_size=config.hidden_neural_size
        vocabulary_size=config.vocabulary_size
        max_len = config.max_len
        embed_dim=config.embed_dim
        hidden_layer_num = config.hidden_layer_num
        self.new_batch_size = tf.placeholder(tf.int32,shape=[],name="new_batch_size")
        self._batch_size_update = tf.assign(self.batch_size,self.new_batch_size)

        # Store layers weight & bias
        weights = {
            'h1': tf.Variable(tf.random_normal([embed_dim, hidden_neural_size])),
            'h2': tf.Variable(tf.random_normal([hidden_neural_size, hidden_neural_size])),
            'out': tf.Variable(tf.random_normal([hidden_neural_size, class_num]))
        }
        biases = {
            'b1': tf.Variable(tf.random_normal([hidden_neural_size])),
            'b2': tf.Variable(tf.random_normal([hidden_neural_size])),
            'out': tf.Variable(tf.random_normal([class_num]))
        }
        #build mlp network
        def multilayer_perceptron(_X, _weights, _biases):
            layer_1=[]
            layer_2=[]
            for i in range(max_len):
                if i > 0: tf.get_variable_scope().reuse_variables()
                layer_1.append(tf.nn.relu(
                    tf.add(tf.matmul(_X[i], _weights['h1']), _biases['b1'])))  # hidden layer with ReLU activation
                layer_2.append(tf.nn.relu(
                    tf.add(tf.matmul(layer_1[i], _weights['h2']), _biases['b2']))) # Hidden layer with RELU activation
            with tf.name_scope("mean_pooling_layer"):

                out_put = tf.reduce_mean(layer_2, 0)
            return tf.matmul(out_put, _weights['out']) + _biases['out']

        #lstm_fw_cell = rnn_cell.BasicLSTMCell(hidden_neural_size,forget_bias=0.0,state_is_tuple=True)
        #lstm_bw_cell = rnn_cell.BasicLSTMCell(hidden_neural_size, forget_bias=0.0,state_is_tuple=True)
        #if self.keep_prob<1:
        #    lstm_fw_cell =  rnn_cell.DropoutWrapper(
        #        lstm_fw_cell,output_keep_prob=self.keep_prob
        #    )
        #    lstm_bw_cell = rnn_cell.DropoutWrapper(
        #        lstm_bw_cell, output_keep_prob=self.keep_prob
        #    )


        #lstm_fw_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_fw_cell]*hidden_layer_num)
        #lstm_bw_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_bw_cell]*hidden_layer_num)
        #self._initial_state = cell.zero_state(self.batch_size,dtype=tf.float32)

        #embedding layer
        with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
            embedding = tf.get_variable("embedding", [vocabulary_size, embed_dim], dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)
            if self.keep_prob < 1:
                # dropout on the word embeddings
                inputs = tf.nn.dropout(inputs, self.keep_prob)
            # reshape [batch, num_step, embed_dim] into a list of num_step tensors of shape [batch, embed_dim]
            inputs_emb = tf.transpose(inputs, [1, 0, 2])
            inputs_emb = tf.reshape(inputs_emb, [-1, embed_dim])
            inputs_emb = tf.split(inputs_emb, num_step, 0)  # TF 1.x argument order

        with tf.variable_scope("mlp_layer"):
            self.logits = multilayer_perceptron(inputs_emb, weights, biases)

        #out_put=[]
        #state=self._initial_state
        #with tf.variable_scope("LSTM_layer"):
        #    for time_step in range(num_step):
        #        if time_step>0: tf.get_variable_scope().reuse_variables()
        #        (cell_output,state)=cell(inputs[:,time_step,:],state)
        #        out_put.append(cell_output)

        #out_put=out_put*self.mask_x[:,:,None]

        #with tf.name_scope("mean_pooling_layer"):

        #    out_put=tf.reduce_sum(out_put,0)/(tf.reduce_sum(self.mask_x,0)[:,None])

        #with tf.name_scope("Softmax_layer_and_output"):
        #    softmax_w = tf.get_variable("softmax_w",[2*hidden_neural_size,class_num],dtype=tf.float32)
        #    softmax_b = tf.get_variable("softmax_b",[class_num],dtype=tf.float32)
        #    self.logits = tf.matmul(outputs[-1],softmax_w)+softmax_b

        with tf.name_scope("loss"):
            self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.target)
            self.cost = tf.reduce_mean(self.loss)

        with tf.name_scope("accuracy"):
            self.prediction = tf.argmax(self.logits,1)
            correct_prediction = tf.equal(self.prediction,self.target)
            self.correct_num=tf.reduce_sum(tf.cast(correct_prediction,tf.float32))
            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32),name="accuracy")

        #add summary
        loss_summary = tf.summary.scalar("loss",self.cost)
        #add summary
        accuracy_summary=tf.summary.scalar("accuracy_summary",self.accuracy)

        if not self.is_training:
            self.saver = tf.train.Saver(tf.global_variables())
            return

        self.globle_step = tf.Variable(0,name="globle_step",trainable=False)
        self.lr = tf.Variable(0.0,trainable=False)

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      config.max_grad_norm)


        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in zip(grads, tvars):
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        self.grad_summaries_merged = tf.summary.merge(grad_summaries)

        self.summary =tf.summary.merge([loss_summary,accuracy_summary,self.grad_summaries_merged])



        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        self.new_lr = tf.placeholder(tf.float32,shape=[],name="new_learning_rate")
        self._lr_update = tf.assign(self.lr,self.new_lr)
        self.global_step = tf.Variable(0, trainable=False)
        self.saver = tf.train.Saver(tf.global_variables())


    def assign_new_lr(self,session,lr_value):
        session.run(self._lr_update,feed_dict={self.new_lr:lr_value})
    def assign_new_batch_size(self,session,batch_size_value):
        session.run(self._batch_size_update,feed_dict={self.new_batch_size:batch_size_value})
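To make the architecture concrete: each time step's embedding goes through the same two ReLU layers, the per-step outputs are mean-pooled over time, and a final linear layer produces the class logits. Below is a minimal NumPy sketch of that forward pass with toy shapes, for illustration only (it is not the TensorFlow graph above):

import numpy as np

def mlp_forward(emb, W1, b1, W2, b2, Wout, bout):
    """emb: [num_step, batch, embed_dim] -- one embedding matrix per time step."""
    relu = lambda z: np.maximum(z, 0)
    layer_2 = [relu(relu(x @ W1 + b1) @ W2 + b2) for x in emb]   # same weights at every step
    pooled = np.mean(layer_2, axis=0)                            # mean pooling over time steps
    return pooled @ Wout + bout                                  # [batch, class_num] logits

# toy shapes: num_step=4, batch=2, embed_dim=8, hidden=16, classes=2
rng = np.random.RandomState(0)
emb = rng.randn(4, 2, 8)
logits = mlp_forward(emb, rng.randn(8, 16), np.zeros(16),
                     rng.randn(16, 16), np.zeros(16),
                     rng.randn(16, 2), np.zeros(2))
print(logits.shape)  # (2, 2)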

Training the model: mlp.py

import os
import time

import tensorflow as tf
#import datetime
#from rnn_model import RNN_Model
from mlp_model import MLP_Model
import data_process

flags =tf.app.flags
FLAGS = flags.FLAGS


flags.DEFINE_integer('batch_size',64,'the batch_size of the training procedure')
flags.DEFINE_float('lr',0.1,'the learning rate')
flags.DEFINE_float('lr_decay',0.6,'the learning rate decay')
flags.DEFINE_integer('vocabulary_size',40000,'vocabulary_size')
#emotion embedding
flags.DEFINE_integer("emotion_nums",2,'emotion_nums')  # positive, negative, neutral
flags.DEFINE_integer("emotion_embed_dim",128,'emotion embedding_dim')

flags.DEFINE_integer('emdedding_dim',128,'embedding dim')
flags.DEFINE_integer('hidden_neural_size',128,'LSTM hidden neural size')
flags.DEFINE_integer('hidden_layer_num',3,'LSTM hidden layer num')
flags.DEFINE_string('dataset_path','data/subj0.pkl','dataset path')
flags.DEFINE_integer('max_len',100,'max_len of training sentence')
flags.DEFINE_integer('valid_num',100,'epoch num of validation')
flags.DEFINE_integer('checkpoint_num',1000,'epoch num of checkpoint')
flags.DEFINE_float('init_scale',0.1,'init scale')
flags.DEFINE_integer('class_num',2,'class num')
flags.DEFINE_float('keep_prob',0.5,'dropout rate')
flags.DEFINE_integer('num_epoch',81,'num epoch')
flags.DEFINE_integer('max_decay_epoch',30,'num epoch')
flags.DEFINE_integer('max_grad_norm',5,'max_grad_norm')
flags.DEFINE_string('out_dir',os.path.abspath(os.path.join(os.path.curdir,"review_runs2_81")),'output directory')
flags.DEFINE_integer('check_point_every',10,'checkpoint every num epoch ')

class Config(object):

    hidden_neural_size=FLAGS.hidden_neural_size
    vocabulary_size=FLAGS.vocabulary_size
    embed_dim=FLAGS.emdedding_dim
    #emotion
    emotion_nums=FLAGS.emotion_nums
    emotion_embed_dim=FLAGS.emotion_embed_dim
    #
    hidden_layer_num=FLAGS.hidden_layer_num
    class_num=FLAGS.class_num
    keep_prob=FLAGS.keep_prob
    lr = FLAGS.lr
    lr_decay = FLAGS.lr_decay
    batch_size=FLAGS.batch_size
    num_step = FLAGS.max_len
    max_grad_norm=FLAGS.max_grad_norm
    num_epoch = FLAGS.num_epoch
    max_decay_epoch = FLAGS.max_decay_epoch
    valid_num=FLAGS.valid_num
    out_dir=FLAGS.out_dir
    max_len = FLAGS.max_len
    checkpoint_every = FLAGS.check_point_every


def evaluate(model,session,data,global_steps=None,summary_writer=None):


    correct_num=0
    total_num=len(data[0])
    for step, (x,y,mask_x) in enumerate(data_process.batch_iter(data, batch_size=FLAGS.batch_size)):

         fetches = model.correct_num
         feed_dict={}
         feed_dict[model.input_data]=x
         feed_dict[model.target]=y
         #feed_dict[model.mask_x]=mask_x
         model.assign_new_batch_size(session,len(x))
         #state = session.run(model._initial_state)
         #for i , (c,h) in enumerate(model._initial_state):
         #   feed_dict[c]=state[i].c
         #   feed_dict[h]=state[i].h
         count=session.run(fetches,feed_dict)
         correct_num+=count

    accuracy=float(correct_num)/total_num
    dev_summary = tf.summary.scalar('dev_accuracy',accuracy)
    dev_summary = session.run(dev_summary)
    if summary_writer:
        summary_writer.add_summary(dev_summary,global_steps)
        summary_writer.flush()
    return accuracy

def run_epoch(model,session,data,global_steps,valid_model,valid_data,train_summary_writer,valid_summary_writer=None):
    for step, (x,y,mask_x) in enumerate(data_process.batch_iter(data, batch_size=FLAGS.batch_size)):

        feed_dict={}
        feed_dict[model.input_data]=x
        #feed_dict[model.emotion_state]=[]
        feed_dict[model.target]=y
        #feed_dict[model.mask_x]=mask_x
        model.assign_new_batch_size(session,len(x))
        fetches = [model.cost,model.accuracy,model.train_op,model.summary]
        #state = session.run(model._initial_state)
        #for i , (c,h) in enumerate(model._initial_state):
        #    feed_dict[c]=state[i].c
        #    feed_dict[h]=state[i].h
        cost,accuracy,_,summary = session.run(fetches,feed_dict)
        train_summary_writer.add_summary(summary,global_steps)
        train_summary_writer.flush()
        model.is_training=False
        valid_accuracy=evaluate(valid_model,session,valid_data,global_steps,valid_summary_writer)
        if(global_steps%100==0):
            print (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            print("the %i step, train cost is: %f and the train accuracy is %f and the valid accuracy is %f"%(global_steps,cost,accuracy,valid_accuracy))
        global_steps+=1

    return global_steps

def train_step():

    print("loading the dataset...")
    config = Config()
    eval_config=Config()
    eval_config.keep_prob=1.0

    train_data,valid_data,test_data= data_process.load_data(FLAGS.max_len, batch_size=config.batch_size)

    print("begin training")

    # gpu_config=tf.ConfigProto()
    # gpu_config.gpu_options.allow_growth=True
    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-1*FLAGS.init_scale,1*FLAGS.init_scale)
        with tf.variable_scope("model",reuse=None,initializer=initializer):
            model = MLP_Model(config=config,is_training=True)
        # train_summary_op = tf.merge_summary([model.loss_summary,model.accuracy])
        train_summary_dir = os.path.join(config.out_dir,"summaries","train")
        train_summary_writer =  tf.summary.FileWriter(train_summary_dir,session.graph)

        # dev_summary_op = tf.merge_summary([valid_model.loss_summary,valid_model.accuracy])
        dev_summary_dir = os.path.join(eval_config.out_dir,"summaries","dev")
        dev_summary_writer =  tf.summary.FileWriter(dev_summary_dir,session.graph)

        #add checkpoint
        checkpoint_dir = os.path.abspath(os.path.join(config.out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())


        tf.global_variables_initializer().run()
        global_steps=1
        begin_time=int(time.time())

        for i in range(config.num_epoch):
            print (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            print("the %d epoch training..."%(i+1))
            lr_decay = config.lr_decay ** max(i-config.max_decay_epoch,0.0)
            model.assign_new_lr(session,config.lr*lr_decay)
            global_steps=run_epoch(model,session,train_data,global_steps,model,valid_data,train_summary_writer,dev_summary_writer)

            if i% config.checkpoint_every==0:
                path = saver.save(session,checkpoint_prefix,global_steps)
                print("Saved model checkpoint to {}\n".format(path))

        print("the train is finished")
        end_time=int(time.time())
        print("training takes %d seconds already\n"%(end_time-begin_time))
        #test_accuracy=evaluate(test_model,session,test_data)

        #print("the test data accuracy is %f"%test_accuracy)
        print (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        print("program end!")



def main(_):
    train_step()


if __name__ == "__main__":
    tf.app.run()

Run the training with: python mlp.py

Evaluation: evalute.py

import os
import time

import numpy as np
import tensorflow as tf

#import datetime
#from rnn_model import RNN_Model
from mlp_model import MLP_Model
import data_process

flags =tf.app.flags
FLAGS = flags.FLAGS



flags.DEFINE_integer('batch_size',64,'the batch_size of the training procedure')
flags.DEFINE_float('lr',0.1,'the learning rate')
flags.DEFINE_float('lr_decay',0.6,'the learning rate decay')
flags.DEFINE_integer('vocabulary_size',40000,'vocabulary_size')
#emotion embedding
flags.DEFINE_integer("emotion_nums",2,'emotion_nums')  # positive, negative, neutral
flags.DEFINE_integer("emotion_embed_dim",128,'emotion embedding_dim')

flags.DEFINE_integer('emdedding_dim',128,'embedding dim')
flags.DEFINE_integer('hidden_neural_size',128,'LSTM hidden neural size')
flags.DEFINE_integer('hidden_layer_num',3,'LSTM hidden layer num')
flags.DEFINE_string('dataset_path','data/subj0.pkl','dataset path')
flags.DEFINE_integer('max_len',100,'max_len of training sentence')
flags.DEFINE_integer('valid_num',100,'epoch num of validation')
flags.DEFINE_integer('checkpoint_num',1000,'epoch num of checkpoint')
flags.DEFINE_float('init_scale',0.1,'init scale')
flags.DEFINE_integer('class_num',2,'class num')
flags.DEFINE_float('keep_prob',0.5,'dropout rate')
flags.DEFINE_integer('num_epoch',81,'num epoch')
flags.DEFINE_integer('max_decay_epoch',30,'num epoch')
flags.DEFINE_integer('max_grad_norm',5,'max_grad_norm')
flags.DEFINE_string('out_dir',os.path.abspath(os.path.join(os.path.curdir,"review_runs2_81")),'output directory')
flags.DEFINE_integer('check_point_every',10,'checkpoint every num epoch ')


class Config(object):

    hidden_neural_size=FLAGS.hidden_neural_size
    vocabulary_size=FLAGS.vocabulary_size
    embed_dim=FLAGS.emdedding_dim
    #emotion
    emotion_nums=FLAGS.emotion_nums
    emotion_embed_dim=FLAGS.emotion_embed_dim
    #
    hidden_layer_num=FLAGS.hidden_layer_num
    class_num=FLAGS.class_num
    keep_prob=FLAGS.keep_prob
    lr = FLAGS.lr
    lr_decay = FLAGS.lr_decay
    batch_size=FLAGS.batch_size
    num_step = FLAGS.max_len
    max_grad_norm=FLAGS.max_grad_norm
    num_epoch = FLAGS.num_epoch
    max_decay_epoch = FLAGS.max_decay_epoch
    valid_num=FLAGS.valid_num
    out_dir=FLAGS.out_dir
    max_len = FLAGS.max_len
    checkpoint_every = FLAGS.check_point_every

def evaluate(model,session,data,global_steps=None,summary_writer=None):


    #pre_label=[]
    accuracy=[]
    for step, (x,y,mask_x) in enumerate(data_process.batch_iter(data, batch_size=FLAGS.batch_size)):

         fetches = model.correct_num
         label=model.prediction
         feed_dict={}
         feed_dict[model.input_data]=x
         #feed_dict[model.target]=y
         #feed_dict[model.mask_x]=mask_x
         model.assign_new_batch_size(session,len(x))
         #state = session.run(model._initial_state)
         #for i , (c,h) in enumerate(model._initial_state):
         #   feed_dict[c]=state[i].c
         #   feed_dict[h]=state[i].h
         #count=session.run(fetches,feed_dict)
         pre=session.run(label,feed_dict)
         correct_num=0
         #pre_label.append(pre)
         for i in range(len(pre)):
             if pre[i]== y[i]:
                 correct_num +=1
         accuracy.append(correct_num/len(pre))



    #accuracy=float(correct_num)/total_num
    #dev_summary = tf.summary.scalar('dev_accuracy',accuracy)
    #dev_summary = session.run(dev_summary)
    '''
    if summary_writer:
        summary_writer.add_summary(dev_summary,global_steps)
        summary_writer.flush()
    '''
    return accuracy

def test_step():
    print("loading the dataset...")
    config = Config()
    eval_config = Config()
    eval_config.keep_prob = 1.0
    train_data, valid_data, test_data = data_process.load_data(FLAGS.max_len, batch_size=config.batch_size)
    print("begin testing....")
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    with tf.Session() as session:
        initializer = tf.random_uniform_initializer(-1 * FLAGS.init_scale, 1 * FLAGS.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):

            test_model = MLP_Model(config=eval_config, is_training=False)
            curdir = os.path.abspath(os.path.join(config.out_dir, "checkpoints"))
            #curdir ="D:\\emotion_classifier\\runs3_60\\checkpoints\\model-18922"
            ckpt = tf.train.get_checkpoint_state(curdir)
            if ckpt != None:
                print(ckpt.model_checkpoint_path)
                test_model.saver.restore(session, ckpt.model_checkpoint_path)
            else:
                print("checkpoint path does not exist, exiting!")
                tf.global_variables_initializer().run()
                return

        accs = evaluate(test_model, session, test_data)
        accuracy = np.mean(accs)
        print("test accuracy: %f" % accuracy)

test_step()

Data processing module: data_process.py

#coding=utf-8
import numpy as np
import random
import os
from io import open
import string
import datetime
"""
***yuchuli
"""
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"  # 對話結束
UNK = "__UNK__"  # 標記未出現在詞彙表中的字符
START_VOCABULART = [PAD, GO, EOS, UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

dataset_path_1='data/train_label.txt'
dataset_path_2="data/test_label.txt"

def set_dataset_path(path):
    dataset_path=path

if not os.path.exists(dataset_path_1):
    print('training dataset is null')
    exit()

if not os.path.exists(dataset_path_2):
    print('test dataset is null')
    exit()


def load_data(max_len,batch_size,n_words=40000,valid_portion=0.2,sort_by_len=False):

    f=open(dataset_path_1,'rb')
    f1=open(dataset_path_2,'rb')
    f2=open('data/train_set_encode','rb')
    f3=open('data/test_set_encode','rb')

    print ('load training label from %s\nload test label from %s'%(dataset_path_1,dataset_path_2))

    train_set_x=[]
    train_set_y=[]
    test_set_x=[]
    test_set_y=[]

    #load label
    for line in f.readlines():
        y=int(line.strip())
        train_set_y.append(y)

    for line1 in f1.readlines():
        y = int(line1.strip())
        test_set_y.append(y)

    #get the trainset
    for line in f2.readlines():
        line=line.decode('utf-8').strip().split(' ')
        train_set_x.append(line)
    for line in f3.readlines():
        line=line.decode('utf-8').strip().split(' ')
        test_set_x.append(line)
    f.close()
    f1.close()
    f2.close()
    f3.close()

    #string matrix-->int matrix
    def string_to_int(input):
        output=[]
        for line in input:
            line_vec=[]
            for word in line:
                num=int(word)
                line_vec.append(num)
            output.append(line_vec)
        return output

    train_set_x=string_to_int(train_set_x)

    test_set_x=string_to_int(test_set_x)

    valid_set_y=[]
    valid_set_x=[]

    #split train/valid set
    n_samples = len(train_set_x)

    sidx=np.random.permutation(n_samples)
    n_train = int(np.round(n_samples * (1. - valid_portion)))
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    train_set_y = [train_set_y[s] for s in sidx[:n_train]]
    train_set=(train_set_x,train_set_y)
    valid_set=(valid_set_x,valid_set_y)
    test_set=(test_set_x,test_set_y)
    # map out-of-vocabulary word IDs to UNK_ID
    def remove_unk(x):
        return [[UNK_ID if w >= n_words else w for w in sen] for sen in x]

    test_set_x, test_set_y = test_set
    valid_set_x, valid_set_y = valid_set
    train_set_x, train_set_y = train_set

    train_set_x = remove_unk(train_set_x)
    valid_set_x = remove_unk(valid_set_x)
    test_set_x = remove_unk(test_set_x)

    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    if sort_by_len:
        sorted_index = len_argsort(test_set_x)
        test_set_x = [test_set_x[i] for i in sorted_index]
        test_set_y = [test_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(valid_set_x)
        valid_set_x = [valid_set_x[i] for i in sorted_index]
        valid_set_y = [valid_set_y[i] for i in sorted_index]


        sorted_index = len_argsort(train_set_x)
        train_set_x = [train_set_x[i] for i in sorted_index]
        train_set_y = [train_set_y[i] for i in sorted_index]

    train_set = (train_set_x, train_set_y)
    valid_set = (valid_set_x, valid_set_y)
    test_set = (test_set_x, test_set_y)

    new_train_set_x = np.zeros([len(train_set[0]), max_len])
    new_train_set_y = np.zeros(len(train_set[0]))

    new_valid_set_x = np.zeros([len(valid_set[0]), max_len])
    new_valid_set_y = np.zeros(len(valid_set[0]))

    new_test_set_x = np.zeros([len(test_set[0]), max_len])
    new_test_set_y = np.zeros(len(test_set[0]))

    mask_train_x = np.zeros([max_len, len(train_set[0])])
    mask_valid_x = np.zeros([max_len, len(valid_set[0])])
    mask_test_x = np.zeros([max_len, len(test_set[0])])
    #padding
    def padding_and_generate_mask(x, y, new_x, new_y, new_mask_x):
        for i, (x, y) in enumerate(zip(x, y)):
            if len(x) <= max_len:
                new_x[i, 0:len(x)] = x
                new_mask_x[0:len(x), i] = 1
                new_y[i] = y
            else:
                new_x[i] = (x[0:max_len])
                new_mask_x[:, i] = 1
                new_y[i] = y
        new_set = (new_x, new_y, new_mask_x)
        del new_x, new_y
        return new_set

    train_set = padding_and_generate_mask(train_set[0], train_set[1], new_train_set_x, new_train_set_y, mask_train_x)

    valid_set = padding_and_generate_mask(valid_set[0], valid_set[1], new_valid_set_x, new_valid_set_y, mask_valid_x)

    test_set = padding_and_generate_mask(test_set[0], test_set[1], new_test_set_x, new_test_set_y, mask_test_x)

    return train_set,valid_set,test_set


#return batch dataset
def batch_iter(data,batch_size):

    #get dataset and label
    x, y, mask_x = data  # unpack inputs, labels, and mask
    x=np.array(x)
    y=np.array(y)
    data_size=len(x)
    num_batches_per_epoch=int((data_size-1)/batch_size)+1
    for batch_index in range(num_batches_per_epoch):
        start_index=batch_index*batch_size
        end_index=min((batch_index+1)*batch_size,data_size)
        return_x = x[start_index:end_index]
        return_y = y[start_index:end_index]
        return_mask_x = mask_x[:,start_index:end_index]
        # if(len(return_x)<batch_size):
        #     print(len(return_x))
        #     print return_x
        #     print return_y
        #     print return_mask_x
        #     import sys
        #     sys.exit(0)

        yield (return_x,return_y,return_mask_x)

The final test accuracy comes out around 86%, which is decent.
