MNIST手寫數字數據集通常做爲深度學習的練習數據集,這個數據集恐怕早已經被大家玩壞了。本帖就介紹一個和MNIST類似,同時又適合國人練習的數據集-手寫漢字數據集,然後訓練一個簡單的Deep Convolutional Network識別手寫漢字。
識別手寫漢字要比識別手寫洋文難上很多。首先,英文字符的分類少,總共10+26*2;而中文總共50,000多漢字,常用的就有3000多。其次,漢字有書法,每個人書寫風格多樣。
手寫漢字數據集: CASIA-HWDB
下載HWDB1.1數據集:
$ wget http://www.nlpr.ia.ac.cn/databases/download/feature_data/HWDB1.1trn_gnt.zip
# zip解壓沒得說, 之後還要解壓alz壓縮文件
$ wget http://www.nlpr.ia.ac.cn/databases/download/feature_data/HWDB1.1tst_gnt.zip
這個數據集由 模式識別國家重點實驗室 共享,它還共享了其它幾個數據庫,先mark:
- 行爲分析數據庫
- 三維人臉數據庫
- 中文語言資源庫
- 步態數據庫
- 掌紋數據庫
- 虹膜數據庫
手寫漢字的樣子:
import os
import numpy as np
import struct
import PIL.Image

train_data_dir = "HWDB1.1trn_gnt"
test_data_dir = "HWDB1.1tst_gnt"


def read_from_gnt_dir(gnt_dir=train_data_dir):
    """Yield ``(image, tagcode)`` for every sample in the ``.gnt`` files of *gnt_dir*.

    Each record is a 10-byte header followed by ``width * height`` pixel bytes:
    bytes 0-3 hold the record size (little-endian), bytes 4-5 the character
    tag code, bytes 6-7 the width and bytes 8-9 the height (little-endian).

    Args:
        gnt_dir: Directory that is scanned (non-recursively) for ``*.gnt`` files.

    Yields:
        Tuples ``(image, tagcode)`` where ``image`` is a ``(height, width)``
        uint8 array and ``tagcode`` is a Python int such that
        ``struct.pack('>H', tagcode)`` gives back the two tag bytes.
    """
    header_size = 10

    def one_file(f):
        while True:
            header = np.fromfile(f, dtype='uint8', count=header_size)
            # An empty read means EOF; a short read means a truncated record.
            if header.size < header_size:
                break
            # Decode through Python ints: shifting/adding uint8 scalars can
            # wrap around in 8 bits depending on NumPy's promotion rules.
            raw = header.tobytes()
            sample_size = int.from_bytes(raw[0:4], 'little')
            tagcode = int.from_bytes(raw[4:6], 'big')
            width = int.from_bytes(raw[6:8], 'little')
            height = int.from_bytes(raw[8:10], 'little')
            # A record whose declared size disagrees with its dimensions
            # means the stream is corrupt; stop reading this file.
            if header_size + width * height != sample_size:
                break
            pixels = np.fromfile(f, dtype='uint8', count=width * height)
            if pixels.size != width * height:
                # Image data cut short: treat like any other corrupt record.
                break
            yield pixels.reshape((height, width)), tagcode

    for file_name in os.listdir(gnt_dir):
        if file_name.endswith('.gnt'):
            file_path = os.path.join(gnt_dir, file_name)
            with open(file_path, 'rb') as f:
                for image, tagcode in one_file(f):
                    yield image, tagcode


# Count the samples in the training and test sets.
train_counter = 0
test_counter = 0
for image, tagcode in read_from_gnt_dir(gnt_dir=train_data_dir):
    tagcode_unicode = struct.pack('>H', tagcode).decode('gb2312')
    # Uncomment to dump the first 1000 images and see what they look like:
    # if train_counter < 1000:
    #     im = PIL.Image.fromarray(image)
    #     im.convert('RGB').save('png/' + tagcode_unicode + str(train_counter) + '.png')
    train_counter += 1
for image, tagcode in read_from_gnt_dir(gnt_dir=test_data_dir):
    tagcode_unicode = struct.pack('>H', tagcode).decode('gb2312')
    test_counter += 1

# Number of samples in each split.
print(train_counter, test_counter)
由於時間和系統資源有限,我只使用數據集的一部分(只識別最常用的140個漢字)。
訓練模型
import os
import struct

import numpy as np
import PIL.Image
import scipy.misc
from sklearn.utils import shuffle
import tensorflow as tf

train_data_dir = "HWDB1.1trn_gnt"
test_data_dir = "HWDB1.1tst_gnt"


def read_from_gnt_dir(gnt_dir=train_data_dir):
    """Yield ``(image, tagcode)`` for every sample in the ``.gnt`` files of *gnt_dir*.

    Each record is a 10-byte header followed by ``width * height`` pixel bytes:
    bytes 0-3 hold the record size (little-endian), bytes 4-5 the character
    tag code, bytes 6-7 the width and bytes 8-9 the height (little-endian).

    Args:
        gnt_dir: Directory that is scanned (non-recursively) for ``*.gnt`` files.

    Yields:
        Tuples ``(image, tagcode)`` where ``image`` is a ``(height, width)``
        uint8 array and ``tagcode`` is a Python int such that
        ``struct.pack('>H', tagcode)`` gives back the two tag bytes.
    """
    header_size = 10

    def one_file(f):
        while True:
            header = np.fromfile(f, dtype='uint8', count=header_size)
            # An empty read means EOF; a short read means a truncated record.
            if header.size < header_size:
                break
            # Decode through Python ints: shifting/adding uint8 scalars can
            # wrap around in 8 bits depending on NumPy's promotion rules.
            raw = header.tobytes()
            sample_size = int.from_bytes(raw[0:4], 'little')
            tagcode = int.from_bytes(raw[4:6], 'big')
            width = int.from_bytes(raw[6:8], 'little')
            height = int.from_bytes(raw[8:10], 'little')
            # A record whose declared size disagrees with its dimensions
            # means the stream is corrupt; stop reading this file.
            if header_size + width * height != sample_size:
                break
            pixels = np.fromfile(f, dtype='uint8', count=width * height)
            if pixels.size != width * height:
                # Image data cut short: treat like any other corrupt record.
                break
            yield pixels.reshape((height, width)), tagcode

    for file_name in os.listdir(gnt_dir):
        if file_name.endswith('.gnt'):
            file_path = os.path.join(gnt_dir, file_name)
            with open(file_path, 'rb') as f:
                for image, tagcode in one_file(f):
                    yield image, tagcode


# The 140 most common characters, used as the label set.
# They must be Simplified characters: labels are decoded with the gb2312
# codec below, which cannot produce Traditional forms, so a Traditional
# entry would never match any sample.
char_set = "的一是了我不人在他有这个上们来到时大地为子中你说生国年着就那和要她出也得里后自以会家可下而过天去能对小多然于心学么之都好看起发当没成只如事把还用第样道想作种开美总从无情己面最女但现前些所同日手又行意动方期它头经长儿回位分爱老因很给名法间斯知世什两次使身者被高已亲其进此话常与活正感"

num_classes = len(char_set)
image_size = 64  # side length of the square network input
border = 4       # white margin added around the resized glyph


def resize_and_normalize_image(img):
    """Turn a grayscale glyph into a flat, normalised 64*64 float vector.

    The image is padded with white (255) to a square, resized so that a
    4-pixel white border fits around it, flattened, and mapped from
    [0, 255] to roughly [-1, 1).

    Args:
        img: 2-D uint8 array of shape ``(height, width)``.

    Returns:
        1-D float32 array of length ``64 * 64``.
    """
    # Pad the shorter side to make the image square. The two pads differ by
    # one when the size difference is odd, so the result is exactly square.
    height, width = img.shape
    diff = abs(height - width)
    pad_before = diff // 2
    pad_after = diff - pad_before
    if height < width:
        pad_dims = ((pad_before, pad_after), (0, 0))
    else:
        pad_dims = ((0, 0), (pad_before, pad_after))
    img = np.pad(img, pad_dims, mode='constant', constant_values=255)
    # Resize, then add the white border.
    # NOTE: scipy.misc.imresize is deprecated since SciPy 1.0 and removed in
    # SciPy 1.3; this script needs an older SciPy.
    inner = image_size - border * 2
    img = scipy.misc.imresize(img, (inner, inner))
    img = np.pad(img, ((border, border), (border, border)),
                 mode='constant', constant_values=255)
    assert img.shape == (image_size, image_size)
    img = img.flatten()
    # Scale pixel values to [-1, 1). Convert to float first: subtracting
    # 128 from a uint8 array wraps around instead of going negative.
    img = (img.astype(np.float32) - 128) / 128
    return img


def convert_to_one_hot(char):
    """Return the one-hot vector of length ``len(char_set)`` for *char*.

    Raises:
        ValueError: If *char* is not in ``char_set``.
    """
    vector = np.zeros(len(char_set))
    vector[char_set.index(char)] = 1
    return vector


def _load_dataset(gnt_dir):
    """Load, preprocess and shuffle every sample of *gnt_dir* whose label is in ``char_set``."""
    data_x = []
    data_y = []
    for image, tagcode in read_from_gnt_dir(gnt_dir=gnt_dir):
        tagcode_unicode = struct.pack('>H', tagcode).decode('gb2312')
        if tagcode_unicode in char_set:
            data_x.append(resize_and_normalize_image(image))
            data_y.append(convert_to_one_hot(tagcode_unicode))
    # Fixed seed keeps the sample order reproducible between runs.
    return shuffle(data_x, data_y, random_state=0)


# The selected subset is small enough to load into RAM in one go.
train_data_x, train_data_y = _load_dataset(train_data_dir)

batch_size = 128
num_batch = len(train_data_x) // batch_size

text_data_x, text_data_y = _load_dataset(test_data_dir)


X = tf.placeholder(tf.float32, [None, image_size * image_size])
Y = tf.placeholder(tf.float32, [None, num_classes])
keep_prob = tf.placeholder(tf.float32)


def chinese_hand_write_cnn():
    """Build the network on placeholder ``X`` and return the class logits.

    Two 3x3 conv + ReLU + 2x2 max-pool stages (32 and 64 filters), then a
    1024-unit ReLU dense layer with dropout controlled by ``keep_prob``,
    then a linear layer with one output per entry of ``char_set``.
    """
    x = tf.reshape(X, shape=[-1, image_size, image_size, 1])

    # Convolutional layers.
    w_c1 = tf.Variable(tf.random_normal([3, 3, 1, 32], stddev=0.01))
    b_c1 = tf.Variable(tf.zeros([32]))
    conv1 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(x, w_c1, strides=[1, 1, 1, 1], padding='SAME'), b_c1))
    conv1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    w_c2 = tf.Variable(tf.random_normal([3, 3, 32, 64], stddev=0.01))
    b_c2 = tf.Variable(tf.zeros([64]))
    conv2 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(conv1, w_c2, strides=[1, 1, 1, 1], padding='SAME'), b_c2))
    conv2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # A third conv layer was tried and disabled: after an overnight run the
    # accuracy was below 10%, so the network was made smaller.
    # w_c3 = tf.Variable(tf.random_normal([3, 3, 64, 128], stddev=0.01))
    # b_c3 = tf.Variable(tf.zeros([128]))
    # conv3 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(conv2, w_c3, strides=[1, 1, 1, 1], padding='SAME'), b_c3))
    # conv3 = tf.nn.max_pool(conv3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    # conv3 = tf.nn.dropout(conv3, keep_prob)

    # Fully connected layer. Two 2x2 poolings reduce 64x64 to 16x16, with
    # 64 channels: 16 * 16 * 64 inputs.
    w_d = tf.Variable(tf.random_normal([16 * 16 * 64, 1024], stddev=0.01))
    b_d = tf.Variable(tf.zeros([1024]))
    dense = tf.reshape(conv2, [-1, w_d.get_shape().as_list()[0]])
    dense = tf.nn.relu(tf.add(tf.matmul(dense, w_d), b_d))
    dense = tf.nn.dropout(dense, keep_prob)

    w_out = tf.Variable(tf.random_normal([1024, num_classes], stddev=0.01))
    b_out = tf.Variable(tf.zeros([num_classes]))
    out = tf.add(tf.matmul(dense, w_out), b_out)

    return out


def train_hand_write_cnn(num_epochs=50, eval_every=100):
    """Train the network and log loss/accuracy summaries to ``./log``.

    Args:
        num_epochs: Number of passes over the training data.
        eval_every: Accuracy on the first 500 test samples is printed every
            this many training steps.
    """
    output = chinese_hand_write_cnn()

    # Keyword arguments: later TensorFlow releases reject positional
    # logits/labels here.
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=Y))
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(output, 1), tf.argmax(Y, 1)), tf.float32))

    # TensorBoard summaries.
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("accuracy", accuracy)
    merged_summary_op = tf.summary.merge_all()

    # NOTE: the saver is created but no checkpoint is ever written.
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Run `tensorboard --logdir=./log` and open http://0.0.0.0:6006
        summary_writer = tf.summary.FileWriter('./log', graph=tf.get_default_graph())

        for e in range(num_epochs):
            for i in range(num_batch):
                step = e * num_batch + i
                batch_x = train_data_x[i * batch_size: (i + 1) * batch_size]
                batch_y = train_data_y[i * batch_size: (i + 1) * batch_size]
                _, loss_, summary = sess.run([optimizer, loss, merged_summary_op],
                                             feed_dict={X: batch_x, Y: batch_y, keep_prob: 0.5})
                # Write a summary for every step.
                summary_writer.add_summary(summary, step)
                print(step, loss_)

                # The modulo must apply to the whole step number; without
                # the parentheses `%` binds only to `i` and the check is
                # true at step 0 alone.
                if step % eval_every == 0:
                    # Evaluate accuracy with dropout disabled.
                    acc = sess.run(accuracy,
                                   feed_dict={X: text_data_x[:500], Y: text_data_y[:500], keep_prob: 1.})
                    print(step, acc)


train_hand_write_cnn()