(二)python爬蟲驗證碼識別(去除干擾線)
釘釘釘~繼完成第一波的任務之後,又來第二波了!!!!!!
1.開發環境與工具
- python36:sklearn、pytesser、opencv等
- pycharm
- windows7
2.數據集
3.解決思想討論
觀察驗證碼,發現這次驗證碼和之前的驗證碼不同:
(1)驗證碼類型:6位驗證碼,有數字字母,分類較多
(2)驗證碼分割:驗證碼字符位置隨機,不固定,有些驗證碼字符甚至疊加在一起,而且出現的概率很高,基本佔一半。如果進行圖片切割,就會喪失一定的信息,識別精度也會很低,所以初步想法是,在較小標註樣本的情況下,不進行圖片分割,嘗試使用遷移學習VGG16來進行驗證碼識別,看是否能夠提高精度。
(3)噪聲去除:由於噪聲的顏色有時候會和字母的顏色相同或相近,不適合用之前那種方法。觀察驗證碼,可根據點噪聲方法來去噪。
4.解決方案
1、遷移學習: 在訓練集爲63000張、進行去噪但不進行圖片分割等預處理之後,嘗試使用遷移學習VGG16,參考鏈接:tensorflow vgg16、使用CNN進行4位驗證碼識別,結果效果不佳,大概驗證碼單個數字的準確率在65%左右,訓練過程慢且對gpu要求較高。
2、KNN分類:由於遷移學習調參過程複雜麻煩,而且由於使用的遷移模型較複雜,訓練時間比較久,自然遲遲都沒取得實質性的效果,沒辦法交任務,於是想着,放棄那些重疊的驗證碼,嘗試使用之前的驗證碼識別的方法,對圖片進行分割,看效果如何,大概做了一個小時之後,發現效果還行,準確度達54%,但是因爲KNN算法的特性,訓練集越多,訓練得到的模型就越大(大概1G),這樣不僅模型過大佔內存,而且預測效率也很低。但是,終於可以交任務啦!!!!!!!
3、CNN分類:針對KNN一些缺點,博主覺得還有待改進,使用遷移學習,有點殺雞用牛刀,然後寫了個CNN看效果如何,由於時間緊,初步取了個模型,發現,效果不錯,訓練得到的模型大小爲101M,預測效率也高了好幾倍,精度在77%左右。
5.預測結果
KNN
CNN
6.圖片預處理代碼
圖片去噪、進行圖片擴充,224*224
# -*- coding:utf-8 -*-
import cv2
import os
import numpy as np
import copy
def del_noise(img, number):
    """Remove isolated dark specks from a binarized image.

    For every interior pixel, count how many of the nine pixels in its 3x3
    neighbourhood (the pixel itself included) are black (value 0).  When that
    count is less than or equal to ``number`` the pixel is treated as noise
    and set to white (255).  The outermost one-pixel border is never modified
    because its neighbourhood is incomplete.

    Args:
        img: 2-D binarized image as a numpy array (0 = black).
        number: Largest black-pixel count that is still treated as noise.
            The comparison is inclusive (``count <= number``).

    Returns:
        A new array with the noise pixels set to 255; ``img`` is not changed.
    """
    height = img.shape[0]
    width = img.shape[1]
    img_new = img.copy()

    # 1 where the pixel is black, 0 elsewhere, widened to int so that the
    # neighbourhood sum cannot overflow a small image dtype.
    black = (img == 0).astype(int)

    # Adding the nine shifted views of ``black`` yields, for each interior
    # pixel, the number of black pixels in its 3x3 neighbourhood.  This is the
    # same count the per-pixel loop produced, computed in one pass.
    counts = sum(
        black[di:height - 2 + di, dj:width - 2 + dj]
        for di in range(3)
        for dj in range(3)
    )

    # ``interior`` is a view, so the assignment writes into ``img_new``.
    interior = img_new[1:height - 1, 1:width - 1]
    interior[counts <= number] = 255
    return img_new
if __name__=='__main__':
    # Batch-preprocess every labelled captcha in img_dir: binarize, denoise,
    # smooth, pad with a white border and save under the label as file name.
    img_dir = './img_down_sets/corpus_manual/test'
    out_dir = './img_down_sets/new_corpus'
    # cv2.imwrite does not create missing folders, it only reports failure
    # through its return value, so make sure the target exists up front.
    os.makedirs(out_dir, exist_ok=True)
    img_name = os.listdir(img_dir)  # every entry (files and folders) in img_dir
    for i, file_name in enumerate(img_name):
        # The first six characters of the file name are the captcha label.
        name_list = list(file_name)[:6]
        if '.' in name_list:
            # A dot inside the first six characters means the label is too short.
            print("%s標籤錯誤,請重新標籤!" % file_name)
            continue
        image = cv2.imread(os.path.join(img_dir, file_name))
        if image is None:
            # imread returns None instead of raising when the entry is not a
            # decodable image (e.g. a sub-folder); skip it rather than crash.
            print("%s could not be read as an image, skipped" % file_name)
            continue
        name = ''.join(name_list)
        # Grayscale
        grayImage = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Binarize with a Gaussian adaptive threshold (block size 21, C = 1)
        result = cv2.adaptiveThreshold(grayImage, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 21, 1)
        # Point-noise removal in three passes with a decreasing threshold
        img = del_noise(result, 6)
        img = del_noise(img, 4)
        img = del_noise(img, 3)
        # Additional smoothing with a bilateral filter
        im_temp = cv2.bilateralFilter(src=img, d=15, sigmaColor=130, sigmaSpace=150)
        # Drop the outer one-pixel frame, which del_noise never touches.
        im_temp = im_temp[1:-1,1:-1]
        # Pad with white: 83 px top/bottom, 13 px left/right.
        im_temp = cv2.copyMakeBorder(im_temp, 83, 83, 13, 13, cv2.BORDER_CONSTANT, value=[255])
        cv2.imwrite(out_dir + '/%s.jpg' % (name), im_temp)
        print("%s %s.jpg"%(i,name))
    print("圖片預處理完成!")
7.圖片切割代碼
分割得60*34
#-*-coding:utf-8 -*-
import cv2
import os
def cut_image(image, num, img_name):
    """Cut a preprocessed captcha into six character tiles and save them.

    Each tile covers rows 80..139 and is 34 columns wide; the first tile
    starts at column 23 and every following tile starts 30 columns further
    right, so neighbouring tiles overlap by 4 columns.

    Args:
        image: BGR image as returned by ``cv2.imread``.
        num: Index of the source image, used as the file-name prefix.
        img_name: Sequence holding the six label characters, one per tile.

    Each tile is written to ``./img_cut_train/<num>_<position>_<char>.jpg``.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Left edges 23, 53, 83, 113, 143, 173.
    tiles = [gray[80:140, left:left + 34] for left in range(23, 174, 30)]
    for position, tile in enumerate(tiles):
        target = './img_cut_train/' + str(num) + '_' + str(position) + '_' + img_name[position] + '.jpg'
        cv2.imwrite(target, tile)
if __name__ == '__main__':
    # Cut every preprocessed captcha in img_dir into single-character tiles.
    img_dir = './new_corpus'
    # cut_image writes into this folder; cv2.imwrite fails silently if it is
    # missing, so create it before starting.
    os.makedirs('./img_cut_train', exist_ok=True)
    img_name = os.listdir(img_dir)  # every entry (files and folders) in img_dir
    for i, file_name in enumerate(img_name):
        image = cv2.imread(os.path.join(img_dir, file_name))
        if image is None:
            # imread returns None for anything it cannot decode; passing that
            # on would crash inside cut_image.
            print('%s could not be read as an image, skipped' % file_name)
        else:
            # The first six characters of the file name are the labels.
            cut_image(image, i, list(file_name)[:6])
        if i %2000==0:
            print('圖片%s分割完成' % (i))
    print(u'*****圖片分割預處理完成!*****')
8.KNN代碼
knn代碼與驗證碼(一)所用方法相似,在這不再貼代碼。
9.CNN代碼
vec_text.py(應要求貼出,加載數據的代碼塊)
#-*-coding:utf-8 -*-
import numpy as np
import os
import cv2
def text2vec(labels):
    """One-hot encode a single captcha character.

    The alphabet is the digits 2-9 followed by the upper-case letters A-Z
    (34 symbols).  The returned list has a 1 at the position of ``labels``
    and 0 everywhere else; a character outside the alphabet yields all zeros.
    """
    digits = ['2', '3', '4', '5', '6', '7', '8', '9']
    letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
               'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    charset = digits + letters
    return [1 if symbol == labels else 0 for symbol in charset]
def vec2text(index):
    """Return the captcha character stored at ``index`` of the alphabet.

    The alphabet is the digits 2-9 followed by the upper-case letters A-Z,
    in that order.
    """
    charset = list('23456789') + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    return charset[index]
def load_data(img_dir):
    """Load every character tile in ``img_dir`` together with its label.

    The label is the fifth character from the end of the file name, i.e. the
    character right before a four-character extension such as ``.jpg``.

    Args:
        img_dir: Folder containing the tile images.

    Returns:
        ``(x, y)`` where ``x`` is the array of grayscale images and ``y`` the
        array of one-hot label vectors produced by ``text2vec``.
    """
    data = []
    labels = []
    for file_name in os.listdir(img_dir):
        # Flag 0 makes OpenCV decode the file directly as a single-channel
        # grayscale image.
        image = cv2.imread(os.path.join(img_dir, file_name), 0)
        if image is None:
            # imread returns None for entries it cannot decode.  Keeping such
            # an entry would put a None among the images and break the array
            # built below, so skip it and keep x and y aligned.
            print('%s could not be read as an image, skipped' % file_name)
            continue
        data.append(image)
        labels.append(text2vec(file_name[-5]))
    x = np.array(data)
    y = np.array(labels)
    return x, y
#
# img_dir = './img'
# x, y = load_data(img_dir)
# print(x.shape)
# print(y.shape)
訓練CNN模型
# -*- coding:utf-8 -*-
import tensorflow as tf
import os
from sklearn.model_selection import train_test_split
import cv2
import numpy as np
from vec_text import text2vec,load_data
def weight_variable(shape):
    """Create a variable named 'w' drawn from a truncated normal (stddev 0.001)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.001), name='w')
def bias_variable(shape):
    """Create a variable named 'b' with every element initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape), name='b')
def conv2d(x, W):
    """Convolve ``x`` with kernel ``W`` using stride 1 and SAME padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')
def max_pool(x):
    """Apply 2x2 max pooling with stride 2 and SAME padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')
# Build the network graph.  The scope names and the order in which variables
# are created determine the variable names stored in checkpoints, so they must
# stay in sync with the graph rebuilt by the prediction script.
with tf.variable_scope("Input"):
    # One 60x34 single-channel tile per sample; reshaped to NHWC for conv2d.
    x = tf.placeholder(tf.float32,[None,60,34],name='x')
    x_image = tf.reshape(x,[-1,60,34,1])
    # One-hot label over the 34-character alphabet.
    y = tf.placeholder(tf.float32,[None,34],name='y')
with tf.variable_scope("Cnn_net"):
    # Layer 1: 3x3 convolution, 32 feature maps, then 2x2 max pooling
    with tf.variable_scope("conv_1"):
        w_conv1 = weight_variable([3,3,1,32])
        b_conv1 = bias_variable([32])
        h_conv1 = tf.nn.relu(conv2d(x_image,w_conv1) + b_conv1)
        h_pool1 = max_pool(h_conv1)
    # Layer 2: 5x5 convolution, 64 feature maps, then 2x2 max pooling
    with tf.variable_scope("conv_2"):
        w_conv2 = weight_variable([5,5,32,64])
        b_conv2 = bias_variable([64])
        h_conv2 = tf.nn.relu(conv2d(h_pool1,w_conv2) + b_conv2)
        h_pool2 = max_pool(h_conv2)
    # Layer 3: fully connected, 1024 units.  Two SAME-padded 2x2 poolings
    # shrink 60x34 to 15x9, hence the 15*9*64 input width.
    with tf.variable_scope("full_connect"):
        w_fc1 = weight_variable([15*9*64, 1024])
        # NOTE(review): this bias is created with weight_variable rather than
        # bias_variable, so it is named after 'w' and starts from the
        # truncated normal instead of 0.1.  The prediction script does the
        # same; switching to bias_variable here would rename the variable and
        # make existing checkpoints unloadable.
        b_fc1 = weight_variable([1024])
        h_pool2_flat = tf.reshape(h_pool2, [-1,15*9*64])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat , w_fc1)+b_fc1)
    # Dropout; keep_prob is fed at run time (0.5 for training, 1.0 for evaluation)
    with tf.variable_scope("dropout"):
        keep_prob = tf.placeholder(tf.float32)
        h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    # Layer 4: softmax output over the 34 classes
    with tf.variable_scope("softmax"):
        w_fc2 = weight_variable([1024,34])
        b_fc2 = bias_variable([34])
        y_out = tf.nn.softmax(tf.matmul(h_fc1_drop,w_fc2)+b_fc2,name="output")
# Training and evaluation ops
cross_entropy = -tf.reduce_sum(y * tf.log(tf.clip_by_value(y_out,1e-10,1.0))) # cross entropy summed over the batch; probabilities are clipped to avoid log(0)
train_step = tf.train.AdamOptimizer(2e-6).minimize(cross_entropy) # Adam optimizer with a learning rate of 2e-6
correct_prediction = tf.equal(tf.argmax(y_out,1), tf.argmax(y,1)) # whether the predicted class matches the true class
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.summary.scalar('accuracy', accuracy)
tf.summary.scalar('loss', cross_entropy)
# NOTE(review): leftover comment, no matching code follows it here --
# "convert a label to a vector: input '2', output array [0,0,1....,0]"
sess = tf.Session()
sess.run(tf.global_variables_initializer())
print('New_built')
# Summaries (and the graph definition) go to ./logs/cnn.
writer = tf.summary.FileWriter('./logs/cnn', sess.graph)
merged = tf.summary.merge_all()
# Save the model
def save(path='./models/cnn', step=1):
    """Write the current session's variables to ``path``, suffixed with ``step``.

    A fresh Saver is created on every call and the meta graph is not written.
    """
    tf.train.Saver().save(sess, path, global_step=step, write_meta_graph=False)
# Load the character tiles and hold out a small slice (0.3%) for evaluation.
img_dir = './img_cut_train'
x_data, y_data = load_data(img_dir)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.003)
try:
    for i in range(3000000):
        step = i + 1
        # Random mini-batch of 100 samples, drawn with replacement.
        b_idx = np.random.randint(0, len(x_train), 100)
        train_loss, __ , train_merged= sess.run([cross_entropy, train_step, merged], {x: x_train[b_idx], y: y_train[b_idx], keep_prob: 0.5})
        if step % 100 == 0:
            print(str(step),"train loss:",train_loss)
        if step % 1000 == 0:
            # Evaluate on the held-out data with dropout disabled.
            accuracy_result, test_merged = sess.run([accuracy,merged], {x: x_test, y: y_test, keep_prob:1.0})
            print(str(step),"test accuracy:",str(accuracy_result))
            # Pass the step explicitly: without global_step the summary
            # events carry no step number, so the logged curves cannot be
            # laid out along the training axis.
            writer.add_summary(train_merged, global_step=step)
            writer.add_summary(test_merged, global_step=step)
            # Checkpoint every 10000 steps once the held-out accuracy is good.
            if accuracy_result > 0.96 and step % 10000 == 0:
                save(step=step)
finally:
    # Flush the event file and release the session even when the long run is
    # interrupted or fails part-way.
    writer.close()
    sess.close()
CNN加載模型預測
import tensorflow as tf
from vec_text import load_data,vec2text
def predict_single(x_data, restore_from = './models/cnn-3085000'):
    """Predict the character shown on each single-character tile.

    The function rebuilds the inference part of the CNN in the default graph,
    restores its variables from a checkpoint and runs one forward pass.  The
    default graph is reset before returning, also when an error occurs, so
    that a later call starts from an empty graph and produces the same
    variable names again.

    Only the layers needed for inference are built.  Loss, optimizer and
    summary ops are not needed for prediction; leaving them out also means
    the checkpoint only has to provide the eight model variables.

    Args:
        x_data: Batch of tiles matching the ``[None, 60, 34]`` placeholder.
        restore_from: Checkpoint prefix to restore the variables from.

    Returns:
        List with one predicted character per input tile.
    """
    def weight_variable(shape):
        initial = tf.truncated_normal(shape, stddev=0.001)
        return tf.Variable(initial, name='w')

    def bias_variable(shape):
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial, name='b')

    def conv2d(x, W):
        return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

    def max_pool(x):
        return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    try:
        # Scope names and variable creation order decide the variable names
        # looked up in the checkpoint; keep them identical to training.
        with tf.variable_scope("Input"):
            x = tf.placeholder(tf.float32, [None, 60, 34], name='x')
            x_image = tf.reshape(x, [-1, 60, 34, 1])
        with tf.variable_scope("Cnn_net"):
            # Layer 1: convolution + pooling
            with tf.variable_scope("conv_1"):
                w_conv1 = weight_variable([3, 3, 1, 32])
                b_conv1 = bias_variable([32])
                h_conv1 = tf.nn.relu(conv2d(x_image, w_conv1) + b_conv1)
                h_pool1 = max_pool(h_conv1)
            # Layer 2: convolution + pooling
            with tf.variable_scope("conv_2"):
                w_conv2 = weight_variable([5, 5, 32, 64])
                b_conv2 = bias_variable([64])
                h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
                h_pool2 = max_pool(h_conv2)
            # Layer 3: fully connected
            with tf.variable_scope("full_connect"):
                w_fc1 = weight_variable([15*9*64, 1024])
                # Deliberately created with weight_variable: the training
                # graph does the same, and the variable name has to match
                # the one stored in the checkpoint.
                b_fc1 = weight_variable([1024])
                h_pool2_flat = tf.reshape(h_pool2, [-1, 15*9*64])
                h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)
            # Dropout (fed with keep_prob 1.0 below, i.e. disabled)
            with tf.variable_scope("dropout"):
                keep_prob = tf.placeholder(tf.float32)
                h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
            # Layer 4: softmax output
            with tf.variable_scope("softmax"):
                w_fc2 = weight_variable([1024, 34])
                b_fc2 = bias_variable([34])
                y_out = tf.nn.softmax(tf.matmul(h_fc1_drop, w_fc2) + b_fc2, name="output")
        # Index of the most probable class for every sample
        y_vec = tf.argmax(y_out, 1)

        # The with-block closes the session even if restore or run raises.
        with tf.Session() as sess:
            # Reload the trained variables
            saver = tf.train.Saver()
            saver.restore(sess, restore_from)
            y_predict = sess.run(y_vec, {x: x_data, keep_prob: 1.0})  # e.g. [1 2 8 9]
    finally:
        tf.reset_default_graph()
    # Map class indices back to characters
    return [vec2text(index) for index in y_predict]
# 輸入單數字圖片,返回該圖片對應的字符
# if __name__ == "__main__":
# img_dir = './img_test'
# x_data, y_data = load_data(img_dir)
# predict_single(x_data, restore_from = './models/cnn-1139999')