一、用python爬取圖片
如圖:
創建一個文件夾,下放每一類的文件夾(我的絕對路徑是:/home/user/dataset/)
在每一個class下面,創建一個test.py文件,用以爬取圖片
# coding=utf-8
"""根據搜索詞下載百度圖片"""
import re
import sys
import urllib
import urllib.parse

import requests
def get_onepage_urls(onepageurl):
    """Scrape one Baidu Images "flip" result page.

    Args:
        onepageurl: URL of the current result page; a falsy value means the
            crawl already ran past the last page.

    Returns:
        (pic_urls, fanye_url): the image URLs found on this page, and the URL
        of the next result page ('' when there is no next page or on error).
    """
    if not onepageurl:
        print('已到最後一頁, 結束')
        return [], ''
    try:
        # timeout added for consistency with down_pic(); without it a stalled
        # connection hangs the whole crawl forever.
        html = requests.get(onepageurl, timeout=15).text
    except Exception as e:
        # Best-effort crawl: report the error and let the caller stop.
        print(e)
        return [], ''
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # The "next page" anchor; its href is relative to image.baidu.com.
    fanye_urls = re.findall(r'<a href="(.*)" class="n">下一頁</a>', html)
    fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
    return pic_urls, fanye_url
def down_pic(pic_urls, start_index=1200):
    """Download every image in *pic_urls* into the current directory.

    Args:
        pic_urls: list of image URLs to fetch.
        start_index: offset added to the sequence number when naming saved
            files (default 1200 preserves the original hard-coded scheme, so
            batches from different runs don't overwrite each other).
    """
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            # Don't save HTTP error pages as broken .jpg files; raising here
            # routes the failure into the except branch below.
            pic.raise_for_status()
            filename = str(start_index + i) + '.jpg'
            with open(filename, 'wb') as f:
                f.write(pic.content)
            print('成功下載第%s張圖片: %s' % (str(i + 1), str(pic_url)))
        except Exception as e:
            # Best-effort: report the failure and keep downloading the rest.
            print('下載第%s張圖片時失敗: %s' % (str(i + 1), str(pic_url)))
            print(e)
            continue
if __name__ == '__main__':
    # Search keyword — change this to crawl a different Baidu Images query.
    keyword = '筆記本電腦'
    url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    # Percent-encode the keyword (requires `import urllib.parse` on Python 3).
    url_init = url_init_first + urllib.parse.quote(keyword, safe='/')
    all_pic_urls = []
    onepage_urls, fanye_url = get_onepage_urls(url_init)
    all_pic_urls.extend(onepage_urls)
    fanye_count = 1  # number of result pages visited so far
    while True:
        onepage_urls, fanye_url = get_onepage_urls(fanye_url)
        fanye_count += 1
        print('第%s頁' % fanye_count)
        # Stop once there is neither a next page nor any images on this one.
        if fanye_url == '' and onepage_urls == []:
            break
        all_pic_urls.extend(onepage_urls)
    # De-duplicate before downloading; Baidu repeats images across pages.
    down_pic(list(set(all_pic_urls)))
基本上只需要改變keyword和命名就可以以每次60張圖片的速度進行爬取。
具體命令如下:
cd /home/user/dataset/class0
python test.py
在下載過程中會有提示,class1也是一樣。下載完批量的圖片以後就要開始選擇要用什麼框架訓練數據集(因爲一直在學習mxnet,所以選擇了mxnet)。mxnet主要將圖片數據生成rec文件,有一個im2rec.py專門將圖片生成rec文件,具體步驟如下:
1)生成lst文件
recursive:是否遞歸訪問子目錄,如果存在多個目錄可以設置該參數
list:腳本默認爲False,所以製作lst時應設置爲True
prefix:需要生成的lst文件的前綴(這裏我命名爲test,就會生成test.lst)
root:指定數據集的根目錄,其子目錄爲圖片或進一步的子目錄(注意:路徑一定要寫對!!!)
終端命令爲:python ~/mxnet/tools/im2rec.py --recursive --list test /home/user/dataset/
隨後,在我的目錄下面就生成了lst文件
打開lst文件是這樣的:
第一行是圖片大小,中間是類別,後面是相對路徑
2)生成rec文件
涉及參數:
–list 是否創建list文件,默認爲False
–exts 所能接受的圖片後綴,默認爲jpg和jpeg(如果圖片是png格式,可以對im2rec.py文件進行修改)
–chunks 分塊數量,默認爲1
–train-ratio 訓練集所佔的比例,默認爲1.0
–test-ratio 測試集所佔的比例,默認爲0
–recursive 是否遞歸的對root下的文件夾進行遍歷
–shuffle 是否打亂list中的圖片順序,默認爲True
–pass-through 是否跳過transform,默認爲False
–resize 是否將短邊縮放至設定尺寸,默認爲0
–center-crop 是否進行中心剪裁,默認爲False
–quality 圖片解碼質量(0-100),默認爲95
–num-thread 編碼的線程數,默認爲1
–color 色彩解碼模式[-1,0,1],1爲彩色模式,0爲灰度模式,-1爲原樣讀入(保留alpha通道),默認爲1
–encoding 解碼模式(jpeg,png),默認爲jpeg
–pack-label 是否讀入多維度標籤數據,默認爲False 如果進行多標籤數據製作或者目標檢測的數據製作,那麼就必須將其設置爲True
終端命令:python ~/mxnet/tools/im2rec.py --recursive test /home/user/dataset/
然後就在該路徑下生成了一個rec後綴和一個idx文件,裏面是bin格式,所以一般打不開。這裏就不打開了。
截圖:
生成了rec文件以後,就需要在網絡中訓練了。
import sys
sys.path.insert(0, '..')
import gluonbook as gb
import mxnet as mx
from mxnet import autograd, nd, gluon, init
from mxnet.gluon import loss as gloss, nn
from time import time
import os
from skimage import io
import numpy as np


class Residual(nn.HybridBlock):
    """ResNet residual unit: two 3x3 convs with BatchNorm, plus an optional
    1x1 conv on the shortcut when channels/stride change between stages."""

    def __init__(self, num_channels, use_1x1conv=False, strides=1, **kwargs):
        super(Residual, self).__init__(**kwargs)
        self.conv1 = nn.Conv2D(num_channels, kernel_size=3, padding=1,
                               strides=strides)
        self.conv2 = nn.Conv2D(num_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2D(num_channels, kernel_size=1,
                                   strides=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm()
        self.bn2 = nn.BatchNorm()

    def hybrid_forward(self, F, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            # Project the shortcut so its shape matches Y.
            X = self.conv3(X)
        return F.relu(Y + X)


def resnet18(num_classes):
    """Build a small ResNet-18-style network with *num_classes* outputs."""
    net = nn.HybridSequential()
    net.add(nn.Conv2D(64, kernel_size=3, strides=1, padding=1),
            nn.BatchNorm(), nn.Activation('relu'))

    def resnet_block(num_channels, num_residuals, first_block=False):
        # One stage of residual units; the first unit of every stage except
        # the first halves the spatial size and widens the channels.
        blk = nn.HybridSequential()
        for i in range(num_residuals):
            if i == 0 and not first_block:
                blk.add(Residual(num_channels, use_1x1conv=True, strides=2))
            else:
                blk.add(Residual(num_channels))
        return blk

    net.add(resnet_block(64, 2, first_block=True),
            resnet_block(128, 2),
            resnet_block(256, 2),
            resnet_block(512, 2))
    net.add(nn.GlobalAvgPool2D(), nn.Dense(num_classes))
    return net


def get_net(ctx):
    """Create the 2-class network and Xavier-initialize it on *ctx*."""
    num_classes = 2
    net = resnet18(num_classes)
    net.initialize(ctx=ctx, init=init.Xavier())
    return net


batch_size = 32
train_iter = mx.image.ImageIter(
    batch_size=batch_size,
    data_shape=(3, 256, 256),
    path_imgrec='test.rec',
    path_imgidx='test.idx',  # idx file enables fast random access / shuffle
    shuffle=True,
    # aug_list=[mx.image.HorizontalFlipAug(0.5)]
)

# Peek at one batch to sanity-check the record file.
train_iter.reset()
for batch in train_iter:
    x = batch.data[0]
    y = batch.label[0]
    print(x)
    print('y is', y)
    break


def try_gpu():
    """Return mx.gpu() when a GPU context is usable, otherwise mx.cpu()."""
    try:
        ctx = mx.gpu()
        _ = nd.zeros((1,), ctx=ctx)
    except Exception:  # was a bare except; any failure means "no usable GPU"
        ctx = mx.cpu()
    return ctx


ctx = try_gpu()


def evaluate_accuracy(data_iter, net, ctx):
    """Average accuracy of *net* over an iterator of (X, y) tuples.

    NOTE(review): this expects a gluon DataLoader-style iterator; it is NOT
    usable as-is with an ImageIter, which yields DataBatch objects.
    """
    acc = nd.array([0], ctx=ctx)
    n = 0
    for X, y in data_iter:
        # Copy the batch to ctx (e.g. the GPU) before the forward pass.
        X = X.as_in_context(ctx)
        y = y.as_in_context(ctx)
        acc += gb.accuracy(net(X), y)
        n += 1
    # BUG FIX: a DataIter has no len(); count batches while iterating.
    return acc.asscalar() / max(n, 1)


def train_ch5(net, train_iter, loss, batch_size, trainer, ctx, num_epochs):
    """Train *net* on *train_iter* for *num_epochs* epochs, printing the
    per-epoch average loss, training accuracy and wall-clock time."""
    print('training on', ctx)
    for epoch in range(1, num_epochs + 1):
        train_l_sum = 0
        train_acc_sum = 0
        n_batches = 0
        start = time()
        # BUG FIX: an ImageIter must be reset at the start of every epoch,
        # otherwise epochs after the first iterate over nothing (the preview
        # loop above also left the iterator mid-epoch).
        train_iter.reset()
        for batch in train_iter:
            # Copy the batch to ctx (e.g. the GPU) before the forward pass.
            X = batch.data[0].as_in_context(ctx)
            y = batch.label[0].as_in_context(ctx)
            with autograd.record():
                # BUG FIX: the original re-created the network here with
                # get_net(ctx) on every batch, so the parameters the trainer
                # updated were never the ones used for the forward pass and
                # the model could not learn.
                y_hat = net(X)
                l = loss(y_hat, y)
            l.backward()
            trainer.step(batch_size)
            train_l_sum += l.mean().asscalar()
            train_acc_sum += gb.accuracy(y_hat, y)
            n_batches += 1
        # test_acc = evaluate_accuracy(test_iter, net, ctx)
        n_batches = max(n_batches, 1)
        # BUG FIX: len(train_iter) raises TypeError on a DataIter; average
        # over the counted number of batches instead.
        print('epoch %d, loss %.4f, train acc %.3f, time %.1f sec'
              % (epoch, train_l_sum / n_batches,
                 train_acc_sum / n_batches, time() - start))


lr = 0.8
num_epochs = 5
# BUG FIX: the original called net.initialize(force_reinit=...) without ever
# binding a module-level `net`, which raised NameError.
net = get_net(ctx)
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
loss = gloss.SoftmaxCrossEntropyLoss()
train_ch5(net, train_iter, loss, batch_size, trainer, ctx, num_epochs)
這裏簡單的用了resnet18作爲網絡進行訓練,大家可以使用適合自己數據集的網絡進行訓練。 祝大家玩的愉快!有什麼問題可以一起探討~~