MTCNN移植安卓並檢測視頻中人臉

繼續此前的文章，使用vlc播放了rtsp流媒體視頻後，想檢測視頻中的人臉，之前採用了opencv但是遇到低頭、擡頭和側臉時候，效果就不太好。所以本篇介紹如何使用mtcnn來檢測視頻中的人臉。
大致流程：

一、Tensorflow 模型固化

將PNet、ONet、RNet 網絡參數.npy固化成.pb格式，方便java載入，固化後的文件在assets中，文件名mtcnn_freezed_model.pb。

二、引入android tensorflow lite 庫

只需在build.gradle(module)最後添加以下幾行語句即可。

dependencies {
    implementation fileTree(include: ['*.jar'], dir: 'libs')
    implementation 'com.android.support:appcompat-v7:27.1.1'
    implementation 'com.android.support.constraint:constraint-layout:1.1.3'
    testImplementation 'junit:junit:4.12'
    androidTestImplementation 'com.android.support.test:runner:1.0.2'
    androidTestImplementation 'com.android.support.test.espresso:espresso-core:3.0.2'
    compile(name: 'libvlc-3.0.0', ext: 'aar')
    implementation files('libs/androidutils.jar')
    compile 'org.tensorflow:tensorflow-android:+'
    implementation files('libs/libutils.jar')
}

三、新建MTCNN類

該類包含加載模型文件，並檢測bitmap中的人臉

package com.cayden.face.facenet;

import android.content.res.AssetManager;
import android.graphics.Bitmap;
import android.graphics.Matrix;
import android.graphics.Point;
import android.util.Log;


import com.cayden.face.FaceApplication;

import org.tensorflow.contrib.android.TensorFlowInferenceInterface;

import java.util.Vector;

import static java.lang.Math.max;
import static java.lang.Math.min;

/**
 *  Created by caydencui on 2018/9/6.
 */
public class MTCNN {
    //參數
    private float factor=0.709f;
    private float PNetThreshold=0.6f;
    private float RNetThreshold=0.7f;
    private float ONetThreshold=0.7f;
    //MODEL PATH
    private static final String MODEL_FILE  = "file:///android_asset/mtcnn_freezed_model.pb";
    //tensor name
    private static final String PNetInName  ="pnet/input:0";
    private static final String[] PNetOutName =new String[]{"pnet/prob1:0","pnet/conv4-2/BiasAdd:0"};
    private static final String RNetInName  ="rnet/input:0";
    private static final String[] RNetOutName =new String[]{ "rnet/prob1:0","rnet/conv5-2/conv5-2:0",};
    private static final String ONetInName  ="onet/input:0";
    private static final String[] ONetOutName =new String[]{ "onet/prob1:0","onet/conv6-2/conv6-2:0","onet/conv6-3/conv6-3:0"};


    private static class SingletonInstance {
        private static final MTCNN INSTANCE = new MTCNN();
    }

    public static MTCNN getInstance() {
        return SingletonInstance.INSTANCE;
    }

    //安卓相關
    public  long lastProcessTime;   //最後一張圖片處理的時間ms
    private static final String TAG="MTCNN";
    private AssetManager assetManager;
    private TensorFlowInferenceInterface inferenceInterface;

    private MTCNN() {
        assetManager= FaceApplication.getMyApplication().getAssets();
        loadModel();
    }



    private boolean loadModel() {
        //AssetManager
        try {
            inferenceInterface = new TensorFlowInferenceInterface(assetManager, MODEL_FILE);
            Log.d("MTCNN","[*]load model success");
        }catch(Exception e){
            Log.e("MTCNN","[*]load model failed"+e);
            return false;
        }
        return true;
    }
    //讀取Bitmap像素值，預處理(-127.5 /128)，轉化爲一維數組返回
    private float[] normalizeImage(Bitmap bitmap){
        int w=bitmap.getWidth();
        int h=bitmap.getHeight();
        float[] floatValues=new float[w*h*3];
        int[]   intValues=new int[w*h];
        bitmap.getPixels(intValues,0,bitmap.getWidth(),0,0,bitmap.getWidth(),bitmap.getHeight());
        float imageMean=127.5f;
        float imageStd=128;

        for (int i=0;i<intValues.length;i++){
            final int val=intValues[i];
            floatValues[i * 3 + 0] = (((val >> 16) & 0xFF) - imageMean) / imageStd;
            floatValues[i * 3 + 1] = (((val >> 8) & 0xFF) - imageMean) / imageStd;
            floatValues[i * 3 + 2] = ((val & 0xFF) - imageMean) / imageStd;
        }
        return floatValues;
    }
    /*
       檢測人臉,minSize是最小的人臉像素值
     */
    private Bitmap bitmapResize(Bitmap bm, float scale) {
        int width = bm.getWidth();
        int height = bm.getHeight();
        // CREATE A MATRIX FOR THE MANIPULATION。matrix指定圖片仿射變換參數
        Matrix matrix = new Matrix();
        // RESIZE THE BIT MAP
        matrix.postScale(scale, scale);
        Bitmap resizedBitmap = Bitmap.createBitmap(
                bm, 0, 0, width, height, matrix, true);
        return resizedBitmap;
    }
    //輸入前要翻轉，輸出也要翻轉
    private  int PNetForward(Bitmap bitmap, float [][]PNetOutProb, float[][][]PNetOutBias){
        int w=bitmap.getWidth();
        int h=bitmap.getHeight();

        float[] PNetIn=normalizeImage(bitmap);
        Utils.flip_diag(PNetIn,h,w,3); //沿着對角線翻轉
        inferenceInterface.feed(PNetInName,PNetIn,1,w,h,3);
        inferenceInterface.run(PNetOutName,false);
        int PNetOutSizeW=(int) Math.ceil(w*0.5-5);
        int PNetOutSizeH=(int) Math.ceil(h*0.5-5);
        float[] PNetOutP=new float[PNetOutSizeW*PNetOutSizeH*2];
        float[] PNetOutB=new float[PNetOutSizeW*PNetOutSizeH*4];
        inferenceInterface.fetch(PNetOutName[0],PNetOutP);
        inferenceInterface.fetch(PNetOutName[1],PNetOutB);
        //【寫法一】先翻轉，後轉爲2/3維數組
        Utils.flip_diag(PNetOutP,PNetOutSizeW,PNetOutSizeH,2);
        Utils.flip_diag(PNetOutB,PNetOutSizeW,PNetOutSizeH,4);
        Utils.expand(PNetOutB,PNetOutBias);
        Utils.expandProb(PNetOutP,PNetOutProb);
        /*
        *【寫法二】這個比較快，快了3ms。意義不大，用上面的方法比較直觀
        for (int y=0;y<PNetOutSizeH;y++)
            for (int x=0;x<PNetOutSizeW;x++){
               int idx=PNetOutSizeH*x+y;
               PNetOutProb[y][x]=PNetOutP[idx*2+1];
               for(int i=0;i<4;i++)
                   PNetOutBias[y][x][i]=PNetOutB[idx*4+i];
            }
        */
        return 0;
    }
    //Non-Maximum Suppression
    //nms，不符合條件的deleted設置爲true
    private void nms(Vector<Box> boxes, float threshold, String method){
        //NMS.兩兩比對
        //int delete_cnt=0;
        for(int i=0;i<boxes.size();i++) {
            Box box = boxes.get(i);
            if (!box.deleted) {
                //score<0表示當前矩形框被刪除
                for (int j = i + 1; j < boxes.size(); j++) {
                    Box box2=boxes.get(j);
                    if (!box2.deleted) {
                        int x1 = max(box.box[0], box2.box[0]);
                        int y1 = max(box.box[1], box2.box[1]);
                        int x2 = min(box.box[2], box2.box[2]);
                        int y2 = min(box.box[3], box2.box[3]);
                        if (x2 < x1 || y2 < y1) continue;
                        int areaIoU = (x2 - x1 + 1) * (y2 - y1 + 1);
                        float iou=0f;
                        if (method.equals("Union"))
                            iou = 1.0f*areaIoU / (box.area() + box2.area() - areaIoU);
                        else if (method.equals("Min"))
                            iou= 1.0f*areaIoU / (min(box.area(),box2.area()));
                        if (iou >= threshold) { //刪除prob小的那個框
                            if (box.score>box2.score)
                                box2.deleted=true;
                            else
                                box.deleted=true;
                            //delete_cnt++;
                        }
                    }
                }
            }
        }
        //Log.i(TAG,"[*]sum:"+boxes.size()+" delete:"+delete_cnt);
    }
    private int generateBoxes(float[][] prob,float[][][]bias,float scale,float threshold,Vector<Box> boxes){
        int h=prob.length;
        int w=prob[0].length;
        //Log.i(TAG,"[*]height:"+prob.length+" width:"+prob[0].length);
        for (int y=0;y<h;y++)
            for (int x=0;x<w;x++){
                float score=prob[y][x];
                //only accept prob >threadshold(0.6 here)
                if (score>PNetThreshold){
                    Box box=new Box();
                    //score
                    box.score=score;
                    //box
                    box.box[0]= Math.round(x*2/scale);
                    box.box[1]= Math.round(y*2/scale);
                    box.box[2]= Math.round((x*2+11)/scale);
                    box.box[3]= Math.round((y*2+11)/scale);
                    //bbr
                    for(int i=0;i<4;i++)
                        box.bbr[i]=bias[y][x][i];
                    //add
                    boxes.addElement(box);
                }
            }
        return 0;
    }
    private void BoundingBoxReggression(Vector<Box> boxes){
        for (int i=0;i<boxes.size();i++)
            boxes.get(i).calibrate();
    }
    //Pnet + Bounding Box Regression + Non-Maximum Regression
    /* NMS執行完後，才執行Regression
     * (1) For each scale , use NMS with threshold=0.5
     * (2) For all candidates , use NMS with threshold=0.7
     * (3) Calibrate Bounding Box
     * 注意：CNN輸入圖片最上面一行，座標爲[0..width,0]。所以Bitmap需要對摺後再跑網絡;網絡輸出同理.
     */
    private Vector<Box> PNet(Bitmap bitmap, int minSize){
        int whMin=min(bitmap.getWidth(),bitmap.getHeight());
        float currentFaceSize=minSize;  //currentFaceSize=minSize/(factor^k) k=0,1,2... until excced whMin
        Vector<Box> totalBoxes=new Vector<Box>();
        //【1】Image Paramid and Feed to Pnet
        while (currentFaceSize<=whMin){
            float scale=12.0f/currentFaceSize;
            //(1)Image Resize
            Bitmap bm=bitmapResize(bitmap,scale);
            int w=bm.getWidth();
            int h=bm.getHeight();
            //(2)RUN CNN
            int PNetOutSizeW=(int)(Math.ceil(w*0.5-5)+0.5);
            int PNetOutSizeH=(int)(Math.ceil(h*0.5-5)+0.5);
            float[][]   PNetOutProb=new float[PNetOutSizeH][PNetOutSizeW];;
            float[][][] PNetOutBias=new float[PNetOutSizeH][PNetOutSizeW][4];
            PNetForward(bm,PNetOutProb,PNetOutBias);
            //(3)數據解析
            Vector<Box> curBoxes=new Vector<Box>();
            generateBoxes(PNetOutProb,PNetOutBias,scale,PNetThreshold,curBoxes);
            //Log.i(TAG,"[*]CNN Output Box number:"+curBoxes.size()+" Scale:"+scale);
            //(4)nms 0.5
            nms(curBoxes,0.5f,"Union");
            //(5)add to totalBoxes
            for (int i=0;i<curBoxes.size();i++)
                if (!curBoxes.get(i).deleted)
                    totalBoxes.addElement(curBoxes.get(i));
            //Face Size等比遞增
            currentFaceSize/=factor;
        }
        //NMS 0.7
        nms(totalBoxes,0.7f,"Union");
        //BBR
        BoundingBoxReggression(totalBoxes);
        return Utils.updateBoxes(totalBoxes);
    }
    //截取box中指定的矩形框(越界要處理)，並resize到size*size大小，返回數據存放到data中。
    public Bitmap tmp_bm;
    private void crop_and_resize(Bitmap bitmap, Box box, int size, float[] data){
        //(2)crop and resize
        Matrix matrix = new Matrix();
        float scale=1.0f*size/box.width();
        matrix.postScale(scale, scale);
        Bitmap croped= Bitmap.createBitmap(bitmap, box.left(),box.top(),box.width(), box.height(),matrix,true);
        //(3)save
        int[] pixels_buf=new int[size*size];
        croped.getPixels(pixels_buf,0,croped.getWidth(),0,0,croped.getWidth(),croped.getHeight());
        float imageMean=127.5f;
        float imageStd=128;
        for (int i=0;i<pixels_buf.length;i++){
            final int val=pixels_buf[i];
            data[i * 3 + 0] = (((val >> 16) & 0xFF) - imageMean) / imageStd;
            data[i * 3 + 1] = (((val >> 8) & 0xFF) - imageMean) / imageStd;
            data[i * 3 + 2] = ((val & 0xFF) - imageMean) / imageStd;
        }
    }
    /*
     * RNET跑神經網絡，將score和bias寫入boxes
     */
    private void RNetForward(float[] RNetIn,Vector<Box> boxes){
        int num=RNetIn.length/24/24/3;
        //feed & run
        inferenceInterface.feed(RNetInName,RNetIn,num,24,24,3);
        inferenceInterface.run(RNetOutName,false);
        //fetch
        float[] RNetP=new float[num*2];
        float[] RNetB=new float[num*4];
        inferenceInterface.fetch(RNetOutName[0],RNetP);
        inferenceInterface.fetch(RNetOutName[1],RNetB);
        //轉換
        for (int i=0;i<num;i++) {
            boxes.get(i).score = RNetP[i * 2 + 1];
            for (int j=0;j<4;j++)
                boxes.get(i).bbr[j]=RNetB[i*4+j];
        }
    }
    //Refine Net
    private Vector<Box> RNet(Bitmap bitmap, Vector<Box> boxes){
        //RNet Input Init
        int num=boxes.size();
        float[] RNetIn=new float[num*24*24*3];
        float[] curCrop=new float[24*24*3];
        int RNetInIdx=0;
        for (int i=0;i<num;i++){
            crop_and_resize(bitmap,boxes.get(i),24,curCrop);
            Utils.flip_diag(curCrop,24,24,3);
            //Log.i(TAG,"[*]Pixels values:"+curCrop[0]+" "+curCrop[1]);
            for (int j=0;j<curCrop.length;j++) RNetIn[RNetInIdx++]= curCrop[j];
        }
        //Run RNet
        RNetForward(RNetIn,boxes);
        //RNetThreshold
        for (int i=0;i<num;i++)
            if (boxes.get(i).score<RNetThreshold)
                boxes.get(i).deleted=true;
        //Nms
        nms(boxes,0.7f,"Union");
        BoundingBoxReggression(boxes);
        return Utils.updateBoxes(boxes);
    }
    /*
     * ONet跑神經網絡，將score和bias寫入boxes
     */
    private void ONetForward(float[] ONetIn,Vector<Box> boxes){
        int num=ONetIn.length/48/48/3;
        //feed & run
        inferenceInterface.feed(ONetInName,ONetIn,num,48,48,3);
        inferenceInterface.run(ONetOutName,false);
        //fetch
        float[] ONetP=new float[num*2]; //prob
        float[] ONetB=new float[num*4]; //bias
        float[] ONetL=new float[num*10]; //landmark
        inferenceInterface.fetch(ONetOutName[0],ONetP);
        inferenceInterface.fetch(ONetOutName[1],ONetB);
        inferenceInterface.fetch(ONetOutName[2],ONetL);
        //轉換
        for (int i=0;i<num;i++) {
            //prob
            boxes.get(i).score = ONetP[i * 2 + 1];
            //bias
            for (int j=0;j<4;j++)
                boxes.get(i).bbr[j]=ONetB[i*4+j];

            //landmark
            for (int j=0;j<5;j++) {
                int x=boxes.get(i).left()+(int) (ONetL[i * 10 + j]*boxes.get(i).width());
                int y= boxes.get(i).top()+(int) (ONetL[i * 10 + j +5]*boxes.get(i).height());
                boxes.get(i).landmark[j] = new Point(x,y);
                //Log.i(TAG,"[*] landmarkd "+x+ "  "+y);
            }
        }
    }
    //ONet
    private Vector<Box> ONet(Bitmap bitmap, Vector<Box> boxes){
        //ONet Input Init
        int num=boxes.size();
        float[] ONetIn=new float[num*48*48*3];
        float[] curCrop=new float[48*48*3];
        int ONetInIdx=0;
        for (int i=0;i<num;i++){
            crop_and_resize(bitmap,boxes.get(i),48,curCrop);
            Utils.flip_diag(curCrop,48,48,3);
            for (int j=0;j<curCrop.length;j++) ONetIn[ONetInIdx++]= curCrop[j];
        }
        //Run ONet
        ONetForward(ONetIn,boxes);
        //ONetThreshold
        for (int i=0;i<num;i++)
            if (boxes.get(i).score<ONetThreshold)
                boxes.get(i).deleted=true;
        BoundingBoxReggression(boxes);
        //Nms
        nms(boxes,0.7f,"Min");
        return Utils.updateBoxes(boxes);
    }
    private void square_limit(Vector<Box> boxes, int w, int h){
        //square
        for (int i=0;i<boxes.size();i++) {
            boxes.get(i).toSquareShape();
            boxes.get(i).limit_square(w,h);
        }
    }
    /*
     * 參數：
     *   bitmap:要處理的圖片
     *   minFaceSize:最小的人臉像素值.(此值越大，檢測越快)
     * 返回：
     *   人臉框
     */
    public Vector<Box> detectFaces(Bitmap bitmap, int minFaceSize) {
        long t_start = System.currentTimeMillis();
        //【1】PNet generate candidate boxes
        Vector<Box> boxes=PNet(bitmap,minFaceSize);
        square_limit(boxes,bitmap.getWidth(),bitmap.getHeight());
        //【2】RNet
        boxes=RNet(bitmap,boxes);
        square_limit(boxes,bitmap.getWidth(),bitmap.getHeight());
        //【3】ONet
        boxes=ONet(bitmap,boxes);
        //return
        Log.i(TAG,"[*]Mtcnn Detection Time:"+(System.currentTimeMillis()-t_start));
        lastProcessTime=(System.currentTimeMillis()-t_start);
        return  boxes;
    }
}

四、視頻流數據處理

由於vlc播放的流媒體視頻格式是nv12，需要將其轉爲nv21，並保存爲bitmap

1、首先給出nv12轉爲nv21方法

    private void NV12ToNV21(byte[] nv12, byte[] nv21, int width, int height) {
        if (nv21 == null || nv12 == null) return;
        int framesize = width * height;
        int i = 0, j = 0;
        //System.arraycopy(nv21, test, nv12, test, framesize);
        for (i = 0; i < framesize; i++) {
            nv21[i] = nv12[i];
        }
        for (j = 0; j < framesize / 2; j += 2) {
            nv21[framesize + j] = nv12[j + framesize + 1];
        }
        for (j = 0; j < framesize / 2; j += 2) {
            nv21[framesize + j + 1] = nv12[j + framesize];
        }
    }

2、然後給出nv21高效率轉換爲bitmap的方法
可以查看此前的一篇文章 nv21高效率轉換爲bitmap
3、調用mtcnn的人臉檢測方法

Vector boxes = mtcnn.detectFaces(bitmap, 20);

4、根據返回的數據，標記人臉

    protected void drawAnim(Vector<Box> faces, SurfaceView outputView, float scale_bit, int cameraId, String fps) {
        Paint paint = new Paint();
        Canvas canvas = ((SurfaceView) outputView).getHolder().lockCanvas();
        if (canvas != null) {
            try {
                int viewH = outputView.getHeight();
                int viewW = outputView.getWidth();
//                DLog.d("viewW:"+viewW+",viewH:"+viewH);
                canvas.drawColor(0, PorterDuff.Mode.CLEAR);
                if (faces == null || faces.size() == 0) return;
                for (int i = 0; i < faces.size(); i++) {
                    paint.setColor(Color.BLUE);
                    int size = DisplayUtil.dip2px(this, 3);
                    paint.setStrokeWidth(size);
                    paint.setStyle(Paint.Style.STROKE);
                    Box box = faces.get(i);
                    float[] rect = box.transform2float();
                    float x1 = rect[0] * scale_bit;
                    float y1 = rect[1] * scale_bit;
                    float rect_width = rect[2] * 0.5F;
                    RectF rectf = new RectF(x1, y1, x1 + rect_width, y1 + rect_width);
                    canvas.drawRect(rectf, paint);
                }

            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                ((SurfaceView) outputView).getHolder().unlockCanvasAndPost(canvas);
            }
        }
    }

項目源碼

https://github.com/cayden/facesample
本項目主要基於vlc來播放流媒體視頻

主要包含以下內容

1、使用已經編譯的libvlc來播放流媒體視頻
2、使用MTCNN進行人臉識別並標記人臉
3、保存標記的人臉圖片
4、使用FACENET進行人臉比對
未完待續…

v1.0.2

1, 通過MTCNN檢測人臉
2, 對人臉進行標記

v1.0.1

1, 獲取返回的視頻流數據
2, 將數據nv12轉換爲nv21,並保存圖片

v1.0.0

1, added libvlc
2, support for playing rtsp video stream

感謝大家的閱讀，也希望能轉發並關注我的公衆號

MTCNN移植安卓並檢測視頻中人臉

一、Tensorflow 模型固化

二、引入android tensorflow lite 庫

三、新建MTCNN類

四、視頻流數據處理

項目源碼

v1.0.2

v1.0.1

v1.0.0

釘釘打卡速度慢

Nginx R31 doc 官方文檔-01-nginx 如何安裝

Qt/C++音視頻開發74-合併標籤圖形/生成yolo運算結果圖形/文字和圖形合併成一個/水印濾鏡

挑戰程序設計競賽 2.2章習題 POJ - 3617 Best Cow Line 貪心

字節面試：MySQL什麼時候鎖表？如何防止鎖表？

.NET8連接SQL SERVER 2008 R2 報：證書鏈是由不受信任的頒發機構頒發的

golang開發環境搭建(win10)

python計算機視覺學習筆記——PIL庫的用法

Golang初學：獲取程序內存使用情況，std runtime

Guacamole搭建,一個基於HTML5的遠程桌面

以太坊DApp開發實戰基礎

openssl生成RSA格式及pkcs1與pkcs8格式互相轉換

Xen搭建虛擬機實現VNC訪問

如何理解拜占庭將軍問題

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結