Video Retrieval

This article uses three methods for video retrieval:
1. An algorithm that retrieves video keyframes from a query image
2. Retrieval with videosearch
3. Retrieval with YOLO video object detection
The three methods are described in detail below.


Retrieving video keyframes from a query image

step1: Extract keyframes from the news opening sequence
step2: Extract keyframes from the full news video
step3: Define an image similarity matching algorithm
step4: Match keyframes using the similarity matching algorithm
The code is as follows:

# Keyframe extraction algorithm
# -*- coding: utf-8 -*-
"""
Created on Tue Dec  4 16:48:57 2018
keyframe extraction tool
this keyframe extraction algorithm is based on interframe difference.
The principle is very simple:
First, we load the video and compute the interframe difference between consecutive frames.
Then, we can choose one of these three methods to extract keyframes, which are 
all based on the difference method:
    
1. use the difference order
    The first few frames with the largest average interframe difference 
    are considered to be key frames.
2. use the difference threshold
    The frames whose average interframe difference is larger than the 
    threshold are considered to be key frames.
3. use local maxima
    The frames whose average interframe difference is a local maximum are 
    considered to be key frames.
    It should be noted that smoothing the average difference values before 
    calculating the local maxima can effectively remove noise and avoid 
    repeated extraction of frames from similar scenes.
After a few experiments, the third method gives the best key frame extraction results.
The original code comes from the link below; I optimized it to reduce 
unnecessary memory consumption.
https://blog.csdn.net/qq_21997625/article/details/81285096
@author: zyb_as
""" 
import cv2
import operator
import numpy as np
import matplotlib.pyplot as plt
import sys
from scipy.signal import argrelextrema

 
def smooth(x, window_len=13, window='hanning'):
    """smooth the data using a window with requested size.
    
    This method is based on the convolution of a scaled window with the signal.
    The signal is prepared by introducing reflected copies of the signal 
    (with the window size) in both ends so that transient parts are minimized
    in the beginning and end parts of the output signal.
    
    input:
        x: the input signal 
        window_len: the dimension of the smoothing window
        window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'
            flat window will produce a moving average smoothing.
    output:
        the smoothed signal
        
    example:
    import numpy as np    
    t = np.arange(-2, 2, 0.1)
    x = np.sin(t)+np.random.randn(len(t))*0.1
    y = smooth(x)
    
    see also: 
    
    numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve
    scipy.signal.lfilter
 
    TODO: the window parameter could be the window itself if an array instead of a string   
    """
    if x.ndim != 1:
        raise ValueError("smooth only accepts 1 dimension arrays.")

    if x.size < window_len:
        raise ValueError("Input vector needs to be bigger than window size.")

    if window_len < 3:
        return x

    if window not in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
        raise ValueError("Window must be one of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'")
 
    s = np.r_[2 * x[0] - x[window_len:1:-1],
              x, 2 * x[-1] - x[-1:-window_len:-1]]
    #print(len(s))
 
    if window == 'flat':  # moving average
        w = np.ones(window_len, 'd')
    else:
        w = getattr(np, window)(window_len)
    y = np.convolve(w / w.sum(), s, mode='same')
    return y[window_len - 1:-window_len + 1]
 

class Frame:
    """class to hold information about each frame
    
    """
    def __init__(self, id, diff):
        self.id = id
        self.diff = diff
 
    def __lt__(self, other):
        return self.id < other.id
 
    def __gt__(self, other):
        return other.__lt__(self)
 
    def __eq__(self, other):
        return self.id == other.id
 
    def __ne__(self, other):
        return not self.__eq__(other)
 
 
def rel_change(a, b):
    x = (b - a) / max(a, b)
    return x
 
    
if __name__ == "__main__":
    print(sys.executable)
    #Setting fixed threshold criteria
    USE_THRESH = False
    #fixed threshold value
    THRESH = 0.6
    #Setting top-order criteria
    USE_TOP_ORDER = False
    #Setting local maxima criteria
    USE_LOCAL_MAXIMA = True
    #Number of top sorted frames
    NUM_TOP_FRAMES = 50
     
    #Video path of the source file
    videopath = 'myvedio.flv'
    #Directory to store the processed frames
    dir = './myvedio_extract_result/'
    #smoothing window size
    len_window = int(50)
    
    
    print("target video :" + videopath)
    print("frame save directory: " + dir)
    # load video and compute diff between frames
    cap = cv2.VideoCapture(str(videopath)) 
    curr_frame = None
    prev_frame = None 
    frame_diffs = []
    frames = []
    success, frame = cap.read()
    i = 0 
    while(success):
        luv = cv2.cvtColor(frame, cv2.COLOR_BGR2LUV)
        curr_frame = luv
        if curr_frame is not None and prev_frame is not None:
            #logic here
            diff = cv2.absdiff(curr_frame, prev_frame)
            diff_sum = np.sum(diff)
            diff_sum_mean = diff_sum / (diff.shape[0] * diff.shape[1])
            frame_diffs.append(diff_sum_mean)
            frame = Frame(i, diff_sum_mean)
            frames.append(frame)
        prev_frame = curr_frame
        i = i + 1
        success, frame = cap.read()   
    cap.release()
    
    # compute keyframe
    keyframe_id_set = set()
    if USE_TOP_ORDER:
        # sort the list in descending order
        frames.sort(key=operator.attrgetter("diff"), reverse=True)
        for keyframe in frames[:NUM_TOP_FRAMES]:
            keyframe_id_set.add(keyframe.id) 
    if USE_THRESH:
        print("Using Threshold")
        for i in range(1, len(frames)):
            if (rel_change(float(frames[i - 1].diff), float(frames[i].diff)) >= THRESH):
                keyframe_id_set.add(frames[i].id)   
    if USE_LOCAL_MAXIMA:
        print("Using Local Maxima")
        diff_array = np.array(frame_diffs)
        sm_diff_array = smooth(diff_array, len_window)
        frame_indexes = np.asarray(argrelextrema(sm_diff_array, np.greater))[0]
        for i in frame_indexes:
            keyframe_id_set.add(frames[i - 1].id)
            
        plt.figure(figsize=(40, 20))
        plt.locator_params(numticks=100)
        plt.stem(sm_diff_array)
        plt.savefig(dir + 'plot.png')
    
    # save all keyframes as image
    cap = cv2.VideoCapture(str(videopath))
    curr_frame = None
    keyframes = []
    success, frame = cap.read()
    idx = 0
    while(success):
        if idx in keyframe_id_set:
            name = "keyframe_" + str(idx) + ".jpg"
            cv2.imwrite(dir + name, frame)
            keyframe_id_set.remove(idx)
        idx = idx + 1
        success, frame = cap.read()
    cap.release()
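
The three extraction criteria are toggled by the flags at the top of the main block. For example, to switch from the local-maxima criterion to threshold-based extraction, one would set the flags as in this sketch (THRESH = 0.6 is the script's own default):

USE_THRESH = True         # keep frames whose relative diff change >= THRESH
THRESH = 0.6
USE_TOP_ORDER = False
USE_LOCAL_MAXIMA = False  # disable the local-maxima criterion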

Computing image similarity

# Compute the cosine similarity distance between two images
# -*- coding: utf-8 -*-
# !/usr/bin/env python
# @Time    : 2018/11/17 14:52
# @Author  : xhh
# @Desc    : cosine similarity computation
# @File    : difference_image_consin.py
# @Software: PyCharm
from PIL import Image
from numpy import average, dot, linalg
 
# Normalize the images to a common size
def get_thum(image, size=(64,64), greyscale=False):
    # Resize the image; Image.ANTIALIAS is the high-quality filter
    # (renamed Image.LANCZOS in newer versions of Pillow)
    image = image.resize(size, Image.ANTIALIAS)
    if greyscale:
        # Convert the image to 'L' mode, i.e. greyscale with 8 bits per pixel
        image = image.convert('L')
    return image
 
# Compute the cosine distance between two images
def image_similarity_vectors_via_numpy(image1, image2):
    image1 = get_thum(image1)
    image2 = get_thum(image2)
    images = [image1, image2]
    vectors = []
    norms = []
    for image in images:
        vector = []
        for pixel_tuple in image.getdata():
            vector.append(average(pixel_tuple))
        vectors.append(vector)
        # linalg = linear + algebra; norm computes the vector norm,
        # here the L2 norm of the flattened image vector
        norms.append(linalg.norm(vector, 2))
    a, b = vectors
    a_norm, b_norm = norms
    # dot returns the dot product of the two L2-normalized vectors
    res = dot(a / a_norm, b / b_norm)
    return res
 
# Example usage:
# image1 = Image.open('images/1.jpeg')
# image2 = Image.open('myimage64.jpg')
# cosin = image_similarity_vectors_via_numpy(image1, image2)
# print('image cosine similarity', cosin)
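
For reference, the function returns dot(a, b) / (‖a‖·‖b‖), so two identical images score exactly 1.0. A minimal sanity check on toy vectors (hypothetical data, not from the article):

import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([1.0, 2.0, 3.0])   # identical to a -> similarity 1.0
print(np.dot(a / np.linalg.norm(a), b / np.linalg.norm(b)))
c = np.array([3.0, -1.5, 0.0])  # orthogonal to a -> similarity 0.0
print(np.dot(a / np.linalg.norm(a), c / np.linalg.norm(c)))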

Keyframe matching algorithm

image1 = Image.open('11.jpg')
import os
cosin1 = []
imagematch = []
for filename in os.listdir(r"./myvedio_extract_result"):   # listdir takes the folder path
    # filename is the name of a file inside the folder
    image2 = Image.open(os.path.join('myvedio_extract_result', filename))
    cosin = image_similarity_vectors_via_numpy(image1, image2)
    cosin1.append(cosin)
    imagematch.append(filename)
print(max(cosin1))
index = cosin1.index(max(cosin1))
print(index)
print(imagematch[index])

The final result:
1.0              the matching precision is 1.0
258              the 258th keyframe in the list was matched
keyframe_56.jpg  frame 56 was matched

Retrieval with videosearch

The open-source video retrieval project videosearch:
https://blog.csdn.net/meloyi/article/details/53034823
https://github.com/andrefaraujo/videosearch
The project does the following:
1. Extract video keyframes (a keyframe is simply one image from the video) and perform shot boundary detection.
2. Extract SIFT features (scale-invariant feature transform, a local feature descriptor used to detect keypoints in an image) from the images or frames; see the sketch after this list.
3. Extract a global descriptor (Fisher Vector) for each image/keyframe, shot, or video clip.
4. Index each video clip with Bloom filters.
5. Query an image or video database with a query image.
6. Evaluate retrieval results with precision and mean average precision, both in the range 0-1.
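
To make step 2 concrete, here is a minimal sketch of local SIFT extraction for one keyframe, assuming OpenCV >= 4.4 (where cv2.SIFT_create is available) and a placeholder file name; videosearch ships its own SIFT pipeline, so this is only an illustration:

import cv2

img = cv2.imread('keyframe_56.jpg', cv2.IMREAD_GRAYSCALE)
sift = cv2.SIFT_create()
keypoints, descriptors = sift.detectAndCompute(img, None)
# descriptors is an (N, 128) float32 array, one row per keypoint; step 3
# would aggregate these local descriptors into a single global Fisher Vector
# using a Gaussian mixture model fitted over the whole database.
print(len(keypoints), descriptors.shape)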

The project code can be further optimized to improve the final video matching quality.
Following the reference material for video retrieval yields a very good match rate, but it cannot match in real time.

Retrieval with YOLO video object detection

Extending YOLOv3 object detection from images to video:
step1 Install CUDA, cuDNN, and Darknet
step2 Run Darknet object detection on images
step3 Modify Darknet to run object detection on video
The detailed steps are as follows:

  • Download the code:

git clone https://github.com/pjreddie/darknet

  • Build the code

cd darknet
make

  • Download the weights file

wget https://pjreddie.com/media/files/yolov3.weights

  • Modify the code and run

First, move the "libdarknet.so" file from the "darknet" folder into "darknet/python".
Then open "darknet/cfg/coco.data" and change the "names" entry to an absolute path as well (an illustrative example follows below).
Finally, enter "darknet/python" and run the "darknet.py" file.
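
For illustration, the edited coco.data might look like the following; all paths are placeholders to be replaced with your own absolute paths:

classes= 80
train  = /home/<user>/darknet/data/coco/trainvalno5k.txt
names  = /home/<user>/darknet/data/coco.names
backup = /home/<user>/darknet/backup/
eval   = coco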

  • Modify the code for video object detection

1. Modify src/image.c by adding the following function, which converts a NumPy H×W×C uint8 buffer into Darknet's planar CHW float image:

#ifdef NUMPY
image ndarray_to_image(unsigned char* src, long* shape, long* strides)
{
    int h = shape[0];
    int w = shape[1];
    int c = shape[2];
    int step_h = strides[0];
    int step_w = strides[1];
    int step_c = strides[2];
    image im = make_image(w, h, c);
    int i, j, k;
    int index1, index2 = 0;
 
    for(i = 0; i < h; ++i){
            for(k= 0; k < c; ++k){
                for(j = 0; j < w; ++j){
 
                    index1 = k*w*h + i*w + j;
                    index2 = step_h*i + step_w*j + step_c*k;
                    //fprintf(stderr, "w=%d h=%d c=%d step_w=%d step_h=%d step_c=%d \n", w, h, c, step_w, step_h, step_c);
                    //fprintf(stderr, "im.data[%d]=%u data[%d]=%f \n", index1, src[index2], index2, src[index2]/255.);
                    im.data[index1] = src[index2]/255.;
                }
            }
        }
 
    rgbgr_image(im);
 
    return im;
}
#endif

2. Then insert the following around line 22 of src/image.h:

#ifdef NUMPY
image ndarray_to_image(unsigned char* src, long* shape, long* strides);
#endif

3. Modify the Makefile:

GPU=1
CUDNN=1
OPENCV=1
# added
NUMPY=1
OPENMP=1
DEBUG=1
 
ARCH= -gencode arch=compute_30,code=sm_30 \
      -gencode arch=compute_35,code=sm_35 \
      -gencode arch=compute_50,code=[sm_50,compute_50] \
      -gencode arch=compute_52,code=[sm_52,compute_52] \
	  -gencode arch=compute_70,code=[sm_70,compute_70] \
	  -gencode arch=compute_75,code=[sm_75,compute_75]
#      -gencode arch=compute_20,code=[sm_20,sm_21] \ This one is deprecated?
 
# This is what I use, uncomment if you know your arch and want to specify
# ARCH= -gencode arch=compute_52,code=compute_52
 
VPATH=./src/:./examples
SLIB=libdarknet.so
ALIB=libdarknet.a
EXEC=darknet
OBJDIR=./obj/
 
CC=gcc
CPP=g++
NVCC=nvcc 
AR=ar
ARFLAGS=rcs
OPTS=-Ofast
LDFLAGS= -lm -pthread 
COMMON= -Iinclude/ -Isrc/
CFLAGS=-Wall -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors -fPIC
 
ifeq ($(OPENMP), 1) 
CFLAGS+= -fopenmp
endif
 
ifeq ($(DEBUG), 1) 
OPTS=-O0 -g
endif
 
CFLAGS+=$(OPTS)
 
ifeq ($(OPENCV), 1) 
COMMON+= -DOPENCV
CFLAGS+= -DOPENCV
LDFLAGS+= `pkg-config --libs opencv` -lstdc++
COMMON+= `pkg-config --cflags opencv` 
endif
# added
ifeq ($(NUMPY), 1) 
COMMON+= -DNUMPY -I/home/sbs/anaconda3/envs/tracy/include/python3.6m/ -I/home/sbs/anaconda3/envs/tracy/lib/python3.6/site-packages/numpy/core/include/numpy/
CFLAGS+= -DNUMPY
endif
 
ifeq ($(GPU), 1) 
COMMON+= -DGPU -I/usr/local/cuda-10.0-cudnn-7.3.1/include/
CFLAGS+= -DGPU
LDFLAGS+= -L/usr/local/cuda-10.0-cudnn-7.3.1/lib64 -lcuda -lcudart -lcublas -lcurand
endif
 
ifeq ($(CUDNN), 1) 
COMMON+= -DCUDNN 
CFLAGS+= -DCUDNN
LDFLAGS+= -lcudnn
endif
 
OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o detection_layer.o route_layer.o upsample_layer.o box.o normalization_layer.o avgpool_layer.o layer.o local_layer.o shortcut_layer.o logistic_layer.o activation_layer.o rnn_layer.o gru_layer.o crnn_layer.o demo.o batchnorm_layer.o region_layer.o reorg_layer.o tree.o  lstm_layer.o l2norm_layer.o yolo_layer.o iseg_layer.o image_opencv.o
EXECOBJA=captcha.o lsd.o super.o art.o tag.o cifar.o go.o rnn.o segmenter.o regressor.o classifier.o coco.o yolo.o detector.o nightmare.o instance-segmenter.o darknet.o
ifeq ($(GPU), 1) 
LDFLAGS+= -lstdc++ 
OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o avgpool_layer_kernels.o
endif
 
EXECOBJ = $(addprefix $(OBJDIR), $(EXECOBJA))
OBJS = $(addprefix $(OBJDIR), $(OBJ))
DEPS = $(wildcard src/*.h) Makefile include/darknet.h
 
all: obj backup results $(SLIB) $(ALIB) $(EXEC)
#all: obj  results $(SLIB) $(ALIB) $(EXEC)
 
 
$(EXEC): $(EXECOBJ) $(ALIB)
	$(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(ALIB)
 
$(ALIB): $(OBJS)
	$(AR) $(ARFLAGS) $@ $^
 
$(SLIB): $(OBJS)
	$(CC) $(CFLAGS) -shared $^ -o $@ $(LDFLAGS)
 
$(OBJDIR)%.o: %.cpp $(DEPS)
	$(CPP) $(COMMON) $(CFLAGS) -c $< -o $@
 
$(OBJDIR)%.o: %.c $(DEPS)
	$(CC) $(COMMON) $(CFLAGS) -c $< -o $@
 
$(OBJDIR)%.o: %.cu $(DEPS)
	$(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@
 
obj:
	mkdir -p obj
backup:
	mkdir -p backup
results:
	mkdir -p results
 
.PHONY: clean
 
clean:
	rm -rf $(OBJS) $(SLIB) $(ALIB) $(EXEC) $(EXECOBJ) $(OBJDIR)/*
 

4. Then run make clean and rebuild with make -j8.
5. Modify the python/darknet.py file; the complete program is pasted below.

from ctypes import *
import math
import random
import time
import numpy as np
import cv2
import os
import sys
 
def sample(probs):
    s = sum(probs)
    probs = [a/s for a in probs]
    r = random.uniform(0, 1)
    for i in range(len(probs)):
        r = r - probs[i]
        if r <= 0:
            return i
    return len(probs)-1
 
def c_array(ctype, values):
    arr = (ctype*len(values))()
    arr[:] = values
    return arr
 
class BOX(Structure):
    _fields_ = [("x", c_float),
                ("y", c_float),
                ("w", c_float),
                ("h", c_float)]
 
class DETECTION(Structure):
    _fields_ = [("bbox", BOX),
                ("classes", c_int),
                ("prob", POINTER(c_float)),
                ("mask", POINTER(c_float)),
                ("objectness", c_float),
                ("sort_class", c_int)]
 
 
class IMAGE(Structure):
    _fields_ = [("w", c_int),
                ("h", c_int),
                ("c", c_int),
                ("data", POINTER(c_float))]
 
class METADATA(Structure):
    _fields_ = [("classes", c_int),
                ("names", POINTER(c_char_p))]
 
    
 
#lib = CDLL("/home/pjreddie/documents/darknet/libdarknet.so", RTLD_GLOBAL)
lib = CDLL("../libdarknet.so", RTLD_GLOBAL)
lib.network_width.argtypes = [c_void_p]
lib.network_width.restype = c_int
lib.network_height.argtypes = [c_void_p]
lib.network_height.restype = c_int
 
predict = lib.network_predict
predict.argtypes = [c_void_p, POINTER(c_float)]
predict.restype = POINTER(c_float)
 
set_gpu = lib.cuda_set_device
set_gpu.argtypes = [c_int]
 
make_image = lib.make_image
make_image.argtypes = [c_int, c_int, c_int]
make_image.restype = IMAGE
 
get_network_boxes = lib.get_network_boxes
get_network_boxes.argtypes = [c_void_p, c_int, c_int, c_float, c_float, POINTER(c_int), c_int, POINTER(c_int)]
get_network_boxes.restype = POINTER(DETECTION)
 
make_network_boxes = lib.make_network_boxes
make_network_boxes.argtypes = [c_void_p]
make_network_boxes.restype = POINTER(DETECTION)
 
free_detections = lib.free_detections
free_detections.argtypes = [POINTER(DETECTION), c_int]
 
free_ptrs = lib.free_ptrs
free_ptrs.argtypes = [POINTER(c_void_p), c_int]
 
network_predict = lib.network_predict
network_predict.argtypes = [c_void_p, POINTER(c_float)]
 
reset_rnn = lib.reset_rnn
reset_rnn.argtypes = [c_void_p]
 
load_net = lib.load_network
load_net.argtypes = [c_char_p, c_char_p, c_int]
load_net.restype = c_void_p
 
do_nms_obj = lib.do_nms_obj
do_nms_obj.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
 
do_nms_sort = lib.do_nms_sort
do_nms_sort.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
 
free_image = lib.free_image
free_image.argtypes = [IMAGE]
 
letterbox_image = lib.letterbox_image
letterbox_image.argtypes = [IMAGE, c_int, c_int]
letterbox_image.restype = IMAGE
 
load_meta = lib.get_metadata
lib.get_metadata.argtypes = [c_char_p]
lib.get_metadata.restype = METADATA
 
load_image = lib.load_image_color
load_image.argtypes = [c_char_p, c_int, c_int]
load_image.restype = IMAGE
 
# added to handle video
ndarray_image = lib.ndarray_to_image
ndarray_image.argtypes = [POINTER(c_ubyte), POINTER(c_long), POINTER(c_long)]
ndarray_image.restype = IMAGE
 
rgbgr_image = lib.rgbgr_image
rgbgr_image.argtypes = [IMAGE]
 
predict_image = lib.network_predict_image
predict_image.argtypes = [c_void_p, IMAGE]
predict_image.restype = POINTER(c_float)
 
def classify(net, meta, im):
    out = predict_image(net, im)
    res = []
    for i in range(meta.classes):
        res.append((meta.names[i], out[i]))
    res = sorted(res, key=lambda x: -x[1])
    return res
 
"""
Yolo-v3目前耗時過長的步驟
    1.輸入圖像的預處理階段
    2.python接口調用網絡執行一次推理過程
"""
 
def detect(net, meta, image, thresh=.5, hier_thresh=.5, nms=.45):
    # preprocess_image_time = time.time()
    # about 0.1131 s
    im = load_image(image, 0, 0)
    # print("Yolo Preprocess image time in python version:", (time.time() - preprocess_image_time))
    num = c_int(0)
    pnum = pointer(num)
    # start_time = time.time()
    # about 0.129 s
    predict_image(net, im)
    # print("Yolo Do inference time in python version:", (time.time() - start_time))
    
    # get_detection_time = time.time()
    # about 0.0022 s
    dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, None, 0, pnum)
    # print("Yolo Get detections time in python version:", (time.time() - get_detection_time))
    num = pnum[0]
    # do_nms_time = time.time()
    # negligible
    if (nms): do_nms_obj(dets, num, meta.classes, nms)
    # print("Yolo Do nms time in python version:", (time.time() - do_nms_time))
 
    res = []
    for j in range(num):
        for i in range(meta.classes):
            if dets[j].prob[i] > 0:
                b = dets[j].bbox
                res.append((meta.names[i], dets[j].prob[i], (b.x, b.y, b.w, b.h)))
    res = sorted(res, key=lambda x: -x[1])
    free_image(im)
    free_detections(dets, num)
    return res
 
# added to handle video
def detect_im(net, meta, im, thresh=.5, hier_thresh=.5, nms=.45):
    # to_image_time = time.time()
    # about 0.0012-0.0013 s
    im, image = array_to_image(im)
    # print("to_image time:", (time.time() - to_image_time))
    # rgbgr_image_time = time.time()
    # about 0.0013 s
    rgbgr_image(im)
    # print("rgbgr_image time:", (time.time() - rgbgr_image_time))
    num = c_int(0)
    pnum = pointer(num)
    # do_inference_time = time.time()
    # about 0.083 s
    predict_image(net, im)
    # print("Do inference time:", (time.time() - do_inference_time))
    dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, None, 0, pnum)
    num = pnum[0]
    if (nms): do_nms_obj(dets, num, meta.classes, nms)
 
    res = []
    for j in range(num):
        a = dets[j].prob[0:meta.classes]
        if any(a):
            ai = np.array(a).nonzero()[0]
            for i in ai:
                b = dets[j].bbox
                res.append((meta.names[i], dets[j].prob[i],
                           (b.x, b.y, b.w, b.h)))
 
    res = sorted(res, key=lambda x: -x[1])
    if isinstance(image, bytes):
        free_image(im)
    free_detections(dets, num)
 
    return res
 
def array_to_image(arr):
    # need to return old values to avoid python freeing memory
    arr = arr.transpose(2,0,1)
    c, h, w = arr.shape[0:3]
    arr = np.ascontiguousarray(arr.flat, dtype=np.float32) / 255.0
    data = arr.ctypes.data_as(POINTER(c_float))
    im = IMAGE(w,h,c,data)
    return im, arr
 
def get_folderImages(folder):
    all_files = os.listdir(folder)
    abs_path = [os.path.join(folder, i) for i in all_files]
    return abs_path
 
def convertBack(x, y, w, h):
    xmin = int(round(x - (w / 2)))
    xmax = int(round(x + (w / 2)))
    ymin = int(round(y - (h / 2)))
    ymax = int(round(y + (h / 2)))
    return xmin, ymin, xmax, ymax
 
def init():
    net = load_net("../cfg/yolov3.cfg".encode("utf-8"), "../cfg/yolov3.weights".encode("utf-8"), 0)
    meta = load_meta("../cfg/coco.data".encode("utf-8"))
    return net, meta
 
def image_processing():
    net, meta = init()
 
    folder = "images"
    save_folder = "results"
    each_process_time = []
 
    for image_path in get_folderImages(folder):
        image = cv2.imread(image_path)
        result = image  # fall back to the raw image if nothing is detected
        start_time = time.time()
        r = detect(net, meta, image_path.encode("utf-8"))
        processing_time = time.time() - start_time
        each_process_time.append(processing_time)
        for i in range(len(r)):
            x, y, w, h = r[i][2][0], r[i][2][1], r[i][2][2], r[i][2][3]
            # convertBack returns the box corners (xmin, ymin, xmax, ymax)
            xmin, ymin, xmax, ymax = convertBack(float(x), float(y), float(w), float(h))
            result = cv2.rectangle(
                image,
                (xmin, ymin),
                (xmax, ymax),
                (0, 255, 255),
                2
            )
            cv2.putText(
                result,
                bytes.decode(r[i][0]),
                (xmin, ymin),
                cv2.FONT_HERSHEY_SIMPLEX,
                1.0,
                (0, 0, 255),
                2
            )
        save_path = os.path.join(save_folder, image_path.split('/')[-1].split(".jpg")[0] + "-result.jpg")
        cv2.imwrite(save_path, result)
    average_processing_time = np.mean(each_process_time)
    print("Yolo-v3 COCO Average each Image processing Time:\n")
    print(average_processing_time)
 
def video_processing():
    set_gpu(7)
    net, meta = init()
 
    processing_path = "small.mp4"
    cam = cv2.VideoCapture(processing_path)
    total_frames = cam.get(cv2.CAP_PROP_FRAME_COUNT)
    fps = cam.get(cv2.CAP_PROP_FPS)
    frame_size = (int(cam.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cam.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    # fourcc = int(cam.get(cv2.CAP_PROP_FOURCC))
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    processing_result_name = processing_path.split(".mp4")[0] + "-result.mp4"
    result = cv2.VideoWriter(processing_result_name, fourcc, fps, frame_size)
        
    timeF = 1
    c = 1
    print("opencv?", cam.isOpened())
    print("fps:", fps)
    print("decode style:", fourcc)
    print("size:", frame_size)
    print("total frames:", total_frames)
    start_total = time.time()
    while True:
        frame_start = time.time()
        _, img = cam.read()
        if (c % timeF == 0 or c == total_frames):
            if img is not None:
                r = detect_im(net, meta, img)
                for i in range(len(r)):
                    x, y, w, h = r[i][2][0], r[i][2][1], r[i][2][2], r[i][2][3]
                    # convertBack returns the box corners (xmin, ymin, xmax, ymax)
                    xmin, ymin, xmax, ymax = convertBack(float(x), float(y), float(w), float(h))
                    img = cv2.rectangle(
                        img,
                        (xmin, ymin),
                        (xmax, ymax),
                        (0, 255, 255),
                        1
                    )
                    label_score = "{}:{:.2f}".format(bytes.decode(r[i][0]), r[i][1])
                    cv2.putText(
                        img,
                        label_score,
                        (xmin, ymin),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1.0,
                        (0, 0, 255),
                        1
                    )
                result.write(img)
        else:
            result.write(img)
 
        c += 1
 
        if c > total_frames:
            print("Finished Processing!")
            break
        print("processing one frame total time:", (time.time() - frame_start))
        print()
        
    processing_time = time.time() - start_total
    cam.release()
    result.release()
    # post_compression(processing_result_name)  # not defined in this script; plug in your own compression step if needed
    print("Yolo-v3 COCO one Video Process Time:\n")
    print(processing_time)
 
if __name__ == "__main__":
    #net = load_net("cfg/densenet201.cfg", "/home/pjreddie/trained/densenet201.weights", 0)
    #im = load_image("data/wolf.jpg", 0, 0)
    #meta = load_meta("cfg/imagenet1k.data")
    #r = classify(net, meta, im)
    #print r[:10]
    # net = load_net("../cfg/yolov3.cfg".encode("utf-8"), "../cfg/yolov3.weights".encode("utf-8"), 0)
    # meta = load_meta("../cfg/coco.data".encode("utf-8"))
    # start_time = time.time()
    # r = detect(net, meta, "../data/car.jpg".encode("utf-8"))
    # print("Inference time:{:.4f}".format(time.time() - start_time))
    # print(r)
    image_processing()
    # video_processing()

Run:
./darknet detector demo cfg/coco.data cfg/yolov3.cfg cfg/yolov3.weights python/videos/test.mp4
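
Alternatively, to run a video through the modified Python interface instead of the built-in demo, switch the entry point at the bottom of python/darknet.py (both calls already exist in the main block above):

if __name__ == "__main__":
    # image_processing()
    video_processing()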
