Python Gesture Video Recognition and Labeling

Environment

  • Python 3.7
  • PyTorch 1.1.0
  • torchvision 0.3.0
  • CUDA 9.0 or above

Project Structure

  • Audio-and-video-demo

    • bgm (background voice announcement audio)
    • images
      • ffmpeg_img
      • rec_image
    • model (saved self-trained models)
    • video (input and output video files)
    • bgm.py
    • combination.py
    • ffempeg-img-recognition.py
    • gesture-recognition.py
    • main.py
    • putlabel.py

Modules

ffempeg-img-recognition.py

Splits the gesture video into frames and saves each frame as an image.


    import av

    def ffmpeg_img_extract(videopath):
        # Decode only keyframes and save each one as a JPEG
        container = av.open(videopath)
        stream = container.streams.video[0]
        stream.codec_context.skip_frame = 'NONKEY'
        for frame in container.decode(stream):
            savepath = 'images/ffmpeg_img/' + '%d.jpg' % frame.index
            frame.to_image().save(savepath, quality=80)

    def img_to_video(videopath):
        # Decode every frame and save it as an image
        container = av.open(videopath)
        for frame in container.decode(video=0):
            savepath = 'images/ffmpeg_img/' + '%d.jpg' % frame.index
            frame.to_image().save(savepath)
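
For example, to split the sample clip video/test3.mp4 (the path referenced in main.py) into frames:

    # Frames land in images/ffmpeg_img/ as 0.jpg, 1.jpg, ...
    img_to_video('video/test3.mp4')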

gesture-recognition.py

Recognizes each gesture image with the trained model and records the predicted labels in the bgm_label list. The model is a pretrained GoogLeNet fine-tuned on our own dataset over multiple training iterations with a decaying learning rate; it recognizes gesture images with over 95% accuracy.


    import os
    from time import time

    import torch
    import torch.nn.functional as F
    from PIL import Image
    from torchvision import transforms

    def gesture_recognition(filepath):
        # Number of frame images to classify (frames are named 0.jpg, 1.jpg, ...)
        count = len(os.listdir(filepath))

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Class labels, one per line
        with open('images/gesture_24.txt', 'r', encoding='gbk') as clf:
            labels = clf.readlines()

        # Trained model (a fine-tuned GoogLeNet)
        model = torch.load('model/googlenet_model.pkl')
        model = model.to(device)
        model.eval()

        preprocess = transforms.Compose([
            transforms.Resize(256),
            # Kept from the original pipeline; note this applies a random
            # rotation even at inference time
            transforms.RandomRotation(20),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

        # One background-music label per frame
        bgm_label = []
        for i in range(count):
            filename = filepath + str(i) + '.jpg'
            input_image = Image.open(filename)
            input_tensor = preprocess(input_image)
            input_batch = input_tensor.unsqueeze(0)
            image_tensor = input_batch.to(device)

            start = time()
            with torch.no_grad():
                output = model(image_tensor)

            prob = F.softmax(output[0], dim=0)
            indexs = torch.argsort(-prob)
            print("Recognition time:", time() - start)

            bgm_label.append(labels[indexs[0]].strip())

            # Draw the recognized label onto the frame (putText from putlabel.py)
            putText(filename, labels[indexs[0]])

        return bgm_label
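
The training step itself is not included in the repo. As a rough sketch of the fine-tuning described above, under stated assumptions (the dataset directory images/train, the optimizer, batch size, and exact decay schedule are all hypothetical; only the GoogLeNet backbone, the 24 classes, and the saved model path come from the source):

    import torch
    import torch.nn as nn
    from torchvision import datasets, models, transforms

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Pretrained GoogLeNet with the classifier replaced for the 24 gesture classes
    model = models.googlenet(pretrained=True)
    model.fc = nn.Linear(model.fc.in_features, 24)
    model = model.to(device)

    # Hypothetical dataset layout: images/train/<class_name>/*.jpg
    # A fixed 256x256 resize keeps batch tensors the same shape
    train_set = datasets.ImageFolder('images/train', transform=transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomRotation(20),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]))
    loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    # Learning-rate decay: divide the learning rate by 10 every 5 epochs
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

    for epoch in range(20):
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()
        scheduler.step()

    # Save the whole model object, since gesture_recognition() loads it with torch.load
    torch.save(model, 'model/googlenet_model.pkl')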

bgm.py

Builds an audio track for the video from the recognized gesture labels. Labels can flip rapidly while the hand switches between gestures, so a label_flag list records every position where the label changes; the length of each run of identical labels is then the gap between adjacent label_flag entries. A run shorter than 12 consecutive frames is treated as a recognition error and replaced with silence of the same duration. A run longer than 30 frames plays the voice announcement once and fills the remaining time with silence of equal length.


    bgm_dict = {'Congratulation': 1,
                'Eight': 2,
                'Fist': 3,
                'Five': 4,
                'Four': 5,
                'Heart_1': 6,
                'Heart_2': 7,
                'Heart_3': 8,
                'Heart_single': 9,
                'Honour': 10,
                'ILY': 11,
                'Insult': 12,
                'Nine': 13,
                'OK': 14,
                'One': 15,
                'Palm_up': 16,
                'Prayer': 17,
                'Rock': 18,
                'Seven': 19,
                'Six': 20,
                'Three': 21,
                'Thumb_down': 22,
                'Thumb_up': 23,
                'Two': 24}

    from pydub import AudioSegment

    def add_bgm3(bgm_label):
        count = len(bgm_label)

        # Record every position where the label changes, plus the final frame
        label_flag = [0]
        label = bgm_label[0]
        for i in range(count):
            if bgm_label[i] != label:
                label_flag.append(i)
                label = bgm_label[i]
        label_flag.append(count - 1)

        label_flag_number = len(label_flag)

        # Seed the output with an (almost) empty slice
        music = AudioSegment.from_wav('bgm/1.wav')
        clip = music[:0.0001 * 1000]

        for i in range(label_flag_number - 1):
            # Label at the start of this run of frames
            flag = label_flag[i]
            label = bgm_label[flag]

            # Audio file number for this label
            number = int(bgm_dict[label])

            # Run length in frames; at 30 fps one frame lasts about 0.033 s
            if_or_not = int(label_flag[i + 1] - label_flag[i])
            start = float(label_flag[i] * 0.033)
            end = float(label_flag[i + 1] * 0.033)
            bgm_len = float(end - start)

            if if_or_not < 12:
                # Run too short: assume a recognition error, fill with silence
                music = AudioSegment.from_wav('bgm/0.wav')
                clip = clip + music[:bgm_len * 1000]
            elif if_or_not > 30:
                # Long run: play the announcement once, pad the rest with silence
                music = AudioSegment.from_wav('bgm/%d.wav' % number)
                clip = clip + music[:30 * 0.033 * 1000]
                music = AudioSegment.from_wav('bgm/0.wav')
                clip = clip + music[:(if_or_not - 30) * 0.033 * 1000]
            else:
                music = AudioSegment.from_wav('bgm/%d.wav' % number)
                clip = clip + music[:bgm_len * 1000]

        clip.export('bgm/clip.wav', format='wav')
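
To make the bookkeeping concrete, here is a toy run with made-up labels (at 30 fps each frame lasts roughly 0.033 s):

    # 40 frames of 'One', 5 of 'Two', then 35 of 'Three'
    bgm_label = ['One'] * 40 + ['Two'] * 5 + ['Three'] * 35

    label_flag = [0]
    label = bgm_label[0]
    for i in range(len(bgm_label)):
        if bgm_label[i] != label:
            label_flag.append(i)
            label = bgm_label[i]
    label_flag.append(len(bgm_label) - 1)

    print(label_flag)  # [0, 40, 45, 79]

    # Run lengths in frames: 40 (> 30: announce once, then silence),
    # 5 (< 12: treated as a recognition error, filled with silence), 34
    print([label_flag[i + 1] - label_flag[i] for i in range(len(label_flag) - 1)])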

Adds the generated audio track to the composited video.


    from moviepy.editor import AudioFileClip, VideoFileClip

    def video_merge2(outpath):
        # Read the generated audio track
        audio = AudioFileClip("bgm/clip.wav")

        video = VideoFileClip('video/saveVideo.mp4')

        # Attach the audio and write the final video
        video = video.set_audio(audio)
        video.write_videofile(outpath)

combination.py

Composites the recognized and labeled gesture frames back into a video.


    import cv2

    def combination(length):
        # Read the first labeled frame to get the video dimensions
        img = cv2.imread("images/rec_image/0.jpg")
        h, w, c = img.shape

        img_root = "images/rec_image/"
        fps = 30

        file_path = 'video/saveVideo.mp4'  # output path
        size = (w, h)  # VideoWriter expects (width, height)

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # mp4
        videoWriter = cv2.VideoWriter(file_path, fourcc, fps, size)

        # Frames are named 0.jpg, 1.jpg, ... so write them in order
        for i in range(length):
            frame = cv2.imread(img_root + str(i) + '.jpg')
            videoWriter.write(frame)

        videoWriter.release()

putlabel.py

Draws the recognized gesture label onto a frame image and saves the labeled copy.


    import cv2

    def putText(image, label):
        # Save the labeled frame under the same file name in images/rec_image/
        flag = image.rfind("/")
        imagename = str(image[flag + 1:])
        savepath = 'images/rec_image/' + imagename

        label = label.strip()
        img = cv2.imread(image)
        img = cv2.putText(img, label, (100, 100), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 0), 2)
        cv2.imwrite(savepath, img)

main.py

The main entry point, which runs the whole pipeline.


    import sys

    def main():
        # Input and output video names from the command line
        videopath = 'video/' + str(sys.argv[1])
        outpath = 'video/' + str(sys.argv[2])

        # Split the input video into per-frame images
        img_to_video(videopath)

        # Directory holding the extracted frames
        filepath = 'images/ffmpeg_img/'

        # Recognize the gesture in each frame, label the frame, and save it
        bgm_label = gesture_recognition(filepath)

        # Encode the labeled frames back into a video
        combination(len(bgm_label))

        # Build the audio track and merge it into the output video
        add_bgm3(bgm_label)
        video_merge2(outpath)

    if __name__ == '__main__':
        main()

Usage

Run from the command line.

Example (the first argument is the input video under video/, the second is the output file name):

    python main.py test.mp4 out.mp4

Gitee link: https://gitee.com/ceasarxo/gesture-recognition
