文本-图片-视频-相似度算法-demo


## TODO:

  • [] 视频相似度关键帧获取
  • [] 音频相似度计算

  • 2019-09-20
    • 17:28 更:


  • 源语言: GoLang

文本相似度:

  • 小文本直接采用 PHP 自带的 similar_text 文本比较(通过 php2go 调用)即可:

    // SimpleCompareTextSimilarity compares two strings by delegating to
    // php2go.SimilarText. The original note says this simple string matching
    // is meant for short inputs.
    //
    // It returns the float64 that SimilarText writes through its third
    // argument (presumably a similarity percentage — confirm against php2go),
    // followed by SimilarText's own int result.
    func SimpleCompareTextSimilarity(prev, newUpload string) (float64, int) {
       var percent float64
       matched := php2go.SimilarText(prev, newUpload, &percent)
       return percent, matched
    }
    
  • 大文本采用基于海明距离的 simhash 算法:

    • // 校验字符串的相似度,使用simhash进行判断
      // CompareTextSimilarity builds a simhash fingerprint for each string from
      // its word feature set and returns the result of simhash.Compare on the
      // two fingerprints, converted to int.
      //
      // The float64 result is always 0.0; only the int carries information.
      func CompareTextSimilarity(prev, newUpload string) (float64, int) {
      	prevFeatures := simhash.NewWordFeatureSet([]byte(prev))
      	newFeatures := simhash.NewWordFeatureSet([]byte(newUpload))
      	distance := simhash.Compare(simhash.Simhash(prevFeatures), simhash.Simhash(newFeatures))
      	return 0.0, int(distance)
      }
      

图片相似度:

  • 基于 ahash,dhash
// BatchComparePicSimilarity compares two batches of images.
//
// Every path in filePathSlice1 and then in filePathSlice2 is hashed with
// GetImgHash(funName, path); the two hash lists are passed, in that order, to
// CompareImgHashes together with threshold.
//
// The int result is 0 when hashing any image fails and -1 otherwise. Once
// hashing succeeds, the float64 and error results are exactly what
// CompareImgHashes returned.
func BatchComparePicSimilarity(funName string, filePathSlice1 []string, filePathSlice2 []string, threshold int) (int, float64, error) {
   // hashAll hashes each path in order and stops at the first failure.
   hashAll := func(paths []string) ([]*goimagehash.ImageHash, error) {
      hashes := make([]*goimagehash.ImageHash, 0)
      for i := range paths {
         h, err := GetImgHash(funName, paths[i])
         if err != nil {
            return nil, err
         }
         hashes = append(hashes, h)
      }
      return hashes, nil
   }

   prevHashes, err := hashAll(filePathSlice1)
   if err != nil {
      return 0, 0.0, err
   }
   newHashes, err := hashAll(filePathSlice2)
   if err != nil {
      return 0, 0.0, err
   }

   ratio, err := CompareImgHashes(prevHashes, newHashes, threshold)
   return -1, ratio, err
}
// GetImgHash decodes the image at filePath and returns its perceptual hash.
//
// funcName selects the algorithm: "a" for goimagehash.AverageHash, "d" for
// goimagehash.DifferenceHash. Any other value is rejected with an error
// instead of silently yielding a zero-valued hash.
//
// The decoder is chosen from the file extension as returned by path.Ext;
// only ".jpeg", ".jpg" and ".png" (exact, case-sensitive match) are accepted.
//
// An error is returned when the file does not exist, the extension is not
// supported, the file cannot be opened or decoded, funcName is unknown, or
// the hash function itself fails.
func GetImgHash(funcName string, filePath string) (*goimagehash.ImageHash, error) {
  if !utils.IsFileOrDirExists(filePath) {
    return nil, fmt.Errorf("[%s]文件不存在", filePath)
  }

  suffix := path.Ext(filePath)
  if suffix != ".jpeg" && suffix != ".jpg" && suffix != ".png" {
    return nil, errors.New("图片格式错误,现暂仅支持jpeg,jpg,png结尾的图片")
  }

  file, e := os.Open(filePath)
  if nil != e {
    return nil, e
  }
  // The handle is only needed while decoding; release it on every path.
  defer file.Close()

  var img image.Image
  if suffix == ".png" {
    img, e = png.Decode(file)
  } else {
    img, e = jpeg.Decode(file)
  }
  if nil != e {
    return nil, e
  }

  var imgHash *goimagehash.ImageHash
  switch funcName {
  case "a":
    imgHash, e = goimagehash.AverageHash(img)
  case "d":
    imgHash, e = goimagehash.DifferenceHash(img)
  default:
    return nil, fmt.Errorf("不支持的哈希算法[%s],现暂仅支持a,d", funcName)
  }
  if nil != e {
    return nil, e
  }
  return imgHash, nil
}


// CompareImgHashes reports the fraction of newHashes that have at least one
// hash in prevHashes whose ImageHash.Distance is less than or equal to
// threshold.
//
// prevHashes: the hashes that already exist.
// newHashes: the hashes of the newly uploaded content to be matched.
//
// Each new hash is counted at most once. An empty newHashes yields 0.0
// rather than the NaN that a 0/0 division would produce. The first error
// returned by Distance aborts the comparison and is returned with 0.0.
func CompareImgHashes(prevHashes, newHashes []*goimagehash.ImageHash, threshold int) (float64, error) {
	if len(newHashes) == 0 {
		return 0.0, nil
	}

	matched := 0
	for _, candidate := range newHashes {
		for _, known := range prevHashes {
			distance, e := candidate.Distance(known)
			if nil != e {
				return 0.0, e
			}
			if distance <= threshold {
				matched++
				// One close-enough match is sufficient for this candidate.
				break
			}
		}
	}

	return float64(matched) / float64(len(newHashes)), nil
}

视频相似度:

  • 基于 gocv(OpenCV 的 Go 绑定)截取视频帧,将每一帧以图片的形式获取后计算哈希
// CompareVideosWithImg compares two videos through the image hashes of their
// sampled frames.
//
// Both files are hashed with GetVideoFramesWithImg using funcName. The hashes
// of filePath1 are passed to pic.CompareImgHashes as the existing set and the
// hashes of filePath2 as the new set, together with threshold. If either
// video cannot be hashed, 0.0 and that error are returned.
func CompareVideosWithImg(filePath1, filePath2 string, funcName string, threshold int) (float64, error) {
	firstHashes, err := GetVideoFramesWithImg(filePath1, funcName)
	if err != nil {
		return 0.0, err
	}

	secondHashes, err := GetVideoFramesWithImg(filePath2, funcName)
	if err != nil {
		return 0.0, err
	}

	return pic.CompareImgHashes(firstHashes, secondHashes, threshold)
}

// GetVideoFramesWithImg returns the image hashes of frames sampled from the
// video at filePath.
//
// The sampling level comes from getVideoFrameLevel(filePath) and is handed,
// with funcName, to getVideoFramesWithGoimage. An error from the level lookup
// is returned with a nil slice.
func GetVideoFramesWithImg(filePath string, funcName string) ([]*goimagehash.ImageHash, error) {
	// Original note: when the level is "less" the default is 10
	// (presumably the frame count — confirm against VIDEO_LEVEL_COUNT_ARRAY).
	frameLevel, err := getVideoFrameLevel(filePath)
	if err != nil {
		return nil, err
	}

	return getVideoFramesWithGoimage(filePath, funcName, frameLevel)
}


// getVideoFrameLevel maps the size of the file at filePath to a frame level.
//
// The size reported by utils.GetFileSize is compared, in ascending order,
// against VIDEO_SIZE_10M, VIDEO_SIZE_100M and VIDEO_SIZE_1G. The first bound
// the size is strictly below selects LESS, MID or UPMID respectively;
// anything else selects MOST. A size lookup failure returns 0 and the error.
func getVideoFrameLevel(filePath string) (int, error) {
	fileSize, err := utils.GetFileSize(filePath)
	if err != nil {
		return 0, err
	}

	switch {
	case fileSize < VIDEO_SIZE_10M:
		return VIDEO_FRAME_LEVEL_LESS, nil
	case fileSize < VIDEO_SIZE_100M:
		return VIDEO_FRAME_LEVEL_MID, nil
	case fileSize < VIDEO_SIZE_1G:
		return VIDEO_FRAME_LEVEL_UPMID, nil
	default:
		return VIDEO_FRAME_LEVEL_MOST, nil
	}
}

// getVideoFramesWithGoimage samples frames from the video at filePath, spaced
// evenly across its reported frame count, and hashes each sampled frame with
// goimagehash.
//
// level indexes VIDEO_LEVEL_COUNT_ARRAY to decide how many frames to capture.
// funName selects the hash: "a" for goimagehash.AverageHash, "d" for
// goimagehash.DifferenceHash (matching the options GetImgHash accepts).
// NOTE(review): any other funName still appends a zero-valued hash per frame,
// as before; consider rejecting it once an error constructor is in scope.
//
// If the video cannot be opened, nil and the error are returned. If a frame
// cannot be converted or hashed, the hashes gathered so far are returned
// together with that error.
func getVideoFramesWithGoimage(filePath string, funName string, level int) ([]*goimagehash.ImageHash, error) {
	picCount := int(VIDEO_LEVEL_COUNT_ARRAY[level])
	result := make([]*goimagehash.ImageHash, 0)

	// load video
	vc, err := gocv.VideoCaptureFile(filePath)
	if err != nil {
		return nil, err
	}
	// Release the capture on every return path.
	defer vc.Close()

	// A single Mat is reused for every frame and closed once, instead of
	// allocating a new, never-closed Mat per iteration.
	img := gocv.NewMat()
	defer img.Close()

	// total is the number of frames the capture reports for the whole video.
	total := vc.Get(gocv.VideoCaptureFrameCount)

	for j := 0; j < picCount; j++ {
		// Sample j sits at fraction j/picCount of the video. Computing the
		// frame index directly from the frame count avoids the former
		// frames/fps round trip, which produced NaN/Inf positions when FPS or
		// the frame count was reported as 0 and accumulated float error.
		vc.Set(gocv.VideoCapturePosFrames, total*float64(j)/float64(picCount))

		// NOTE(review): the result of Read is ignored, as in the original;
		// a failed read is only noticed if ToImage or the hash call rejects
		// the Mat contents.
		vc.Read(&img)

		frame, err := img.ToImage()
		if nil != err {
			return result, err
		}

		imageHash := new(goimagehash.ImageHash)
		switch funName {
		case "a":
			imageHash, err = goimagehash.AverageHash(frame)
		case "d":
			imageHash, err = goimagehash.DifferenceHash(frame)
		}
		if nil != err {
			return result, err
		}
		result = append(result, imageHash)
	}

	return result, nil
}
 
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章