- 鏈接在這: mygithub-itsfunny
## TODO:
- [ ] 視頻相似度關鍵幀獲取
- [ ] 音頻相似度計算
- 源語言: GoLang
文本相似度:
-
小文本採用的方式爲php自帶的文本比較即可:
// 簡單字符串匹配,適用於當長度小的情況 func SimpleCompareTextSimilarity(prev, newUpload string) (float64, int) { per := 0.0 i := php2go.SimilarText(prev, newUpload, &per) return per, i }
-
大文本採取的是基於海明距離的 simhash 比較:
// 校驗字符串的相似度,使用simhash進行判斷 func CompareTextSimilarity(prev, newUpload string) (float64, int) { hash1 := simhash.Simhash(simhash.NewWordFeatureSet([]byte(prev))) hash2 := simhash.Simhash(simhash.NewWordFeatureSet([]byte(newUpload))) compare := simhash.Compare(hash1, hash2) return 0.0, int(compare) }
-
圖片相似度:
- 基於 ahash,dhash
// BatchComparePicSimilarity hashes every image in both path lists and compares
// the two resulting hash sets.
//
// funName is passed unchanged to GetImgHash to select the hash algorithm.
// filePathSlice1 is hashed first and handed to CompareImgHashes as the
// previously existing set; filePathSlice2 is the newly uploaded set.
// threshold is forwarded to CompareImgHashes.
//
// On a hashing failure it returns (0, 0.0, err) for the first path that fails.
// Otherwise the int result is always -1 and the float and error are whatever
// CompareImgHashes returned.
func BatchComparePicSimilarity(funName string, filePathSlice1 []string, filePathSlice2 []string, threshold int) (int, float64, error) {
	// hashAll hashes the paths in order and stops at the first failure.
	hashAll := func(paths []string) ([]*goimagehash.ImageHash, error) {
		hashes := make([]*goimagehash.ImageHash, 0, len(paths))
		for _, p := range paths {
			h, err := GetImgHash(funName, p)
			if err != nil {
				return nil, err
			}
			hashes = append(hashes, h)
		}
		return hashes, nil
	}

	prevHashes, err := hashAll(filePathSlice1)
	if err != nil {
		return 0, 0.0, err
	}
	newHashes, err := hashAll(filePathSlice2)
	if err != nil {
		return 0, 0.0, err
	}

	ratio, err := CompareImgHashes(prevHashes, newHashes, threshold)
	return -1, ratio, err
}
// GetImgHash computes a perceptual hash for the image stored at filePath.
//
// funcName selects the algorithm: "a" uses goimagehash.AverageHash and "d"
// uses goimagehash.DifferenceHash. Any other value yields a zero-value
// ImageHash with a nil error.
//
// The decoder is chosen from the file extension, matched case-sensitively:
// ".jpeg" and ".jpg" are decoded as JPEG, ".png" as PNG.
//
// An error is returned when utils.IsFileOrDirExists reports that the path is
// missing, when the extension is not one of the supported ones, or when the
// file cannot be opened, decoded or hashed.
func GetImgHash(funcName string, filePath string) (*goimagehash.ImageHash, error) {
	if !utils.IsFileOrDirExists(filePath) {
		return nil, fmt.Errorf("[%s]文件不存在", filePath)
	}

	// Reject unsupported formats before touching the file.
	suffix := path.Ext(filePath)
	switch suffix {
	case ".jpeg", ".jpg", ".png":
	default:
		return nil, errors.New("圖片格式錯誤,現暫僅支持jpeg,jpg,png結尾的圖片")
	}

	file, err := os.Open(filePath)
	if err != nil {
		return nil, err
	}
	// Close on every return path so the descriptor is not leaked.
	defer file.Close()

	var img image.Image
	if suffix == ".png" {
		img, err = png.Decode(file)
	} else {
		img, err = jpeg.Decode(file)
	}
	if err != nil {
		return nil, err
	}

	var imgHash *goimagehash.ImageHash
	switch funcName {
	case "a":
		imgHash, err = goimagehash.AverageHash(img)
	case "d":
		imgHash, err = goimagehash.DifferenceHash(img)
	default:
		// NOTE(review): an unrecognised funcName silently produces an empty
		// hash; kept for backward compatibility, but callers probably want
		// an error here — confirm before changing.
		imgHash = new(goimagehash.ImageHash)
	}
	if err != nil {
		return nil, err
	}
	return imgHash, nil
}
// CompareImgHashes reports which fraction of newHashes has a close match in
// prevHashes.
//
// prevHashes holds the hashes that already exist; newHashes holds the hashes
// of the newly uploaded images being matched against them. A new hash counts
// as matched when its Distance to at least one previous hash is less than or
// equal to threshold; each new hash is counted at most once.
//
// The result is matched / len(newHashes). When newHashes is empty the result
// is 0 rather than the NaN that 0/0 would produce. The first error returned
// by Distance aborts the comparison and is returned with a 0.0 ratio.
func CompareImgHashes(prevHashes, newHashes []*goimagehash.ImageHash, threshold int) (float64, error) {
	if len(newHashes) == 0 {
		return 0.0, nil
	}

	matched := 0
	for _, candidate := range newHashes {
		for _, known := range prevHashes {
			distance, err := candidate.Distance(known)
			if err != nil {
				return 0.0, err
			}
			if distance <= threshold {
				matched++
				// One match is enough; move on to the next new hash.
				break
			}
		}
	}
	return float64(matched) / float64(len(newHashes)), nil
}
視頻相似度:
- 基於 gocv(OpenCV 的 Go 綁定)截取視頻幀,從而以圖片的形式獲取並比較
// CompareVideosWithImg compares two videos through the image hashes of frames
// sampled from each of them.
//
// Both files are hashed with GetVideoFramesWithImg using funcName. The hashes
// of filePath1 are passed to pic.CompareImgHashes as the previously existing
// set and those of filePath2 as the new set, together with threshold.
// A failure while hashing either video is returned with a 0.0 ratio.
func CompareVideosWithImg(filePath1, filePath2 string, funcName string, threshold int) (float64, error) {
	firstHashes, err := GetVideoFramesWithImg(filePath1, funcName)
	if err != nil {
		return 0.0, err
	}

	secondHashes, err := GetVideoFramesWithImg(filePath2, funcName)
	if err != nil {
		return 0.0, err
	}

	ratio, err := pic.CompareImgHashes(firstHashes, secondHashes, threshold)
	return ratio, err
}
// GetVideoFramesWithImg returns the image hashes of frames sampled from the
// video at filePath.
//
// The sampling level comes from getVideoFrameLevel and is handed, with
// funcName, to getVideoFramesWithGoimage. The original note says the "less"
// level defaults to 10 frames; the actual counts are defined outside this
// block, so treat that number as unverified.
func GetVideoFramesWithImg(filePath string, funcName string) ([]*goimagehash.ImageHash, error) {
	level, err := getVideoFrameLevel(filePath)
	if err == nil {
		return getVideoFramesWithGoimage(filePath, funcName, level)
	}
	return nil, err
}
// getVideoFrameLevel maps the size of the file at filePath to a frame
// sampling level.
//
// Sizes below VIDEO_SIZE_10M give VIDEO_FRAME_LEVEL_LESS, below
// VIDEO_SIZE_100M give VIDEO_FRAME_LEVEL_MID, below VIDEO_SIZE_1G give
// VIDEO_FRAME_LEVEL_UPMID, and anything larger gives VIDEO_FRAME_LEVEL_MOST.
// If utils.GetFileSize fails, its error is returned with level 0.
func getVideoFrameLevel(filePath string) (int, error) {
	size, err := utils.GetFileSize(filePath)
	if err != nil {
		return 0, err
	}

	switch {
	case size < VIDEO_SIZE_10M:
		return VIDEO_FRAME_LEVEL_LESS, nil
	case size < VIDEO_SIZE_100M:
		return VIDEO_FRAME_LEVEL_MID, nil
	case size < VIDEO_SIZE_1G:
		return VIDEO_FRAME_LEVEL_UPMID, nil
	default:
		return VIDEO_FRAME_LEVEL_MOST, nil
	}
}
// getVideoFramesWithGoimage samples evenly spaced frames from the video at
// filePath and returns one goimagehash hash per sampled frame.
//
// level indexes VIDEO_LEVEL_COUNT_ARRAY to decide how many frames to capture.
// funName selects the hash algorithm, matching GetImgHash: "a" uses
// goimagehash.AverageHash and "d" uses goimagehash.DifferenceHash; any other
// value yields a zero-value ImageHash for every frame.
//
// If the video cannot be opened the result is nil. If converting or hashing a
// frame fails, the hashes collected so far are returned together with the
// error.
func getVideoFramesWithGoimage(filePath string, funName string, level int) ([]*goimagehash.ImageHash, error) {
	picCount := int(VIDEO_LEVEL_COUNT_ARRAY[level])
	result := make([]*goimagehash.ImageHash, 0)

	// Load the video and release the native capture handle when done.
	vc, err := gocv.VideoCaptureFile(filePath)
	if err != nil {
		return nil, err
	}
	defer vc.Close()

	// Total number of frames in the whole video.
	total := vc.Get(gocv.VideoCaptureFrameCount)

	// A single Mat is reused for every read and freed on return, instead of
	// allocating a new native buffer per frame and never closing it.
	img := gocv.NewMat()
	defer img.Close()

	for j := 0; j < picCount; j++ {
		// Sample j sits at the fraction j/picCount of the video. This equals
		// the time-based (t / duration) * total position but does not divide
		// by the FPS, so a reported FPS of 0 cannot turn it into NaN.
		pos := float64(j) * total / float64(picCount)
		vc.Set(gocv.VideoCapturePosFrames, pos)

		// NOTE(review): the boolean result of Read is not checked, as before;
		// a failed read presumably surfaces through ToImage or the hash
		// call — confirm.
		vc.Read(&img)

		frame, err := img.ToImage()
		if err != nil {
			return result, err
		}

		var imageHash *goimagehash.ImageHash
		switch funName {
		case "a":
			imageHash, err = goimagehash.AverageHash(frame)
		case "d":
			imageHash, err = goimagehash.DifferenceHash(frame)
		default:
			imageHash = new(goimagehash.ImageHash)
		}
		if err != nil {
			return result, err
		}
		result = append(result, imageHash)
	}
	return result, nil
}