【Similarity calculation】Jaccard Distance

[b]Jaccard token distance[/b]
It is simply given by the number of common tokens in two names and the count of total number of tokens in those names.
[img]http://dl.iteye.com/upload/attachment/0077/1301/33139bc6-9680-36b1-b3f9-75b24d5d5c10.png[/img]

[b]Jaccard distance (simplify)[/b]
To reduce the computational complexity
[img]http://dl.iteye.com/upload/attachment/0077/1303/f4c44f96-392f-339b-8e92-491211728a48.png[/img]

[b]Jaccard distance (weighted)[/b]
weighted Jaccard distance is equal to the following expression
[img]http://dl.iteye.com/upload/attachment/0077/1305/9b50eebb-9fb5-3624-bf53-53f8b1d3ad05.png[/img]

then [b]Jaccard similarity function[/b] only need to take last past from the above each function.



import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class JaccardDistance {
public static Map<String, Double> weightMap = new HashMap<String, Double>();

/**
* intersection between two strings
* @param source
* @param target
* @return
*/
public static List<String> intersection(String source, String target) {
List<String> slist = Arrays.asList(source.split(" "));
List<String> tlist = Arrays.asList(target.split(" "));
List<String> intersection = new ArrayList<String>();

for (String s: slist) {
if (tlist.contains(s)) {
if (!intersection.contains(s)) {
intersection.add(s);
}
}
}

return intersection;
}
/**
* J(s,t) = 1 - intersection(s, t).size()) / (s.size() + t.size() - intersection.size()) *
* @param source
* @param target
*/
public static double Jaccard1(String source, String target) {
List<String> slist = Arrays.asList(source.split(" "));
List<String> tlist = Arrays.asList(target.split(" "));
List<String> intersection = intersection(source, target);

return (double) 1 - intersection.size() / (double)(slist.size() + tlist.size() - intersection.size());
}

/**
* J(s,t) = 1 - 2 * intersection(s, t).size()) / (s.size() + t.size())
* @param source
* @param target
* @return
*/
public static double Jaccard2(String source, String target) {
List<String> slist = Arrays.asList(source.split(" "));
List<String> tlist = Arrays.asList(target.split(" "));
List<String> intersection = intersection(source, target);

return (double) 1 - 2 * intersection.size() / (double)(slist.size() + tlist.size());
}

/**
* J(s,t) each token has weight value.
* @param stringList
* @param token
* @return
*/
public static void JaccardWeight(List<String> stringList) {
Map<String, Integer> freqMap = new HashMap<String, Integer>();

for (String string : stringList) {
List<String> slist = Arrays.asList(string.split(" "));
for (String s : slist) {
s = s.trim();
if (freqMap.containsKey(s)) {
freqMap.put(s, freqMap.get(s)+1);
} else {
freqMap.put(s, 1);
}
}
}

for (String key : freqMap.keySet()) {
int freq = freqMap.get(key);
double weight = (double) 1 / (Math.log(freq) + 1);
weightMap.put(key, weight);
}
// return weightMap;
}


public static double Jaccard3(String source, String target) {
List<String> slist = Arrays.asList(source.split(" "));
List<String> tlist = Arrays.asList(target.split(" "));
List<String> intersection = intersection(source, target);

double intersectionWeight = 0;
double sourceWeight = 0;
double targetWeight = 0;
for (String s : intersection) {
intersectionWeight += weightMap.get(s);
}
for (String s : slist) {
sourceWeight += weightMap.get(s);
}
for (String s: tlist) {
targetWeight += weightMap.get(s);
}

return 1 - 2 * intersectionWeight / (sourceWeight + targetWeight);
}


//main
public static void main(String[] args) {
String s1 = "AAE HOLDING";
String s2 = "AAE TECHNOLOGY INTERNATIONAL";
String s3 = "AGRIPA HOLDING";
System.out.println("J1(s1, s2) = " + Jaccard1(s1, s2));
System.out.println("J1(s1, s3) = " + Jaccard1(s1, s3));
System.out.println("J1(s2, s3) = " + Jaccard1(s2, s3));

System.out.println();
System.out.println("J2(s1, s2) = " + Jaccard2(s1, s2));
System.out.println("J2(s1, s3) = " + Jaccard2(s1, s3));
System.out.println("J2(s2, s3) = " + Jaccard2(s2, s3));

System.out.println();
List<String> stringList = new ArrayList<String>();
Collections.addAll(stringList, s1, s2, s3);
JaccardWeight(stringList);
System.out.println(weightMap);
System.out.println("J3(s1, s2) = " + Jaccard3(s1, s2));
System.out.println("J3(s1, s3) = " + Jaccard3(s1, s3));
System.out.println("J3(s2, s3) = " + Jaccard3(s2, s3));
}

}



Output:
J1(s1, s2) = 0.75
J1(s1, s3) = 0.6666666666666667
J1(s2, s3) = 1.0

J2(s1, s2) = 0.6
J2(s1, s3) = 0.5
J2(s2, s3) = 1.0

{AAE=0.5906161091496412, TECHNOLOGY=1.0, AGRIPA=1.0, INTERNATIONAL=1.0, HOLDING=0.5906161091496412}
J3(s1, s2) = 0.6868293431358082
J3(s1, s3) = 0.5738467337473576
J3(s2, s3) = 1.0

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章