Daily Archives: January 10, 2015

Jaccard distance

This is to response Junmin Liu’s Jaccard distance problem.
For example:
two strings: adbc and adffg, result is 2 / 6
(size of intersect of chars in strings) / (size of union of chars in strings). and duplicate character can only count once.

I came up with 2 solutions:
1. O(m+n) time, O(m+n) space with hashset
2. O((m+n)log(m+n)) time, O(1) space by sorting.
Below is my code:

  1. import java.util.Arrays;
  2. import java.util.HashSet;
  3. public class Jaccard {
  4.     public static void main(String[] args) {
  5.         char[] str = {‘a’, ‘f’, ‘g’, ‘e’, ‘a’, ‘c’, ‘a’, ‘d’, ‘f’, ‘b’, ‘q’};
  6.         int pos = 5;
  7.         calJacDistance(str, pos);
  8.         char[] a = {‘a’, ‘b’, ‘c’, ‘d’,’e’,’g’};
  9.         char[] b = {‘a’, ‘d’, ‘f’, ‘f’, ‘g’};
  10.         calJacDistance2(a, b);
  11.     }
  12.     /*
  13.      * Time O(m+n), space O(m+n)
  14.      */
  15.     public static void calJacDistance2(char[] a, char[] b){
  16.         if(a==null||b==null||a.length<1||b.length<1){
  17.             return;    //lenght of b[] is zero.
  18.         }
  19.         HashSet hs1 = new HashSet();
  20.         HashSet hs2 = new HashSet();
  21.         for(int i=0;i
  22.             hs1.add(a[i]);
  23.         }
  24.         for(int i=0;i
  25.             hs2.add(b[i]);
  26.         }
  27.         //put elements from hs2 to hs1. During this process, record how many duplication it is.
  28.         int dup_num = 0;
  29.         for(char ch:hs2){
  30.             if(hs1.contains(ch)){
  31.                 dup_num++;
  32.             }
  33.             else{
  34.                 hs1.add(ch);
  35.             }
  36.         }
  37.         int union_num = hs1.size();
  38.         System.out.println((double)dup_num / (double)union_num);
  39.     }
  40.     /*
  41.      * The idea of this solution is to sort and find the duplicaitons.
  42.      * str[0,…,pos] is the a[], str[pos+1,…,len-1] is b[]
  43.      * time O[(m+n)log(m+n)], space O(1).
  44.      * time mostly is costed by sorting.
  45.      */
  46.     public static void calJacDistance(char[] str, int pos){
  47.         if(pos==str.length-1||pos<0){
  48.             return;    //lenght of b[] is zero.
  49.         }
  50.         Arrays.sort(str, 0, pos+1);
  51.         Arrays.sort(str, pos+1, str.length);
  52.         //below 2 for loop is to deduplicate a[] and b[] separately
  53.         int dup_num = 0;    //to record how many duplication are there in str.
  54.         for(int i=1;i<=pos;i++){
  55.             if(str[i]==str[i-1-dup_num]){
  56.                 dup_num++;
  57.             }
  58.             else{
  59.                 str[i-dup_num] = str[i];
  60.             }
  61.         }
  62.         str[pos+1-dup_num] = str[pos+1];
  63.         for(int i=pos+2;i
  64.             if(str[i]==str[i-1-dup_num]){
  65.                 dup_num++;
  66.             }
  67.             else{
  68.                 str[i-dup_num] = str[i];
  69.             }
  70.         }
  71.         Arrays.sort(str, 0, str.length -dup_num);    //sort the new array
  72.         //duplication number in str[0,…,len-dup_num-1] will be the intersection number
  73.         //str.len-dup_num-dup_num2 will be the union number
  74.         int dup_num2 = 0;
  75.         for(int i=1;i
  76.             if(str[i]==str[i-1-dup_num2]){
  77.                 dup_num2++;
  78.             }
  79.             else{
  80.                 str[i-dup_num2] = str[i];
  81.             }
  82.         }
  83.         int union_num = str.length – dup_num – dup_num2;
  84.         System.out.println((double)dup_num2 / (double)union_num);
  85.     }
  86. }