Java Program to Implement Min Hash

This is a java program to implement Min Hash. In computer science, MinHash (or the min-wise independent permutations locality sensitive hashing scheme) is a technique for quickly estimating how similar two sets are.

Here is the source code of the Java Program to Implement Min Hash. The Java program is successfully compiled and run on a Windows system. The program output is also shown below.

package com.maixuanviet.datastructures;
 
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;
 
public class MinHash<T>
{
    private int hash[];
    private int numHash;
 
    public MinHash(int numHash)
    {
        this.numHash = numHash;
        hash = new int[numHash];
        Random r = new Random(11);
        for (int i = 0; i < numHash; i++)
        {
            int a = (int) r.nextInt();
            int b = (int) r.nextInt();
            int c = (int) r.nextInt();
            int x = hash(a * b * c, a, b, c);
            hash[i] = x;
        }
    }
 
    public double similarity(Set<T> set1, Set<T> set2)
    {
        int numSets = 2;
        Map<T, boolean[]> bitMap = buildBitMap(set1, set2);
        int[][] minHashValues = initializeHashBuckets(numSets, numHash);
        computeMinHashForSet(set1, 0, minHashValues, bitMap);
        computeMinHashForSet(set2, 1, minHashValues, bitMap);
        return computeSimilarityFromSignatures(minHashValues, numHash);
    }
 
    private static int[][] initializeHashBuckets(int numSets,
            int numHashFunctions)
    {
        int[][] minHashValues = new int[numSets][numHashFunctions];
        for (int i = 0; i < numSets; i++)
        {
            for (int j = 0; j < numHashFunctions; j++)
            {
                minHashValues[i][j] = Integer.MAX_VALUE;
            }
        }
        return minHashValues;
    }
 
    private static double computeSimilarityFromSignatures(
            int[][] minHashValues, int numHashFunctions)
    {
        int identicalMinHashes = 0;
        for (int i = 0; i < numHashFunctions; i++)
        {
            if (minHashValues[0][i] == minHashValues[1][i])
            {
                identicalMinHashes++;
            }
        }
        return (1.0 * identicalMinHashes) / numHashFunctions;
    }
 
    private static int hash(int x, int a, int b, int c)
    {
        int hashValue = (int) ((a * (x >> 4) + b * x + c) & 131071);
        return Math.abs(hashValue);
    }
 
    private void computeMinHashForSet(Set<T> set, int setIndex,
            int[][] minHashValues, Map<T, boolean[]> bitArray)
    {
        int index = 0;
        for (T element : bitArray.keySet())
        {
            /*
             * for every element in the bit array
             */
            for (int i = 0; i < numHash; i++)
            {
                /*
                 * for every hash
                 */
                if (set.contains(element))
                {
                    /*
                     * if the set contains the element
                     */
                    int hindex = hash[index];
                    if (hindex < minHashValues[setIndex][index])
                    {
                        /*
                         * if current hash is smaller than the existing hash in
                         * the slot then replace with the smaller hash value
                         */
                        minHashValues[setIndex][i] = hindex;
                    }
                }
            }
            index++;
        }
    }
 
    public Map<T, boolean[]> buildBitMap(Set<T> set1, Set<T> set2)
    {
        Map<T, boolean[]> bitArray = new HashMap<T, boolean[]>();
        for (T t : set1)
        {
            bitArray.put(t, new boolean[] { true, false });
        }
        for (T t : set2)
        {
            if (bitArray.containsKey(t))
            {
                // item is not present in set1
                bitArray.put(t, new boolean[] { true, true });
            }
            else if (!bitArray.containsKey(t))
            {
                // item is not present in set1
                bitArray.put(t, new boolean[] { false, true });
            }
        }
        return bitArray;
    }
 
    public static void main(String[] args)
    {
        Set<String> set1 = new HashSet<String>();
        set1.add("FRANCISCO");
        set1.add("MISSION");
        set1.add("SAN");
        Set<String> set2 = new HashSet<String>();
        set2.add("FRANCISCO");
        set2.add("MISSION");
        set2.add("SAN");
        set2.add("USA");
        MinHash<String> minHash = new MinHash<String>(set1.size() + set2.size());
        System.out.println("Set1 : " + set1);
        System.out.println("Set2 : " + set2);
        System.out.println("Similarity between two sets: "
                + minHash.similarity(set1, set2));
    }
}

Output:

$ javac MinHash.java
$ java MinHash
 
Set1 : [SAN, MISSION, FRANCISCO]
Set2 : [SAN, USA, MISSION, FRANCISCO]
Similarity between two sets: 1.0

Related posts:

Java Program to Implement String Matching Using Vectors
Comparing Long Values in Java
Java Program to Implement Interpolation Search Algorithm
Quick Intro to Spring Cloud Configuration
Java Program to Find Maximum Element in an Array using Binary Search
Hướng dẫn kết nối cơ sở dữ liệu với Java JDBC
A Guide to Concurrent Queues in Java
Spring MVC Async vs Spring WebFlux
Java – Create a File
Java Program to Find Nearest Neighbor for Dynamic Data Set
Java Program to Implement Repeated Squaring Algorithm
Convert a Map to an Array, List or Set in Java
Error Handling for REST with Spring
Logging in Spring Boot
Introduction to Java 8 Streams
Apache Commons Collections MapUtils
Java Program to Implement Ford–Fulkerson Algorithm
Java Program to Implement Hamiltonian Cycle Algorithm
Java Program to Create a Random Graph Using Random Edge Generation
Hướng dẫn Java Design Pattern – Bridge
Java Program to Implement EnumMap API
So sánh ArrayList và Vector trong Java
Java Program to Find the Connected Components of an UnDirected Graph
Java Program to Find the Shortest Path Between Two Vertices Using Dijkstra’s Algorithm
RegEx for matching Date Pattern in Java
Java Program to Generate N Number of Passwords of Length M Each
Case-Insensitive String Matching in Java
Reactive Flow with MongoDB, Kotlin, and Spring WebFlux
Java Program to Implement Leftist Heap
Loại bỏ các phần tử trùng trong một ArrayList như thế nào?
Java Program to Perform the Unique Factorization of a Given Number
How to Break from Java Stream forEach