This is a java program to implement Min Hash. In computer science, MinHash (or the min-wise independent permutations locality sensitive hashing scheme) is a technique for quickly estimating how similar two sets are.
Here is the source code of the Java Program to Implement Min Hash. The Java program is successfully compiled and run on a Windows system. The program output is also shown below.
package com.maixuanviet.datastructures;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;
public class MinHash<T>
{
private int hash[];
private int numHash;
public MinHash(int numHash)
{
this.numHash = numHash;
hash = new int[numHash];
Random r = new Random(11);
for (int i = 0; i < numHash; i++)
{
int a = (int) r.nextInt();
int b = (int) r.nextInt();
int c = (int) r.nextInt();
int x = hash(a * b * c, a, b, c);
hash[i] = x;
}
}
public double similarity(Set<T> set1, Set<T> set2)
{
int numSets = 2;
Map<T, boolean[]> bitMap = buildBitMap(set1, set2);
int[][] minHashValues = initializeHashBuckets(numSets, numHash);
computeMinHashForSet(set1, 0, minHashValues, bitMap);
computeMinHashForSet(set2, 1, minHashValues, bitMap);
return computeSimilarityFromSignatures(minHashValues, numHash);
}
private static int[][] initializeHashBuckets(int numSets,
int numHashFunctions)
{
int[][] minHashValues = new int[numSets][numHashFunctions];
for (int i = 0; i < numSets; i++)
{
for (int j = 0; j < numHashFunctions; j++)
{
minHashValues[i][j] = Integer.MAX_VALUE;
}
}
return minHashValues;
}
private static double computeSimilarityFromSignatures(
int[][] minHashValues, int numHashFunctions)
{
int identicalMinHashes = 0;
for (int i = 0; i < numHashFunctions; i++)
{
if (minHashValues[0][i] == minHashValues[1][i])
{
identicalMinHashes++;
}
}
return (1.0 * identicalMinHashes) / numHashFunctions;
}
private static int hash(int x, int a, int b, int c)
{
int hashValue = (int) ((a * (x >> 4) + b * x + c) & 131071);
return Math.abs(hashValue);
}
private void computeMinHashForSet(Set<T> set, int setIndex,
int[][] minHashValues, Map<T, boolean[]> bitArray)
{
int index = 0;
for (T element : bitArray.keySet())
{
/*
* for every element in the bit array
*/
for (int i = 0; i < numHash; i++)
{
/*
* for every hash
*/
if (set.contains(element))
{
/*
* if the set contains the element
*/
int hindex = hash[index];
if (hindex < minHashValues[setIndex][index])
{
/*
* if current hash is smaller than the existing hash in
* the slot then replace with the smaller hash value
*/
minHashValues[setIndex][i] = hindex;
}
}
}
index++;
}
}
public Map<T, boolean[]> buildBitMap(Set<T> set1, Set<T> set2)
{
Map<T, boolean[]> bitArray = new HashMap<T, boolean[]>();
for (T t : set1)
{
bitArray.put(t, new boolean[] { true, false });
}
for (T t : set2)
{
if (bitArray.containsKey(t))
{
// item is not present in set1
bitArray.put(t, new boolean[] { true, true });
}
else if (!bitArray.containsKey(t))
{
// item is not present in set1
bitArray.put(t, new boolean[] { false, true });
}
}
return bitArray;
}
public static void main(String[] args)
{
Set<String> set1 = new HashSet<String>();
set1.add("FRANCISCO");
set1.add("MISSION");
set1.add("SAN");
Set<String> set2 = new HashSet<String>();
set2.add("FRANCISCO");
set2.add("MISSION");
set2.add("SAN");
set2.add("USA");
MinHash<String> minHash = new MinHash<String>(set1.size() + set2.size());
System.out.println("Set1 : " + set1);
System.out.println("Set2 : " + set2);
System.out.println("Similarity between two sets: "
+ minHash.similarity(set1, set2));
}
}
Output:
$ javac MinHash.java $ java MinHash Set1 : [SAN, MISSION, FRANCISCO] Set2 : [SAN, USA, MISSION, FRANCISCO] Similarity between two sets: 1.0
Related posts:
Initialize a HashMap in Java
Hướng dẫn Java Design Pattern – Mediator
A Guide to TreeMap in Java
Hướng dẫn sử dụng lớp Console trong java
Java Program to Implement Doubly Linked List
Java Program to Test Using DFS Whether a Directed Graph is Strongly Connected or Not
Vòng lặp for, while, do-while trong Java
How to Define a Spring Boot Filter?
Java Program to Find Number of Articulation points in a Graph
Spring Cloud – Tracing Services with Zipkin
Java Program to Implement Treap
Guide to Selenium with JUnit / TestNG
Spring Boot - Quick Start
Java Program to Implement the Alexander Bogomolny’s UnOrdered Permutation Algorithm for Elements Fro...
Spring Data JPA @Modifying Annotation
Lập trình hướng đối tượng (OOPs) trong java
New Features in Java 8
Java Program to Implement Trie
Các nguyên lý thiết kế hướng đối tượng – SOLID
Java Program to Implement Bellman-Ford Algorithm
Rest Web service: Filter và Interceptor với Jersey 2.x (P2)
A Guide to WatchService in Java NIO2
Spring Boot - Hystrix
HandlerAdapters in Spring MVC
Deploy a Spring Boot WAR into a Tomcat Server
What is Thread-Safety and How to Achieve it?
OAuth2 Remember Me with Refresh Token
Intro to the Jackson ObjectMapper
Guide to Character Encoding
The “final” Keyword in Java
Java – Write a Reader to File
Giới thiệu Aspect Oriented Programming (AOP)