This is a java program to implement Min Hash. In computer science, MinHash (or the min-wise independent permutations locality sensitive hashing scheme) is a technique for quickly estimating how similar two sets are.
Here is the source code of the Java Program to Implement Min Hash. The Java program is successfully compiled and run on a Windows system. The program output is also shown below.
package com.maixuanviet.datastructures;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;
public class MinHash<T>
{
private int hash[];
private int numHash;
public MinHash(int numHash)
{
this.numHash = numHash;
hash = new int[numHash];
Random r = new Random(11);
for (int i = 0; i < numHash; i++)
{
int a = (int) r.nextInt();
int b = (int) r.nextInt();
int c = (int) r.nextInt();
int x = hash(a * b * c, a, b, c);
hash[i] = x;
}
}
public double similarity(Set<T> set1, Set<T> set2)
{
int numSets = 2;
Map<T, boolean[]> bitMap = buildBitMap(set1, set2);
int[][] minHashValues = initializeHashBuckets(numSets, numHash);
computeMinHashForSet(set1, 0, minHashValues, bitMap);
computeMinHashForSet(set2, 1, minHashValues, bitMap);
return computeSimilarityFromSignatures(minHashValues, numHash);
}
private static int[][] initializeHashBuckets(int numSets,
int numHashFunctions)
{
int[][] minHashValues = new int[numSets][numHashFunctions];
for (int i = 0; i < numSets; i++)
{
for (int j = 0; j < numHashFunctions; j++)
{
minHashValues[i][j] = Integer.MAX_VALUE;
}
}
return minHashValues;
}
private static double computeSimilarityFromSignatures(
int[][] minHashValues, int numHashFunctions)
{
int identicalMinHashes = 0;
for (int i = 0; i < numHashFunctions; i++)
{
if (minHashValues[0][i] == minHashValues[1][i])
{
identicalMinHashes++;
}
}
return (1.0 * identicalMinHashes) / numHashFunctions;
}
private static int hash(int x, int a, int b, int c)
{
int hashValue = (int) ((a * (x >> 4) + b * x + c) & 131071);
return Math.abs(hashValue);
}
private void computeMinHashForSet(Set<T> set, int setIndex,
int[][] minHashValues, Map<T, boolean[]> bitArray)
{
int index = 0;
for (T element : bitArray.keySet())
{
/*
* for every element in the bit array
*/
for (int i = 0; i < numHash; i++)
{
/*
* for every hash
*/
if (set.contains(element))
{
/*
* if the set contains the element
*/
int hindex = hash[index];
if (hindex < minHashValues[setIndex][index])
{
/*
* if current hash is smaller than the existing hash in
* the slot then replace with the smaller hash value
*/
minHashValues[setIndex][i] = hindex;
}
}
}
index++;
}
}
public Map<T, boolean[]> buildBitMap(Set<T> set1, Set<T> set2)
{
Map<T, boolean[]> bitArray = new HashMap<T, boolean[]>();
for (T t : set1)
{
bitArray.put(t, new boolean[] { true, false });
}
for (T t : set2)
{
if (bitArray.containsKey(t))
{
// item is not present in set1
bitArray.put(t, new boolean[] { true, true });
}
else if (!bitArray.containsKey(t))
{
// item is not present in set1
bitArray.put(t, new boolean[] { false, true });
}
}
return bitArray;
}
public static void main(String[] args)
{
Set<String> set1 = new HashSet<String>();
set1.add("FRANCISCO");
set1.add("MISSION");
set1.add("SAN");
Set<String> set2 = new HashSet<String>();
set2.add("FRANCISCO");
set2.add("MISSION");
set2.add("SAN");
set2.add("USA");
MinHash<String> minHash = new MinHash<String>(set1.size() + set2.size());
System.out.println("Set1 : " + set1);
System.out.println("Set2 : " + set2);
System.out.println("Similarity between two sets: "
+ minHash.similarity(set1, set2));
}
}
Output:
$ javac MinHash.java $ java MinHash Set1 : [SAN, MISSION, FRANCISCO] Set2 : [SAN, USA, MISSION, FRANCISCO] Similarity between two sets: 1.0
Related posts:
HandlerAdapters in Spring MVC
Feign – Tạo ứng dụng Java RESTful Client
StringBuilder vs StringBuffer in Java
Function trong Java 8
Lớp HashMap trong Java
How to Use if/else Logic in Java 8 Streams
Spring Boot - Application Properties
Các nguyên lý thiết kế hướng đối tượng – SOLID
Hashtable trong java
Java – Write a Reader to File
Guide to the Synchronized Keyword in Java
Spring Security 5 for Reactive Applications
So sánh HashMap và HashSet trong Java
Java Program to Find Nearest Neighbor Using Linear Search
Giới thiệu JDBC Connection Pool
Java Program to Implement Solovay Strassen Primality Test Algorithm
Sorting in Java
Sử dụng Fork/Join Framework với ForkJoinPool trong Java
The Difference Between map() and flatMap()
Java Program to Implement Hash Tree
Hướng dẫn Java Design Pattern – Command
Add Multiple Items to an Java ArrayList
OAuth 2.0 Resource Server With Spring Security 5
CharSequence vs. String in Java
Comparing getPath(), getAbsolutePath(), and getCanonicalPath() in Java
Upload and Display Excel Files with Spring MVC
Spring MVC Content Negotiation
Spring Boot - Cloud Configuration Client
Daemon Threads in Java
Jackson – Change Name of Field
Java Program to Implement Bit Array
Serialization và Deserialization trong java