Loading your tools...
Loading your tools...
Learn to build a powerful text similarity checker using Google's SimHash algorithm. We explain locality-sensitive hashing, 64-bit fingerprints, Hamming distance, and how to implement it all in JavaScript.
Hey Developers! 🔍✨
Ever wondered how Google detects duplicate web pages across 8 billion documents in milliseconds? They don't compare every page character by character – that would take forever. Instead, they use a brilliant algorithm called SimHash.
Today, we're going to build the exact same system. By the end, you'll have a text similarity checker that generates 64-bit fingerprints and calculates similarity in the blink of an eye.
Let's say you want to detect if two articles are plagiarized copies. You might think: "Just hash both and compare!"
// ❌ This doesn't work for similarity
const hash1 = md5("The quick brown fox jumps over the lazy dog");
const hash2 = md5("The quick brown fox leaps over the lazy dog");
// hash1 and hash2 are COMPLETELY different!
Cryptographic hashes (MD5, SHA-256) are designed so that even a 1-character change produces a completely different hash. That's great for security, but useless for detecting similar documents.
SimHash solves this. Similar documents produce similar fingerprints.
SimHash works in 4 phases: (1) tokenize the text into weighted features, (2) hash each feature to a 64-bit value, (3) run a weighted vote per bit position across all feature hashes, and (4) collapse the votes into a single 64-bit fingerprint.
The result? A 64-bit fingerprint that captures the "essence" of the document.
Document A: "The quick brown fox"
↓ SimHash
Fingerprint: 0x8A3F1C2B5D6E7890
Document B: "The quick brown dog" (1 word different)
↓ SimHash
Fingerprint: 0x8A3F1C2B5D6E7891 (only 1 bit different!)
We need to break text into meaningful chunks. We use both unigrams (single words, weight 1) and bigrams (adjacent word pairs, weight 2):
/**
 * Break text into weighted features for SimHash.
 * @param {string} text - Raw input text.
 * @returns {Map<string, number>} Map of feature -> accumulated weight
 *   (unigrams add 1 per occurrence, bigrams add 2 per occurrence).
 */
export function tokenize(text) {
  const features = new Map();
  // Normalize: lowercase, strip punctuation to spaces, collapse whitespace.
  const normalized = text
    .toLowerCase()
    .replace(/[^\w\s]/g, " ")
    .replace(/\s+/g, " ")
    .trim();
  // Guard: nothing left after normalization — return an empty map
  // (keeps this version consistent with the consolidated lib version).
  if (!normalized) return features;
  // Drop single-character tokens; they carry almost no signal.
  const words = normalized.split(" ").filter((w) => w.length >= 2);
  // Add unigrams (weight 1)
  for (const word of words) {
    features.set(word, (features.get(word) || 0) + 1);
  }
  // Add bigrams (weight 2 — they capture word order / context)
  for (let i = 0; i < words.length - 1; i++) {
    const bigram = `${words[i]} ${words[i + 1]}`;
    features.set(bigram, (features.get(bigram) || 0) + 2);
  }
  return features; // Map<feature, weight>
}
Example Output:
tokenize("the quick brown fox")
// Map {
// "the" => 1,
// "quick" => 1,
// "brown" => 1,
// "fox" => 1,
// "the quick" => 2,
// "quick brown" => 2,
// "brown fox" => 2
// }
We need to convert each feature string into a 64-bit number. FNV-1a (Fowler-Noll-Vo) is perfect: fast and good distribution.
const HASH_BITS = 64;

/**
 * FNV-1a hash of a string, folded into 64 bits.
 * Note: operates on UTF-16 code units (charCodeAt), not bytes, so
 * results match the reference byte-oriented FNV only for ASCII input —
 * which is fine here, since we only need a stable, well-spread hash.
 * @param {string} str
 * @returns {bigint} unsigned 64-bit fingerprint
 */
function fnv1aHash(str) {
  const FNV_PRIME = BigInt("0x100000001b3");
  const FNV_OFFSET = BigInt("0xcbf29ce484222325");
  const MASK_64 = BigInt("0xFFFFFFFFFFFFFFFF");
  let acc = FNV_OFFSET;
  for (let idx = 0; idx < str.length; idx += 1) {
    // XOR the code unit in first, then multiply (that order is the "1a" variant).
    acc = ((acc ^ BigInt(str.charCodeAt(idx))) * FNV_PRIME) & MASK_64;
  }
  return acc; // 64-bit BigInt
}
Why BigInt? JavaScript numbers are 64-bit floats that can only represent integers exactly up to 53 bits. For precise 64-bit bitwise operations, we need BigInt.
This is where the magic happens. We create a 64-element array (one vote counter per bit position). For each feature: if bit i of its hash is 1, we add the feature's weight to counter i; if it is 0, we subtract the weight.
/**
 * Compute the 64-bit SimHash fingerprint of a text.
 * Each feature's hash votes per bit position (+weight if the bit is 1,
 * -weight if it is 0); the fingerprint keeps the majority winner.
 * @param {string} text
 * @returns {bigint} 64-bit fingerprint (0n for empty/feature-less input)
 */
export function simhash(text) {
  const features = tokenize(text);
  if (features.size === 0) return BigInt(0);
  // One signed vote counter per bit position (0..63).
  const votes = new Array(HASH_BITS).fill(0);
  for (const [feature, weight] of features) {
    const featureHash = fnv1aHash(feature);
    for (let bitPos = 0; bitPos < HASH_BITS; bitPos++) {
      const isSet =
        ((featureHash >> BigInt(bitPos)) & BigInt(1)) === BigInt(1);
      votes[bitPos] += isSet ? weight : -weight;
    }
  }
  // Majority rule: a strictly positive total sets the bit.
  let fingerprint = BigInt(0);
  for (let bitPos = 0; bitPos < HASH_BITS; bitPos++) {
    if (votes[bitPos] > 0) {
      fingerprint |= BigInt(1) << BigInt(bitPos);
    }
  }
  return fingerprint;
}
The Intuition: If many features have bit 7 set to 1, the final fingerprint will also have bit 7 = 1. The majority wins!
To compare two fingerprints, we count how many bits are different. This is called Hamming distance.
/**
 * Count the number of differing bits between two fingerprints.
 * @param {bigint} hash1
 * @param {bigint} hash2
 * @returns {number} Hamming distance, 0..64 for 64-bit fingerprints.
 */
export function hammingDistance(hash1, hash2) {
  // XOR leaves a 1 exactly where the two fingerprints disagree.
  let diff = hash1 ^ hash2;
  let count = 0;
  // Kernighan's trick: diff & (diff - 1) clears the lowest set bit,
  // so the loop runs once per differing bit.
  while (diff > BigInt(0)) {
    diff &= diff - BigInt(1);
    count++;
  }
  return count;
}
/**
 * Compare two texts and express their SimHash closeness as a percentage.
 * @param {string} text1
 * @param {string} text2
 * @returns {{similarity: number, fingerprint1: string,
 *            fingerprint2: string, hammingDistance: number}}
 *   similarity is 0..100; fingerprints are 16-digit uppercase hex.
 */
export function calculateSimilarity(text1, text2) {
  const fp1 = simhash(text1);
  const fp2 = simhash(text2);
  const distance = hammingDistance(fp1, fp2);
  // Map distance 0..64 onto a 100..0 percentage scale, clamped.
  const rawScore = Math.round(((HASH_BITS - distance) / HASH_BITS) * 100);
  const similarity = Math.min(100, Math.max(0, rawScore));
  // 64 bits => exactly 16 hex digits, zero-padded.
  const toHex = (fp) => fp.toString(16).padStart(16, "0").toUpperCase();
  return {
    similarity,
    fingerprint1: toHex(fp1),
    fingerprint2: toHex(fp2),
    hammingDistance: distance,
  };
}
Interpretation Guide:
| Hamming Distance | Similarity | Meaning |
|---|---|---|
| 0-3 | 95-100% | Near-identical (likely duplicates) |
| 4-10 | 85-94% | Very similar content |
| 11-20 | 69-84% | Related content |
| 21+ | Less than 69% | Different content |
Now let's build the complete UI with visual feedback.
"use client"
import { useState } from "react"
import { calculateSimilarity, getTextStats } from "@/lib/simhash"
export function TextSimilarityChecker() {
const [textA, setTextA] = useState("")
const [textB, setTextB] = useState("")
const [result, setResult] = useState(null)
const compare = () => {
if (!textA.trim() || !textB.trim()) return
const comparisonResult = calculateSimilarity(textA, textB)
setResult(comparisonResult)
}
const getSimilarityColor = (similarity) => {
if (similarity >= 80) return "text-green-500"
if (similarity >= 50) return "text-yellow-500"
return "text-red-500"
}
return (
<div className="space-y-6">
{/* Text Input A */}
<div>
<label className="font-semibold">Text A</label>
<textarea
placeholder="Enter first text..."
value={textA}
onChange={(e) => setTextA(e.target.value)}
className="w-full h-32 p-3 border rounded-lg font-mono"
/>
</div>
{/* Text Input B */}
<div>
<label className="font-semibold">Text B</label>
<textarea
placeholder="Enter second text..."
value={textB}
onChange={(e) => setTextB(e.target.value)}
className="w-full h-32 p-3 border rounded-lg font-mono"
/>
</div>
<button onClick={compare} className="w-full py-4 bg-blue-600 text-white rounded-lg">
Check Similarity
</button>
{/* Results */}
{result && (
<div className="p-6 bg-slate-100 rounded-lg text-center">
<div className="text-sm text-gray-500">Similarity Score</div>
<div className={`text-5xl font-bold ${getSimilarityColor(result.similarity)}`}>
{result.similarity}%
</div>
<div className="mt-4 grid grid-cols-2 gap-4 text-sm">
<div>
<strong>Fingerprint A:</strong>
<code className="block bg-white p-2 rounded mt-1">
0x{result.fingerprint1}
</code>
</div>
<div>
<strong>Fingerprint B:</strong>
<code className="block bg-white p-2 rounded mt-1">
0x{result.fingerprint2}
</code>
</div>
</div>
<div className="mt-4 text-sm">
<strong>Hamming Distance:</strong> {result.hammingDistance} / 64 bits
</div>
</div>
)}
</div>
)
}
The genius of SimHash is probability theory. When two documents share many features, those shared features cast identical votes, so most bit positions reach the same majority decision — and the two fingerprints agree in most of their 64 bits.
Google's research (Manku, Jarvis & Sarma, 2007) showed that for a corpus of 8 billion pages, documents whose fingerprints had a Hamming distance of ≤3 were overwhelmingly near-duplicates. That's the power of locality-sensitive hashing!
Our implementation is fast, but here are tips for production:
// 1. Memoize fingerprints for repeated comparisons
const fingerprintCache = new Map();

/**
 * Return the SimHash fingerprint for `text`, computing it at most once.
 * The FULL text is the cache key: keying on a 100-char prefix (as a
 * naive version might) would return the WRONG fingerprint for any two
 * texts that share the same opening characters. Map lookups hash the
 * key once, so long keys are still fast.
 * NOTE(review): the cache grows without bound — add an eviction policy
 * (e.g. LRU) if the set of texts is unbounded.
 * @param {string} text
 * @returns {bigint} 64-bit SimHash fingerprint
 */
function getCachedFingerprint(text) {
  if (!fingerprintCache.has(text)) {
    fingerprintCache.set(text, simhash(text));
  }
  return fingerprintCache.get(text);
}
// 2. Web Workers for large documents
// Move simhash() to a worker to avoid blocking UI
// 3. Chunking for huge documents
/**
 * SimHash a large document by fingerprinting fixed-size chunks and
 * combining them with a per-bit majority vote: each chunk's bit votes
 * +1/-1 and the combined bit is set when the total is positive — the
 * same voting rule simhash itself uses for features.
 * For a single-chunk text this returns exactly simhash(text), so the
 * previous `return chunks[0]` behavior is preserved for short inputs;
 * empty input now yields 0n instead of undefined.
 * @param {string} text
 * @param {number} [chunkSize=5000] characters per chunk
 * @returns {bigint} combined 64-bit fingerprint
 */
function simhashLargeDoc(text, chunkSize = 5000) {
  const chunkPrints = [];
  for (let i = 0; i < text.length; i += chunkSize) {
    chunkPrints.push(simhash(text.slice(i, i + chunkSize)));
  }
  if (chunkPrints.length === 0) return BigInt(0); // empty document
  // Per-bit vote across chunk fingerprints (64 bits).
  const votes = new Array(64).fill(0);
  for (const print of chunkPrints) {
    for (let bit = 0; bit < 64; bit++) {
      const isSet = ((print >> BigInt(bit)) & BigInt(1)) === BigInt(1);
      votes[bit] += isSet ? 1 : -1;
    }
  }
  let combined = BigInt(0);
  for (let bit = 0; bit < 64; bit++) {
    if (votes[bit] > 0) combined |= BigInt(1) << BigInt(bit);
  }
  return combined;
}
Here's everything in one file:
// lib/simhash.ts
const HASH_BITS = 64;

/**
 * FNV-1a over the string's UTF-16 code units, masked to 64 bits.
 * Fast and well-distributed — exactly what SimHash needs.
 * @param {string} str
 * @returns {bigint} unsigned 64-bit hash
 */
function fnv1aHash(str) {
  const FNV_PRIME = BigInt("0x100000001b3");
  const FNV_OFFSET = BigInt("0xcbf29ce484222325");
  const MASK_64 = BigInt("0xFFFFFFFFFFFFFFFF");
  let acc = FNV_OFFSET;
  for (let pos = 0; pos < str.length; pos += 1) {
    // "1a" variant: XOR the input first, then multiply by the prime.
    acc = ((acc ^ BigInt(str.charCodeAt(pos))) * FNV_PRIME) & MASK_64;
  }
  return acc;
}
/**
 * Turn text into a weighted feature map for SimHash:
 * unigrams contribute weight 1 per occurrence, bigrams weight 2.
 * @param {string} text
 * @returns {Map<string, number>} feature -> accumulated weight
 */
export function tokenize(text) {
  const features = new Map();
  // Lowercase, replace punctuation with spaces, collapse whitespace runs.
  const cleaned = text.toLowerCase().replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
  if (!cleaned) return features;
  // Accumulate weight for a feature key.
  const bump = (key, weight) => features.set(key, (features.get(key) || 0) + weight);
  // Single-character tokens are dropped — too little signal.
  const words = cleaned.split(" ").filter((w) => w.length >= 2);
  words.forEach((word) => bump(word, 1));
  // Bigrams carry extra weight because word order adds context.
  for (let i = 1; i < words.length; i++) {
    bump(`${words[i - 1]} ${words[i]}`, 2);
  }
  return features;
}
/**
 * 64-bit SimHash fingerprint: every feature's hash casts a weighted
 * vote per bit position; the majority decides each fingerprint bit.
 * @param {string} text
 * @returns {bigint} fingerprint (0n when the text yields no features)
 */
export function simhash(text) {
  const features = tokenize(text);
  if (features.size === 0) return BigInt(0);
  const ONE = BigInt(1);
  // Signed vote totals, one per bit position.
  const votes = new Array(HASH_BITS).fill(0);
  features.forEach((weight, feature) => {
    const h = fnv1aHash(feature);
    for (let b = 0; b < HASH_BITS; b++) {
      votes[b] += ((h >> BigInt(b)) & ONE) === ONE ? weight : -weight;
    }
  });
  // Strictly positive total => bit set in the final fingerprint.
  return votes.reduce(
    (fp, vote, b) => (vote > 0 ? fp | (ONE << BigInt(b)) : fp),
    BigInt(0)
  );
}
/**
 * Number of bit positions where two fingerprints differ.
 * @param {bigint} hash1
 * @param {bigint} hash2
 * @returns {number} 0..64 for 64-bit fingerprints
 */
export function hammingDistance(hash1, hash2) {
  // Disagreeing bits become 1s under XOR; count them.
  let diff = hash1 ^ hash2;
  let count = 0;
  // Clear the lowest set bit each pass (Kernighan's popcount),
  // so the loop body runs once per differing bit.
  while (diff > BigInt(0)) {
    diff &= diff - BigInt(1);
    count++;
  }
  return count;
}
/**
 * Compare two texts via SimHash and report similarity as a percentage.
 * @param {string} text1
 * @param {string} text2
 * @returns {{similarity: number, fingerprint1: string,
 *            fingerprint2: string, hammingDistance: number}}
 *   similarity is clamped to 0..100; fingerprints are 16 hex digits.
 */
export function calculateSimilarity(text1, text2) {
  const fp1 = simhash(text1);
  const fp2 = simhash(text2);
  const distance = hammingDistance(fp1, fp2);
  // Fewer differing bits => higher score: (64 - d) / 64 as a percent.
  const rawScore = Math.round(((HASH_BITS - distance) / HASH_BITS) * 100);
  const asHex = (fp) => fp.toString(16).padStart(16, "0").toUpperCase();
  return {
    similarity: Math.max(0, Math.min(100, rawScore)),
    fingerprint1: asHex(fp1),
    fingerprint2: asHex(fp2),
    hammingDistance: distance,
  };
}
/**
 * Lightweight stats about a text for display purposes.
 * @param {string} text
 * @returns {{characters: number, words: number, features: number}}
 *   characters = raw length, words = whitespace-delimited word count
 *   (0 for empty/blank text), features = distinct SimHash features.
 */
export function getTextStats(text) {
  const wordList = text.trim().split(/\s+/).filter((w) => w.length > 0);
  return {
    characters: text.length,
    words: wordList.length,
    features: tokenize(text).size,
  };
}
Now that you've built this, here's what you can do: deduplicate articles in a CMS, detect plagiarized submissions, cluster similar documents, or index fingerprints for fast near-duplicate search.
You've just implemented the same algorithm Google uses to crawl and deduplicate the entire web! SimHash is a beautiful example of how clever math can solve seemingly impossible problems.
Key Takeaways:
Developer Tools & Resource Experts
FastTools is dedicated to curating high-quality content and resources that empower developers. With nearly 5 years of hands-on development experience, our team rigorously evaluates every tool and API we recommend, ensuring you get only the most reliable and effective solutions for your projects.