From c4605ea13d74c8a585a704d40c6e9f16b9ffca86 Mon Sep 17 00:00:00 2001 From: Bruce-Feldman Date: Mon, 30 Jul 2018 05:20:40 -0400 Subject: [PATCH] Refactor Rabin-Karp (#110) * Simplify Rabin-Karp functionality * Created Rabin Fingerprinting module within util directory * Updated Rabin-Karp search to use rolling hash module Incorporate tests from @dubzzz --- .../rabin-karp/__test__/rabinKarp.test.js | 16 ++- src/algorithms/string/rabin-karp/rabinKarp.js | 101 ++++-------------- src/utils/hash/rolling/Rabin_Fingerprint.js | 51 +++++++++ .../__test__/Rabin_Fingerprint.test.js | 59 ++++++++++ 4 files changed, 139 insertions(+), 88 deletions(-) create mode 100644 src/utils/hash/rolling/Rabin_Fingerprint.js create mode 100644 src/utils/hash/rolling/__test__/Rabin_Fingerprint.test.js diff --git a/src/algorithms/string/rabin-karp/__test__/rabinKarp.test.js b/src/algorithms/string/rabin-karp/__test__/rabinKarp.test.js index 5edefb955e..489149b322 100644 --- a/src/algorithms/string/rabin-karp/__test__/rabinKarp.test.js +++ b/src/algorithms/string/rabin-karp/__test__/rabinKarp.test.js @@ -1,24 +1,20 @@ -import { rabinKarp, hashWord, reHashWord } from '../rabinKarp'; +import rabinKarp from '../rabinKarp'; describe('rabinKarp', () => { - it('should correctly calculates hash and re-hash', () => { - expect(hashWord('a')).toBe(97); - expect(hashWord('b')).toBe(98); - expect(hashWord('abc')).toBe(941094); - expect(hashWord('bcd')).toBe(950601); - expect(reHashWord(hashWord('abc'), 'abc', 'bcd')).toBe(950601); - expect(reHashWord(hashWord('abc'), 'abc', 'bcd')).toBe(hashWord('bcd')); - }); - it('should find substring in a string', () => { expect(rabinKarp('', '')).toBe(0); expect(rabinKarp('a', '')).toBe(0); expect(rabinKarp('a', 'a')).toBe(0); + expect(rabinKarp('ab', 'b')).toBe(1); expect(rabinKarp('abcbcglx', 'abca')).toBe(-1); expect(rabinKarp('abcbcglx', 'bcgl')).toBe(3); expect(rabinKarp('abcxabcdabxabcdabcdabcy', 'abcdabcy')).toBe(15); expect(rabinKarp('abcxabcdabxabcdabcdabcy', 'abcdabca')).toBe(-1); expect(rabinKarp('abcxabcdabxaabcdabcabcdabcdabcy', 'abcdabca')).toBe(12); expect(rabinKarp('abcxabcdabxaabaabaaaabcdabcdabcy', 'aabaabaaa')).toBe(11); + expect(rabinKarp('^ !/\'#\'pp', ' !/\'#\'pp')).toBe(1); + expect(rabinKarp('a\u{ffff}', '\u{ffff}')).toBe(1); + expect(rabinKarp('a\u{10000}', '\u{10000}')).toBe(1); + expect(rabinKarp('\u0000耀\u0000', '耀\u0000')).toBe(1); }); }); diff --git a/src/algorithms/string/rabin-karp/rabinKarp.js b/src/algorithms/string/rabin-karp/rabinKarp.js index e6c544facb..378e5acb12 100644 --- a/src/algorithms/string/rabin-karp/rabinKarp.js +++ b/src/algorithms/string/rabin-karp/rabinKarp.js @@ -1,88 +1,33 @@ -/** - * A prime number used to create - * the hash representation of a word - * - * Bigger the prime number, - * bigger the hash value - */ -const PRIME = 97; - -/** - * Function that creates hash representation of the word. - * - * @param {string} word - * @return {number} - */ -export function hashWord(word) { - let hash = 0; - - for (let charIndex = 0; charIndex < word.length; charIndex += 1) { - hash += word[charIndex].charCodeAt(0) * (PRIME ** charIndex); - } - - return hash; -} - -/** - * Function that creates hash representation of the word - * based on previous word (shifted by one character left) hash value. - * - * Recalculates the hash representation of a word so that it isn't - * necessary to traverse the whole word again - * - * @param {number} prevHash - * @param {string} prevWord - * @param {string} newWord - * @return {number} - */ -export function reHashWord(prevHash, prevWord, newWord) { - const newWordLastIndex = newWord.length - 1; - let newHash = prevHash - prevWord[0].charCodeAt(0); - newHash /= PRIME; - newHash += newWord[newWordLastIndex].charCodeAt(0) * (PRIME ** newWordLastIndex); - - return newHash; -} +import RabinFingerprint from '../../../utils/hash/rolling/Rabin_Fingerprint'; /** * @param {string} text * @param {string} word * @return {number} */ -export function rabinKarp(text, word) { - // Calculate word hash that we will use for comparison with other substring hashes. - const wordHash = hashWord(word); - - let prevSegment = null; - let currentSegmentHash = null; - - // Go through all substring of the text that may match - for (let charIndex = 0; charIndex <= text.length - word.length; charIndex += 1) { - const currentSegment = text.substring(charIndex, charIndex + word.length); - - // Calculate the hash of current substring. - if (currentSegmentHash === null) { - currentSegmentHash = hashWord(currentSegment); - } else { - currentSegmentHash = reHashWord(currentSegmentHash, prevSegment, currentSegment); - } - - prevSegment = currentSegment; - - // Compare the hash of current substring and seeking string. - if (wordHash === currentSegmentHash) { - // In case if hashes match let's check substring char by char. - let numberOfMatches = 0; - - for (let deepCharIndex = 0; deepCharIndex < word.length; deepCharIndex += 1) { - if (word[deepCharIndex] === text[charIndex + deepCharIndex]) { - numberOfMatches += 1; - } - } +export default function rabinKarp(text, word) { + const toNum = function toNum(character) { + const surrogate = character.codePointAt(1); + return ((surrogate === undefined) ? 0 : surrogate) + (character.codePointAt(0) * (2 ** 16)); + }; + const arrEq = (a1, a2) => ((a1.length === a2.length) && a1.every((val, idx) => val === a2[idx])); + + const wordArr = [...word].map(toNum); + const textArr = [...text].map(toNum); + + // The prime generation function could depend on the inputs for collision guarantees. + const hasher = new RabinFingerprint(() => 229); + const cmpVal = hasher.init(wordArr); + + let currHash = hasher.init(textArr.slice(0, wordArr.length)); + if ((currHash === cmpVal) && arrEq(wordArr, textArr.slice(0, wordArr.length))) { + return 0; + } - if (numberOfMatches === word.length) { - return charIndex; - } + for (let i = 0; i < (textArr.length - wordArr.length); i += 1) { + currHash = hasher.roll(textArr[i], textArr[i + wordArr.length]); + if ((currHash === cmpVal) && arrEq(wordArr, textArr.slice(i + 1, i + wordArr.length + 1))) { + return i + 1; } } diff --git a/src/utils/hash/rolling/Rabin_Fingerprint.js b/src/utils/hash/rolling/Rabin_Fingerprint.js new file mode 100644 index 0000000000..b854af0864 --- /dev/null +++ b/src/utils/hash/rolling/Rabin_Fingerprint.js @@ -0,0 +1,51 @@ +/** + * Generates fingerprints using Rabin scheme with x = 2 (for potential compiler optimizations). + * Guaranteed not to over or underflow if function assumptions are met. + */ +export default class RabinFingerprint { + /** + * @param { function() : number } [primeGenerator] + * @assumes Output from any function call is prime less than Number.MAX_SAFE_INTEGER / 2. + */ + constructor(primeGenerator) { + this.prime = primeGenerator(); + } + + /** + * @param { array[number] } [values] + * @returns {number} - The hash value after digesting input. + * @assumes All array elements are non-negative. + * @note First element in array is considered to be oldest value. + */ + init(values) { + this.val = 0; + this.len = values.length; + + for (let i = 0; i < values.length; i += 1) { + this.val = (((this.val * 2) % this.prime) + (values[i] % this.prime)) % this.prime; + } + + return this.val; + } + + /* + * @param {number} [oldValue] + * @param {number} [newValue] + * @returns {number} - The hash value after removing the oldest value & inserting the newest. + * @assumes Instance has already been initialized. + * @assumes oldValue is the oldest value still processed by the hash. + * @assumes newValue is non-negative. + */ + roll(oldValue, newValue) { + let oldVal = oldValue % this.prime; + for (let i = 1; i < this.len; i += 1) { + oldVal = (oldVal * 2) % this.prime; + } + this.val = (this.val + this.prime - (oldVal % this.prime)) % this.prime; + + const newVal = newValue % this.prime; + this.val = (((this.val * 2) % this.prime) + (newVal % this.prime)) % this.prime; + + return this.val; + } +} diff --git a/src/utils/hash/rolling/__test__/Rabin_Fingerprint.test.js b/src/utils/hash/rolling/__test__/Rabin_Fingerprint.test.js new file mode 100644 index 0000000000..d96f124234 --- /dev/null +++ b/src/utils/hash/rolling/__test__/Rabin_Fingerprint.test.js @@ -0,0 +1,59 @@ +import RabinFingerprint from '../Rabin_Fingerprint'; + +describe('Rabin fingerprint Hash Family', () => { + it('should hash deterministically', () => { + const primeVals = [3, 5, 19, 53, 97, 401, 7039, 193939]; + for (let primeIdx = 0; primeIdx < primeVals.length; primeIdx += 1) { + const primeVal = primeVals[primeIdx]; + const hasher = new RabinFingerprint(() => primeVal); + + // Test basic values + expect(hasher.init([])).toEqual(0); + expect(hasher.init([1])).toEqual(1); + + // Test overflow + const largeVal = Number.MAX_SAFE_INTEGER; + expect(hasher.init([primeVal])).toEqual(0); + expect(hasher.init([largeVal])).toEqual(largeVal % primeVal); + + const numLargeVal = 2; // 2 ^ numLargeVal fits in javascript number + const largeValues = new Array(numLargeVal).fill(largeVal); + + const expVal = ((largeVal % primeVal) * ((2 ** numLargeVal) - 1)) % primeVal; + expect(hasher.init(largeValues)).toEqual(expVal); + + // Test using Fermat's little theorem + const fermatValues = new Array(primeVal).fill(primeVal); + const numFermatTests = 100; + for (let i = 0; i < numFermatTests; i += 1) { + const randomValue = Math.floor(Math.random() * largeVal); + fermatValues[0] = randomValue; + expect(hasher.init(fermatValues)).toEqual(randomValue % primeVal); + } + } + }); + + it('should roll appropriately', () => { + const primeVals = [3, 5, 19, 53, 97, 401, 7039, 193939]; + + for (let primeIdx = 0; primeIdx < primeVals.length; primeIdx += 1) { + const primeVal = primeVals[primeIdx]; + const hasher = new RabinFingerprint(() => primeVal); + + // Test basic values + const largeVal = Number.MAX_SAFE_INTEGER; + expect(hasher.init([0])).toEqual(0); + expect(hasher.roll(0, 1)).toEqual(1); + expect(hasher.roll(1, primeVal)).toEqual(0); + expect(hasher.roll(primeVal, largeVal)).toEqual(largeVal % primeVal); + + const numRollTest = 100; + let previousValue = largeVal; + for (let i = 0; i < numRollTest; i += 1) { + const randomVal = Math.floor(Math.random() * largeVal); + expect(hasher.roll(previousValue, randomVal)).toEqual(randomVal % primeVal); + previousValue = randomVal; + } + } + }); +});