Improve MurmurHash in a number of ways:

Use Swift's overflow operators (&* and &+) rather than
multipliedReportingOverflow etc.

Use UInt32 for the hash values.  This matches how MurmurHash3 is generally
defined (e.g. on Wikipedia).

Add support for hashing Strings: the String is encoded as UTF-8, the bytes
are read as little-endian UInt32 values, and those values are hashed.

Add a test set, using test patterns from Ian Boyd (public domain).
This commit is contained in:
Ewan Mellor 2017-09-27 12:47:17 -07:00
parent b8fd54780a
commit 70db334a58
No known key found for this signature in database
GPG Key ID: 7CE1C6BC9EC8645D
20 changed files with 168 additions and 91 deletions

View File

@ -150,13 +150,12 @@ public class ATNConfig: Hashable, CustomStringConvertible {
///
public var hashValue: Int {
var hashCode: Int = MurmurHash.initialize(7)
var hashCode = MurmurHash.initialize(7)
hashCode = MurmurHash.update(hashCode, state.stateNumber)
hashCode = MurmurHash.update(hashCode, alt)
hashCode = MurmurHash.update(hashCode, context)
hashCode = MurmurHash.update(hashCode, semanticContext)
hashCode = MurmurHash.finish(hashCode, 4)
return hashCode
return MurmurHash.finish(hashCode, 4)
}

View File

@ -313,15 +313,12 @@ public class ATNConfigSet: Hashable, CustomStringConvertible {
/// return configLookup.toArray(a);
///
private final func configHash(_ stateNumber: Int,_ context: PredictionContext?) -> Int{
var hashCode: Int = MurmurHash.initialize(7)
var hashCode = MurmurHash.initialize(7)
hashCode = MurmurHash.update(hashCode, stateNumber)
hashCode = MurmurHash.update(hashCode, context)
hashCode = MurmurHash.finish(hashCode, 2)
return hashCode
return MurmurHash.finish(hashCode, 2)
}
public final func getConflictingAltSubsets() throws -> Array<BitSet> {
let length = configs.count
let configToAlts: HashMap<Int, BitSet> = HashMap<Int, BitSet>(count: length)

View File

@ -77,15 +77,14 @@ public class LexerATNConfig: ATNConfig {
}*/
public var hashValue: Int {
var hashCode: Int = MurmurHash.initialize(7)
var hashCode = MurmurHash.initialize(7)
hashCode = MurmurHash.update(hashCode, state.stateNumber)
hashCode = MurmurHash.update(hashCode, alt)
hashCode = MurmurHash.update(hashCode, context)
hashCode = MurmurHash.update(hashCode, semanticContext)
hashCode = MurmurHash.update(hashCode, passedThroughNonGreedyDecision ? 1 : 0)
hashCode = MurmurHash.update(hashCode, lexerActionExecutor)
hashCode = MurmurHash.finish(hashCode, 6)
return hashCode
return MurmurHash.finish(hashCode, 6)
}

View File

@ -36,7 +36,7 @@ public class LexerActionExecutor: Hashable {
public init(_ lexerActions: [LexerAction]) {
self.lexerActions = lexerActions
var hash: Int = MurmurHash.initialize()
var hash = MurmurHash.initialize()
for lexerAction: LexerAction in lexerActions {
hash = MurmurHash.update(hash, lexerAction)
}

View File

@ -65,7 +65,7 @@ public final class LexerChannelAction: LexerAction, CustomStringConvertible {
override
public var hashValue: Int {
var hash: Int = MurmurHash.initialize()
var hash = MurmurHash.initialize()
hash = MurmurHash.update(hash, getActionType().rawValue)
hash = MurmurHash.update(hash, channel)
return MurmurHash.finish(hash, 2)

View File

@ -94,7 +94,7 @@ public final class LexerCustomAction: LexerAction {
override
public var hashValue: Int {
var hash: Int = MurmurHash.initialize()
var hash = MurmurHash.initialize()
hash = MurmurHash.update(hash, getActionType().rawValue)
hash = MurmurHash.update(hash, ruleIndex)
hash = MurmurHash.update(hash, actionIndex)

View File

@ -97,7 +97,7 @@ public final class LexerIndexedCustomAction: LexerAction {
public override var hashValue: Int {
var hash: Int = MurmurHash.initialize()
var hash = MurmurHash.initialize()
hash = MurmurHash.update(hash, offset)
hash = MurmurHash.update(hash, action)
return MurmurHash.finish(hash, 2)

View File

@ -64,7 +64,7 @@ public final class LexerModeAction: LexerAction, CustomStringConvertible {
}
override
public var hashValue: Int {
var hash: Int = MurmurHash.initialize()
var hash = MurmurHash.initialize()
hash = MurmurHash.update(hash, getActionType().rawValue)
hash = MurmurHash.update(hash, mode)
return MurmurHash.finish(hash, 2)

View File

@ -58,7 +58,7 @@ public final class LexerMoreAction: LexerAction, CustomStringConvertible {
override
public var hashValue: Int {
var hash: Int = MurmurHash.initialize()
var hash = MurmurHash.initialize()
hash = MurmurHash.update(hash, getActionType().rawValue)
return MurmurHash.finish(hash, 1)

View File

@ -59,7 +59,7 @@ public final class LexerPopModeAction: LexerAction, CustomStringConvertible {
override
public var hashValue: Int {
var hash: Int = MurmurHash.initialize()
var hash = MurmurHash.initialize()
hash = MurmurHash.update(hash, getActionType().rawValue)
return MurmurHash.finish(hash, 1)

View File

@ -66,7 +66,7 @@ public final class LexerPushModeAction: LexerAction, CustomStringConvertible {
override
public var hashValue: Int {
var hash: Int = MurmurHash.initialize()
var hash = MurmurHash.initialize()
hash = MurmurHash.update(hash, getActionType().rawValue)
hash = MurmurHash.update(hash, mode)
return MurmurHash.finish(hash, 2)

View File

@ -58,7 +58,7 @@ public final class LexerSkipAction: LexerAction, CustomStringConvertible {
override
public var hashValue: Int {
var hash: Int = MurmurHash.initialize()
var hash = MurmurHash.initialize()
hash = MurmurHash.update(hash, getActionType().rawValue)
return MurmurHash.finish(hash, 1)
}

View File

@ -64,7 +64,7 @@ public class LexerTypeAction: LexerAction, CustomStringConvertible {
override
public var hashValue: Int {
var hash: Int = MurmurHash.initialize()
var hash = MurmurHash.initialize()
hash = MurmurHash.update(hash, getActionType().rawValue)
hash = MurmurHash.update(hash, type)
return MurmurHash.finish(hash, 2)

View File

@ -21,7 +21,7 @@ public class PredictionContext: Hashable, CustomStringConvertible {
///
public static let EMPTY_RETURN_STATE: Int = Int(Int32.max)
private static let INITIAL_HASH: Int = 1
private static let INITIAL_HASH = UInt32(1)
public static var globalNodeCount: Int = 0
public final let id: Int = {
@ -118,21 +118,19 @@ public class PredictionContext: Hashable, CustomStringConvertible {
}
static func calculateEmptyHashCode() -> Int {
var hash: Int = MurmurHash.initialize(INITIAL_HASH)
hash = MurmurHash.finish(hash, 0)
return hash
let hash = MurmurHash.initialize(INITIAL_HASH)
return MurmurHash.finish(hash, 0)
}
static func calculateHashCode(_ parent: PredictionContext?, _ returnState: Int) -> Int {
var hash: Int = MurmurHash.initialize(INITIAL_HASH)
var hash = MurmurHash.initialize(INITIAL_HASH)
hash = MurmurHash.update(hash, parent)
hash = MurmurHash.update(hash, returnState)
hash = MurmurHash.finish(hash, 2)
return hash
return MurmurHash.finish(hash, 2)
}
static func calculateHashCode(_ parents: [PredictionContext?], _ returnStates: [Int]) -> Int {
var hash: Int = MurmurHash.initialize(INITIAL_HASH)
var hash = MurmurHash.initialize(INITIAL_HASH)
var length = parents.count
for i in 0..<length {
hash = MurmurHash.update(hash, parents[i])
@ -142,8 +140,7 @@ public class PredictionContext: Hashable, CustomStringConvertible {
hash = MurmurHash.update(hash, returnStates[i])
}
hash = MurmurHash.finish(hash, 2 * parents.count)
return hash
return MurmurHash.finish(hash, 2 * parents.count)
}
// dispatch

View File

@ -97,12 +97,11 @@ public class SemanticContext: Hashable, CustomStringConvertible {
override
public var hashValue: Int {
var hashCode: Int = MurmurHash.initialize()
var hashCode = MurmurHash.initialize()
hashCode = MurmurHash.update(hashCode, ruleIndex)
hashCode = MurmurHash.update(hashCode, predIndex)
hashCode = MurmurHash.update(hashCode, isCtxDependent ? 1 : 0)
hashCode = MurmurHash.finish(hashCode, 3)
return hashCode
return MurmurHash.finish(hashCode, 3)
}

View File

@ -124,10 +124,9 @@ public class DFAState: Hashable, CustomStringConvertible {
public var hashValue: Int {
var hash: Int = MurmurHash.initialize(7)
var hash = MurmurHash.initialize(7)
hash = MurmurHash.update(hash, configs.hashValue)
hash = MurmurHash.finish(hash, 1)
return hash
return MurmurHash.finish(hash, 1)
}
///

View File

@ -508,24 +508,22 @@ public class IntervalSet: IntSet, Hashable, CustomStringConvertible {
public func hashCode() -> Int {
var hash: Int = MurmurHash.initialize()
var hash = MurmurHash.initialize()
for I: Interval in intervals {
hash = MurmurHash.update(hash, I.a)
hash = MurmurHash.update(hash, I.b)
}
hash = MurmurHash.finish(hash, intervals.count * 2)
return hash
return MurmurHash.finish(hash, intervals.count * 2)
}
public var hashValue: Int {
var hash: Int = MurmurHash.initialize()
var hash = MurmurHash.initialize()
for I: Interval in intervals {
hash = MurmurHash.update(hash, I.a)
hash = MurmurHash.update(hash, I.b)
}
hash = MurmurHash.finish(hash, intervals.count * 2)
return hash
return MurmurHash.finish(hash, intervals.count * 2)
}
///
/// Are two IntervalSets equal? Because all intervals are sorted

View File

@ -6,20 +6,28 @@
///
/// https://en.wikipedia.org/wiki/MurmurHash
///
/// - Author: Sam Harwell
///
public final class MurmurHash {
private static let DEFAULT_SEED: Int = 0
private static let DEFAULT_SEED: UInt32 = 0
private static let c1 = UInt32(0xCC9E2D51)
private static let c2 = UInt32(0x1B873593)
private static let r1 = UInt32(15)
private static let r2 = UInt32(13)
private static let m = UInt32(5)
private static let n = UInt32(0xE6546B64)
///
/// Initialize the hash using the default seed value.
///
/// - Returns: the intermediate hash value
///
public static func initialize() -> Int {
public static func initialize() -> UInt32 {
return initialize(DEFAULT_SEED)
}
@ -29,10 +37,18 @@ public final class MurmurHash {
/// - Parameter seed: the seed
/// - Returns: the intermediate hash value
///
public static func initialize(_ seed: Int) -> Int {
public static func initialize(_ seed: UInt32) -> UInt32 {
return seed
}
///
/// Scramble one 32-bit input word before it is XORed into the hash:
/// wrapping multiply by `c1`, rotate left by `r1` bits, wrapping
/// multiply by `c2`.
///
/// - Parameter value: the input word
/// - Returns: the scrambled word
///
private static func calcK(_ value: UInt32) -> UInt32 {
    let scaled = value &* c1
    let rotated = (scaled << r1) | (scaled >> (32 - r1))
    return rotated &* c2
}
///
/// Update the intermediate hash value for the next input `value`.
///
@ -40,31 +56,14 @@ public final class MurmurHash {
/// - Parameter value: the value to add to the current hash
/// - Returns: the updated intermediate hash value
///
public static func update2(_ hashIn: Int, _ value: Int) -> Int {
let c1: Int32 = -862048943//0xCC9E2D51;
let c2: Int32 = 0x1B873593
let r1: Int32 = 15
let r2: Int32 = 13
let m: Int32 = 5
let n: Int32 = -430675100//0xE6546B64;
var k: Int32 = Int32(truncatingIfNeeded: value)
k = k.multipliedReportingOverflow(by: c1).partialValue
// (k,_) = UInt32.multiplyWithOverflow(k, c1) ;//( k * c1);
//TODO: CHECKE >>>
k = (k << r1) | (k >>> (Int32(32) - r1)) //k = (k << r1) | (k >>> (32 - r1));
//k = UInt32 (truncatingBitPattern:Int64(Int64(k) * Int64(c2)));//( k * c2);
//(k,_) = UInt32.multiplyWithOverflow(k, c2)
k = k.multipliedReportingOverflow(by: c2).partialValue
var hash = Int32(hashIn)
public static func update2(_ hashIn: UInt32, _ value: Int) -> UInt32 {
let k = calcK(UInt32(truncatingIfNeeded: value))
var hash = hashIn
hash = hash ^ k
hash = (hash << r2) | (hash >>> (Int32(32) - r2))//hash = (hash << r2) | (hash >>> (32 - r2));
hash = hash.multipliedReportingOverflow(by: m).partialValue
hash = hash.addingReportingOverflow(n).partialValue
//hash = hash * m + n;
hash = (hash << r2) | (hash >> (32 - r2))
hash = hash &* m &+ n
// print("murmur update2 : \(hash)")
return Int(hash)
return hash
}
///
@ -74,9 +73,8 @@ public final class MurmurHash {
/// - Parameter value: the value to add to the current hash
/// - Returns: the updated intermediate hash value
///
public static func update<T:Hashable>(_ hash: Int, _ value: T?) -> Int {
public static func update<T:Hashable>(_ hash: UInt32, _ value: T?) -> UInt32 {
return update2(hash, value != nil ? value!.hashValue : 0)
// return update2(hash, value);
}
///
@ -84,21 +82,24 @@ public final class MurmurHash {
/// to form the final result of the MurmurHash 3 hash function.
///
/// - Parameter hash: the intermediate hash value
/// - Parameter numberOfWords: the number of integer values added to the hash
/// - Parameter numberOfWords: the number of UInt32 values added to the hash
/// - Returns: the final hash result
///
public static func finish(_ hashin: Int, _ numberOfWordsIn: Int) -> Int {
var hash = Int32(hashin)
let numberOfWords = Int32(numberOfWordsIn)
hash = hash ^ numberOfWords.multipliedReportingOverflow(by: 4).partialValue //(numberOfWords * UInt32(4));
hash = hash ^ (hash >>> Int32(16)) //hash = hash ^ (hash >>> 16);
hash = hash.multipliedReportingOverflow(by: -2048144789).partialValue //hash * UInt32(0x85EBCA6B);
hash = hash ^ (hash >>> Int32(13))//hash = hash ^ (hash >>> 13);
//hash = UInt32(truncatingBitPattern: UInt64(hash) * UInt64(0xC2B2AE35)) ;
hash = hash.multipliedReportingOverflow(by: -1028477387).partialValue
hash = hash ^ (hash >>> Int32(16))// hash = hash ^ (hash >>> 16);
public static func finish(_ hashin: UInt32, _ numberOfWords: Int) -> Int {
    // Each word is 4 bytes; &* wraps instead of trapping on overflow.
    let finalHash = finish(hashin, byteCount: numberOfWords &* 4)
    // Int(finalHash) traps where Int is 32 bits wide and the hash has its
    // top bit set. Converting the bit pattern never traps, and gives the
    // same zero-extended value as before where Int is 64 bits wide.
    return Int(truncatingIfNeeded: finalHash)
}
private static func finish(_ hashin: UInt32, byteCount byteCountInt: Int) -> UInt32 {
let byteCount = UInt32(truncatingIfNeeded: byteCountInt)
var hash = hashin
hash ^= byteCount
hash ^= (hash >> 16)
hash = hash &* 0x85EBCA6B
hash ^= (hash >> 13)
hash = hash &* 0xC2B2AE35
hash ^= (hash >> 16)
//print("murmur finish : \(hash)")
return Int(hash)
return hash
}
///
@ -111,14 +112,55 @@ public final class MurmurHash {
/// - Returns: the hash code of the data
///
public static func hashCode<T:Hashable>(_ data: [T], _ seed: Int) -> Int {
var hash: Int = initialize(seed)
for value: T in data {
//var hashValue = value != nil ? value.hashValue : 0
hash = update(hash, value.hashValue)
var hash = initialize(UInt32(truncatingIfNeeded: seed))
for value in data {
hash = update(hash, value)
}
hash = finish(hash, data.count)
return hash
return finish(hash, data.count)
}
///
/// Compute a hash for the given String and seed. The String is encoded
/// using UTF-8, then the bytes are interpreted as unsigned 32-bit
/// little-endian values, giving UInt32 values for the update call.
///
/// If the bytes do not evenly divide by 4, the final bytes are treated
/// slightly differently (not doing the final rotate / multiply / add).
///
/// This matches the treatment of byte sequences in publicly available
/// test patterns (see MurmurHashTests.swift) and the example code on
/// Wikipedia.
///
public static func hashString(_ s: String, _ seed: UInt32) -> UInt32 {
    // Hash the UTF-8 encoding of the string as a plain byte array.
    return hashBytesLittleEndian(Array(s.utf8), seed)
}
///
/// Hash `bytes`, reading them as little-endian UInt32 words.
///
/// Each complete group of 4 bytes is assembled into one word and mixed
/// into the hash with `update2`. Any 1-3 trailing bytes are assembled the
/// same way, passed through `calcK` and XORed into the hash, without the
/// rotate / multiply / add that follows a full word. The result is
/// finalized with the total byte count.
///
/// - Parameter bytes: the bytes to hash
/// - Parameter seed: the initial hash value
/// - Returns: the final hash result
///
private static func hashBytesLittleEndian(_ bytes: [UInt8], _ seed: UInt32) -> UInt32 {
    let byteCount = bytes.count

    var hash = seed
    // For fewer than 4 bytes the upper bound is <= 0, so the stride is empty.
    for i in stride(from: 0, to: byteCount - 3, by: 4) {
        var word = UInt32(bytes[i])
        word |= UInt32(bytes[i + 1]) << 8
        word |= UInt32(bytes[i + 2]) << 16
        word |= UInt32(bytes[i + 3]) << 24

        // Mix in the word itself. The generic update(_:_:) would mix in
        // `word.hashValue` instead, and hashValue is not guaranteed to be
        // stable across executions, so the result could differ from the
        // published test patterns. update2 truncates its Int argument back
        // to 32 bits, so passing the bit pattern round-trips exactly.
        hash = update2(hash, Int(truncatingIfNeeded: word))
    }

    let remaining = byteCount & 3
    if remaining != 0 {
        // Assemble the trailing bytes little-endian: the first trailing
        // byte lands in the lowest 8 bits.
        var lastWord = UInt32(0)
        for (position, byte) in bytes.suffix(remaining).enumerated() {
            lastWord |= UInt32(byte) << (8 * position)
        }

        hash ^= calcK(lastWord)
    }

    return finish(hash, byteCount: byteCount)
}
private init() {

View File

@ -16,7 +16,7 @@ public class Triple<A:Hashable, B:Hashable, C:Hashable>: Hashable, CustomStringC
self.c = c
}
public var hashValue: Int {
var hash: Int = MurmurHash.initialize()
var hash = MurmurHash.initialize()
hash = MurmurHash.update(hash, a)
hash = MurmurHash.update(hash, b)
hash = MurmurHash.update(hash, c)

View File

@ -0,0 +1,47 @@
/// Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
/// Use of this file is governed by the BSD 3-clause license that
/// can be found in the LICENSE.txt file in the project root.
/// The test patterns below are by Ian Boyd and have been released into the
/// public domain.
/// https://stackoverflow.com/questions/14747343/murmurhash3-test-vectors
import XCTest
import Antlr4
class MurmurHashTests: XCTestCase {

    /// Checks `MurmurHash.hashString` against each test pattern below, in
    /// the order listed.
    func testMurmurHash() {
        let patterns: [(input: String, seed: UInt32, expected: UInt32)] = [
            ("", 0, 0), //empty string with zero seed should give zero
            ("", 1, 0x514E28B7),
            ("", 0xffffffff, 0x81F16F39), //make sure seed value is handled unsigned
            ("\0\0\0\0", 0, 0x2362F9DE), //make sure we handle embedded nulls
            ("aaaa", 0x9747b28c, 0x5A97808A), //one full chunk
            ("aaa", 0x9747b28c, 0x283E0130), //three characters
            ("aa", 0x9747b28c, 0x5D211726), //two characters
            ("a", 0x9747b28c, 0x7FA09EA6), //one character

            //Endian order within the chunks
            ("abcd", 0x9747b28c, 0xF0478627), //one full chunk
            ("abc", 0x9747b28c, 0xC84A62DD),
            ("ab", 0x9747b28c, 0x74875592),
            ("a", 0x9747b28c, 0x7FA09EA6),

            ("Hello, world!", 0x9747b28c, 0x24884CBA),

            //Make sure you handle UTF-8 high characters. A bcrypt implementation messed this up
            ("ππππππππ", 0x9747b28c, 0xD58063C1), //U+03C0: Greek Small Letter Pi

            //String of 256 characters.
            (String(repeating: "a", count: 256), 0x9747b28c, 0x37405BDC),

            ("abc", 0, 0xB3DD93FA),
            ("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 0, 0xEE925B90),
            ("The quick brown fox jumps over the lazy dog", 0x9747b28c, 0x2FA826CD),
        ]

        for pattern in patterns {
            doMurmurHashTest(pattern.input, pattern.seed, pattern.expected)
        }
    }
}
/// Asserts that hashing `input` with `seed` gives `expected`.
///
/// The failure message names the input and seed, because every test
/// pattern goes through this one assertion and a bare failure would not
/// say which pattern it was.
private func doMurmurHashTest(_ input: String, _ seed: UInt32, _ expected: UInt32) {
    XCTAssertEqual(MurmurHash.hashString(input, seed), expected,
                   "hashString(\(input.debugDescription), seed 0x\(String(seed, radix: 16)))")
}