diff --git a/scripts/bloom_filter.py b/scripts/bloom_filter.py new file mode 100644 index 00000000..62afe701 --- /dev/null +++ b/scripts/bloom_filter.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python +from math import log +from bitarray import bitarray # from pypi.python.org/pypi/bitarray +import pyhash # from https://github.com/flier/pyfasthash + +# Based on BIP-37 +# https://github.com/QuantumExplorer/bips/blob/master/bip-0037.mediawiki + +# Defined in bloom.h +# https://github.com/dashpay/dash/blob/master/src/bloom.h#L17-#L18 +MAX_BLOOM_FILTER_SIZE = 36000 +MAX_HASH_FUNCS = 50 + +# Set of flags that control how matched items are added to the filter (per BIP-37) +# https://github.com/dashpay/dash/blob/master/src/bloom.h#L24 +nFlags = 0 + +nElements = 1 # Number of elements +nFPRate = 0.0001 # False positive rate + +nFilterBytes = int(min((-1 / log(2)**2 * nElements * log(nFPRate)) / 8, MAX_BLOOM_FILTER_SIZE)) + +# Calculate the number of hash functions to use in the filter +# Limit the maximum number to 50 per BIP-37 +nHashFuncs = int(min(nFilterBytes * 8 / nElements * log(2), MAX_HASH_FUNCS)) + +murmur3 = pyhash.murmur3_32() + +TEST_TXID = "019f5b01d4195ecbc9398fbf3c3b1fa9bb3183301d7a1fb3bd174fcfa40a2b65" + +def bloom_hash(nHashNum, data): + seed = (nHashNum * 0xfba4c795 + nTweak) & 0xffffffff + return( murmur3(data, seed=seed) % (nFilterBytes * 8) ) + + +# Bloom Filter creation +def create_filter(nTweak): + print('Creating bloom filter') + vData = nFilterBytes * 8 * bitarray('0', endian="little") + + data_to_hash = TEST_TXID + data_to_hash = data_to_hash.decode("hex") + + print('Filter bytes: {}; Hash functions: {}'.format(nFilterBytes, nHashFuncs)) + + print(" Filter (As Bits)") + print("nHashNum nIndex Filter 0123456789abcdef") + print("~~~~~~~~ ~~~~~~ ~~~~~~ ~~~~~~~~~~~~~~~~") + for nHashNum in range(nHashFuncs): + nIndex = bloom_hash(nHashNum, data_to_hash) + + ## Set the bit at nIndex to 1 + vData[nIndex] = True + + ## Debug: print current state + print(' {0:2} {1:2} {2} {3}'.format( + nHashNum, + hex(int(nIndex)), + vData.tobytes().encode("hex"), + vData.to01() + )) + + print('Bloom filter: {}\n'.format(vData.tobytes().encode("hex"))) + + +# Bloom Filter evaluation +def evaluate_filter(): + print('Evaluating bloom filter') + vData = bitarray(endian='little') + vData.frombytes("b50f".decode("hex")) + nHashFuncs = 11 + nTweak = 0 + nFlags = 0 + + def contains(nHashFuncs, data_to_hash): + for nHashNum in range(nHashFuncs): + ## bloom_hash as defined in previous section + nIndex = bloom_hash(nHashNum, data_to_hash) + + if vData[nIndex] != True: + print("MATCH FAILURE: Index {0} not set in {1}\n".format( + hex(int(nIndex)), + vData.to01() + )) + return False + print("MATCH SUCCESS\n") + + ## Test 1: Same TXID as previously added to filter + data_to_hash = TEST_TXID + print('\nChecking: {}'.format(data_to_hash)) + data_to_hash = data_to_hash.decode("hex") + contains(nHashFuncs, data_to_hash) + + ## Test 2: Arbitrary string + data_to_hash = "1/10,000 chance this ASCII string will match" + print('Checking: {}'.format(data_to_hash)) + contains(nHashFuncs, data_to_hash) + + +# Tweak is a random value added to the seed value in the hash function +# used by the bloom filter +nTweak = 0 + +create_filter(nTweak) +evaluate_filter()