From 1d59bb3d47cd06dda5b91c536bdada22a7a3db4c Mon Sep 17 00:00:00 2001 From: Peter Conrad Date: Thu, 8 Mar 2018 15:59:03 +0100 Subject: [PATCH] Updated bloom_filter.hpp from https://github.com/ArashPartow/bloom/blob/c89c2912c445dd5c0afe3d819ca012a6c6639e4e/bloom_filter.hpp --- include/fc/bloom_filter.hpp | 228 ++++++++++++++++++++++-------------- 1 file changed, 140 insertions(+), 88 deletions(-) diff --git a/include/fc/bloom_filter.hpp b/include/fc/bloom_filter.hpp index 940dfbd..a99f69d 100644 --- a/include/fc/bloom_filter.hpp +++ b/include/fc/bloom_filter.hpp @@ -1,5 +1,3 @@ -#pragma once - /* ********************************************************************* * * @@ -11,16 +9,20 @@ * * * Copyright notice: * * Free use of the Open Bloom Filter Library is permitted under the * - * guidelines and in accordance with the most current version of the * - * Common Public License. * - * http://www.opensource.org/licenses/cpl1.0.php * + * guidelines and in accordance with the MIT License. * + * http://www.opensource.org/licenses/MIT * * * ********************************************************************* */ + +#ifndef INCLUDE_BLOOM_FILTER_HPP +#define INCLUDE_BLOOM_FILTER_HPP + #include #include #include +#include #include #include #include @@ -32,6 +34,7 @@ namespace fc { static constexpr std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned) + static const unsigned char bit_mask[bits_per_char] = { 0x01, //00000001 0x02, //00000010 @@ -87,22 +90,22 @@ public: (0xFFFFFFFFFFFFFFFFULL == random_seed); } - //Allowed min/max size of the bloom filter in bits + // Allowable min/max size of the bloom filter in bits unsigned long long int minimum_size; unsigned long long int maximum_size; - //Allowed min/max number of hash functions + // Allowable min/max number of hash functions unsigned int minimum_number_of_hashes; unsigned int maximum_number_of_hashes; - //The approximate number of elements to be inserted - //into the bloom filter, should be within one order - //of magnitude. The default is 10000. + // The approximate number of elements to be inserted + // into the bloom filter, should be within one order + // of magnitude. The default is 10000. unsigned long long int projected_element_count; - //The approximate false positive probability expected - //from the bloom filter. The default is the reciprocal - //of the projected_element_count. + // The approximate false positive probability expected + // from the bloom filter. The default is assumed to be + // the reciprocal of the projected_element_count. double false_positive_probability; unsigned long long int random_seed; @@ -133,28 +136,32 @@ public: if (!(*this)) return false; - double min_m = std::numeric_limits::infinity(); - double min_k = 0.0; - double curr_m = 0.0; - double k = 1.0; + double min_m = std::numeric_limits::infinity(); + double min_k = 0.0; + double k = 1.0; while (k < 1000.0) { - double numerator = (- k * projected_element_count); - double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k)); - curr_m = numerator / denominator; + const double numerator = (- k * projected_element_count); + const double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k)); + + const double curr_m = numerator / denominator; + if (curr_m < min_m) { min_m = curr_m; min_k = k; } + k += 1.0; } optimal_parameters_t& optp = optimal_parameters; optp.number_of_hashes = static_cast(min_k); + optp.table_size = static_cast(min_m); + optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0); if (optp.number_of_hashes < minimum_number_of_hashes) @@ -178,15 +185,15 @@ protected: typedef unsigned int bloom_type; typedef unsigned char cell_type; + typedef std::vector table_type; public: bloom_filter() : salt_count_(0), table_size_(0), - raw_table_size_(0), projected_element_count_(0), - inserted_element_count_(0), + inserted_element_count_ (0), random_seed_(0), desired_false_positive_probability_(0.0) {} @@ -199,12 +206,10 @@ public: { salt_count_ = p.optimal_parameters.number_of_hashes; table_size_ = p.optimal_parameters.table_size; - generate_unique_salt(); - raw_table_size_ = table_size_ / bits_per_char; - bit_table_.resize( static_cast(raw_table_size_) ); - //bit_table_ = new cell_type[static_cast(raw_table_size_)]; - std::fill_n(bit_table_.data(),raw_table_size_,0x00); + generate_unique_salt(); + + bit_table_.resize(table_size_ / bits_per_char, static_cast(0x00)); } bloom_filter(const bloom_filter& filter) @@ -217,15 +222,15 @@ public: if (this != &f) { return - (salt_count_ == f.salt_count_) && - (table_size_ == f.table_size_) && - (raw_table_size_ == f.raw_table_size_) && - (projected_element_count_ == f.projected_element_count_) && - (inserted_element_count_ == f.inserted_element_count_) && - (random_seed_ == f.random_seed_) && + (salt_count_ == f.salt_count_ ) && + (table_size_ == f.table_size_ ) && + (bit_table_.size() == f.bit_table_.size() ) && + (projected_element_count_ == f.projected_element_count_ ) && + (inserted_element_count_ == f.inserted_element_count_ ) && + (random_seed_ == f.random_seed_ ) && (desired_false_positive_probability_ == f.desired_false_positive_probability_) && - (salt_ == f.salt_) && - std::equal(f.bit_table_.data(),f.bit_table_.data() + raw_table_size_,bit_table_.data()); + (salt_ == f.salt_ ) && + (bit_table_ == f.bit_table_ ) ; } else return true; @@ -242,21 +247,22 @@ public: { salt_count_ = f.salt_count_; table_size_ = f.table_size_; - raw_table_size_ = f.raw_table_size_; + bit_table_ = f.bit_table_; + salt_ = f.salt_; + projected_element_count_ = f.projected_element_count_; - inserted_element_count_ = f.inserted_element_count_; + inserted_element_count_ = f.inserted_element_count_; + random_seed_ = f.random_seed_; + desired_false_positive_probability_ = f.desired_false_positive_probability_; - bit_table_.resize( raw_table_size_ ); - std::copy(f.bit_table_.data(),f.bit_table_.data() + raw_table_size_,bit_table_.data()); - salt_ = f.salt_; } + return *this; } virtual ~bloom_filter() - { - } + {} inline bool operator!() const { @@ -265,23 +271,26 @@ public: inline void clear() { - std::fill_n(bit_table_.data(),raw_table_size_,0x00); + std::fill(bit_table_.begin(), bit_table_.end(), static_cast(0x00)); inserted_element_count_ = 0; } inline void insert(const unsigned char* key_begin, const std::size_t& length) { std::size_t bit_index = 0; - std::size_t bit = 0; + std::size_t bit = 0; + for (std::size_t i = 0; i < salt_.size(); ++i) { - compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); + compute_indices(hash_ap(key_begin, length, salt_[i]), bit_index, bit); + bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; } + ++inserted_element_count_; } - template + template inline void insert(const T& t) { // Note: T must be a C++ POD type. @@ -290,7 +299,7 @@ public: inline void insert(const std::string& key) { - insert(reinterpret_cast(key.c_str()),key.size()); + insert(reinterpret_cast(key.data()),key.size()); } inline void insert(const char* data, const std::size_t& length) @@ -298,10 +307,11 @@ public: insert(reinterpret_cast(data),length); } - template + template inline void insert(const InputIterator begin, const InputIterator end) { InputIterator itr = begin; + while (end != itr) { insert(*(itr++)); @@ -311,19 +321,22 @@ public: inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const { std::size_t bit_index = 0; - std::size_t bit = 0; + std::size_t bit = 0; + for (std::size_t i = 0; i < salt_.size(); ++i) { - compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); + compute_indices(hash_ap(key_begin, length, salt_[i]), bit_index, bit); + if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) { return false; } } + return true; } - template + template inline bool contains(const T& t) const { return contains(reinterpret_cast(&t),static_cast(sizeof(T))); @@ -339,33 +352,39 @@ public: return contains(reinterpret_cast(data),length); } - template + template inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const { InputIterator itr = begin; + while (end != itr) { if (!contains(*itr)) { return itr; } + ++itr; } + return end; } - template + template inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const { InputIterator itr = begin; + while (end != itr) { if (contains(*itr)) { return itr; } + ++itr; } + return end; } @@ -374,7 +393,7 @@ public: return table_size_; } - inline std::size_t element_count() const + inline unsigned long long int element_count() const { return inserted_element_count_; } @@ -395,16 +414,17 @@ public: { /* intersection */ if ( - (salt_count_ == f.salt_count_) && - (table_size_ == f.table_size_) && - (random_seed_ == f.random_seed_) + (salt_count_ == f.salt_count_ ) && + (table_size_ == f.table_size_ ) && + (random_seed_ == f.random_seed_) ) { - for (std::size_t i = 0; i < raw_table_size_; ++i) + for (std::size_t i = 0; i < bit_table_.size(); ++i) { bit_table_[i] &= f.bit_table_[i]; } } + return *this; } @@ -412,16 +432,17 @@ public: { /* union */ if ( - (salt_count_ == f.salt_count_) && - (table_size_ == f.table_size_) && - (random_seed_ == f.random_seed_) + (salt_count_ == f.salt_count_ ) && + (table_size_ == f.table_size_ ) && + (random_seed_ == f.random_seed_) ) { - for (std::size_t i = 0; i < raw_table_size_; ++i) + for (std::size_t i = 0; i < bit_table_.size(); ++i) { bit_table_[i] |= f.bit_table_[i]; } } + return *this; } @@ -429,16 +450,17 @@ public: { /* difference */ if ( - (salt_count_ == f.salt_count_) && - (table_size_ == f.table_size_) && - (random_seed_ == f.random_seed_) + (salt_count_ == f.salt_count_ ) && + (table_size_ == f.table_size_ ) && + (random_seed_ == f.random_seed_) ) { - for (std::size_t i = 0; i < raw_table_size_; ++i) + for (std::size_t i = 0; i < bit_table_.size(); ++i) { bit_table_[i] ^= f.bit_table_[i]; } } + return *this; } @@ -457,7 +479,7 @@ protected: inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const { bit_index = hash % table_size_; - bit = bit_index % bits_per_char; + bit = bit_index % bits_per_char; } void generate_unique_salt() @@ -469,6 +491,7 @@ protected: hash function with different values seems to be adequate. */ const unsigned int predef_salt_count = 128; + static const bloom_type predef_salt[predef_salt_count] = { 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC, @@ -510,25 +533,31 @@ protected: std::copy(predef_salt, predef_salt + salt_count_, std::back_inserter(salt_)); - for (unsigned int i = 0; i < salt_.size(); ++i) - { + + for (std::size_t i = 0; i < salt_.size(); ++i) + { /* - Note: - This is done to integrate the user defined random seed, - so as to allow for the generation of unique bloom filter - instances. + Note: + This is done to integrate the user defined random seed, + so as to allow for the generation of unique bloom filter + instances. */ salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast(random_seed_); - } + } } else { - std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_)); + std::copy(predef_salt, predef_salt + predef_salt_count, std::back_inserter(salt_)); + srand(static_cast(random_seed_)); + while (salt_.size() < salt_count_) { bloom_type current_salt = static_cast(rand()) * static_cast(rand()); - if (0 == current_salt) continue; + + if (0 == current_salt) + continue; + if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt)) { salt_.push_back(current_salt); @@ -540,57 +569,71 @@ protected: inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const { const unsigned char* itr = begin; - unsigned int loop = 0; + unsigned int loop = 0; + while (remaining_length >= 8) { const unsigned int& i1 = *(reinterpret_cast(itr)); itr += sizeof(unsigned int); const unsigned int& i2 = *(reinterpret_cast(itr)); itr += sizeof(unsigned int); + hash ^= (hash << 7) ^ i1 * (hash >> 3) ^ (~((hash << 11) + (i2 ^ (hash >> 5)))); + remaining_length -= 8; } + if (remaining_length) { if (remaining_length >= 4) { const unsigned int& i = *(reinterpret_cast(itr)); + if (loop & 0x01) hash ^= (hash << 7) ^ i * (hash >> 3); else hash ^= (~((hash << 11) + (i ^ (hash >> 5)))); + ++loop; + remaining_length -= 4; + itr += sizeof(unsigned int); } + if (remaining_length >= 2) { const unsigned short& i = *(reinterpret_cast(itr)); + if (loop & 0x01) hash ^= (hash << 7) ^ i * (hash >> 3); else hash ^= (~((hash << 11) + (i ^ (hash >> 5)))); + ++loop; + remaining_length -= 2; + itr += sizeof(unsigned short); } + if (remaining_length) { hash += ((*itr) ^ (hash * 0xA5A5A5A5)) + loop; } } + return hash; } -public: - std::vector salt_; - std::vector bit_table_; - unsigned int salt_count_; - unsigned long long int table_size_; - unsigned long long int raw_table_size_; - unsigned long long int projected_element_count_; - unsigned int inserted_element_count_; - unsigned long long int random_seed_; - double desired_false_positive_probability_; + public: + std::vector salt_; + std::vector bit_table_; + unsigned int salt_count_; + unsigned long long int table_size_; + unsigned long long int projected_element_count_; + unsigned long long int inserted_element_count_; + unsigned long long int random_seed_; + double desired_false_positive_probability_; }; inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b) @@ -617,12 +660,21 @@ inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b) } // namespace fc +FC_REFLECT( fc::bloom_filter, (salt_)(bit_table_)(salt_count_)(table_size_)(projected_element_count_) + (inserted_element_count_)(random_seed_)(desired_false_positive_probability_) ) +FC_REFLECT( fc::bloom_parameters::optimal_parameters_t, (number_of_hashes)(table_size) ) +FC_REFLECT( fc::bloom_parameters, (minimum_size)(maximum_size)(minimum_number_of_hashes) + (maximum_number_of_hashes)(projected_element_count) + (false_positive_probability)(random_seed)(optimal_parameters) ) + +#endif + /* Note 1: If it can be guaranteed that bits_per_char will be of the form 2^n then the following optimization can be used: - hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)]; + bit_table_[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)]; Note 2: For performance reasons where possible when allocating memory it should