C++ Bloom Filter Library  release
bloom_filter.hpp
Go to the documentation of this file.
1 /*
2  *********************************************************************
3  * *
4  * Open Bloom Filter *
5  * *
6  * Author: Arash Partow - 2000 *
7  * URL: http://www.partow.net *
8  * URL: http://www.partow.net/programming/hashfunctions/index.html *
9  * *
10  * Copyright notice: *
11  * Free use of the Open Bloom Filter Library is permitted under the *
12  * guidelines and in accordance with the MIT License. *
13  * http://www.opensource.org/licenses/MIT *
14  * *
15  *********************************************************************
16 */
17 
18 
19 #ifndef INCLUDE_BLOOM_FILTER_HPP
20 #define INCLUDE_BLOOM_FILTER_HPP
21 
22 #include <algorithm>
23 #include <cmath>
24 #include <cstddef>
25 #include <cstdlib>
26 #include <iterator>
27 #include <limits>
28 #include <string>
29 #include <vector>
30 
31 
32 static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
33 
34 static const unsigned char bit_mask[bits_per_char] = {
35  0x01, //00000001
36  0x02, //00000010
37  0x04, //00000100
38  0x08, //00001000
39  0x10, //00010000
40  0x20, //00100000
41  0x40, //01000000
42  0x80 //10000000
43  };
44 
46 {
47 public:
48 
50  : minimum_size(1),
51  maximum_size(std::numeric_limits<unsigned long long int>::max()),
53  maximum_number_of_hashes(std::numeric_limits<unsigned int>::max()),
56  random_seed(0xA5A5A5A55A5A5A5AULL)
57  {}
58 
60  {}
61 
62  inline bool operator!()
63  {
64  return (minimum_size > maximum_size) ||
67  (0 == maximum_number_of_hashes) ||
68  (0 == projected_element_count) ||
70  (std::numeric_limits<double>::infinity() == std::abs(false_positive_probability)) ||
71  (0 == random_seed) ||
72  (0xFFFFFFFFFFFFFFFFULL == random_seed);
73  }
74 
75  // Allowable min/max size of the bloom filter in bits
76  unsigned long long int minimum_size;
77  unsigned long long int maximum_size;
78 
79  // Allowable min/max number of hash functions
82 
83  // The approximate number of elements to be inserted
84  // into the bloom filter, should be within one order
85  // of magnitude. The default is 10000.
86  unsigned long long int projected_element_count;
87 
88  // The approximate false positive probability expected
89  // from the bloom filter. The default is assumed to be
90  // the reciprocal of the projected_element_count.
92 
93  unsigned long long int random_seed;
94 
96  {
98  : number_of_hashes(0),
99  table_size(0)
100  {}
101 
102  unsigned int number_of_hashes;
103  unsigned long long int table_size;
104  };
105 
107 
109  {
110  /*
111  Note:
112  The following will attempt to find the number of hash functions
113  and minimum amount of storage bits required to construct a bloom
114  filter consistent with the user defined false positive probability
115  and estimated element insertion count.
116  */
117 
118  if (!(*this))
119  return false;
120 
121  double min_m = std::numeric_limits<double>::infinity();
122  double min_k = 0.0;
123  double k = 1.0;
124 
125  while (k < 1000.0)
126  {
127  const double numerator = (- k * projected_element_count);
128  const double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k));
129 
130  const double curr_m = numerator / denominator;
131 
132  if (curr_m < min_m)
133  {
134  min_m = curr_m;
135  min_k = k;
136  }
137 
138  k += 1.0;
139  }
140 
142 
143  optp.number_of_hashes = static_cast<unsigned int>(min_k);
144 
145  optp.table_size = static_cast<unsigned long long int>(min_m);
146 
147  optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0);
148 
149  if (optp.number_of_hashes < minimum_number_of_hashes)
151  else if (optp.number_of_hashes > maximum_number_of_hashes)
153 
154  if (optp.table_size < minimum_size)
155  optp.table_size = minimum_size;
156  else if (optp.table_size > maximum_size)
157  optp.table_size = maximum_size;
158 
159  return true;
160  }
161 
162 };
163 
165 {
166 protected:
167 
168  typedef unsigned int bloom_type;
169  typedef unsigned char cell_type;
170  typedef std::vector<unsigned char> table_type;
171 
172 public:
173 
175  : salt_count_(0),
176  table_size_(0),
177  projected_element_count_(0),
178  inserted_element_count_ (0),
179  random_seed_(0),
180  desired_false_positive_probability_(0.0)
181  {}
182 
184  : projected_element_count_(p.projected_element_count),
185  inserted_element_count_(0),
186  random_seed_((p.random_seed * 0xA5A5A5A5) + 1),
187  desired_false_positive_probability_(p.false_positive_probability)
188  {
189  salt_count_ = p.optimal_parameters.number_of_hashes;
190  table_size_ = p.optimal_parameters.table_size;
191 
192  generate_unique_salt();
193 
194  bit_table_.resize(table_size_ / bits_per_char, static_cast<unsigned char>(0x00));
195  }
196 
197  bloom_filter(const bloom_filter& filter)
198  {
199  this->operator=(filter);
200  }
201 
202  inline bool operator == (const bloom_filter& f) const
203  {
204  if (this != &f)
205  {
206  return
207  (salt_count_ == f.salt_count_ ) &&
208  (table_size_ == f.table_size_ ) &&
209  (bit_table_.size() == f.bit_table_.size() ) &&
210  (projected_element_count_ == f.projected_element_count_ ) &&
211  (inserted_element_count_ == f.inserted_element_count_ ) &&
212  (random_seed_ == f.random_seed_ ) &&
213  (desired_false_positive_probability_ == f.desired_false_positive_probability_) &&
214  (salt_ == f.salt_ ) &&
215  (bit_table_ == f.bit_table_ ) ;
216  }
217  else
218  return true;
219  }
220 
221  inline bool operator != (const bloom_filter& f) const
222  {
223  return !operator==(f);
224  }
225 
226  inline bloom_filter& operator = (const bloom_filter& f)
227  {
228  if (this != &f)
229  {
230  salt_count_ = f.salt_count_;
231  table_size_ = f.table_size_;
232  bit_table_ = f.bit_table_;
233  salt_ = f.salt_;
234 
235  projected_element_count_ = f.projected_element_count_;
236  inserted_element_count_ = f.inserted_element_count_;
237 
238  random_seed_ = f.random_seed_;
239 
240  desired_false_positive_probability_ = f.desired_false_positive_probability_;
241  }
242 
243  return *this;
244  }
245 
246  virtual ~bloom_filter()
247  {}
248 
249  inline bool operator!() const
250  {
251  return (0 == table_size_);
252  }
253 
254  inline void clear()
255  {
256  std::fill(bit_table_.begin(), bit_table_.end(), static_cast<unsigned char>(0x00));
257  inserted_element_count_ = 0;
258  }
259 
260  inline void insert(const unsigned char* key_begin, const std::size_t& length)
261  {
262  std::size_t bit_index = 0;
263  std::size_t bit = 0;
264 
265  for (std::size_t i = 0; i < salt_.size(); ++i)
266  {
267  compute_indices(hash_ap(key_begin, length, salt_[i]), bit_index, bit);
268 
269  bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
270  }
271 
272  ++inserted_element_count_;
273  }
274 
275  template <typename T>
276  inline void insert(const T& t)
277  {
278  // Note: T must be a C++ POD type.
279  insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
280  }
281 
282  inline void insert(const std::string& key)
283  {
284  insert(reinterpret_cast<const unsigned char*>(key.data()),key.size());
285  }
286 
287  inline void insert(const char* data, const std::size_t& length)
288  {
289  insert(reinterpret_cast<const unsigned char*>(data),length);
290  }
291 
292  template <typename InputIterator>
293  inline void insert(const InputIterator begin, const InputIterator end)
294  {
295  InputIterator itr = begin;
296 
297  while (end != itr)
298  {
299  insert(*(itr++));
300  }
301  }
302 
303  inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
304  {
305  std::size_t bit_index = 0;
306  std::size_t bit = 0;
307 
308  for (std::size_t i = 0; i < salt_.size(); ++i)
309  {
310  compute_indices(hash_ap(key_begin, length, salt_[i]), bit_index, bit);
311 
312  if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
313  {
314  return false;
315  }
316  }
317 
318  return true;
319  }
320 
321  template <typename T>
322  inline bool contains(const T& t) const
323  {
324  return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
325  }
326 
327  inline bool contains(const std::string& key) const
328  {
329  return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
330  }
331 
332  inline bool contains(const char* data, const std::size_t& length) const
333  {
334  return contains(reinterpret_cast<const unsigned char*>(data),length);
335  }
336 
337  template <typename InputIterator>
338  inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
339  {
340  InputIterator itr = begin;
341 
342  while (end != itr)
343  {
344  if (!contains(*itr))
345  {
346  return itr;
347  }
348 
349  ++itr;
350  }
351 
352  return end;
353  }
354 
355  template <typename InputIterator>
356  inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
357  {
358  InputIterator itr = begin;
359 
360  while (end != itr)
361  {
362  if (contains(*itr))
363  {
364  return itr;
365  }
366 
367  ++itr;
368  }
369 
370  return end;
371  }
372 
373  inline virtual unsigned long long int size() const
374  {
375  return table_size_;
376  }
377 
378  inline unsigned long long int element_count() const
379  {
380  return inserted_element_count_;
381  }
382 
383  inline double effective_fpp() const
384  {
385  /*
386  Note:
387  The effective false positive probability is calculated using the
388  designated table size and hash function count in conjunction with
389  the current number of inserted elements - not the user defined
390  predicated/expected number of inserted elements.
391  */
392  return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
393  }
394 
395  inline bloom_filter& operator &= (const bloom_filter& f)
396  {
397  /* intersection */
398  if (
399  (salt_count_ == f.salt_count_ ) &&
400  (table_size_ == f.table_size_ ) &&
401  (random_seed_ == f.random_seed_)
402  )
403  {
404  for (std::size_t i = 0; i < bit_table_.size(); ++i)
405  {
406  bit_table_[i] &= f.bit_table_[i];
407  }
408  }
409 
410  return *this;
411  }
412 
413  inline bloom_filter& operator |= (const bloom_filter& f)
414  {
415  /* union */
416  if (
417  (salt_count_ == f.salt_count_ ) &&
418  (table_size_ == f.table_size_ ) &&
419  (random_seed_ == f.random_seed_)
420  )
421  {
422  for (std::size_t i = 0; i < bit_table_.size(); ++i)
423  {
424  bit_table_[i] |= f.bit_table_[i];
425  }
426  }
427 
428  return *this;
429  }
430 
431  inline bloom_filter& operator ^= (const bloom_filter& f)
432  {
433  /* difference */
434  if (
435  (salt_count_ == f.salt_count_ ) &&
436  (table_size_ == f.table_size_ ) &&
437  (random_seed_ == f.random_seed_)
438  )
439  {
440  for (std::size_t i = 0; i < bit_table_.size(); ++i)
441  {
442  bit_table_[i] ^= f.bit_table_[i];
443  }
444  }
445 
446  return *this;
447  }
448 
449  inline const cell_type* table() const
450  {
451  return bit_table_.data();
452  }
453 
454  inline std::size_t hash_count()
455  {
456  return salt_.size();
457  }
458 
459 protected:
460 
461  inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
462  {
463  bit_index = hash % table_size_;
464  bit = bit_index % bits_per_char;
465  }
466 
468  {
469  /*
470  Note:
471  A distinct hash function need not be implementation-wise
472  distinct. In the current implementation "seeding" a common
473  hash function with different values seems to be adequate.
474  */
475  const unsigned int predef_salt_count = 128;
476 
477  static const bloom_type predef_salt[predef_salt_count] =
478  {
479  0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
480  0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
481  0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
482  0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
483  0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
484  0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
485  0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
486  0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
487  0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
488  0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
489  0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
490  0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
491  0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
492  0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
493  0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
494  0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
495  0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
496  0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
497  0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
498  0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
499  0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
500  0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
501  0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
502  0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
503  0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
504  0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
505  0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
506  0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
507  0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
508  0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
509  0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
510  0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
511  };
512 
513  if (salt_count_ <= predef_salt_count)
514  {
515  std::copy(predef_salt,
516  predef_salt + salt_count_,
517  std::back_inserter(salt_));
518 
519  for (std::size_t i = 0; i < salt_.size(); ++i)
520  {
521  /*
522  Note:
523  This is done to integrate the user defined random seed,
524  so as to allow for the generation of unique bloom filter
525  instances.
526  */
527  salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast<bloom_type>(random_seed_);
528  }
529  }
530  else
531  {
532  std::copy(predef_salt, predef_salt + predef_salt_count, std::back_inserter(salt_));
533 
534  srand(static_cast<unsigned int>(random_seed_));
535 
536  while (salt_.size() < salt_count_)
537  {
538  bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
539 
540  if (0 == current_salt)
541  continue;
542 
543  if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
544  {
545  salt_.push_back(current_salt);
546  }
547  }
548  }
549  }
550 
551  inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
552  {
553  const unsigned char* itr = begin;
554  unsigned int loop = 0;
555 
556  while (remaining_length >= 8)
557  {
558  const unsigned int& i1 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);
559  const unsigned int& i2 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);
560 
561  hash ^= (hash << 7) ^ i1 * (hash >> 3) ^
562  (~((hash << 11) + (i2 ^ (hash >> 5))));
563 
564  remaining_length -= 8;
565  }
566 
567  if (remaining_length)
568  {
569  if (remaining_length >= 4)
570  {
571  const unsigned int& i = *(reinterpret_cast<const unsigned int*>(itr));
572 
573  if (loop & 0x01)
574  hash ^= (hash << 7) ^ i * (hash >> 3);
575  else
576  hash ^= (~((hash << 11) + (i ^ (hash >> 5))));
577 
578  ++loop;
579 
580  remaining_length -= 4;
581 
582  itr += sizeof(unsigned int);
583  }
584 
585  if (remaining_length >= 2)
586  {
587  const unsigned short& i = *(reinterpret_cast<const unsigned short*>(itr));
588 
589  if (loop & 0x01)
590  hash ^= (hash << 7) ^ i * (hash >> 3);
591  else
592  hash ^= (~((hash << 11) + (i ^ (hash >> 5))));
593 
594  ++loop;
595 
596  remaining_length -= 2;
597 
598  itr += sizeof(unsigned short);
599  }
600 
601  if (remaining_length)
602  {
603  hash += ((*itr) ^ (hash * 0xA5A5A5A5)) + loop;
604  }
605  }
606 
607  return hash;
608  }
609 
610  std::vector<bloom_type> salt_;
611  std::vector<unsigned char> bit_table_;
612  unsigned int salt_count_;
613  unsigned long long int table_size_;
614  unsigned long long int projected_element_count_;
615  unsigned long long int inserted_element_count_;
616  unsigned long long int random_seed_;
618 };
619 
621 {
622  bloom_filter result = a;
623  result &= b;
624  return result;
625 }
626 
628 {
629  bloom_filter result = a;
630  result |= b;
631  return result;
632 }
633 
635 {
636  bloom_filter result = a;
637  result ^= b;
638  return result;
639 }
640 
642 {
643 public:
644 
646  : bloom_filter(p)
647  {
648  size_list.push_back(table_size_);
649  }
650 
651  inline unsigned long long int size() const
652  {
653  return size_list.back();
654  }
655 
656  inline bool compress(const double& percentage)
657  {
658  if (
659  (percentage < 0.0) ||
660  (percentage >= 100.0)
661  )
662  {
663  return false;
664  }
665 
666  unsigned long long int original_table_size = size_list.back();
667  unsigned long long int new_table_size = static_cast<unsigned long long int>((size_list.back() * (1.0 - (percentage / 100.0))));
668 
669  new_table_size -= new_table_size % bits_per_char;
670 
671  if (
672  (bits_per_char > new_table_size) ||
673  (new_table_size >= original_table_size)
674  )
675  {
676  return false;
677  }
678 
679  desired_false_positive_probability_ = effective_fpp();
680 
681  const unsigned long long int new_tbl_raw_size = new_table_size / bits_per_char;
682 
683  table_type tmp(new_tbl_raw_size);
684 
685  std::copy(bit_table_.begin(), bit_table_.begin() + new_tbl_raw_size, tmp.begin());
686 
687  typedef table_type::iterator itr_t;
688 
689  itr_t itr = bit_table_.begin() + (new_table_size / bits_per_char);
690  itr_t end = bit_table_.begin() + (original_table_size / bits_per_char);
691  itr_t itr_tmp = tmp.begin();
692 
693  while (end != itr)
694  {
695  *(itr_tmp++) |= (*itr++);
696  }
697 
698  std::swap(bit_table_, tmp);
699 
700  size_list.push_back(new_table_size);
701 
702  return true;
703  }
704 
705 private:
706 
707  inline void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
708  {
709  bit_index = hash;
710 
711  for (std::size_t i = 0; i < size_list.size(); ++i)
712  {
713  bit_index %= size_list[i];
714  }
715 
716  bit = bit_index % bits_per_char;
717  }
718 
719  std::vector<unsigned long long int> size_list;
720 };
721 
722 #endif
723 
724 
725 /*
726  Note 1:
727  If it can be guaranteed that bits_per_char will be of the form 2^n then
728  the following optimization can be used:
729 
730  bit_table_[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
731 
732  Note 2:
733  For performance reasons where possible when allocating memory it should
734  be aligned (aligned_alloc) according to the architecture being used.
735 */
bloom_filter(const bloom_filter &filter)
virtual bool compute_optimal_parameters()
unsigned char cell_type
unsigned long long int random_seed
void insert(const unsigned char *key_begin, const std::size_t &length)
static const unsigned char bit_mask[bits_per_char]
InputIterator contains_all(const InputIterator begin, const InputIterator end) const
static const std::size_t bits_per_char
unsigned long long int minimum_size
unsigned long long int element_count() const
bloom_filter operator&(const bloom_filter &a, const bloom_filter &b)
unsigned int maximum_number_of_hashes
virtual ~bloom_filter()
bool contains(const std::string &key) const
double false_positive_probability
bloom_filter operator|(const bloom_filter &a, const bloom_filter &b)
std::vector< unsigned char > table_type
bool contains(const char *data, const std::size_t &length) const
const cell_type * table() const
std::vector< unsigned char > bit_table_
unsigned long long int table_size_
bool contains(const T &t) const
virtual unsigned long long int size() const
unsigned int minimum_number_of_hashes
unsigned long long int inserted_element_count_
bloom_type hash_ap(const unsigned char *begin, std::size_t remaining_length, bloom_type hash) const
double effective_fpp() const
void generate_unique_salt()
virtual ~bloom_parameters()
void insert(const std::string &key)
unsigned long long int random_seed_
unsigned long long int maximum_size
bool compress(const double &percentage)
unsigned int salt_count_
InputIterator contains_none(const InputIterator begin, const InputIterator end) const
optimal_parameters_t optimal_parameters
std::vector< bloom_type > salt_
virtual void compute_indices(const bloom_type &hash, std::size_t &bit_index, std::size_t &bit) const
void insert(const InputIterator begin, const InputIterator end)
double desired_false_positive_probability_
unsigned int bloom_type
void insert(const char *data, const std::size_t &length)
bloom_filter operator^(const bloom_filter &a, const bloom_filter &b)
unsigned long long int projected_element_count_
bool operator!() const
unsigned long long int size() const
virtual bool contains(const unsigned char *key_begin, const std::size_t length) const
unsigned long long int projected_element_count
compressible_bloom_filter(const bloom_parameters &p)
void insert(const T &t)
bloom_filter(const bloom_parameters &p)
std::size_t hash_count()