C++ Bloom Filter Library  release
bloom_filter_example02.cpp
Go to the documentation of this file.
1 /*
2  *********************************************************************
3  * *
4  * Open Bloom Filter *
5  * *
6  * Description: Demonstration of a Bloom Filter *
7  * Author: Arash Partow - 2000 *
8  * URL: http://www.partow.net *
9  * URL: http://www.partow.net/programming/hashfunctions/index.html *
10  * *
11  * Copyright notice: *
12  * Free use of the Open Bloom Filter Library is permitted under the *
13  * guidelines and in accordance with the MIT License. *
14  * http://www.opensource.org/licenses/MIT *
15  * *
16  *********************************************************************
17 */
18 
19 
20 /*
21  Description: This example will demonstrate how to instantiate a Bloom filter,
22  insert strings and then query the inserted strings and a set of
23  outlier strings for membership status within the Bloom filter.
24  Furthermore this process will be carried out upon 1000 unique
25  instances of Bloom filter. The objective is to empirically
26  determine which "random" seed that when used to construct a
27  Bloom filter will provide the smallest observed false positive
28  probability for the given sets of data. The optimal seed will
29  be the one associated with the round that has the smallest
30  difference percentage of false positive probability against
31  the user specified false positive probability.
32 */
33 
34 
35 #include <iostream>
36 #include <cstddef>
37 #include <cstdio>
38 #include <fstream>
39 #include <iterator>
40 #include <algorithm>
41 #include <vector>
42 #include <deque>
43 #include <set>
44 #include <string>
45 
46 #include "bloom_filter.hpp"
47 
48 bool load_word_list(int argc, char* argv[], std::vector<std::string>& word_list);
49 
50 template <class T,
51  class Allocator,
52  template <class,class> class Container>
53 bool read_file(const std::string& file_name, Container<T, Allocator>& c);
54 
55 std::string uppercase(std::string str);
56 std::string reverse(std::string str);
57 
58 void generate_outliers(const std::vector<std::string>& word_list, std::deque<std::string>& outliers);
59 void purify_outliers(const std::vector<std::string>& word_list,std::deque<std::string>& outliers);
60 
61 int main(int argc, char* argv[])
62 {
63  std::vector<std::string> word_list;
64  std::deque<std::string> outliers;
65 
66  if (!load_word_list(argc,argv,word_list))
67  {
68  return 1;
69  }
70 
71  generate_outliers(word_list,outliers);
72 
73  unsigned int random_seed = 0;
74  std::size_t word_list_storage_size = 0;
75 
76  for (unsigned int i = 0; i < word_list.size(); ++i)
77  {
78  word_list_storage_size += word_list[i].size();
79  }
80 
81  std::size_t total_number_of_queries = 0;
82 
83  const double desired_probability_of_false_positive = 1.0 / word_list.size();
84 
85  printf("Round\t Queries\t FPQ\t IPFP\t PFP\t DPFP\t TvD\n");
86 
87  unsigned int max_false_positive_count = 0;
88  unsigned int min_false_positive_count = std::numeric_limits<unsigned int>::max();
89  unsigned int total_false_positive = 0;
90  unsigned int total_zero_fp = 0;
91  unsigned long long int bloom_filter_size = 0;
92 
93 
94  static const unsigned int rounds = 1000;
95 
96  while (random_seed < rounds)
97  {
98  bloom_parameters parameters;
99 
100  parameters.projected_element_count = word_list.size();
101  parameters.false_positive_probability = desired_probability_of_false_positive;
102  parameters.random_seed = ++random_seed;
103 
104  if (!parameters)
105  {
106  std::cout << "Error - Invalid set of bloom filter parameters!" << std::endl;
107  return 1;
108  }
109 
110  parameters.compute_optimal_parameters();
111 
112  bloom_filter filter(parameters);
113 
114  filter.insert(word_list.begin(),word_list.end());
115 
116  std::vector<std::string>::iterator it = filter.contains_all(word_list.begin(),word_list.end());
117 
118  if (word_list.end() != it)
119  {
120  std::cout << "ERROR: key not found in bloom filter! =>" << (*it) << std::endl;
121  return 1;
122  }
123 
124  unsigned int current_total_false_positive = 0;
125 
126  for (std::deque<std::string>::iterator itr = outliers.begin(); itr != outliers.end(); ++itr)
127  {
128  if (filter.contains(*itr))
129  {
130  ++current_total_false_positive;
131  }
132  }
133 
134  total_number_of_queries += (outliers.size() + word_list.size());
135 
136  // Overall false positive probability
137  double pfp = current_total_false_positive / (1.0 * outliers.size());
138 
139  printf("%6llu\t%10llu\t%6d\t%8.7f\t%8.7f\t%9.3f%%\t%8.6f\n",
140  static_cast<unsigned long long>(random_seed),
141  static_cast<unsigned long long>(total_number_of_queries),
142  current_total_false_positive,
143  desired_probability_of_false_positive,
144  pfp,
145  (100.0 * pfp) / desired_probability_of_false_positive,
146  (100.0 * filter.size()) / (bits_per_char * word_list_storage_size));
147 
148  if (current_total_false_positive < min_false_positive_count)
149  min_false_positive_count = current_total_false_positive;
150  else if (current_total_false_positive > max_false_positive_count)
151  max_false_positive_count = current_total_false_positive;
152 
153  total_false_positive += current_total_false_positive;
154 
155  if (0 == current_total_false_positive)
156  ++total_zero_fp;
157 
158  bloom_filter_size = filter.size();
159  }
160 
161  double average_fpc = (1.0 * total_false_positive) / rounds;
162  double average_fpp = average_fpc / (outliers.size() + word_list.size());
163 
164  printf("Bloom Filter Statistics\n"
165  "MinFPC: %d\tMaxFPC: %d\tAverageFPC: %8.5f\tAverageFPP: %9.8f Zero-FPC:%d\n"
166  "Filter Size: %lluKB\tData Size: %dKB\n",
167  min_false_positive_count,
168  max_false_positive_count,
169  average_fpc,
170  average_fpp,
171  total_zero_fp,
172  bloom_filter_size / (8 * 1024),
173  static_cast<unsigned int>(word_list_storage_size / 1024));
174 
175  /*
176  Terminology
177  MinFPC : Minimum (smallest) False Positive Count
178  MaxFPC : Maximum (largest) False Positive Count
179  AverageFPC : Average False Positive Count
180  AverageFPP : Average False Positive Probability
181 
182  FPQ : False Positive Queries
183  IPFP : Indicative (desired) False Positive Probability
184  PFP : Probability of a False Positive (based on the FPQ)
185  DPFP : Difference as a percentage between IPFP and PFP
186  TvD : percentage of the filter size versus the raw data size
187  */
188 
189  return 0;
190 }
191 
192 bool load_word_list(int argc, char* argv[], std::vector<std::string>& word_list)
193 {
194  // Note: The word-lists can be obtained from:
195  // https://github.com/ArashPartow/bloom
196  static const std::string wl_list[] =
197  { "word-list.txt",
198  "word-list-large.txt",
199  "word-list-extra-large.txt",
200  "random-list.txt"
201  };
202 
203  std::size_t index = 0;
204 
205  if (2 == argc)
206  {
207  index = ::atoi(argv[1]);
208 
209  const std::size_t wl_list_size = sizeof(wl_list) / sizeof(std::string);
210 
211  if (index >= wl_list_size)
212  {
213  std::cout << "Invalid world list index: " << index << std::endl;
214  return false;
215  }
216  }
217 
218  std::cout << "Loading list " << wl_list[index] << ".....";
219 
220  if (!read_file(wl_list[index],word_list))
221  {
222  return false;
223  }
224 
225  if (word_list.empty())
226  {
227  std::cout << "No word list - Either none requested, or desired word list could not be loaded." << std::endl;
228  return false;
229  }
230  else
231  std::cout << " Complete." << std::endl;
232 
233  return true;
234 }
235 
236 template <class T,
237  class Allocator,
238  template <class,class> class Container>
239 bool read_file(const std::string& file_name, Container<T, Allocator>& c)
240 {
241  std::ifstream stream(file_name.c_str());
242 
243  if (!stream)
244  {
245  std::cout << "Error: Failed to open file '" << file_name << "'" << std::endl;
246  return false;
247  }
248 
249  std::string buffer;
250 
251  while (std::getline(stream,buffer))
252  {
253  c.push_back(buffer);
254  c.push_back(uppercase(buffer));
255  }
256 
257  return true;
258 }
259 
260 std::string uppercase(std::string str)
261 {
262  for (std::size_t i = 0; i < str.size(); ++i)
263  {
264  str[i] = static_cast<char>(toupper(str[i]));
265  }
266 
267  return str;
268 }
269 
270 std::string reverse(std::string str)
271 {
272  // Not the most efficient way of doing this.
273  std::reverse(str.begin(),str.end());
274  return str;
275 }
276 
277 void generate_outliers(const std::vector<std::string>& word_list, std::deque<std::string>& outliers)
278 {
279  std::cout << "Generating outliers..... ";
280 
281  for (std::vector<std::string>::const_iterator it = word_list.begin(); it != word_list.end(); ++it)
282  {
283  if ((*it) != reverse((*it)))
284  {
285  outliers.push_back((*it) + reverse((*it)));
286  outliers.push_back((*it) + (*it));
287  outliers.push_back(reverse((*it)) + (*it) + reverse((*it)));
288  }
289 
290  std::string ns = *it;
291 
292  for (unsigned int i = 0; i < ns.size(); ++i)
293  {
294  if (1 == (i & 0x00)) ns[i] = ~ns[i];
295  }
296 
297  outliers.push_back(ns);
298  }
299 
300  static const std::string rand_str[] =
301  {
302  "oD5l", "pccW", "5yHt", "ndaN", "OaJh", "tWPc", "Cr9C", "a9zE",
303  "H1wL", "yo1V", "16D7", "f2WR", "0MVQ", "PkKn", "PlVa", "MvzL",
304  "9Csl", "JQTv", "IveD", "FDVS", "Q7HE", "QgcF", "Q9Vo", "V8zJ",
305  "EJWT", "GuLC", "rM3d", "PJF4", "HXPW", "qKx3", "ztRP", "t4KP",
306  "m1zV", "fn12", "B1QP", "Jr4I", "Mf8M", "4jBd", "anGR", "Pipt",
307  "QHon", "GNlc", "UeXM", "mVM5", "ABI8", "RhB3", "5h2s", "hOYo",
308  "gaId", "DX40", "THMu", "EwlP", "n9Mz", "oC1S", "BfMl", "uCZ1",
309  "G2bA", "MOH9", "zZ0O", "PKDO", "3nRU", "Z6ie", "4cso", "LnQO",
310  "MJTtT","td3rC","A5JNR","1yL5B","rQnJk","jNKYF","CD0XD","pFLSG",
311  "fxO1a","CAjBE","ORk4e","0LERI","R7d0x","Qqd7v","6Kih5","9tTCB",
312  "yCg9U","D2Tv7","XpNHn","6zeFQ","BT2cs","WGhKW","zTv6B","TTPFk",
313  "XjNVX","pg9yW","4pKiZ","mQUhL","xrXzR","kVRm5","NSyC4","olXm9",
314  "UWkYy","8Ys6r","yd4Fl","5L4mB","nP3nH","f0DFb","glnQa","DlXQa",
315  "cQdH6","eBmIN","fDj6F","ezLow","C15vu","I2Z2j","BQgzg","eVBid",
316  "hn5TO","WZyQN","xXgsE","sL6nK","8DKD8","jcrbp","AcRak","h8N5o",
317  "LViwC","ThEKf","O7fd5","oN0Id","OM1m0","4OLiR","VIa8N","bJZFG",
318  "9j3rL","SzW0N","7m7pY","mY9bg","k1p3e","3OFm1","r45se","VYwz3",
319  "pDjXt","ZcqcJ","npPHx","hA3bw","w7lSO","jEmZL","1x3AZ","FN47G",
320  "kThNf","aC4fq","rzDwi","CYRNG","gCeuG","wCVqO","d1R60","bEauW",
321  "KeUwW","lIKhO","RfPv3","dK5wE","1X7qu","tRwEn","1c03P","GwHCl",
322  "CsJaO","zl4j1","e0aEc","Uskgi","rgTGR","jyR4g","Tt6l4","lRoaw",
323  "94ult","qZwBX","eYW8S","Qf6UH","AbV56","N1hJq","JIaVe","8LHEx",
324  "DeNbS","30I0a","hm6qw","3jcaO","4WkuA","mQ219","Gb81C","yx4HM",
325  "Chlqfi9S1y", "BMwUgVFu2X", "ZmpEGOVrVe", "13ggJxrPkC", "fcJJpyMGjm", "9T00Dv4ZAb",
326  "p3YRcP7M2o", "sR0qNUXCHv", "gCxWZbJ6rb", "R4YtzRXXUl", "vwyYz5j6pY", "XPWUvLXhJ7",
327  "7PwfnVVb7U", "1f34Q6hOYz", "1EM2abZY61", "0a6Ivi4S0a", "Teq2LrQs2T", "dWXLCgWHc8",
328  "LawMv7ujn4", "N8VFgbZQx5", "tfvHHxoDgi", "ImwYgXA2tf", "KkIES9NqZO", "ajcz0qjjda",
329  "6Vz28vlGs9", "VMCc5W8cCt", "BiQB8BRJ98", "43CpOJSMpA", "jfBJdqwXcU", "ecHR9EO2ib",
330  "LH7CcXyCZ7", "JntqGSgSpa", "0MbTMpZPFW", "5FJSdiCXzR", "5gda2AhA2x", "lrDFc1lnXk",
331  "zrEwECHvjs", "B0JldDxFa1", "6DYal4QxKa", "Hsqx6kP2S4", "zZwnALSuFh", "Shh4ISZcKW",
332  "P9VDaNSk7Z", "mEI2PLSCO6", "WyTyrQORtu", "IvJyMMRgh3", "Q6pgJq8Nkv", "dhOgR3tDAD",
333  "Y9h6bVgbxO", "wA15tiOPTm", "8TaIKf1zCO", "z75dzabHBs", "AS6OPnwoJI", "2DSZka9Auj",
334  "QLzUjV2CWs", "KZSN2SVhia", "7ttYKWF2ue", "1Zxfu7B2ST", "RnkpmwjsCi", "YpcSIzaqx5",
335  "RDEwFD9gmX", "Nlx3V4Cjw4", "9ZdvITOj8M", "httUPWMNXO", "Ypv9PjxGwa", "LlwyNolNnH",
336  "6xpJOht47a", "tbmz4WIdcG", "OwzuVDlb7D", "PBQKJxo8DQ", "uVnMQn7hK6", "rlnZINuDUa",
337  "2feyyYukPa", "teOlpKuDBn", "LxBSWh0dL1", "Onyb7r4Jp0", "bZxXE6xOXg", "d9NSvNTunQ",
338  "ONerLBic32", "8mar4rKmFk", "5cCN9uwaCg", "ElVrYOHHMv", "YF6Og8DX40", "OgiCwpCQ5a",
339  "K6nSRZVxdR", "gqyXXXoVFW", "ulyRYizcBP", "khUx31K5UR", "qZFRzVthju", "pQBh0vnB20",
340  "dk8NIN7ajy", "XP7ed1OjZx", "IRYNwA5iFR", "hiSEBhTukC", "Ns4jJ3jzGo", "dYoCSxjIvM",
341  "HzGLbl5i1g", "baizENd4ko", "6rCqGBO8t1", "QWGfC8UaA7", "JFhRfxQe4K", "8R4W6IWANz",
342  "2TnWf1w7JH", "0z69e0wcoG", "8SN1mRHCY7", "oFGCYHHwGX", "G8xqnBgxjO", "6B3SAOayHt",
343  "XRW3ZSG1gw", "WcIjTxMxOM", "wNqCAIaTb2", "gO4em4HW8H", "TgGFSMEtbG", "WiwmbEw3QA",
344  "D2xshYUgpu", "xRUZCQVzBs", "nCnUmMgIjE", "p4Ewt1yCJr", "MeOjDcaMY5", "1XelMeXiiI"
345  };
346 
347  static const std::size_t rand_str_size = sizeof(rand_str) / sizeof(std::string);
348 
349  for (unsigned int i = 0; i < rand_str_size; ++i)
350  {
351  std::string s0 = rand_str[i];
352  std::string s1 = rand_str[(i + 1) % rand_str_size];
353  std::string s2 = rand_str[(i + 2) % rand_str_size];
354  std::string s3 = rand_str[(i + 3) % rand_str_size];
355  std::string s4 = rand_str[(i + 4) % rand_str_size];
356  std::string s5 = rand_str[(i + 5) % rand_str_size];
357  std::string s6 = rand_str[(i + 6) % rand_str_size];
358 
359  outliers.push_back(s0);
360  outliers.push_back(s0 + s1);
361  outliers.push_back(s0 + s2 + s4);
362  outliers.push_back(s0 + s1 + s3);
363  outliers.push_back(s0 + s1 + s2 + s3 + s4 + s5);
364  outliers.push_back(s0 + s1 + s2 + s3 + s4 + s5 + s6);
365 
366  outliers.push_back(reverse(s0));
367  outliers.push_back(reverse(s0 + s1));
368  outliers.push_back(reverse(s0 + s2 + s4));
369  outliers.push_back(reverse(s0 + s1 + s3));
370  outliers.push_back(reverse(s0 + s1 + s2 + s3 + s4 + s5));
371  outliers.push_back(reverse(s0 + s1 + s2 + s3 + s4 + s5 + s6));
372  }
373 
374  std::sort(outliers.begin(),outliers.end());
375 
376  purify_outliers(word_list,outliers);
377 
378  std::cout << "Complete." << std::endl;
379 }
380 
381 void purify_outliers(const std::vector<std::string>& word_list, std::deque<std::string>& outliers)
382 {
383  std::set<std::string> set1;
384  std::set<std::string> set2;
385 
386  std::copy(word_list.begin(), word_list.end(),std::inserter(set1,set1.begin()));
387  std::copy(outliers.begin(), outliers.end(), std::inserter(set2,set2.begin()));
388 
389  std::deque<std::string> intersect_list;
390 
391  std::set_intersection(set1.begin(),set1.end(),
392  set2.begin(),set2.end(),
393  std::back_inserter(intersect_list));
394 
395  std::sort(intersect_list.begin(),intersect_list.end());
396 
397  if (!intersect_list.empty())
398  {
399  std::deque<std::string> new_outliers;
400 
401  for (std::deque<std::string>::iterator it = outliers.begin(); it != outliers.end(); ++it)
402  {
403  if (!std::binary_search(intersect_list.begin(),intersect_list.end(),*it))
404  {
405  new_outliers.push_back(*it);
406  }
407  }
408 
409  outliers.swap(new_outliers);
410  }
411 }
virtual bool compute_optimal_parameters()
unsigned long long int random_seed
std::string uppercase(std::string str)
void insert(const unsigned char *key_begin, const std::size_t &length)
InputIterator contains_all(const InputIterator begin, const InputIterator end) const
static const std::size_t bits_per_char
double false_positive_probability
int main(int argc, char *argv[])
bool load_word_list(int argc, char *argv[], std::vector< std::string > &word_list)
virtual unsigned long long int size() const
bool read_file(const std::string &file_name, Container< T, Allocator > &c)
void generate_outliers(const std::vector< std::string > &word_list, std::deque< std::string > &outliers)
void purify_outliers(const std::vector< std::string > &word_list, std::deque< std::string > &outliers)
virtual bool contains(const unsigned char *key_begin, const std::size_t length) const
std::string reverse(std::string str)
unsigned long long int projected_element_count