C++ Bloom Filter Library  release
Functions
bloom_filter_example02.cpp File Reference
#include <iostream>
#include <cstddef>
#include <cstdio>
#include <fstream>
#include <iterator>
#include <algorithm>
#include <vector>
#include <deque>
#include <set>
#include <string>
#include "bloom_filter.hpp"
Include dependency graph for bloom_filter_example02.cpp:

Go to the source code of this file.

Functions

bool load_word_list (int argc, char *argv[], std::vector< std::string > &word_list)
 
template<class T , class Allocator , template< class, class > class Container>
bool read_file (const std::string &file_name, Container< T, Allocator > &c)
 
std::string uppercase (std::string str)
 
std::string reverse (std::string str)
 
void generate_outliers (const std::vector< std::string > &word_list, std::deque< std::string > &outliers)
 
void purify_outliers (const std::vector< std::string > &word_list, std::deque< std::string > &outliers)
 
int main (int argc, char *argv[])
 

Function Documentation

◆ generate_outliers()

void generate_outliers ( const std::vector< std::string > &  word_list,
std::deque< std::string > &  outliers 
)

Definition at line 277 of file bloom_filter_example02.cpp.

References purify_outliers(), and reverse().

Referenced by main().

278 {
279  std::cout << "Generating outliers..... ";
280 
281  for (std::vector<std::string>::const_iterator it = word_list.begin(); it != word_list.end(); ++it)
282  {
283  if ((*it) != reverse((*it)))
284  {
285  outliers.push_back((*it) + reverse((*it)));
286  outliers.push_back((*it) + (*it));
287  outliers.push_back(reverse((*it)) + (*it) + reverse((*it)));
288  }
289 
290  std::string ns = *it;
291 
292  for (unsigned int i = 0; i < ns.size(); ++i)
293  {
294  if (1 == (i & 0x00)) ns[i] = ~ns[i];
295  }
296 
297  outliers.push_back(ns);
298  }
299 
300  static const std::string rand_str[] =
301  {
302  "oD5l", "pccW", "5yHt", "ndaN", "OaJh", "tWPc", "Cr9C", "a9zE",
303  "H1wL", "yo1V", "16D7", "f2WR", "0MVQ", "PkKn", "PlVa", "MvzL",
304  "9Csl", "JQTv", "IveD", "FDVS", "Q7HE", "QgcF", "Q9Vo", "V8zJ",
305  "EJWT", "GuLC", "rM3d", "PJF4", "HXPW", "qKx3", "ztRP", "t4KP",
306  "m1zV", "fn12", "B1QP", "Jr4I", "Mf8M", "4jBd", "anGR", "Pipt",
307  "QHon", "GNlc", "UeXM", "mVM5", "ABI8", "RhB3", "5h2s", "hOYo",
308  "gaId", "DX40", "THMu", "EwlP", "n9Mz", "oC1S", "BfMl", "uCZ1",
309  "G2bA", "MOH9", "zZ0O", "PKDO", "3nRU", "Z6ie", "4cso", "LnQO",
310  "MJTtT","td3rC","A5JNR","1yL5B","rQnJk","jNKYF","CD0XD","pFLSG",
311  "fxO1a","CAjBE","ORk4e","0LERI","R7d0x","Qqd7v","6Kih5","9tTCB",
312  "yCg9U","D2Tv7","XpNHn","6zeFQ","BT2cs","WGhKW","zTv6B","TTPFk",
313  "XjNVX","pg9yW","4pKiZ","mQUhL","xrXzR","kVRm5","NSyC4","olXm9",
314  "UWkYy","8Ys6r","yd4Fl","5L4mB","nP3nH","f0DFb","glnQa","DlXQa",
315  "cQdH6","eBmIN","fDj6F","ezLow","C15vu","I2Z2j","BQgzg","eVBid",
316  "hn5TO","WZyQN","xXgsE","sL6nK","8DKD8","jcrbp","AcRak","h8N5o",
317  "LViwC","ThEKf","O7fd5","oN0Id","OM1m0","4OLiR","VIa8N","bJZFG",
318  "9j3rL","SzW0N","7m7pY","mY9bg","k1p3e","3OFm1","r45se","VYwz3",
319  "pDjXt","ZcqcJ","npPHx","hA3bw","w7lSO","jEmZL","1x3AZ","FN47G",
320  "kThNf","aC4fq","rzDwi","CYRNG","gCeuG","wCVqO","d1R60","bEauW",
321  "KeUwW","lIKhO","RfPv3","dK5wE","1X7qu","tRwEn","1c03P","GwHCl",
322  "CsJaO","zl4j1","e0aEc","Uskgi","rgTGR","jyR4g","Tt6l4","lRoaw",
323  "94ult","qZwBX","eYW8S","Qf6UH","AbV56","N1hJq","JIaVe","8LHEx",
324  "DeNbS","30I0a","hm6qw","3jcaO","4WkuA","mQ219","Gb81C","yx4HM",
325  "Chlqfi9S1y", "BMwUgVFu2X", "ZmpEGOVrVe", "13ggJxrPkC", "fcJJpyMGjm", "9T00Dv4ZAb",
326  "p3YRcP7M2o", "sR0qNUXCHv", "gCxWZbJ6rb", "R4YtzRXXUl", "vwyYz5j6pY", "XPWUvLXhJ7",
327  "7PwfnVVb7U", "1f34Q6hOYz", "1EM2abZY61", "0a6Ivi4S0a", "Teq2LrQs2T", "dWXLCgWHc8",
328  "LawMv7ujn4", "N8VFgbZQx5", "tfvHHxoDgi", "ImwYgXA2tf", "KkIES9NqZO", "ajcz0qjjda",
329  "6Vz28vlGs9", "VMCc5W8cCt", "BiQB8BRJ98", "43CpOJSMpA", "jfBJdqwXcU", "ecHR9EO2ib",
330  "LH7CcXyCZ7", "JntqGSgSpa", "0MbTMpZPFW", "5FJSdiCXzR", "5gda2AhA2x", "lrDFc1lnXk",
331  "zrEwECHvjs", "B0JldDxFa1", "6DYal4QxKa", "Hsqx6kP2S4", "zZwnALSuFh", "Shh4ISZcKW",
332  "P9VDaNSk7Z", "mEI2PLSCO6", "WyTyrQORtu", "IvJyMMRgh3", "Q6pgJq8Nkv", "dhOgR3tDAD",
333  "Y9h6bVgbxO", "wA15tiOPTm", "8TaIKf1zCO", "z75dzabHBs", "AS6OPnwoJI", "2DSZka9Auj",
334  "QLzUjV2CWs", "KZSN2SVhia", "7ttYKWF2ue", "1Zxfu7B2ST", "RnkpmwjsCi", "YpcSIzaqx5",
335  "RDEwFD9gmX", "Nlx3V4Cjw4", "9ZdvITOj8M", "httUPWMNXO", "Ypv9PjxGwa", "LlwyNolNnH",
336  "6xpJOht47a", "tbmz4WIdcG", "OwzuVDlb7D", "PBQKJxo8DQ", "uVnMQn7hK6", "rlnZINuDUa",
337  "2feyyYukPa", "teOlpKuDBn", "LxBSWh0dL1", "Onyb7r4Jp0", "bZxXE6xOXg", "d9NSvNTunQ",
338  "ONerLBic32", "8mar4rKmFk", "5cCN9uwaCg", "ElVrYOHHMv", "YF6Og8DX40", "OgiCwpCQ5a",
339  "K6nSRZVxdR", "gqyXXXoVFW", "ulyRYizcBP", "khUx31K5UR", "qZFRzVthju", "pQBh0vnB20",
340  "dk8NIN7ajy", "XP7ed1OjZx", "IRYNwA5iFR", "hiSEBhTukC", "Ns4jJ3jzGo", "dYoCSxjIvM",
341  "HzGLbl5i1g", "baizENd4ko", "6rCqGBO8t1", "QWGfC8UaA7", "JFhRfxQe4K", "8R4W6IWANz",
342  "2TnWf1w7JH", "0z69e0wcoG", "8SN1mRHCY7", "oFGCYHHwGX", "G8xqnBgxjO", "6B3SAOayHt",
343  "XRW3ZSG1gw", "WcIjTxMxOM", "wNqCAIaTb2", "gO4em4HW8H", "TgGFSMEtbG", "WiwmbEw3QA",
344  "D2xshYUgpu", "xRUZCQVzBs", "nCnUmMgIjE", "p4Ewt1yCJr", "MeOjDcaMY5", "1XelMeXiiI"
345  };
346 
347  static const std::size_t rand_str_size = sizeof(rand_str) / sizeof(std::string);
348 
349  for (unsigned int i = 0; i < rand_str_size; ++i)
350  {
351  std::string s0 = rand_str[i];
352  std::string s1 = rand_str[(i + 1) % rand_str_size];
353  std::string s2 = rand_str[(i + 2) % rand_str_size];
354  std::string s3 = rand_str[(i + 3) % rand_str_size];
355  std::string s4 = rand_str[(i + 4) % rand_str_size];
356  std::string s5 = rand_str[(i + 5) % rand_str_size];
357  std::string s6 = rand_str[(i + 6) % rand_str_size];
358 
359  outliers.push_back(s0);
360  outliers.push_back(s0 + s1);
361  outliers.push_back(s0 + s2 + s4);
362  outliers.push_back(s0 + s1 + s3);
363  outliers.push_back(s0 + s1 + s2 + s3 + s4 + s5);
364  outliers.push_back(s0 + s1 + s2 + s3 + s4 + s5 + s6);
365 
366  outliers.push_back(reverse(s0));
367  outliers.push_back(reverse(s0 + s1));
368  outliers.push_back(reverse(s0 + s2 + s4));
369  outliers.push_back(reverse(s0 + s1 + s3));
370  outliers.push_back(reverse(s0 + s1 + s2 + s3 + s4 + s5));
371  outliers.push_back(reverse(s0 + s1 + s2 + s3 + s4 + s5 + s6));
372  }
373 
374  std::sort(outliers.begin(),outliers.end());
375 
376  purify_outliers(word_list,outliers);
377 
378  std::cout << "Complete." << std::endl;
379 }
void purify_outliers(const std::vector< std::string > &word_list, std::deque< std::string > &outliers)
std::string reverse(std::string str)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ load_word_list()

bool load_word_list ( int  argc,
char *  argv[],
std::vector< std::string > &  word_list 
)

Definition at line 192 of file bloom_filter_example02.cpp.

References read_file().

Referenced by main().

193 {
194  // Note: The word-lists can be obtained from:
195  // https://github.com/ArashPartow/bloom
196  static const std::string wl_list[] =
197  { "word-list.txt",
198  "word-list-large.txt",
199  "word-list-extra-large.txt",
200  "random-list.txt"
201  };
202 
203  std::size_t index = 0;
204 
205  if (2 == argc)
206  {
207  index = ::atoi(argv[1]);
208 
209  const std::size_t wl_list_size = sizeof(wl_list) / sizeof(std::string);
210 
211  if (index >= wl_list_size)
212  {
213  std::cout << "Invalid world list index: " << index << std::endl;
214  return false;
215  }
216  }
217 
218  std::cout << "Loading list " << wl_list[index] << ".....";
219 
220  if (!read_file(wl_list[index],word_list))
221  {
222  return false;
223  }
224 
225  if (word_list.empty())
226  {
227  std::cout << "No word list - Either none requested, or desired word list could not be loaded." << std::endl;
228  return false;
229  }
230  else
231  std::cout << " Complete." << std::endl;
232 
233  return true;
234 }
bool read_file(const std::string &file_name, Container< T, Allocator > &c)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ main()

int main ( int  argc,
char *  argv[] 
)

Definition at line 61 of file bloom_filter_example02.cpp.

References bits_per_char, bloom_parameters::compute_optimal_parameters(), bloom_filter::contains(), bloom_filter::contains_all(), bloom_parameters::false_positive_probability, generate_outliers(), bloom_filter::insert(), load_word_list(), bloom_parameters::projected_element_count, bloom_parameters::random_seed, and bloom_filter::size().

62 {
63  std::vector<std::string> word_list;
64  std::deque<std::string> outliers;
65 
66  if (!load_word_list(argc,argv,word_list))
67  {
68  return 1;
69  }
70 
71  generate_outliers(word_list,outliers);
72 
73  unsigned int random_seed = 0;
74  std::size_t word_list_storage_size = 0;
75 
76  for (unsigned int i = 0; i < word_list.size(); ++i)
77  {
78  word_list_storage_size += word_list[i].size();
79  }
80 
81  std::size_t total_number_of_queries = 0;
82 
83  const double desired_probability_of_false_positive = 1.0 / word_list.size();
84 
85  printf("Round\t Queries\t FPQ\t IPFP\t PFP\t DPFP\t TvD\n");
86 
87  unsigned int max_false_positive_count = 0;
88  unsigned int min_false_positive_count = std::numeric_limits<unsigned int>::max();
89  unsigned int total_false_positive = 0;
90  unsigned int total_zero_fp = 0;
91  unsigned long long int bloom_filter_size = 0;
92 
93 
94  static const unsigned int rounds = 1000;
95 
96  while (random_seed < rounds)
97  {
98  bloom_parameters parameters;
99 
100  parameters.projected_element_count = word_list.size();
101  parameters.false_positive_probability = desired_probability_of_false_positive;
102  parameters.random_seed = ++random_seed;
103 
104  if (!parameters)
105  {
106  std::cout << "Error - Invalid set of bloom filter parameters!" << std::endl;
107  return 1;
108  }
109 
110  parameters.compute_optimal_parameters();
111 
112  bloom_filter filter(parameters);
113 
114  filter.insert(word_list.begin(),word_list.end());
115 
116  std::vector<std::string>::iterator it = filter.contains_all(word_list.begin(),word_list.end());
117 
118  if (word_list.end() != it)
119  {
120  std::cout << "ERROR: key not found in bloom filter! =>" << (*it) << std::endl;
121  return 1;
122  }
123 
124  unsigned int current_total_false_positive = 0;
125 
126  for (std::deque<std::string>::iterator itr = outliers.begin(); itr != outliers.end(); ++itr)
127  {
128  if (filter.contains(*itr))
129  {
130  ++current_total_false_positive;
131  }
132  }
133 
134  total_number_of_queries += (outliers.size() + word_list.size());
135 
136  // Overall false positive probability
137  double pfp = current_total_false_positive / (1.0 * outliers.size());
138 
139  printf("%6llu\t%10llu\t%6d\t%8.7f\t%8.7f\t%9.3f%%\t%8.6f\n",
140  static_cast<unsigned long long>(random_seed),
141  static_cast<unsigned long long>(total_number_of_queries),
142  current_total_false_positive,
143  desired_probability_of_false_positive,
144  pfp,
145  (100.0 * pfp) / desired_probability_of_false_positive,
146  (100.0 * filter.size()) / (bits_per_char * word_list_storage_size));
147 
148  if (current_total_false_positive < min_false_positive_count)
149  min_false_positive_count = current_total_false_positive;
150  else if (current_total_false_positive > max_false_positive_count)
151  max_false_positive_count = current_total_false_positive;
152 
153  total_false_positive += current_total_false_positive;
154 
155  if (0 == current_total_false_positive)
156  ++total_zero_fp;
157 
158  bloom_filter_size = filter.size();
159  }
160 
161  double average_fpc = (1.0 * total_false_positive) / rounds;
162  double average_fpp = average_fpc / (outliers.size() + word_list.size());
163 
164  printf("Bloom Filter Statistics\n"
165  "MinFPC: %d\tMaxFPC: %d\tAverageFPC: %8.5f\tAverageFPP: %9.8f Zero-FPC:%d\n"
166  "Filter Size: %lluKB\tData Size: %dKB\n",
167  min_false_positive_count,
168  max_false_positive_count,
169  average_fpc,
170  average_fpp,
171  total_zero_fp,
172  bloom_filter_size / (8 * 1024),
173  static_cast<unsigned int>(word_list_storage_size / 1024));
174 
175  /*
176  Terminology
177  MinFPC : Minimum (smallest) False Positive Count
178  MaxFPC : Maximum (largest) False Positive Count
179  AverageFPC : Average False Positive Count
180  AverageFPP : Average False Positive Probability
181 
182  FPQ : False Positive Queries
183  IPFP : Indicative (desired) False Positive Probability
184  PFP : Probability of a False Positive (based on the FPQ)
185  DPFP : Difference as a percentage between IPFP and PFP
186  TvD : percentage of the filter size versus the raw data size
187  */
188 
189  return 0;
190 }
virtual bool compute_optimal_parameters()
unsigned long long int random_seed
static const std::size_t bits_per_char
double false_positive_probability
bool load_word_list(int argc, char *argv[], std::vector< std::string > &word_list)
void generate_outliers(const std::vector< std::string > &word_list, std::deque< std::string > &outliers)
unsigned long long int projected_element_count
Here is the call graph for this function:

◆ purify_outliers()

void purify_outliers ( const std::vector< std::string > &  word_list,
std::deque< std::string > &  outliers 
)

Definition at line 381 of file bloom_filter_example02.cpp.

Referenced by generate_outliers().

382 {
383  std::set<std::string> set1;
384  std::set<std::string> set2;
385 
386  std::copy(word_list.begin(), word_list.end(),std::inserter(set1,set1.begin()));
387  std::copy(outliers.begin(), outliers.end(), std::inserter(set2,set2.begin()));
388 
389  std::deque<std::string> intersect_list;
390 
391  std::set_intersection(set1.begin(),set1.end(),
392  set2.begin(),set2.end(),
393  std::back_inserter(intersect_list));
394 
395  std::sort(intersect_list.begin(),intersect_list.end());
396 
397  if (!intersect_list.empty())
398  {
399  std::deque<std::string> new_outliers;
400 
401  for (std::deque<std::string>::iterator it = outliers.begin(); it != outliers.end(); ++it)
402  {
403  if (!std::binary_search(intersect_list.begin(),intersect_list.end(),*it))
404  {
405  new_outliers.push_back(*it);
406  }
407  }
408 
409  outliers.swap(new_outliers);
410  }
411 }
Here is the caller graph for this function:

◆ read_file()

template<class T , class Allocator , template< class, class > class Container>
bool read_file ( const std::string &  file_name,
Container< T, Allocator > &  c 
)

Definition at line 239 of file bloom_filter_example02.cpp.

References uppercase().

Referenced by load_word_list().

240 {
241  std::ifstream stream(file_name.c_str());
242 
243  if (!stream)
244  {
245  std::cout << "Error: Failed to open file '" << file_name << "'" << std::endl;
246  return false;
247  }
248 
249  std::string buffer;
250 
251  while (std::getline(stream,buffer))
252  {
253  c.push_back(buffer);
254  c.push_back(uppercase(buffer));
255  }
256 
257  return true;
258 }
std::string uppercase(std::string str)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ reverse()

std::string reverse ( std::string  str)

Definition at line 270 of file bloom_filter_example02.cpp.

Referenced by generate_outliers().

271 {
272  // Not the most efficient way of doing this.
273  std::reverse(str.begin(),str.end());
274  return str;
275 }
std::string reverse(std::string str)
Here is the caller graph for this function:

◆ uppercase()

std::string uppercase ( std::string  str)

Definition at line 260 of file bloom_filter_example02.cpp.

Referenced by read_file().

261 {
262  for (std::size_t i = 0; i < str.size(); ++i)
263  {
264  str[i] = static_cast<char>(toupper(str[i]));
265  }
266 
267  return str;
268 }
Here is the caller graph for this function: