C++ Bloom Filter Library  release
bloom_filter_example03.cpp
Go to the documentation of this file.
1 /*
2  *********************************************************************
3  * *
4  * Open Bloom Filter *
5  * *
6  * Description: Usage pattern of Compressible Bloom Filter *
7  * Author: Arash Partow - 2000 *
8  * URL: http://www.partow.net *
9  * URL: http://www.partow.net/programming/hashfunctions/index.html *
10  * *
11  * Copyright notice: *
12  * Free use of the Open Bloom Filter Library is permitted under the *
13  * guidelines and in accordance with the MIT License. *
14  * http://www.opensource.org/licenses/MIT *
15  * *
16  *********************************************************************
17 */
18 
19 
20 /*
21  Description: This example will demonstrate how to instantiate a compressible
22  Bloom filter, insert strings and then query the inserted strings
23  and a set of outlier strings for membership status in the filter.
24  Furthermore on each round the size of the Bloom filter will be
25  reduced/compressed by 5%. In theory this should cause the
26  effective/theoretical false positive probability to increase.
27  The objective of this exercise is to track the observed false
28  positive probability against the effective false positive
29  probability as the filter's size is gradually reduced.
30 */
31 
32 
33 #include <iostream>
34 #include <cstddef>
35 #include <cstdio>
36 #include <fstream>
37 #include <iterator>
38 #include <algorithm>
39 #include <vector>
40 #include <deque>
41 #include <set>
42 #include <string>
43 
44 #include "bloom_filter.hpp"
45 
46 bool load_word_list(int argc, char* argv[], std::vector<std::string>& word_list);
47 
48 template <class T,
49  class Allocator,
50  template <class,class> class Container>
51 bool read_file(const std::string& file_name, Container<T, Allocator>& c);
52 
53 void generate_outliers(const std::vector<std::string>& word_list, std::deque<std::string>& outliers);
54 void purify_outliers(const std::vector<std::string>& word_list,std::deque<std::string>& outliers);
55 
56 int main(int argc, char* argv[])
57 {
58  std::vector<std::string> word_list;
59  std::deque<std::string> outliers;
60 
61  if (!load_word_list(argc,argv,word_list))
62  {
63  return 1;
64  }
65 
66  generate_outliers(word_list,outliers);
67 
68  unsigned int random_seed = 0xA57EC3B2;
69 
70  const double desired_probability_of_false_positive = 1.0 / word_list.size();
71 
72  bloom_parameters parameters;
73  parameters.projected_element_count = word_list.size();
74  parameters.false_positive_probability = desired_probability_of_false_positive;
75  parameters.random_seed = random_seed++;
76  parameters.maximum_number_of_hashes = 7;
77 
78  if (!parameters)
79  {
80  std::cout << "Error - Invalid set of bloom filter parameters!" << std::endl;
81  return 1;
82  }
83 
84  parameters.compute_optimal_parameters();
85 
86  compressible_bloom_filter filter(parameters);
87 
88  filter.insert(word_list.begin(),word_list.end());
89 
90  std::cout << "Filter Size\tEFPP \tOFPP \tDiff" << std::endl;
91 
92  while (filter.size() > 1)
93  {
94  std::vector<std::string>::iterator it = filter.contains_all(word_list.begin(),word_list.end());
95 
96  if (word_list.end() != it)
97  {
98  std::cout << "ERROR: key not found in bloom filter! =>" << (*it) << std::endl;
99  return 1;
100  }
101 
102  std::size_t total_false_positive = 0;
103 
104  for (std::deque<std::string>::iterator itr = outliers.begin(); itr != outliers.end(); ++itr)
105  {
106  if (filter.contains(*itr)) ++total_false_positive;
107  }
108 
109  double pfp = total_false_positive / (1.0 * outliers.size());
110 
111  printf("%11llu\t%8.7f\t%8.7f\t%8.6f\n",
112  static_cast<unsigned long long>(filter.size()),
113  filter.effective_fpp(),
114  pfp,
115  100.0 * (pfp / filter.effective_fpp()));
116 
117  if (!filter.compress(5.0))
118  {
119  std::cout << "Filter cannot be compressed any further." << std::endl;
120  break;
121  }
122  }
123 
124  return 0;
125 }
126 
127 bool load_word_list(int argc, char* argv[], std::vector<std::string>& word_list)
128 {
129  // Note: The word-lists can be obtained from:
130  // https://github.com/ArashPartow/bloom
131  static const std::string wl_list[] =
132  { "word-list.txt",
133  "word-list-large.txt",
134  "word-list-extra-large.txt",
135  "random-list.txt"
136  };
137 
138  std::size_t index = 0;
139 
140  if (2 == argc)
141  {
142  index = ::atoi(argv[1]);
143 
144  const std::size_t wl_list_size = sizeof(wl_list) / sizeof(std::string);
145 
146  if (index >= wl_list_size)
147  {
148  std::cout << "Invalid world list index: " << index << std::endl;
149  return false;
150  }
151  }
152 
153  std::cout << "Loading list " << wl_list[index] << ".....";
154 
155  if (!read_file(wl_list[index],word_list))
156  {
157  return false;
158  }
159 
160  if (word_list.empty())
161  {
162  std::cout << "No word list - Either none requested, or desired word list could not be loaded." << std::endl;
163  return false;
164  }
165  else
166  std::cout << " Complete." << std::endl;
167 
168  return true;
169 }
170 
171 template <class T,
172  class Allocator,
173  template <class,class> class Container>
174 bool read_file(const std::string& file_name, Container<T, Allocator>& c)
175 {
176  std::ifstream stream(file_name.c_str());
177 
178  if (!stream)
179  {
180  std::cout << "Error: Failed to open file '" << file_name << "'" << std::endl;
181  return false;
182  }
183 
184  std::string buffer;
185 
186  while (std::getline(stream,buffer))
187  {
188  c.push_back(buffer);
189  }
190 
191  return true;
192 }
193 
194 std::string reverse(std::string str)
195 {
196  // Not the most efficient way of doing this.
197  std::reverse(str.begin(),str.end());
198  return str;
199 }
200 
201 void generate_outliers(const std::vector<std::string>& word_list, std::deque<std::string>& outliers)
202 {
203  std::cout << "Generating outliers..... ";
204 
205  for (std::vector<std::string>::const_iterator it = word_list.begin(); it != word_list.end(); ++it)
206  {
207  if ((*it) != reverse((*it)))
208  {
209  outliers.push_back((*it) + reverse((*it)));
210  outliers.push_back((*it) + (*it));
211  outliers.push_back(reverse((*it)) + (*it) + reverse((*it)));
212  }
213 
214  std::string ns = *it;
215 
216  for (unsigned int i = 0; i < ns.size(); ++i)
217  {
218  if (1 == (i & 0x00)) ns[i] = ~ns[i];
219  }
220 
221  outliers.push_back(ns);
222  }
223 
224  static const std::string rand_str[] =
225  {
226  "oD5l", "pccW", "5yHt", "ndaN", "OaJh", "tWPc", "Cr9C", "a9zE",
227  "H1wL", "yo1V", "16D7", "f2WR", "0MVQ", "PkKn", "PlVa", "MvzL",
228  "9Csl", "JQTv", "IveD", "FDVS", "Q7HE", "QgcF", "Q9Vo", "V8zJ",
229  "EJWT", "GuLC", "rM3d", "PJF4", "HXPW", "qKx3", "ztRP", "t4KP",
230  "m1zV", "fn12", "B1QP", "Jr4I", "Mf8M", "4jBd", "anGR", "Pipt",
231  "QHon", "GNlc", "UeXM", "mVM5", "ABI8", "RhB3", "5h2s", "hOYo",
232  "gaId", "DX40", "THMu", "EwlP", "n9Mz", "oC1S", "BfMl", "uCZ1",
233  "G2bA", "MOH9", "zZ0O", "PKDO", "3nRU", "Z6ie", "4cso", "LnQO",
234  "MJTtT","td3rC","A5JNR","1yL5B","rQnJk","jNKYF","CD0XD","pFLSG",
235  "fxO1a","CAjBE","ORk4e","0LERI","R7d0x","Qqd7v","6Kih5","9tTCB",
236  "yCg9U","D2Tv7","XpNHn","6zeFQ","BT2cs","WGhKW","zTv6B","TTPFk",
237  "XjNVX","pg9yW","4pKiZ","mQUhL","xrXzR","kVRm5","NSyC4","olXm9",
238  "UWkYy","8Ys6r","yd4Fl","5L4mB","nP3nH","f0DFb","glnQa","DlXQa",
239  "cQdH6","eBmIN","fDj6F","ezLow","C15vu","I2Z2j","BQgzg","eVBid",
240  "hn5TO","WZyQN","xXgsE","sL6nK","8DKD8","jcrbp","AcRak","h8N5o",
241  "LViwC","ThEKf","O7fd5","oN0Id","OM1m0","4OLiR","VIa8N","bJZFG",
242  "9j3rL","SzW0N","7m7pY","mY9bg","k1p3e","3OFm1","r45se","VYwz3",
243  "pDjXt","ZcqcJ","npPHx","hA3bw","w7lSO","jEmZL","1x3AZ","FN47G",
244  "kThNf","aC4fq","rzDwi","CYRNG","gCeuG","wCVqO","d1R60","bEauW",
245  "KeUwW","lIKhO","RfPv3","dK5wE","1X7qu","tRwEn","1c03P","GwHCl",
246  "CsJaO","zl4j1","e0aEc","Uskgi","rgTGR","jyR4g","Tt6l4","lRoaw",
247  "94ult","qZwBX","eYW8S","Qf6UH","AbV56","N1hJq","JIaVe","8LHEx",
248  "DeNbS","30I0a","hm6qw","3jcaO","4WkuA","mQ219","Gb81C","yx4HM",
249  "Chlqfi9S1y", "BMwUgVFu2X", "ZmpEGOVrVe", "13ggJxrPkC", "fcJJpyMGjm", "9T00Dv4ZAb",
250  "p3YRcP7M2o", "sR0qNUXCHv", "gCxWZbJ6rb", "R4YtzRXXUl", "vwyYz5j6pY", "XPWUvLXhJ7",
251  "7PwfnVVb7U", "1f34Q6hOYz", "1EM2abZY61", "0a6Ivi4S0a", "Teq2LrQs2T", "dWXLCgWHc8",
252  "LawMv7ujn4", "N8VFgbZQx5", "tfvHHxoDgi", "ImwYgXA2tf", "KkIES9NqZO", "ajcz0qjjda",
253  "6Vz28vlGs9", "VMCc5W8cCt", "BiQB8BRJ98", "43CpOJSMpA", "jfBJdqwXcU", "ecHR9EO2ib",
254  "LH7CcXyCZ7", "JntqGSgSpa", "0MbTMpZPFW", "5FJSdiCXzR", "5gda2AhA2x", "lrDFc1lnXk",
255  "zrEwECHvjs", "B0JldDxFa1", "6DYal4QxKa", "Hsqx6kP2S4", "zZwnALSuFh", "Shh4ISZcKW",
256  "P9VDaNSk7Z", "mEI2PLSCO6", "WyTyrQORtu", "IvJyMMRgh3", "Q6pgJq8Nkv", "dhOgR3tDAD",
257  "Y9h6bVgbxO", "wA15tiOPTm", "8TaIKf1zCO", "z75dzabHBs", "AS6OPnwoJI", "2DSZka9Auj",
258  "QLzUjV2CWs", "KZSN2SVhia", "7ttYKWF2ue", "1Zxfu7B2ST", "RnkpmwjsCi", "YpcSIzaqx5",
259  "RDEwFD9gmX", "Nlx3V4Cjw4", "9ZdvITOj8M", "httUPWMNXO", "Ypv9PjxGwa", "LlwyNolNnH",
260  "6xpJOht47a", "tbmz4WIdcG", "OwzuVDlb7D", "PBQKJxo8DQ", "uVnMQn7hK6", "rlnZINuDUa",
261  "2feyyYukPa", "teOlpKuDBn", "LxBSWh0dL1", "Onyb7r4Jp0", "bZxXE6xOXg", "d9NSvNTunQ",
262  "ONerLBic32", "8mar4rKmFk", "5cCN9uwaCg", "ElVrYOHHMv", "YF6Og8DX40", "OgiCwpCQ5a",
263  "K6nSRZVxdR", "gqyXXXoVFW", "ulyRYizcBP", "khUx31K5UR", "qZFRzVthju", "pQBh0vnB20",
264  "dk8NIN7ajy", "XP7ed1OjZx", "IRYNwA5iFR", "hiSEBhTukC", "Ns4jJ3jzGo", "dYoCSxjIvM",
265  "HzGLbl5i1g", "baizENd4ko", "6rCqGBO8t1", "QWGfC8UaA7", "JFhRfxQe4K", "8R4W6IWANz",
266  "2TnWf1w7JH", "0z69e0wcoG", "8SN1mRHCY7", "oFGCYHHwGX", "G8xqnBgxjO", "6B3SAOayHt",
267  "XRW3ZSG1gw", "WcIjTxMxOM", "wNqCAIaTb2", "gO4em4HW8H", "TgGFSMEtbG", "WiwmbEw3QA",
268  "D2xshYUgpu", "xRUZCQVzBs", "nCnUmMgIjE", "p4Ewt1yCJr", "MeOjDcaMY5", "1XelMeXiiI"
269  };
270  static const std::size_t rand_str_size = sizeof(rand_str) / sizeof(std::string);
271 
272  for (unsigned int i = 0; i < rand_str_size; ++i)
273  {
274  std::string s0 = rand_str[i];
275  std::string s1 = rand_str[(i + 1) % rand_str_size];
276  std::string s2 = rand_str[(i + 2) % rand_str_size];
277  std::string s3 = rand_str[(i + 3) % rand_str_size];
278  std::string s4 = rand_str[(i + 4) % rand_str_size];
279  std::string s5 = rand_str[(i + 5) % rand_str_size];
280  std::string s6 = rand_str[(i + 6) % rand_str_size];
281 
282  outliers.push_back(s0);
283  outliers.push_back(s0 + s1);
284  outliers.push_back(s0 + s2 + s4);
285  outliers.push_back(s0 + s1 + s3);
286  outliers.push_back(s0 + s1 + s2 + s3 + s4 + s5);
287  outliers.push_back(s0 + s1 + s2 + s3 + s4 + s5 + s6);
288 
289  outliers.push_back(reverse(s0));
290  outliers.push_back(reverse(s0 + s1));
291  outliers.push_back(reverse(s0 + s2 + s4));
292  outliers.push_back(reverse(s0 + s1 + s3));
293  outliers.push_back(reverse(s0 + s1 + s2 + s3 + s4 + s5));
294  outliers.push_back(reverse(s0 + s1 + s2 + s3 + s4 + s5 + s6));
295  }
296 
297  std::sort(outliers.begin(),outliers.end());
298 
299  purify_outliers(word_list,outliers);
300 
301  std::cout << "Complete." << std::endl;
302 }
303 
304 void purify_outliers(const std::vector<std::string>& word_list, std::deque<std::string>& outliers)
305 {
306  std::set<std::string> set1;
307  std::set<std::string> set2;
308 
309  std::copy(word_list.begin(), word_list.end(), std::inserter(set1,set1.begin()));
310  std::copy(outliers .begin(), outliers .end(), std::inserter(set2,set2.begin()));
311 
312  std::deque<std::string> intersect_list;
313 
314  std::set_intersection(set1.begin(),set1.end(),
315  set2.begin(),set2.end(),
316  std::back_inserter(intersect_list));
317 
318  std::sort(intersect_list.begin(),intersect_list.end());
319 
320  if (!intersect_list.empty())
321  {
322  std::deque<std::string> new_outliers;
323 
324  for (std::deque<std::string>::iterator it = outliers.begin(); it != outliers.end(); ++it)
325  {
326  if (!std::binary_search(intersect_list.begin(),intersect_list.end(),*it))
327  {
328  new_outliers.push_back(*it);
329  }
330  }
331 
332  outliers.swap(new_outliers);
333  }
334 }
virtual bool compute_optimal_parameters()
unsigned long long int random_seed
bool read_file(const std::string &file_name, Container< T, Allocator > &c)
void insert(const unsigned char *key_begin, const std::size_t &length)
InputIterator contains_all(const InputIterator begin, const InputIterator end) const
unsigned int maximum_number_of_hashes
bool load_word_list(int argc, char *argv[], std::vector< std::string > &word_list)
double false_positive_probability
double effective_fpp() const
void generate_outliers(const std::vector< std::string > &word_list, std::deque< std::string > &outliers)
bool compress(const double &percentage)
std::string reverse(std::string str)
unsigned long long int size() const
void purify_outliers(const std::vector< std::string > &word_list, std::deque< std::string > &outliers)
virtual bool contains(const unsigned char *key_begin, const std::size_t length) const
unsigned long long int projected_element_count
int main(int argc, char *argv[])