48 bool load_word_list(
int argc,
char* argv[], std::vector<std::string>& word_list);
52 template <
class,
class>
class Container>
53 bool read_file(
const std::string& file_name, Container<T, Allocator>& c);
56 std::string
reverse(std::string str);
58 void generate_outliers(
const std::vector<std::string>& word_list, std::deque<std::string>& outliers);
59 void purify_outliers(
const std::vector<std::string>& word_list,std::deque<std::string>& outliers);
61 int main(
int argc,
char* argv[])
63 std::vector<std::string> word_list;
64 std::deque<std::string> outliers;
73 unsigned int random_seed = 0;
74 std::size_t word_list_storage_size = 0;
76 for (
unsigned int i = 0; i < word_list.size(); ++i)
78 word_list_storage_size += word_list[i].size();
81 std::size_t total_number_of_queries = 0;
83 const double desired_probability_of_false_positive = 1.0 / word_list.size();
85 printf(
"Round\t Queries\t FPQ\t IPFP\t PFP\t DPFP\t TvD\n");
87 unsigned int max_false_positive_count = 0;
88 unsigned int min_false_positive_count = std::numeric_limits<unsigned int>::max();
89 unsigned int total_false_positive = 0;
90 unsigned int total_zero_fp = 0;
91 unsigned long long int bloom_filter_size = 0;
94 static const unsigned int rounds = 1000;
96 while (random_seed < rounds)
106 std::cout <<
"Error - Invalid set of bloom filter parameters!" << std::endl;
114 filter.
insert(word_list.begin(),word_list.end());
116 std::vector<std::string>::iterator it = filter.
contains_all(word_list.begin(),word_list.end());
118 if (word_list.end() != it)
120 std::cout <<
"ERROR: key not found in bloom filter! =>" << (*it) << std::endl;
124 unsigned int current_total_false_positive = 0;
126 for (std::deque<std::string>::iterator itr = outliers.begin(); itr != outliers.end(); ++itr)
130 ++current_total_false_positive;
134 total_number_of_queries += (outliers.size() + word_list.size());
137 double pfp = current_total_false_positive / (1.0 * outliers.size());
139 printf(
"%6llu\t%10llu\t%6d\t%8.7f\t%8.7f\t%9.3f%%\t%8.6f\n",
140 static_cast<unsigned long long>(random_seed),
141 static_cast<unsigned long long>(total_number_of_queries),
142 current_total_false_positive,
143 desired_probability_of_false_positive,
145 (100.0 * pfp) / desired_probability_of_false_positive,
148 if (current_total_false_positive < min_false_positive_count)
149 min_false_positive_count = current_total_false_positive;
150 else if (current_total_false_positive > max_false_positive_count)
151 max_false_positive_count = current_total_false_positive;
153 total_false_positive += current_total_false_positive;
155 if (0 == current_total_false_positive)
158 bloom_filter_size = filter.
size();
161 double average_fpc = (1.0 * total_false_positive) / rounds;
162 double average_fpp = average_fpc / (outliers.size() + word_list.size());
164 printf(
"Bloom Filter Statistics\n" 165 "MinFPC: %d\tMaxFPC: %d\tAverageFPC: %8.5f\tAverageFPP: %9.8f Zero-FPC:%d\n" 166 "Filter Size: %lluKB\tData Size: %dKB\n",
167 min_false_positive_count,
168 max_false_positive_count,
172 bloom_filter_size / (8 * 1024),
173 static_cast<unsigned int>(word_list_storage_size / 1024));
196 static const std::string wl_list[] =
198 "word-list-large.txt",
199 "word-list-extra-large.txt",
203 std::size_t index = 0;
207 index = ::atoi(argv[1]);
209 const std::size_t wl_list_size =
sizeof(wl_list) /
sizeof(std::string);
211 if (index >= wl_list_size)
213 std::cout <<
"Invalid world list index: " << index << std::endl;
218 std::cout <<
"Loading list " << wl_list[index] <<
".....";
220 if (!
read_file(wl_list[index],word_list))
225 if (word_list.empty())
227 std::cout <<
"No word list - Either none requested, or desired word list could not be loaded." << std::endl;
231 std::cout <<
" Complete." << std::endl;
238 template <
class,
class>
class Container>
239 bool read_file(
const std::string& file_name, Container<T, Allocator>& c)
241 std::ifstream stream(file_name.c_str());
245 std::cout <<
"Error: Failed to open file '" << file_name <<
"'" << std::endl;
251 while (std::getline(stream,buffer))
262 for (std::size_t i = 0; i < str.size(); ++i)
264 str[i] =
static_cast<char>(toupper(str[i]));
277 void generate_outliers(
const std::vector<std::string>& word_list, std::deque<std::string>& outliers)
279 std::cout <<
"Generating outliers..... ";
281 for (std::vector<std::string>::const_iterator it = word_list.begin(); it != word_list.end(); ++it)
285 outliers.push_back((*it) +
reverse((*it)));
286 outliers.push_back((*it) + (*it));
290 std::string ns = *it;
292 for (
unsigned int i = 0; i < ns.size(); ++i)
294 if (1 == (i & 0x00)) ns[i] = ~ns[i];
297 outliers.push_back(ns);
300 static const std::string rand_str[] =
302 "oD5l",
"pccW",
"5yHt",
"ndaN",
"OaJh",
"tWPc",
"Cr9C",
"a9zE",
303 "H1wL",
"yo1V",
"16D7",
"f2WR",
"0MVQ",
"PkKn",
"PlVa",
"MvzL",
304 "9Csl",
"JQTv",
"IveD",
"FDVS",
"Q7HE",
"QgcF",
"Q9Vo",
"V8zJ",
305 "EJWT",
"GuLC",
"rM3d",
"PJF4",
"HXPW",
"qKx3",
"ztRP",
"t4KP",
306 "m1zV",
"fn12",
"B1QP",
"Jr4I",
"Mf8M",
"4jBd",
"anGR",
"Pipt",
307 "QHon",
"GNlc",
"UeXM",
"mVM5",
"ABI8",
"RhB3",
"5h2s",
"hOYo",
308 "gaId",
"DX40",
"THMu",
"EwlP",
"n9Mz",
"oC1S",
"BfMl",
"uCZ1",
309 "G2bA",
"MOH9",
"zZ0O",
"PKDO",
"3nRU",
"Z6ie",
"4cso",
"LnQO",
310 "MJTtT",
"td3rC",
"A5JNR",
"1yL5B",
"rQnJk",
"jNKYF",
"CD0XD",
"pFLSG",
311 "fxO1a",
"CAjBE",
"ORk4e",
"0LERI",
"R7d0x",
"Qqd7v",
"6Kih5",
"9tTCB",
312 "yCg9U",
"D2Tv7",
"XpNHn",
"6zeFQ",
"BT2cs",
"WGhKW",
"zTv6B",
"TTPFk",
313 "XjNVX",
"pg9yW",
"4pKiZ",
"mQUhL",
"xrXzR",
"kVRm5",
"NSyC4",
"olXm9",
314 "UWkYy",
"8Ys6r",
"yd4Fl",
"5L4mB",
"nP3nH",
"f0DFb",
"glnQa",
"DlXQa",
315 "cQdH6",
"eBmIN",
"fDj6F",
"ezLow",
"C15vu",
"I2Z2j",
"BQgzg",
"eVBid",
316 "hn5TO",
"WZyQN",
"xXgsE",
"sL6nK",
"8DKD8",
"jcrbp",
"AcRak",
"h8N5o",
317 "LViwC",
"ThEKf",
"O7fd5",
"oN0Id",
"OM1m0",
"4OLiR",
"VIa8N",
"bJZFG",
318 "9j3rL",
"SzW0N",
"7m7pY",
"mY9bg",
"k1p3e",
"3OFm1",
"r45se",
"VYwz3",
319 "pDjXt",
"ZcqcJ",
"npPHx",
"hA3bw",
"w7lSO",
"jEmZL",
"1x3AZ",
"FN47G",
320 "kThNf",
"aC4fq",
"rzDwi",
"CYRNG",
"gCeuG",
"wCVqO",
"d1R60",
"bEauW",
321 "KeUwW",
"lIKhO",
"RfPv3",
"dK5wE",
"1X7qu",
"tRwEn",
"1c03P",
"GwHCl",
322 "CsJaO",
"zl4j1",
"e0aEc",
"Uskgi",
"rgTGR",
"jyR4g",
"Tt6l4",
"lRoaw",
323 "94ult",
"qZwBX",
"eYW8S",
"Qf6UH",
"AbV56",
"N1hJq",
"JIaVe",
"8LHEx",
324 "DeNbS",
"30I0a",
"hm6qw",
"3jcaO",
"4WkuA",
"mQ219",
"Gb81C",
"yx4HM",
325 "Chlqfi9S1y",
"BMwUgVFu2X",
"ZmpEGOVrVe",
"13ggJxrPkC",
"fcJJpyMGjm",
"9T00Dv4ZAb",
326 "p3YRcP7M2o",
"sR0qNUXCHv",
"gCxWZbJ6rb",
"R4YtzRXXUl",
"vwyYz5j6pY",
"XPWUvLXhJ7",
327 "7PwfnVVb7U",
"1f34Q6hOYz",
"1EM2abZY61",
"0a6Ivi4S0a",
"Teq2LrQs2T",
"dWXLCgWHc8",
328 "LawMv7ujn4",
"N8VFgbZQx5",
"tfvHHxoDgi",
"ImwYgXA2tf",
"KkIES9NqZO",
"ajcz0qjjda",
329 "6Vz28vlGs9",
"VMCc5W8cCt",
"BiQB8BRJ98",
"43CpOJSMpA",
"jfBJdqwXcU",
"ecHR9EO2ib",
330 "LH7CcXyCZ7",
"JntqGSgSpa",
"0MbTMpZPFW",
"5FJSdiCXzR",
"5gda2AhA2x",
"lrDFc1lnXk",
331 "zrEwECHvjs",
"B0JldDxFa1",
"6DYal4QxKa",
"Hsqx6kP2S4",
"zZwnALSuFh",
"Shh4ISZcKW",
332 "P9VDaNSk7Z",
"mEI2PLSCO6",
"WyTyrQORtu",
"IvJyMMRgh3",
"Q6pgJq8Nkv",
"dhOgR3tDAD",
333 "Y9h6bVgbxO",
"wA15tiOPTm",
"8TaIKf1zCO",
"z75dzabHBs",
"AS6OPnwoJI",
"2DSZka9Auj",
334 "QLzUjV2CWs",
"KZSN2SVhia",
"7ttYKWF2ue",
"1Zxfu7B2ST",
"RnkpmwjsCi",
"YpcSIzaqx5",
335 "RDEwFD9gmX",
"Nlx3V4Cjw4",
"9ZdvITOj8M",
"httUPWMNXO",
"Ypv9PjxGwa",
"LlwyNolNnH",
336 "6xpJOht47a",
"tbmz4WIdcG",
"OwzuVDlb7D",
"PBQKJxo8DQ",
"uVnMQn7hK6",
"rlnZINuDUa",
337 "2feyyYukPa",
"teOlpKuDBn",
"LxBSWh0dL1",
"Onyb7r4Jp0",
"bZxXE6xOXg",
"d9NSvNTunQ",
338 "ONerLBic32",
"8mar4rKmFk",
"5cCN9uwaCg",
"ElVrYOHHMv",
"YF6Og8DX40",
"OgiCwpCQ5a",
339 "K6nSRZVxdR",
"gqyXXXoVFW",
"ulyRYizcBP",
"khUx31K5UR",
"qZFRzVthju",
"pQBh0vnB20",
340 "dk8NIN7ajy",
"XP7ed1OjZx",
"IRYNwA5iFR",
"hiSEBhTukC",
"Ns4jJ3jzGo",
"dYoCSxjIvM",
341 "HzGLbl5i1g",
"baizENd4ko",
"6rCqGBO8t1",
"QWGfC8UaA7",
"JFhRfxQe4K",
"8R4W6IWANz",
342 "2TnWf1w7JH",
"0z69e0wcoG",
"8SN1mRHCY7",
"oFGCYHHwGX",
"G8xqnBgxjO",
"6B3SAOayHt",
343 "XRW3ZSG1gw",
"WcIjTxMxOM",
"wNqCAIaTb2",
"gO4em4HW8H",
"TgGFSMEtbG",
"WiwmbEw3QA",
344 "D2xshYUgpu",
"xRUZCQVzBs",
"nCnUmMgIjE",
"p4Ewt1yCJr",
"MeOjDcaMY5",
"1XelMeXiiI" 347 static const std::size_t rand_str_size =
sizeof(rand_str) /
sizeof(std::string);
349 for (
unsigned int i = 0; i < rand_str_size; ++i)
351 std::string s0 = rand_str[i];
352 std::string s1 = rand_str[(i + 1) % rand_str_size];
353 std::string s2 = rand_str[(i + 2) % rand_str_size];
354 std::string s3 = rand_str[(i + 3) % rand_str_size];
355 std::string s4 = rand_str[(i + 4) % rand_str_size];
356 std::string s5 = rand_str[(i + 5) % rand_str_size];
357 std::string s6 = rand_str[(i + 6) % rand_str_size];
359 outliers.push_back(s0);
360 outliers.push_back(s0 + s1);
361 outliers.push_back(s0 + s2 + s4);
362 outliers.push_back(s0 + s1 + s3);
363 outliers.push_back(s0 + s1 + s2 + s3 + s4 + s5);
364 outliers.push_back(s0 + s1 + s2 + s3 + s4 + s5 + s6);
366 outliers.push_back(
reverse(s0));
367 outliers.push_back(
reverse(s0 + s1));
368 outliers.push_back(
reverse(s0 + s2 + s4));
369 outliers.push_back(
reverse(s0 + s1 + s3));
370 outliers.push_back(
reverse(s0 + s1 + s2 + s3 + s4 + s5));
371 outliers.push_back(
reverse(s0 + s1 + s2 + s3 + s4 + s5 + s6));
374 std::sort(outliers.begin(),outliers.end());
378 std::cout <<
"Complete." << std::endl;
381 void purify_outliers(
const std::vector<std::string>& word_list, std::deque<std::string>& outliers)
383 std::set<std::string> set1;
384 std::set<std::string> set2;
386 std::copy(word_list.begin(), word_list.end(),std::inserter(set1,set1.begin()));
387 std::copy(outliers.begin(), outliers.end(), std::inserter(set2,set2.begin()));
389 std::deque<std::string> intersect_list;
391 std::set_intersection(set1.begin(),set1.end(),
392 set2.begin(),set2.end(),
393 std::back_inserter(intersect_list));
395 std::sort(intersect_list.begin(),intersect_list.end());
397 if (!intersect_list.empty())
399 std::deque<std::string> new_outliers;
401 for (std::deque<std::string>::iterator it = outliers.begin(); it != outliers.end(); ++it)
403 if (!std::binary_search(intersect_list.begin(),intersect_list.end(),*it))
405 new_outliers.push_back(*it);
409 outliers.swap(new_outliers);
virtual bool compute_optimal_parameters()
unsigned long long int random_seed
std::string uppercase(std::string str)
void insert(const unsigned char *key_begin, const std::size_t &length)
InputIterator contains_all(const InputIterator begin, const InputIterator end) const
static const std::size_t bits_per_char
double false_positive_probability
int main(int argc, char *argv[])
bool load_word_list(int argc, char *argv[], std::vector< std::string > &word_list)
virtual unsigned long long int size() const
bool read_file(const std::string &file_name, Container< T, Allocator > &c)
void generate_outliers(const std::vector< std::string > &word_list, std::deque< std::string > &outliers)
void purify_outliers(const std::vector< std::string > &word_list, std::deque< std::string > &outliers)
virtual bool contains(const unsigned char *key_begin, const std::size_t length) const
std::string reverse(std::string str)
unsigned long long int projected_element_count