diff --git a/unit-tests/Makefile.in b/unit-tests/Makefile.in
index db790e6..403e67a 100644
--- a/unit-tests/Makefile.in
+++ b/unit-tests/Makefile.in
@@ -50,6 +50,7 @@ TEST_SOURCE=\
 	unit-tests/base64_t.cc \
 	unit-tests/bitset_t.cc \
 	unit-tests/block_t.cc \
+	unit-tests/bloom_filter_t.cc \
 	unit-tests/btree_t.cc \
 	unit-tests/btree_damage_visitor_t.cc \
 	unit-tests/buffer_t.cc \
diff --git a/unit-tests/bloom_filter_t.cc b/unit-tests/bloom_filter_t.cc
new file mode 100644
index 0000000..af95f38
--- /dev/null
+++ b/unit-tests/bloom_filter_t.cc
@@ -0,0 +1,325 @@
+#include "gmock/gmock.h"
+#include "persistent-data/transaction_manager.h"
+#include "persistent-data/space-maps/core.h"
+#include "persistent-data/data-structures/array_block.h"
+#include "test_utils.h"
+
+// NOTE(review): the header names of the angle-bracket includes were lost
+// when this patch was extracted.  The list below is reconstructed from the
+// names this file uses -- confirm against the original commit.
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include <deque>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace persistent_data;
+using namespace std;
+using namespace test;
+using namespace testing;
+
+//----------------------------------------------------------------
+
+namespace {
+	// Hash functions over block addresses; bloom_filter uses them to pick
+	// which bits a value maps to.
+	struct block_address_bloom_traits {
+		typedef block_address value_type;
+
+		static const uint64_t ones = ~0ull;
+		static const uint64_t m1 = 0x9e37fffffffc0001UL;
+		static const uint64_t m2 = ones - 82;
+
+		static const unsigned bits = 18;
+
+		// Multiplies by m1 and discards the low 'bits' bits of the product.
+		static uint64_t hash1(block_address const &b) {
+			return (b * m1) >> bits;
+		}
+
+		// Shift/xor/multiply mix of the low 32 bits of b.
+		static uint64_t hash2(block_address const &b) {
+			uint32_t n = b;
+
+			n = n ^ (n >> 16);
+			n = n * 0x85ebca6bu;
+			n = n ^ (n >> 13);
+			n = n * 0xc2b2ae35u;
+			n = n ^ (n >> 16);
+
+			return n;
+		}
+
+		// Same shape as hash1 with multiplier m2.  bloom_filter below does
+		// not call it.
+		static uint64_t hash3(block_address const &b) {
+			return (b * m2) >> bits;
+		}
+	};
+
+	// In-memory bloom filter of 2^nr_bits_power bits that sets nr_probes
+	// bits per added value.  Traits supplies value_type, hash1 and hash2.
+	template <class Traits>
+	class bloom_filter {
+	public:
+		bloom_filter(unsigned nr_bits_power, unsigned nr_probes)
+			: bits_(1ull << nr_bits_power, false),
+			  nr_probes_(nr_probes),
+			  mask_((1ull << nr_bits_power) - 1) {
+
+			cerr << "nr entries = " << bits_.size() << ", mask = " << mask_ << endl;
+		}
+
+		// Returns false if any probe bit for v is clear, i.e. v was never
+		// added.  True means every probe bit is set, which other values may
+		// have caused.
+		bool test(typename Traits::value_type const &v) const {
+			vector<unsigned> probes(nr_probes_);
+			fill_probes(v, probes);
+
+			for (unsigned p = 0; p < nr_probes_; p++)
+				if (!bits_.at(probes[p]))
+					return false;
+
+			return true;
+		}
+
+		// Sets every probe bit for v.
+		void add(typename Traits::value_type const &v) {
+			vector<unsigned> probes(nr_probes_);
+			fill_probes(v, probes);
+
+			for (unsigned p = 0; p < nr_probes_; p++)
+				bits_.at(probes[p]) = true;
+		}
+
+		// Prints the residency, then one "<run length>: <number of runs>"
+		// line per distinct run length.  Runs of set bits and runs of clear
+		// bits are counted together.
+		void dump() const {
+			residency();
+
+			map<unsigned, unsigned> runs;
+
+			for (unsigned i = 0; i < bits_.size();) {
+				bool v = bits_[i];
+				unsigned run_length = 1;
+
+				// The bound is checked before indexing so the final run
+				// does not read bits_[size()].
+				while (++i < bits_.size() && bits_[i] == v)
+					run_length++;
+
+				++runs[run_length];
+			}
+
+			map<unsigned, unsigned>::const_iterator it;
+			for (it = runs.begin(); it != runs.end(); ++it)
+				cout << it->first << ": " << it->second << endl;
+		}
+
+		// Prints "residency: <set bits>/<total bits>".
+		void residency() const {
+			unsigned count = 0;
+			for (unsigned i = 0; i < bits_.size(); i++)
+				if (bits_[i])
+					count++;
+
+			cout << "residency: " << count << "/" << bits_.size() << endl;
+		}
+
+	private:
+		// Fills probes with nr_probes_ bit indexes for v.  The first is
+		// hash1(v); each later index adds a step that starts at hash2(v)
+		// and grows by the probe number.  Everything is reduced with mask_.
+		void fill_probes(typename Traits::value_type const &v, vector<unsigned> &probes) const {
+			// With no probes requested there is no probes[0] to write.
+			if (probes.empty())
+				return;
+
+			uint32_t h1 = Traits::hash1(v) & mask_;
+			uint32_t h2 = Traits::hash2(v) & mask_;
+
+			probes[0] = h1;
+			for (unsigned p = 1; p < nr_probes_; p++) {
+				h1 = (h1 + h2) & mask_;
+				h2 = (h2 + p) & mask_;
+				probes[p] = h1;
+			}
+		}
+
+		vector<bool> bits_;
+		unsigned nr_probes_;
+		uint64_t mask_;
+	};
+
+	//--------------------------------
+	// NOTE(review): compiled-out, unfinished sketch.  Part of era_details
+	// was lost when this patch was extracted; see the note inside it.
+#if 0
+	class dm_era {
+	public:
+		dm_era(block_address nr_blocks)
+			: nr_blocks_(nr_blocks),
+			  era_base_(0),
+			  base_(nr_blocks, false) {
+		}
+
+		set<block_address> blocks_written_since(unsigned era) const {
+
+		}
+
+		unsigned get_era() const {
+			return era_base_ + eras_.size() - 1;
+		}
+
+		void record_write(block_address b) {
+			current_era.record_write(b);
+		}
+
+		void resize(block_address new_size) {
+			nr_blocks_ = new_size;
+			push_era();
+			base_.resize(new_size, false);
+		}
+
+	private:
+		era_details &current_era() {
+			return eras_.back();
+		}
+
+		void need_new_era() {
+			// ???
+		}
+
+		void push_era() {
+			eras_.push_back(era(nr_blocks_));
+			if (eras_.size() > 100)
+				pop_era();
+		}
+
+		void pop_era() {
+			era_base_++;
+
+
+
+			eras_.pop_front();
+		}
+
+		static const unsigned NR_PROBES = 6;
+
+		class era_details {
+		public:
+			era_details(block_address nr_blocks)
+				: nr_blocks_(nr_blocks),
+				  f(power_bits(nr_blocks, NR_PROBES)) {
+			}
+
+			void record_write(block_address b) {
+				f.add(b);
+			}
+
+			// NOTE(review): the text between "add_blocks_written(set<"
+			// and "> filter;" was lost in extraction.  The declaration
+			// and typedef below are guesses -- restore the original.
+			void add_blocks_written(set<block_address> &result) const;
+
+		private:
+			typedef bloom_filter<block_address_bloom_traits> filter;
+
+			block_address nr_blocks;
+			filter f;
+		};
+
+		block_address nr_blocks_;
+		unsigned era_base_;
+		vector<bool> base_;
+		deque<era_details> eras_;
+	};
+#endif
+
+	//--------------------------------
+
+	class BloomFilterTests : public Test {
+	public:
+		// Returns count distinct block addresses drawn uniformly from
+		// [0, max].  The generator is default-constructed on every call,
+		// so the sequence is the same each time.  Loops forever if the
+		// range holds fewer than count distinct values.
+		set<block_address> generate_random_blocks(unsigned count,
+							  block_address max = std::numeric_limits<block_address>::max()) {
+			set<block_address> r;
+
+			using namespace boost::random;
+
+			mt19937 rng;
+			uniform_int_distribution<block_address> uniform_dist(0, max);
+
+			while (r.size() < count) {
+				block_address b = uniform_dist(rng);
+				r.insert(b);
+			}
+
+			return r;
+		}
+	};
+}
+
+//----------------------------------------------------------------
+
+TEST_F(BloomFilterTests, can_create_a_bloom_filter)
+{
+	bloom_filter<block_address_bloom_traits> f(10, 3);
+}
+
+TEST_F(BloomFilterTests, no_false_negatives)
+{
+	bloom_filter<block_address_bloom_traits> f(12, 6);
+	set<block_address> bs = generate_random_blocks(1000);
+
+	set<block_address>::const_iterator it;
+	for (it = bs.begin(); it != bs.end(); ++it)
+		f.add(*it);
+
+	for (it = bs.begin(); it != bs.end(); ++it)
+		ASSERT_THAT(f.test(*it), Eq(true));
+
+	f.dump();
+}
+
+// Prints how many never-added blocks the filter claims to contain.  Nothing
+// is asserted; the figures are for inspection only.
+TEST_F(BloomFilterTests, count_false_positives)
+{
+	block_address nr_blocks = 128 * 1024 * 1024;
+	block_address written_blocks = nr_blocks / 100;
+
+	unsigned shift = 1;
+
+	while ((1ull << shift) < (16 * written_blocks))
+		shift++;
+	cerr << "bitset " << ((1ull << shift) / (8 * 1024)) << "k" << endl;
+
+	bloom_filter<block_address_bloom_traits> f(shift, 6);
+	set<block_address> bs = generate_random_blocks(written_blocks, nr_blocks);
+	set<block_address>::const_iterator it;
+
+	for (it = bs.begin(); it != bs.end(); ++it)
+		f.add(*it);
+
+	f.dump();
+
+	unsigned count = 0;
+	for (block_address i = 0; i < nr_blocks; i++)
+		if (!bs.count(i) && f.test(i))
+			count++;
+
+	cerr << count << " false positives out of " << nr_blocks << endl;
+	cerr << static_cast<double>(count) * 100.0 / static_cast<double>(nr_blocks) << "%" << endl;
+}
+
+//----------------------------------------------------------------