diff --git a/Makefile.in b/Makefile.in
index 73326b1..d08f8cf 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -55,6 +55,7 @@ SOURCE=\
 	persistent-data/transaction_manager.cc \
 	\
 	persistent-data/data-structures/bitset.cc \
+	persistent-data/data-structures/bloom_filter.cc \
 	persistent-data/data-structures/btree.cc \
 	\
 	persistent-data/space_map.cc \
diff --git a/persistent-data/data-structures/bloom_filter.cc b/persistent-data/data-structures/bloom_filter.cc
new file mode 100644
index 0000000..3ca6ffb
--- /dev/null
+++ b/persistent-data/data-structures/bloom_filter.cc
@@ -0,0 +1,162 @@
+#include "persistent-data/data-structures/bloom_filter.h"
+
+#include <map>
+#include <ostream>
+#include <stdexcept>
+
+using namespace persistent_data;
+
+//----------------------------------------------------------------
+
+namespace {
+	// Multiplier and shift used by hash1.
+	static const uint64_t m1 = 0x9e37fffffffc0001UL;
+	static const unsigned bits = 18;
+
+	// Multiplicative hash: the product is shifted down by |bits| and
+	// then narrowed to 32 bits by the return type.
+	static uint32_t hash1(block_address const &b) {
+		return (b * m1) >> bits;
+	}
+
+	// Xor-shift/multiply mix of |b| after narrowing it to 32 bits.
+	static uint32_t hash2(block_address const &b) {
+		uint32_t n = b;
+
+		n = n ^ (n >> 16);
+		n = n * 0x85ebca6bu;
+		n = n ^ (n >> 13);
+		n = n * 0xc2b2ae35u;
+		n = n ^ (n >> 16);
+
+		return n;
+	}
+
+	// Zero has to be rejected explicitly: it passes the single-bit
+	// test below, and nr_bits - 1 would then wrap to an all-ones mask.
+	void check_power_of_two(unsigned nr_bits) {
+		if (!nr_bits || (nr_bits & (nr_bits - 1)))
+			throw std::runtime_error("bloom filter needs a power of two nr_bits");
+	}
+
+	// fill_probes() always writes probes[0], so an empty probe vector
+	// must never be allowed.
+	void check_nr_probes(unsigned nr_probes) {
+		if (!nr_probes)
+			throw std::runtime_error("bloom filter needs at least one probe");
+	}
+}
+
+//----------------------------------------------------------------
+
+bloom_filter::bloom_filter(tm_ptr tm,
+			   unsigned nr_bits, unsigned nr_probes)
+	: tm_(tm),
+	  bits_(tm),
+	  nr_probes_(nr_probes),
+	  mask_(nr_bits - 1)
+{
+	check_power_of_two(nr_bits);
+	check_nr_probes(nr_probes);
+	bits_.grow(nr_bits, false);
+}
+
+bloom_filter::bloom_filter(tm_ptr tm, block_address root,
+			   unsigned nr_bits, unsigned nr_probes)
+	: tm_(tm),
+	  bits_(tm, root, nr_bits),
+	  nr_probes_(nr_probes),
+	  mask_(nr_bits - 1)
+{
+	check_power_of_two(nr_bits);
+	check_nr_probes(nr_probes);
+}
+
+block_address
+bloom_filter::get_root() const
+{
+	return bits_.get_root();
+}
+
+bool
+bloom_filter::test(uint64_t b)
+{
+	std::vector<unsigned> probes(nr_probes_);
+	fill_probes(b, probes);
+
+	for (unsigned p = 0; p < nr_probes_; p++)
+		if (!bits_.get(probes[p]))
+			return false;
+
+	return true;
+}
+
+void
+bloom_filter::set(uint64_t b)
+{
+	std::vector<unsigned> probes(nr_probes_);
+	fill_probes(b, probes);
+
+	for (unsigned p = 0; p < nr_probes_; p++)
+		bits_.set(probes[p], true);
+}
+
+void
+bloom_filter::flush()
+{
+	bits_.flush();
+}
+
+// Derives nr_probes_ bit indexes from the two hashes of |b|; the
+// step h2 is itself bumped by the probe number on every iteration.
+void
+bloom_filter::fill_probes(block_address b, std::vector<unsigned> &probes) const
+{
+	uint32_t h1 = hash1(b) & mask_;
+	uint32_t h2 = hash2(b) & mask_;
+
+	probes[0] = h1;
+	for (unsigned p = 1; p < nr_probes_; p++) {
+		h1 = (h1 + h2) & mask_;
+		h2 = (h2 + p) & mask_;
+		probes[p] = h1;
+	}
+}
+
+// Writes the residency line followed by a histogram of run lengths
+// (consecutive equal bits), one "length: count" pair per line.
+void
+bloom_filter::print_debug(std::ostream &out)
+{
+	print_residency(out);
+
+	std::map<unsigned, unsigned> runs;
+
+	for (unsigned i = 0; i < bits_.get_nr_bits();) {
+		bool v = bits_.get(i);
+		unsigned run_length = 1;
+
+		while (++i < bits_.get_nr_bits() && bits_.get(i) == v)
+			run_length++;
+
+		// operator[] value-initialises a missing count to zero
+		runs[run_length]++;
+	}
+
+	std::map<unsigned, unsigned>::const_iterator it;
+	for (it = runs.begin(); it != runs.end(); ++it)
+		out << it->first << ": " << it->second << std::endl;
+}
+
+void
+bloom_filter::print_residency(std::ostream &out)
+{
+	unsigned count = 0;
+	for (unsigned i = 0; i < bits_.get_nr_bits(); i++)
+		if (bits_.get(i))
+			count++;
+
+	out << "residency: " << count << "/" << bits_.get_nr_bits() << std::endl;
+}
+
+//----------------------------------------------------------------
diff --git a/persistent-data/data-structures/bloom_filter.h b/persistent-data/data-structures/bloom_filter.h
new file mode 100644
index 0000000..6703a7d
--- /dev/null
+++ b/persistent-data/data-structures/bloom_filter.h
@@ -0,0 +1,56 @@
+#ifndef PERSISTENT_DATA_DATA_STRUCTURES_BLOOM_FILTER_H
+#define PERSISTENT_DATA_DATA_STRUCTURES_BLOOM_FILTER_H
+
+#include "persistent-data/transaction_manager.h"
+#include "persistent-data/data-structures/bitset.h"
+
+#include <boost/shared_ptr.hpp>
+#include <iosfwd>
+#include <vector>
+
+//----------------------------------------------------------------
+
+namespace persistent_data {
+	// Bloom filter over 64 bit keys whose bits are held in a
+	// persistent_data::bitset.
+	class bloom_filter {
+	public:
+		typedef boost::shared_ptr<bloom_filter> ptr;
+		typedef persistent_data::transaction_manager::ptr tm_ptr;
+
+		// Creates a filter with nr_bits cleared bits.  nr_bits must
+		// be a non-zero power of two and nr_probes must be non-zero,
+		// otherwise std::runtime_error is thrown.
+		bloom_filter(tm_ptr tm,
+			     unsigned nr_bits, unsigned nr_probes);
+
+		// Opens the filter whose bitset is rooted at |root|.  The
+		// same constraints on nr_bits and nr_probes apply.
+		bloom_filter(tm_ptr tm, block_address root,
+			     unsigned nr_bits, unsigned nr_probes);
+
+		// Root of the underlying bitset; pass it to the constructor
+		// above to reopen the filter.
+		block_address get_root() const;
+
+		bool test(uint64_t b); // not const due to caching effects in bitset
+		void set(uint64_t b);
+		void flush();
+
+		void print_debug(std::ostream &out);
+
+	private:
+		void print_residency(std::ostream &out);
+
+		void fill_probes(block_address b, std::vector<unsigned> &probes) const;
+
+		tm_ptr tm_;
+		persistent_data::bitset bits_;
+		unsigned nr_probes_;
+		uint64_t mask_;
+	};
+}
+
+//----------------------------------------------------------------
+
+#endif
diff --git a/unit-tests/bloom_filter_t.cc b/unit-tests/bloom_filter_t.cc
index af95f38..bb879ed 100644
--- a/unit-tests/bloom_filter_t.cc
+++ b/unit-tests/bloom_filter_t.cc
@@ -1,4 +1,5 @@
 #include "gmock/gmock.h"
+#include "persistent-data/data-structures/bloom_filter.h"
 #include "persistent-data/transaction_manager.h"
 #include "persistent-data/space-maps/core.h"
 #include "persistent-data/data-structures/array_block.h"
@@ -19,221 +20,19 @@ using namespace testing;
 //----------------------------------------------------------------
 
 namespace {
-	struct block_address_bloom_traits {
-		typedef block_address value_type;
-
-		static const uint64_t ones = ~0ull;
-		static const uint64_t m1 = 0x9e37fffffffc0001UL;
-		static const uint64_t m2 = ones - 82;
-
-		static const unsigned bits = 18;
-
-		static uint64_t hash1(block_address const &b) {
-			return (b * m1) >> bits;
-		}
-
-		static uint64_t hash2(block_address const &b) {
-			uint32_t n = b;
-
-			n = n ^ (n >> 16);
-			n = n * 0x85ebca6bu;
-			n = n ^ (n >> 13);
-			n = n * 0xc2b2ae35u;
-			n = n ^ (n >> 16);
-
-			return n;
-		}
-
-		static uint64_t hash3(block_address const &b) {
-			return (b * m2) >> bits;
-		}
-	};
-
-	template <typename Traits>
-	class bloom_filter {
-	public:
-		bloom_filter(unsigned nr_bits_power, unsigned nr_probes)
-			: bits_(1ull << nr_bits_power, false),
-			  nr_probes_(nr_probes),
-			  mask_((1ull << nr_bits_power) - 1) {
-
-			cerr << "nr entries = " << bits_.size() << ", mask = " << mask_ << endl;
-		}
-
-		bool test(typename Traits::value_type const &v) {
-			vector<unsigned> probes(nr_probes_);
-			fill_probes(v, probes);
-
-			for (unsigned p = 0; p < nr_probes_; p++)
-				if (!bits_.at(probes[p]))
-					return false;
-
-			return true;
-		}
-
-		void add(typename Traits::value_type const &v) {
-			vector<unsigned> probes(nr_probes_);
-			fill_probes(v, probes);
-
-			for (unsigned p = 0; p < nr_probes_; p++) {
-				//cerr << probes[p] << ", ";
-				bits_.at(probes[p]) = true;
-			}
-			//cerr << endl;
-		}
-
-		void dump() const {
-			residency();
-
-			map<unsigned, unsigned> runs;
-
-			for (unsigned i = 0; i < bits_.size();) {
-				bool v = bits_[i];
-				unsigned run_length = 1;
-
-				while (bits_[++i] == v && i < bits_.size())
-					run_length++;
-
-				map<unsigned, unsigned>::iterator it = runs.find(run_length);
-				if (it != runs.end())
-					it->second++;
-				else
-					runs.insert(make_pair(run_length, 1));
-			}
-
-			{
-				map<unsigned, unsigned>::const_iterator it;
-				for (it = runs.begin(); it != runs.end(); ++it)
-					cout << it->first << ": " << it->second << endl;
-			}
-		}
-
-		void residency() const {
-			unsigned count = 0;
-			for (unsigned i = 0; i < bits_.size(); i++)
-				if (bits_[i])
-					count++;
-
-			cout << "residency: " << count << "/" << bits_.size() << endl;
-		}
-
-	private:
-		void fill_probes(typename Traits::value_type const &v, vector<unsigned> &probes) {
-			uint32_t h1 = Traits::hash1(v) & mask_;
-			uint32_t h2 = Traits::hash2(v) & mask_;
-
-			probes[0] = h1;
-			for (unsigned p = 1; p < nr_probes_; p++) {
-				h1 = (h1 + h2) & mask_;
-				h2 = (h2 + p) & mask_;
-				probes[p] = h1;
-			}
-		}
-
-		vector<bool> bits_;
-		unsigned nr_probes_;
-		uint64_t mask_;
-	};
-
-	//--------------------------------
-#if 0
-	class dm_era {
-	public:
-		dm_era(block_address nr_blocks)
-			: nr_blocks_(nr_blocks),
-			  era_base_(0),
-			  base_(nr_blocks, false) {
-		}
-
-		set<block_address> blocks_written_since(unsigned era) const {
-
-		}
-
-		unsigned get_era() const {
-			return era_base_ + eras_.size() - 1;
-		}
-
-		void record_write(block_address b) {
-			current_era.record_write(b);
-		}
-
-		void resize(block_address new_size) {
-			nr_blocks_ = new_size;
-			push_era();
-			base_.resize(new_size, false);
-		}
-
-	private:
-		era_details &current_era() {
-			return eras_.back();
-		}
-
-		void need_new_era() {
-			// ???
-		}
-
-		void push_era() {
-			eras_.push_back(era(nr_blocks_));
-			if (eras_.size() > 100)
-				pop_era();
-		}
-
-		void pop_era() {
-			era_base_++;
-
-
-
-			eras_.pop_front();
-		}
-
-		static const unsigned NR_PROBES = 6;
-
-		class era_details {
-		public:
-			era_details(block_address nr_blocks)
-				: nr_blocks_(nr_blocks),
-				  f(power_bits(nr_blocks, NR_PROBES)) {
-			}
-
-			void record_write(block_address b) {
-				f.add(b);
-			}
-
-			void add_blocks_written(set filter;
-
-			block_address nr_blocks;
-			filter f;
-		};
-
-		block_address nr_blocks_;
-		unsigned era_base_;
-		vector<bool> base_;
-		deque<era_details> eras_;
-	};
-#endif
+	block_address const BLOCK_SIZE = 4096;
+	block_address const NR_BLOCKS = 102400;
+	block_address const SUPERBLOCK = 0;
 
 	//--------------------------------
 
 	class BloomFilterTests : public Test {
 	public:
+		BloomFilterTests()
+			: bm_(create_bm(NR_BLOCKS)),
+			  sm_(setup_core_map()),
+			  tm_(new transaction_manager(bm_, sm_)) {
+		}
 
 		set<block_address> generate_random_blocks(unsigned count,
 							  block_address max = std::numeric_limits<uint64_t>::max()) {
@@ -251,58 +50,114 @@ namespace {
 
 			return r;
 		}
+
+		void commit() {
+			block_manager<>::write_ref superblock(bm_->superblock(SUPERBLOCK));
+		}
+
+		space_map::ptr setup_core_map() {
+			space_map::ptr sm(new core_map(NR_BLOCKS));
+			sm->inc(SUPERBLOCK);
+			return sm;
+		}
+
+		with_temp_directory dir_;
+		block_manager<>::ptr bm_;
+		space_map::ptr sm_;
+		transaction_manager::ptr tm_;
 	};
 }
 
 //----------------------------------------------------------------
 
+TEST_F(BloomFilterTests, nr_bits_must_be_a_power_of_two)
+{
+	ASSERT_THROW(bloom_filter f(tm_, 1023, 3), runtime_error);
+}
+
+TEST_F(BloomFilterTests, nr_bits_must_not_be_zero)
+{
+	ASSERT_THROW(bloom_filter f(tm_, 0, 3), runtime_error);
+}
+
+TEST_F(BloomFilterTests, nr_probes_must_not_be_zero)
+{
+	ASSERT_THROW(bloom_filter f(tm_, 1024, 0), runtime_error);
+}
+
 TEST_F(BloomFilterTests, can_create_a_bloom_filter)
 {
-	bloom_filter<block_address_bloom_traits> f(10, 3);
+	bloom_filter f(tm_, 1024, 3);
 }
 
 TEST_F(BloomFilterTests, no_false_negatives)
 {
-	bloom_filter<block_address_bloom_traits> f(12, 6);
+	bloom_filter f(tm_, 4096, 6);
 	set<block_address> bs = generate_random_blocks(1000);
 
 	set<block_address>::const_iterator it;
 	for (it = bs.begin(); it != bs.end(); ++it)
-		f.add(*it);
+		f.set(*it);
 
 	for (it = bs.begin(); it != bs.end(); ++it)
 		ASSERT_THAT(f.test(*it), Eq(true));
+}
 
-	f.dump();
+TEST_F(BloomFilterTests, reload_works)
+{
+	block_address root;
+	set<block_address> bs = generate_random_blocks(1000);
+
+	{
+		bloom_filter f(tm_, 4096, 6);
+
+		set<block_address>::const_iterator it;
+		for (it = bs.begin(); it != bs.end(); ++it)
+			f.set(*it);
+
+		f.flush();
+		root = f.get_root();
+		commit();
+	}
+
+	{
+		bloom_filter f(tm_, root, 4096, 6);
+
+		set<block_address>::const_iterator it;
+		for (it = bs.begin(); it != bs.end(); ++it)
+			ASSERT_THAT(f.test(*it), Eq(true));
+	}
 }
 
 TEST_F(BloomFilterTests, count_false_positives)
 {
-	block_address nr_blocks = 128 * 1024 * 1024;
+	block_address nr_blocks = 1024 * 1024;
 	block_address written_blocks = nr_blocks / 100;
 
 	unsigned shift = 1;
 
 	while ((1ull << shift) < (16 * written_blocks))
 		shift++;
-	cerr << "bitset " << ((1 << shift) / (8 * 1024)) << "k" << endl;
+	cerr << "bitset size: " << ((1 << shift) / (8 * 1024)) << "k" << endl;
+
+	bloom_filter f(tm_, 1 << shift, 6);
 
-	bloom_filter<block_address_bloom_traits> f(shift, 6);
 	set<block_address> bs = generate_random_blocks(written_blocks, nr_blocks);
 
 	set<block_address>::const_iterator it;
 	for (it = bs.begin(); it != bs.end(); ++it)
-		f.add(*it);
+		f.set(*it);
 
-	f.dump();
+	// f.print_debug(cerr);
 
 	unsigned count = 0;
 	for (unsigned i = 0; i < nr_blocks; i++)
 		if (!bs.count(i) && f.test(i))
 			count++;
 
-	cerr << count << "false positives out of " << nr_blocks << endl;
-	cerr << static_cast<double>(count * 100) / static_cast<double>(nr_blocks) << "%" << endl;
+	cerr << count << " false positives out of " << nr_blocks << ", "
+	     << static_cast<double>(count * 100) / static_cast<double>(nr_blocks)
+	     << "%" << endl;
 }
 
 //----------------------------------------------------------------