persistent-data/data-structures/bloom_filter
This commit is contained in:
parent
9402f09408
commit
180f7e6187
@ -55,6 +55,7 @@ SOURCE=\
|
||||
persistent-data/transaction_manager.cc \
|
||||
\
|
||||
persistent-data/data-structures/bitset.cc \
|
||||
persistent-data/data-structures/bloom_filter.cc \
|
||||
persistent-data/data-structures/btree.cc \
|
||||
\
|
||||
persistent-data/space_map.cc \
|
||||
|
146
persistent-data/data-structures/bloom_filter.cc
Normal file
146
persistent-data/data-structures/bloom_filter.cc
Normal file
@ -0,0 +1,146 @@
|
||||
#include "persistent-data/data-structures/bloom_filter.h"
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
using namespace persistent_data;
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
||||
namespace {
|
||||
// Multiplier used by hash1().
uint64_t const m1 = 0x9e37fffffffc0001UL;

// Number of low-order bits of the product that hash1() shifts away.
unsigned const bits = 18;
|
||||
|
||||
// First probe hash: multiplies the block address by m1, drops the low
// `bits` bits of the product and truncates what is left to 32 bits.
static uint32_t hash1(block_address const &b)
{
	return static_cast<uint32_t>((b * m1) >> bits);
}
|
||||
|
||||
// Second probe hash: truncates the block address to 32 bits and then
// applies alternating xor-shift / multiply mixing rounds (the constants
// and shift amounts are those of the MurmurHash3 32-bit finaliser).
static uint32_t hash2(block_address const &b)
{
	uint32_t n = static_cast<uint32_t>(b);

	n ^= n >> 16;
	n *= 0x85ebca6bu;
	n ^= n >> 13;
	n *= 0xc2b2ae35u;
	n ^= n >> 16;

	return n;
}
|
||||
|
||||
// Throws std::runtime_error unless nr_bits is a non-zero power of two.
//
// Zero has to be rejected explicitly: it passes the
// `n & (n - 1)` test, but a filter with zero bits would end up with
// an all-ones mask (nr_bits - 1 wraps) over an empty bitset.
void check_power_of_two(unsigned nr_bits)
{
	if (nr_bits == 0 || (nr_bits & (nr_bits - 1)))
		throw std::runtime_error("bloom filter needs a power of two nr_bits");
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
||||
// Creates a fresh filter backed by a newly constructed bitset.
//
// tm        - transaction manager handed to the bitset.
// nr_bits   - size of the filter in bits; must be a power of two,
//             otherwise std::runtime_error is thrown.
// nr_probes - number of bit positions examined/set per value.
bloom_filter::bloom_filter(tm_ptr tm,
			   unsigned nr_bits, unsigned nr_probes)
	: tm_(tm),
	  nr_bits_(nr_bits),	// previously left uninitialised
	  bits_(tm),
	  nr_probes_(nr_probes),
	  mask_(nr_bits - 1)	// usable as a mask because nr_bits is a power of two
{
	check_power_of_two(nr_bits);

	// Size the bitset to nr_bits, passing false as the value for the
	// new entries.
	bits_.grow(nr_bits, false);
}
|
||||
|
||||
// Re-opens an existing filter whose bitset is rooted at `root`.
//
// tm        - transaction manager handed to the bitset.
// root      - root block previously obtained from get_root().
// nr_bits   - size of the filter in bits; must be a power of two,
//             otherwise std::runtime_error is thrown.
// nr_probes - number of bit positions examined/set per value.
//
// NOTE(review): nr_bits and nr_probes presumably have to match the
// values the filter was created with; nothing here verifies that.
bloom_filter::bloom_filter(tm_ptr tm, block_address root,
			   unsigned nr_bits, unsigned nr_probes)
	: tm_(tm),
	  nr_bits_(nr_bits),	// previously left uninitialised
	  bits_(tm, root, nr_bits),
	  nr_probes_(nr_probes),
	  mask_(nr_bits - 1)	// usable as a mask because nr_bits is a power of two
{
	check_power_of_two(nr_bits);
}
|
||||
|
||||
// Reports the root block of the underlying bitset; this is the value
// to pass to the re-opening constructor.
block_address bloom_filter::get_root() const
{
	block_address const root = bits_.get_root();
	return root;
}
|
||||
|
||||
bool
|
||||
bloom_filter::test(uint64_t b)
|
||||
{
|
||||
vector<unsigned> probes(nr_probes_);
|
||||
fill_probes(b, probes);
|
||||
|
||||
for (unsigned p = 0; p < nr_probes_; p++)
|
||||
if (!bits_.get(probes[p]))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
bloom_filter::set(uint64_t b)
|
||||
{
|
||||
vector<unsigned> probes(nr_probes_);
|
||||
fill_probes(b, probes);
|
||||
|
||||
for (unsigned p = 0; p < nr_probes_; p++)
|
||||
bits_.set(probes[p], true);
|
||||
}
|
||||
|
||||
void
|
||||
bloom_filter::flush()
|
||||
{
|
||||
bits_.flush();
|
||||
}
|
||||
|
||||
void
|
||||
bloom_filter::fill_probes(block_address b, vector<unsigned> &probes) const
|
||||
{
|
||||
uint32_t h1 = hash1(b) & mask_;
|
||||
uint32_t h2 = hash2(b) & mask_;
|
||||
|
||||
probes[0] = h1;
|
||||
for (unsigned p = 1; p < nr_probes_; p++) {
|
||||
h1 = (h1 + h2) & mask_;
|
||||
h2 = (h2 + p) & mask_;
|
||||
probes[p] = h1;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
bloom_filter::print_debug(ostream &out)
|
||||
{
|
||||
print_residency(out);
|
||||
|
||||
map<unsigned, unsigned> runs;
|
||||
|
||||
for (unsigned i = 0; i < bits_.get_nr_bits();) {
|
||||
bool v = bits_.get(i);
|
||||
unsigned run_length = 1;
|
||||
|
||||
while (++i < bits_.get_nr_bits() && bits_.get(i) == v)
|
||||
run_length++;
|
||||
|
||||
map<unsigned, unsigned>::iterator it = runs.find(run_length);
|
||||
if (it != runs.end())
|
||||
it->second++;
|
||||
else
|
||||
runs.insert(make_pair(run_length, 1));
|
||||
}
|
||||
|
||||
{
|
||||
map<unsigned, unsigned>::const_iterator it;
|
||||
for (it = runs.begin(); it != runs.end(); ++it)
|
||||
out << it->first << ": " << it->second << endl;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
bloom_filter::print_residency(ostream &out)
|
||||
{
|
||||
unsigned count = 0;
|
||||
for (unsigned i = 0; i < bits_.get_nr_bits(); i++)
|
||||
if (bits_.get(i))
|
||||
count++;
|
||||
|
||||
out << "residency: " << count << "/" << bits_.get_nr_bits() << endl;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
47
persistent-data/data-structures/bloom_filter.h
Normal file
47
persistent-data/data-structures/bloom_filter.h
Normal file
@ -0,0 +1,47 @@
|
||||
#ifndef PERSISTENT_DATA_DATA_STRUCTURES_BLOOM_FILTER_H
|
||||
#define PERSISTENT_DATA_DATA_STRUCTURES_BLOOM_FILTER_H
|
||||
|
||||
#include "persistent-data/transaction_manager.h"
#include "persistent-data/data-structures/bitset.h"

#include <boost/shared_ptr.hpp>
#include <iosfwd>
#include <vector>
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
||||
namespace persistent_data {
|
||||
class bloom_filter {
|
||||
public:
|
||||
typedef boost::shared_ptr<bloom_filter> ptr;
|
||||
typedef typename persistent_data::transaction_manager::ptr tm_ptr;
|
||||
|
||||
// nr_bits must be a power of two
|
||||
bloom_filter(tm_ptr tm,
|
||||
unsigned nr_bits, unsigned nr_probes);
|
||||
|
||||
bloom_filter(tm_ptr tm, block_address root,
|
||||
unsigned nr_bits_power, unsigned nr_probes);
|
||||
|
||||
block_address get_root() const;
|
||||
|
||||
bool test(uint64_t b); // not const due to caching effects in bitset
|
||||
void set(uint64_t b);
|
||||
void flush();
|
||||
|
||||
void print_debug(ostream &out);
|
||||
|
||||
private:
|
||||
void print_residency(ostream &out);
|
||||
|
||||
void fill_probes(block_address b, vector<unsigned> &probes) const;
|
||||
|
||||
tm_ptr tm_;
|
||||
unsigned nr_bits_;
|
||||
persistent_data::bitset bits_;
|
||||
unsigned nr_probes_;
|
||||
uint64_t mask_;
|
||||
};
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
||||
#endif
|
@ -1,4 +1,5 @@
|
||||
#include "gmock/gmock.h"
|
||||
#include "persistent-data/data-structures/bloom_filter.h"
|
||||
#include "persistent-data/transaction_manager.h"
|
||||
#include "persistent-data/space-maps/core.h"
|
||||
#include "persistent-data/data-structures/array_block.h"
|
||||
@ -19,221 +20,19 @@ using namespace testing;
|
||||
//----------------------------------------------------------------
|
||||
|
||||
namespace {
|
||||
struct block_address_bloom_traits {
|
||||
typedef block_address value_type;
|
||||
|
||||
static const uint64_t ones = ~0ull;
|
||||
static const uint64_t m1 = 0x9e37fffffffc0001UL;
|
||||
static const uint64_t m2 = ones - 82;
|
||||
|
||||
static const unsigned bits = 18;
|
||||
|
||||
static uint64_t hash1(block_address const &b) {
|
||||
return (b * m1) >> bits;
|
||||
}
|
||||
|
||||
static uint64_t hash2(block_address const &b) {
|
||||
uint32_t n = b;
|
||||
|
||||
n = n ^ (n >> 16);
|
||||
n = n * 0x85ebca6bu;
|
||||
n = n ^ (n >> 13);
|
||||
n = n * 0xc2b2ae35u;
|
||||
n = n ^ (n >> 16);
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
static uint64_t hash3(block_address const &b) {
|
||||
return (b * m2) >> bits;
|
||||
}
|
||||
};
|
||||
|
||||
// In-memory bloom filter over Traits::value_type.
//
// Traits must provide a value_type typedef plus static hash1() and
// hash2() functions taking a value_type.
template <class Traits>
class bloom_filter {
public:
	// nr_bits_power - log2 of the number of bits in the filter.
	// nr_probes     - number of bit positions examined/set per value.
	bloom_filter(unsigned nr_bits_power, unsigned nr_probes)
		: bits_(1ull << nr_bits_power, false),
		  nr_probes_(nr_probes),
		  mask_((1ull << nr_bits_power) - 1) {

		std::cerr << "nr entries = " << bits_.size() << ", mask = " << mask_ << std::endl;
	}

	// True if every probe position for v is set.
	bool test(typename Traits::value_type const &v) {
		std::vector<uint32_t> probes(nr_probes_);
		fill_probes(v, probes);

		for (unsigned p = 0; p < nr_probes_; p++)
			if (!bits_.at(probes[p]))
				return false;

		return true;
	}

	// Sets every probe position for v.
	void add(typename Traits::value_type const &v) {
		std::vector<uint32_t> probes(nr_probes_);
		fill_probes(v, probes);

		for (unsigned p = 0; p < nr_probes_; p++)
			bits_.at(probes[p]) = true;
	}

	// Prints the residency line followed by a histogram of run
	// lengths ("<run length>: <number of runs>") to cout.
	void dump() const {
		residency();

		std::map<unsigned, unsigned> runs;

		for (unsigned i = 0; i < bits_.size();) {
			bool v = bits_[i];
			unsigned run_length = 1;

			// The bounds check must come before the element
			// access, otherwise the final run reads one bit past
			// the end of the vector.
			while (++i < bits_.size() && bits_[i] == v)
				run_length++;

			++runs[run_length];
		}

		std::map<unsigned, unsigned>::const_iterator it;
		for (it = runs.begin(); it != runs.end(); ++it)
			std::cout << it->first << ": " << it->second << std::endl;
	}

	// Prints "residency: <set bits>/<total bits>" to cout.
	void residency() const {
		unsigned count = 0;
		for (unsigned i = 0; i < bits_.size(); i++)
			if (bits_[i])
				count++;

		std::cout << "residency: " << count << "/" << bits_.size() << std::endl;
	}

private:
	// Fills probes[0 .. nr_probes_ - 1] with the bit positions for v.
	// The first position is hash1(v); each later one adds the current
	// step (initially hash2(v)), and the step grows by the probe index.
	void fill_probes(typename Traits::value_type const &v, std::vector<uint32_t> &probes) {
		// Nothing to fill for a zero-probe filter; probes[0] below
		// would otherwise write out of bounds.
		if (!nr_probes_ || probes.empty())
			return;

		uint32_t h1 = Traits::hash1(v) & mask_;
		uint32_t h2 = Traits::hash2(v) & mask_;

		probes[0] = h1;
		for (unsigned p = 1; p < nr_probes_; p++) {
			h1 = (h1 + h2) & mask_;
			h2 = (h2 + p) & mask_;
			probes[p] = h1;
		}
	}

	std::vector<bool> bits_;
	unsigned nr_probes_;
	uint64_t mask_;
};
|
||||
|
||||
//--------------------------------
|
||||
#if 0
|
||||
class dm_era {
|
||||
public:
|
||||
dm_era(block_address nr_blocks)
|
||||
: nr_blocks_(nr_blocks),
|
||||
era_base_(0),
|
||||
base_(nr_blocks, false) {
|
||||
}
|
||||
|
||||
set<block_address> blocks_written_since(unsigned era) const {
|
||||
|
||||
}
|
||||
|
||||
unsigned get_era() const {
|
||||
return era_base_ + eras_.size() - 1;
|
||||
}
|
||||
|
||||
void record_write(block_address b) {
|
||||
current_era.record_write(b);
|
||||
}
|
||||
|
||||
void resize(block_address new_size) {
|
||||
nr_blocks_ = new_size;
|
||||
push_era();
|
||||
base_.resize(new_size, false);
|
||||
}
|
||||
|
||||
private:
|
||||
era_details &current_era() {
|
||||
return eras_.back();
|
||||
}
|
||||
|
||||
void need_new_era() {
|
||||
// ???
|
||||
}
|
||||
|
||||
void push_era() {
|
||||
eras_.push_back(era(nr_blocks_));
|
||||
if (eras_.size() > 100)
|
||||
pop_era();
|
||||
}
|
||||
|
||||
void pop_era() {
|
||||
era_base_++;
|
||||
|
||||
|
||||
|
||||
eras_.pop_front();
|
||||
}
|
||||
|
||||
static const unsigned NR_PROBES = 6;
|
||||
|
||||
class era_details {
|
||||
public:
|
||||
era_details(block_address nr_blocks)
|
||||
: nr_blocks_(nr_blocks),
|
||||
f(power_bits(nr_blocks, NR_PROBES)) {
|
||||
}
|
||||
|
||||
void record_write(block_address b) {
|
||||
f.add(b);
|
||||
}
|
||||
|
||||
void add_blocks_written(set<block_address &result) const {
|
||||
for (block_address b = 0; b < nr_blocks; b++)
|
||||
if (f.test(b))
|
||||
result.insert(b);
|
||||
}
|
||||
|
||||
private:
|
||||
static unsigned power_bits(block_address nr_blocks) {
|
||||
// We're expecting 1% of the cache to change per era
|
||||
block_address expected_writes = nr_blocks / 100;
|
||||
|
||||
unsigned r = 1;
|
||||
while ((1ull << r) < (16 * expected_writes))
|
||||
r++;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
typedef bloom_filter<block_address_bloom_traits> filter;
|
||||
|
||||
block_address nr_blocks;
|
||||
filter f;
|
||||
};
|
||||
|
||||
block_address nr_blocks_;
|
||||
unsigned era_base_;
|
||||
vector<bool> base_;
|
||||
deque<era_details> eras_;
|
||||
};
|
||||
#endif
|
||||
block_address const BLOCK_SIZE = 4096;
|
||||
block_address const NR_BLOCKS = 102400;
|
||||
block_address const SUPERBLOCK = 0;
|
||||
|
||||
//--------------------------------
|
||||
|
||||
class BloomFilterTests : public Test {
|
||||
public:
|
||||
BloomFilterTests()
|
||||
: bm_(create_bm<BLOCK_SIZE>(NR_BLOCKS)),
|
||||
sm_(setup_core_map()),
|
||||
tm_(new transaction_manager(bm_, sm_)) {
|
||||
}
|
||||
|
||||
set<block_address> generate_random_blocks(unsigned count,
|
||||
block_address max = std::numeric_limits<uint64_t>::max()) {
|
||||
@ -251,58 +50,104 @@ namespace {
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
void commit() {
|
||||
block_manager<>::write_ref superblock(bm_->superblock(SUPERBLOCK));
|
||||
}
|
||||
|
||||
space_map::ptr setup_core_map() {
|
||||
space_map::ptr sm(new core_map(NR_BLOCKS));
|
||||
sm->inc(SUPERBLOCK);
|
||||
return sm;
|
||||
}
|
||||
|
||||
with_temp_directory dir_;
|
||||
block_manager<>::ptr bm_;
|
||||
space_map::ptr sm_;
|
||||
transaction_manager::ptr tm_;
|
||||
};
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
||||
// 1023 is not a power of two, so construction has to be rejected.
TEST_F(BloomFilterTests, nr_bits_must_be_a_power_of_two) {
	ASSERT_THROW(bloom_filter f(tm_, 1023, 3),
		     runtime_error);
}
|
||||
|
||||
// Smoke test: a filter with a power-of-two size can be constructed on
// top of the fixture's transaction manager.
TEST_F(BloomFilterTests, can_create_a_bloom_filter)
{
	bloom_filter f(tm_, 1024, 3);
}
|
||||
|
||||
// Every value that has been set() must subsequently test() true.
TEST_F(BloomFilterTests, no_false_negatives)
{
	bloom_filter f(tm_, 4096, 6);
	set<block_address> bs = generate_random_blocks(1000);

	set<block_address>::const_iterator it;
	for (it = bs.begin(); it != bs.end(); ++it)
		f.set(*it);

	for (it = bs.begin(); it != bs.end(); ++it)
		ASSERT_THAT(f.test(*it), Eq(true));
}
|
||||
|
||||
f.dump();
|
||||
// Values set in one filter instance must still test true after the
// filter is flushed and re-opened from its root block.
TEST_F(BloomFilterTests, reload_works)
{
	block_address root;
	set<block_address> bs = generate_random_blocks(1000);

	// First instance: populate, flush, remember the root.
	{
		bloom_filter f(tm_, 4096, 6);

		for (set<block_address>::const_iterator b = bs.begin(); b != bs.end(); ++b)
			f.set(*b);

		f.flush();
		root = f.get_root();
		commit();
	}

	// Second instance: re-open from the root and check every value.
	{
		bloom_filter f(tm_, root, 4096, 6);

		for (set<block_address>::const_iterator b = bs.begin(); b != bs.end(); ++b)
			ASSERT_THAT(f.test(*b), Eq(true));
	}
}
|
||||
|
||||
// Reports (on cerr) how many blocks that were never set nevertheless
// test positive.  Informational only: nothing is asserted.
TEST_F(BloomFilterTests, count_false_positives)
{
	block_address nr_blocks = 1024 * 1024;
	block_address written_blocks = nr_blocks / 100;

	// Smallest power of two that gives at least 16 bits per written
	// block.
	unsigned shift = 1;

	while ((1ull << shift) < (16 * written_blocks))
		shift++;
	cerr << "bitset size: " << ((1 << shift) / (8 * 1024)) << "k" << endl;

	bloom_filter f(tm_, 1 << shift, 6);

	set<block_address> bs = generate_random_blocks(written_blocks, nr_blocks);
	set<block_address>::const_iterator it;

	for (it = bs.begin(); it != bs.end(); ++it)
		f.set(*it);

	// f.print_debug(cerr);

	unsigned count = 0;
	for (unsigned i = 0; i < nr_blocks; i++)
		if (!bs.count(i) && f.test(i))
			count++;

	cerr << count << " false positives out of " << nr_blocks << ", "
	     << static_cast<double>(count * 100) / static_cast<double>(nr_blocks)
	     << "%" << endl;
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
Loading…
Reference in New Issue
Block a user