2011-06-23 19:17:08 +05:30
|
|
|
#ifndef BTREE_H
|
|
|
|
#define BTREE_H
|
|
|
|
|
2011-08-22 15:12:13 +05:30
|
|
|
#include "endian_utils.h"
|
2011-06-23 19:17:08 +05:30
|
|
|
#include "transaction_manager.h"
|
|
|
|
|
2011-07-15 19:51:28 +05:30
|
|
|
#include <boost/noncopyable.hpp>
|
|
|
|
#include <boost/optional.hpp>
|
|
|
|
#include <list>
|
|
|
|
|
2011-06-23 19:17:08 +05:30
|
|
|
//----------------------------------------------------------------
|
|
|
|
|
|
|
|
namespace persistent_data {
|
2011-07-15 19:51:28 +05:30
|
|
|
|
2011-07-22 20:39:56 +05:30
|
|
|
// Reference counter whose inc() and dec() do nothing.  It is the
// ref_counter of uint64_traits, and btree keeps one for its internal
// nodes.
template <typename ValueType>
class NoOpRefCounter {
public:
	// No-op: the argument is ignored (left unnamed so that it does
	// not trigger unused-parameter warnings).
	void inc(ValueType const &) {}

	// No-op: the argument is ignored.
	void dec(ValueType const &) {}
};
|
|
|
|
|
2011-07-15 19:51:28 +05:30
|
|
|
struct uint64_traits {
|
|
|
|
typedef base::__le64 disk_type;
|
|
|
|
typedef uint64_t value_type;
|
2011-07-22 20:39:56 +05:30
|
|
|
typedef NoOpRefCounter<uint64_t> ref_counter;
|
2011-07-15 19:51:28 +05:30
|
|
|
|
|
|
|
static void unpack(disk_type const &disk, value_type &value) {
|
|
|
|
value = base::to_cpu<uint64_t>(disk);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void pack(value_type const &value, disk_type &disk) {
|
|
|
|
disk = base::to_disk<base::__le64>(value);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
namespace btree_detail {
|
|
|
|
using namespace base;
|
|
|
|
using namespace std;
|
|
|
|
using namespace boost;
|
|
|
|
|
|
|
|
//------------------------------------------------
|
|
|
|
// On disk data layout for btree nodes
|
|
|
|
// Single-bit flag values (presumably the ones held in
// node_header::flags -- confirm against the node_ref implementation).
enum node_flags {
	INTERNAL_NODE = 1 << 0,
	LEAF_NODE = 1 << 1
};
|
|
|
|
|
|
|
|
struct node_header {
|
|
|
|
__le32 csum;
|
|
|
|
__le32 flags;
|
|
|
|
__le64 blocknr; /* which block this node is supposed to live in */
|
|
|
|
|
|
|
|
__le32 nr_entries;
|
|
|
|
__le32 max_entries;
|
2011-08-23 16:25:37 +05:30
|
|
|
__le32 value_size;
|
|
|
|
__le32 padding;
|
2011-07-15 19:51:28 +05:30
|
|
|
} __attribute__((packed));
|
|
|
|
|
|
|
|
struct disk_node {
|
|
|
|
struct node_header header;
|
|
|
|
__le64 keys[0];
|
|
|
|
} __attribute__((packed));
|
|
|
|
|
|
|
|
// Kind of node, as reported by node_ref::get_type() and accepted by
// node_ref::set_type().
enum node_type {
	INTERNAL,
	LEAF
};
|
|
|
|
|
|
|
|
//------------------------------------------------
|
|
|
|
// Class that acts as an interface over the raw little endian btree
|
|
|
|
// node data.
|
|
|
|
template <typename ValueTraits, uint32_t BlockSize>
|
|
|
|
class node_ref {
|
|
|
|
public:
|
2011-08-22 18:44:10 +05:30
|
|
|
explicit node_ref(block_address b, disk_node *raw);
|
|
|
|
|
|
|
|
block_address get_location() const {
|
|
|
|
return location_;
|
|
|
|
}
|
2011-07-15 19:51:28 +05:30
|
|
|
|
2011-08-23 16:25:37 +05:30
|
|
|
block_address get_block_nr() const;
|
|
|
|
|
2011-07-15 19:51:28 +05:30
|
|
|
node_type get_type() const;
|
|
|
|
void set_type(node_type t);
|
|
|
|
|
|
|
|
unsigned get_nr_entries() const;
|
|
|
|
void set_nr_entries(unsigned n);
|
|
|
|
|
|
|
|
unsigned get_max_entries() const;
|
|
|
|
void set_max_entries(unsigned n);
|
2011-07-22 20:39:56 +05:30
|
|
|
|
|
|
|
// FIXME: remove this, and get the constructor to do it.
|
2011-07-15 19:51:28 +05:30
|
|
|
void set_max_entries(); // calculates the max for you.
|
|
|
|
|
2011-08-23 16:25:37 +05:30
|
|
|
size_t get_value_size() const;
|
|
|
|
|
2011-07-15 19:51:28 +05:30
|
|
|
uint64_t key_at(unsigned i) const;
|
|
|
|
void set_key(unsigned i, uint64_t k);
|
|
|
|
|
|
|
|
typename ValueTraits::value_type value_at(unsigned i) const;
|
|
|
|
void set_value(unsigned i,
|
|
|
|
typename ValueTraits::value_type const &v);
|
|
|
|
|
|
|
|
// Increments the nr_entries field
|
|
|
|
void insert_at(unsigned i,
|
|
|
|
uint64_t key,
|
|
|
|
typename ValueTraits::value_type const &v);
|
|
|
|
|
|
|
|
// Does not increment nr_entries
|
|
|
|
void overwrite_at(unsigned i,
|
|
|
|
uint64_t key,
|
|
|
|
typename ValueTraits::value_type const &v);
|
|
|
|
|
|
|
|
// Copies entries from another node, appends them
|
|
|
|
// to the back of this node. Adjusts nr_entries.
|
|
|
|
void copy_entries(node_ref const &rhs,
|
|
|
|
unsigned begin,
|
|
|
|
unsigned end);
|
|
|
|
|
|
|
|
// Various searches
|
|
|
|
int bsearch(uint64_t key, int want_hi) const;
|
|
|
|
optional<unsigned> exact_search(uint64_t key) const;
|
|
|
|
int lower_bound(uint64_t key) const;
|
|
|
|
|
2011-07-22 20:39:56 +05:30
|
|
|
template <typename RefCounter>
|
|
|
|
void inc_children(RefCounter &rc);
|
|
|
|
|
|
|
|
// FIXME: remove
|
|
|
|
void *raw() {
|
|
|
|
return raw_;
|
|
|
|
}
|
|
|
|
|
2011-07-15 19:51:28 +05:30
|
|
|
private:
|
|
|
|
static unsigned calc_max_entries(void);
|
|
|
|
|
|
|
|
void *key_ptr(unsigned i) const;
|
|
|
|
void *value_ptr(unsigned i) const;
|
|
|
|
|
2011-08-22 18:44:10 +05:30
|
|
|
block_address location_;
|
2011-07-15 19:51:28 +05:30
|
|
|
disk_node *raw_;
|
|
|
|
};
|
|
|
|
|
|
|
|
//------------------------------------------------
|
|
|
|
//
|
|
|
|
template <typename ValueTraits, uint32_t BlockSize>
|
|
|
|
node_ref<ValueTraits, BlockSize>
|
|
|
|
to_node(typename block_manager<BlockSize>::read_ref &b)
|
|
|
|
{
|
|
|
|
// FIXME: this should return a const read_ref somehow.
|
|
|
|
return node_ref<ValueTraits, BlockSize>(
|
2011-08-22 18:44:10 +05:30
|
|
|
b.get_location(),
|
2011-07-15 19:51:28 +05:30
|
|
|
reinterpret_cast<disk_node *>(
|
|
|
|
const_cast<unsigned char *>(b.data())));
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename ValueTraits, uint32_t BlockSize>
|
|
|
|
node_ref<ValueTraits, BlockSize>
|
|
|
|
to_node(typename block_manager<BlockSize>::write_ref &b)
|
|
|
|
{
|
|
|
|
return node_ref<ValueTraits, BlockSize>(
|
2011-08-22 18:44:10 +05:30
|
|
|
b.get_location(),
|
2011-07-15 19:51:28 +05:30
|
|
|
reinterpret_cast<disk_node *>(
|
|
|
|
const_cast<unsigned char *>(b.data())));
|
|
|
|
}
|
|
|
|
|
|
|
|
template <uint32_t BlockSize>
|
|
|
|
class ro_spine : private noncopyable {
|
|
|
|
public:
|
|
|
|
ro_spine(typename transaction_manager<BlockSize>::ptr tm)
|
|
|
|
: tm_(tm) {
|
|
|
|
}
|
|
|
|
|
|
|
|
void step(block_address b) {
|
|
|
|
spine_.push_back(tm_->read_lock(b));
|
|
|
|
if (spine_.size() > 2)
|
|
|
|
spine_.pop_front();
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename ValueTraits>
|
|
|
|
node_ref<ValueTraits, BlockSize> get_node() {
|
|
|
|
return to_node<ValueTraits, BlockSize>(spine_.back());
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
typename transaction_manager<BlockSize>::ptr tm_;
|
|
|
|
std::list<typename block_manager<BlockSize>::read_ref> spine_;
|
|
|
|
};
|
|
|
|
|
|
|
|
template <uint32_t BlockSize>
|
|
|
|
class shadow_spine : private noncopyable {
|
|
|
|
public:
|
2011-08-24 18:57:45 +05:30
|
|
|
typedef typename transaction_manager<BlockSize>::read_ref read_ref;
|
|
|
|
typedef typename transaction_manager<BlockSize>::write_ref write_ref;
|
|
|
|
|
2011-07-15 19:51:28 +05:30
|
|
|
shadow_spine(typename transaction_manager<BlockSize>::ptr tm)
|
|
|
|
: tm_(tm) {
|
|
|
|
}
|
|
|
|
|
|
|
|
// true if the children of the shadow need incrementing
|
|
|
|
bool step(block_address b) {
|
2011-08-24 18:57:45 +05:30
|
|
|
pair<write_ref, bool> p = tm_->shadow(b);
|
2011-07-15 19:51:28 +05:30
|
|
|
try {
|
|
|
|
step(p.first);
|
|
|
|
} catch (...) {
|
|
|
|
tm_->get_sm()->dec(p.first.get_location());
|
|
|
|
throw;
|
|
|
|
}
|
|
|
|
return p.second;
|
|
|
|
}
|
|
|
|
|
|
|
|
void step(typename transaction_manager<BlockSize>::write_ref b) {
|
|
|
|
spine_.push_back(b);
|
|
|
|
if (spine_.size() == 1)
|
|
|
|
root_ = spine_.front().get_location();
|
|
|
|
else if (spine_.size() > 2)
|
|
|
|
spine_.pop_front();
|
|
|
|
}
|
|
|
|
|
|
|
|
void pop() {
|
|
|
|
spine_.pop_back();
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename ValueTraits>
|
|
|
|
node_ref<ValueTraits, BlockSize> get_node() {
|
|
|
|
return to_node<ValueTraits, BlockSize>(spine_.back());
|
|
|
|
}
|
|
|
|
|
|
|
|
block_address get_block() const {
|
|
|
|
return spine_.back().get_location();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool has_parent() const {
|
|
|
|
return spine_.size() > 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
node_ref<uint64_traits, BlockSize> get_parent() {
|
|
|
|
if (spine_.size() < 2)
|
|
|
|
throw std::runtime_error("no parent");
|
|
|
|
|
|
|
|
return to_node<uint64_traits, BlockSize>(spine_.front());
|
|
|
|
}
|
|
|
|
|
2011-07-22 20:39:56 +05:30
|
|
|
block_address get_parent_location() const {
|
|
|
|
return spine_.front().get_location();
|
|
|
|
}
|
|
|
|
|
2011-07-15 19:51:28 +05:30
|
|
|
block_address get_root() const {
|
|
|
|
return root_;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
typename transaction_manager<BlockSize>::ptr tm_;
|
|
|
|
std::list<typename block_manager<BlockSize>::write_ref> spine_;
|
|
|
|
block_address root_;
|
|
|
|
};
|
|
|
|
|
2011-07-22 20:39:56 +05:30
|
|
|
// FIXME: make a member of btree
|
2011-07-15 19:51:28 +05:30
|
|
|
template <typename ValueTraits, uint32_t BlockSize>
|
|
|
|
optional<typename ValueTraits::value_type>
|
|
|
|
lookup_raw(ro_spine<BlockSize> &spine, block_address block, uint64_t key) {
|
|
|
|
|
|
|
|
using namespace boost;
|
|
|
|
typedef typename ValueTraits::value_type leaf_type;
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
spine.step(block);
|
2011-08-24 18:57:45 +05:30
|
|
|
node_ref<ValueTraits, BlockSize> leaf = spine.template get_node<ValueTraits>();
|
2011-07-15 19:51:28 +05:30
|
|
|
|
2011-08-24 18:57:45 +05:30
|
|
|
optional<unsigned> mi = leaf.exact_search(key);
|
2011-07-15 19:51:28 +05:30
|
|
|
if (!mi)
|
|
|
|
return optional<leaf_type>();
|
|
|
|
|
|
|
|
if (leaf.get_type() == btree_detail::LEAF)
|
|
|
|
return optional<leaf_type>(leaf.value_at(*mi));
|
|
|
|
|
2011-08-24 18:57:45 +05:30
|
|
|
node_ref<uint64_traits, BlockSize> internal = spine.template get_node<uint64_traits>();
|
2011-07-15 19:51:28 +05:30
|
|
|
block = internal.value_at(*mi);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-06-23 19:17:08 +05:30
|
|
|
template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
|
|
|
|
class btree {
|
|
|
|
public:
|
2011-07-15 19:51:28 +05:30
|
|
|
typedef boost::shared_ptr<btree<Levels, ValueTraits, BlockSize> > ptr;
|
|
|
|
|
2011-06-23 19:17:08 +05:30
|
|
|
typedef uint64_t key[Levels];
|
|
|
|
typedef typename ValueTraits::value_type value_type;
|
|
|
|
typedef boost::optional<value_type> maybe_value;
|
|
|
|
typedef boost::optional<std::pair<unsigned, value_type> > maybe_pair;
|
|
|
|
typedef typename block_manager<BlockSize>::read_ref read_ref;
|
|
|
|
typedef typename block_manager<BlockSize>::write_ref write_ref;
|
2011-08-24 18:57:45 +05:30
|
|
|
typedef typename btree_detail::node_ref<ValueTraits, BlockSize> leaf_node;
|
|
|
|
typedef typename btree_detail::node_ref<uint64_traits, BlockSize> internal_node;
|
2011-06-23 19:17:08 +05:30
|
|
|
|
2011-07-22 20:39:56 +05:30
|
|
|
btree(typename persistent_data::transaction_manager<BlockSize>::ptr tm,
|
|
|
|
typename ValueTraits::ref_counter rc);
|
|
|
|
|
2011-06-27 15:15:30 +05:30
|
|
|
btree(typename transaction_manager<BlockSize>::ptr tm,
|
2011-07-22 20:39:56 +05:30
|
|
|
block_address root,
|
|
|
|
typename ValueTraits::ref_counter rc);
|
|
|
|
|
2011-06-23 19:17:08 +05:30
|
|
|
~btree();
|
|
|
|
|
|
|
|
maybe_value lookup(key const &key) const;
|
|
|
|
maybe_pair lookup_le(key const &key) const;
|
|
|
|
maybe_pair lookup_ge(key const &key) const;
|
|
|
|
|
|
|
|
void insert(key const &key, typename ValueTraits::value_type const &value);
|
|
|
|
void remove(key const &key);
|
|
|
|
|
|
|
|
void set_root(block_address root);
|
|
|
|
block_address get_root() const;
|
|
|
|
|
|
|
|
ptr clone() const;
|
|
|
|
|
|
|
|
// free the on disk btree when the destructor is called
|
|
|
|
void destroy();
|
|
|
|
|
2011-08-22 16:25:55 +05:30
|
|
|
|
|
|
|
// Derive a class from this base class if you need to
|
|
|
|
// inspect the individual nodes that make up a btree.
|
|
|
|
class visitor {
|
|
|
|
public:
|
|
|
|
virtual ~visitor() {}
|
|
|
|
typedef boost::shared_ptr<visitor> ptr;
|
|
|
|
|
2011-08-24 18:57:45 +05:30
|
|
|
virtual void visit_internal(unsigned level, bool is_root, internal_node const &n) = 0;
|
|
|
|
virtual void visit_internal_leaf(unsigned level, bool is_root, internal_node const &n) = 0;
|
|
|
|
virtual void visit_leaf(unsigned level, bool is_root, leaf_node const &n) = 0;
|
2011-08-22 16:25:55 +05:30
|
|
|
};
|
|
|
|
|
|
|
|
// Walks the tree in depth first order
|
|
|
|
void visit(typename visitor::ptr visitor);
|
|
|
|
|
2011-06-23 19:17:08 +05:30
|
|
|
private:
|
2011-07-15 19:51:28 +05:30
|
|
|
template <typename ValueTraits2>
|
|
|
|
void split_node(btree_detail::shadow_spine<BlockSize> &spine,
|
|
|
|
block_address parent_index,
|
|
|
|
uint64_t key,
|
|
|
|
bool top);
|
2011-06-27 15:15:30 +05:30
|
|
|
|
2011-07-15 19:51:28 +05:30
|
|
|
template <typename ValueTraits2>
|
|
|
|
void split_beneath(btree_detail::shadow_spine<BlockSize> &spine, uint64_t key);
|
2011-06-27 15:15:30 +05:30
|
|
|
|
2011-07-15 19:51:28 +05:30
|
|
|
template <typename ValueTraits2>
|
|
|
|
void split_sibling(btree_detail::shadow_spine<BlockSize> &spine,
|
|
|
|
block_address parent_index,
|
|
|
|
uint64_t key);
|
2011-06-27 15:15:30 +05:30
|
|
|
|
2011-07-15 19:51:28 +05:30
|
|
|
template <typename ValueTraits2>
|
|
|
|
bool
|
|
|
|
insert_location(btree_detail::shadow_spine<BlockSize> &spine,
|
|
|
|
block_address block,
|
|
|
|
uint64_t key,
|
|
|
|
int *index);
|
|
|
|
|
2011-08-22 16:25:55 +05:30
|
|
|
void walk_tree(typename visitor::ptr visitor,
|
2011-08-24 15:15:39 +05:30
|
|
|
unsigned level, bool is_root,
|
|
|
|
block_address b);
|
2011-08-22 16:25:55 +05:30
|
|
|
|
2011-07-15 19:51:28 +05:30
|
|
|
typename persistent_data::transaction_manager<BlockSize>::ptr tm_;
|
|
|
|
bool destroy_;
|
|
|
|
block_address root_;
|
2011-07-22 20:39:56 +05:30
|
|
|
NoOpRefCounter<uint64_t> internal_rc_;
|
|
|
|
typename ValueTraits::ref_counter rc_;
|
2011-06-27 15:15:30 +05:30
|
|
|
};
|
2011-06-23 19:17:08 +05:30
|
|
|
};
|
|
|
|
|
2011-06-27 15:15:30 +05:30
|
|
|
#include "btree.tcc"
|
|
|
|
|
2011-06-23 19:17:08 +05:30
|
|
|
//----------------------------------------------------------------
|
|
|
|
|
|
|
|
#endif
|