// Copyright (C) 2011 Red Hat, Inc. All rights reserved. // // This file is part of the thin-provisioning-tools source. // // thin-provisioning-tools is free software: you can redistribute it // and/or modify it under the terms of the GNU General Public License // as published by the Free Software Foundation, either version 3 of // the License, or (at your option) any later version. // // thin-provisioning-tools is distributed in the hope that it will be // useful, but WITHOUT ANY WARRANTY; without even the implied warranty // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License along // with thin-provisioning-tools. If not, see // . #ifndef BTREE_H #define BTREE_H #include "base/endian_utils.h" #include "persistent-data/transaction_manager.h" #include "persistent-data/data-structures/ref_counter.h" #include #include #include #include //---------------------------------------------------------------- namespace persistent_data { class block_ref_counter : public ref_counter { public: block_ref_counter(space_map::ptr sm); virtual void set(block_address const &v, uint32_t rc); virtual void inc(block_address const &v); virtual void dec(block_address const &v); private: space_map::ptr sm_; }; // FIXME: move to sep file. I don't think it's directly used by // the btree code. struct uint64_traits { typedef base::le64 disk_type; typedef uint64_t value_type; typedef no_op_ref_counter ref_counter; static void unpack(disk_type const &disk, value_type &value) { value = base::to_cpu(disk); } static void pack(value_type const &value, disk_type &disk) { disk = base::to_disk(value); } }; struct block_traits { typedef base::le64 disk_type; typedef block_address value_type; typedef block_ref_counter ref_counter; static void unpack(disk_type const &disk, value_type &value) { value = base::to_cpu(disk); } static void pack(value_type const &value, disk_type &disk) { disk = base::to_disk(value); } }; namespace btree_detail { using namespace base; using namespace std; uint32_t const BTREE_CSUM_XOR = 121107; //------------------------------------------------ // On disk data layout for btree nodes enum node_flags { INTERNAL_NODE = 1, LEAF_NODE = 1 << 1 }; struct node_header { le32 csum; le32 flags; le64 blocknr; /* which block this node is supposed to live in */ le32 nr_entries; le32 max_entries; le32 value_size; le32 padding; } __attribute__((packed)); struct disk_node { struct node_header header; le64 keys[0]; } __attribute__((packed)); enum node_type { INTERNAL, LEAF }; //------------------------------------------------ // Class that acts as an interface over the raw little endian btree // node data. template class node_ref { public: explicit node_ref(block_address b, disk_node *raw); uint32_t get_checksum() const; block_address get_location() const { return location_; } block_address get_block_nr() const; node_type get_type() const; void set_type(node_type t); unsigned get_nr_entries() const; void set_nr_entries(unsigned n); unsigned get_max_entries() const; void set_max_entries(unsigned n); // FIXME: remove this, and get the constructor to do it. void set_max_entries(); // calculates the max for you. size_t get_value_size() const; void set_value_size(size_t); uint64_t key_at(unsigned i) const; void set_key(unsigned i, uint64_t k); typename ValueTraits::value_type value_at(unsigned i) const; void set_value(unsigned i, typename ValueTraits::value_type const &v); // Increments the nr_entries field void insert_at(unsigned i, uint64_t key, typename ValueTraits::value_type const &v); // Does not increment nr_entries void overwrite_at(unsigned i, uint64_t key, typename ValueTraits::value_type const &v); // Copies entries from another node, appends them // to the back of this node. Adjusts nr_entries. void copy_entries(node_ref const &rhs, unsigned begin, unsigned end); // Various searches int bsearch(uint64_t key, int want_hi) const; boost::optional exact_search(uint64_t key) const; int lower_bound(uint64_t key) const; template void inc_children(RefCounter &rc); disk_node *raw() { return raw_; } disk_node const *raw() const { return raw_; } private: static unsigned calc_max_entries(void); void *key_ptr(unsigned i) const; void *value_ptr(unsigned i) const; block_address location_; disk_node *raw_; }; //------------------------------------------------ // template node_ref to_node(typename block_manager<>::read_ref &b) { // FIXME: this should return a const read_ref somehow. return node_ref( b.get_location(), reinterpret_cast( const_cast(b.data().raw()))); } template node_ref to_node(typename block_manager<>::write_ref &b) { return node_ref( b.get_location(), reinterpret_cast( const_cast(b.data().raw()))); } class ro_spine : private boost::noncopyable { public: ro_spine(transaction_manager::ptr tm, block_manager<>::validator::ptr v) : tm_(tm), validator_(v) { } void step(block_address b); template node_ref get_node() { return to_node(spine_.back()); } private: transaction_manager::ptr tm_; block_manager<>::validator::ptr validator_; std::list::read_ref> spine_; }; class shadow_spine : private boost::noncopyable { public: typedef transaction_manager::read_ref read_ref; typedef transaction_manager::write_ref write_ref; typedef boost::optional maybe_block; shadow_spine(transaction_manager::ptr tm, block_manager<>::validator::ptr v) : tm_(tm), validator_(v) { } // true if the children of the shadow need incrementing bool step(block_address b); void step(transaction_manager::write_ref b) { spine_.push_back(b); if (spine_.size() == 1) root_ = spine_.front().get_location(); else if (spine_.size() > 2) spine_.pop_front(); } void pop() { spine_.pop_back(); } template node_ref get_node() { return to_node(spine_.back()); } block_address get_block() const { return spine_.back().get_location(); } bool has_parent() const { return spine_.size() > 1; } node_ref get_parent() { if (spine_.size() < 2) throw std::runtime_error("no parent"); return to_node(spine_.front()); } block_address get_parent_location() const { return spine_.front().get_location(); } block_address get_root() const { if (root_) return *root_; throw std::runtime_error("shadow spine has no root"); } private: transaction_manager::ptr tm_; block_manager<>::validator::ptr validator_; std::list::write_ref> spine_; maybe_block root_; }; // Used to keep a record of a nested btree's position. typedef std::vector btree_path; // Used when visiting the nodes that make up a btree. struct node_location { node_location() : depth(0) { } void inc_depth() { depth++; } void push_key(uint64_t k) { path.push_back(k); depth = 0; } bool is_sub_root() const { return depth == 0; // && path.size(); } unsigned level() const { return path.size(); } // Keys used to access this sub tree btree_path path; // in this sub tree unsigned depth; // This is the key from the parent node to this // node. If this node is a root then there will be // no parent, and hence no key. boost::optional key; }; } template class btree { public: typedef boost::shared_ptr > ptr; typedef uint64_t key[Levels]; typedef typename ValueTraits::value_type value_type; typedef boost::optional maybe_value; typedef boost::optional > maybe_pair; typedef typename block_manager<>::read_ref read_ref; typedef typename block_manager<>::write_ref write_ref; typedef typename btree_detail::node_ref leaf_node; typedef typename btree_detail::node_ref internal_node; btree(typename persistent_data::transaction_manager::ptr tm, typename ValueTraits::ref_counter rc); btree(typename transaction_manager::ptr tm, block_address root, typename ValueTraits::ref_counter rc); ~btree(); maybe_value lookup(key const &key) const; maybe_pair lookup_le(key const &key) const; maybe_pair lookup_ge(key const &key) const; void insert(key const &key, typename ValueTraits::value_type const &value); void remove(key const &key); void set_root(block_address root); block_address get_root() const; ptr clone() const; // free the on disk btree when the destructor is called void destroy(); // Derive a class from this base class if you need to // inspect the individual nodes that make up a btree. class visitor { public: typedef boost::shared_ptr ptr; typedef btree_detail::node_location node_location; virtual ~visitor() {} // The bool return values indicate whether the walk // should be continued into sub trees of the node (true == continue). virtual bool visit_internal(node_location const &l, internal_node const &n) = 0; virtual bool visit_internal_leaf(node_location const &l, internal_node const &n) = 0; virtual bool visit_leaf(node_location const &l, leaf_node const &n) = 0; virtual void visit_complete() {} enum error_outcome { EXCEPTION_HANDLED, RETHROW_EXCEPTION }; virtual error_outcome error_accessing_node(node_location const &l, block_address b, std::string const &what) { return RETHROW_EXCEPTION; } }; // Walks the tree in depth first order void visit_depth_first(visitor &visitor) const; private: template boost::optional lookup_raw(btree_detail::ro_spine &spine, block_address block, uint64_t key) const; template void split_node(btree_detail::shadow_spine &spine, block_address parent_index, uint64_t key, bool top); template void split_beneath(btree_detail::shadow_spine &spine, uint64_t key); template void split_sibling(btree_detail::shadow_spine &spine, block_address parent_index, uint64_t key); template bool insert_location(btree_detail::shadow_spine &spine, block_address block, uint64_t key, int *index, RC &leaf_rc); void walk_tree(visitor &visitor, btree_detail::node_location const &loc, block_address b) const; void walk_tree_internal(visitor &visitor, btree_detail::node_location const &loc, block_address b) const; template void inc_children(btree_detail::shadow_spine &spine, RefCounter &leaf_rc); typename persistent_data::transaction_manager::ptr tm_; bool destroy_; block_address root_; block_ref_counter internal_rc_; typename ValueTraits::ref_counter rc_; typename block_manager<>::validator::ptr validator_; }; }; #include "btree.tcc" //---------------------------------------------------------------- #endif