diff --git a/Makefile.in b/Makefile.in index 59f8f14..97b86c4 100644 --- a/Makefile.in +++ b/Makefile.in @@ -40,6 +40,7 @@ SOURCE=\ base/error_state.cc \ base/error_string.cc \ base/grid_layout.cc \ + base/io_generator.cc \ base/file_utils.cc \ base/progress_monitor.cc \ base/rolling_hash.cc \ @@ -128,6 +129,7 @@ DEVTOOLS_SOURCE=\ thin-provisioning/thin_ll_restore.cc \ thin-provisioning/thin_show_duplicates.cc \ thin-provisioning/thin_generate_metadata.cc \ + thin-provisioning/thin_generate_mappings.cc \ thin-provisioning/variable_chunk_stream.cc \ thin-provisioning/thin_show_metadata.cc \ thin-provisioning/thin_scan.cc \ diff --git a/base/io.h b/base/io.h new file mode 100644 index 0000000..75f2d3a --- /dev/null +++ b/base/io.h @@ -0,0 +1,25 @@ +#ifndef BASE_IO_H +#define BASE_IO_H + +#include "base/types.h" +#include + +//---------------------------------------------------------------- + +namespace base { + enum req_op { + REQ_OP_READ, + REQ_OP_WRITE, + REQ_OP_DISCARD + }; + + struct io { + unsigned op_; + sector_t sector_; + sector_t size_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/base/io_generator.cc b/base/io_generator.cc new file mode 100644 index 0000000..0543d31 --- /dev/null +++ b/base/io_generator.cc @@ -0,0 +1,240 @@ +#include "base/io_generator.h" +#include +#include +#include + +using namespace base; + +//---------------------------------------------------------------- + +namespace { + std::pair patterns[] = { + {"read", io_pattern::READ}, + {"write", io_pattern::WRITE}, + {"trim", io_pattern::TRIM}, + {"readwrite", io_pattern::READ_WRITE}, + {"trimwrite", io_pattern::TRIM_WRITE}, + {"randread", io_pattern::RAND_READ}, + {"randwrite", io_pattern::RAND_WRITE}, + {"randtrim", io_pattern::RAND_TRIM}, + {"randrw", io_pattern::RAND_RW}, + {"randtw", io_pattern::RAND_TW} + }; + + unsigned const nr_patterns = sizeof(patterns) / sizeof(patterns[0]); + + //-------------------------------- + + class 
offset_generator { // interface: supplies the start sector of each generated io
+	public:
+		typedef std::shared_ptr<offset_generator> ptr;
+
+		virtual base::sector_t next_offset() = 0; // start sector of the next io
+	};
+
+	class sequential_offset_generator: public offset_generator { // walks [offset, offset + size) in block_size steps, then wraps back to offset
+	public:
+		sequential_offset_generator(base::sector_t offset,
+					    base::sector_t size,
+					    base::sector_t block_size)
+			: block_size_(block_size),
+			  begin_(offset),
+			  end_(offset + size),
+			  current_(offset) {
+			if (size < block_size) // at least one whole block has to fit in the range
+				throw std::runtime_error("size must be greater than block_size");
+		}
+
+		base::sector_t next_offset() {
+			sector_t r = current_;
+			current_ += block_size_;
+			if (current_ + block_size_ > end_) // wrap as soon as the next block would cross end_, so no io starts at end_ or runs past it
+				current_ = begin_;
+			return r;
+		}
+
+	private:
+		base::sector_t block_size_; // sector_t, not unsigned: the 64-bit ctor argument must not be truncated
+		base::sector_t begin_;
+		base::sector_t end_;
+		base::sector_t current_;
+	};
+
+	class random_offset_generator: public offset_generator { // returns a random multiple of block_size on every call
+	public:
+		random_offset_generator(sector_t offset,
+					sector_t size,
+					sector_t block_size)
+			: block_begin_(offset / block_size),
+			  nr_blocks_(size / block_size), // NOTE(review): zero when size < block_size, and next_offset() then divides by zero; nothing rejects that here, unlike the sequential generator
+			  block_size_(block_size) {
+		}
+
+		sector_t next_offset() {
+			return ((std::rand() % nr_blocks_) + block_begin_) * block_size_;
+		}
+
+	private:
+		uint64_t block_begin_;
+		uint64_t nr_blocks_;
+		sector_t block_size_; // sector_t, not unsigned: the 64-bit ctor argument must not be truncated
+	};
+
+	//--------------------------------
+
+	class op_generator { // picks the request type of each io: op1 about op1_pct percent of the time, otherwise op2
+	public:
+		typedef std::shared_ptr<op_generator> ptr;
+
+		op_generator(base::req_op op1)
+			: op1_(op1), op2_(op1), op1_pct_(100) {
+		}
+
+		op_generator(base::req_op op1,
+			     base::req_op op2,
+			     unsigned op1_pct)
+			: op1_(op1), op2_(op2), op1_pct_(op1_pct) {
+			if (op1_pct > 100)
+				throw std::runtime_error("invalid percentage");
+		}
+
+		base::req_op next_op() {
+			if (static_cast<unsigned>(std::rand()) % 100 >= op1_pct_) // the roll is in [0, 99]; exactly op1_pct_ of those 100 values select op1_
+				return op2_;
+			return op1_;
+		}
+
+	private:
+		base::req_op op1_;
+		base::req_op op2_;
+		unsigned op1_pct_;
+	};
+
+	//--------------------------------
+
+	class base_io_generator: public io_generator {
+	public:
+		base_io_generator(io_generator_options const &opts);
+		virtual bool has_next();
+		virtual void next(base::io &next_io);
+
private:
+		offset_generator::ptr
+		create_offset_generator(io_generator_options const &opts); // random or sequential, chosen by opts.pattern_
+
+		op_generator::ptr
+		create_op_generator(io_generator_options const &opts); // maps opts.pattern_ to the request type(s) to emit
+
+		offset_generator::ptr offset_gen_;
+		op_generator::ptr op_gen_;
+		sector_t block_size_;
+		sector_t io_size_finished_; // sectors handed out so far; sector_t because size_t may be narrower than opts.io_size_
+		sector_t io_size_total_; // sectors to hand out in total (opts.io_size_)
+	};
+
+	base_io_generator::base_io_generator(io_generator_options const &opts)
+		: offset_gen_(create_offset_generator(opts)),
+		  op_gen_(create_op_generator(opts)),
+		  block_size_(opts.block_size_),
+		  io_size_finished_(0),
+		  io_size_total_(opts.io_size_) {
+	}
+
+	bool base_io_generator::has_next() { // true until io_size_total_ sectors have been produced
+		return io_size_finished_ < io_size_total_;
+	}
+
+	void base_io_generator::next(base::io &next_io) { // fills next_io with one block_size_ io; throws std::runtime_error once has_next() is false
+		if (io_size_finished_ >= io_size_total_)
+			throw std::runtime_error("no more io to generate");
+
+		next_io.op_ = op_gen_->next_op();
+		next_io.sector_ = offset_gen_->next_offset();
+		next_io.size_ = block_size_;
+
+		io_size_finished_ += block_size_;
+	}
+
+	offset_generator::ptr
+	base_io_generator::create_offset_generator(io_generator_options const &opts) {
+		if (opts.pattern_.is_random())
+			return offset_generator::ptr(
+				new random_offset_generator(opts.offset_,
+							    opts.size_,
+							    opts.block_size_));
+
+		return offset_generator::ptr(
+			new sequential_offset_generator(opts.offset_,
+							opts.size_,
+							opts.block_size_));
+	}
+
+	op_generator::ptr
+	base_io_generator::create_op_generator(io_generator_options const &opts) {
+		// FIXME: eliminate the switch-case and hide enum values
+		switch (opts.pattern_.val_) {
+		case io_pattern::READ:
+		case io_pattern::RAND_READ:
+			return op_generator::ptr(new op_generator(base::REQ_OP_READ));
+		case io_pattern::WRITE:
+		case io_pattern::RAND_WRITE:
+			return op_generator::ptr(new op_generator(base::REQ_OP_WRITE));
+		case io_pattern::TRIM:
+		case io_pattern::RAND_TRIM:
+			return op_generator::ptr(new op_generator(base::REQ_OP_DISCARD));
+		case io_pattern::READ_WRITE:
+		case io_pattern::RAND_RW:
+			return op_generator::ptr(new
op_generator(base::REQ_OP_READ,
+								  base::REQ_OP_WRITE,
+								  50));
+		case io_pattern::TRIM_WRITE:
+		case io_pattern::RAND_TW:
+			return op_generator::ptr(new op_generator(base::REQ_OP_DISCARD,
+								  base::REQ_OP_WRITE,
+								  50));
+		default:
+			throw std::runtime_error("unknown pattern");
+		}
+	}
+}
+
+//----------------------------------------------------------------
+
+io_pattern::io_pattern()
+	: val_(pattern::READ) {
+}
+
+io_pattern::io_pattern(char const *pattern) {
+	parse(pattern);
+}
+
+void
+io_pattern::parse(char const *pattern) { // looks the name up in the patterns table; throws std::runtime_error when it is not listed
+	bool found = false;
+	unsigned i = 0;
+	for (i = 0; i < nr_patterns; i++) {
+		if (!strcmp(patterns[i].first, pattern)) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		throw std::runtime_error("unknown pattern");
+
+	val_ = patterns[i].second;
+}
+
+bool
+io_pattern::is_random() const { // true when the RANDOM bit is set in val_
+	return val_ & pattern::RANDOM;
+}
+
+//----------------------------------------------------------------
+
+io_generator::ptr
+base::create_io_generator(io_generator_options const &opts) {
+	return io_generator::ptr(new base_io_generator(opts));
+}
+
+//----------------------------------------------------------------
diff --git a/base/io_generator.h b/base/io_generator.h
new file mode 100644
index 0000000..4a33550
--- /dev/null
+++ b/base/io_generator.h
@@ -0,0 +1,55 @@
+#ifndef BASE_IO_GENERATOR_H
+#define BASE_IO_GENERATOR_H
+
+#include "base/io.h"
+#include <memory>
+
+//----------------------------------------------------------------
+
+namespace base {
+	struct io_pattern {
+		enum pattern {
+			READ = 1 << 1,
+			WRITE = 1 << 2,
+			TRIM = 1 << 3,
+			RANDOM = 1 << 8, // flag bit, or-ed onto the operation bits above
+			READ_WRITE = READ | WRITE,
+			TRIM_WRITE = WRITE | TRIM,
+			RAND_READ = READ | RANDOM,
+			RAND_WRITE = WRITE | RANDOM,
+			RAND_TRIM = TRIM | RANDOM,
+			RAND_RW = READ_WRITE | RANDOM,
+			RAND_TW = TRIM_WRITE | RANDOM,
+		};
+
+		io_pattern();
+		io_pattern(char const *pattern);
+		void parse(char const *pattern);
+		bool is_random() const;
+
+		pattern val_;
+	};
+
+	struct io_generator_options {
+		io_pattern
pattern_;
+		sector_t offset_;
+		sector_t block_size_;
+		sector_t size_;
+		sector_t io_size_;
+	};
+
+	class io_generator { // produces a finite stream of io requests
+	public:
+		typedef std::shared_ptr<io_generator> ptr;
+
+		virtual bool has_next() = 0;
+		virtual void next(base::io &next_io) = 0;
+	};
+
+	io_generator::ptr
+	create_io_generator(io_generator_options const &opts);
+}
+
+//----------------------------------------------------------------
+
+#endif
diff --git a/persistent-data/math_utils.h b/base/math_utils.h
similarity index 90%
rename from persistent-data/math_utils.h
rename to base/math_utils.h
index cb387f4..de00d88 100644
--- a/persistent-data/math_utils.h
+++ b/base/math_utils.h
@@ -16,8 +16,8 @@
 // with thin-provisioning-tools. If not, see
 // <http://www.gnu.org/licenses/>.
 
-#ifndef THINP_MATH_H
-#define THINP_MATH_H
+#ifndef BASE_MATH_H
+#define BASE_MATH_H
 
 //----------------------------------------------------------------
 
@@ -34,6 +34,11 @@ namespace base {
 	T div_down(T const &v, T const &divisor) {
 		return v / divisor;
 	}
+
+	template <typename T>
+	bool is_power_of_two(T const v) { // true when exactly one bit of v is set
+		return v && !(v & (v - 1)); // the v && guard rejects 0, which the bit trick alone would accept
+	}
 }
 
 //----------------------------------------------------------------
diff --git a/base/types.h b/base/types.h
new file mode 100644
index 0000000..7e56139
--- /dev/null
+++ b/base/types.h
@@ -0,0 +1,15 @@
+#ifndef BASE_TYPES_H
+#define BASE_TYPES_H
+
+#include <stdint.h>
+
+//----------------------------------------------------------------
+
+namespace base {
+	using sector_t = uint64_t;
+	unsigned const SECTOR_SHIFT = 9;
+}
+
+//----------------------------------------------------------------
+
+#endif
diff --git a/block-cache/block_cache.h b/block-cache/block_cache.h
index 5312468..28dec8b 100644
--- a/block-cache/block_cache.h
+++ b/block-cache/block_cache.h
@@ -24,7 +24,6 @@ namespace bi = boost::intrusive;
 
 namespace bcache {
 	typedef uint64_t block_address;
-	typedef uint64_t sector_t;
 
 	class validator {
 	public:
diff --git a/block-cache/io_engine.h b/block-cache/io_engine.h
index 0e13957..04e05e1 100644
--- a/block-cache/io_engine.h
+++
b/block-cache/io_engine.h @@ -1,6 +1,7 @@ #ifndef BLOCK_CACHE_IO_ENGINE_H #define BLOCK_CACHE_IO_ENGINE_H +#include "base/types.h" #include "base/unique_handle.h" #include @@ -18,9 +19,8 @@ //---------------------------------------------------------------- namespace bcache { - using sector_t = uint64_t; - - unsigned const SECTOR_SHIFT = 9; + using base::sector_t; + using base::SECTOR_SHIFT; // Virtual base class to aid unit testing class io_engine { diff --git a/caching/cache_metadata.h b/caching/cache_metadata.h index a89afd5..761f196 100644 --- a/caching/cache_metadata.h +++ b/caching/cache_metadata.h @@ -34,8 +34,6 @@ namespace cache { block_address const SUPERBLOCK_LOCATION = 0; - typedef uint64_t sector_t; - //------------------------------------------------ class space_map_ref_counter { diff --git a/persistent-data/data-structures/array.h b/persistent-data/data-structures/array.h index f7a3ac4..d5063e7 100644 --- a/persistent-data/data-structures/array.h +++ b/persistent-data/data-structures/array.h @@ -19,7 +19,7 @@ #ifndef ARRAY_H #define ARRAY_H -#include "persistent-data/math_utils.h" +#include "base/math_utils.h" #include "persistent-data/data-structures/btree.h" #include "persistent-data/data-structures/btree_counter.h" #include "persistent-data/data-structures/btree_damage_visitor.h" diff --git a/persistent-data/data-structures/bitset.cc b/persistent-data/data-structures/bitset.cc index a4f0b67..02b6b9e 100644 --- a/persistent-data/data-structures/bitset.cc +++ b/persistent-data/data-structures/bitset.cc @@ -1,6 +1,6 @@ #include "persistent-data/data-structures/array.h" #include "persistent-data/data-structures/bitset.h" -#include "persistent-data/math_utils.h" +#include "base/math_utils.h" using namespace persistent_data; using namespace persistent_data::bitset_detail; diff --git a/persistent-data/data-structures/btree-remove.tcc b/persistent-data/data-structures/btree-remove.tcc new file mode 100644 index 0000000..d222273 --- /dev/null +++ 
b/persistent-data/data-structures/btree-remove.tcc @@ -0,0 +1,373 @@ +// This file is part of the thin-provisioning-tools source. +// +// thin-provisioning-tools is free software: you can redistribute it +// and/or modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// thin-provisioning-tools is distributed in the hope that it will be +// useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with thin-provisioning-tools. If not, see +// . + +namespace persistent_data { + template + btree_detail::shadow_child + btree:: + create_shadow_child(internal_node &parent, + unsigned index) + { + block_address b = parent.value_at(index); + + pair p = tm_.shadow(b, validator_); + write_ref &wr = p.first; + btree_detail::node_type type; + + node_ref n = to_node(wr); + if (n.get_type() == btree_detail::INTERNAL) { + type = btree_detail::INTERNAL; + if (p.second) + n.inc_children(internal_rc_); + } else { + type = btree_detail::LEAF; + if (p.second) { + node_ref leaf = to_node(wr); + leaf.inc_children(rc_); + } + } + + parent.set_value(index, wr.get_location()); + + return btree_detail::shadow_child(wr, type); + } + + template + void + btree:: + remove(key const &key) + { + using namespace btree_detail; + + block_address block = root_; + unsigned index = 0; + shadow_spine spine(tm_, validator_); + bool need_remove = true; + + for (unsigned level = 0; level < Levels - 1; ++level) { + need_remove = remove_location(spine, block, + key[level], &index, + internal_rc_); + if (!need_remove) + break; + + internal_node n = spine.get_node(); + block = n.value_at(index); + } + + if (need_remove) { + need_remove = remove_location(spine, block, 
+ key[Levels - 1], &index, + rc_); + if (need_remove) { + leaf_node leaf = spine.get_node(); + leaf.delete_at(index); + } + } + + root_ = spine.get_root(); + } + + template + template + bool + btree:: + remove_location(btree_detail::shadow_spine &spine, + block_address block, + uint64_t key, + unsigned *index, + RC &leaf_rc) + { + using namespace btree_detail; + + unsigned i = 0; + bool r = false; + + for (;;) { + r = spine.step(block); + + // patch up the parent to point to the new shadow + if (spine.has_parent()) { + internal_node p = spine.get_parent(); + p.set_value(i, spine.get_block()); + } + + internal_node n = spine.get_node(); + if (n.get_type() == btree_detail::LEAF) { + node_ref leaf = spine.get_node(); + boost::optional idx = leaf.exact_search(key); + if (!idx) + return false; + *index = *idx; + return true; + } + + r = rebalance_children(spine, key); + if (!r) + break; + + n = spine.get_node(); + if (n.get_type() == btree_detail::LEAF) { + node_ref leaf = spine.get_node(); + boost::optional idx = leaf.exact_search(key); + if (!idx) + return false; + *index = *idx; + return true; + } + + i = n.lower_bound(key); + block = n.value_at(i); + } + + return r; + } + + template + template + bool + btree:: + rebalance_children(btree_detail::shadow_spine &spine, uint64_t key) + { + internal_node n = spine.get_node(); + + if (n.get_nr_entries() == 1) { + block_address b = n.value_at(0); + read_ref child = tm_.read_lock(b, validator_); + + // FIXME: is it safe? 
+ ::memcpy(n.raw(), child.data(), read_ref::BLOCK_SIZE); + + tm_.get_sm()->dec(child.get_location()); + return true; + } + + int i = n.lower_bound(key); + if (i < 0) + return false; + + bool has_left_sibling = i > 0; + bool has_right_sibling = static_cast(i) < (n.get_nr_entries() - 1); + + if (!has_left_sibling) + rebalance2(spine, i); + else if (!has_right_sibling) + rebalance2(spine, i - 1); + else + rebalance3(spine, i - 1); + + return true; + } + + template + template + void + btree:: + rebalance2(btree_detail::shadow_spine &spine, unsigned left_index) + { + internal_node parent = spine.get_node(); + shadow_child left = create_shadow_child(parent, left_index); + shadow_child right = create_shadow_child(parent, left_index + 1); + + // FIXME: ugly + if (left.get_type() == btree_detail::INTERNAL) { + internal_node l = left.get_node(); + internal_node r = right.get_node(); + __rebalance2(parent, l, r, left_index); + } else { + node_ref l = left.get_node(); + node_ref r = right.get_node(); + __rebalance2(parent, l, r, left_index); + } + } + + template + template + void + btree:: + __rebalance2(internal_node &parent, + node_ref &left, + node_ref &right, + unsigned left_index) + { + unsigned nr_left = left.get_nr_entries(); + unsigned nr_right = right.get_nr_entries(); + unsigned right_index = left_index + 1; + + unsigned threshold = 2 * (left.merge_threshold() + 1); + if (nr_left + nr_right < threshold) { + // Merge the right child into the left + left.copy_entries_to_left(right, nr_right); + left.set_nr_entries(nr_left + nr_right); + parent.delete_at(right_index); + tm_.get_sm()->dec(right.get_location()); + } else { + // Rebalance + unsigned target_left = (nr_left + nr_right) / 2; + left.move_entries(right, nr_left - target_left); + parent.set_key(right_index, right.key_at(0)); + } + } + + template + template + void + btree:: + rebalance3(btree_detail::shadow_spine &spine, unsigned left_index) + { + internal_node parent = spine.get_node(); + shadow_child left = 
create_shadow_child(parent, left_index); + shadow_child center = create_shadow_child(parent, left_index + 1); + shadow_child right = create_shadow_child(parent, left_index + 2); + + // FIXME: ugly + if (left.get_type() == btree_detail::INTERNAL) { + internal_node l = left.get_node(); + internal_node c = center.get_node(); + internal_node r = right.get_node(); + __rebalance3(parent, l, c, r, left_index); + } else { + node_ref l = left.get_node(); + node_ref c = center.get_node(); + node_ref r = right.get_node(); + __rebalance3(parent, l, c, r, left_index); + } + } + + template + template + void + btree:: + __rebalance3(internal_node &parent, + node_ref &left, + node_ref ¢er, + node_ref &right, + unsigned left_index) + { + unsigned nr_left = left.get_nr_entries(); + unsigned nr_center = center.get_nr_entries(); + unsigned nr_right = right.get_nr_entries(); + + unsigned threshold = left.merge_threshold() * 4 + 1; + + if ((nr_left + nr_center + nr_right) < threshold) + delete_center_node(parent, left, center, right, left_index); + else + redistribute3(parent, left, center, right, left_index); + } + + template + template + void + btree:: + delete_center_node(internal_node &parent, + node_ref &left, + node_ref ¢er, + node_ref &right, + unsigned left_index) + { + unsigned center_index = left_index + 1; + unsigned right_index = left_index + 2; + + unsigned max_entries = left.get_max_entries(); + unsigned nr_left = left.get_nr_entries(); + unsigned nr_center = center.get_nr_entries(); + unsigned nr_right = right.get_nr_entries(); + unsigned shift = std::min(max_entries - nr_left, nr_center); + + if (nr_left + shift > max_entries) + throw std::runtime_error("too many entries"); + + left.copy_entries_to_left(center, shift); + left.set_nr_entries(nr_left + shift); + + if (shift != nr_center) { + shift = nr_center - shift; + if ((nr_right + shift) > max_entries) + throw std::runtime_error("too many entries"); + right.shift_entries_right(shift); + 
center.copy_entries_to_right(right, shift); + right.set_nr_entries(nr_right + shift); + } + parent.set_key(right_index, right.key_at(0)); + + parent.delete_at(center_index); + --right_index; + + tm_.get_sm()->dec(center.get_location()); + __rebalance2(parent, left, right, left_index); + } + + template + template + void + btree:: + redistribute3(internal_node &parent, + node_ref &left, + node_ref ¢er, + node_ref &right, + unsigned left_index) + { + unsigned center_index = left_index + 1; + unsigned right_index = left_index + 2; + + unsigned nr_left = left.get_nr_entries(); + unsigned nr_center = center.get_nr_entries(); + unsigned nr_right = right.get_nr_entries(); + + unsigned max_entries = left.get_max_entries(); + unsigned total = nr_left + nr_center + nr_right; + unsigned target_right = total / 3; + unsigned remainder = (target_right * 3) != total; + unsigned target_left = target_right + remainder; + + if (target_left > max_entries || target_right > max_entries) + throw std::runtime_error("too many entries"); + + if (nr_left < nr_right) { + int s = nr_left - target_left; + + // FIXME: signed & unsigned comparison + if (s < 0 && nr_center < static_cast(-s)) { + // not enough in central node + left.move_entries(center, -nr_center); + s += nr_center; + left.move_entries(right, s); + nr_right += s; + } else + left.move_entries(center, s); + + center.move_entries(right, target_right - nr_right); + + } else { + int s = target_right - nr_right; + + if (s > 0 && nr_center < static_cast(s)) { + // not enough in central node + center.move_entries(right, nr_center); + s -= nr_center; + left.move_entries(right, s); + nr_left -= s; + } else + center.move_entries(right, s); + + left.move_entries(center, nr_left - target_left); + } + + parent.set_key(center_index, center.key_at(0)); + parent.set_key(right_index, right.key_at(0)); + } +}; diff --git a/persistent-data/data-structures/btree.h b/persistent-data/data-structures/btree.h index 3b84df7..d3423c7 100644 --- 
a/persistent-data/data-structures/btree.h +++ b/persistent-data/data-structures/btree.h @@ -110,12 +110,34 @@ namespace persistent_data { uint64_t key, typename ValueTraits::value_type const &v); + // Removes the entry at index i and decrements the nr_entries field + void delete_at(unsigned i); + // Copies entries from another node, appends them // to the back of this node. Adjusts nr_entries. void copy_entries(node_ref const &rhs, unsigned begin, unsigned end); + // Moves entries between this node and its right sibling rhs, + // and maintains the key ordering. + // The nr_entries of both nodes are adjusted: a positive count moves entries from this node to rhs, a negative count from rhs to this node. + void move_entries(node_ref &rhs, + int count); + + // Copies entries from the beginning of rhs to the end of lhs, + // or copies entries from the end of lhs to the beginning of rhs. + // The nr_entries is not adjusted. + void copy_entries_to_left(node_ref const &rhs, unsigned count); + void copy_entries_to_right(node_ref &rhs, unsigned count) const; + + // Shifts entries to left or right. + // The nr_entries is not adjusted. + void shift_entries_left(unsigned shift); + void shift_entries_right(unsigned shift); + + unsigned merge_threshold() const; + // Various searches int bsearch(uint64_t key, int want_hi) const; boost::optional exact_search(uint64_t key) const; @@ -124,6 +146,9 @@ namespace persistent_data { template void inc_children(RefCounter &rc); + template + void dec_children(RefCounter &rc); + disk_node *raw() { return raw_; } @@ -256,6 +281,26 @@ namespace persistent_data { maybe_block root_; }; + class shadow_child { + public: + shadow_child(block_manager::write_ref &wr, node_type type) + : wr_(wr), type_(type) { + } + + node_type get_type() const { + return type_; + } + + template + node_ref get_node() { + return to_node(wr_); + } + + private: + block_manager::write_ref wr_; + node_type type_; + }; + // Used to keep a record of a nested btree's position.
typedef std::vector btree_path; @@ -396,6 +441,14 @@ namespace persistent_data { int *index, RC &leaf_rc); + template + bool + remove_location(btree_detail::shadow_spine &spine, + block_address block, + uint64_t key, + unsigned *index, + RC &leaf_rc); + void walk_tree(visitor &visitor, btree_detail::node_location const &loc, block_address b) const; @@ -408,6 +461,53 @@ namespace persistent_data { void inc_children(btree_detail::shadow_spine &spine, RefCounter &leaf_rc); + btree_detail::shadow_child + create_shadow_child(internal_node &parent, + unsigned index); + + template + bool rebalance_children(btree_detail::shadow_spine &spine, + uint64_t key); + + template + void rebalance2(btree_detail::shadow_spine &spine, + unsigned left_index); + + template + void rebalance3(btree_detail::shadow_spine &spine, + unsigned left_index); + + template + void + __rebalance2(internal_node &parent, + btree_detail::node_ref &left, + btree_detail::node_ref &right, + unsigned left_index); + + template + void + __rebalance3(internal_node &parent, + btree_detail::node_ref &left, + btree_detail::node_ref ¢er, + btree_detail::node_ref &right, + unsigned left_index); + + template + void + delete_center_node(internal_node &parent, + btree_detail::node_ref &left, + btree_detail::node_ref ¢er, + btree_detail::node_ref &right, + unsigned left_index); + + template + void + redistribute3(internal_node &parent, + btree_detail::node_ref &left, + btree_detail::node_ref ¢er, + btree_detail::node_ref &right, + unsigned left_index); + transaction_manager &tm_; bool destroy_; block_address root_; @@ -418,6 +518,7 @@ namespace persistent_data { }; #include "btree.tcc" +#include "btree-remove.tcc" //---------------------------------------------------------------- diff --git a/persistent-data/data-structures/btree.tcc b/persistent-data/data-structures/btree.tcc index b24ec29..059ebaa 100644 --- a/persistent-data/data-structures/btree.tcc +++ b/persistent-data/data-structures/btree.tcc @@ -25,6 +25,7 @@ 
#include #include +#include //---------------------------------------------------------------- @@ -33,6 +34,56 @@ namespace { using namespace persistent_data; using namespace btree_detail; using namespace std; + + struct frame { + frame(block_address blocknr, + uint32_t level, + uint32_t nr_entries) + : blocknr_(blocknr), + level_(level), + nr_entries_(nr_entries), + current_child_(0) { + } + block_address blocknr_; + uint32_t level_; + uint32_t nr_entries_; + uint32_t current_child_; + }; + + // stack for postorder DFS traversal + // TODO: Refactor it into a spine-like class, e.g., btree_del_spine, + // "Spine" sounds better for btree operations. + struct btree_del_stack { + public: + btree_del_stack(transaction_manager &tm): tm_(tm) { + } + + void push_frame(block_address blocknr, + uint32_t level, + uint32_t nr_entries) { + if (tm_.get_sm()->get_count(blocknr) > 1) + tm_.get_sm()->dec(blocknr); + else + spine_.push(frame(blocknr, level, nr_entries)); + } + + void pop_frame() { + tm_.get_sm()->dec(spine_.top().blocknr_); + spine_.pop(); + } + + frame &top_frame() { + return spine_.top(); + } + + bool is_empty() { + return spine_.empty(); + } + + private: + transaction_manager &tm_; + std::stack spine_; + }; } //---------------------------------------------------------------- @@ -242,6 +293,23 @@ namespace persistent_data { set_value(i, v); } + template + void + node_ref::delete_at(unsigned i) + { + unsigned nr_entries = get_nr_entries(); + if (i >= nr_entries) + throw runtime_error("key index out of bounds"); + unsigned nr_to_copy = nr_entries - (i + 1); + + if (nr_to_copy) { + ::memmove(key_ptr(i), key_ptr(i + 1), sizeof(uint64_t) * nr_to_copy); + ::memmove(value_ptr(i), value_ptr(i + 1), sizeof(typename ValueTraits::disk_type) * nr_to_copy); + } + + set_nr_entries(nr_entries - 1); + } + template void node_ref::copy_entries(node_ref const &rhs, @@ -258,6 +326,90 @@ namespace persistent_data { set_nr_entries(n + count); } + template + void + 
node_ref<ValueTraits>::move_entries(node_ref &rhs, // count > 0 moves entries from this node to rhs, count < 0 from rhs to this node
+					    int count)
+	{
+		if (!count)
+			return;
+
+		unsigned nr_left = get_nr_entries();
+		unsigned nr_right = rhs.get_nr_entries();
+		unsigned max_entries = get_max_entries();
+
+		if (nr_left - count > max_entries || nr_right + count > max_entries) // bound the resulting sizes (nr_left - count, nr_right + count); unsigned wrap-around also rejects taking more entries than a node holds
+			throw runtime_error("too many entries");
+
+		if (count > 0) {
+			rhs.shift_entries_right(count);
+			copy_entries_to_right(rhs, count);
+		} else {
+			copy_entries_to_left(rhs, -count);
+			rhs.shift_entries_left(-count);
+		}
+
+		set_nr_entries(nr_left - count);
+		rhs.set_nr_entries(nr_right + count);
+	}
+
+	template <typename ValueTraits>
+	void
+	node_ref<ValueTraits>::copy_entries_to_left(node_ref const &rhs, unsigned count) // appends the first count entries of rhs after this node's entries; nr_entries is left to the caller
+	{
+		unsigned n = get_nr_entries();
+		if ((n + count) > get_max_entries())
+			throw runtime_error("too many entries");
+
+		::memcpy(key_ptr(n), rhs.key_ptr(0), sizeof(uint64_t) * count);
+		::memcpy(value_ptr(n), rhs.value_ptr(0), sizeof(typename ValueTraits::disk_type) * count);
+	}
+
+	template <typename ValueTraits>
+	void
+	node_ref<ValueTraits>::copy_entries_to_right(node_ref &rhs, unsigned count) const // copies this node's last count entries over the first count slots of rhs; nr_entries is left to the caller
+	{
+		unsigned n = rhs.get_nr_entries();
+		if ((n + count) > get_max_entries())
+			throw runtime_error("too many entries");
+
+		unsigned nr_left = get_nr_entries();
+		::memcpy(rhs.key_ptr(0), key_ptr(nr_left - count), sizeof(uint64_t) * count);
+		::memcpy(rhs.value_ptr(0), value_ptr(nr_left - count), sizeof(typename ValueTraits::disk_type) * count);
+	}
+
+	template <typename ValueTraits>
+	void
+	node_ref<ValueTraits>::shift_entries_left(unsigned shift) // drops the first shift entries by sliding the rest to the front; nr_entries is left to the caller
+	{
+		unsigned n = get_nr_entries();
+		if (shift > n)
+			throw runtime_error("too many entries");
+
+		unsigned nr_shifted = n - shift;
+		::memmove(key_ptr(0), key_ptr(shift), sizeof(uint64_t) * nr_shifted);
+		::memmove(value_ptr(0), value_ptr(shift), sizeof(typename ValueTraits::disk_type) * nr_shifted);
+	}
+
+	template <typename ValueTraits>
+	void
+	node_ref<ValueTraits>::shift_entries_right(unsigned shift) // opens shift free slots at the front of the node; nr_entries is left to the caller
+	{
+		unsigned n = get_nr_entries();
+		if (n + shift > get_max_entries())
+			throw runtime_error("too many entries");
+
+		::memmove(key_ptr(shift),
key_ptr(0), sizeof(uint64_t) * n); + ::memmove(value_ptr(shift), value_ptr(0), sizeof(typename ValueTraits::disk_type) * n); + } + + template + unsigned + node_ref::merge_threshold() const + { + return get_max_entries() / 3; + } + template int node_ref::bsearch(uint64_t key, int want_hi) const @@ -348,6 +500,21 @@ namespace persistent_data { } } + template + template + void + node_ref::dec_children(RefCounter &rc) + { + unsigned nr_entries = get_nr_entries(); + for (unsigned i = 0; i < nr_entries; i++) { + typename ValueTraits::value_type v; + typename ValueTraits::disk_type d; + ::memcpy(&d, value_ptr(i), sizeof(d)); + ValueTraits::unpack(d, v); + rc.dec(v); + } + } + template bool node_ref::value_sizes_match() const { @@ -535,13 +702,6 @@ namespace persistent_data { return need_insert; } - template - void - btree::remove(key const &key) - { - using namespace btree_detail; - } - template block_address btree::get_root() const @@ -565,15 +725,57 @@ namespace persistent_data { return ptr(new btree(tm_, root_, rc_)); } -#if 0 template void btree::destroy() { using namespace btree_detail; + btree_del_stack s(tm_); + + { + read_ref blk = tm_.read_lock(root_, validator_); + internal_node n = to_node(blk); + s.push_frame(root_, 0, n.get_nr_entries()); + } + + while (!s.is_empty()) { + frame &f = s.top_frame(); + + if (f.current_child_ >= f.nr_entries_) { + s.pop_frame(); + continue; + } + + // FIXME: Cache the read_ref object in the stack to avoid temporary objects? 
+ read_ref current = tm_.read_lock(f.blocknr_, validator_); + internal_node n = to_node(current); + + if (n.get_type() == INTERNAL) { + // TODO: test performance penalty of prefetching + //if (!f.current_child_) + // for (unsigned i = 0; i < n.get_nr_entries(); i++) + // tm_.prefetch(n.value_at(i)); + + block_address b = n.value_at(f.current_child_); + read_ref leaf = tm_.read_lock(b, validator_); + internal_node o = to_node(leaf); + s.push_frame(b, f.level_, o.get_nr_entries()); + ++f.current_child_; + // internal leaf + } else if (f.level_ < Levels - 1) { + block_address b = n.value_at(f.current_child_); + read_ref leaf = tm_.read_lock(b, validator_); + internal_node o = to_node(leaf); + s.push_frame(b, f.level_ + 1, o.get_nr_entries()); + ++f.current_child_; + } else { + leaf_node o = to_node(current); + o.dec_children(rc_); // FIXME: move this into pop_frame() + s.pop_frame(); + } + } } -#endif template template diff --git a/persistent-data/file_utils.cc b/persistent-data/file_utils.cc index 98b31c4..329af86 100644 --- a/persistent-data/file_utils.cc +++ b/persistent-data/file_utils.cc @@ -1,4 +1,4 @@ -#include "persistent-data/math_utils.h" +#include "base/math_utils.h" #include "persistent-data/file_utils.h" #include "persistent-data/space-maps/core.h" diff --git a/persistent-data/space-maps/core.cc b/persistent-data/space-maps/core.cc index 463d212..4251ef6 100644 --- a/persistent-data/space-maps/core.cc +++ b/persistent-data/space-maps/core.cc @@ -17,7 +17,7 @@ // . 
#include "persistent-data/space-maps/core.h" -#include "persistent-data/math_utils.h" +#include "base/math_utils.h" #include diff --git a/persistent-data/space-maps/disk.cc b/persistent-data/space-maps/disk.cc index 28e13d3..87c8fe5 100644 --- a/persistent-data/space-maps/disk.cc +++ b/persistent-data/space-maps/disk.cc @@ -26,7 +26,7 @@ #include "persistent-data/data-structures/btree_damage_visitor.h" #include "persistent-data/data-structures/btree_counter.h" #include "persistent-data/checksum.h" -#include "persistent-data/math_utils.h" +#include "base/math_utils.h" #include "persistent-data/transaction_manager.h" using namespace persistent_data; diff --git a/thin-provisioning/commands.cc b/thin-provisioning/commands.cc index 13dc76c..23f4b0e 100644 --- a/thin-provisioning/commands.cc +++ b/thin-provisioning/commands.cc @@ -23,6 +23,7 @@ thin_provisioning::register_thin_commands(base::application &app) app.add_cmd(command::ptr(new thin_ll_restore_cmd())); app.add_cmd(command::ptr(new thin_scan_cmd())); app.add_cmd(command::ptr(new thin_generate_metadata_cmd())); + app.add_cmd(command::ptr(new thin_generate_mappings_cmd())); app.add_cmd(command::ptr(new thin_show_duplicates_cmd())); app.add_cmd(command::ptr(new thin_show_metadata_cmd())); app.add_cmd(command::ptr(new thin_journal_cmd())); diff --git a/thin-provisioning/commands.h b/thin-provisioning/commands.h index 6b80bdb..e3a41e7 100644 --- a/thin-provisioning/commands.h +++ b/thin-provisioning/commands.h @@ -110,6 +110,13 @@ namespace thin_provisioning { virtual int run(int argc, char **argv); }; + class thin_generate_mappings_cmd : public base::command { + public: + thin_generate_mappings_cmd(); + virtual void usage(std::ostream &out) const; + virtual int run(int argc, char **argv); + }; + class thin_show_metadata_cmd : public base::command { public: thin_show_metadata_cmd(); diff --git a/thin-provisioning/device_tree.cc b/thin-provisioning/device_tree.cc index 4837cb7..4b4e0dd 100644 --- 
a/thin-provisioning/device_tree.cc +++ b/thin-provisioning/device_tree.cc @@ -54,6 +54,13 @@ namespace thin_provisioning { snapshotted_time_(0) { } + device_details::device_details(uint64_t tid, uint32_t time) + : mapped_blocks_(0), + transaction_id_(tid), + creation_time_(time), + snapshotted_time_(time) { + } + void device_details_traits::unpack(device_details_disk const &disk, device_details &value) { diff --git a/thin-provisioning/device_tree.h b/thin-provisioning/device_tree.h index d284ac5..d7178cd 100644 --- a/thin-provisioning/device_tree.h +++ b/thin-provisioning/device_tree.h @@ -17,6 +17,7 @@ namespace thin_provisioning { struct device_details { device_details(); + device_details(uint64_t tid, uint32_t time); uint64_t mapped_blocks_; uint64_t transaction_id_; /* when created */ diff --git a/thin-provisioning/metadata.cc b/thin-provisioning/metadata.cc index 9f81a51..3c5446a 100644 --- a/thin-provisioning/metadata.cc +++ b/thin-provisioning/metadata.cc @@ -20,7 +20,7 @@ #include "thin-provisioning/metadata.h" #include "persistent-data/file_utils.h" -#include "persistent-data/math_utils.h" +#include "base/math_utils.h" #include "persistent-data/space-maps/core.h" #include "persistent-data/space-maps/disk.h" diff --git a/thin-provisioning/metadata.h b/thin-provisioning/metadata.h index e8bb5fc..d7d1afe 100644 --- a/thin-provisioning/metadata.h +++ b/thin-provisioning/metadata.h @@ -37,7 +37,6 @@ namespace thin_provisioning { using namespace base; using namespace persistent_data; - typedef uint64_t sector_t; typedef uint32_t thin_dev_t; //------------------------------------------------ diff --git a/thin-provisioning/thin_generate_mappings.cc b/thin-provisioning/thin_generate_mappings.cc new file mode 100644 index 0000000..a366eba --- /dev/null +++ b/thin-provisioning/thin_generate_mappings.cc @@ -0,0 +1,207 @@ +// This file is part of the thin-provisioning-tools source. 
+// +// thin-provisioning-tools is free software: you can redistribute it +// and/or modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// thin-provisioning-tools is distributed in the hope that it will be +// useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with thin-provisioning-tools. If not, see +// . + +#include "base/io_generator.h" +#include "base/output_file_requirements.h" +#include "persistent-data/file_utils.h" +#include "thin-provisioning/commands.h" +#include "thin-provisioning/thin_pool.h" +#include "version.h" + +#include +#include +#include + +using namespace boost; +using namespace thin_provisioning; + +//---------------------------------------------------------------- + +namespace { + struct flags { + flags() + : pattern("write"), + offset(0) + { + } + + bool check_conformance(); + + boost::optional output; + base::io_pattern pattern; + boost::optional dev_id; + boost::optional block_size; + base::sector_t offset; + boost::optional size; + boost::optional io_size; + }; + + bool flags::check_conformance() { + if (!output) { + cerr << "No output file provided." << endl; + return false; + } + + if (!dev_id) { + cerr << "No device id provided." 
<< endl; + return false; + } + + if (!size) { + cerr << "No device size specified" << endl; + return false; + } + + check_output_file_requirements(*output); + + return true; + } + + //-------------------------------- + + thin_pool::ptr open_pool(flags const &fs) { + block_manager::ptr bm = open_bm(*fs.output, block_manager::READ_WRITE); + return thin_pool::ptr(new thin_pool(bm)); + } + + int generate_mappings(flags const &fs) { + thin_pool::ptr pool = open_pool(fs); + + thin::ptr td = pool->open_thin(*fs.dev_id); + + io_generator_options opts; + opts.pattern_ = fs.pattern; + opts.block_size_ = !fs.block_size ? + pool->get_data_block_size() : + *fs.block_size; + opts.offset_ = fs.offset; + opts.size_ = *fs.size; + opts.io_size_ = !fs.io_size ? *fs.size : *fs.io_size; + io_generator::ptr gen = create_io_generator(opts); + + base::io io; + while (gen->has_next()) { + // TODO: support io.size_ + gen->next(io); + + switch (io.op_) { + case base::REQ_OP_READ: + process_read(td, pool, io.sector_); + break; + case base::REQ_OP_WRITE: + process_write(td, pool, io.sector_); + break; + case base::REQ_OP_DISCARD: + process_discard(td, pool, io.sector_); + break; + } + } + + pool->commit(); + + return 0; + } +} + +//---------------------------------------------------------------- + +thin_generate_mappings_cmd::thin_generate_mappings_cmd() + : command("thin_generate_mappings") +{ +} + +void +thin_generate_mappings_cmd::usage(std::ostream &out) const +{ + out << "Usage: " << get_name() << " [options]\n" + << "Options:\n" + << " {-h|--help}\n" + << " {-o|--output} \n" + << " {--dev-id} \n" + << " {--offset} \n" + << " {--io-size} \n" + << " {--rw write|trim|randwrite|randtrim|randtw}\n" + << " {--size} \n" + << " {-V|--version}" << endl; +} + +int +thin_generate_mappings_cmd::run(int argc, char **argv) +{ + int c; + struct flags fs; + const char *shortopts = "hi:o:qV"; + const struct option longopts[] = { + { "help", no_argument, NULL, 'h' }, + { "output", required_argument, NULL, 
'o' }, + { "dev-id", required_argument, NULL, 1 }, + { "rw", required_argument, NULL, 2 }, + { "offset", required_argument, NULL, 3 }, + { "size", required_argument, NULL, 4 }, + { "io-size", required_argument, NULL, 5 }, + { "version", no_argument, NULL, 'V' }, + { NULL, no_argument, NULL, 0 } + }; + + while ((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) { + switch(c) { + case 'h': + usage(cout); + return 0; + + case 'o': + fs.output = optarg; + break; + + case 1: + fs.dev_id = parse_uint64(optarg, "dev_id"); + break; + + case 2: + fs.pattern.parse(optarg); + break; + + case 3: + fs.offset = parse_uint64(optarg, "offset"); + break; + + case 4: + fs.size = parse_uint64(optarg, "size"); + break; + + case 5: + fs.io_size = parse_uint64(optarg, "io_size"); + break; + + case 'V': + cout << THIN_PROVISIONING_TOOLS_VERSION << endl; + return 0; + + default: + usage(cerr); + return 1; + } + } + + if (!fs.check_conformance()) { + usage(cerr); + return 1; + } + + return generate_mappings(fs); +} + +//---------------------------------------------------------------- diff --git a/thin-provisioning/thin_generate_metadata.cc b/thin-provisioning/thin_generate_metadata.cc index f7127cc..e535273 100644 --- a/thin-provisioning/thin_generate_metadata.cc +++ b/thin-provisioning/thin_generate_metadata.cc @@ -19,7 +19,7 @@ #include "base/output_file_requirements.h" #include "persistent-data/file_utils.h" #include "thin-provisioning/commands.h" -#include "thin-provisioning/metadata.h" +#include "thin-provisioning/thin_pool.h" #include "version.h" #include @@ -27,8 +27,6 @@ #include using namespace boost; -using namespace persistent_data; -using namespace std; using namespace thin_provisioning; //---------------------------------------------------------------- @@ -40,6 +38,11 @@ namespace { METADATA_OP_FORMAT, METADATA_OP_OPEN, METADATA_OP_CREATE_THIN, + METADATA_OP_CREATE_SNAP, + METADATA_OP_DELETE_DEV, + METADATA_OP_SET_TRANSACTION_ID, + 
METADATA_OP_RESERVE_METADATA_SNAP, + METADATA_OP_RELEASE_METADATA_SNAP, METADATA_OP_LAST }; @@ -55,7 +58,9 @@ namespace { metadata_operations op; sector_t data_block_size; block_address nr_data_blocks; - optional dev_id; + optional dev_id; + optional origin; + optional trans_id; optional output; }; @@ -77,88 +82,63 @@ namespace { return false; } + if (op == METADATA_OP_CREATE_SNAP && (!dev_id || !origin)) { + cerr << "no device id provided." << endl; + return false; + } + + if (op == METADATA_OP_DELETE_DEV && !dev_id) { + cerr << "no device id provided." << endl; + return false; + } + + if (op == METADATA_OP_SET_TRANSACTION_ID && !trans_id) { + cerr << "no transaction id provided." << endl; + return false; + } + return true; } //-------------------------------- - single_mapping_tree::ptr new_mapping_tree(metadata::ptr md) { - return single_mapping_tree::ptr( - new single_mapping_tree(*md->tm_, - mapping_tree_detail::block_time_ref_counter(md->data_sm_))); - } - - bool is_device_exists(metadata::ptr md, uint64_t dev_id) { - uint64_t key[1] = {dev_id}; - - device_tree::maybe_value v1 = md->details_->lookup(key); - if (v1) - return true; - - dev_tree::maybe_value v2 = md->mappings_top_level_->lookup(key); - if (v2) - return true; - - return false; - } - - //-------------------------------- - - metadata::ptr format_metadata(block_manager::ptr bm, - sector_t data_block_size, - block_address nr_data_blocks) { - metadata::ptr md(new metadata(bm, - metadata::CREATE, - data_block_size, - nr_data_blocks)); - md->commit(); - return md; - } - - metadata::ptr open_metadata(block_manager::ptr bm) { - metadata::ptr md(new metadata(bm, true)); - return md; - } - - void create_thin(metadata::ptr md, uint64_t dev_id) { - uint64_t key[1] = {dev_id}; - - if (is_device_exists(md, dev_id)) - throw runtime_error("device already exists"); - - device_tree_detail::device_details details; - details.transaction_id_ = md->sb_.trans_id_; - details.creation_time_ = md->sb_.time_; - 
details.snapshotted_time_ = details.creation_time_; - md->details_->insert(key, details); - - single_mapping_tree::ptr subtree = new_mapping_tree(md); - md->mappings_top_level_->insert(key, subtree->get_root()); - md->mappings_->set_root(md->mappings_top_level_->get_root()); // FIXME: ugly - - md->commit(); - } - - metadata::ptr open_or_format_metadata(block_manager::ptr bm, flags const &fs) { + thin_pool::ptr open_or_create_pool(flags const &fs) { + block_manager::ptr bm = open_bm(*fs.output, block_manager::READ_WRITE); if (fs.op == flags::METADATA_OP_FORMAT) - return format_metadata(bm, fs.data_block_size, fs.nr_data_blocks); + return thin_pool::ptr(new thin_pool(bm, fs.data_block_size, fs.nr_data_blocks)); else - return open_metadata(bm); + return thin_pool::ptr(new thin_pool(bm)); } int generate_metadata(flags const &fs) { - block_manager::ptr bm = open_bm(*fs.output, block_manager::READ_WRITE); - metadata::ptr md = open_or_format_metadata(bm, fs); + thin_pool::ptr pool = open_or_create_pool(fs); switch (fs.op) { case flags::METADATA_OP_CREATE_THIN: - create_thin(md, *fs.dev_id); + pool->create_thin(*fs.dev_id); + break; + case flags::METADATA_OP_CREATE_SNAP: + pool->create_snap(*fs.dev_id, *fs.origin); + break; + case flags::METADATA_OP_DELETE_DEV: + pool->del(*fs.dev_id); + break; + case flags::METADATA_OP_SET_TRANSACTION_ID: + pool->set_transaction_id(*fs.trans_id); + break; + case flags::METADATA_OP_RESERVE_METADATA_SNAP: + pool->reserve_metadata_snap(); + break; + case flags::METADATA_OP_RELEASE_METADATA_SNAP: + pool->release_metadata_snap(); break; default: break; } + pool->commit(); + return 0; } } @@ -176,8 +156,16 @@ thin_generate_metadata_cmd::usage(std::ostream &out) const out << "Usage: " << get_name() << " [options]\n" << "Options:\n" << " {-h|--help}\n" - << " --data-block-size \n" - << " --nr-data-blocks \n" + << " {--format}\n" + << " {--create-thin} \n" + << " {--create-snap} \n" + << " {--delete} \n" + << " {--reserve-metadata-snap}\n" + << " 
{--release-metadata-snap}\n" + << " {--set-transaction-id} \n" + << " {--data-block-size} \n" + << " {--nr-data-blocks} \n" + << " {--origin} \n" << " {-o|--output} \n" << " {-V|--version}" << endl; } @@ -193,10 +181,15 @@ thin_generate_metadata_cmd::run(int argc, char **argv) { "output", required_argument, NULL, 'o' }, { "format", no_argument, NULL, 1 }, { "open", no_argument, NULL, 2 }, - { "create-thin", no_argument, NULL, 3 }, + { "create-thin", required_argument, NULL, 3 }, + { "create-snap", required_argument, NULL, 4 }, + { "delete", required_argument, NULL, 5 }, + { "set-transaction-id", required_argument, NULL, 6 }, + { "reserve-metadata-snap", no_argument, NULL, 7 }, + { "release-metadata-snap", no_argument, NULL, 8 }, { "data-block-size", required_argument, NULL, 101 }, { "nr-data-blocks", required_argument, NULL, 102 }, - { "dev-id", required_argument, NULL, 301 }, + { "origin", required_argument, NULL, 401 }, { "version", no_argument, NULL, 'V' }, { NULL, no_argument, NULL, 0 } }; @@ -221,6 +214,30 @@ thin_generate_metadata_cmd::run(int argc, char **argv) case 3: fs.op = flags::METADATA_OP_CREATE_THIN; + fs.dev_id = parse_uint64(optarg, "device id"); + break; + + case 4: + fs.op = flags::METADATA_OP_CREATE_SNAP; + fs.dev_id = parse_uint64(optarg, "device id"); + break; + + case 5: + fs.op = flags::METADATA_OP_DELETE_DEV; + fs.dev_id = parse_uint64(optarg, "device id"); + break; + + case 6: + fs.op = flags::METADATA_OP_SET_TRANSACTION_ID; + fs.trans_id = parse_uint64(optarg, "transaction id"); + break; + + case 7: + fs.op = flags::METADATA_OP_RESERVE_METADATA_SNAP; + break; + + case 8: + fs.op = flags::METADATA_OP_RELEASE_METADATA_SNAP; break; case 101: @@ -231,8 +248,8 @@ thin_generate_metadata_cmd::run(int argc, char **argv) fs.nr_data_blocks = parse_uint64(optarg, "nr data blocks"); break; - case 301: - fs.dev_id = parse_uint64(optarg, "dev id"); + case 401: + fs.origin = parse_uint64(optarg, "origin"); break; case 'V': diff --git 
a/thin-provisioning/thin_pool.cc b/thin-provisioning/thin_pool.cc index d133711..3bdc9e7 100644 --- a/thin-provisioning/thin_pool.cc +++ b/thin-provisioning/thin_pool.cc @@ -16,13 +16,10 @@ // with thin-provisioning-tools. If not, see // . +#include "base/math_utils.h" #include "thin-provisioning/thin_pool.h" #include -#include -#include -#include -#include using namespace base; using namespace std; @@ -31,9 +28,22 @@ using namespace thin_provisioning; //---------------------------------------------------------------- -thin::thin(thin_dev_t dev, thin_pool *pool) +thin::thin(thin_dev_t dev, thin_pool &pool) : dev_(dev), - pool_(pool) + pool_(pool), + details_(pool.get_transaction_id(), pool.get_time()), + open_count_(1), + changed_(true) +{ +} + +thin::thin(thin_dev_t dev, thin_pool &pool, + device_tree_detail::device_details const &details) + : dev_(dev), + pool_(pool), + details_(details), + open_count_(1), + changed_(false) { } @@ -47,66 +57,76 @@ thin::maybe_address thin::lookup(block_address thin_block) { uint64_t key[2] = {dev_, thin_block}; - return pool_->md_->mappings_->lookup(key); + mapping_tree::maybe_value m = pool_.md_->mappings_->lookup(key); + if (!m) + return thin::maybe_address(); + + lookup_result r; + r.block_ = m->block_; + r.shared_ = m->time_ < details_.snapshotted_time_; + return r; } bool thin::insert(block_address thin_block, block_address data_block) { uint64_t key[2] = {dev_, thin_block}; + + ++details_.mapped_blocks_; + changed_ = true; + mapping_tree_detail::block_time bt; bt.block_ = data_block; - bt.time_ = 0; // FIXME: use current time. 
- return pool_->md_->mappings_->insert(key, bt); + bt.time_ = pool_.get_time(); + return pool_.md_->mappings_->insert(key, bt); } void thin::remove(block_address thin_block) { uint64_t key[2] = {dev_, thin_block}; - pool_->md_->mappings_->remove(key); + pool_.md_->mappings_->remove(key); + + --details_.mapped_blocks_; + changed_ = true; } void thin::set_snapshot_time(uint32_t time) { - uint64_t key[1] = { dev_ }; - boost::optional mdetail = pool_->md_->details_->lookup(key); - if (!mdetail) - throw runtime_error("no such device"); - - mdetail->snapshotted_time_ = time; - pool_->md_->details_->insert(key, *mdetail); + details_.snapshotted_time_ = time; + changed_ = true; } block_address thin::get_mapped_blocks() const { - uint64_t key[1] = { dev_ }; - boost::optional mdetail = pool_->md_->details_->lookup(key); - if (!mdetail) - throw runtime_error("no such device"); - - return mdetail->mapped_blocks_; + return details_.mapped_blocks_; } void thin::set_mapped_blocks(block_address count) { - uint64_t key[1] = { dev_ }; - boost::optional mdetail = pool_->md_->details_->lookup(key); - if (!mdetail) - throw runtime_error("no such device"); - - mdetail->mapped_blocks_ = count; - pool_->md_->details_->insert(key, *mdetail); + details_.mapped_blocks_ = count; + changed_ = true; } //-------------------------------- -thin_pool::thin_pool(metadata::ptr md) - : md_(md) +thin_pool::thin_pool(block_manager::ptr bm) { + md_ = metadata::ptr(new metadata(bm, true)); +} + +thin_pool::thin_pool(block_manager::ptr bm, + sector_t data_block_size, + block_address nr_data_blocks) +{ + md_ = metadata::ptr(new metadata(bm, + metadata::CREATE, + data_block_size, + nr_data_blocks)); + md_->commit(); } thin_pool::~thin_pool() @@ -120,14 +140,15 @@ thin_pool::create_thin(thin_dev_t dev) uint64_t key[1] = {dev}; if (device_exists(dev)) - throw std::runtime_error("Device already exists"); + throw std::runtime_error("device already exists"); single_mapping_tree::ptr new_tree(new 
single_mapping_tree(*md_->tm_, mapping_tree_detail::block_time_ref_counter(md_->data_sm_))); md_->mappings_top_level_->insert(key, new_tree->get_root()); md_->mappings_->set_root(md_->mappings_top_level_->get_root()); // FIXME: ugly - // FIXME: doesn't set up the device details + thin::ptr r = create_device(dev); + close_device(r); } void @@ -136,31 +157,62 @@ thin_pool::create_snap(thin_dev_t dev, thin_dev_t origin) uint64_t snap_key[1] = {dev}; uint64_t origin_key[1] = {origin}; - boost::optional mtree_root = md_->mappings_top_level_->lookup(origin_key); + if (device_exists(dev)) + throw std::runtime_error("device already exists"); + + // find the mapping tree of the origin + dev_tree::maybe_value mtree_root = md_->mappings_top_level_->lookup(origin_key); if (!mtree_root) throw std::runtime_error("unknown origin"); - single_mapping_tree otree(*md_->tm_, *mtree_root, mapping_tree_detail::block_time_ref_counter(md_->data_sm_)); + // clone the origin single_mapping_tree::ptr clone(otree.clone()); md_->mappings_top_level_->insert(snap_key, clone->get_root()); md_->mappings_->set_root(md_->mappings_top_level_->get_root()); // FIXME: ugly md_->sb_.time_++; - thin::ptr o = open_thin(origin); - thin::ptr s = open_thin(dev); - o->set_snapshot_time(md_->sb_.time_); - s->set_snapshot_time(md_->sb_.time_); - s->set_mapped_blocks(o->get_mapped_blocks()); + // create details for the snapshot + thin::ptr s = create_device(dev); + set_snapshot_details(s, origin); + close_device(s); } void thin_pool::del(thin_dev_t dev) { uint64_t key[1] = {dev}; + + thin::ptr td = open_device(dev); + if (td->open_count_ > 1) { + close_device(td); + throw std::runtime_error("device busy"); + } + + thin_devices_.erase(dev); + + dev_tree::maybe_value mtree_root = md_->mappings_top_level_->lookup(key); + if (!device_exists(dev) || !mtree_root) + throw std::runtime_error("unknown device"); + + // TODO: trigger subtree deletion from the mtree_ref_counter, + // like the kenrel subtree_dec() does. 
+ single_mapping_tree mtree(*md_->tm_, *mtree_root, + mapping_tree_detail::block_time_ref_counter(md_->data_sm_)); + mtree.destroy(); + + md_->details_->remove(key); md_->mappings_top_level_->remove(key); + md_->mappings_->set_root(md_->mappings_top_level_->get_root()); // FIXME: ugly +} + +void +thin_pool::commit() +{ + write_changed_details(); + md_->commit(); } void @@ -175,6 +227,52 @@ thin_pool::get_transaction_id() const return md_->sb_.trans_id_; } +void +thin_pool::reserve_metadata_snap() +{ + if (md_->sb_.metadata_snap_) + throw std::runtime_error("pool metadata snapshot already exists."); + + commit(); + + md_->metadata_sm_->inc(superblock_detail::SUPERBLOCK_LOCATION); + transaction_manager::write_ref wr = md_->tm_->shadow( + superblock_detail::SUPERBLOCK_LOCATION, + superblock_validator()).first; + + superblock_detail::superblock sb; + superblock_detail::superblock_disk *sbd = reinterpret_cast(wr.data()); + superblock_detail::superblock_traits::unpack(*sbd, sb); + + memset(sb.data_space_map_root_, 0, superblock_detail::SPACE_MAP_ROOT_SIZE); + memset(sb.metadata_space_map_root_, 0, superblock_detail::SPACE_MAP_ROOT_SIZE); + md_->metadata_sm_->inc(sb.data_mapping_root_); + md_->metadata_sm_->inc(sb.device_details_root_); + + superblock_detail::superblock_traits::pack(sb, *sbd); + + md_->sb_.metadata_snap_ = wr.get_location(); +} + +void +thin_pool::release_metadata_snap() +{ + if (!md_->sb_.metadata_snap_) + throw std::runtime_error("No pool metadata snapshot found"); + + superblock_detail::superblock sb = read_superblock(md_->tm_->get_bm(), + md_->sb_.metadata_snap_); + device_tree dtree(*md_->tm_, sb.device_details_root_, + device_tree_detail::device_details_traits::ref_counter()); + dtree.destroy(); + mapping_tree mtree(*md_->tm_, sb.data_mapping_root_, + mapping_tree_detail::block_traits::ref_counter(md_->tm_->get_sm())); + mtree.destroy(); + md_->metadata_sm_->dec(md_->sb_.metadata_snap_); + + md_->sb_.metadata_snap_ = 0; +} + block_address 
thin_pool::get_metadata_snap() const { @@ -184,7 +282,7 @@ thin_pool::get_metadata_snap() const block_address thin_pool::alloc_data_block() { - boost::optional mb = md_->data_sm_->new_block(); + space_map::maybe_block mb = md_->data_sm_->new_block(); if (!mb) throw runtime_error("couldn't allocate new block"); @@ -203,7 +301,7 @@ thin_pool::get_nr_free_data_blocks() const return md_->data_sm_->get_nr_free(); } -thin_provisioning::sector_t +sector_t thin_pool::get_data_block_size() const { return md_->sb_.data_block_size_; @@ -215,17 +313,22 @@ thin_pool::get_data_dev_size() const return md_->data_sm_->get_nr_blocks(); } +uint32_t +thin_pool::get_time() const +{ + return md_->sb_.time_; +} + thin::ptr thin_pool::open_thin(thin_dev_t dev) { - uint64_t key[1] = {dev}; - boost::optional mdetails = md_->details_->lookup(key); - if (!mdetails) - throw runtime_error("no such device"); + return open_device(dev); +} - thin *ptr = new thin(dev, this); - thin::ptr r(ptr); - return r; +void +thin_pool::close_thin(thin::ptr td) +{ + close_device(td); } bool @@ -235,4 +338,107 @@ thin_pool::device_exists(thin_dev_t dev) const return !!md_->details_->lookup(key); } +thin::ptr +thin_pool::create_device(thin_dev_t dev) +{ + device_map::iterator it = thin_devices_.find(dev); + if (it != thin_devices_.end()) + throw std::runtime_error("device already exists"); + + thin::ptr td(new thin(dev, *this)); + thin_devices_[dev] = td; + return td; +} + +thin::ptr +thin_pool::open_device(thin_dev_t dev) +{ + device_map::iterator it = thin_devices_.find(dev); + if (it != thin_devices_.end()) { + thin::ptr td = it->second; + td->open_count_++; + return td; + } + + uint64_t key[1] = {dev}; + device_tree::maybe_value details = md_->details_->lookup(key); + if (!details) + throw std::runtime_error("no such device"); + + thin::ptr td(new thin(dev, *this, *details)); + thin_devices_[dev] = td; + return td; +} + +void +thin_pool::close_device(thin::ptr td) +{ + td->open_count_--; +} + +void 
+thin_pool::set_snapshot_details(thin::ptr snap, thin_dev_t origin) +{ + thin::ptr o = open_device(origin); + o->set_snapshot_time(md_->sb_.time_); + snap->set_snapshot_time(md_->sb_.time_); + snap->set_mapped_blocks(o->get_mapped_blocks()); + close_device(o); +} + +void +thin_pool::write_changed_details() +{ + for (auto it = thin_devices_.cbegin(); it != thin_devices_.cend(); ) { + uint64_t key[1] = {it->first}; + thin::ptr td = it->second; + + if (td->changed_) { + md_->details_->insert(key, td->details_); + td->changed_ = false; + } + + if (!td->open_count_) + it = thin_devices_.erase(it); + else + ++it; + } +} + +//---------------------------------------------------------------- + +void +thin_provisioning::process_read(thin::ptr td, thin_pool::ptr tp, + sector_t offset) +{ + block_address blocknr = base::div_up(offset, tp->get_data_block_size()); + td->lookup(blocknr); +} + +void +thin_provisioning::process_write(thin::ptr td, thin_pool::ptr tp, + sector_t offset) +{ + block_address blocknr = base::div_up(offset, tp->get_data_block_size()); + thin::maybe_address result = td->lookup(blocknr); + if (!!result && !result->shared_) + return; + // TODO: handle out-of-space errors + block_address data_block = tp->alloc_data_block(); + td->insert(blocknr, data_block); +} + +void +thin_provisioning::process_discard(thin::ptr td, thin_pool::ptr tp, + sector_t offset) +{ + block_address blocknr = base::div_up(offset, tp->get_data_block_size()); + thin::maybe_address result = td->lookup(blocknr); + if (!result) + return; + td->remove(blocknr); + if (!result->shared_) + tp->free_data_block(result->block_); +} + //---------------------------------------------------------------- diff --git a/thin-provisioning/thin_pool.h b/thin-provisioning/thin_pool.h index fe4248b..62f6945 100644 --- a/thin-provisioning/thin_pool.h +++ b/thin-provisioning/thin_pool.h @@ -33,8 +33,13 @@ namespace thin_provisioning { class thin_pool; class thin { public: + struct lookup_result { + 
block_address block_; + bool shared_; + }; + typedef std::shared_ptr ptr; - typedef boost::optional maybe_address; + typedef boost::optional maybe_address; thin_dev_t get_dev_t() const; maybe_address lookup(block_address thin_block); @@ -48,26 +53,38 @@ namespace thin_provisioning { private: friend class thin_pool; - thin(thin_dev_t dev, thin_pool *pool); // FIXME: pass a reference rather than a ptr + thin(thin_dev_t dev, thin_pool &pool); + thin(thin_dev_t dev, thin_pool &pool, + device_tree_detail::device_details const &details); thin_dev_t dev_; - thin_pool *pool_; + thin_pool &pool_; + device_tree_detail::device_details details_; + uint32_t open_count_; + bool changed_; }; class thin_pool { public: typedef std::shared_ptr ptr; - thin_pool(metadata::ptr md); + thin_pool(block_manager::ptr bm); + thin_pool(block_manager::ptr bm, + sector_t data_block_size, + block_address nr_data_blocks); ~thin_pool(); void create_thin(thin_dev_t dev); void create_snap(thin_dev_t dev, thin_dev_t origin); void del(thin_dev_t); + void commit(); void set_transaction_id(uint64_t id); uint64_t get_transaction_id() const; + // handling metadata snapshot + void reserve_metadata_snap(); + void release_metadata_snap(); block_address get_metadata_snap() const; block_address alloc_data_block(); @@ -77,15 +94,29 @@ namespace thin_provisioning { block_address get_nr_free_data_blocks() const; sector_t get_data_block_size() const; block_address get_data_dev_size() const; + uint32_t get_time() const; thin::ptr open_thin(thin_dev_t); + void close_thin(thin::ptr td); private: friend class thin; + typedef std::map device_map; + bool device_exists(thin_dev_t dev) const; + thin::ptr create_device(thin_dev_t dev); + thin::ptr open_device(thin_dev_t dev); + void close_device(thin::ptr device); + void set_snapshot_details(thin::ptr snap, thin_dev_t origin); + void write_changed_details(); metadata::ptr md_; + device_map thin_devices_; }; + + void process_read(thin::ptr td, thin_pool::ptr tp, sector_t 
offset); + void process_write(thin::ptr td, thin_pool::ptr tp, sector_t offset); + void process_discard(thin::ptr td, thin_pool::ptr tp, sector_t offset); }; //----------------------------------------------------------------