diff --git a/Makefile.in b/Makefile.in index d4ff973..9e62019 100644 --- a/Makefile.in +++ b/Makefile.in @@ -33,6 +33,7 @@ SOURCE=\ base/error_string.cc \ base/grid_layout.cc \ base/progress_monitor.cc \ + base/rolling_hash.cc \ base/xml_utils.cc \ block-cache/block_cache.cc \ caching/cache_check.cc \ @@ -76,12 +77,16 @@ SOURCE=\ persistent-data/transaction_manager.cc \ persistent-data/validators.cc \ thin-provisioning/commands.cc \ + thin-provisioning/cache_stream.cc \ + thin-provisioning/chunk_stream.cc \ thin-provisioning/device_tree.cc \ + thin-provisioning/fixed_chunk_stream.cc \ thin-provisioning/human_readable_format.cc \ thin-provisioning/mapping_tree.cc \ thin-provisioning/metadata.cc \ thin-provisioning/metadata_checker.cc \ thin-provisioning/metadata_dumper.cc \ + thin-provisioning/pool_stream.cc \ thin-provisioning/restore_emitter.cc \ thin-provisioning/rmap_visitor.cc \ thin-provisioning/superblock.cc \ @@ -94,11 +99,13 @@ SOURCE=\ thin-provisioning/thin_repair.cc \ thin-provisioning/thin_restore.cc \ thin-provisioning/thin_rmap.cc \ + thin-provisioning/thin_show_duplicates.cc \ thin-provisioning/thin_trim.cc \ - thin-provisioning/xml_format.cc \ + thin-provisioning/xml_format.cc DEVTOOLS_SOURCE=\ - thin-provisioning/thin_generate_metadata.cc + thin-provisioning/thin_generate_metadata.cc \ + thin-provisioning/variable_chunk_stream.cc ifeq ("@DEVTOOLS@", "yes") SOURCE+=$(DEVTOOLS_SOURCE) @@ -206,6 +213,7 @@ install: bin/pdata_tools ln -s -f pdata_tools $(BINDIR)/thin_repair ln -s -f pdata_tools $(BINDIR)/thin_restore ln -s -f pdata_tools $(BINDIR)/thin_rmap + ln -s -f pdata_tools $(BINDIR)/thin_show_duplicates ln -s -f pdata_tools $(BINDIR)/thin_trim ln -s -f pdata_tools $(BINDIR)/thin_metadata_size ln -s -f pdata_tools $(BINDIR)/era_check diff --git a/base/progress_monitor.cc b/base/progress_monitor.cc index d9f7e48..2cd3901 100644 --- a/base/progress_monitor.cc +++ b/base/progress_monitor.cc @@ -31,17 +31,22 @@ namespace { if (nr_equals < progress_width_) cout << '>'; + else + cout << "="; for (unsigned i = 0; i < nr_spaces; i++) cout << ' '; - cout << "] " << spinner_char() << " " << p << "%\r" << flush; + cout << "] " << spinner_char(p) << " " << p << "%\r" << flush; spinner_++; } private: - char spinner_char() const { + char spinner_char(unsigned p) const { + if (p == 100) + return ' '; + char cs[] = {'|', '/', '-', '\\'}; unsigned index = spinner_ % sizeof(cs); diff --git a/base/rolling_hash.cc b/base/rolling_hash.cc new file mode 100644 index 0000000..d2d273a --- /dev/null +++ b/base/rolling_hash.cc @@ -0,0 +1,57 @@ +#include "base/rolling_hash.h" + +using namespace base; +using namespace boost; +using namespace hash_detail; +using namespace std; + +//---------------------------------------------------------------- + +rolling_hash::rolling_hash(unsigned window_size) + : a_(MULTIPLIER), + a_to_k_minus_1_(a_), + window_size_(window_size), + buffer_(window_size) { + + for (unsigned i = 1; i < window_size_ - 1; i++) + a_to_k_minus_1_ *= a_; + + reset(); +} + +void +rolling_hash::reset() +{ + // prime with zeroes + buffer_.clear(); + + hash_ = 0; + for (unsigned i = 0; i < window_size_; i++) { + hash_ = (hash_ * a_) + SEED; + buffer_.push_back(0); + } +} + +//-------------------------------- + +content_based_hash::content_based_hash(unsigned window_size) + : rhash_(window_size), + + // FIXME: hard coded values + backup_div_((window_size / 4) - 1), + div_((window_size / 2) - 1), + min_len_(window_size / 4), + max_len_(window_size), + len_(0) +{ +} + +void +content_based_hash::reset() +{ + len_ = 0; + backup_break_.reset(); + rhash_.reset(); +} + +//---------------------------------------------------------------- diff --git a/base/rolling_hash.h b/base/rolling_hash.h new file mode 100644 index 0000000..dff3145 --- /dev/null +++ b/base/rolling_hash.h @@ -0,0 +1,109 @@ +#ifndef BASE_ROLLING_HASH_H +#define BASE_ROLLING_HASH_H + +#include +#include +#include + +//---------------------------------------------------------------- + +namespace base { + namespace hash_detail { + uint32_t const MULTIPLIER = 4294967291UL; + uint32_t const SEED = 123; + } + + class rolling_hash { + public: + rolling_hash(unsigned window_size); + + void reset(); + + // Returns the current hash + uint32_t step(uint8_t byte) { + update_hash(byte); + return hash_; + } + + uint32_t get_hash() const { + return hash_; + } + + private: + void update_hash(uint8_t byte) { + hash_ -= a_to_k_minus_1_ * (buffer_.front() + hash_detail::SEED); + buffer_.push_back(byte); + hash_ = (hash_ * a_) + byte + hash_detail::SEED; + } + + uint32_t a_; + uint32_t a_to_k_minus_1_; + + uint32_t hash_; + uint32_t window_size_; + + boost::circular_buffer buffer_; + }; + + class content_based_hash { + public: + content_based_hash(unsigned window_size); + void reset(); + + // Returns a break point relative to the last reset/break. + boost::optional step(uint8_t byte) { + boost::optional r; + + rhash_.step(byte); + len_++; + + if (len_ < min_len_) + return r; + + if (hit_break(backup_div_)) + backup_break_ = len_; + + if (hit_break(div_)) { + // found a break + r = len_; + len_ = 0; + backup_break_.reset(); + + } else if (len_ >= max_len_) { + // too big, is there a backup? + if (backup_break_) { + len_ -= *backup_break_; + r = backup_break_; + backup_break_.reset(); + + } else { + r = len_; + len_ = 0; + } + } + + return r; + } + + private: + bool hit_break(uint32_t mask) const { + uint32_t h = rhash_.get_hash() >> 8; + return !(h & mask); + } + + rolling_hash rhash_; + + uint32_t backup_div_; + uint32_t div_; + + unsigned min_len_; + unsigned max_len_; + + unsigned len_; + boost::optional backup_break_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/bin/thin_show_duplicates b/bin/thin_show_duplicates new file mode 120000 index 0000000..84c01e7 --- /dev/null +++ b/bin/thin_show_duplicates @@ -0,0 +1 @@ +pdata_tools \ No newline at end of file diff --git a/block-cache/block_cache.cc b/block-cache/block_cache.cc index 1b489ea..d130401 100644 --- a/block-cache/block_cache.cc +++ b/block-cache/block_cache.cc @@ -188,7 +188,7 @@ block_cache::wait_io() for (i = 0; i < static_cast(r); i++) { io_event const &e = events_[i]; - block *b = container_of(e.obj, &block::control_block_); + block *b = base::container_of(e.obj, &block::control_block_); if (e.res == block_size_ << SECTOR_SHIFT) complete_io(*b, 0); diff --git a/block-cache/block_cache.h b/block-cache/block_cache.h index 8c79a77..fca90ea 100644 --- a/block-cache/block_cache.h +++ b/block-cache/block_cache.h @@ -1,6 +1,8 @@ #ifndef BLOCK_CACHE_H #define BLOCK_CACHE_H +#include "base/container_of.h" + #include #include #include @@ -13,26 +15,12 @@ #include #include #include +#include namespace bi = boost::intrusive; //---------------------------------------------------------------- -// FIXME: move to own file in base -template -size_t offsetof__(const M P::*member) -{ - return (size_t) &( reinterpret_cast(0)->*member); -} - -template -P *container_of(M *ptr, M const P::*member) -{ - return (P *)((char *)(ptr) - offsetof__(member)); -} - -//---------------------------------------------------------------- - namespace bcache { typedef uint64_t block_address; typedef uint64_t sector_t; diff --git a/persistent-data/block.tcc b/persistent-data/block.tcc index 6e47a91..0824bf3 100644 --- a/persistent-data/block.tcc +++ b/persistent-data/block.tcc @@ -223,6 +223,7 @@ namespace persistent_data { unsigned max_concurrent_blocks, mode m, bool excl) + // FIXME: * BlockSize ? : fd_(open_or_create_block_file(path, nr_blocks * BlockSize, m, excl)), bc_(fd_, BlockSize >> SECTOR_SHIFT, nr_blocks, 1024u * 1024u * 16), superblock_ref_count_(0) diff --git a/persistent-data/file_utils.cc b/persistent-data/file_utils.cc index 2467079..88ee945 100644 --- a/persistent-data/file_utils.cc +++ b/persistent-data/file_utils.cc @@ -12,7 +12,7 @@ using namespace base; //---------------------------------------------------------------- persistent_data::block_address -persistent_data::get_nr_blocks(string const &path) +persistent_data::get_nr_blocks(string const &path, sector_t block_size) { using namespace persistent_data; @@ -24,7 +24,7 @@ persistent_data::get_nr_blocks(string const &path) throw runtime_error("Couldn't stat dev path"); if (S_ISREG(info.st_mode) && info.st_size) - nr_blocks = div_up(info.st_size, MD_BLOCK_SIZE); + nr_blocks = div_up(info.st_size, block_size); else if (S_ISBLK(info.st_mode)) { // To get the size of a block device we need to @@ -39,7 +39,7 @@ persistent_data::get_nr_blocks(string const &path) throw runtime_error("ioctl BLKGETSIZE64 failed"); } ::close(fd); - nr_blocks = div_down(nr_blocks, MD_BLOCK_SIZE); + nr_blocks = div_down(nr_blocks, block_size); } else // FIXME: needs a better message throw runtime_error("bad path"); diff --git a/persistent-data/file_utils.h b/persistent-data/file_utils.h index fcf203d..e641b7a 100644 --- a/persistent-data/file_utils.h +++ b/persistent-data/file_utils.h @@ -9,7 +9,7 @@ // FIXME: move to a different unit namespace persistent_data { - persistent_data::block_address get_nr_blocks(string const &path); + persistent_data::block_address get_nr_blocks(string const &path, sector_t block_size = MD_BLOCK_SIZE); block_manager<>::ptr open_bm(std::string const &dev_path, block_manager<>::mode m, bool excl = true); diff --git a/thin-provisioning/cache_stream.cc b/thin-provisioning/cache_stream.cc new file mode 100644 index 0000000..a21ed64 --- /dev/null +++ b/thin-provisioning/cache_stream.cc @@ -0,0 +1,95 @@ +#include "base/container_of.h" +#include "thin-provisioning/cache_stream.h" +#include "persistent-data/file_utils.h" + +using namespace thin_provisioning; +using namespace std; +using namespace persistent_data; + +//---------------------------------------------------------------- + +namespace { + int open_file(string const &path) { + int fd = ::open(path.c_str(), O_RDONLY | O_DIRECT | O_EXCL, 0666); + if (fd < 0) + syscall_failed("open", + "Note: you cannot run this tool with these options on live metadata."); + + return fd; + } +} + +//---------------------------------------------------------------- + +cache_stream::cache_stream(string const &path, + block_address block_size, + size_t cache_mem) + : block_size_(block_size), + nr_blocks_(get_nr_blocks(path, block_size)), + + // hack because cache uses LRU rather than MRU + cache_blocks_((cache_mem / block_size) / 2u), + fd_(open_file(path)), + v_(new bcache::noop_validator()), + cache_(new block_cache(fd_, block_size / 512, nr_blocks_, cache_mem)), + current_index_(0) { + + rewind(); +} + +block_address +cache_stream::size() const +{ + return nr_blocks_ * block_size_; +} + +void +cache_stream::rewind() +{ + current_index_ = 0; + + for (block_address i = 1; i < min(cache_blocks_, nr_blocks_); i++) + cache_->prefetch(i); +} + +bool +cache_stream::next(block_address count) +{ + current_index_ = min(current_index_ + count, nr_blocks_); + + if (current_index_ + cache_blocks_ < nr_blocks_) + cache_->prefetch(current_index_ + cache_blocks_); + + return !eof(); +} + +bool +cache_stream::eof() const +{ + return current_index_ >= nr_blocks_; +} + +chunk const & +cache_stream::get() +{ + chunk_wrapper *w = new chunk_wrapper(*this); + return w->c_; +} + +void +cache_stream::put(chunk const &c) +{ + chunk_wrapper *w = base::container_of(const_cast(&c), &chunk_wrapper::c_); + delete w; +} + +cache_stream::chunk_wrapper::chunk_wrapper(cache_stream &parent) + : block_(parent.cache_->get(parent.current_index_, 0, parent.v_)) +{ + c_.offset_ = parent.current_index_ * parent.block_size_; + c_.len_ = parent.block_size_; + c_.mem_.begin = static_cast(block_.get_data()); + c_.mem_.end = c_.mem_.begin + parent.block_size_; +} + +//---------------------------------------------------------------- diff --git a/thin-provisioning/cache_stream.h b/thin-provisioning/cache_stream.h new file mode 100644 index 0000000..b7af995 --- /dev/null +++ b/thin-provisioning/cache_stream.h @@ -0,0 +1,51 @@ +#ifndef THIN_PROVISIONING_CACHE_STREAM_H +#define THIN_PROVISIONING_CACHE_STREAM_H + +#include "thin-provisioning/chunk_stream.h" + +//---------------------------------------------------------------- + +namespace thin_provisioning { + using namespace bcache; + + class cache_stream : public chunk_stream { + public: + cache_stream(std::string const &path, + block_address block_size, + size_t cache_mem); + + block_address size() const; + + virtual void rewind(); + + virtual bool next(block_address count = 1ull); + virtual bool eof() const; + + virtual chunk const &get(); + virtual void put(chunk const &c); + + private: + struct chunk_wrapper { + chunk_wrapper(cache_stream &parent); + + block_cache::auto_block block_; + chunk c_; + }; + + friend class chunk_wrapper; + + block_address block_size_; + block_address nr_blocks_; + block_address cache_blocks_; + + int fd_; + validator::ptr v_; + std::auto_ptr cache_; + + block_address current_index_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/thin-provisioning/chunk_stream.cc b/thin-provisioning/chunk_stream.cc new file mode 100644 index 0000000..adc41d0 --- /dev/null +++ b/thin-provisioning/chunk_stream.cc @@ -0,0 +1,9 @@ +#include "thin-provisioning/chunk_stream.h" + +using namespace std; +using namespace thin_provisioning; + +//---------------------------------------------------------------- + + +//---------------------------------------------------------------- diff --git a/thin-provisioning/chunk_stream.h b/thin-provisioning/chunk_stream.h new file mode 100644 index 0000000..1831f27 --- /dev/null +++ b/thin-provisioning/chunk_stream.h @@ -0,0 +1,66 @@ +// Copyright (C) 2015 Red Hat, Inc. All rights reserved. +// +// This file is part of the thin-provisioning-tools source. +// +// thin-provisioning-tools is free software: you can redistribute it +// and/or modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// thin-provisioning-tools is distributed in the hope that it will be +// useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with thin-provisioning-tools. If not, see +// . + +#ifndef CHUNK_STREAM_H +#define CHUNK_STREAM_H + +#include "block-cache/block_cache.h" + +#include +#include + +//---------------------------------------------------------------- + +namespace thin_provisioning { + struct mem { + mem() + : begin(0), + end(0) { + } + + mem(uint8_t *b, uint8_t *e) + : begin(b), + end(e) { + } + + uint8_t *begin, *end; + }; + + struct chunk { + uint64_t offset_, len_; + mem mem_; + }; + + class chunk_stream { + public: + virtual ~chunk_stream() {} + + virtual void rewind() = 0; + virtual bcache::block_address size() const = 0; + + virtual bool next(bcache::block_address count = 1ull) = 0; + virtual bool eof() const = 0; + + virtual chunk const &get() = 0; + virtual void put(chunk const &c) = 0; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/thin-provisioning/commands.cc b/thin-provisioning/commands.cc index 62b2c60..66aca99 100644 --- a/thin-provisioning/commands.cc +++ b/thin-provisioning/commands.cc @@ -18,8 +18,9 @@ thin_provisioning::register_thin_commands(base::application &app) app.add_cmd(command::ptr(new thin_rmap_cmd())); app.add_cmd(command::ptr(new thin_trim_cmd())); -#if DEV_COMMANDS +#ifdef DEV_TOOLS app.add_cmd(command::ptr(new thin_generate_metadata_cmd())); + app.add_cmd(command::ptr(new thin_show_duplicates_cmd())); #endif } diff --git a/thin-provisioning/commands.h b/thin-provisioning/commands.h index 4ff97ea..74fbe99 100644 --- a/thin-provisioning/commands.h +++ b/thin-provisioning/commands.h @@ -70,6 +70,13 @@ namespace thin_provisioning { virtual int run(int argc, char **argv); }; + class thin_show_duplicates_cmd : public base::command { + public: + thin_show_duplicates_cmd(); + virtual void usage(std::ostream &out) const; + virtual int run(int argc, char **argv); + }; + class thin_generate_metadata_cmd : public base::command { public: thin_generate_metadata_cmd(); diff --git a/thin-provisioning/fixed_chunk_stream.cc b/thin-provisioning/fixed_chunk_stream.cc new file mode 100644 index 0000000..ea031dd --- /dev/null +++ b/thin-provisioning/fixed_chunk_stream.cc @@ -0,0 +1,113 @@ +#include "thin-provisioning/fixed_chunk_stream.h" + +using namespace thin_provisioning; + +//---------------------------------------------------------------- + +fixed_chunk_stream::fixed_chunk_stream(chunk_stream &stream, unsigned chunk_size) + : index_(0), + stream_(stream), + chunk_size_(chunk_size), + big_chunk_(0) { + next_big_chunk(); +} + +fixed_chunk_stream::~fixed_chunk_stream() +{ + put_big_chunk(); +} + +bcache::block_address +fixed_chunk_stream::size() const +{ + return stream_.size(); +} + +void +fixed_chunk_stream::rewind() +{ + // FIXME: not complete + index_ = 0; + stream_.rewind(); +} + +bool +fixed_chunk_stream::next(bcache::block_address count) +{ + while (count--) { + index_++; + advance_one(); + } + + return !eof(); +} + +bool +fixed_chunk_stream::eof() const +{ + return stream_.eof(); +} + +chunk const & +fixed_chunk_stream::get() +{ + assert(big_chunk_); + + little_chunk_.len_ = little_e_ - little_b_; + little_chunk_.offset_ = big_chunk_->offset_ + little_chunk_.len_; + + little_chunk_.mem_.begin = little_b_; + little_chunk_.mem_.end = little_e_; + + return little_chunk_; +} + +void +fixed_chunk_stream::put(chunk const &c) +{ + // noop +} + +bool +fixed_chunk_stream::next_big_chunk() +{ + put_big_chunk(); + + if (!stream_.next()) + return false; + + big_chunk_ = &stream_.get(); + little_b_ = little_e_ = last_hashed_ = big_chunk_->mem_.begin; + + return true; +} + +bool +fixed_chunk_stream::advance_one() +{ + uint8_t *big_e; + + big_e = big_chunk_->mem_.end; + little_b_ = little_e_; + + if (little_b_ >= big_e) { + if (next_big_chunk()) + big_e = big_chunk_->mem_.end; + else + return false; + } + + little_e_ += chunk_size_; + return true; +} + +void +fixed_chunk_stream::put_big_chunk() +{ + if (big_chunk_) + stream_.put(*big_chunk_); + + big_chunk_ = 0; +} + +//---------------------------------------------------------------- diff --git a/thin-provisioning/fixed_chunk_stream.h b/thin-provisioning/fixed_chunk_stream.h new file mode 100644 index 0000000..f17d15a --- /dev/null +++ b/thin-provisioning/fixed_chunk_stream.h @@ -0,0 +1,39 @@ +#ifndef THIN_PROVISIONING_FIXED_CHUNK_STREAM_H +#define THIN_PROVISIONING_FIXED_CHUNK_STREAM_H + +#include "thin-provisioning/chunk_stream.h" + +//---------------------------------------------------------------- + +namespace thin_provisioning { + class fixed_chunk_stream : public chunk_stream { + public: + fixed_chunk_stream(chunk_stream &stream, unsigned chunk_size); + ~fixed_chunk_stream(); + + virtual bcache::block_address size() const; + virtual void rewind(); + virtual bool next(bcache::block_address count = 1ull); + virtual bool eof() const; + virtual chunk const &get(); + virtual void put(chunk const &c); + + private: + bool next_big_chunk(); + bool advance_one(); + void put_big_chunk(); + + bcache::block_address index_; + + chunk_stream &stream_; + unsigned chunk_size_; + chunk const *big_chunk_; + + uint8_t *little_b_, *little_e_, *last_hashed_; + chunk little_chunk_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/thin-provisioning/pool_stream.cc b/thin-provisioning/pool_stream.cc new file mode 100644 index 0000000..41a0ab0 --- /dev/null +++ b/thin-provisioning/pool_stream.cc @@ -0,0 +1,151 @@ +// Copyright (C) 2015 Red Hat, Inc. All rights reserved. +// +// This file is part of the thin-provisioning-tools source. +// +// thin-provisioning-tools is free software: you can redistribute it +// and/or modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// thin-provisioning-tools is distributed in the hope that it will be +// useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with thin-provisioning-tools. If not, see +// . + +#include "thin-provisioning/pool_stream.h" +#include "persistent-data/data-structures/btree_damage_visitor.h" + +using namespace thin_provisioning; +using namespace persistent_data; + +//---------------------------------------------------------------- + +namespace { + class damage_visitor { + public: + virtual void visit(btree_path const &path, btree_detail::damage const &d) { + throw std::runtime_error("damage in mapping tree, please run thin_check"); + } + }; + + uint32_t const UNMAPPED = -1; +} + +//---------------------------------------------------------------- + +pool_stream::pool_stream(cache_stream &stream, + transaction_manager::ptr tm, superblock_detail::superblock const &sb, + block_address nr_blocks) + : stream_(stream), + block_to_thin_(nr_blocks, UNMAPPED), + nr_mapped_(0), + index_(0), + block_size_(sb.data_block_size_ * 512) +{ + init_rmap(tm, sb, nr_blocks); +} + +block_address +pool_stream::size() const +{ + return nr_mapped_ * block_size_; +} + +void +pool_stream::rewind() +{ + stream_.rewind(); + index_ = 0; +} + +bool +pool_stream::next(block_address count) +{ + while (count--) + if (!advance_one()) + return false; + + return true; +} + +bool +pool_stream::eof() const +{ + return stream_.eof(); +} + +chunk const & +pool_stream::get() +{ + return stream_.get(); +} + +void +pool_stream::put(chunk const &c) +{ + stream_.put(c); +} + +// FIXME: too big to return by value +vector +pool_stream::read_rmap(transaction_manager::ptr tm, + superblock_detail::superblock const &sb, + block_address nr_blocks) +{ + damage_visitor dv; + rmap_visitor rv; + + mapping_tree mtree(*tm, sb.data_mapping_root_, + mapping_tree_detail::block_traits::ref_counter(tm->get_sm())); + + rv.add_data_region(rmap_visitor::region(0, nr_blocks)); + + btree_visit_values(mtree, rv, dv); + rv.complete(); + cerr << "rmap size: " << rv.get_rmap().size() << "\n"; + return rv.get_rmap(); +} + +void +pool_stream::init_rmap(transaction_manager::ptr tm, + superblock_detail::superblock const &sb, + block_address nr_blocks) +{ + cerr << "reading rmap..."; + vector rmap = read_rmap(tm, sb, nr_blocks); + cerr << "done\n"; + + vector::const_iterator it; + set thins; + for (it = rmap.begin(); it != rmap.end(); ++it) { + rmap_region const &r = *it; + for (block_address b = r.data_begin; b != r.data_end; b++) + if (block_to_thin_[b] == UNMAPPED) { + nr_mapped_++; + block_to_thin_[b] = r.thin_dev; + } + thins.insert(r.thin_dev); + } + + cerr << nr_mapped_ << " mapped blocks\n"; + cerr << "there are " << thins.size() << " thin devices\n"; +} + +bool +pool_stream::advance_one() +{ + block_address count = 1; + + while (((index_ + count) < block_to_thin_.size()) && + (block_to_thin_[index_ + count] == UNMAPPED)) + count++; + + index_ += count; + return stream_.next(count); +} + +//---------------------------------------------------------------- diff --git a/thin-provisioning/pool_stream.h b/thin-provisioning/pool_stream.h new file mode 100644 index 0000000..e419842 --- /dev/null +++ b/thin-provisioning/pool_stream.h @@ -0,0 +1,65 @@ +// Copyright (C) 2015 Red Hat, Inc. All rights reserved. +// +// This file is part of the thin-provisioning-tools source. +// +// thin-provisioning-tools is free software: you can redistribute it +// and/or modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// thin-provisioning-tools is distributed in the hope that it will be +// useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with thin-provisioning-tools. If not, see +// . + +#ifndef POOL_STREAM_H +#define POOL_STREAM_H + +#include "thin-provisioning/cache_stream.h" +#include "thin-provisioning/rmap_visitor.h" +#include "thin-provisioning/superblock.h" + +//---------------------------------------------------------------- + +namespace thin_provisioning { + class pool_stream : public chunk_stream { + public: + pool_stream(cache_stream &stream, + transaction_manager::ptr tm, superblock_detail::superblock const &sb, + block_address nr_blocks); + + block_address size() const; + void rewind(); + bool next(block_address count = 1ull); + bool eof() const; + + chunk const &get(); + void put(chunk const &c); + + private: + typedef rmap_visitor::region region; + typedef rmap_visitor::rmap_region rmap_region; + + // FIXME: too big to return by value + vector read_rmap(transaction_manager::ptr tm, + superblock_detail::superblock const &sb, + block_address nr_blocks); + void init_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb, + block_address nr_blocks); + bool advance_one(); + + cache_stream &stream_; + vector block_to_thin_; + block_address nr_mapped_; + block_address index_; + block_address block_size_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc new file mode 100644 index 0000000..6c1dc27 --- /dev/null +++ b/thin-provisioning/thin_show_duplicates.cc @@ -0,0 +1,358 @@ +// Copyright (C) 2015 Red Hat, Inc. All rights reserved. +// +// This file is part of the thin-provisioning-tools source. +// +// thin-provisioning-tools is free software: you can redistribute it +// and/or modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// thin-provisioning-tools is distributed in the hope that it will be +// useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with thin-provisioning-tools. If not, see +// . + +#include +#include +#include + +#include "version.h" + +#include "base/application.h" +#include "base/error_state.h" +#include "base/progress_monitor.h" +#include "persistent-data/data-structures/btree_damage_visitor.h" +#include "persistent-data/file_utils.h" +#include "persistent-data/space-maps/core.h" +#include "persistent-data/space-maps/disk.h" +#include "thin-provisioning/cache_stream.h" +#include "thin-provisioning/fixed_chunk_stream.h" +#include "thin-provisioning/pool_stream.h" +#include "thin-provisioning/commands.h" +#include "thin-provisioning/device_tree.h" +#include "thin-provisioning/mapping_tree.h" +#include "thin-provisioning/rmap_visitor.h" +#include "thin-provisioning/superblock.h" +#include "thin-provisioning/variable_chunk_stream.h" + +#include +#include +#include +#include +#include + +using namespace base; +using namespace boost; +using namespace persistent_data; +using namespace std; +using namespace thin_provisioning; + +//---------------------------------------------------------------- + +namespace { + bool factor_of(block_address f, block_address n) { + return (n % f) == 0; + } + + block_manager<>::ptr + open_bm(string const &path) { + block_address nr_blocks = get_nr_blocks(path); + block_manager<>::mode m = block_manager<>::READ_ONLY; + return block_manager<>::ptr(new block_manager<>(path, nr_blocks, 1, m)); + } + + transaction_manager::ptr + open_tm(block_manager<>::ptr bm) { + space_map::ptr sm(new core_map(bm->get_nr_blocks())); + sm->inc(superblock_detail::SUPERBLOCK_LOCATION); + transaction_manager::ptr tm(new transaction_manager(bm, sm)); + return tm; + } + + uint64_t parse_int(string const &str, string const &desc) { + try { + return boost::lexical_cast(str); + + } catch (...) { + ostringstream out; + out << "Couldn't parse " << desc << ": '" << str << "'"; + exit(1); + } + + return 0; // never get here + } + + //-------------------------------- + + struct flags { + flags() + : cache_mem(64 * 1024 * 1024), + content_based_chunks(false) { + } + + string data_dev; + optional metadata_dev; + optional block_size; + unsigned cache_mem; + bool content_based_chunks; + }; + + using namespace mapping_tree_detail; + + class duplicate_counter { + public: + duplicate_counter() + : non_zero_dups_(0), + zero_dups_(0) { + } + + void add_duplicate(block_address len) { + non_zero_dups_ += len; + } + + void add_zero_duplicate(block_address len) { + zero_dups_ += len; + } + + block_address get_total() const { + return non_zero_dups_ + zero_dups_; + } + + block_address get_non_zeroes() const { + return non_zero_dups_; + } + + block_address get_zeroes() const { + return zero_dups_; + } + + void display_results(chunk_stream const &stream) const { + block_address meg = 1024 * 1024; + cout << "\n\n" + << stream.size() / meg << "m examined, " + << get_non_zeroes() / meg << "m duplicates, " + << get_zeroes() / meg << "m zeroes\n"; + } + + private: + block_address non_zero_dups_; + block_address zero_dups_; + }; + + class duplicate_detector { + public: + void scan_with_variable_sized_chunks(chunk_stream &stream) { + variable_chunk_stream vstream(stream, 4096); + scan(vstream); + } + + void scan_with_fixed_sized_chunks(chunk_stream &stream, block_address chunk_size) { + fixed_chunk_stream fstream(stream, chunk_size); + scan(fstream); + } + + duplicate_counter const &get_results() const { + return results_; + } + + private: + void scan(chunk_stream &stream) { + block_address total_seen(0); + unique_ptr pbar = create_progress_bar("Examining data"); + + do { + // FIXME: use a wrapper class to automate the put() + chunk const &c = stream.get(); + examine(c); + stream.put(c); + + total_seen += c.len_; + pbar->update_percent((total_seen * 100) / stream.size()); + + } while (stream.next()); + + pbar->update_percent(100); + results_.display_results(stream); + } + + void examine(chunk const &c) { + if (all_zeroes(c)) + results_.add_zero_duplicate(c.len_); + + else { + digestor_.reset(); + digestor_.process_bytes(c.mem_.begin, c.mem_.end - c.mem_.begin); + + unsigned int digest[5]; + digestor_.get_digest(digest); + + // hack + vector v(5); + for (unsigned i = 0; i < 5; i++) + v[i] = digest[i]; + + fingerprint_map::const_iterator it = fm_.find(v); + if (it != fm_.end()) { + results_.add_duplicate(c.len_); + } else + fm_.insert(make_pair(v, c.offset_)); + } + } + + bool all_zeroes(chunk const &c) const { + for (uint8_t *ptr = c.mem_.begin; ptr != c.mem_.end; ptr++) { + if (*ptr != 0) + return false; + } + + return true; + } + + typedef map, block_address> fingerprint_map; + + unsigned block_size_; + boost::uuids::detail::sha1 digestor_; + fingerprint_map fm_; + duplicate_counter results_; + }; + + int show_dups_pool(flags const &fs) { + block_manager<>::ptr bm = open_bm(*fs.metadata_dev); + transaction_manager::ptr tm = open_tm(bm); + superblock_detail::superblock sb = read_superblock(bm); + block_address block_size = sb.data_block_size_ * 512; + block_address nr_blocks = get_nr_blocks(fs.data_dev, block_size); + + cache_stream stream(fs.data_dev, block_size, fs.cache_mem); + pool_stream pstream(stream, tm, sb, nr_blocks); + + duplicate_detector detector; + + if (fs.content_based_chunks) + detector.scan_with_variable_sized_chunks(pstream); + else { + if (*fs.block_size) { + if (factor_of(*fs.block_size, block_size)) + block_size = *fs.block_size; + else + throw runtime_error("specified block size is not a factor of the pool chunk size\n"); + } + + detector.scan_with_fixed_sized_chunks(pstream, block_size); + } + + return 0; + } + + int show_dups_linear(flags const &fs) { + if (!fs.block_size) + // FIXME: this check should be moved to the switch parsing + throw runtime_error("--block-sectors or --metadata-dev must be supplied"); + + block_address block_size = *fs.block_size; + block_address nr_blocks = get_nr_blocks(fs.data_dev, *fs.block_size); + + cerr << "path = " << fs.data_dev << "\n"; + cerr << "nr_blocks = " << nr_blocks << "\n"; + cerr << "block size = " << block_size << "\n"; + + cache_stream stream(fs.data_dev, block_size, fs.cache_mem); + duplicate_detector dd; + + if (fs.content_based_chunks) + dd.scan_with_variable_sized_chunks(stream); + else + dd.scan_with_fixed_sized_chunks(stream, block_size); + + return 0; + } + + int show_dups(flags const &fs) { + if (fs.metadata_dev) + return show_dups_pool(fs); + else { + cerr << "No metadata device provided, so treating data device as a linear device\n"; + return show_dups_linear(fs); + } + } +} + +//---------------------------------------------------------------- + +thin_show_duplicates_cmd::thin_show_duplicates_cmd() + : command("thin_show_duplicates") +{ +} + +void +thin_show_duplicates_cmd::usage(std::ostream &out) const +{ + out << "Usage: " << get_name() << " [options] {device|file}\n" + << "Options:\n" + << " {--block-sectors} \n" + << " {--content-based-chunks}\n" + << " {--metadata-dev} \n" + << " {-h|--help}\n" + << " {-V|--version}" << endl; +} + +int +thin_show_duplicates_cmd::run(int argc, char **argv) +{ + int c; + flags fs; + + char const shortopts[] = "qhV"; + option const longopts[] = { + { "block-sectors", required_argument, NULL, 1}, + { "content-based-chunks", no_argument, NULL, 2}, + { "metadata-dev", required_argument, NULL, 3}, + { "help", no_argument, NULL, 'h'}, + { "version", no_argument, NULL, 'V'}, + { NULL, no_argument, NULL, 0 } + }; + + while ((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) { + switch(c) { + case 'h': + usage(cout); + return 0; + + case 'V': + cout << THIN_PROVISIONING_TOOLS_VERSION << endl; + return 0; + + case 1: + fs.block_size = 512 * parse_int(optarg, "block sectors"); + break; + + case 2: + fs.content_based_chunks = true; + break; + + case 3: + fs.metadata_dev = optarg; + break; + + default: + usage(cerr); + return 1; + } + } + + if (argc == optind) { + cerr << "No data device/file provided." << endl; + usage(cerr); + exit(1); + } + + fs.data_dev = argv[optind]; + + return show_dups(fs); +} + +//---------------------------------------------------------------- diff --git a/thin-provisioning/variable_chunk_stream.cc b/thin-provisioning/variable_chunk_stream.cc new file mode 100644 index 0000000..f572db7 --- /dev/null +++ b/thin-provisioning/variable_chunk_stream.cc @@ -0,0 +1,133 @@ +#include "thin-provisioning/variable_chunk_stream.h" + +using namespace boost; +using namespace std; +using namespace thin_provisioning; + +//---------------------------------------------------------------- + +variable_chunk_stream::variable_chunk_stream(chunk_stream &stream, unsigned window_size) + : index_(0), + h_(window_size), + stream_(stream), + big_chunk_(0) { + next_big_chunk(); +} + +variable_chunk_stream::~variable_chunk_stream() +{ + put_big_chunk(); +} + +bcache::block_address +variable_chunk_stream::size() const +{ + return stream_.size(); +} + +void +variable_chunk_stream::rewind() +{ + // FIXME: not complete + index_ = 0; + stream_.rewind(); + h_.reset(); +} + +bool +variable_chunk_stream::next(bcache::block_address count) +{ + while (count--) { + index_++; + advance_one(); + } + + return !eof(); +} + +bool +variable_chunk_stream::eof() const +{ + return stream_.eof(); +} + +chunk const & +variable_chunk_stream::get() +{ + assert(big_chunk_); + + little_chunk_.len_ = little_e_ - little_b_; + little_chunk_.offset_ = big_chunk_->offset_ + little_chunk_.len_; + + little_chunk_.mem_.begin = little_b_; + little_chunk_.mem_.end = little_e_; + + return little_chunk_; +} + +void +variable_chunk_stream::put(chunk const &c) +{ + // noop +} + +bool +variable_chunk_stream::next_big_chunk() +{ + put_big_chunk(); + + if (!stream_.next()) + return false; + + big_chunk_ = &stream_.get(); + little_b_ = little_e_ = last_hashed_ = big_chunk_->mem_.begin; + h_.reset(); + + return true; +} + +bool +variable_chunk_stream::advance_one() +{ + uint8_t *big_e; + + big_e = big_chunk_->mem_.end; + little_b_ = little_e_; + little_e_ = last_hashed_; + + if (little_b_ == big_e) { + if (next_big_chunk()) + big_e = big_chunk_->mem_.end; + else + return false; + } + + while (little_e_ != big_e) { + optional maybe_break = h_.step(*little_e_); + little_e_++; + + if (maybe_break) { + // The break is not neccessarily at the current + // byte. + last_hashed_ = little_e_; + little_e_ = little_b_ + *maybe_break; + break; + } + } + + if (little_e_ == big_e) + last_hashed_ = little_e_; + + return true; +} + +void +variable_chunk_stream::put_big_chunk() +{ + if (big_chunk_) + stream_.put(*big_chunk_); + + big_chunk_ = 0; +} + +//---------------------------------------------------------------- diff --git a/thin-provisioning/variable_chunk_stream.h b/thin-provisioning/variable_chunk_stream.h new file mode 100644 index 0000000..cc62945 --- /dev/null +++ b/thin-provisioning/variable_chunk_stream.h @@ -0,0 +1,41 @@ +#ifndef THIN_PROVISIONING_VARIABLE_CHUNK_STREAM_H +#define THIN_PROVISIONING_VARIABLE_CHUNK_STREAM_H + +#include "base/rolling_hash.h" +#include "thin-provisioning/chunk_stream.h" + +//---------------------------------------------------------------- + +namespace thin_provisioning { + class variable_chunk_stream : public chunk_stream { + public: + // window_size must be a power of 2 + variable_chunk_stream(chunk_stream &stream, unsigned window_size); + ~variable_chunk_stream(); + + virtual bcache::block_address size() const; + virtual void rewind(); + virtual bool next(bcache::block_address count = 1ull); + virtual bool eof() const; + virtual chunk const &get(); + virtual void put(chunk const &c); + + private: + bool next_big_chunk(); + bool advance_one(); + void put_big_chunk(); + + bcache::block_address index_; + base::content_based_hash h_; + + chunk_stream &stream_; + chunk const *big_chunk_; + + uint8_t *little_b_, *little_e_, *last_hashed_; + chunk little_chunk_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/unit-tests/Makefile.in b/unit-tests/Makefile.in index 9307e5f..38f3d04 100644 --- a/unit-tests/Makefile.in +++ b/unit-tests/Makefile.in @@ -59,6 +59,7 @@ TEST_SOURCE=\ unit-tests/endian_t.cc \ unit-tests/error_state_t.cc \ unit-tests/rmap_visitor_t.cc \ + unit-tests/rolling_hash_t.cc \ unit-tests/run_set_t.cc \ unit-tests/space_map_t.cc \ unit-tests/span_iterator_t.cc \ diff --git a/unit-tests/rolling_hash_t.cc b/unit-tests/rolling_hash_t.cc new file mode 100644 index 0000000..c25b650 --- /dev/null +++ b/unit-tests/rolling_hash_t.cc @@ -0,0 +1,153 @@ +#include "gmock/gmock.h" + +#include "base/rolling_hash.h" + +using namespace base; +using namespace boost; +using namespace std; +using namespace testing; + +//---------------------------------------------------------------- + +namespace { + class RollingHashTests : public Test { + public: + RollingHashTests() + : window_size_(4096), + rhash_(window_size_) { + } + + typedef vector bytes; + bytes random_bytes(unsigned count) { + bytes v(count, 0); + + for (unsigned i = 0; i < count; i++) + v[i] = random_byte(); + + return v; + } + + uint8_t random_byte() const { + return random() % 256; + } + + void apply_bytes(bytes const &bs) { + for (unsigned i = 0; i < bs.size(); i++) + rhash_.step(bs[i]); + } + + unsigned window_size_; + rolling_hash rhash_; + }; + + class ContentBasedHashTests : public Test { + public: + ContentBasedHashTests() + : window_size_(8192), + h_(window_size_) { + } + + typedef vector bytes; + bytes random_bytes(unsigned count) { + bytes v(count, 0); + + for (unsigned i = 0; i < count; i++) + v[i] = random_byte(); + + return v; + } + + uint8_t random_byte() const { + return random() % 256; + } + + unsigned window_size_; + content_based_hash h_; + }; +} + +//---------------------------------------------------------------- + +TEST_F(RollingHashTests, ctr) +{ +} + +//-------------------------------- + +TEST_F(RollingHashTests, hash_changes) +{ + bytes bs = random_bytes(window_size_ * 100); + + uint32_t prev = rhash_.get_hash(); + for (unsigned i = 0; i < bs.size(); i++) { + rhash_.step(bs[i]); + ASSERT_NE(rhash_.get_hash(), prev); + prev = rhash_.get_hash(); + } +} + +TEST_F(RollingHashTests, hash_repeats) +{ + bytes bs = random_bytes(window_size_); + + apply_bytes(bs); + uint32_t h1 = rhash_.get_hash(); + apply_bytes(bs); + + ASSERT_EQ(rhash_.get_hash(), h1); +} + +TEST_F(RollingHashTests, reset_is_deterministic) +{ + uint8_t bytes[] = "lksdfuwerh,sdg"; + + for (unsigned i = 0; i < sizeof(bytes) - 1; i++) + rhash_.step(bytes[i]); + + uint32_t h1 = rhash_.get_hash(); + + rhash_.reset(); + + for (unsigned i = 0; i < sizeof(bytes) - 1; i++) + rhash_.step(bytes[i]); + + uint32_t h2 = rhash_.get_hash(); + + ASSERT_EQ(h1, h2); +} + +//---------------------------------------------------------------- + +TEST_F(ContentBasedHashTests, ctr) +{ +} + +TEST_F(ContentBasedHashTests, chunk_limits_respected) +{ + unsigned min = 100000, max = 0; + + bytes bs = random_bytes(1024 * 1024 * 100); + vector counts(window_size_, 0); + + for (unsigned i = 0; i < bs.size(); i++) { + optional b = h_.step(bs[i]); + if (b) { + counts[*b]++; + + if (*b < min) + min = *b; + + if (*b > max) + max = *b; + } + } + +#if 1 + for (unsigned i = 0; i < counts.size(); i++) + cerr << i << ": " << counts[i] << "\n"; + + cerr << "min: " << min << ", max: " << max << "\n"; +#endif +} + +//----------------------------------------------------------------