[thin_show_dups] variable_chunk_stream

This commit is contained in:
Joe Thornber
2015-09-03 13:02:29 +01:00
parent d44a817c60
commit 750ce0f47b
14 changed files with 709 additions and 104 deletions

View File

@@ -32,9 +32,8 @@ cache_stream::cache_stream(string const &path,
v_(new bcache::noop_validator()),
cache_(new block_cache(fd_, block_size / 512, nr_blocks_, cache_mem)),
current_index_(0) {
load(0);
for (block_address i = 1; i < min(cache_blocks_, nr_blocks_); i++)
cache_->prefetch(i);
rewind();
}
block_address
@@ -46,19 +45,27 @@ cache_stream::nr_chunks() const
void
cache_stream::rewind()
{
load(0);
current_index_ = 0;
for (block_address i = 1; i < min(cache_blocks_, nr_blocks_); i++)
cache_->prefetch(i);
}
bool
cache_stream::advance(block_address count)
cache_stream::next(block_address count)
{
if (current_index_ + count >= nr_blocks_)
return false;
current_index_ = min(current_index_ + count, nr_blocks_);
current_index_ += count;
if (current_index_ + cache_blocks_ < nr_blocks_)
cache_->prefetch(current_index_ + cache_blocks_);
load(current_index_);
return true;
return !eof();
}
bool
cache_stream::eof() const
{
return current_index_ >= nr_blocks_;
}
block_address
@@ -68,24 +75,26 @@ cache_stream::index() const
}
chunk const &
cache_stream::get() const
cache_stream::get()
{
return current_chunk_;
chunk_wrapper *w = new chunk_wrapper(*this);
return w->c_;
}
void
cache_stream::load(block_address b)
cache_stream::put(chunk const &c)
{
current_index_ = b;
current_block_ = cache_->get(current_index_, 0, v_);
chunk_wrapper *w = container_of(const_cast<chunk *>(&c), chunk_wrapper, c_);
delete w;
}
current_chunk_.offset_sectors_ = (b * block_size_) / 512;
current_chunk_.mem_.clear();
current_chunk_.mem_.push_back(mem(static_cast<uint8_t *>(current_block_.get_data()),
static_cast<uint8_t *>(current_block_.get_data()) + block_size_));
if (current_index_ + cache_blocks_ < nr_blocks_)
cache_->prefetch(current_index_ + cache_blocks_);
cache_stream::chunk_wrapper::chunk_wrapper(cache_stream &parent)
: block_(parent.cache_->get(parent.current_index_, 0, parent.v_))
{
c_.offset_ = parent.current_index_ * parent.block_size_;
c_.len_ = parent.block_size_;
c_.mem_.push_back(mem(static_cast<uint8_t *>(block_.get_data()),
static_cast<uint8_t *>(block_.get_data()) + parent.block_size_));
}
//----------------------------------------------------------------

View File

@@ -14,25 +14,36 @@ namespace thin_provisioning {
block_address block_size,
size_t cache_mem);
virtual block_address nr_chunks() const;
block_address nr_chunks() const;
virtual void rewind();
virtual bool advance(block_address count = 1ull);
virtual block_address index() const;
virtual chunk const &get() const;
virtual bool next(block_address count = 1ull);
virtual bool eof() const;
virtual chunk const &get();
virtual void put(chunk const &c);
private:
void load(block_address b);
struct chunk_wrapper {
chunk_wrapper(cache_stream &parent);
block_cache::auto_block block_;
chunk c_;
};
friend class chunk_wrapper;
block_address block_size_;
block_address nr_blocks_;
block_address cache_blocks_;
int fd_;
validator::ptr v_;
std::auto_ptr<block_cache> cache_;
block_address current_index_;
block_cache::auto_block current_block_;
chunk current_chunk_;
};
}

View File

@@ -0,0 +1,23 @@
#include "thin-provisioning/chunk_stream.h"
using namespace std;
using namespace thin_provisioning;
//----------------------------------------------------------------
uint8_t
chunk::operator[](uint64_t n) const
{
std::deque<mem>::const_iterator it;
for (it = mem_.begin(); it != mem_.end(); it++) {
uint64_t mem_len = it->end - it->begin;
if (n > mem_len)
n -= mem_len;
else
return it->begin[n];
}
throw runtime_error("chunk out of bounds");
}
//----------------------------------------------------------------

View File

@@ -37,21 +37,24 @@ namespace thin_provisioning {
};
struct chunk {
// FIXME: switch to bytes rather than sectors
// FIXME: add length too
uint64_t offset_sectors_;
uint64_t offset_, len_;
std::deque<mem> mem_;
uint8_t operator[](uint64_t n) const;
};
class chunk_stream {
public:
virtual ~chunk_stream() {}
virtual bcache::block_address nr_chunks() const = 0;
virtual void rewind() = 0;
virtual bool advance(bcache::block_address count = 1ull) = 0;
virtual bcache::block_address index() const = 0;
virtual chunk const &get() const = 0;
virtual bool next(bcache::block_address count = 1ull) = 0;
virtual bool eof() const = 0;
virtual chunk const &get() = 0;
virtual void put(chunk const &c) = 0;
};
}

View File

@@ -60,7 +60,7 @@ pool_stream::rewind()
}
bool
pool_stream::advance(block_address count)
pool_stream::next(block_address count)
{
while (count--)
if (!advance_one())
@@ -69,6 +69,12 @@ pool_stream::advance(block_address count)
return true;
}
bool
pool_stream::eof() const
{
return stream_.eof();
}
block_address
pool_stream::index() const
{
@@ -76,12 +82,16 @@ pool_stream::index() const
}
chunk const &
pool_stream::get() const
pool_stream::get()
{
return stream_.get();
}
void
pool_stream::put(chunk const &c)
{
stream_.put(c);
}
// FIXME: too big to return by value
vector<pool_stream::rmap_region>
@@ -140,7 +150,7 @@ pool_stream::advance_one()
if (new_index >= nr_chunks())
return false;
return stream_.advance(new_index - index());
return stream_.next(new_index - index());
}
//----------------------------------------------------------------

View File

@@ -34,16 +34,20 @@ namespace thin_provisioning {
block_address nr_chunks() const;
void rewind();
bool advance(block_address count = 1ull);
bool next(block_address count = 1ull);
bool eof() const;
block_address index() const;
chunk const &get() const;
chunk const &get();
void put(chunk const &c);
private:
typedef rmap_visitor::region region;
typedef rmap_visitor::rmap_region rmap_region;
// FIXME: too big to return by value
vector<rmap_region> read_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb,
vector<rmap_region> read_rmap(transaction_manager::ptr tm,
superblock_detail::superblock const &sb,
block_address nr_blocks);
void init_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb,
block_address nr_blocks);

View File

@@ -36,6 +36,7 @@
#include "thin-provisioning/mapping_tree.h"
#include "thin-provisioning/rmap_visitor.h"
#include "thin-provisioning/superblock.h"
#include "thin-provisioning/variable_chunk_stream.h"
#include <boost/uuid/sha1.hpp>
#include <boost/lexical_cast.hpp>
@@ -98,25 +99,21 @@ namespace {
unsigned cache_mem;
};
// FIXME: introduce abstraction for a stream of segments
using namespace mapping_tree_detail;
class duplicate_counter {
public:
duplicate_counter(block_address nr_blocks)
: counts_(nr_blocks),
non_zero_dups_(0),
duplicate_counter()
: non_zero_dups_(0),
zero_dups_(0) {
}
void add_duplicate(block_address b1, block_address b2) {
non_zero_dups_++;
counts_[b1]++;
void add_duplicate(block_address len) {
non_zero_dups_ += len;
}
void add_zero_duplicate(block_address b) {
zero_dups_++;
void add_zero_duplicate(block_address len) {
zero_dups_ += len;
}
block_address get_total() const {
@@ -132,45 +129,35 @@ namespace {
}
private:
vector<block_address> counts_;
block_address non_zero_dups_;
block_address zero_dups_;
};
class duplicate_detector {
public:
duplicate_detector(unsigned block_size, block_address nr_blocks)
: block_size_(block_size),
results_(nr_blocks),
zero_fingerprint_(5, 0ull) {
calc_zero_fingerprint();
}
void examine(chunk const &c) {
digestor_.reset();
for (deque<mem>::const_iterator it = c.mem_.begin(); it != c.mem_.end(); it++)
digestor_.process_bytes(it->begin, it->end - it->begin);
unsigned int digest[5];
digestor_.get_digest(digest);
// hack
vector<unsigned int> v(5);
for (unsigned i = 0; i < 5; i++)
v[i] = digest[i];
block_address index = (c.offset_sectors_ * 512) / block_size_;
if (v == zero_fingerprint_)
results_.add_zero_duplicate(index);
if (all_zeroes(c))
results_.add_zero_duplicate(c.len_);
else {
digestor_.reset();
for (deque<mem>::const_iterator it = c.mem_.begin(); it != c.mem_.end(); it++)
digestor_.process_bytes(it->begin, it->end - it->begin);
unsigned int digest[5];
digestor_.get_digest(digest);
// hack
vector<unsigned int> v(5);
for (unsigned i = 0; i < 5; i++)
v[i] = digest[i];
fingerprint_map::const_iterator it = fm_.find(v);
if (it != fm_.end()) {
results_.add_duplicate(it->second, index);
results_.add_duplicate(c.len_);
} else
fm_.insert(make_pair(v, index));
fm_.insert(make_pair(v, c.offset_));
}
}
@@ -178,30 +165,24 @@ namespace {
return results_;
}
void calc_zero_fingerprint() {
auto_ptr<uint8_t> bytes(new uint8_t[block_size_]);
memset(bytes.get(), 0, block_size_);
private:
bool all_zeroes(chunk const &c) const {
for (deque<mem>::const_iterator it = c.mem_.begin(); it != c.mem_.end(); it++) {
for (uint8_t *ptr = it->begin; ptr != it->end; ptr++) {
if (*ptr != 0)
return false;
}
}
digestor_.reset();
digestor_.process_bytes(bytes.get(), block_size_);
unsigned int digest[5];
digestor_.get_digest(digest);
// hack
for (unsigned i = 0; i < 5; i++)
zero_fingerprint_[i] = digest[i];
return true;
}
private:
typedef map<vector<unsigned int>, block_address> fingerprint_map;
unsigned block_size_;
boost::uuids::detail::sha1 digestor_;
fingerprint_map fm_;
duplicate_counter results_;
vector<unsigned int> zero_fingerprint_;
};
int show_dups_pool(flags const &fs) {
@@ -218,15 +199,16 @@ namespace {
cache_stream stream(fs.data_dev, block_size, fs.cache_mem);
pool_stream pstream(stream, tm, sb, nr_blocks);
duplicate_detector detector(block_size, nr_blocks);
duplicate_detector detector;
auto_ptr<progress_monitor> pbar = create_progress_bar("Examining data");
do {
chunk const &c = pstream.get();
detector.examine(c);
pstream.put(c);
pbar->update_percent((pstream.index() * 100) / pstream.nr_chunks());
} while (pstream.advance());
} while (pstream.next());
pbar->update_percent(100);
cout << "\n\ntotal dups: " << detector.get_results().get_total() << endl;
@@ -247,24 +229,27 @@ namespace {
cerr << "nr_blocks = " << nr_blocks << "\n";
cerr << "block size = " << block_size << "\n";
cache_stream stream(fs.data_dev, block_size, fs.cache_mem);
duplicate_detector detector(block_size, nr_blocks);
cache_stream low_level_stream(fs.data_dev, block_size, fs.cache_mem);
variable_chunk_stream stream(low_level_stream, 4096);
duplicate_detector detector;
auto_ptr<progress_monitor> pbar = create_progress_bar("Examining data");
do {
// FIXME: use a wrapper class to automate the put()
chunk const &c = stream.get();
detector.examine(c);
pbar->update_percent((stream.index() * 100) / stream.nr_chunks());
stream.put(c);
// pbar->update_percent((stream.index() * 100) / stream.nr_chunks());
} while (stream.advance());
} while (stream.next());
pbar->update_percent(100);
cout << "\n\ntotal dups: " << detector.get_results().get_total() << endl;
cout << (detector.get_results().get_total() * 100) / nr_blocks << "% duplicates\n";
duplicate_counter r = detector.get_results();
cout << "\n\nchunks\tnon zero dups\tzero dups\n"
<< nr_blocks << "\t" << r.get_non_zeroes() << "\t" << r.get_zeroes() << "\n";
block_address meg = 1024 * 1024;
cout << "\n\n"
<< (nr_blocks * block_size) / meg << "m examined, "
<< r.get_non_zeroes() / meg << "m duplicates, "
<< r.get_zeroes() / meg << "m zeroes\n";
return 0;
}

View File

@@ -0,0 +1,152 @@
#include "thin-provisioning/variable_chunk_stream.h"
using namespace boost;
using namespace std;
using namespace thin_provisioning;
//----------------------------------------------------------------
variable_chunk_stream::variable_chunk_stream(chunk_stream &stream, unsigned window_size)
: index_(0),
h_(window_size),
stream_(stream),
big_chunk_(0) {
next_big_chunk();
}
variable_chunk_stream::~variable_chunk_stream()
{
put_big_chunk();
}
void
variable_chunk_stream::rewind()
{
// FIXME: not complete
index_ = 0;
stream_.rewind();
h_.reset();
}
bool
variable_chunk_stream::next(bcache::block_address count)
{
while (count--) {
index_++;
advance_one();
}
return !eof();
}
bool
variable_chunk_stream::eof() const
{
return stream_.eof();
}
bcache::block_address
variable_chunk_stream::index() const
{
return index_;
}
chunk const &
variable_chunk_stream::get()
{
assert(big_chunk_);
little_chunk_.len_ = little_e_ - little_b_;
little_chunk_.offset_ = big_chunk_->offset_ + little_chunk_.len_;
little_chunk_.mem_.clear();
little_chunk_.mem_.push_back(mem(little_b_, little_e_));
return little_chunk_;
}
void
variable_chunk_stream::put(chunk const &c)
{
// noop
}
bool
variable_chunk_stream::next_big_chunk()
{
put_big_chunk();
if (!stream_.next())
return false;
big_chunk_ = &stream_.get();
little_b_ = little_e_ = big_chunk_->mem_.front().begin;
h_.reset();
return true;
}
bool
variable_chunk_stream::advance_one()
{
uint8_t *big_e;
assert(big_chunk_);
big_e = big_chunk_->mem_.front().end;
little_b_ = little_e_;
if (little_b_ == big_e) {
if (next_big_chunk())
big_e = big_chunk_->mem_.front().end;
else
return false;
}
assert(little_e_ >= big_chunk_->mem_.front().begin);
assert(little_b_ >= big_chunk_->mem_.front().begin);
#if 1
if (little_e_ > big_e) {
cerr << "before -- little_e_: " << (void *) little_e_ << ", big_e: " << (void *) big_e << "\n";
}
#endif
assert(little_e_ <= big_e);
assert(little_b_ <= big_e);
while (little_e_ != big_e) {
optional<unsigned> maybe_break = h_.step(*little_e_);
if (maybe_break) {
// The break is not neccessarily at the current
// byte.
little_e_ = little_b_ + *maybe_break;
break;
}
little_e_++;
}
assert(little_e_ >= big_chunk_->mem_.front().begin);
assert(little_b_ >= big_chunk_->mem_.front().begin);
#if 1
if (little_e_ > big_e) {
cerr << "after -- little_e_: " << (void *) little_e_ << ", big_e: " << (void *) big_e << "\n";
}
#endif
assert(little_e_ <= big_e);
assert(little_b_ <= big_e);
return true;
}
void
variable_chunk_stream::put_big_chunk()
{
if (big_chunk_)
stream_.put(*big_chunk_);
big_chunk_ = 0;
}
//----------------------------------------------------------------

View File

@@ -0,0 +1,42 @@
#ifndef THIN_PROVISIONING_VARIABLE_CHUNK_STREAM_H
#define THIN_PROVISIONING_VARIABLE_CHUNK_STREAM_H
#include "base/rolling_hash.h"
#include "thin-provisioning/chunk_stream.h"
//----------------------------------------------------------------
namespace thin_provisioning {
class variable_chunk_stream : public chunk_stream {
public:
// window_size must be a power of 2
variable_chunk_stream(chunk_stream &stream, unsigned window_size);
~variable_chunk_stream();
// FIXME: we don't know in advance how many chunks we will have
virtual void rewind();
virtual bool next(bcache::block_address count = 1ull);
virtual bool eof() const;
virtual bcache::block_address index() const;
virtual chunk const &get();
virtual void put(chunk const &c);
private:
bool next_big_chunk();
bool advance_one();
void put_big_chunk();
bcache::block_address index_;
base::content_based_hash h_;
chunk_stream &stream_;
chunk const *big_chunk_;
uint8_t *little_b_, *little_e_;
chunk little_chunk_;
};
}
//----------------------------------------------------------------
#endif