diff --git a/Makefile.in b/Makefile.in index 0dd6c99..d09c873 100644 --- a/Makefile.in +++ b/Makefile.in @@ -31,6 +31,7 @@ SOURCE=\ base/error_state.cc \ base/error_string.cc \ base/progress_monitor.cc \ + base/rolling_hash.cc \ base/xml_utils.cc \ block-cache/block_cache.cc \ caching/cache_check.cc \ @@ -72,6 +73,7 @@ SOURCE=\ persistent-data/transaction_manager.cc \ persistent-data/validators.cc \ thin-provisioning/cache_stream.cc \ + thin-provisioning/chunk_stream.cc \ thin-provisioning/device_tree.cc \ thin-provisioning/human_readable_format.cc \ thin-provisioning/mapping_tree.cc \ @@ -92,6 +94,7 @@ SOURCE=\ thin-provisioning/thin_rmap.cc \ thin-provisioning/thin_show_duplicates.cc \ thin-provisioning/thin_trim.cc \ + thin-provisioning/variable_chunk_stream.cc \ thin-provisioning/xml_format.cc CC:=@CC@ diff --git a/base/rolling_hash.cc b/base/rolling_hash.cc new file mode 100644 index 0000000..1ea362f --- /dev/null +++ b/base/rolling_hash.cc @@ -0,0 +1,148 @@ +#include "base/rolling_hash.h" + +using namespace base; +using namespace boost; +using namespace std; + +//---------------------------------------------------------------- + +namespace { + uint32_t MULTIPLIER = 4294967291UL; + uint32_t SEED = 123; +} + +rolling_hash::rolling_hash(unsigned window_size) + : a_(MULTIPLIER), + a_to_k_minus_1_(a_), + window_size_(window_size) { + + for (unsigned i = 1; i < window_size_ - 1; i++) + a_to_k_minus_1_ *= a_; + + reset(); +} + +void +rolling_hash::reset() +{ + // prime with zeroes + chars_.clear(); + + hash_ = 0; + for (unsigned i = 0; i < window_size_; i++) { + hash_ = (hash_ * a_) + SEED; + chars_.push_back(0); + } +} + +uint32_t +rolling_hash::step(uint8_t byte) +{ + update_hash(byte); + return hash_; +} + +uint32_t +rolling_hash::get_hash() const +{ + return hash_; +} + +void +rolling_hash::update_hash(uint8_t byte) +{ + hash_ -= a_to_k_minus_1_ * (chars_.front() + SEED); + chars_.pop_front(); + chars_.push_back(byte); + hash_ = (hash_ * a_) + byte + SEED; +} + +//-------------------------------- + +content_based_hash::content_based_hash(unsigned window_size) + : rhash_(window_size), + + // FIXME: hard coded values + backup_div_((window_size / 4) - 1), + div_((window_size / 2) - 1), + min_len_(window_size / 8), + max_len_(window_size), + len_(0) +{ +} + +void +content_based_hash::reset() +{ + len_ = 0; + backup_break_.reset(); + rhash_.reset(); +} + +optional +content_based_hash::step(uint8_t byte) +{ +#if 0 + optional r; + + rhash_.step(byte); + len_++; + + if (len_ < min_len_) + return r; + + if (hit_break(backup_div_)) + backup_break_ = len_; + + if (hit_break(div_)) { + // found a break + r = len_; + len_ = 0; + backup_break_.reset(); + + } else if (len_ >= max_len_) { + // too big, is there a backup? + if (backup_break_) { + len_ -= *backup_break_; + r = backup_break_; + backup_break_.reset(); + + } else { + r = len_; + len_ = 0; + } + } + + return r; +#else + optional r; + + rhash_.step(byte); + len_++; + + if (len_ < min_len_) + return r; + + if (hit_break(div_)) { + // found a break + r = len_; + len_ = 0; + backup_break_.reset(); + + } else if (len_ >= max_len_) { + r = len_; + len_ = 0; + } + + return r; +#endif +} + +bool +content_based_hash::hit_break(uint32_t mask) const +{ + uint32_t h = rhash_.get_hash() >> 8; + return !(h & mask); +} + +//---------------------------------------------------------------- diff --git a/base/rolling_hash.h b/base/rolling_hash.h new file mode 100644 index 0000000..d44012a --- /dev/null +++ b/base/rolling_hash.h @@ -0,0 +1,61 @@ +#ifndef BASE_ROLLING_HASH_H +#define BASE_ROLLING_HASH_H + +#include +#include +#include + +//---------------------------------------------------------------- + +namespace base { + class rolling_hash { + public: + rolling_hash(unsigned window_size); + + void reset(); + + // Returns the current hash + uint32_t step(uint8_t byte); + + uint32_t get_hash() const; + + private: + void update_hash(uint8_t byte); + + uint32_t a_; + uint32_t a_to_k_minus_1_; + + // FIXME: use a ring buffer + std::list chars_; + + uint32_t hash_; + uint32_t window_size_; + }; + + class content_based_hash { + public: + content_based_hash(unsigned window_size); + void reset(); + + // Returns a break point relative to the last reset/break. + boost::optional step(uint8_t byte); + + private: + bool hit_break(uint32_t div) const; + + rolling_hash rhash_; + + uint32_t backup_div_; + uint32_t div_; + + unsigned min_len_; + unsigned max_len_; + + unsigned len_; + boost::optional backup_break_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/thin-provisioning/cache_stream.cc b/thin-provisioning/cache_stream.cc index 956fc4e..8fbcc72 100644 --- a/thin-provisioning/cache_stream.cc +++ b/thin-provisioning/cache_stream.cc @@ -32,9 +32,8 @@ cache_stream::cache_stream(string const &path, v_(new bcache::noop_validator()), cache_(new block_cache(fd_, block_size / 512, nr_blocks_, cache_mem)), current_index_(0) { - load(0); - for (block_address i = 1; i < min(cache_blocks_, nr_blocks_); i++) - cache_->prefetch(i); + + rewind(); } block_address @@ -46,19 +45,27 @@ cache_stream::nr_chunks() const void cache_stream::rewind() { - load(0); + current_index_ = 0; + + for (block_address i = 1; i < min(cache_blocks_, nr_blocks_); i++) + cache_->prefetch(i); } bool -cache_stream::advance(block_address count) +cache_stream::next(block_address count) { - if (current_index_ + count >= nr_blocks_) - return false; + current_index_ = min(current_index_ + count, nr_blocks_); - current_index_ += count; + if (current_index_ + cache_blocks_ < nr_blocks_) + cache_->prefetch(current_index_ + cache_blocks_); - load(current_index_); - return true; + return !eof(); +} + +bool +cache_stream::eof() const +{ + return current_index_ >= nr_blocks_; } block_address @@ -68,24 +75,26 @@ cache_stream::index() const } chunk const & -cache_stream::get() const +cache_stream::get() { - return current_chunk_; + chunk_wrapper *w = new chunk_wrapper(*this); + return w->c_; } void -cache_stream::load(block_address b) +cache_stream::put(chunk const &c) { - current_index_ = b; - current_block_ = cache_->get(current_index_, 0, v_); + chunk_wrapper *w = container_of(const_cast(&c), chunk_wrapper, c_); + delete w; +} - current_chunk_.offset_sectors_ = (b * block_size_) / 512; - current_chunk_.mem_.clear(); - current_chunk_.mem_.push_back(mem(static_cast(current_block_.get_data()), - static_cast(current_block_.get_data()) + block_size_)); - - if (current_index_ + cache_blocks_ < nr_blocks_) - cache_->prefetch(current_index_ + cache_blocks_); +cache_stream::chunk_wrapper::chunk_wrapper(cache_stream &parent) + : block_(parent.cache_->get(parent.current_index_, 0, parent.v_)) +{ + c_.offset_ = parent.current_index_ * parent.block_size_; + c_.len_ = parent.block_size_; + c_.mem_.push_back(mem(static_cast(block_.get_data()), + static_cast(block_.get_data()) + parent.block_size_)); } //---------------------------------------------------------------- diff --git a/thin-provisioning/cache_stream.h b/thin-provisioning/cache_stream.h index cfe6ca8..65c81b1 100644 --- a/thin-provisioning/cache_stream.h +++ b/thin-provisioning/cache_stream.h @@ -14,25 +14,36 @@ namespace thin_provisioning { block_address block_size, size_t cache_mem); - virtual block_address nr_chunks() const; + block_address nr_chunks() const; + virtual void rewind(); - virtual bool advance(block_address count = 1ull); virtual block_address index() const; - virtual chunk const &get() const; + + virtual bool next(block_address count = 1ull); + virtual bool eof() const; + + virtual chunk const &get(); + virtual void put(chunk const &c); private: - void load(block_address b); + struct chunk_wrapper { + chunk_wrapper(cache_stream &parent); + + block_cache::auto_block block_; + chunk c_; + }; + + friend class chunk_wrapper; block_address block_size_; block_address nr_blocks_; block_address cache_blocks_; + int fd_; validator::ptr v_; std::auto_ptr cache_; block_address current_index_; - block_cache::auto_block current_block_; - chunk current_chunk_; }; } diff --git a/thin-provisioning/chunk_stream.cc b/thin-provisioning/chunk_stream.cc new file mode 100644 index 0000000..4ac99ff --- /dev/null +++ b/thin-provisioning/chunk_stream.cc @@ -0,0 +1,23 @@ +#include "thin-provisioning/chunk_stream.h" + +using namespace std; +using namespace thin_provisioning; + +//---------------------------------------------------------------- + +uint8_t +chunk::operator[](uint64_t n) const +{ + std::deque::const_iterator it; + for (it = mem_.begin(); it != mem_.end(); it++) { + uint64_t mem_len = it->end - it->begin; + if (n > mem_len) + n -= mem_len; + else + return it->begin[n]; + } + + throw runtime_error("chunk out of bounds"); +} + +//---------------------------------------------------------------- diff --git a/thin-provisioning/chunk_stream.h b/thin-provisioning/chunk_stream.h index d87ee9c..4e1ea96 100644 --- a/thin-provisioning/chunk_stream.h +++ b/thin-provisioning/chunk_stream.h @@ -37,21 +37,24 @@ namespace thin_provisioning { }; struct chunk { - // FIXME: switch to bytes rather than sectors - // FIXME: add length too - uint64_t offset_sectors_; + uint64_t offset_, len_; std::deque mem_; + + uint8_t operator[](uint64_t n) const; }; class chunk_stream { public: virtual ~chunk_stream() {} - virtual bcache::block_address nr_chunks() const = 0; virtual void rewind() = 0; - virtual bool advance(bcache::block_address count = 1ull) = 0; virtual bcache::block_address index() const = 0; - virtual chunk const &get() const = 0; + + virtual bool next(bcache::block_address count = 1ull) = 0; + virtual bool eof() const = 0; + + virtual chunk const &get() = 0; + virtual void put(chunk const &c) = 0; }; } diff --git a/thin-provisioning/pool_stream.cc b/thin-provisioning/pool_stream.cc index 0191698..21964f9 100644 --- a/thin-provisioning/pool_stream.cc +++ b/thin-provisioning/pool_stream.cc @@ -60,7 +60,7 @@ pool_stream::rewind() } bool -pool_stream::advance(block_address count) +pool_stream::next(block_address count) { while (count--) if (!advance_one()) @@ -69,6 +69,12 @@ pool_stream::advance(block_address count) return true; } +bool +pool_stream::eof() const +{ + return stream_.eof(); +} + block_address pool_stream::index() const { @@ -76,12 +82,16 @@ pool_stream::index() const } chunk const & -pool_stream::get() const +pool_stream::get() { return stream_.get(); } - +void +pool_stream::put(chunk const &c) +{ + stream_.put(c); +} // FIXME: too big to return by value vector @@ -140,7 +150,7 @@ pool_stream::advance_one() if (new_index >= nr_chunks()) return false; - return stream_.advance(new_index - index()); + return stream_.next(new_index - index()); } //---------------------------------------------------------------- diff --git a/thin-provisioning/pool_stream.h b/thin-provisioning/pool_stream.h index 1fa5c51..71576ed 100644 --- a/thin-provisioning/pool_stream.h +++ b/thin-provisioning/pool_stream.h @@ -34,16 +34,20 @@ namespace thin_provisioning { block_address nr_chunks() const; void rewind(); - bool advance(block_address count = 1ull); + bool next(block_address count = 1ull); + bool eof() const; block_address index() const; - chunk const &get() const; + + chunk const &get(); + void put(chunk const &c); private: typedef rmap_visitor::region region; typedef rmap_visitor::rmap_region rmap_region; // FIXME: too big to return by value - vector read_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb, + vector read_rmap(transaction_manager::ptr tm, + superblock_detail::superblock const &sb, block_address nr_blocks); void init_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb, block_address nr_blocks); diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 7819ca1..dafcd34 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -36,6 +36,7 @@ #include "thin-provisioning/mapping_tree.h" #include "thin-provisioning/rmap_visitor.h" #include "thin-provisioning/superblock.h" +#include "thin-provisioning/variable_chunk_stream.h" #include #include @@ -98,25 +99,21 @@ namespace { unsigned cache_mem; }; - // FIXME: introduce abstraction for a stream of segments - using namespace mapping_tree_detail; class duplicate_counter { public: - duplicate_counter(block_address nr_blocks) - : counts_(nr_blocks), - non_zero_dups_(0), + duplicate_counter() + : non_zero_dups_(0), zero_dups_(0) { } - void add_duplicate(block_address b1, block_address b2) { - non_zero_dups_++; - counts_[b1]++; + void add_duplicate(block_address len) { + non_zero_dups_ += len; } - void add_zero_duplicate(block_address b) { - zero_dups_++; + void add_zero_duplicate(block_address len) { + zero_dups_ += len; } block_address get_total() const { @@ -132,45 +129,35 @@ namespace { } private: - vector counts_; block_address non_zero_dups_; block_address zero_dups_; }; class duplicate_detector { public: - duplicate_detector(unsigned block_size, block_address nr_blocks) - : block_size_(block_size), - results_(nr_blocks), - zero_fingerprint_(5, 0ull) { - calc_zero_fingerprint(); - } - void examine(chunk const &c) { - digestor_.reset(); - - for (deque::const_iterator it = c.mem_.begin(); it != c.mem_.end(); it++) - digestor_.process_bytes(it->begin, it->end - it->begin); - - unsigned int digest[5]; - digestor_.get_digest(digest); - - // hack - vector v(5); - for (unsigned i = 0; i < 5; i++) - v[i] = digest[i]; - - block_address index = (c.offset_sectors_ * 512) / block_size_; - - if (v == zero_fingerprint_) - results_.add_zero_duplicate(index); + if (all_zeroes(c)) + results_.add_zero_duplicate(c.len_); else { + digestor_.reset(); + + for (deque::const_iterator it = c.mem_.begin(); it != c.mem_.end(); it++) + digestor_.process_bytes(it->begin, it->end - it->begin); + + unsigned int digest[5]; + digestor_.get_digest(digest); + + // hack + vector v(5); + for (unsigned i = 0; i < 5; i++) + v[i] = digest[i]; + fingerprint_map::const_iterator it = fm_.find(v); if (it != fm_.end()) { - results_.add_duplicate(it->second, index); + results_.add_duplicate(c.len_); } else - fm_.insert(make_pair(v, index)); + fm_.insert(make_pair(v, c.offset_)); } } @@ -178,30 +165,24 @@ namespace { return results_; } - void calc_zero_fingerprint() { - auto_ptr bytes(new uint8_t[block_size_]); - memset(bytes.get(), 0, block_size_); + private: + bool all_zeroes(chunk const &c) const { + for (deque::const_iterator it = c.mem_.begin(); it != c.mem_.end(); it++) { + for (uint8_t *ptr = it->begin; ptr != it->end; ptr++) { + if (*ptr != 0) + return false; + } + } - digestor_.reset(); - digestor_.process_bytes(bytes.get(), block_size_); - - unsigned int digest[5]; - digestor_.get_digest(digest); - - // hack - for (unsigned i = 0; i < 5; i++) - zero_fingerprint_[i] = digest[i]; + return true; } - private: typedef map, block_address> fingerprint_map; unsigned block_size_; boost::uuids::detail::sha1 digestor_; fingerprint_map fm_; duplicate_counter results_; - - vector zero_fingerprint_; }; int show_dups_pool(flags const &fs) { @@ -218,15 +199,16 @@ namespace { cache_stream stream(fs.data_dev, block_size, fs.cache_mem); pool_stream pstream(stream, tm, sb, nr_blocks); - duplicate_detector detector(block_size, nr_blocks); + duplicate_detector detector; auto_ptr pbar = create_progress_bar("Examining data"); do { chunk const &c = pstream.get(); detector.examine(c); + pstream.put(c); pbar->update_percent((pstream.index() * 100) / pstream.nr_chunks()); - } while (pstream.advance()); + } while (pstream.next()); pbar->update_percent(100); cout << "\n\ntotal dups: " << detector.get_results().get_total() << endl; @@ -247,24 +229,27 @@ namespace { cerr << "nr_blocks = " << nr_blocks << "\n"; cerr << "block size = " << block_size << "\n"; - cache_stream stream(fs.data_dev, block_size, fs.cache_mem); - duplicate_detector detector(block_size, nr_blocks); + cache_stream low_level_stream(fs.data_dev, block_size, fs.cache_mem); + variable_chunk_stream stream(low_level_stream, 4096); + duplicate_detector detector; auto_ptr pbar = create_progress_bar("Examining data"); do { + // FIXME: use a wrapper class to automate the put() chunk const &c = stream.get(); detector.examine(c); - pbar->update_percent((stream.index() * 100) / stream.nr_chunks()); + stream.put(c); +// pbar->update_percent((stream.index() * 100) / stream.nr_chunks()); - } while (stream.advance()); + } while (stream.next()); pbar->update_percent(100); - cout << "\n\ntotal dups: " << detector.get_results().get_total() << endl; - cout << (detector.get_results().get_total() * 100) / nr_blocks << "% duplicates\n"; - duplicate_counter r = detector.get_results(); - cout << "\n\nchunks\tnon zero dups\tzero dups\n" - << nr_blocks << "\t" << r.get_non_zeroes() << "\t" << r.get_zeroes() << "\n"; + block_address meg = 1024 * 1024; + cout << "\n\n" + << (nr_blocks * block_size) / meg << "m examined, " + << r.get_non_zeroes() / meg << "m duplicates, " + << r.get_zeroes() / meg << "m zeroes\n"; return 0; } diff --git a/thin-provisioning/variable_chunk_stream.cc b/thin-provisioning/variable_chunk_stream.cc new file mode 100644 index 0000000..9a9d11e --- /dev/null +++ b/thin-provisioning/variable_chunk_stream.cc @@ -0,0 +1,152 @@ +#include "thin-provisioning/variable_chunk_stream.h" + +using namespace boost; +using namespace std; +using namespace thin_provisioning; + +//---------------------------------------------------------------- + +variable_chunk_stream::variable_chunk_stream(chunk_stream &stream, unsigned window_size) + : index_(0), + h_(window_size), + stream_(stream), + big_chunk_(0) { + next_big_chunk(); +} + +variable_chunk_stream::~variable_chunk_stream() +{ + put_big_chunk(); +} + +void +variable_chunk_stream::rewind() +{ + // FIXME: not complete + index_ = 0; + stream_.rewind(); + h_.reset(); +} + +bool +variable_chunk_stream::next(bcache::block_address count) +{ + while (count--) { + index_++; + advance_one(); + } + + return !eof(); +} + +bool +variable_chunk_stream::eof() const +{ + return stream_.eof(); +} + +bcache::block_address +variable_chunk_stream::index() const +{ + return index_; +} + +chunk const & +variable_chunk_stream::get() +{ + assert(big_chunk_); + + little_chunk_.len_ = little_e_ - little_b_; + little_chunk_.offset_ = big_chunk_->offset_ + little_chunk_.len_; + + little_chunk_.mem_.clear(); + little_chunk_.mem_.push_back(mem(little_b_, little_e_)); + + return little_chunk_; +} + +void +variable_chunk_stream::put(chunk const &c) +{ + // noop +} + +bool +variable_chunk_stream::next_big_chunk() +{ + put_big_chunk(); + + if (!stream_.next()) + return false; + + big_chunk_ = &stream_.get(); + little_b_ = little_e_ = big_chunk_->mem_.front().begin; + h_.reset(); + + return true; +} + +bool +variable_chunk_stream::advance_one() +{ + uint8_t *big_e; + + assert(big_chunk_); + + big_e = big_chunk_->mem_.front().end; + little_b_ = little_e_; + + if (little_b_ == big_e) { + if (next_big_chunk()) + big_e = big_chunk_->mem_.front().end; + else + return false; + } + + assert(little_e_ >= big_chunk_->mem_.front().begin); + assert(little_b_ >= big_chunk_->mem_.front().begin); +#if 1 + if (little_e_ > big_e) { + cerr << "before -- little_e_: " << (void *) little_e_ << ", big_e: " << (void *) big_e << "\n"; + } +#endif + assert(little_e_ <= big_e); + assert(little_b_ <= big_e); + + + while (little_e_ != big_e) { + optional maybe_break = h_.step(*little_e_); + + if (maybe_break) { + // The break is not neccessarily at the current + // byte. + little_e_ = little_b_ + *maybe_break; + break; + } + + little_e_++; + } + + assert(little_e_ >= big_chunk_->mem_.front().begin); + assert(little_b_ >= big_chunk_->mem_.front().begin); +#if 1 + if (little_e_ > big_e) { + cerr << "after -- little_e_: " << (void *) little_e_ << ", big_e: " << (void *) big_e << "\n"; + } +#endif + assert(little_e_ <= big_e); + assert(little_b_ <= big_e); + + return true; +} + +void +variable_chunk_stream::put_big_chunk() +{ + if (big_chunk_) + stream_.put(*big_chunk_); + + big_chunk_ = 0; +} + +//---------------------------------------------------------------- diff --git a/thin-provisioning/variable_chunk_stream.h b/thin-provisioning/variable_chunk_stream.h new file mode 100644 index 0000000..0327f1d --- /dev/null +++ b/thin-provisioning/variable_chunk_stream.h @@ -0,0 +1,42 @@ +#ifndef THIN_PROVISIONING_VARIABLE_CHUNK_STREAM_H +#define THIN_PROVISIONING_VARIABLE_CHUNK_STREAM_H + +#include "base/rolling_hash.h" +#include "thin-provisioning/chunk_stream.h" + +//---------------------------------------------------------------- + +namespace thin_provisioning { + class variable_chunk_stream : public chunk_stream { + public: + // window_size must be a power of 2 + variable_chunk_stream(chunk_stream &stream, unsigned window_size); + ~variable_chunk_stream(); + + // FIXME: we don't know in advance how many chunks we will have + virtual void rewind(); + virtual bool next(bcache::block_address count = 1ull); + virtual bool eof() const; + virtual bcache::block_address index() const; + virtual chunk const &get(); + virtual void put(chunk const &c); + + private: + bool next_big_chunk(); + bool advance_one(); + void put_big_chunk(); + + bcache::block_address index_; + base::content_based_hash h_; + + chunk_stream &stream_; + chunk const *big_chunk_; + + uint8_t *little_b_, *little_e_; + chunk little_chunk_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/unit-tests/Makefile.in b/unit-tests/Makefile.in index 9307e5f..38f3d04 100644 --- a/unit-tests/Makefile.in +++ b/unit-tests/Makefile.in @@ -59,6 +59,7 @@ TEST_SOURCE=\ unit-tests/endian_t.cc \ unit-tests/error_state_t.cc \ unit-tests/rmap_visitor_t.cc \ + unit-tests/rolling_hash_t.cc \ unit-tests/run_set_t.cc \ unit-tests/space_map_t.cc \ unit-tests/span_iterator_t.cc \ diff --git a/unit-tests/rolling_hash_t.cc b/unit-tests/rolling_hash_t.cc new file mode 100644 index 0000000..c25b650 --- /dev/null +++ b/unit-tests/rolling_hash_t.cc @@ -0,0 +1,153 @@ +#include "gmock/gmock.h" + +#include "base/rolling_hash.h" + +using namespace base; +using namespace boost; +using namespace std; +using namespace testing; + +//---------------------------------------------------------------- + +namespace { + class RollingHashTests : public Test { + public: + RollingHashTests() + : window_size_(4096), + rhash_(window_size_) { + } + + typedef vector bytes; + bytes random_bytes(unsigned count) { + bytes v(count, 0); + + for (unsigned i = 0; i < count; i++) + v[i] = random_byte(); + + return v; + } + + uint8_t random_byte() const { + return random() % 256; + } + + void apply_bytes(bytes const &bs) { + for (unsigned i = 0; i < bs.size(); i++) + rhash_.step(bs[i]); + } + + unsigned window_size_; + rolling_hash rhash_; + }; + + class ContentBasedHashTests : public Test { + public: + ContentBasedHashTests() + : window_size_(8192), + h_(window_size_) { + } + + typedef vector bytes; + bytes random_bytes(unsigned count) { + bytes v(count, 0); + + for (unsigned i = 0; i < count; i++) + v[i] = random_byte(); + + return v; + } + + uint8_t random_byte() const { + return random() % 256; + } + + unsigned window_size_; + content_based_hash h_; + }; +} + +//---------------------------------------------------------------- + +TEST_F(RollingHashTests, ctr) +{ +} + +//-------------------------------- + +TEST_F(RollingHashTests, hash_changes) +{ + bytes bs = random_bytes(window_size_ * 100); + + uint32_t prev = rhash_.get_hash(); + for (unsigned i = 0; i < bs.size(); i++) { + rhash_.step(bs[i]); + ASSERT_NE(rhash_.get_hash(), prev); + prev = rhash_.get_hash(); + } +} + +TEST_F(RollingHashTests, hash_repeats) +{ + bytes bs = random_bytes(window_size_); + + apply_bytes(bs); + uint32_t h1 = rhash_.get_hash(); + apply_bytes(bs); + + ASSERT_EQ(rhash_.get_hash(), h1); +} + +TEST_F(RollingHashTests, reset_is_deterministic) +{ + uint8_t bytes[] = "lksdfuwerh,sdg"; + + for (unsigned i = 0; i < sizeof(bytes) - 1; i++) + rhash_.step(bytes[i]); + + uint32_t h1 = rhash_.get_hash(); + + rhash_.reset(); + + for (unsigned i = 0; i < sizeof(bytes) - 1; i++) + rhash_.step(bytes[i]); + + uint32_t h2 = rhash_.get_hash(); + + ASSERT_EQ(h1, h2); +} + +//---------------------------------------------------------------- + +TEST_F(ContentBasedHashTests, ctr) +{ +} + +TEST_F(ContentBasedHashTests, chunk_limits_respected) +{ + unsigned min = 100000, max = 0; + + bytes bs = random_bytes(1024 * 1024 * 100); + vector counts(window_size_, 0); + + for (unsigned i = 0; i < bs.size(); i++) { + optional b = h_.step(bs[i]); + if (b) { + counts[*b]++; + + if (*b < min) + min = *b; + + if (*b > max) + max = *b; + } + } + +#if 1 + for (unsigned i = 0; i < counts.size(); i++) + cerr << i << ": " << counts[i] << "\n"; + + cerr << "min: " << min << ", max: " << max << "\n"; +#endif +} + +//----------------------------------------------------------------