[thin_show_dups] variable_chunk_stream
This commit is contained in:
148
base/rolling_hash.cc
Normal file
148
base/rolling_hash.cc
Normal file
@@ -0,0 +1,148 @@
|
||||
#include "base/rolling_hash.h"
|
||||
|
||||
using namespace base;
|
||||
using namespace boost;
|
||||
using namespace std;
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
||||
namespace {
	// Base of the polynomial rolling hash.  The value is 2^32 - 5, the
	// largest prime that fits in 32 bits; all hash arithmetic wraps
	// modulo 2^32.  Never modified, hence const.
	uint32_t const MULTIPLIER = 4294967291UL;

	// Offset added to every byte before it is mixed into the hash, so
	// that a zero byte still contributes a non-zero term.
	uint32_t const SEED = 123;
}
|
||||
|
||||
// Builds a rolling hash that covers a sliding window of window_size bytes.
//
// Precomputes a^(window_size - 1) modulo 2^32.  That is the weight carried
// by the oldest byte in the window, which update_hash() multiplies by when
// it removes that byte's contribution.
//
// window_size is expected to be at least 1: with a window of 0 bytes
// reset() leaves the window empty and update_hash() would read the front
// of an empty container.
rolling_hash::rolling_hash(unsigned window_size)
	: a_(MULTIPLIER),
	  a_to_k_minus_1_(1),
	  window_size_(window_size)
{
	// Start from a^0 and multiply (window_size - 1) times.  Counting up to
	// window_size_ itself, rather than to window_size_ - 1, keeps the
	// result correct for a one byte window and avoids the unsigned
	// wrap-around of "window_size_ - 1" when window_size_ is 0.
	for (unsigned i = 1; i < window_size_; i++)
		a_to_k_minus_1_ *= a_;

	reset();
}
|
||||
|
||||
void
|
||||
rolling_hash::reset()
|
||||
{
|
||||
// prime with zeroes
|
||||
chars_.clear();
|
||||
|
||||
hash_ = 0;
|
||||
for (unsigned i = 0; i < window_size_; i++) {
|
||||
hash_ = (hash_ * a_) + SEED;
|
||||
chars_.push_back(0);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t
|
||||
rolling_hash::step(uint8_t byte)
|
||||
{
|
||||
update_hash(byte);
|
||||
return hash_;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
rolling_hash::get_hash() const
|
||||
{
|
||||
return hash_;
|
||||
}
|
||||
|
||||
void
|
||||
rolling_hash::update_hash(uint8_t byte)
|
||||
{
|
||||
hash_ -= a_to_k_minus_1_ * (chars_.front() + SEED);
|
||||
chars_.pop_front();
|
||||
chars_.push_back(byte);
|
||||
hash_ = (hash_ * a_) + byte + SEED;
|
||||
}
|
||||
|
||||
//--------------------------------
|
||||
|
||||
// Sets up a chunker whose rolling hash window, break masks and chunk
// length limits are all derived from window_size:
//
//   backup_div_ = window_size / 4 - 1   (only read by the disabled
//                                        backup-break variant of step())
//   div_        = window_size / 2 - 1   (mask tested by hit_break())
//   min_len_    = window_size / 8       (shortest chunk step() reports)
//   max_len_    = window_size           (length at which step() forces
//                                        a break)
//
// NOTE(review): the two div values are applied as bit masks, which
// presumably assumes window_size is a power of two -- confirm with callers.
content_based_hash::content_based_hash(unsigned window_size)
	: rhash_(window_size),

	  // FIXME: hard coded values
	  backup_div_(window_size / 4 - 1),
	  div_(window_size / 2 - 1),
	  min_len_(window_size / 8),
	  max_len_(window_size),
	  len_(0)
{
}
|
||||
|
||||
void
|
||||
content_based_hash::reset()
|
||||
{
|
||||
len_ = 0;
|
||||
backup_break_.reset();
|
||||
rhash_.reset();
|
||||
}
|
||||
|
||||
// Feeds one byte into the chunker.
//
// Returns the length of the chunk that ends with this byte when a break is
// declared, and an empty optional otherwise.  A break is declared once at
// least min_len_ bytes have accumulated and either the rolling hash
// satisfies hit_break(div_) or the chunk has reached max_len_ bytes.
//
// The "#if 0" branch is a disabled variant that additionally remembers a
// backup break (tested against backup_div_) and falls back to it when the
// chunk reaches max_len_.
optional<unsigned>
content_based_hash::step(uint8_t byte)
{
#if 0
	optional<unsigned> r;

	rhash_.step(byte);
	len_++;

	if (len_ < min_len_)
		return r;

	if (hit_break(backup_div_))
		backup_break_ = len_;

	if (hit_break(div_)) {
		// found a break
		r = len_;
		len_ = 0;
		backup_break_.reset();

	} else if (len_ >= max_len_) {
		// too big, is there a backup?
		if (backup_break_) {
			len_ -= *backup_break_;
			r = backup_break_;
			backup_break_.reset();

		} else {
			r = len_;
			len_ = 0;
		}
	}

	return r;
#else
	rhash_.step(byte);
	++len_;

	optional<unsigned> boundary;

	// Chunks shorter than the minimum are never cut, whatever the hash
	// says.
	if (len_ >= min_len_) {
		if (hit_break(div_)) {
			// found a break
			boundary = len_;
			len_ = 0;
			backup_break_.reset();

		} else if (len_ >= max_len_) {
			// no natural break showed up in time, force one
			boundary = len_;
			len_ = 0;
		}
	}

	return boundary;
#endif
}
|
||||
|
||||
bool
|
||||
content_based_hash::hit_break(uint32_t mask) const
|
||||
{
|
||||
uint32_t h = rhash_.get_hash() >> 8;
|
||||
return !(h & mask);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
61
base/rolling_hash.h
Normal file
61
base/rolling_hash.h
Normal file
@@ -0,0 +1,61 @@
|
||||
#ifndef BASE_ROLLING_HASH_H
|
||||
#define BASE_ROLLING_HASH_H
|
||||
|
||||
#include <stdint.h>

#include <deque>
#include <list>

#include <boost/optional.hpp>
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
||||
namespace base {
|
||||
// Polynomial rolling hash over a fixed-size sliding window of bytes.
//
// The object remembers the bytes currently in the window, so each step()
// can remove the contribution of the oldest byte and add the new one
// instead of rehashing the whole window.  The hash is a uint32_t and all
// arithmetic wraps modulo 2^32.
class rolling_hash {
public:
	// window_size is the number of bytes covered by the hash at any time.
	rolling_hash(unsigned window_size);

	// Restores the initial state, i.e. a window made up of zero bytes.
	void reset();

	// Pushes |byte| into the window, dropping the oldest byte.
	// Returns the current hash
	uint32_t step(uint8_t byte);

	// Hash of the current window; does not advance it.
	uint32_t get_hash() const;

private:
	void update_hash(uint8_t byte);

	// Multiplier of the polynomial, and that multiplier raised to the
	// power (window size - 1), which is the weight of the oldest byte.
	uint32_t a_;
	uint32_t a_to_k_minus_1_;

	// Bytes currently in the window, oldest at the front.  A deque keeps
	// them in blocks, so sliding the window does not allocate and free a
	// node for every byte the way a std::list does.
	// FIXME: use a ring buffer
	std::deque<uint8_t> chars_;

	uint32_t hash_;
	uint32_t window_size_;
};
|
||||
|
||||
class content_based_hash {
|
||||
public:
|
||||
content_based_hash(unsigned window_size);
|
||||
void reset();
|
||||
|
||||
// Returns a break point relative to the last reset/break.
|
||||
boost::optional<unsigned> step(uint8_t byte);
|
||||
|
||||
private:
|
||||
bool hit_break(uint32_t div) const;
|
||||
|
||||
rolling_hash rhash_;
|
||||
|
||||
uint32_t backup_div_;
|
||||
uint32_t div_;
|
||||
|
||||
unsigned min_len_;
|
||||
unsigned max_len_;
|
||||
|
||||
unsigned len_;
|
||||
boost::optional<unsigned> backup_break_;
|
||||
};
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
||||
#endif
|
Reference in New Issue
Block a user