2015-09-03 17:32:29 +05:30
|
|
|
#ifndef BASE_ROLLING_HASH_H
|
|
|
|
#define BASE_ROLLING_HASH_H
|
|
|
|
|
2015-09-04 15:58:33 +05:30
|
|
|
#include <boost/circular_buffer.hpp>
|
2015-09-03 17:32:29 +05:30
|
|
|
#include <stdint.h>
|
|
|
|
#include <boost/optional.hpp>
|
|
|
|
|
|
|
|
//----------------------------------------------------------------
|
|
|
|
|
|
|
|
namespace base {
|
2015-09-04 15:40:19 +05:30
|
|
|
namespace hash_detail {
|
|
|
|
uint32_t const MULTIPLIER = 4294967291UL;
|
|
|
|
uint32_t const SEED = 123;
|
|
|
|
}
|
|
|
|
|
2015-09-03 17:32:29 +05:30
|
|
|
class rolling_hash {
|
|
|
|
public:
|
|
|
|
rolling_hash(unsigned window_size);
|
|
|
|
|
|
|
|
void reset();
|
|
|
|
|
|
|
|
// Returns the current hash
|
2015-09-04 15:40:19 +05:30
|
|
|
uint32_t step(uint8_t byte) {
|
|
|
|
update_hash(byte);
|
|
|
|
return hash_;
|
|
|
|
}
|
2015-09-03 17:32:29 +05:30
|
|
|
|
2015-09-04 15:40:19 +05:30
|
|
|
uint32_t get_hash() const {
|
|
|
|
return hash_;
|
|
|
|
}
|
2015-09-03 17:32:29 +05:30
|
|
|
|
|
|
|
private:
|
2015-09-04 15:40:19 +05:30
|
|
|
void update_hash(uint8_t byte) {
|
2015-09-04 15:58:33 +05:30
|
|
|
hash_ -= a_to_k_minus_1_ * (buffer_.front() + hash_detail::SEED);
|
|
|
|
buffer_.push_back(byte);
|
2015-09-04 15:40:19 +05:30
|
|
|
hash_ = (hash_ * a_) + byte + hash_detail::SEED;
|
|
|
|
}
|
2015-09-03 17:32:29 +05:30
|
|
|
|
|
|
|
uint32_t a_;
|
|
|
|
uint32_t a_to_k_minus_1_;
|
|
|
|
|
|
|
|
uint32_t hash_;
|
|
|
|
uint32_t window_size_;
|
2015-09-04 15:58:33 +05:30
|
|
|
|
|
|
|
boost::circular_buffer<uint8_t> buffer_;
|
2015-09-03 17:32:29 +05:30
|
|
|
};
|
|
|
|
|
|
|
|
class content_based_hash {
|
|
|
|
public:
|
|
|
|
content_based_hash(unsigned window_size);
|
|
|
|
void reset();
|
|
|
|
|
|
|
|
// Returns a break point relative to the last reset/break.
|
2015-09-04 15:40:19 +05:30
|
|
|
boost::optional<unsigned> step(uint8_t byte) {
|
|
|
|
boost::optional<unsigned> r;
|
|
|
|
|
|
|
|
rhash_.step(byte);
|
|
|
|
len_++;
|
|
|
|
|
|
|
|
if (len_ < min_len_)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
if (hit_break(backup_div_))
|
|
|
|
backup_break_ = len_;
|
|
|
|
|
|
|
|
if (hit_break(div_)) {
|
|
|
|
// found a break
|
|
|
|
r = len_;
|
|
|
|
len_ = 0;
|
|
|
|
backup_break_.reset();
|
|
|
|
|
|
|
|
} else if (len_ >= max_len_) {
|
|
|
|
// too big, is there a backup?
|
|
|
|
if (backup_break_) {
|
|
|
|
len_ -= *backup_break_;
|
|
|
|
r = backup_break_;
|
|
|
|
backup_break_.reset();
|
|
|
|
|
|
|
|
} else {
|
|
|
|
r = len_;
|
|
|
|
len_ = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
2015-09-03 17:32:29 +05:30
|
|
|
|
|
|
|
private:
|
2015-09-04 15:40:19 +05:30
|
|
|
bool hit_break(uint32_t mask) const {
|
|
|
|
uint32_t h = rhash_.get_hash() >> 8;
|
|
|
|
return !(h & mask);
|
|
|
|
}
|
2015-09-03 17:32:29 +05:30
|
|
|
|
|
|
|
rolling_hash rhash_;
|
|
|
|
|
|
|
|
uint32_t backup_div_;
|
|
|
|
uint32_t div_;
|
|
|
|
|
|
|
|
unsigned min_len_;
|
|
|
|
unsigned max_len_;
|
|
|
|
|
|
|
|
unsigned len_;
|
|
|
|
boost::optional<unsigned> backup_break_;
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
//----------------------------------------------------------------
|
|
|
|
|
|
|
|
#endif
|