[block-cache] convert to use boost::intrusive rather than kernel style lists.

Major change, do not release until a lot of testing has been done.

Conflicts:
	block-cache/block_cache.h
This commit is contained in:
Joe Thornber 2016-02-04 08:57:41 +00:00
parent 8b792f5c79
commit 639af9c3bf
2 changed files with 183 additions and 176 deletions

@ -1,5 +1,6 @@
#include "block-cache/block_cache.h" #include "block-cache/block_cache.h"
#include <algorithm>
#include <assert.h> #include <assert.h>
#include <libaio.h> #include <libaio.h>
#include <errno.h> #include <errno.h>
@ -44,34 +45,19 @@ namespace {
int int
block_cache::init_free_list(unsigned count) block_cache::init_free_list(unsigned count)
{ {
size_t len;
block *blocks;
size_t block_size = block_size_ << SECTOR_SHIFT; size_t block_size = block_size_ << SECTOR_SHIFT;
void *data; unsigned char *data = static_cast<unsigned char *>(alloc_aligned(count * block_size, PAGE_SIZE));
unsigned i;
/* Allocate the block structures */
len = sizeof(block) * count;
blocks = static_cast<block *>(malloc(len));
if (!blocks)
return -ENOMEM;
blocks_memory_ = blocks;
/* Allocate the data for each block. We page align the data. */ /* Allocate the data for each block. We page align the data. */
data = alloc_aligned(count * block_size, PAGE_SIZE); if (!data)
if (!data) {
free(blocks);
return -ENOMEM; return -ENOMEM;
}
blocks_data_ = data; blocks_data_ = data;
for (i = 0; i < count; i++) { for (unsigned i = 0; i < count; i++) {
block *b = new (blocks + i) block(); block &b = (*blocks_memory_)[i];
b->data_ = static_cast<unsigned char *>(data) + block_size * i; b.data_ = data + (block_size * i);
free_.push_front(b);
list_add(&b->list_, &free_);
} }
return 0; return 0;
@ -82,28 +68,18 @@ block_cache::exit_free_list()
{ {
if (blocks_data_) if (blocks_data_)
free(blocks_data_); free(blocks_data_);
if (blocks_memory_) {
struct block *blocks = static_cast<block *>(blocks_memory_);
for (unsigned i = 0; i < nr_cache_blocks_; i++)
(blocks + i)->~block();
free(blocks_memory_);
}
} }
block_cache::block * block_cache::block *
block_cache::__alloc_block() block_cache::__alloc_block()
{ {
block *b; if (free_.empty())
if (list_empty(&free_))
return NULL; return NULL;
b = list_first_entry(&free_, block, list_); block &b = free_.front();
list_del(&b->list_); b.unlink();
return b; return &b;
} }
/*---------------------------------------------------------------- /*----------------------------------------------------------------
@ -131,15 +107,18 @@ block_cache::complete_io(block &b, int result)
b.clear_flags(BF_IO_PENDING); b.clear_flags(BF_IO_PENDING);
nr_io_pending_--; nr_io_pending_--;
if (b.error_) if (b.error_) {
list_move_tail(&b.list_, &errored_); b.unlink();
else { errored_.push_back(b);
} else {
if (b.test_flags(BF_DIRTY)) { if (b.test_flags(BF_DIRTY)) {
b.clear_flags(BF_DIRTY | BF_PREVIOUSLY_DIRTY); b.clear_flags(BF_DIRTY | BF_PREVIOUSLY_DIRTY);
nr_dirty_--; nr_dirty_--;
} }
list_move_tail(&b.list_, &clean_); b.unlink();
clean_.push_back(b);
} }
} }
@ -157,7 +136,8 @@ block_cache::issue_low_level(block &b, enum io_iocb_cmd opcode, const char *desc
assert(!b.test_flags(BF_IO_PENDING)); assert(!b.test_flags(BF_IO_PENDING));
b.set_flags(BF_IO_PENDING); b.set_flags(BF_IO_PENDING);
nr_io_pending_++; nr_io_pending_++;
list_move_tail(&b.list_, &io_pending_); b.unlink();
io_pending_.push_back(b);
b.control_block_.aio_lio_opcode = opcode; b.control_block_.aio_lio_opcode = opcode;
control_blocks[0] = &b.control_block_; control_blocks[0] = &b.control_block_;
@ -208,7 +188,7 @@ block_cache::wait_io()
for (i = 0; i < static_cast<unsigned>(r); i++) { for (i = 0; i < static_cast<unsigned>(r); i++) {
io_event const &e = events_[i]; io_event const &e = events_[i];
block *b = container_of(e.obj, block, control_block_); block *b = container_of(e.obj, &block::control_block_);
if (e.res == block_size_ << SECTOR_SHIFT) if (e.res == block_size_ << SECTOR_SHIFT)
complete_io(*b, 0); complete_io(*b, 0);
@ -236,19 +216,20 @@ block_cache::wait_io()
* We're using lru lists atm, but I think it would be worth * We're using lru lists atm, but I think it would be worth
* experimenting with a multiqueue approach. * experimenting with a multiqueue approach.
*/ */
list_head * block_cache::block_list &
block_cache::__categorise(block &b) block_cache::__categorise(block &b)
{ {
if (b.error_) if (b.error_)
return &errored_; return errored_;
return b.test_flags(BF_DIRTY) ? &dirty_ : &clean_; return b.test_flags(BF_DIRTY) ? dirty_ : clean_;
} }
void void
block_cache::hit(block &b) block_cache::hit(block &b)
{ {
list_move_tail(&b.list_, __categorise(b)); b.unlink();
__categorise(b).push_back(b);
} }
/*---------------------------------------------------------------- /*----------------------------------------------------------------
@ -257,7 +238,7 @@ block_cache::hit(block &b)
void void
block_cache::wait_all() block_cache::wait_all()
{ {
while (!list_empty(&io_pending_)) while (!io_pending_.empty())
wait_io(); wait_io();
} }
@ -271,10 +252,15 @@ block_cache::wait_specific(block &b)
unsigned unsigned
block_cache::writeback(unsigned count) block_cache::writeback(unsigned count)
{ {
block *b, *tmp;
unsigned actual = 0, dirty_length = 0; unsigned actual = 0, dirty_length = 0;
list_for_each_entry_safe (b, tmp, &dirty_, list_) { // issue_write unlinks b, which invalidates the iteration, so we
// keep track of the next element before removing.
auto it = dirty_.begin();
auto next = it;
while (it != dirty_.end()) {
next = it;
++next;
dirty_length++; dirty_length++;
if (actual == count) if (actual == count)
@ -282,69 +268,18 @@ block_cache::writeback(unsigned count)
// The block may be on the dirty list from a prior // The block may be on the dirty list from a prior
// acquisition. // acquisition.
if (b->ref_count_) if (it->ref_count_)
continue; continue;
issue_write(*b); issue_write(*it);
actual++; actual++;
it = next;
} }
return actual; return actual;
} }
/*----------------------------------------------------------------
* Hash table
*---------------------------------------------------------------*/
/*
* |nr_buckets| must be a power of two.
*/
void
block_cache::hash_init(unsigned nr_buckets)
{
unsigned i;
nr_buckets_ = nr_buckets;
mask_ = nr_buckets - 1;
for (i = 0; i < nr_buckets; i++)
INIT_LIST_HEAD(&buckets_[i]);
}
unsigned
block_cache::hash(uint64_t index)
{
const unsigned BIG_PRIME = 4294967291UL;
return (((unsigned) index) * BIG_PRIME) & mask_;
}
block_cache::block *
block_cache::hash_lookup(block_address index)
{
block *b;
unsigned bucket = hash(index);
list_for_each_entry (b, &buckets_[bucket], hash_list_) {
if (b->index_ == index)
return b;
}
return NULL;
}
void
block_cache::hash_insert(block &b)
{
unsigned bucket = hash(b.index_);
list_move_tail(&b.hash_list_, &buckets_[bucket]);
}
void
block_cache::hash_remove(block &b)
{
list_del_init(&b.hash_list_);
}
/*---------------------------------------------------------------- /*----------------------------------------------------------------
* High level allocation * High level allocation
*--------------------------------------------------------------*/ *--------------------------------------------------------------*/
@ -362,18 +297,17 @@ block_cache::setup_control_block(block &b)
cb->u.c.nbytes = block_size_bytes; cb->u.c.nbytes = block_size_bytes;
} }
// FIXME: return a reference
block_cache::block * block_cache::block *
block_cache::find_unused_clean_block() block_cache::find_unused_clean_block()
{ {
struct block *b, *tmp; for (block &b : clean_) {
if (b.ref_count_)
list_for_each_entry_safe (b, tmp, &clean_, list_) {
if (b->ref_count_)
continue; continue;
hash_remove(*b); block_set_.remove_node(b);
list_del(&b->list_); b.unlink();
return b; return &b;
} }
return NULL; return NULL;
@ -386,8 +320,8 @@ block_cache::new_block(block_address index)
b = __alloc_block(); b = __alloc_block();
if (!b) { if (!b) {
if (list_empty(&clean_)) { if (clean_.empty()) {
if (list_empty(&io_pending_)) if (io_pending_.empty())
writeback(16); writeback(16);
wait_io(); wait_io();
} }
@ -396,8 +330,6 @@ block_cache::new_block(block_address index)
} }
if (b) { if (b) {
INIT_LIST_HEAD(&b->list_);
INIT_LIST_HEAD(&b->hash_list_);
b->bc_ = this; b->bc_ = this;
b->ref_count_ = 0; b->ref_count_ = 0;
@ -408,7 +340,7 @@ block_cache::new_block(block_address index)
b->index_ = index; b->index_ = index;
setup_control_block(*b); setup_control_block(*b);
hash_insert(*b); block_set_.insert(*b);
} }
return b; return b;
@ -455,9 +387,6 @@ block_cache::block_cache(int fd, sector_t block_size, uint64_t on_disk_blocks, s
{ {
int r; int r;
unsigned nr_cache_blocks = calc_nr_cache_blocks(mem, block_size); unsigned nr_cache_blocks = calc_nr_cache_blocks(mem, block_size);
unsigned nr_buckets = calc_nr_buckets(nr_cache_blocks);
buckets_.resize(nr_buckets);
fd_ = fd; fd_ = fd;
block_size_ = block_size; block_size_ = block_size;
@ -473,12 +402,7 @@ block_cache::block_cache(int fd, sector_t block_size, uint64_t on_disk_blocks, s
throw std::runtime_error("io_setup failed"); throw std::runtime_error("io_setup failed");
} }
hash_init(nr_buckets); blocks_memory_.reset(new std::vector<block>(nr_cache_blocks));
INIT_LIST_HEAD(&free_);
INIT_LIST_HEAD(&errored_);
INIT_LIST_HEAD(&dirty_);
INIT_LIST_HEAD(&clean_);
INIT_LIST_HEAD(&io_pending_);
r = init_free_list(nr_cache_blocks); r = init_free_list(nr_cache_blocks);
if (r) if (r)
@ -552,30 +476,31 @@ block_cache::block *
block_cache::lookup_or_read_block(block_address index, unsigned flags, block_cache::lookup_or_read_block(block_address index, unsigned flags,
validator::ptr v) validator::ptr v)
{ {
block *b = hash_lookup(index); auto it = block_set_.find(index, cmp_index());
if (b) { if (it != block_set_.end()) {
if (b->test_flags(BF_IO_PENDING)) { if (it->test_flags(BF_IO_PENDING)) {
inc_miss_counter(flags); inc_miss_counter(flags);
wait_specific(*b); wait_specific(*it);
} else } else
inc_hit_counter(flags); inc_hit_counter(flags);
if (flags & GF_ZERO) if (flags & GF_ZERO)
zero_block(*b); zero_block(*it);
else { else {
if (b->v_.get() != v.get()) { if (it->v_.get() != v.get()) {
if (b->test_flags(BF_DIRTY)) if (it->test_flags(BF_DIRTY))
b->v_->prepare(b->data_, b->index_); it->v_->prepare(it->data_, it->index_);
v->check(b->data_, b->index_); v->check(it->data_, it->index_);
} }
} }
b->v_ = v; it->v_ = v;
return &(*it);
} else { } else {
inc_miss_counter(flags); inc_miss_counter(flags);
b = new_block(index); block *b = new_block(index);
if (b) { if (b) {
if (flags & GF_ZERO) if (flags & GF_ZERO)
zero_block(*b); zero_block(*b);
@ -587,9 +512,9 @@ block_cache::lookup_or_read_block(block_address index, unsigned flags,
b->v_ = v; b->v_ = v;
} }
}
return (!b || b->error_) ? NULL : b; return (!b || b->error_) ? NULL : b;
}
} }
block_cache::block & block_cache::block &
@ -644,7 +569,8 @@ block_cache::release(block_cache::block &b)
if (b.test_flags(BF_DIRTY)) { if (b.test_flags(BF_DIRTY)) {
if (!b.test_flags(BF_PREVIOUSLY_DIRTY)) { if (!b.test_flags(BF_PREVIOUSLY_DIRTY)) {
list_move_tail(&b.list_, &dirty_); b.unlink();
dirty_.push_back(b);
nr_dirty_++; nr_dirty_++;
b.set_flags(BF_PREVIOUSLY_DIRTY); b.set_flags(BF_PREVIOUSLY_DIRTY);
} }
@ -661,19 +587,18 @@ block_cache::release(block_cache::block &b)
int int
block_cache::flush() block_cache::flush()
{ {
block *b, *tmp; while (!dirty_.empty()) {
block &b = dirty_.front();
list_for_each_entry_safe (b, tmp, &dirty_, list_) { if (b.ref_count_ || b.test_flags(BF_IO_PENDING))
if (b->ref_count_ || b->test_flags(BF_IO_PENDING))
// The superblock may well be still locked. // The superblock may well be still locked.
continue; continue;
issue_write(*b); issue_write(b);
} }
wait_all(); wait_all();
return list_empty(&errored_) ? 0 : -EIO; return errored_.empty() ? 0 : -EIO;
} }
void void
@ -681,11 +606,12 @@ block_cache::prefetch(block_address index)
{ {
check_index(index); check_index(index);
block *b = hash_lookup(index); auto it = block_set_.find(index, cmp_index());
if (!b) {
if (it == block_set_.end()) {
prefetches_++; prefetches_++;
b = new_block(index); block *b = new_block(index);
if (b) if (b)
issue_read(*b); issue_read(*b);
} }

@ -1,18 +1,36 @@
#ifndef BLOCK_CACHE_H #ifndef BLOCK_CACHE_H
#define BLOCK_CACHE_H #define BLOCK_CACHE_H
#include "block-cache/list.h" #include <boost/intrusive/list.hpp>
#include <boost/intrusive/set.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/noncopyable.hpp> #include <boost/noncopyable.hpp>
#include <boost/shared_ptr.hpp>
#include <stdexcept> #include <functional>
#include <iostream>
#include <libaio.h> #include <libaio.h>
#include <memory> #include <memory>
#include <stdexcept>
#include <stdint.h> #include <stdint.h>
#include <stdlib.h> #include <stdlib.h>
#include <vector> #include <vector>
namespace bi = boost::intrusive;
//----------------------------------------------------------------
// FIXME: move to own file in base
template<class P, class M>
size_t offsetof__(const M P::*member)
{
return (size_t) &( reinterpret_cast<P*>(0)->*member);
}
template<class P, class M>
P *container_of(M *ptr, M const P::*member)
{
return (P *)((char *)(ptr) - offsetof__(member));
}
//---------------------------------------------------------------- //----------------------------------------------------------------
namespace bcache { namespace bcache {
@ -50,7 +68,14 @@ namespace bcache {
public: public:
block() block()
: v_() { : v_() {
INIT_LIST_HEAD(&list_); }
bool operator <(block const &rhs) const {
return index_ > rhs.index_;
}
bool operator ==(block const &rhs) const {
return index_ == rhs.index_;
} }
// Do not give this class a destructor, it wont get // Do not give this class a destructor, it wont get
@ -92,16 +117,21 @@ namespace bcache {
bc_->release(*this); bc_->release(*this);
} }
void unlink() {
list_hook_.unlink();
}
private: private:
friend class block_cache; friend class block_cache;
friend class cmp_index;
block_cache *bc_; block_cache *bc_;
uint64_t index_; uint64_t index_;
void *data_; void *data_;
list_head list_; bi::list_member_hook<bi::link_mode<bi::auto_unlink>> list_hook_;
list_head hash_list_; bi::set_member_hook<> set_hook_;
unsigned ref_count_; unsigned ref_count_;
@ -112,6 +142,57 @@ namespace bcache {
validator::ptr v_; validator::ptr v_;
}; };
<<<<<<< HEAD
=======
struct cmp_index {
bool operator()(block_address index, block const &b) const {
return index > b.index_;
}
bool operator()(block const &b, block_address index) const {
return b.index_ > index;
}
};
class auto_block {
public:
auto_block()
: b_(0) {
}
auto_block(block &b)
: b_(&b) {
}
~auto_block() {
put();
}
auto_block &operator =(block &b) {
put();
b_ = &b;
return *this;
}
void *get_data() const {
if (b_)
return b_->get_data();
throw std::runtime_error("auto_block not set");
}
private:
void put() {
if (b_) {
b_->put();
b_ = 0;
}
}
block *b_;
};
>>>>>>> 7fadc34... [block-cache] convert to use boost::intrusive rather than kernel style lists.
//-------------------------------- //--------------------------------
block_cache(int fd, sector_t block_size, block_cache(int fd, sector_t block_size,
@ -137,24 +218,24 @@ namespace bcache {
void prefetch(block_address index); void prefetch(block_address index);
private: private:
typedef bi::member_hook<block,
bi::list_member_hook<bi::link_mode<bi::auto_unlink>>,
&block::list_hook_> list_hook_option;
typedef bi::list<block, list_hook_option,
bi::constant_time_size<false>> block_list;
int init_free_list(unsigned count); int init_free_list(unsigned count);
void exit_free_list();
block *__alloc_block(); block *__alloc_block();
void complete_io(block &b, int result); void complete_io(block &b, int result);
void issue_low_level(block &b, enum io_iocb_cmd opcode, const char *desc); void issue_low_level(block &b, enum io_iocb_cmd opcode, const char *desc);
void issue_read(block &b); void issue_read(block &b);
void issue_write(block &b); void issue_write(block &b);
void wait_io(); void wait_io();
list_head *__categorise(block &b); block_list &__categorise(block &b);
void hit(block &b); void hit(block &b);
void wait_all(); void wait_all();
void wait_specific(block &b); void wait_specific(block &b);
unsigned writeback(unsigned count); unsigned writeback(unsigned count);
void hash_init(unsigned nr_buckets);
unsigned hash(uint64_t index);
block *hash_lookup(block_address index);
void hash_insert(block &b);
void hash_remove(block &b);
void setup_control_block(block &b); void setup_control_block(block &b);
block *find_unused_clean_block(); block *find_unused_clean_block();
block *new_block(block_address index); block *new_block(block_address index);
@ -163,6 +244,7 @@ namespace bcache {
unsigned calc_nr_buckets(unsigned nr_blocks); unsigned calc_nr_buckets(unsigned nr_blocks);
void zero_block(block &b); void zero_block(block &b);
block *lookup_or_read_block(block_address index, unsigned flags, validator::ptr v); block *lookup_or_read_block(block_address index, unsigned flags, validator::ptr v);
void exit_free_list();
void preemptive_writeback(); void preemptive_writeback();
void release(block_cache::block &block); void release(block_cache::block &block);
@ -178,9 +260,8 @@ namespace bcache {
uint64_t nr_data_blocks_; uint64_t nr_data_blocks_;
uint64_t nr_cache_blocks_; uint64_t nr_cache_blocks_;
// We can't use auto_ptr or unique_ptr because the memory is allocated with malloc std::unique_ptr<std::vector<block>> blocks_memory_;
void *blocks_memory_; unsigned char *blocks_data_;
void *blocks_data_;
io_context_t aio_context_; io_context_t aio_context_;
std::vector<io_event> events_; std::vector<io_event> events_;
@ -189,23 +270,23 @@ namespace bcache {
* Blocks on the free list are not initialised, apart from the * Blocks on the free list are not initialised, apart from the
* b.data field. * b.data field.
*/ */
list_head free_; block_list free_;
list_head errored_; block_list errored_;
list_head dirty_; block_list dirty_;
list_head clean_; block_list clean_;
unsigned nr_locked_; unsigned nr_locked_;
unsigned nr_dirty_; unsigned nr_dirty_;
unsigned nr_io_pending_; unsigned nr_io_pending_;
struct list_head io_pending_; block_list io_pending_;
/* typedef bi::member_hook<block,
* Hash table fields. bi::set_member_hook<>,
*/ &block::set_hook_> block_option;
unsigned nr_buckets_; typedef bi::set<block, block_option,
unsigned mask_; bi::constant_time_size<false>> block_set;
std::vector<list_head> buckets_; block_set block_set_;
// Stats // Stats
unsigned read_hits_; unsigned read_hits_;