[btree_builder] Fix issues with under-populated shared nodes

A pre-built node could be under-populated (less than half-full) for the
following reasons:

- A single shared leaf generated by dm-thin key removal. Its residency
  could drop below 50% before it reaches the merge threshold (33% or 44%,
  depending on its context).
- A shared root, which could have any nr_entries.
- Underfull shared nodes (less than 33% residency) caused by kernel issues.

To avoid producing under-populated nodes, pre-built nodes of these kinds,
except the roots, are merged into their siblings.
commit e158dc7601
parent bd39b570ef
Author: Ming-Hung Tsai
Date:   2021-07-30 22:37:01 +08:00

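The whole patch below revolves around one residency test: a node holding fewer than max_entries_per_node / 2 entries counts as under-populated. A minimal standalone sketch of that test, not the project's code; the 126-entry capacity is an assumed figure for illustration only:

    // Minimal sketch of the under-population test the builder relies on.
    // The leaf capacity is a hypothetical number, not the real on-disk value.
    fn is_under_populated(nr_entries: usize, max_entries_per_node: usize) -> bool {
        let half_full = max_entries_per_node / 2;
        nr_entries < half_full
    }

    fn main() {
        let max_entries = 126; // hypothetical leaf capacity
        // A shared leaf whittled down to 40 entries by dm-thin removals sits
        // below 50% residency: the builder re-packs its values instead of
        // sharing the block.
        assert!(is_under_populated(40, max_entries));
        // At 63 entries (exactly half full) the node may be shared as-is.
        assert!(!is_under_populated(63, max_entries));
    }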

@@ -272,13 +272,19 @@ impl<'a, V: Pack + Unpack + Clone> NodeBuilder<V> {
             shared,
         }
     }
 
     /// Push a single value. This may emit a new node, hence the Result
     /// return type. The value's ref count will be incremented.
     pub fn push_value(&mut self, w: &mut WriteBatcher, key: u64, val: V) -> Result<()> {
+        // Unshift the previously pushed node since it is not the root
+        let half_full = self.max_entries_per_node / 2;
+        if self.nodes.len() == 1 && (self.nodes.last().unwrap().nr_entries < half_full) {
+            self.unshift_node(w)?;
+        }
+
         // Have we got enough values to emit a node? We try and keep
         // at least max_entries_per_node entries unflushed so we
         // can ensure the final node is balanced properly.
-        if self.values.len() == self.max_entries_per_node * 2 {
+        else if self.values.len() == self.max_entries_per_node * 2 {
             self.emit_node(w)?;
         }
@@ -287,6 +293,19 @@ impl<'a, V: Pack + Unpack + Clone> NodeBuilder<V> {
         Ok(())
     }
 
+    // To avoid writing an under populated node we have to grab some
+    // values from the first of the shared nodes.
+    fn append_values(&mut self, w: &mut WriteBatcher, node: &NodeSummary) -> Result<()> {
+        let (keys, values) = self.read_node(w, node.block)?;
+
+        for i in 0..keys.len() {
+            self.value_rc.inc(&values[i])?;
+            self.values.push_back((keys[i], values[i].clone()));
+        }
+
+        Ok(())
+    }
+
     /// Push a number of prebuilt, shared nodes. The builder may decide to not
     /// use a shared node, instead reading the values and packing them
     /// directly. This may do IO to emit nodes, so returns a Result.
@@ -296,45 +315,69 @@ impl<'a, V: Pack + Unpack + Clone> NodeBuilder<V> {
     pub fn push_nodes(&mut self, w: &mut WriteBatcher, nodes: &[NodeSummary]) -> Result<()> {
         assert!(!nodes.is_empty());
 
+        // Assume that the node is a shared root if it is the first comer.
+        // A rooted leaf could have any number of entries.
+        let maybe_root = (nodes.len() == 1) && self.nodes.is_empty() && self.values.is_empty();
+        if maybe_root {
+            let n = &nodes[0];
+            w.sm.lock().unwrap().inc(n.block, 1)?;
+            self.nodes.push(n.clone());
+            return Ok(());
+        }
+
         // As a sanity check we make sure that all the shared nodes contain the
         // minimum nr of entries.
+        // A single shared node could be possibly under populated (less than half-full)
+        // due to btree removal, or even underfull (<33% residency) due to kernel issues.
+        // Those kinds of nodes will be merged into their siblings.
         let half_full = self.max_entries_per_node / 2;
-        for n in nodes {
-            if n.nr_entries < half_full {
-                panic!("under populated node");
-            }
-        }
+        if nodes.len() > 1 {
+            for n in nodes {
+                if n.nr_entries < half_full {
+                    panic!("under populated node");
+                }
+            }
+        }
+
+        // Unshift the previously pushed node since it is not the root
+        if self.nodes.len() == 1 && (self.nodes.last().unwrap().nr_entries < half_full) {
+            self.unshift_node(w)?;
+        }
 
         // Decide if we're going to use the pre-built nodes.
         if !self.values.is_empty() && (self.values.len() < half_full) {
-            // To avoid writing an under populated node we have to grab some
-            // values from the first of the shared nodes.
-            let (keys, values) = self.read_node(w, nodes.get(0).unwrap().block)?;
-            for i in 0..keys.len() {
-                self.value_rc.inc(&values[i])?;
-                self.values.push_back((keys[i], values[i].clone()));
-            }
+            let mut nodes_iter = nodes.iter();
+            let n = nodes_iter.next();
+            self.append_values(w, n.unwrap())?;
 
-            // Flush all the values.
-            self.emit_all(w)?;
+            // Do not flush if there's no succeeding nodes,
+            // so that it could produce a more compact metadata.
+            if nodes.len() > 1 {
+                // Flush all the values.
+                self.emit_all(w)?;
 
-            // Add the remaining nodes.
-            for i in 1..nodes.len() {
-                let n = nodes.get(i).unwrap();
-                w.sm.lock().unwrap().inc(n.block, 1)?;
-                self.nodes.push(n.clone());
-            }
+                // Add the remaining nodes.
+                for n in nodes_iter {
+                    w.sm.lock().unwrap().inc(n.block, 1)?;
+                    self.nodes.push(n.clone());
+                }
+            }
         } else {
             // Flush all the values.
             self.emit_all(w)?;
 
-            // add the nodes
-            for n in nodes {
-                w.sm.lock().unwrap().inc(n.block, 1)?;
-                self.nodes.push(n.clone());
-            }
+            if nodes[0].nr_entries < half_full {
+                // An under populated nodes[0] implies nodes.len() == 1,
+                // and that has to be merged into their siblings.
+                self.append_values(w, &nodes[0])?;
+            } else {
+                // Add the nodes.
+                for n in nodes {
+                    w.sm.lock().unwrap().inc(n.block, 1)?;
+                    self.nodes.push(n.clone());
+                }
+            }
         }
 
         Ok(())
     }
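Taken together, push_nodes() now separates three cases: a lone first-comer node is assumed to be a shared root and kept whatever its residency; a single under-populated leaf has its values re-read and packed with its siblings via append_values(); anything at least half full is shared as-is by bumping its refcount. A condensed, hypothetical restatement of that decision tree, with simplified types and all I/O, space-map refcounting, and error handling omitted (the real builder also merges the first node whenever its own value buffer runs short, which this sketch leaves out):

    // Hypothetical, simplified model of the push_nodes() decision tree.
    struct NodeSummary {
        nr_entries: usize,
    }

    enum Action {
        KeepAsRoot,      // lone first-comer: a root may have any nr_entries
        MergeIntoValues, // under-populated leaf: re-read, pack with siblings
        ShareDirectly,   // half-full or better: reference the block unchanged
    }

    fn classify(nodes: &[NodeSummary], builder_is_empty: bool, max_entries: usize) -> Action {
        assert!(!nodes.is_empty());
        let half_full = max_entries / 2;
        if nodes.len() == 1 && builder_is_empty {
            // Assumed to be a shared root, so any residency is acceptable.
            Action::KeepAsRoot
        } else if nodes.len() == 1 && nodes[0].nr_entries < half_full {
            // Only a single pushed node may legally be below half-full.
            Action::MergeIntoValues
        } else {
            Action::ShareDirectly
        }
    }

    fn main() {
        let max_entries = 126; // hypothetical capacity, as above
        let lone = [NodeSummary { nr_entries: 10 }];
        assert!(matches!(classify(&lone, true, max_entries), Action::KeepAsRoot));
        assert!(matches!(classify(&lone, false, max_entries), Action::MergeIntoValues));
        let siblings = [NodeSummary { nr_entries: 80 }, NodeSummary { nr_entries: 90 }];
        assert!(matches!(classify(&siblings, false, max_entries), Action::ShareDirectly));
    }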