From 937d84fa23e81b8be0685fd4de385c53a64fe6b4 Mon Sep 17 00:00:00 2001 From: Hadrien Grasland <hadrien.grasland@gmx.fr> Date: Sun, 7 Jul 2024 21:29:56 +0200 Subject: [PATCH] Add copy elision optimization --- handouts/src/26-up-to-date.md | 2 +- handouts/src/27-copyright.md | 144 ++++++++++++++++++++++++++++++++++ handouts/src/SUMMARY.md | 4 +- 3 files changed, 148 insertions(+), 2 deletions(-) create mode 100644 handouts/src/27-copyright.md diff --git a/handouts/src/26-up-to-date.md b/handouts/src/26-up-to-date.md index ed22a18..d40dab8 100644 --- a/handouts/src/26-up-to-date.md +++ b/handouts/src/26-up-to-date.md @@ -5,7 +5,7 @@ should hopefully now run for you. Now let's tune it for better performance, starting with the `gpu::update()` function. -## Problem statement +## Why `update()`? The `gpu::update()` function is very performance-critical because it is the only nontrivial function that runs once per compute step, and therefore the only diff --git a/handouts/src/27-copyright.md b/handouts/src/27-copyright.md new file mode 100644 index 0000000..9b76ddf --- /dev/null +++ b/handouts/src/27-copyright.md @@ -0,0 +1,144 @@ +# Avoiding copies + +After cleaning up the CPU side of the simulation inner loop, let us now look +into the outer loop, which generates images and saves them to disk. + +Since writing to disk is mostly handled by HDF5, we will first look into the +code which generates the output images, on which we have more leverage. + + +## A problematic copy + +Remember that to simplify the porting of the code from CPU to GPU, we initially +made the GPU version of `Concentrations` expose a `current_v()` interface with +the same happy-path return type as its CPU conterpart... + +```rust,ignore +pub fn current_v(&mut self) -> Result<&Array2<Float>, Box<dyn Error>> { +``` + +...which forced us to engage in some ugly data copying at the end of the +function: + +```rust,ignore + // Access the CPU-side buffer + let v_data = self.v_buffer.read()?; + let v_target = self + .v_ndarray + .as_slice_mut() + .expect("Failed to access ndarray as slice"); + v_target.copy_from_slice(&v_data); + Ok(&self.v_ndarray) +} +``` + +As a trip through a CPU profiler will tell you, that copy is where most of the +simulation time is spent in our current default configuration, where an output +image is emitted every 34 simulation steps. + +Of course, one way to speed up this copy would be to parallelize it. But it +would be more satisfying and more efficient to get rid of it entirely, along +with the associated `v_ndarray` member of the `Concentrations` struct. Let's see +what it would take. + + +## What's in a buffer `read()`? + +To access the `vulkano` buffer into which the simulation results where +downloaded, we have used [the `Subbuffer::read()` +method](https://docs.rs/vulkano/latest/vulkano/buffer/subbuffer/struct.Subbuffer.html#method.read). +This method, in turn, is here to support `vulkano`'s goal of extending Rust's +memory safety guarantees to GPU-accessible memory. + +In Vulkan, both the GPU and the CPU can access the contents of a buffer. But +when we are working in Rust, they have to do so in a manner that upholds Rust's +invariants. At any point in time, either only one code path can access the inner +data for writing, or any number of code paths can access it in a read-only +fashion. This is not a guarantee that is built in to the Vulkan API, so +`vulkano` has to implement it, and it opted to so at run time using a +reader-writer lock. `Subbuffer::read()` is how we acquire this lock for reading +on the CPU side. + +Of course, if we acquire a lock, we must release it at some point. This is done +using an RAII type called +[`BufferReadGuard`](https://docs.rs/vulkano/latest/vulkano/buffer/subbuffer/struct.BufferReadGuard.html) +which lets us access the reader data while we hold it, and will automatically +release the lock when we drop it. Unfortunately, this design means that we +cannot just wrap the lock's inner data into an `ArrayView2` and return it like +this: + +```rust,ignore +use ndarray::ArrayView2; + +// ... + +pub fn current_v(&mut self) -> Result<ArrayView2<'_, Float>, Box<dyn Error>> { + // ... download the data ... + + // Return a 2D array view of the freshly downloaded buffer + let v_data = self.v_buffer.read()?; + // ERROR: Returning a borrow of a stack variable + Ok(ArrayView2::from_shape(self.shape(), &v_data[..])?) +} +``` + +...because if we were allowed to do it, we would be returning a reference to the +memory of `v_buffer` after the `v_data` read lock has been dropped, and then +another thread could trivially start a job that overwrites `v_buffer` and create +a data race. Which is not what Rust stands for. + + +## If not `ArrayView2`, what else? + +The easiest alternative that we have at our disposal[^1] is to return the Vulkan +buffer RAII lock object from `Concentration::current_v()`... + +```rust,ignore +use vulkano::buffer::BufferReadGuard; + +pub fn current_v(&mut self) -> Result<BufferReadGuard<'_, [Float]>, Box<dyn Error>> { + // ... download the data ... + + // Expose the inner buffer's read lock + Ok(self.v_buffer.read()?) +} +``` + +...and then, in `run_simulation()`, turn it into an `ArrayView2` before sending +it to the HDF5 writer: + +```rust,ignore +use ndarray::ArrayView2; + +let shape = concentrations.shape(); +let data = concentrations.current_v()?; +hdf5.write(ArrayView2::from_shape(shape, &data[..])?)?; +``` + +We must then adapt said HDF5 writer so that it accepts an `ArrayView2` instead +of an `&Array2`... + +```rust,ignore +use ndarray::ArrayView2; + +pub fn write(&mut self, result_v: ArrayView2<Float>) -> hdf5::Result<()> { +``` + +...and then modify the CPU version of `run_simulation()` so that it turns the +`&Array2` that it has into an `ArrayView2`, which is done with a simple call to +the `view()` method: + +```rust,ignore +hdf5.write(concentrations.current_v().view())?; +``` + + +## Exercise + +Integrate these changes, and measure their effect on runtime performance. + + +--- + +[^1]: Alternatives with nicer-looking APIs involve creating self-referential + objects, which in Rust are a rather advanced topic to put it mildly. diff --git a/handouts/src/SUMMARY.md b/handouts/src/SUMMARY.md index e9732e7..ceabb95 100644 --- a/handouts/src/SUMMARY.md +++ b/handouts/src/SUMMARY.md @@ -36,7 +36,9 @@ - [Resources](24-resourceful.md) - [Execution](25-simulating.md) - [Faster updates](26-up-to-date.md) -- [TODO: More optimizations]() +- [Avoiding copies](27-copyright.md) +- [Asynchronous I/O]() +- [Harder ideas]() --- -- GitLab