diff --git a/exercises/Cargo.lock b/exercises/Cargo.lock index 2e50d63c8087586016a6f6187a30091c447a9790..c179cf27e4ad15ab70a871f39f504d52afe3a985 100644 --- a/exercises/Cargo.lock +++ b/exercises/Cargo.lock @@ -430,6 +430,17 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +[[package]] +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc", + "winapi", +] + [[package]] name = "errno" version = "0.3.9" @@ -440,6 +451,16 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -494,11 +515,13 @@ checksum = "bdcd9b131fd67bb827b386d0dc63d3e74196a14616ef800acf87ca5fef741a10" dependencies = [ "bitflags 1.3.2", "cfg-if", + "errno 0.2.8", "hdf5-derive", "hdf5-sys", "hdf5-types", "lazy_static", "libc", + "lzf-sys", "ndarray", "parking_lot 0.11.2", "paste", @@ -570,7 +593,7 @@ dependencies = [ "bitflags 2.6.0", "derive_more", "enum-iterator", - "errno", + "errno 0.3.9", "hwlocality-sys", "libc", "num_enum", @@ -729,6 +752,15 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +[[package]] +name = "lzf-sys" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0798d023ce0905e2c77ed96de92aab929ff9db2036cbef4edfee0daf33582aec" +dependencies = [ + "cc", +] + [[package]] name = "malloc_buf" version = "0.0.6" diff --git a/exercises/Cargo.toml b/exercises/Cargo.toml 
index cf5709cead7dc195a3b27e52599ee74de10118a1..94c5797b923e1402072ccb38935ac39d93a2bd95 100644 --- a/exercises/Cargo.toml +++ b/exercises/Cargo.toml @@ -9,7 +9,7 @@ gpu = ["dep:directories", "dep:vulkano", "dep:vulkano-shaders"] [dependencies] clap = { version = "4.5.7", features = ["derive", "env"] } directories = { version = "5.0.1", optional = true } -hdf5 = "0.8.1" +hdf5 = { version = "0.8.1", features = ["lzf"] } hwlocality = "1.0.0-alpha.5" indicatif = "0.17.8" iterator_ilp = "2.1.2" diff --git a/handouts/src/28-compressor.md b/handouts/src/28-compressor.md new file mode 100644 index 0000000000000000000000000000000000000000..8de94b9c9bf897152ffc6d7a4674c1a9733b01a9 --- /dev/null +++ b/handouts/src/28-compressor.md @@ -0,0 +1,92 @@ +# Compression + +During the last exercise, you may have been disappointed by the fact that an +optimization which has a great impact in microbenchmarks (removing the inner +copy in `Concentrations::current_v()`) had no impact on the full simulation +workload. + +This becomes easily understandable, once you realize that the simulation is +currently not CPU-bound, but storage-bound. As you can check by running a system +monitor in parallel with the simulation, the storage device is intermittently at +100% utilization while the simulation is running, indicating that it is very +likely to be the bottleneck. + +Therefore, we actually need to speed up the I/O. And this means that we must now +pay attention to what our HDF5 writer is doing. + + +## A closer look at the I/O code + +Intuitively, performance while the simulation is running can be affected either +by the way the simulation is configured, or by what we are doing on every +storage write: + +```rust,ignore +/// Create or truncate the file +/// +/// The file will be dimensioned to store a certain amount of V species +/// concentration arrays. +/// +/// The `Result` return type indicates that this method can fail and the +/// associated I/O errors must be handled somehow. 
+pub fn create(file_name: &str, shape: [usize; 2], num_images: usize) -> hdf5::Result<Self> { + // The ? syntax lets us propagate errors from an inner function call to + // the caller, when we cannot handle them ourselves. + let file = File::create(file_name)?; + let [rows, cols] = shape; + let dataset = file + .new_dataset::<Float>() + .chunk([1, rows, cols]) + .shape([num_images, rows, cols]) + .create("matrix")?; + Ok(Self { + file, + dataset, + position: 0, + }) +} + +/// Write a new V species concentration table to the file +pub fn write(&mut self, result_v: ArrayView2<Float>) -> hdf5::Result<()> { + self.dataset + .write_slice(result_v, (self.position, .., ..))?; + self.position += 1; + Ok(()) +} +``` + +Obviously, we cannot change much in `write()`, so let's focus on what happens +inside of `create()`. There are two obvious areas of leverage: + +- We can change our hardcoded chunk size of 1 to something larger, and see if + doing I/O at a higher granularity helps. +- Try to enable additional HDF5 options, such as compression, to reduce the + volume of data that is eventually sent to the storage device. + +In which order should we perform these optimizations? Well, compression is +affected by block size, since a larger block feeds the compression engine with more data, +which can be either good (more patterns to compress) or bad (worse CPU cache +locality slowing down the compression algorithm). Therefore, we should try to +enable compression first. + + +## Exercise + +Previous experience from the course's author suggests that on modern NVMe +storage devices, only the LZ4/LZO/LZF family of fast compressors is still +worthwhile. Anything more sophisticated, even Zstandard at compression level 1, +will result in a net slowdown. + +Therefore, please try to enable LZF dataset compression... 
+ +```rust,ignore +let dataset = file + .new_dataset::<Float>() + .chunk([1, rows, cols]) + .lzf() + .shape([num_images, rows, cols]) + .create("matrix")?; +``` + +...and see if it helps or hurts for this particular computation, on your +storage hardware. diff --git a/handouts/src/SUMMARY.md b/handouts/src/SUMMARY.md index 14e2f2aa5dbf559f816b5e291eebfe85ab6e55d5..95565ac1b32f4ab733a619543f49378ee519d27c 100644 --- a/handouts/src/SUMMARY.md +++ b/handouts/src/SUMMARY.md @@ -37,7 +37,8 @@ - [Execution](25-simulating.md) - [Batching](26-batch-system.md) - [Avoiding copies](27-copyright.md) -- [Compression]() +- [Compression](28-compressor.md) +- [Block tuning]() - [Async storage]() - [Harder ideas]()