Commit 3d2481ec authored by Hadrien G's avatar Hadrien G
Browse files

Remove packet block size autotuning, it doesn't pull its weight

parent ae4e8379
Pipeline #159889 passed with stages
in 33 seconds
......@@ -116,83 +116,20 @@ constexpr size_t NUM_VISIBILITIES = NUM_FEEDS * (NUM_FEEDS + 1) / 2;
// arithmetic instructions. However, there are also limits to how far we can go:
//
// - More packets mean more SIMD registers used by feed data. If we request too
// much, we spill out of CPU registers, and performance will be terrible.
// much, we spill out of CPU registers, and this costs performance.
// - More packets mean a more complex loop, which means that the compiler may be
// more hesitant to perform aggressive loop optimizations like full unrolling.
// more hesitant to perform aggressive loop optimizations like full unrolling,
// and if it does the generated code may spill out of CPU (micro)code caches.
// - More packets mean more L1d cache pressure in the inner loop, which can in
// turn reduce memory performance.
//
// The default setting auto-tunes the number of packets per block to achieve
// sufficient arithmetic intensity on current Intel CPUs. Specific CPUs may
// benefit from a slightly higher setting.
// Given this tradeoff, it is hard to predict what will be optimal on a
// particular combination of compiler and CPU model. The default was measured
// to be optimal on Intel Cascade Lake, Comet Lake and AMD Zen 3 CPUs, driven by
// various GCC and clang versions.
//
#ifndef PACKETS_PER_BLOCK
// Compile-time ceiling: rounds `val` up to the nearest integer and returns it
// as a size_t. Intended for non-negative inputs that fit in a size_t, since the
// float -> size_t conversion truncates toward zero and is undefined for values
// that are out of range for size_t.
constexpr size_t constexpr_ceil(float val) {
    // Integer part of the input (fractional part discarded).
    const size_t whole = static_cast<size_t>(val);
    // Any leftover fractional part pushes the result up by one.
    return (val > static_cast<float>(whole)) ? whole + 1 : whole;
}
// Compile-time estimate of the number of packets to process per block.
//
// Returns the smallest packet count for which the mul-add work of one block is
// at least the minimum required to balance that block's accumulator loads and
// stores (MIN_PACKETS_PER_BLOCK_STORE_BOUND). Compilation fails via
// static_assert if that count exceeds what the SIMD register budget allows, or
// if the per-packet mul-add count is too low relative to measurement loads.
//
// Relies on NUM_FEEDS, NUM_VISIBILITIES and NUM_SIMD_REGISTERS, which are
// defined outside this function.
constexpr size_t autotune_packets_per_block() {
// Machine properties of the correlation computation
//
// FIXME: Will need to be kept in sync with correlation algorithm, in
// particular once feed blocks are introduced. Figure out a way to
// ensure this synchronization as automatically as possible.
//
// Accumulator traffic per block: two memory operations per visibility
// (presumably one load and one store of each accumulator -- confirm against
// the correlation kernel).
constexpr float ACC_LOAD_STORES_PER_PACKET_BLOCK = 2 * NUM_VISIBILITIES;
// Measurement loads issued for each packet, split by which feed they read.
constexpr float MES_LOADS_PER_PACKET =
2 * NUM_FEEDS /* feed1 */ + 2 * (NUM_VISIBILITIES - NUM_FEEDS) /* feed2 */;
// Multiply-add operations issued for each packet.
constexpr float MUL_ADDS_PER_PACKET =
2 * NUM_FEEDS /* self-corr */ + 4 * (NUM_VISIBILITIES - NUM_FEEDS) /* cross-corr */;
// Register budget model: a fixed number of registers for the computation
// itself, plus a per-packet cost for holding feed data.
constexpr size_t MIN_COMPUTE_REGISTERS = 3; /* empirically observed */
constexpr size_t EXTRA_REGISTERS_PER_PACKET = 2 /* re+im */ * 2 /* feed1+feed2 */;
// FIXME: This hardware model is an Intel-ism, cross-check how well it applies
// to other CPUs when broader perf portability will be desired.
#ifdef __FMA__
// Since Haswell, high-end Intel CPUs can do 2 FMAs, 2 loads and 1 store
// per cycle. Hence given 2 cycles they can do 4 FMAs, 4 loads and 2 stores.
// On those CPUs, we must do 4 mul-adds for every 4 loads and 2 stores.
constexpr float MIN_MUL_ADDS_PER_4LOADS_2STORES = 4;
#else
// Before that, Intel CPUs could do 2 muls, 2 adds, and either 2 loads
// or 1 load and 1 store every 2 cycles. So in 6 cycles they could do 6 muls
// 6 adds, 4 loads and 2 stores. On those CPUs, we must do 6 mul-adds for
// every 4 loads and 2 stores.
constexpr float MIN_MUL_ADDS_PER_4LOADS_2STORES = 6;
#endif
// For load-bound code (i.e. all other measurement loads), the situation
// hasn't changed across Intel CPU generations.
constexpr float MIN_MUL_ADDS_PER_EXTRA_LOAD = 1;
// First, we want to achieve sufficient arithmetic intensity with respect
// to accumulator loads and stores, which eat into precious store bandwidth.
// The accumulator traffic is expressed in "4 loads + 2 stores" groups, then
// converted into a minimum mul-add count per block, and finally into a
// packet count (rounded up, since a fractional packet is not possible).
constexpr float NUM_4LOADS_2STORES_PER_PACKET_BLOCK =
ACC_LOAD_STORES_PER_PACKET_BLOCK / 2;
constexpr float MIN_MUL_ADDS_PER_PACKET_BLOCK_STORE_BOUND =
MIN_MUL_ADDS_PER_4LOADS_2STORES * NUM_4LOADS_2STORES_PER_PACKET_BLOCK;
constexpr size_t MIN_PACKETS_PER_BLOCK_STORE_BOUND =
constexpr_ceil(MIN_MUL_ADDS_PER_PACKET_BLOCK_STORE_BOUND / MUL_ADDS_PER_PACKET);
// Second, we want to achieve sufficient arithmetic intensity with respect to
// measurement loads not covered by the above "4 loads 2 stores" arithmetic
// intensity criterion. We have no leverage on this with the current algorithm
// (this will eventually be taken care of via feed blocking), so for now we
// just do a rough approximate check that things seem to work out.
static_assert(MUL_ADDS_PER_PACKET >= MIN_MUL_ADDS_PER_EXTRA_LOAD * MES_LOADS_PER_PACKET);
// While arithmetic intensity is very important, we must also ensure that we
// do not blow up our register budget. If we do, it means we just can't run
// efficiently on the target hardware...
// Integer division: rounds the register-bound packet count down.
constexpr size_t MAX_PACKETS_PER_BLOCK_REGISTER_BOUND =
(NUM_SIMD_REGISTERS - MIN_COMPUTE_REGISTERS) / EXTRA_REGISTERS_PER_PACKET;
static_assert(MIN_PACKETS_PER_BLOCK_STORE_BOUND <= MAX_PACKETS_PER_BLOCK_REGISTER_BOUND);
// And with that, we're done.
return MIN_PACKETS_PER_BLOCK_STORE_BOUND;
}
// Auto-tune the number of packets per block according to current criteria
// NOTE(review): this page renders the commit's removed and added lines without
// +/- markers. Given the commit title ("Remove packet block size autotuning"),
// the autotuned definition is presumably the removed line and the literal 3 its
// replacement -- confirm against the raw diff before reusing this text as code.
constexpr size_t PACKETS_PER_BLOCK = autotune_packets_per_block();
constexpr size_t PACKETS_PER_BLOCK = 3;
#endif
// Number of packets per radio feed (freely tunable parameter)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment