diff --git a/exercises/src/gpu/interface.comp b/exercises/src/gpu/interface.comp index 0af6f4e7f1c0c34b5824cf16623c66b9bb820c52..4d117e2b560bd779c8b4916daeb7f99c5820b6bb 100644 --- a/exercises/src/gpu/interface.comp +++ b/exercises/src/gpu/interface.comp @@ -249,12 +249,23 @@ bool set_error(in uint new_error) { // work-group, with an outer ring of 1 pixels removed. To process multiple steps // per compute pipeline dispatch, we need to dynamically rebalance work between // work-groups, which entail having a way to track the current processing status -// of each tile. This is that tracking state. +// of each tile. g_tile_status[0] is that tracking state. // -// See tile_status.comp for more info on the structure of each status word. +// For more details about what status information we track regarding each +// simulation tile and how we encode it into a single uint per tile for +// efficient atomic operations, see the docs of tile_status.comp. // -// This buffer must be cleared with a 0x00_ff_0000 pattern before each dispatch -layout(set = 1, binding = 1) buffer restrict coherent TileStatus { uint[] data; } g_tile_status; +// When a work-group runs out of work, it uses the status data within +// g_tile_status[0] in order to find out what is the best tile to process next. +// But even with a very parallel GPU scanning g_tile_status[0] exhaustively gets +// prohibitively expensive at larger domain sizes. Instead, we speed up the +// search for the smallest status using a tree, stored at the higher indices of +// g_tile_status[i], and described in the docs of tile_map.comp. +// +// This buffer array must be initialized by doing a dispatch in MODE_INIT before +// each MODE_RUN simulation dispatch. 
+layout(constant_id = 3) const uint TILE_STATUS_TREE_DEPTH = 1; +layout(set = 1, binding = 1) buffer restrict coherent TileStatus { uint[] data; } g_tile_status[TILE_STATUS_TREE_DEPTH + 1]; // Histogram tracking how many tiles have >=N work-groups waiting for them // @@ -282,15 +293,15 @@ layout(set = 1, binding = 2) buffer restrict coherent AwaitCounts { uint[AWAIT_C // === Simulation parameters === // Computation parameters -layout(constant_id = 3) const float WEIGHT00 = 0.25; -layout(constant_id = 4) const float WEIGHT01 = 0.5; -layout(constant_id = 5) const float WEIGHT02 = 0.25; -layout(constant_id = 6) const float WEIGHT10 = 0.5; -layout(constant_id = 7) const float WEIGHT11 = 0.0; -layout(constant_id = 8) const float WEIGHT12 = 0.5; -layout(constant_id = 9) const float WEIGHT20 = 0.25; -layout(constant_id = 10) const float WEIGHT21 = 0.5; -layout(constant_id = 11) const float WEIGHT22 = 0.25; +layout(constant_id = 4) const float WEIGHT00 = 0.25; +layout(constant_id = 5) const float WEIGHT01 = 0.5; +layout(constant_id = 6) const float WEIGHT02 = 0.25; +layout(constant_id = 7) const float WEIGHT10 = 0.5; +layout(constant_id = 8) const float WEIGHT11 = 0.0; +layout(constant_id = 9) const float WEIGHT12 = 0.5; +layout(constant_id = 10) const float WEIGHT20 = 0.25; +layout(constant_id = 11) const float WEIGHT21 = 0.5; +layout(constant_id = 12) const float WEIGHT22 = 0.25; mat3 weights() { const float center = -( WEIGHT00 + WEIGHT01 + WEIGHT02 + WEIGHT10 + WEIGHT12 @@ -300,15 +311,15 @@ mat3 weights() { WEIGHT20, WEIGHT21, WEIGHT22); } // -layout(constant_id = 12) const float DIFFUSION_RATE_U = 0.1; -layout(constant_id = 13) const float DIFFUSION_RATE_V = 0.05; -layout(constant_id = 14) const float FEED_RATE = 0.014; -layout(constant_id = 15) const float KILL_RATE = 0.054; -layout(constant_id = 16) const float TIME_STEP = 1.0; +layout(constant_id = 13) const float DIFFUSION_RATE_U = 0.1; +layout(constant_id = 14) const float DIFFUSION_RATE_V = 0.05; 
+layout(constant_id = 15) const float FEED_RATE = 0.014; +layout(constant_id = 16) const float KILL_RATE = 0.054; +layout(constant_id = 17) const float TIME_STEP = 1.0; // Simulation domain size -layout(constant_id = 17) const uint DOMAIN_COLS = 1920; -layout(constant_id = 18) const uint DOMAIN_ROWS = 1080; +layout(constant_id = 18) const uint DOMAIN_COLS = 1920; +layout(constant_id = 19) const uint DOMAIN_ROWS = 1080; const uvec2 OUTPUT_SIZE = uvec2(DOMAIN_COLS, DOMAIN_ROWS); const uvec2 INPUT_SIZE = OUTPUT_SIZE + uvec2(2); @@ -322,9 +333,9 @@ const uvec2 INPUT_SIZE = OUTPUT_SIZE + uvec2(2); // pressure on the global tile status trackign state and 2/this allows some // work-groups to sacrifice themselves to allow others to make progress, // increasing the amount of successful waits. -layout(constant_id = 19) const uint MIN_POLLS_BEFORE_SWITCH = 1; -layout(constant_id = 20) const uint MAX_POLLS_BEFORE_SWITCH = 64; +layout(constant_id = 20) const uint MIN_POLLS_BEFORE_SWITCH = 1; +layout(constant_id = 21) const uint MAX_POLLS_BEFORE_SWITCH = 64; // How many unsuccessful attempts to switch tiles we should tolerate before // concluding that something went wrong and the simulation is stuck. -layout(constant_id = 21) const uint MAX_SWITCH_ATTEMPTS = 2; +layout(constant_id = 22) const uint MAX_SWITCH_ATTEMPTS = 2; diff --git a/exercises/src/gpu/tile_map.comp b/exercises/src/gpu/tile_map.comp index 0d07a339b42d43e8dcdc92f69ece12f259f6f591..8bb6888724e05fd3c028461230c65ae59df66a72 100644 --- a/exercises/src/gpu/tile_map.comp +++ b/exercises/src/gpu/tile_map.comp @@ -2,7 +2,7 @@ // // As explained in the documentation of g_tile_status, the simulation domain // is cut in overlapping work-group-sized tiles, and a map of the status of -// these tiles is stored in g_tile_status. +// these tiles is stored in g_tile_status[0]. 
// // The map is padded with one strip of end-of-simulation statuses (input idx // is LAST_BUFFER_IDX) on either side, and further padded on the right and at @@ -16,6 +16,75 @@ // make a simulation step by looking at the input_idx of its 2D neighbors, // since thanks to the padding edge tiles have fake neighbors that are always // ready for another simulation step (since they have "reached" the last step). +// +// --- +// +// This is all fine and good until we have a work-group that cannot process its +// original simulation tile and needs to switch to a different one. Ideally, +// we'd like to pick the tile with the lowest status, as status word encoding is +// designed such that this is optimal. But for large simulation domains, +// scanning g_tile_status[0] to find the minimum status takes forever. +// Therefore, we have a tree that speeds up this search, and that's what the +// other elements of the g_tile_status[] array are here for. +// +// The general idea is this: +// +// - To decide if a tile with a certain status can be processed, we need to look +// at the status of neighboring tiles in g_tile_status[0]. This mirrors our +// original stencil problem, but on a wider spatial scale (tiles instead of +// data points) and with a different reduction (tile status comparisons +// instead of Gray-Scott reaction integration). +// - From this observation, we can use a similar domain decomposition strategy +// and slice up g_tile_status[0] in overlapping tiles. And for each of these +// tiles, we can compute what the minimal tile status of inner tiles that can +// move forward (excluding the outer edge) is. This gives us OUTPUT_TILES / +// LOCAL_OUTPUT_SIZE minimal tile statuses, one per tile of tiles, which we +// can write down to another 2D array, g_tile_status[1]. When no tile can +// move forward, we use TILE_STATUS_MAX as a placeholder. 
+// - If we locate the minimal tile status within g_tile_status[1], we find the minimal +// tile status within g_tile_status[0], and the block that contains it. So we +// can scan that block with our work-group, find the minimal tile status +// there, and hopefully quickly steal it before someone else does. +// - Now, for large simulation domains, even g_tile_status[1] may be too +// large to cheaply scan. But that's okay. We just build higher- and +// higher-level g_tile_status[n] maps, where for each we take tiles of +// LOCAL_INPUT_SIZE elements of g_tile_status[n-1] and track the minimum +// status of each tile of g_tile_status[n-1] in a single element of +// g_tile_status[n]. +// - In the end, we get to g_tile_status[TILE_STATUS_TREE_DEPTH], which is +// so small that a single work-group can locate the minimal status inside of +// it in a single parallel reduction step, without iteration. +// +// We use this tree to locate the smallest tile status as follows: +// +// - First, we read all of g_tile_status[TILE_STATUS_TREE_DEPTH] inside of the +// active work-group, and we locate the smallest status inside of it. +// - We then load the corresponding tile of +// g_tile_status[TILE_STATUS_TREE_DEPTH-1] and locate the smallest status +// inside of it (whose status value should match). +// - This tells us about a tile inside of +// g_tile_status[TILE_STATUS_TREE_DEPTH-2] where the smallest status should +// lie... and so on until we find a tile inside of g_tile_status[0]. +// - We then locate the smallest tile status inside of that tile. +// - If, at any point, the smallest tile status we read does not match our +// expectations, our tile status information is stale, so we retry the search +// from the beginning. +// +// Now, the question is, when should the tree be updated? 
Doing so on +// every tile status change within g_tile_status[0] seems too expensive, as we're modifying the +// status of each tile multiple times per simulation step, so instead I'll try to +// only update the tree at points where a work-group has nothing better to do: +// +// - When the work-group that processes a simulation tile must wait because the +// neighbors of its active tile are not ready, it updates the part of the +// g_tile_status[1..] tree that pertains to the active tile and its neighbors +// (whose status it also changed to indicate that it's waiting for them). +// - Before switching to a different tile, a work-group updates the elements of +// each layer of the g_tile_status[1..] tree that pertain to its former tile, +// if they seem out of date. +// - Before exiting due to lack of work, a work-group updates the elements of +// each layer of the g_tile_status[1..] tree that pertain to its former tile +// one last time. // === Dimensions ===