Skip to content
Snippets Groups Projects
Commit 91df5ea0 authored by GRASLAND Hadrien's avatar GRASLAND Hadrien
Browse files

First steps towards a tile status search tree

parent 9d8f3287
No related branches found
No related tags found
No related merge requests found
Pipeline #339064 failed
...@@ -249,12 +249,23 @@ bool set_error(in uint new_error) { ...@@ -249,12 +249,23 @@ bool set_error(in uint new_error) {
// work-group, with an outer ring of 1 pixels removed. To process multiple steps // work-group, with an outer ring of 1 pixels removed. To process multiple steps
// per compute pipeline dispatch, we need to dynamically rebalance work between // per compute pipeline dispatch, we need to dynamically rebalance work between
// work-groups, which entail having a way to track the current processing status // work-groups, which entail having a way to track the current processing status
// of each tile. This is that tracking state. // of each tile. g_tile_status[0] is that tracking state.
// //
// See tile_status.comp for more info on the structure of each status word. // For more details about what status information we track regarding each
// simulation tile and how we encode it into a single uint per tile for
// efficient atomic operations, see the docs of tile_status.comp.
// //
// This buffer must be cleared with a 0x00_ff_0000 pattern before each dispatch // When a work-group runs out of work, it uses the status data within
layout(set = 1, binding = 1) buffer restrict coherent TileStatus { uint[] data; } g_tile_status; // g_tile_status[0] in order to find out what is the best tile to process next.
// But even with a very parallel GPU scanning g_tile_status[0] exhaustively gets
// prohibitively expensive at larger domain sizes. Instead, we speed up the
// search for the smallest status using a tree, stored at the higher indices of
// g_tile_status[i], and described in the docs of tile_map.comp.
//
// This buffer array must be initialized by doing a dispatch in MODE_INIT before
// each MODE_RUN simulation dispatch.
layout(constant_id = 3) const uint TILE_STATUS_TREE_DEPTH = 1;
layout(set = 1, binding = 1) buffer restrict coherent TileStatus { uint[] data; } g_tile_status[TILE_STATUS_TREE_DEPTH + 1];
// Histogram tracking how many tiles have >=N work-groups waiting for them // Histogram tracking how many tiles have >=N work-groups waiting for them
// //
...@@ -282,15 +293,15 @@ layout(set = 1, binding = 2) buffer restrict coherent AwaitCounts { uint[AWAIT_C ...@@ -282,15 +293,15 @@ layout(set = 1, binding = 2) buffer restrict coherent AwaitCounts { uint[AWAIT_C
// === Simulation parameters === // === Simulation parameters ===
// Computation parameters // Computation parameters
layout(constant_id = 3) const float WEIGHT00 = 0.25; layout(constant_id = 4) const float WEIGHT00 = 0.25;
layout(constant_id = 4) const float WEIGHT01 = 0.5; layout(constant_id = 5) const float WEIGHT01 = 0.5;
layout(constant_id = 5) const float WEIGHT02 = 0.25; layout(constant_id = 6) const float WEIGHT02 = 0.25;
layout(constant_id = 6) const float WEIGHT10 = 0.5; layout(constant_id = 7) const float WEIGHT10 = 0.5;
layout(constant_id = 7) const float WEIGHT11 = 0.0; layout(constant_id = 8) const float WEIGHT11 = 0.0;
layout(constant_id = 8) const float WEIGHT12 = 0.5; layout(constant_id = 9) const float WEIGHT12 = 0.5;
layout(constant_id = 9) const float WEIGHT20 = 0.25; layout(constant_id = 10) const float WEIGHT20 = 0.25;
layout(constant_id = 10) const float WEIGHT21 = 0.5; layout(constant_id = 11) const float WEIGHT21 = 0.5;
layout(constant_id = 11) const float WEIGHT22 = 0.25; layout(constant_id = 12) const float WEIGHT22 = 0.25;
mat3 weights() { mat3 weights() {
const float center = -( WEIGHT00 + WEIGHT01 + WEIGHT02 const float center = -( WEIGHT00 + WEIGHT01 + WEIGHT02
+ WEIGHT10 + WEIGHT12 + WEIGHT10 + WEIGHT12
...@@ -300,15 +311,15 @@ mat3 weights() { ...@@ -300,15 +311,15 @@ mat3 weights() {
WEIGHT20, WEIGHT21, WEIGHT22); WEIGHT20, WEIGHT21, WEIGHT22);
} }
// //
layout(constant_id = 12) const float DIFFUSION_RATE_U = 0.1; layout(constant_id = 13) const float DIFFUSION_RATE_U = 0.1;
layout(constant_id = 13) const float DIFFUSION_RATE_V = 0.05; layout(constant_id = 14) const float DIFFUSION_RATE_V = 0.05;
layout(constant_id = 14) const float FEED_RATE = 0.014; layout(constant_id = 15) const float FEED_RATE = 0.014;
layout(constant_id = 15) const float KILL_RATE = 0.054; layout(constant_id = 16) const float KILL_RATE = 0.054;
layout(constant_id = 16) const float TIME_STEP = 1.0; layout(constant_id = 17) const float TIME_STEP = 1.0;
// Simulation domain size // Simulation domain size
layout(constant_id = 17) const uint DOMAIN_COLS = 1920; layout(constant_id = 18) const uint DOMAIN_COLS = 1920;
layout(constant_id = 18) const uint DOMAIN_ROWS = 1080; layout(constant_id = 19) const uint DOMAIN_ROWS = 1080;
const uvec2 OUTPUT_SIZE = uvec2(DOMAIN_COLS, DOMAIN_ROWS); const uvec2 OUTPUT_SIZE = uvec2(DOMAIN_COLS, DOMAIN_ROWS);
const uvec2 INPUT_SIZE = OUTPUT_SIZE + uvec2(2); const uvec2 INPUT_SIZE = OUTPUT_SIZE + uvec2(2);
...@@ -322,9 +333,9 @@ const uvec2 INPUT_SIZE = OUTPUT_SIZE + uvec2(2); ...@@ -322,9 +333,9 @@ const uvec2 INPUT_SIZE = OUTPUT_SIZE + uvec2(2);
// pressure on the global tile status trackign state and 2/this allows some // pressure on the global tile status trackign state and 2/this allows some
// work-groups to sacrifice themselves to allow others to make progress, // work-groups to sacrifice themselves to allow others to make progress,
// increasing the amount of successful waits. // increasing the amount of successful waits.
layout(constant_id = 19) const uint MIN_POLLS_BEFORE_SWITCH = 1; layout(constant_id = 20) const uint MIN_POLLS_BEFORE_SWITCH = 1;
layout(constant_id = 20) const uint MAX_POLLS_BEFORE_SWITCH = 64; layout(constant_id = 21) const uint MAX_POLLS_BEFORE_SWITCH = 64;
// How many unsuccessful attempts to switch tiles we should tolerate before // How many unsuccessful attempts to switch tiles we should tolerate before
// concluding that something went wrong and the simulation is stuck. // concluding that something went wrong and the simulation is stuck.
layout(constant_id = 21) const uint MAX_SWITCH_ATTEMPTS = 2; layout(constant_id = 22) const uint MAX_SWITCH_ATTEMPTS = 2;
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
// //
// As explained in the documentation of g_tile_status, the simulation domain // As explained in the documentation of g_tile_status, the simulation domain
// is cut in overlapping work-group-sized tiles, and a map of the status of // is cut in overlapping work-group-sized tiles, and a map of the status of
// these tiles is stored in g_tile_status. // these tiles is stored in g_tile_status[0].
// //
// The map is padded with one strip of end-of-simulation statuses (input idx // The map is padded with one strip of end-of-simulation statuses (input idx
// is LAST_BUFFER_IDX) on either side, and further padded on the right and at // is LAST_BUFFER_IDX) on either side, and further padded on the right and at
...@@ -16,6 +16,75 @@ ...@@ -16,6 +16,75 @@
// make a simulation step by looking at the input_idx of its 2D neighbors, // make a simulation step by looking at the input_idx of its 2D neighbors,
// since thanks to the padding edge tiles have fake neighbors that are always // since thanks to the padding edge tiles have fake neighbors that are always
// ready for another simulation step (since they have "reached" the last step). // ready for another simulation step (since they have "reached" the last step).
//
// ---
//
// This is all fine and good until we have a work-group that cannot process its
// original simulation tile and needs to switch to a different one. Ideally,
// we'd like to pick the tile with the lowest status, as status word encoding is
// designed such that this is optimal. But for large simulation domains,
// scanning g_tile_status[0] to find the minimum status takes forever.
// Therefore, we have a tree that speeds up this search, and that's what the
// other elements of the g_tile_status[] array are here for.
//
// The general idea is this:
//
// - To decide if a tile with a certain status can be processed, we need to look
// at the status of neighboring tiles in g_tile_status[0]. This mirrors our
// original stencil problem, but on a wider spatial scale (tiles instead of
// data points) and with a different reduction (tile status comparisons
// instead of Gray-Scott reaction integration).
// - From this observation, we can use a similar domain decomposition strategy
// and slice up g_tile_status[0] in overlapping tiles. And for each of these
// tiles, we can compute what the minimal tile status of inner tiles that can
// move forward (excluding the outer edge) is. This gives us OUTPUT_TILES /
// LOCAL_OUTPUT_SIZE minimal tile statuses, one per tile of tiles, which we
// can write down to another 2D array, g_tile_status[1]. When no tile can
// move forward, we use TILE_STATUS_MAX as a placeholder.
// - If we locate the minimal tile status g_tile_status[1], we find the minimal
// tile status within g_tile_status[0], and the block that contains it. So we
// can scan that block with our work-group, find the minimal tile status
// there, and hopefully quickly steal it before someone else does.
// - Now, for large simulation domains, even g_tile_status[1] may be too
// large to cheaply scan. But that's okay. We just build higher- and
// higher-level g_tile_status[n] maps, where for each we take tiles of
// LOCAL_INPUT_SIZE elements of g_tile_status[n-1] and track the minimum
// status of each tile of g_tile_status[n-1] in a single element of
// g_tile_status[n].
// - In the end, we get to g_tile_status[TILE_STATUS_TREE_DEPTH], which is
// so small that a single work-group can locate the minimal status inside of
// it in a single parallel reduction step, without iteration.
//
// We use this tree to locate the smallest tile status as follows:
//
// - First, we read all of g_tile_status[TILE_STATUS_TREE_DEPTH] inside of the
// active work-group, and we locate the smallest status inside of it.
// - We then load the corresponding tile of
// g_tile_status[TILE_STATUS_TREE_DEPTH-1] and locate the smallest status
// inside of it (whose status value should match)
// - This tells us about a tile inside of
// g_tile_status[TILE_STATUS_TREE_DEPTH-2] where the smallest status should
// lie... and so on until we find a tile inside of g_tile_status[0].
// - We then locate the smallest tile status inside of that tile.
// - If, at any point, the smallest tile status we read does not match our
// expectations, our tile status information is stale, so we retry the search
// from the beginning.
//
// Now, the question is, when should the tree be updated? Doing every
// tile g_tile_status[0] changes seems too expensive, as we're modifying the
// status of each tile multiple times per simulation step so instead I'll try to
// only update the tree at points where a work-group has nothing better to do:
//
// - When the work-group that processes a simulation tile must wait because the
// neighbors of its active tile are not ready, it updates the part of the
// g_tile_status[1..] tree that pertain to the active tile and its neighbors
// (whose status it also changed to indicate that it's waiting for them).
// - Before switching to a different tile, a work-group updates the element of
// each layer of the g_tile_status[1..] tree that pertain to its former tile,
// if they seem out of date.
// - Before exiting due to lack of work, a work-group updates the elements of
// each layer of the g_tile_status[1..] tree that pertain to its former tile
// one last time.
// === Dimensions === // === Dimensions ===
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment