From 6c0d7c1f6cf344ada3ef366e8ffea8da695373f5 Mon Sep 17 00:00:00 2001
From: Hadrien Grasland <hadrien.grasland@ijclab.in2p3.fr>
Date: Wed, 24 Jul 2024 18:33:54 +0200
Subject: [PATCH] Allow work-group reuse after completing a tile

---
 exercises/src/gpu/grayscott.comp | 49 ++++++++++++++++++++++----------
 exercises/src/gpu/interface.comp | 10 ++++++-
 exercises/src/gpu/mod.rs         |  8 ++----
 exercises/src/gpu/pipeline.rs    |  2 ++
 exercises/src/gpu/shared.comp    |  8 +++++-
 exercises/src/gpu/switch.comp    |  7 +++--
 exercises/src/gpu/tile_map.comp  | 48 +++++++++++++++++++++++++++++++
 exercises/src/gpu/wait.comp      |  2 +-
 8 files changed, 109 insertions(+), 25 deletions(-)

diff --git a/exercises/src/gpu/grayscott.comp b/exercises/src/gpu/grayscott.comp
index da65477..53245c4 100644
--- a/exercises/src/gpu/grayscott.comp
+++ b/exercises/src/gpu/grayscott.comp
@@ -14,6 +14,7 @@
 #include "wait.comp"
 #include "switch.comp"
 
+
 // Initialize the global debug/profiling state
 #if DEBUG || PROFILE
     void init_global() {
@@ -86,6 +87,12 @@
     }
 #endif
 
+// Entry point of the MODE_INIT dispatch: sets up global state before a run
+void init_simulation() {
+    global_init_remaining_tiles();  // Reset the count of unfinished tiles
+    clear_tile_map();
+}
+
 // Truth that this work-group must switch to a different tile
 shared bool s_must_switch;
 
@@ -190,13 +197,19 @@ void init_shared() {
     }
 }
 
-// Main entry point of the Gray-Scott reaction simulation
+// Entry point that executes the simulation
 void run_simulation() {
-    // Initialize global, shared and local state
+    // Early exit if the work-group is not needed
     #if DEBUG || PROFILE
-        init_global();
         if(is_leader()) atomicAdd(g_metadata.entry, 1);
     #endif
+    if (work_group_finished()) return;
+
+    // Set up global, shared and local state
+    #if DEBUG || PROFILE
+        init_global();
+        if(is_leader()) atomicAdd(g_metadata.has_work, 1);
+    #endif
     init_shared();
     vec2 uv_cache;
     barrier();
@@ -211,7 +224,7 @@ void run_simulation() {
     //   simulation step (same or greater input_buffer_idx).
     // - uv_cache contains the UV data from step `s_input_buffer_idx` or is
     //   marked invalid by setting `s_uv_cache_valid` to false.
-    while(g_metadata.error == ERROR_NONE) {
+    while((work_group_finished() == false) && (g_metadata.error == ERROR_NONE)) {
         #if DEBUG || PROFILE
             if (is_leader()) {
                 s_iteration += 1;
@@ -224,8 +237,8 @@ void run_simulation() {
 
         // Switch to a different tile if we have to
         //
-        // This should always succeed, failure to do so is a fatal error that
-        // will lead the entire dispatch to abord ASAP.
+        // This almost always succeeds. If it fails, it means that the
+        // work-group has no hope of ever finding work again and should exit.
         //
         // This will clear s_acquired_tile, and thus execute the next block too.
         if (s_must_switch && !try_switch_tile()) break;
@@ -282,11 +295,9 @@ void run_simulation() {
 
         // Terminate work-group once it finishes a tile
         if (s_input_buffer_idx == LAST_BUFFER_IDX) {
-            #if DEBUG || PROFILE
-                if (is_leader()) atomicAdd(g_metadata.finish_final_step, 1);
-                barrier();
-            #endif
-            break;
+            if (is_leader()) leader_notify_tile_finished();
+            barrier();
+            continue;
         }
         #if DEBUG
             if (is_leader()) s_checkpoint = 5;
@@ -306,6 +317,14 @@ void run_simulation() {
         #endif
     }
 
+    // Liberate any acquired tile
+    if (s_acquired_tile && is_leader()) {
+        leader_release_tile();
+        #if DEBUG || PROFILE
+            atomicAdd(g_metadata.final_release, 1);
+        #endif
+    }
+
     // Aggregate local execution statistics into global execution statistics
     #if PROFILE
         #define MERGE_HIST(histogram)  \
@@ -348,13 +367,13 @@ void run_simulation() {
     #endif
 }
 
-// Main entry point that dispatches to the tile status clear or simulation
+// Main entry point that dispatches to simulation initialization or execution
 void main() {
     switch (g_push.mode) {
-        case 0:
-            clear_tile_map();
+        case MODE_INIT:
+            init_simulation();
             break;
-        case 1:
+        case MODE_RUN:
             run_simulation();
             break;
         default:
diff --git a/exercises/src/gpu/interface.comp b/exercises/src/gpu/interface.comp
index 8383c35..15bc105 100644
--- a/exercises/src/gpu/interface.comp
+++ b/exercises/src/gpu/interface.comp
@@ -12,8 +12,10 @@ const uint NUM_BUFFERS = MAX_STEPS + 1;
 const uint LAST_BUFFER_IDX = NUM_BUFFERS - 1;
 layout(push_constant) uniform PushConstants {
     uint num_steps;  // Number of computation steps to be performed
-    uint mode;  // 0 for clearing the tile status, 1 for running the simulation
+    uint mode;  // See MODE_ constants below
 } g_push;
+const uint MODE_INIT = 0;
+const uint MODE_RUN = 1;
 
 // Last input buffer index
 uint last_input_buffer_idx() {
@@ -76,6 +78,9 @@ layout(set = 1, binding = 0) buffer restrict coherent Metadata {
         uvec2 work_group_size;
     #endif
 
+    // Number of simulation tiles that are not fully completed yet
+    uint remaining_tiles;
+
     // Detailed error context reporting in debug builds
     #if DEBUG
         // Work-group and work-item that encountered the first error
@@ -111,6 +116,7 @@ layout(set = 1, binding = 0) buffer restrict coherent Metadata {
     // Dispatch-wide work-group control flow tracking
     #if DEBUG || PROFILE
         uint entry;
+        uint has_work;
         uint iteration;
         uint try_switch_tile;  // See also hist_search_distance and hist_switch_distance
         uint try_switch_tile_release;
@@ -128,6 +134,7 @@ layout(set = 1, binding = 0) buffer restrict coherent Metadata {
         uint try_await_neighbors;
         uint try_await_neighbors_failure;
         uint try_await_neighbors_success;  // See also hist_log2_polls_to_success
+        uint final_release;
     #endif
 
     // Optional detailed performance monitoring
@@ -187,6 +194,7 @@ const uint ERROR_3D_WORK_GROUP = 20;
 const uint ERROR_INVALID_UV_LEN = 21;
 const uint ERROR_INVALID_TILE_STATUS_LEN = 22;
 const uint ERROR_INVALID_MODE = 23;
+const uint ERROR_INVALID_REMAINING_TILES = 24;
 
 // Set the metadata error
 //
diff --git a/exercises/src/gpu/mod.rs b/exercises/src/gpu/mod.rs
index f61e4d3..f91da4a 100644
--- a/exercises/src/gpu/mod.rs
+++ b/exercises/src/gpu/mod.rs
@@ -2,7 +2,7 @@
 
 use self::{
     context::{CommandBufferAllocator, VulkanContext},
-    pipeline::{create_pipeline, IMAGES_SET, MODE, NUM_STEPS, STATIC_SET},
+    pipeline::{create_pipeline, IMAGES_SET, MODE, MODE_INIT, MODE_RUN, NUM_STEPS, STATIC_SET},
     resources::{Concentrations, StaticComputeState},
 };
 use crate::{
@@ -70,11 +70,9 @@ fn add_update(
             IMAGES_SET,
             concentrations,
         )?
-        // FIXME: Investigate ways to merge clear into main shader
-        .push_constants(pipeline_layout.clone(), MODE, 0)?
+        .push_constants(pipeline_layout.clone(), MODE, MODE_INIT)?
         .dispatch(static_state.tile_status_clear_dispatch())?
-        .push_constants(pipeline_layout.clone(), MODE, 1)?
-        //
+        .push_constants(pipeline_layout.clone(), MODE, MODE_RUN)?
         .dispatch(dispatch_size)?;
     Ok(())
 }
diff --git a/exercises/src/gpu/pipeline.rs b/exercises/src/gpu/pipeline.rs
index c3864ac..ffb4cbd 100644
--- a/exercises/src/gpu/pipeline.rs
+++ b/exercises/src/gpu/pipeline.rs
@@ -85,6 +85,8 @@ pub const NUM_STEPS: u32 = 0;
 /// Integer push constant used to indicate if we should set up the simulation
 /// state by clearing the tile status (0) or run the simulation (1)
 pub const MODE: u32 = 4;
+pub const MODE_INIT: u32 = 0;
+pub const MODE_RUN: u32 = 1;
 
 /// Create the compute pipeline
 pub fn create_pipeline(
diff --git a/exercises/src/gpu/shared.comp b/exercises/src/gpu/shared.comp
index f91b27d..0695ba6 100644
--- a/exercises/src/gpu/shared.comp
+++ b/exercises/src/gpu/shared.comp
@@ -7,7 +7,13 @@
 //!
 //! After this, the work-group will attempt to move the simulation to the next
 //! step again and again, hopefully until the last simulation step is reached.
-//! At this point, the work-group would stop executing.
+//!
+//! At this point, the work-group will take note that there is one fewer tile
+//! to be processed, which will cause one work-group to exit. We prioritize
+//! killing work-groups with higher IDs, as these tend to be auxiliary
+//! work-groups that are spawned by the GPU but do not do anything until other
+//! work-groups have exited. These work-groups are not useful to us, as we are
+//! trying to make the most of each work-group here.
 //!
 //! If the work-group manages to process several simulation steps in a row,
 //! work-items will be able to go faster by keeping the previous (U, V) data
diff --git a/exercises/src/gpu/switch.comp b/exercises/src/gpu/switch.comp
index 1290f52..2162076 100644
--- a/exercises/src/gpu/switch.comp
+++ b/exercises/src/gpu/switch.comp
@@ -425,6 +425,9 @@ bool find_tile(in ivec2 center_block_idx) {
 //
 // This will call leader_release_tile() for you if needed, so at the end of this
 // function it is guaranteed that `s_acquired_tile` is false.
+//
+// This function will almost always return true. If it returns false, it means
+// that this work-group cannot ever expect to find work again and should exit.
 bool try_switch_tile() {
     #if DEBUG || PROFILE
         if (is_leader()) s_try_switch_tile += 1;
@@ -463,8 +466,8 @@ bool try_switch_tile() {
     uint retries;
     for (retries = 0; retries < MAX_SWITCH_ATTEMPTS; ++retries) {
         if (find_tile(center_block_idx)) break;
-        // Maximize odds of a tile_status cache flush => Up-to-date view
-        memoryBarrierBuffer();
+        if (work_group_finished()) return false;
+        memoryBarrierBuffer();  // Maximize odds of fresh find_tile reads
     }
 
     // Handle search success or failure
diff --git a/exercises/src/gpu/tile_map.comp b/exercises/src/gpu/tile_map.comp
index 6279696..3f7b205 100644
--- a/exercises/src/gpu/tile_map.comp
+++ b/exercises/src/gpu/tile_map.comp
@@ -18,6 +18,7 @@
 
 // Number of simulation tiles (and thus dispatch size)
 const uvec2 OUTPUT_SIZE_IN_TILES = DIV_CEIL_UVEC2(OUTPUT_SIZE, LOCAL_OUTPUT_SIZE);
+const uint NUM_ACTIVE_TILES = OUTPUT_SIZE_IN_TILES.x * OUTPUT_SIZE_IN_TILES.y;
 
 // The g_tile_status map is actually padded such that the number of tiles is a
 // multiple of the local output size, and there is an extra strip of tile status
@@ -27,6 +28,53 @@ const uvec2 OUTPUT_TILES = NEXT_MULTIPLE_UVEC2(OUTPUT_SIZE_IN_TILES,
 const uvec2 INPUT_TILES = OUTPUT_TILES + uvec2(2);
 
 
+// === Tracking of completed tiles ===
+
+// Set up tracking of fully processed tiles
+//
+// Must be done in a separate initialization dispatch before the main simulation
+// dispatch so that every work-group sees it.
+void global_init_remaining_tiles() {
+    if (gl_GlobalInvocationID == uvec3(0)) {  // Single writer: invocation (0, 0, 0)
+        g_metadata.remaining_tiles = NUM_ACTIVE_TILES;  // No tile finished yet
+    }
+}
+
+// (Leader-only) Notify other work-groups that one more tile is fully processed
+void leader_notify_tile_finished() {
+    #if DEBUG
+        if (!is_leader()) {  // Debug builds flag calls from non-leader work-items
+            set_error(ERROR_NOT_LEADER);
+            return;
+        }
+    #endif
+    const uint prev_remaining = atomicAdd(g_metadata.remaining_tiles,
+                                          0xffffffff /* -1 */);
+    #if DEBUG
+        // Pre-decrement count must lie within 1..=NUM_ACTIVE_TILES
+        if ((prev_remaining == 0) || (prev_remaining > NUM_ACTIVE_TILES)) {
+            if (set_error(ERROR_INVALID_REMAINING_TILES)) {  // Only if it returns true
+                g_metadata.invalid_uint = prev_remaining;  // Keep the bad count
+            }
+        }
+    #endif
+    #if DEBUG || PROFILE
+        atomicAdd(g_metadata.finish_final_step, 1);  // Count finished tiles
+    #endif
+}
+
+// Truth that this work-group is not needed anymore
+//
+// Each time a tile is fully processed, one less work-group is needed for the
+// remaining tiles. Work-groups with larger row-major dispatch indices exit
+// first, as they are more likely to be extraneous work-groups that the GPU
+// spawned but will not run until the main work-groups are done. The row stride
+// is gl_NumWorkGroups.x (dispatch width), not the local gl_WorkGroupSize.x.
+bool work_group_finished() {
+    const uint work_group_idx = gl_WorkGroupID.x + gl_WorkGroupID.y * gl_NumWorkGroups.x;
+    return (work_group_idx >= g_metadata.remaining_tiles);
+}
+
+
 // === Basic reads and writes ===
 
 // Check that a tile index is correct
diff --git a/exercises/src/gpu/wait.comp b/exercises/src/gpu/wait.comp
index 1a5e599..9e62d02 100644
--- a/exercises/src/gpu/wait.comp
+++ b/exercises/src/gpu/wait.comp
@@ -133,7 +133,7 @@ bool try_wait_for_neighbors() {
 
             // ...wait a bit more...
             for (num_polls = 2; num_polls <= s_max_polls; ++num_polls) {
-                memoryBarrierBuffer();
+                memoryBarrierBuffer();  // Maximize odds of fresh tile_status reads
                 neighbor_status = read_tile_status(neighbor_idx);
                 decode_tile_status(neighbor_status,
                                    neighbor_input_idx,
-- 
GitLab