From 6c0d7c1f6cf344ada3ef366e8ffea8da695373f5 Mon Sep 17 00:00:00 2001 From: Hadrien Grasland <hadrien.grasland@ijclab.in2p3.fr> Date: Wed, 24 Jul 2024 18:33:54 +0200 Subject: [PATCH] Allow work-group reuse after completing a tile --- exercises/src/gpu/grayscott.comp | 49 ++++++++++++++++++++++---------- exercises/src/gpu/interface.comp | 10 ++++++- exercises/src/gpu/mod.rs | 8 ++---- exercises/src/gpu/pipeline.rs | 2 ++ exercises/src/gpu/shared.comp | 8 +++++- exercises/src/gpu/switch.comp | 7 +++-- exercises/src/gpu/tile_map.comp | 48 +++++++++++++++++++++++++++++++ exercises/src/gpu/wait.comp | 2 +- 8 files changed, 109 insertions(+), 25 deletions(-) diff --git a/exercises/src/gpu/grayscott.comp b/exercises/src/gpu/grayscott.comp index da65477..53245c4 100644 --- a/exercises/src/gpu/grayscott.comp +++ b/exercises/src/gpu/grayscott.comp @@ -14,6 +14,7 @@ #include "wait.comp" #include "switch.comp" + // Initialize the global debug/profiling state #if DEBUG || PROFILE void init_global() { @@ -86,6 +87,12 @@ } #endif +// Entry point that sets up global state, preparing for a simulation run +void init_simulation() { + global_init_remaining_tiles(); + clear_tile_map(); +} + // Truth that this work-group must switch to a different tile shared bool s_must_switch; @@ -190,13 +197,19 @@ void init_shared() { } } -// Main entry point of the Gray-Scott reaction simulation +// Entry point that executes the simulation void run_simulation() { - // Initialize global, shared and local state + // Early exit if the work-group is not needed #if DEBUG || PROFILE - init_global(); if(is_leader()) atomicAdd(g_metadata.entry, 1); #endif + if (work_group_finished()) return; + + // Set up global, shared and local state + #if DEBUG || PROFILE + init_global(); + if(is_leader()) atomicAdd(g_metadata.has_work, 1); + #endif init_shared(); vec2 uv_cache; barrier(); @@ -211,7 +224,7 @@ void run_simulation() { // simulation step (same or greater input_buffer_idx). 
// - uv_cache contains the UV data from step `s_input_buffer_idx` or is // marked invalid by setting `s_uv_cache_valid` to false. - while(g_metadata.error == ERROR_NONE) { + while((work_group_finished() == false) && (g_metadata.error == ERROR_NONE)) { #if DEBUG || PROFILE if (is_leader()) { s_iteration += 1; @@ -224,8 +237,8 @@ void run_simulation() { // Switch to a different tile if we have to // - // This should always succeed, failure to do so is a fatal error that - // will lead the entire dispatch to abord ASAP. + // This almost always succeeds, if it fails it should mean that the + // work-group has no hope of ever finding work again and should exit. // // This will clear s_acquired_tile, and thus execute the next block too. if (s_must_switch && !try_switch_tile()) break; @@ -282,11 +295,9 @@ void run_simulation() { // Terminate work-group once it finishes a tile if (s_input_buffer_idx == LAST_BUFFER_IDX) { - #if DEBUG || PROFILE - if (is_leader()) atomicAdd(g_metadata.finish_final_step, 1); - barrier(); - #endif - break; + if (is_leader()) leader_notify_tile_finished(); + barrier(); + continue; } #if DEBUG if (is_leader()) s_checkpoint = 5; @@ -306,6 +317,14 @@ void run_simulation() { #endif } + // Liberate any acquired tile + if (s_acquired_tile && is_leader()) { + leader_release_tile(); + #if DEBUG || PROFILE + atomicAdd(g_metadata.final_release, 1); + #endif + } + // Aggregate local execution statistics into global execution statistics #if PROFILE #define MERGE_HIST(histogram) \ @@ -348,13 +367,13 @@ void run_simulation() { #endif } -// Main entry point that dispatches to the tile status clear or simulation +// Main entry point that dispatches to simulation initialization or execution void main() { switch (g_push.mode) { - case 0: - clear_tile_map(); + case MODE_INIT: + init_simulation(); break; - case 1: + case MODE_RUN: run_simulation(); break; default: diff --git a/exercises/src/gpu/interface.comp b/exercises/src/gpu/interface.comp index 
8383c35..15bc105 100644 --- a/exercises/src/gpu/interface.comp +++ b/exercises/src/gpu/interface.comp @@ -12,8 +12,10 @@ const uint NUM_BUFFERS = MAX_STEPS + 1; const uint LAST_BUFFER_IDX = NUM_BUFFERS - 1; layout(push_constant) uniform PushConstants { uint num_steps; // Number of computation steps to be performed - uint mode; // 0 for clearing the tile status, 1 for running the simulation + uint mode; // See MODE_ constants below } g_push; +const uint MODE_INIT = 0; +const uint MODE_RUN = 1; // Last input buffer index uint last_input_buffer_idx() { @@ -76,6 +78,9 @@ layout(set = 1, binding = 0) buffer restrict coherent Metadata { uvec2 work_group_size; #endif + // Number of simulation tiles that are not fully completed yet + uint remaining_tiles; + // Detailed error context reporting in debug builds #if DEBUG // Work-group and work-item that encountered the first error @@ -111,6 +116,7 @@ layout(set = 1, binding = 0) buffer restrict coherent Metadata { // Dispatch-wide work-group control flow tracking #if DEBUG || PROFILE uint entry; + uint has_work; uint iteration; uint try_switch_tile; // See also hist_search_distance and hist_switch_distance uint try_switch_tile_release; @@ -128,6 +134,7 @@ layout(set = 1, binding = 0) buffer restrict coherent Metadata { uint try_await_neighbors; uint try_await_neighbors_failure; uint try_await_neighbors_success; // See also hist_log2_polls_to_success + uint final_release; #endif // Optional detailed performance monitoring @@ -187,6 +194,7 @@ const uint ERROR_3D_WORK_GROUP = 20; const uint ERROR_INVALID_UV_LEN = 21; const uint ERROR_INVALID_TILE_STATUS_LEN = 22; const uint ERROR_INVALID_MODE = 23; +const uint ERROR_INVALID_REMAINING_TILES = 24; // Set the metadata error // diff --git a/exercises/src/gpu/mod.rs b/exercises/src/gpu/mod.rs index f61e4d3..f91da4a 100644 --- a/exercises/src/gpu/mod.rs +++ b/exercises/src/gpu/mod.rs @@ -2,7 +2,7 @@ use self::{ context::{CommandBufferAllocator, VulkanContext}, - 
pipeline::{create_pipeline, IMAGES_SET, MODE, NUM_STEPS, STATIC_SET}, + pipeline::{create_pipeline, IMAGES_SET, MODE, MODE_INIT, MODE_RUN, NUM_STEPS, STATIC_SET}, resources::{Concentrations, StaticComputeState}, }; use crate::{ @@ -70,11 +70,9 @@ fn add_update( IMAGES_SET, concentrations, )? - // FIXME: Investigate ways to merge clear into main shader - .push_constants(pipeline_layout.clone(), MODE, 0)? + .push_constants(pipeline_layout.clone(), MODE, MODE_INIT)? .dispatch(static_state.tile_status_clear_dispatch())? - .push_constants(pipeline_layout.clone(), MODE, 1)? - // + .push_constants(pipeline_layout.clone(), MODE, MODE_RUN)? .dispatch(dispatch_size)?; Ok(()) } diff --git a/exercises/src/gpu/pipeline.rs b/exercises/src/gpu/pipeline.rs index c3864ac..ffb4cbd 100644 --- a/exercises/src/gpu/pipeline.rs +++ b/exercises/src/gpu/pipeline.rs @@ -85,6 +85,8 @@ pub const NUM_STEPS: u32 = 0; /// Integer push constant used to indicate if we should set up the simulation /// state by clearing the tile status (0) or run the simulation (1) pub const MODE: u32 = 4; +pub const MODE_INIT: u32 = 0; +pub const MODE_RUN: u32 = 1; /// Create the compute pipeline pub fn create_pipeline( diff --git a/exercises/src/gpu/shared.comp b/exercises/src/gpu/shared.comp index f91b27d..0695ba6 100644 --- a/exercises/src/gpu/shared.comp +++ b/exercises/src/gpu/shared.comp @@ -7,7 +7,13 @@ //! //! After this, the work-group will attempt to move the simulation to the next //! step again and again, hopefully until the last simulation step is reached. -//! At this point, the work-group would stop executing. +//! +//! At this point, the work-group will take note that there is one less tile to +//! be processed, which will cause one work-group to exit. We prioritize killing +//! workgroups with higher IDs, as these tend to be auxiliary work-groups that +//! are spawned by the GPU but do not do anything until other workgroups have +//! exited. 
These workgroups are not useful to us, as we are trying to make the +//! most of each workgroup here. //! //! If the work-group manages to process several simulation steps in a row, //! work-items will be able to go faster by keeping the previous (U, V) data diff --git a/exercises/src/gpu/switch.comp b/exercises/src/gpu/switch.comp index 1290f52..2162076 100644 --- a/exercises/src/gpu/switch.comp +++ b/exercises/src/gpu/switch.comp @@ -425,6 +425,9 @@ bool find_tile(in ivec2 center_block_idx) { // // This will call leader_release_tile() for you if needed, so at the end of this // function it is guaranteed that `s_acquired_tile` is false. +// +// This function will almost always return true. If it returns false, it means +// that this work-group cannot ever expect to find work again and should exit. bool try_switch_tile() { #if DEBUG || PROFILE if (is_leader()) s_try_switch_tile += 1; @@ -463,8 +466,8 @@ bool try_switch_tile() { uint retries; for (retries = 0; retries < MAX_SWITCH_ATTEMPTS; ++retries) { if (find_tile(center_block_idx)) break; - // Maximize odds of a tile_status cache flush => Up-to-date view - memoryBarrierBuffer(); + if (work_group_finished()) return false; + memoryBarrierBuffer(); // Maximize odds of fresh find_tile reads } // Handle search success or failure diff --git a/exercises/src/gpu/tile_map.comp b/exercises/src/gpu/tile_map.comp index 6279696..3f7b205 100644 --- a/exercises/src/gpu/tile_map.comp +++ b/exercises/src/gpu/tile_map.comp @@ -18,6 +18,7 @@ // Number of simulation tiles (and thus dispatch size) const uvec2 OUTPUT_SIZE_IN_TILES = DIV_CEIL_UVEC2(OUTPUT_SIZE, LOCAL_OUTPUT_SIZE); +const uint NUM_ACTIVE_TILES = OUTPUT_SIZE_IN_TILES.x * OUTPUT_SIZE_IN_TILES.y; // The g_tile_status map is actually padded such that the number of tiles is a // multiple of the local output size, and there is an extra strip of tile status @@ -27,6 +28,53 @@ const uvec2 OUTPUT_TILES = NEXT_MULTIPLE_UVEC2(OUTPUT_SIZE_IN_TILES, const uvec2 INPUT_TILES = 
OUTPUT_TILES + uvec2(2); +// === Tracking of completed tiles === + +// Set up tracking of fully processed tiles +// +// Must be done in a separate initialization dispatch before the main simulation +// dispatch so that every work-group sees it. +void global_init_remaining_tiles() { + if (gl_GlobalInvocationID == uvec3(0)) { + g_metadata.remaining_tiles = NUM_ACTIVE_TILES; + } +} + +// (Leader-only) Notify other work-groups that one more tile is fully processed +void leader_notify_tile_finished() { + #if DEBUG + if (!is_leader()) { + set_error(ERROR_NOT_LEADER); + return; + } + #endif + const uint prev_remaining = atomicAdd(g_metadata.remaining_tiles, + 0xffffffff /* -1 */); + #if DEBUG + if ((prev_remaining == 0) || (prev_remaining > NUM_ACTIVE_TILES)) { + if (set_error(ERROR_INVALID_REMAINING_TILES)) { + g_metadata.invalid_uint = prev_remaining; + } + } + #endif + #if DEBUG || PROFILE + atomicAdd(g_metadata.finish_final_step, 1); + #endif +} + +// Truth that this work-group is not needed anymore +// +// Each time a simulation tile is fully processed, one less work-group is +// necessary to process remaining tiles. We eliminate work-groups with larger +// dispatch indices first because these are more likely to be extraneous +// work-groups that were spawned by the GPU but will not run until the main +// work-groups have completed their work. +bool work_group_finished() { + const uint work_group_idx = gl_WorkGroupID.x + gl_WorkGroupID.y * gl_NumWorkGroups.x; + return (work_group_idx >= g_metadata.remaining_tiles); +} + + // === Basic reads and writes === // Check that a tile index is correct diff --git a/exercises/src/gpu/wait.comp b/exercises/src/gpu/wait.comp index 1a5e599..9e62d02 100644 --- a/exercises/src/gpu/wait.comp +++ b/exercises/src/gpu/wait.comp @@ -133,7 +133,7 @@ bool try_wait_for_neighbors() { // ...wait a bit more... 
for (num_polls = 2; num_polls <= s_max_polls; ++num_polls) { - memoryBarrierBuffer(); + memoryBarrierBuffer(); // Maximize odds of fresh tile_status reads neighbor_status = read_tile_status(neighbor_idx); decode_tile_status(neighbor_status, neighbor_input_idx, -- GitLab