From 49fc30d03b4215986beffca7ac85f2c2344dbc0a Mon Sep 17 00:00:00 2001
From: David Chamont <>
Date: Thu, 6 Jul 2023 12:37:55 -0700
Subject: [PATCH] Ajout d'une variante regroupant explicitement les iterations
 en inter au GPU.

 .../CMakeLists.txt                            |   0
 .../build.bash                                |   0
 .../cmake.bash                                |   0
 {SyclGrayScott => GrayScottBuffers}/run.bash  |   0
 .../src/CMakeLists.txt                        |   4 +-
 .../src/main.cpp                              |   3 +-
 GrayScottDevice/CMakeLists.txt                |  12 ++
 GrayScottDevice/build.bash                    |   3 +
 GrayScottDevice/cmake.bash                    |   5 +
 GrayScottDevice/run.bash                      |   5 +
 GrayScottDevice/src/CMakeLists.txt            |  10 +
 GrayScottDevice/src/main.cpp                  | 177 ++++++++++++++++++
 GrayScottIterations/CMakeLists.txt            |  12 ++
 GrayScottIterations/build.bash                |   3 +
 GrayScottIterations/cmake.bash                |   5 +
 GrayScottIterations/run.bash                  |   5 +
 GrayScottIterations/src/CMakeLists.txt        |  10 +
 GrayScottIterations/src/main.cpp              | 177 ++++++++++++++++++
 GrayScottShared/CMakeLists.txt                |  12 ++
 GrayScottShared/build.bash                    |   3 +
 GrayScottShared/cmake.bash                    |   5 +
 GrayScottShared/run.bash                      |   5 +
 GrayScottShared/src/CMakeLists.txt            |  10 +
 GrayScottShared/src/main.cpp                  | 163 ++++++++++++++++
 SquareDevice/src/main.cpp                     |  19 +-
 SquareShared/src/main.cpp                     |   4 +-
 26 files changed, 637 insertions(+), 15 deletions(-)
 rename {SyclGrayScott => GrayScottBuffers}/CMakeLists.txt (100%)
 rename {SyclGrayScott => GrayScottBuffers}/build.bash (100%)
 rename {SyclGrayScott => GrayScottBuffers}/cmake.bash (100%)
 rename {SyclGrayScott => GrayScottBuffers}/run.bash (100%)
 rename {SyclGrayScott => GrayScottBuffers}/src/CMakeLists.txt (80%)
 rename SyclGrayScott/src/sycl-gray-scott.cpp => GrayScottBuffers/src/main.cpp (98%)
 create mode 100755 GrayScottDevice/CMakeLists.txt
 create mode 100755 GrayScottDevice/build.bash
 create mode 100755 GrayScottDevice/cmake.bash
 create mode 100755 GrayScottDevice/run.bash
 create mode 100755 GrayScottDevice/src/CMakeLists.txt
 create mode 100644 GrayScottDevice/src/main.cpp
 create mode 100755 GrayScottIterations/CMakeLists.txt
 create mode 100755 GrayScottIterations/build.bash
 create mode 100755 GrayScottIterations/cmake.bash
 create mode 100755 GrayScottIterations/run.bash
 create mode 100755 GrayScottIterations/src/CMakeLists.txt
 create mode 100644 GrayScottIterations/src/main.cpp
 create mode 100755 GrayScottShared/CMakeLists.txt
 create mode 100755 GrayScottShared/build.bash
 create mode 100755 GrayScottShared/cmake.bash
 create mode 100755 GrayScottShared/run.bash
 create mode 100755 GrayScottShared/src/CMakeLists.txt
 create mode 100644 GrayScottShared/src/main.cpp

diff --git a/SyclGrayScott/CMakeLists.txt b/GrayScottBuffers/CMakeLists.txt
similarity index 100%
rename from SyclGrayScott/CMakeLists.txt
rename to GrayScottBuffers/CMakeLists.txt
diff --git a/SyclGrayScott/build.bash b/GrayScottBuffers/build.bash
similarity index 100%
rename from SyclGrayScott/build.bash
rename to GrayScottBuffers/build.bash
diff --git a/SyclGrayScott/cmake.bash b/GrayScottBuffers/cmake.bash
similarity index 100%
rename from SyclGrayScott/cmake.bash
rename to GrayScottBuffers/cmake.bash
diff --git a/SyclGrayScott/run.bash b/GrayScottBuffers/run.bash
similarity index 100%
rename from SyclGrayScott/run.bash
rename to GrayScottBuffers/run.bash
diff --git a/SyclGrayScott/src/CMakeLists.txt b/GrayScottBuffers/src/CMakeLists.txt
similarity index 80%
rename from SyclGrayScott/src/CMakeLists.txt
rename to GrayScottBuffers/src/CMakeLists.txt
index 1f5496e..fe04241 100755
--- a/SyclGrayScott/src/CMakeLists.txt
+++ b/GrayScottBuffers/src/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(SOURCE_FILE sycl-gray-scott.cpp)
-set(TARGET_NAME sycl-gray-scott.exe)
+set(SOURCE_FILE main.cpp)
+set(TARGET_NAME main.exe)
 set(COMPILE_FLAGS "-fsycl -Wall")
 set(LINK_FLAGS "-fsycl")
diff --git a/SyclGrayScott/src/sycl-gray-scott.cpp b/GrayScottBuffers/src/main.cpp
similarity index 98%
rename from SyclGrayScott/src/sycl-gray-scott.cpp
rename to GrayScottBuffers/src/main.cpp
index 99f8bc3..f88fd74 100644
--- a/SyclGrayScott/src/sycl-gray-scott.cpp
+++ b/GrayScottBuffers/src/main.cpp
@@ -169,8 +169,7 @@ int main( int argc, char * argv[] ) {
     catch (sycl::exception & e) {
       std::cout << e.what() << std::endl;
-      std::cout << e.category() << std::endl;
-      std::cout << e.code() << std::endl;
+      std::cout << e.code().message() << std::endl;
     catch (std::exception & e) {
       std::cout << e.what() << std::endl;
diff --git a/GrayScottDevice/CMakeLists.txt b/GrayScottDevice/CMakeLists.txt
new file mode 100755
index 0000000..36103fa
--- /dev/null
+++ b/GrayScottDevice/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Direct CMake to use icpx rather than the default C++ compiler/linker
+cmake_minimum_required (VERSION 3.4)
+project(SyclSquare CXX)
+add_subdirectory (src)
diff --git a/GrayScottDevice/build.bash b/GrayScottDevice/build.bash
new file mode 100755
index 0000000..49344dd
--- /dev/null
+++ b/GrayScottDevice/build.bash
@@ -0,0 +1,3 @@
+cd build
+make all
diff --git a/GrayScottDevice/cmake.bash b/GrayScottDevice/cmake.bash
new file mode 100755
index 0000000..869411d
--- /dev/null
+++ b/GrayScottDevice/cmake.bash
@@ -0,0 +1,5 @@
+rm -rf build
+mkdir -p build
+cd build
+cmake ..
diff --git a/GrayScottDevice/run.bash b/GrayScottDevice/run.bash
new file mode 100755
index 0000000..26a9b8e
--- /dev/null
+++ b/GrayScottDevice/run.bash
@@ -0,0 +1,5 @@
+time ./build/main.exe 270 480 5 10000
+#time ./build/main.exe 540 960 5 10000
+#time ./build/main.exe 1080 1920 5 10000
+#time ./build/main.exe 2160 3840 5 1000
diff --git a/GrayScottDevice/src/CMakeLists.txt b/GrayScottDevice/src/CMakeLists.txt
new file mode 100755
index 0000000..fe04241
--- /dev/null
+++ b/GrayScottDevice/src/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(SOURCE_FILE main.cpp)
+set(TARGET_NAME main.exe)
+set(COMPILE_FLAGS "-fsycl -Wall")
+set(LINK_FLAGS "-fsycl")
+add_executable(${TARGET_NAME} ${SOURCE_FILE})
+set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+#add_custom_target(all DEPENDS ${TARGET_NAME})
diff --git a/GrayScottDevice/src/main.cpp b/GrayScottDevice/src/main.cpp
new file mode 100644
index 0000000..acab7db
--- /dev/null
+++ b/GrayScottDevice/src/main.cpp
@@ -0,0 +1,177 @@
+#include <CL/sycl.hpp>
+#include <array>
+#include <iostream>
+#include <iomanip>
+#include <cmath>
+using namespace cl::sycl;
+constexpr float KILL_RATE { 0.062f }; // added to FEED_RATE in the removal term of v
+constexpr float FEED_RATE { 0.03f }; // scales the (1 - u) replenishment term of u
+constexpr float DT { 1.0f }; // factor applied to du and dv when advancing u and v
+constexpr float DIFFUSION_RATE_U { 0.1f }; // weight of the 3x3 neighbour-difference sum of u
+constexpr float DIFFUSION_RATE_V { 0.05f }; // weight of the 3x3 neighbour-difference sum of v
+/// Runs one Gray-Scott step on the device using explicit USM copies.
+///
+/// Copies the padded host fields `iu`/`iv` into the device buffers
+/// `iud`/`ivd`, launches one work-item per interior cell that writes the
+/// updated values into `oud`/`ovd`, then copies those buffers back into
+/// `ou`/`ov`.
+///
+/// Every array holds (nb_rows+2)*(nb_cols+2) floats stored row by row. The
+/// kernel only writes cells with row in [1, nb_rows] and col in [1, nb_cols],
+/// so the one-cell border of `oud`/`ovd` is copied back to the host as is:
+/// the caller must make sure those border cells hold valid values.
+///
+/// @param q        queue used for the copies and for the kernel
+/// @param iu       host input field u (read only)
+/// @param iv       host input field v (read only)
+/// @param ou       host output field u (entirely overwritten by the copy-back)
+/// @param ov       host output field v (entirely overwritten by the copy-back)
+/// @param iud      device buffer receiving `iu`
+/// @param ivd      device buffer receiving `iv`
+/// @param oud      device buffer receiving the updated u
+/// @param ovd      device buffer receiving the updated v
+/// @param nb_rows  number of interior rows (border excluded)
+/// @param nb_cols  number of interior columns (border excluded)
+void submit( queue & q,
+    float const * iu, float const * iv,
+    float * ou, float * ov,
+    float * iud, float * ivd,
+    float * oud, float * ovd,
+    std::size_t nb_rows, std::size_t nb_cols ) {
+    const std::size_t stride { nb_cols+2 };
+    const std::size_t bytes { (nb_rows+2)*stride*sizeof(float) };
+    // Host -> device: both copies are issued before waiting so they may overlap
+    q.memcpy(iud,iu,bytes);
+    q.memcpy(ivd,iv,bytes);
+    q.wait();
+    // Submit command group for execution
+    q.submit([&](handler& h) {
+        // Define the kernel
+        h.parallel_for(range<2>{nb_rows,nb_cols}, [=](item<2> it) {
+            const std::size_t row = it.get_id(0) ;
+            const std::size_t col = it.get_id(1) ;
+            // +1 skips the border row/column of the padded arrays
+            const std::size_t center = (row+1)*stride+col+1;
+            const float u = iud[center];
+            const float v = ivd[center];
+            const float uvv = u*v*v;
+            // Sum of (neighbour - centre) over the 3x3 block around the cell;
+            // the centre itself contributes 0
+            float full_u = 0.0f;
+            float full_v = 0.0f;
+            for(std::size_t k = 0; k < 3; ++k){
+                for(std::size_t l = 0; l < 3; ++l){
+                    full_u += (iud[(row+k)*stride+col+l] - u);
+                    full_v += (ivd[(row+k)*stride+col+l] - v);
+                }
+            }
+            const float du = DIFFUSION_RATE_U*full_u - uvv + FEED_RATE*(1.0f - u);
+            const float dv = DIFFUSION_RATE_V*full_v + uvv - (FEED_RATE + KILL_RATE)*v;
+            oud[center] = u + du*DT;
+            ovd[center] = v + dv*DT;
+        });
+    }).wait();
+    // Device -> host
+    q.memcpy(ou,oud,bytes);
+    q.memcpy(ov,ovd,bytes);
+    q.wait();
+}
+/// Command-line driver of the Gray-Scott variant using explicit device memory.
+///
+/// Usage: main.exe <nb_rows> <nb_cols> <nb_images> <nb_iterations>
+///
+/// nb_iterations must be even: each turn of the inner loop performs two
+/// steps (u1,v1 -> u2,v2 then u2,v2 -> u1,v1), so the final state is always
+/// in u1/v1, which is what gets printed.
+///
+/// @return 0 on success, 1 on invalid arguments or when an exception is caught.
+int main( int argc, char * argv[] ) {
+    // runtime parameters
+    // Checked explicitly rather than with assert so the test survives NDEBUG builds
+    if (argc != 5) {
+        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "main.exe")
+            << " <nb_rows> <nb_cols> <nb_images> <nb_iterations>" << std::endl;
+        return 1;
+    }
+    try {
+        // Parsed inside the try block so that std::stoul failures are reported below
+        const std::size_t nb_rows {std::stoul(argv[1])} ;
+        const std::size_t nb_cols {std::stoul(argv[2])} ;
+        const std::size_t nb_images {std::stoul(argv[3])} ;
+        const std::size_t nb_iterations {std::stoul(argv[4])} ;
+        if (nb_iterations % 2 != 0) {
+            std::cerr << "nb_iterations must be even" << std::endl;
+            return 1;
+        }
+        // Loop through available platforms and devices
+        for (auto const& this_platform : platform::get_platforms() ) {
+            std::cout << "Found platform: "
+                << this_platform.get_info<info::platform::name>() << std::endl;
+            for (auto const& this_device : this_platform.get_devices() ) {
+                std::cout << "  Device: "
+                    << this_device.get_info<info::device::name>() << std::endl;
+            }
+        }
+        // Create SYCL queue
+        queue q;
+        // Running platform and device
+        std::cout << "Running on platform: "
+            << q.get_device().get_platform().get_info<info::platform::name>() << std::endl;
+        std::cout << "  Device: "
+            << q.get_device().get_info<info::device::name>() << std::endl;
+        std::cout << std::endl;
+        // Initialize local arrays: u = 1 and v = 0 everywhere, border included
+        const std::size_t padded_nb_rows { nb_rows+2 };
+        const std::size_t padded_nb_cols { nb_cols+2 };
+        const std::size_t size { padded_nb_rows*padded_nb_cols };
+        std::vector<float> u1(size, 1.f);
+        std::vector<float> v1(size, 0.f);
+        std::vector<float> u2(size, 1.f);
+        std::vector<float> v2(size, 0.f);
+        // Central rectangle (about 7/16 to 9/16 of each dimension) seeded with u = 0, v = 1
+        const std::size_t v_row_begin { (7ul*padded_nb_rows+8ul)/16ul };
+        const std::size_t v_row_end { (9ul*padded_nb_rows+8ul)/16ul };
+        const std::size_t v_col_begin { (7ul*padded_nb_cols+8ul)/16ul };
+        const std::size_t v_col_end { (9ul*padded_nb_cols+8ul)/16ul };
+        std::cout << "v_row_begin: " << v_row_begin << std::endl;
+        std::cout << "v_row_end:   " << v_row_end   << std::endl;
+        std::cout << "v_col_begin: " << v_col_begin << std::endl;
+        std::cout << "v_col_end:   " << v_col_end   << std::endl;
+        std::cout << std::endl;
+        for (std::size_t i = v_row_begin; i < v_row_end; i++) {
+            for (std::size_t j = v_col_begin; j < v_col_end; j++) {
+                u1[i*padded_nb_cols+j] = 0.f;
+                v1[i*padded_nb_cols+j] = 1.f;
+            }
+        }
+        // Create device arrays
+        float * iud  = malloc_device<float>(size, q);
+        float * ivd  = malloc_device<float>(size, q);
+        float * oud  = malloc_device<float>(size, q);
+        float * ovd  = malloc_device<float>(size, q);
+        // The kernel never writes the border cells of oud/ovd, yet submit()
+        // copies the whole buffers back to the host. Give the border its
+        // boundary values once so the copy-back does not overwrite the host
+        // border with uninitialized device memory.
+        q.memcpy(oud, u2.data(), size*sizeof(float)).wait();
+        q.memcpy(ovd, v2.data(), size*sizeof(float)).wait();
+        // iterations
+        for ( std::size_t image = 0 ; image < nb_images ; ++image ) {
+            for ( std::size_t iter = 0 ; iter < nb_iterations ; iter += 2 ) {
+                submit( q, u1.data(), v1.data(), u2.data(), v2.data(), iud, ivd, oud, ovd, nb_rows, nb_cols );
+                submit( q, u2.data(), v2.data(), u1.data(), v1.data(), iud, ivd, oud, ovd, nb_rows, nb_cols );
+            }
+        }
+        // Print some result: up to 10x10 cells around the centre, clamped to the grid
+        const std::size_t row_center { padded_nb_rows/2ul };
+        const std::size_t col_center { padded_nb_cols/2ul };
+        const std::size_t row_begin { row_center > 5ul ? row_center-5ul : std::size_t{0} };
+        const std::size_t col_begin { col_center > 5ul ? col_center-5ul : std::size_t{0} };
+        const std::size_t row_end { row_center+5ul < padded_nb_rows ? row_center+5ul : padded_nb_rows };
+        const std::size_t col_end { col_center+5ul < padded_nb_cols ? col_center+5ul : padded_nb_cols };
+        auto print_center = [&]( std::vector<float> const & field ) {
+            for (std::size_t i = row_begin ; i < row_end; i++) {
+                for (std::size_t j = col_begin; j < col_end; j++) {
+                    std::cout << field[i*padded_nb_cols+j] << " ";
+                }
+                std::cout << "\n";
+            }
+            std::cout << std::endl;
+        };
+        std::cout<<std::fixed<<std::setprecision(2) ;
+        print_center(u1);
+        print_center(v1);
+        // Release device arrays
+        sycl::free(iud,q);
+        sycl::free(ivd,q);
+        sycl::free(oud,q);
+        sycl::free(ovd,q);
+    }
+    catch (sycl::exception const & e) {
+      std::cout << e.what() << std::endl;
+      std::cout << e.code().message() << std::endl;
+      return 1;
+    }
+    catch (std::exception const & e) {
+      std::cout << e.what() << std::endl;
+      return 1;
+    }
+    catch (const char * e) {
+      std::cout << e << std::endl;
+      return 1;
+    }
+    return 0;
+}
diff --git a/GrayScottIterations/CMakeLists.txt b/GrayScottIterations/CMakeLists.txt
new file mode 100755
index 0000000..36103fa
--- /dev/null
+++ b/GrayScottIterations/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Direct CMake to use icpx rather than the default C++ compiler/linker
+cmake_minimum_required (VERSION 3.4)
+project(SyclSquare CXX)
+add_subdirectory (src)
diff --git a/GrayScottIterations/build.bash b/GrayScottIterations/build.bash
new file mode 100755
index 0000000..49344dd
--- /dev/null
+++ b/GrayScottIterations/build.bash
@@ -0,0 +1,3 @@
+cd build
+make all
diff --git a/GrayScottIterations/cmake.bash b/GrayScottIterations/cmake.bash
new file mode 100755
index 0000000..869411d
--- /dev/null
+++ b/GrayScottIterations/cmake.bash
@@ -0,0 +1,5 @@
+rm -rf build
+mkdir -p build
+cd build
+cmake ..
diff --git a/GrayScottIterations/run.bash b/GrayScottIterations/run.bash
new file mode 100755
index 0000000..26a9b8e
--- /dev/null
+++ b/GrayScottIterations/run.bash
@@ -0,0 +1,5 @@
+time ./build/main.exe 270 480 5 10000
+#time ./build/main.exe 540 960 5 10000
+#time ./build/main.exe 1080 1920 5 10000
+#time ./build/main.exe 2160 3840 5 1000
diff --git a/GrayScottIterations/src/CMakeLists.txt b/GrayScottIterations/src/CMakeLists.txt
new file mode 100755
index 0000000..fe04241
--- /dev/null
+++ b/GrayScottIterations/src/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(SOURCE_FILE main.cpp)
+set(TARGET_NAME main.exe)
+set(COMPILE_FLAGS "-fsycl -Wall")
+set(LINK_FLAGS "-fsycl")
+add_executable(${TARGET_NAME} ${SOURCE_FILE})
+set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+#add_custom_target(all DEPENDS ${TARGET_NAME})
diff --git a/GrayScottIterations/src/main.cpp b/GrayScottIterations/src/main.cpp
new file mode 100644
index 0000000..acab7db
--- /dev/null
+++ b/GrayScottIterations/src/main.cpp
@@ -0,0 +1,177 @@
+#include <CL/sycl.hpp>
+#include <array>
+#include <iostream>
+#include <iomanip>
+#include <cmath>
+using namespace cl::sycl;
+constexpr float KILL_RATE { 0.062f }; // added to FEED_RATE in the removal term of v
+constexpr float FEED_RATE { 0.03f }; // scales the (1 - u) replenishment term of u
+constexpr float DT { 1.0f }; // factor applied to du and dv when advancing u and v
+constexpr float DIFFUSION_RATE_U { 0.1f }; // weight of the 3x3 neighbour-difference sum of u
+constexpr float DIFFUSION_RATE_V { 0.05f }; // weight of the 3x3 neighbour-difference sum of v
+/// Runs one Gray-Scott step on the device using explicit USM copies.
+///
+/// Copies the padded host fields `iu`/`iv` into the device buffers
+/// `iud`/`ivd`, launches one work-item per interior cell that writes the
+/// updated values into `oud`/`ovd`, then copies those buffers back into
+/// `ou`/`ov`.
+///
+/// Every array holds (nb_rows+2)*(nb_cols+2) floats stored row by row. The
+/// kernel only writes cells with row in [1, nb_rows] and col in [1, nb_cols],
+/// so the one-cell border of `oud`/`ovd` is copied back to the host as is:
+/// the caller must make sure those border cells hold valid values.
+///
+/// @param q        queue used for the copies and for the kernel
+/// @param iu       host input field u (read only)
+/// @param iv       host input field v (read only)
+/// @param ou       host output field u (entirely overwritten by the copy-back)
+/// @param ov       host output field v (entirely overwritten by the copy-back)
+/// @param iud      device buffer receiving `iu`
+/// @param ivd      device buffer receiving `iv`
+/// @param oud      device buffer receiving the updated u
+/// @param ovd      device buffer receiving the updated v
+/// @param nb_rows  number of interior rows (border excluded)
+/// @param nb_cols  number of interior columns (border excluded)
+void submit( queue & q,
+    float const * iu, float const * iv,
+    float * ou, float * ov,
+    float * iud, float * ivd,
+    float * oud, float * ovd,
+    std::size_t nb_rows, std::size_t nb_cols ) {
+    const std::size_t stride { nb_cols+2 };
+    const std::size_t bytes { (nb_rows+2)*stride*sizeof(float) };
+    // Host -> device: both copies are issued before waiting so they may overlap
+    q.memcpy(iud,iu,bytes);
+    q.memcpy(ivd,iv,bytes);
+    q.wait();
+    // Submit command group for execution
+    q.submit([&](handler& h) {
+        // Define the kernel
+        h.parallel_for(range<2>{nb_rows,nb_cols}, [=](item<2> it) {
+            const std::size_t row = it.get_id(0) ;
+            const std::size_t col = it.get_id(1) ;
+            // +1 skips the border row/column of the padded arrays
+            const std::size_t center = (row+1)*stride+col+1;
+            const float u = iud[center];
+            const float v = ivd[center];
+            const float uvv = u*v*v;
+            // Sum of (neighbour - centre) over the 3x3 block around the cell;
+            // the centre itself contributes 0
+            float full_u = 0.0f;
+            float full_v = 0.0f;
+            for(std::size_t k = 0; k < 3; ++k){
+                for(std::size_t l = 0; l < 3; ++l){
+                    full_u += (iud[(row+k)*stride+col+l] - u);
+                    full_v += (ivd[(row+k)*stride+col+l] - v);
+                }
+            }
+            const float du = DIFFUSION_RATE_U*full_u - uvv + FEED_RATE*(1.0f - u);
+            const float dv = DIFFUSION_RATE_V*full_v + uvv - (FEED_RATE + KILL_RATE)*v;
+            oud[center] = u + du*DT;
+            ovd[center] = v + dv*DT;
+        });
+    }).wait();
+    // Device -> host
+    q.memcpy(ou,oud,bytes);
+    q.memcpy(ov,ovd,bytes);
+    q.wait();
+}
+/// Command-line driver of the Gray-Scott "iterations" variant.
+///
+/// Usage: main.exe <nb_rows> <nb_cols> <nb_images> <nb_iterations>
+///
+/// nb_iterations must be even: each turn of the inner loop performs two
+/// steps (u1,v1 -> u2,v2 then u2,v2 -> u1,v1), so the final state is always
+/// in u1/v1, which is what gets printed.
+///
+/// NOTE(review): this file was committed with the same blob id as
+/// GrayScottDevice/src/main.cpp; the grouping of iterations on the GPU that
+/// the commit subject announces is not visible here - confirm whether it is
+/// still to be implemented.
+///
+/// @return 0 on success, 1 on invalid arguments or when an exception is caught.
+int main( int argc, char * argv[] ) {
+    // runtime parameters
+    // Checked explicitly rather than with assert so the test survives NDEBUG builds
+    if (argc != 5) {
+        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "main.exe")
+            << " <nb_rows> <nb_cols> <nb_images> <nb_iterations>" << std::endl;
+        return 1;
+    }
+    try {
+        // Parsed inside the try block so that std::stoul failures are reported below
+        const std::size_t nb_rows {std::stoul(argv[1])} ;
+        const std::size_t nb_cols {std::stoul(argv[2])} ;
+        const std::size_t nb_images {std::stoul(argv[3])} ;
+        const std::size_t nb_iterations {std::stoul(argv[4])} ;
+        if (nb_iterations % 2 != 0) {
+            std::cerr << "nb_iterations must be even" << std::endl;
+            return 1;
+        }
+        // Loop through available platforms and devices
+        for (auto const& this_platform : platform::get_platforms() ) {
+            std::cout << "Found platform: "
+                << this_platform.get_info<info::platform::name>() << std::endl;
+            for (auto const& this_device : this_platform.get_devices() ) {
+                std::cout << "  Device: "
+                    << this_device.get_info<info::device::name>() << std::endl;
+            }
+        }
+        // Create SYCL queue
+        queue q;
+        // Running platform and device
+        std::cout << "Running on platform: "
+            << q.get_device().get_platform().get_info<info::platform::name>() << std::endl;
+        std::cout << "  Device: "
+            << q.get_device().get_info<info::device::name>() << std::endl;
+        std::cout << std::endl;
+        // Initialize local arrays: u = 1 and v = 0 everywhere, border included
+        const std::size_t padded_nb_rows { nb_rows+2 };
+        const std::size_t padded_nb_cols { nb_cols+2 };
+        const std::size_t size { padded_nb_rows*padded_nb_cols };
+        std::vector<float> u1(size, 1.f);
+        std::vector<float> v1(size, 0.f);
+        std::vector<float> u2(size, 1.f);
+        std::vector<float> v2(size, 0.f);
+        // Central rectangle (about 7/16 to 9/16 of each dimension) seeded with u = 0, v = 1
+        const std::size_t v_row_begin { (7ul*padded_nb_rows+8ul)/16ul };
+        const std::size_t v_row_end { (9ul*padded_nb_rows+8ul)/16ul };
+        const std::size_t v_col_begin { (7ul*padded_nb_cols+8ul)/16ul };
+        const std::size_t v_col_end { (9ul*padded_nb_cols+8ul)/16ul };
+        std::cout << "v_row_begin: " << v_row_begin << std::endl;
+        std::cout << "v_row_end:   " << v_row_end   << std::endl;
+        std::cout << "v_col_begin: " << v_col_begin << std::endl;
+        std::cout << "v_col_end:   " << v_col_end   << std::endl;
+        std::cout << std::endl;
+        for (std::size_t i = v_row_begin; i < v_row_end; i++) {
+            for (std::size_t j = v_col_begin; j < v_col_end; j++) {
+                u1[i*padded_nb_cols+j] = 0.f;
+                v1[i*padded_nb_cols+j] = 1.f;
+            }
+        }
+        // Create device arrays
+        float * iud  = malloc_device<float>(size, q);
+        float * ivd  = malloc_device<float>(size, q);
+        float * oud  = malloc_device<float>(size, q);
+        float * ovd  = malloc_device<float>(size, q);
+        // The kernel never writes the border cells of oud/ovd, yet submit()
+        // copies the whole buffers back to the host. Give the border its
+        // boundary values once so the copy-back does not overwrite the host
+        // border with uninitialized device memory.
+        q.memcpy(oud, u2.data(), size*sizeof(float)).wait();
+        q.memcpy(ovd, v2.data(), size*sizeof(float)).wait();
+        // iterations
+        for ( std::size_t image = 0 ; image < nb_images ; ++image ) {
+            for ( std::size_t iter = 0 ; iter < nb_iterations ; iter += 2 ) {
+                submit( q, u1.data(), v1.data(), u2.data(), v2.data(), iud, ivd, oud, ovd, nb_rows, nb_cols );
+                submit( q, u2.data(), v2.data(), u1.data(), v1.data(), iud, ivd, oud, ovd, nb_rows, nb_cols );
+            }
+        }
+        // Print some result: up to 10x10 cells around the centre, clamped to the grid
+        const std::size_t row_center { padded_nb_rows/2ul };
+        const std::size_t col_center { padded_nb_cols/2ul };
+        const std::size_t row_begin { row_center > 5ul ? row_center-5ul : std::size_t{0} };
+        const std::size_t col_begin { col_center > 5ul ? col_center-5ul : std::size_t{0} };
+        const std::size_t row_end { row_center+5ul < padded_nb_rows ? row_center+5ul : padded_nb_rows };
+        const std::size_t col_end { col_center+5ul < padded_nb_cols ? col_center+5ul : padded_nb_cols };
+        auto print_center = [&]( std::vector<float> const & field ) {
+            for (std::size_t i = row_begin ; i < row_end; i++) {
+                for (std::size_t j = col_begin; j < col_end; j++) {
+                    std::cout << field[i*padded_nb_cols+j] << " ";
+                }
+                std::cout << "\n";
+            }
+            std::cout << std::endl;
+        };
+        std::cout<<std::fixed<<std::setprecision(2) ;
+        print_center(u1);
+        print_center(v1);
+        // Release device arrays
+        sycl::free(iud,q);
+        sycl::free(ivd,q);
+        sycl::free(oud,q);
+        sycl::free(ovd,q);
+    }
+    catch (sycl::exception const & e) {
+      std::cout << e.what() << std::endl;
+      std::cout << e.code().message() << std::endl;
+      return 1;
+    }
+    catch (std::exception const & e) {
+      std::cout << e.what() << std::endl;
+      return 1;
+    }
+    catch (const char * e) {
+      std::cout << e << std::endl;
+      return 1;
+    }
+    return 0;
+}
diff --git a/GrayScottShared/CMakeLists.txt b/GrayScottShared/CMakeLists.txt
new file mode 100755
index 0000000..36103fa
--- /dev/null
+++ b/GrayScottShared/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Direct CMake to use icpx rather than the default C++ compiler/linker
+cmake_minimum_required (VERSION 3.4)
+project(SyclSquare CXX)
+add_subdirectory (src)
diff --git a/GrayScottShared/build.bash b/GrayScottShared/build.bash
new file mode 100755
index 0000000..49344dd
--- /dev/null
+++ b/GrayScottShared/build.bash
@@ -0,0 +1,3 @@
+cd build
+make all
diff --git a/GrayScottShared/cmake.bash b/GrayScottShared/cmake.bash
new file mode 100755
index 0000000..869411d
--- /dev/null
+++ b/GrayScottShared/cmake.bash
@@ -0,0 +1,5 @@
+rm -rf build
+mkdir -p build
+cd build
+cmake ..
diff --git a/GrayScottShared/run.bash b/GrayScottShared/run.bash
new file mode 100755
index 0000000..26a9b8e
--- /dev/null
+++ b/GrayScottShared/run.bash
@@ -0,0 +1,5 @@
+time ./build/main.exe 270 480 5 10000
+#time ./build/main.exe 540 960 5 10000
+#time ./build/main.exe 1080 1920 5 10000
+#time ./build/main.exe 2160 3840 5 1000
diff --git a/GrayScottShared/src/CMakeLists.txt b/GrayScottShared/src/CMakeLists.txt
new file mode 100755
index 0000000..fe04241
--- /dev/null
+++ b/GrayScottShared/src/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(SOURCE_FILE main.cpp)
+set(TARGET_NAME main.exe)
+set(COMPILE_FLAGS "-fsycl -Wall")
+set(LINK_FLAGS "-fsycl")
+add_executable(${TARGET_NAME} ${SOURCE_FILE})
+set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+#add_custom_target(all DEPENDS ${TARGET_NAME})
diff --git a/GrayScottShared/src/main.cpp b/GrayScottShared/src/main.cpp
new file mode 100644
index 0000000..6b21005
--- /dev/null
+++ b/GrayScottShared/src/main.cpp
@@ -0,0 +1,163 @@
+#include <CL/sycl.hpp>
+#include <array>
+#include <iostream>
+#include <iomanip>
+#include <cmath>
+using namespace cl::sycl;
+constexpr float KILL_RATE { 0.062f }; // added to FEED_RATE in the removal term of v
+constexpr float FEED_RATE { 0.03f }; // scales the (1 - u) replenishment term of u
+constexpr float DT { 1.0f }; // factor applied to du and dv when advancing u and v
+constexpr float DIFFUSION_RATE_U { 0.1f }; // weight of the 3x3 neighbour-difference sum of u
+constexpr float DIFFUSION_RATE_V { 0.05f }; // weight of the 3x3 neighbour-difference sum of v
+/// Runs one Gray-Scott step directly on the arrays it is given.
+///
+/// Launches one work-item per interior cell; each reads the 3x3 block around
+/// its cell in `iu`/`iv` and writes the updated values to the same cell of
+/// `ou`/`ov`. The function returns once the kernel has completed.
+///
+/// Every array holds (nb_rows+2)*(nb_cols+2) floats stored row by row and is
+/// dereferenced inside the kernel, so it must be accessible from the device
+/// (main allocates them with malloc_shared). Only cells with row in
+/// [1, nb_rows] and col in [1, nb_cols] are written; the one-cell border of
+/// `ou`/`ov` is left untouched.
+///
+/// @param q        queue the kernel is submitted to
+/// @param iu       input field u (read only)
+/// @param iv       input field v (read only)
+/// @param ou       output field u
+/// @param ov       output field v
+/// @param nb_rows  number of interior rows (border excluded)
+/// @param nb_cols  number of interior columns (border excluded)
+void submit( queue & q,
+    float const * iu, float const * iv,
+    float * ou, float * ov,
+    std::size_t nb_rows, std::size_t nb_cols ) {
+    const std::size_t stride { nb_cols+2 };
+    // Submit command group for execution
+    q.submit([&](handler& h) {
+        // Define the kernel
+        h.parallel_for(range<2>{nb_rows,nb_cols}, [=](item<2> it) {
+            const std::size_t row = it.get_id(0) ;
+            const std::size_t col = it.get_id(1) ;
+            // +1 skips the border row/column of the padded arrays
+            const std::size_t center = (row+1)*stride+col+1;
+            const float u = iu[center];
+            const float v = iv[center];
+            const float uvv = u*v*v;
+            // Sum of (neighbour - centre) over the 3x3 block around the cell;
+            // the centre itself contributes 0
+            float full_u = 0.0f;
+            float full_v = 0.0f;
+            for(std::size_t k = 0; k < 3; ++k){
+                for(std::size_t l = 0; l < 3; ++l){
+                    full_u += (iu[(row+k)*stride+col+l] - u);
+                    full_v += (iv[(row+k)*stride+col+l] - v);
+                }
+            }
+            const float du = DIFFUSION_RATE_U*full_u - uvv + FEED_RATE*(1.0f - u);
+            const float dv = DIFFUSION_RATE_V*full_v + uvv - (FEED_RATE + KILL_RATE)*v;
+            ou[center] = u + du*DT;
+            ov[center] = v + dv*DT;
+        });
+    });
+    // Wait for the command group to finish
+    q.wait();
+}
+/// Command-line driver of the Gray-Scott variant using shared USM allocations.
+///
+/// Usage: main.exe <nb_rows> <nb_cols> <nb_images> <nb_iterations>
+///
+/// nb_iterations must be even: each turn of the inner loop performs two
+/// steps (u1,v1 -> u2,v2 then u2,v2 -> u1,v1), so the final state is always
+/// in u1/v1, which is what gets printed.
+///
+/// @return 0 on success, 1 on invalid arguments or when an exception is caught.
+int main( int argc, char * argv[] ) {
+    // runtime parameters
+    // Checked explicitly rather than with assert so the test survives NDEBUG builds
+    if (argc != 5) {
+        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "main.exe")
+            << " <nb_rows> <nb_cols> <nb_images> <nb_iterations>" << std::endl;
+        return 1;
+    }
+    try {
+        // Parsed inside the try block so that std::stoul failures are reported below
+        const std::size_t nb_rows {std::stoul(argv[1])} ;
+        const std::size_t nb_cols {std::stoul(argv[2])} ;
+        const std::size_t nb_images {std::stoul(argv[3])} ;
+        const std::size_t nb_iterations {std::stoul(argv[4])} ;
+        if (nb_iterations % 2 != 0) {
+            std::cerr << "nb_iterations must be even" << std::endl;
+            return 1;
+        }
+        // Loop through available platforms and devices
+        for (auto const& this_platform : platform::get_platforms() ) {
+            std::cout << "Found platform: "
+                << this_platform.get_info<info::platform::name>() << std::endl;
+            for (auto const& this_device : this_platform.get_devices() ) {
+                std::cout << "  Device: "
+                    << this_device.get_info<info::device::name>() << std::endl;
+            }
+        }
+        // Create SYCL queue
+        queue q;
+        // Running platform and device
+        std::cout << "Running on platform: "
+            << q.get_device().get_platform().get_info<info::platform::name>() << std::endl;
+        std::cout << "  Device: "
+            << q.get_device().get_info<info::device::name>() << std::endl;
+        std::cout << std::endl;
+        // Initialize input array: u = 1 and v = 0 everywhere, border included
+        const std::size_t padded_nb_rows { nb_rows+2 };
+        const std::size_t padded_nb_cols { nb_cols+2 };
+        const std::size_t size { padded_nb_rows*padded_nb_cols };
+        float * u1  = malloc_shared<float>(size, q);
+        float * u2  = malloc_shared<float>(size, q);
+        float * v1  = malloc_shared<float>(size, q);
+        float * v2  = malloc_shared<float>(size, q);
+        for (std::size_t idx = 0; idx < size; idx++) {
+            u1[idx] = 1.f;
+            v1[idx] = 0.f;
+            u2[idx] = 1.f;
+            v2[idx] = 0.f;
+        }
+        // Central rectangle (about 7/16 to 9/16 of each dimension) seeded with u = 0, v = 1
+        const std::size_t v_row_begin { (7ul*padded_nb_rows+8ul)/16ul };
+        const std::size_t v_row_end { (9ul*padded_nb_rows+8ul)/16ul };
+        const std::size_t v_col_begin { (7ul*padded_nb_cols+8ul)/16ul };
+        const std::size_t v_col_end { (9ul*padded_nb_cols+8ul)/16ul };
+        std::cout << "v_row_begin: " << v_row_begin << std::endl;
+        std::cout << "v_row_end:   " << v_row_end   << std::endl;
+        std::cout << "v_col_begin: " << v_col_begin << std::endl;
+        std::cout << "v_col_end:   " << v_col_end   << std::endl;
+        std::cout << std::endl;
+        for (std::size_t i = v_row_begin; i < v_row_end; i++) {
+            for (std::size_t j = v_col_begin; j < v_col_end; j++) {
+                u1[i*padded_nb_cols+j] = 0.f;
+                v1[i*padded_nb_cols+j] = 1.f;
+            }
+        }
+        // iterations
+        for ( std::size_t image = 0 ; image < nb_images ; ++image ) {
+            for ( std::size_t iter = 0 ; iter < nb_iterations ; iter += 2 ) {
+                submit( q, u1, v1, u2, v2, nb_rows, nb_cols );
+                submit( q, u2, v2, u1, v1, nb_rows, nb_cols );
+            }
+        }
+        // Print some result: up to 10x10 cells around the centre, clamped to the grid
+        const std::size_t row_center { padded_nb_rows/2ul };
+        const std::size_t col_center { padded_nb_cols/2ul };
+        const std::size_t row_begin { row_center > 5ul ? row_center-5ul : std::size_t{0} };
+        const std::size_t col_begin { col_center > 5ul ? col_center-5ul : std::size_t{0} };
+        const std::size_t row_end { row_center+5ul < padded_nb_rows ? row_center+5ul : padded_nb_rows };
+        const std::size_t col_end { col_center+5ul < padded_nb_cols ? col_center+5ul : padded_nb_cols };
+        auto print_center = [&]( float const * field ) {
+            for (std::size_t i = row_begin ; i < row_end; i++) {
+                for (std::size_t j = col_begin; j < col_end; j++) {
+                    std::cout << field[i*padded_nb_cols+j] << " ";
+                }
+                std::cout << "\n";
+            }
+            std::cout << std::endl;
+        };
+        std::cout<<std::fixed<<std::setprecision(2) ;
+        print_center(u1);
+        print_center(v1);
+        // Release shared arrays
+        sycl::free(u1,q);
+        sycl::free(u2,q);
+        sycl::free(v1,q);
+        sycl::free(v2,q);
+    }
+    catch (sycl::exception const & e) {
+      std::cout << e.what() << std::endl;
+      std::cout << e.code().message() << std::endl;
+      return 1;
+    }
+    catch (std::exception const & e) {
+      std::cout << e.what() << std::endl;
+      return 1;
+    }
+    catch (const char * e) {
+      std::cout << e << std::endl;
+      return 1;
+    }
+    return 0;
+}
diff --git a/SquareDevice/src/main.cpp b/SquareDevice/src/main.cpp
index 4e4ccee..7f1beae 100644
--- a/SquareDevice/src/main.cpp
+++ b/SquareDevice/src/main.cpp
@@ -24,21 +24,22 @@ int main() {
               << q.get_device().get_info<info::device::name>() << "\n";
     std::cout << "\n";
-    std::array<float,SIZE> input, output ;
-    float * binput  = sycl::malloc_device<float>(SIZE, q);
-    float * boutput = sycl::malloc_device<float>(SIZE, q);
     // Initialize input array
+    std::array<float,SIZE> input, output ;
     for (std::size_t i = 0; i < SIZE; i++) {
         input[i] = i + 1;
+    // Alloc memory on device
+    float * dinput  = malloc_device<float>(SIZE, q);
+    float * doutput = malloc_device<float>(SIZE, q);
     // Submit command group for execution
-    q.memcpy(binput,input.data(),SIZE*sizeof(float)).wait();
+    // Host -> device copy of the input array
+    q.memcpy(dinput,input.data(),SIZE*sizeof(float)).wait();
     q.parallel_for(SIZE, [=](id<1> idx) {
-            boutput[idx] = binput[idx] * binput[idx];
+            // The kernel works on the device allocations declared above
+            doutput[idx] = dinput[idx] * dinput[idx];
-    q.memcpy(output.data(),boutput,SIZE*sizeof(float)).wait();
+    // Device -> host copy of the squared values
+    q.memcpy(output.data(),doutput,SIZE*sizeof(float)).wait();
     // Print the result
     for (int i = 0; i < SIZE; i++) {
@@ -47,8 +48,8 @@ int main() {
     std::cout << std::endl;
     // Release resources
-    sycl::free(binput, q);
-    sycl::free(boutput, q);
+    sycl::free(dinput, q);
+    sycl::free(doutput, q);
     return 0;
diff --git a/SquareShared/src/main.cpp b/SquareShared/src/main.cpp
index ff77c3d..a4882ae 100644
--- a/SquareShared/src/main.cpp
+++ b/SquareShared/src/main.cpp
@@ -24,8 +24,8 @@ int main() {
               << q.get_device().get_info<info::device::name>() << "\n";
     std::cout << "\n";
-    auto * input  = sycl::malloc_shared<float>(SIZE, q);
-    auto * output = sycl::malloc_shared<float>(SIZE, q);
+    auto * input  = malloc_shared<float>(SIZE, q);
+    auto * output = malloc_shared<float>(SIZE, q);
     // Initialize input array
     for (std::size_t i = 0; i < SIZE; i++) {