Commit ee30d2f9 authored by Pierre Aubert
Browse files

Add benchmark for intrinsics version

parent 8b6dbe19
project(Phoenix) project(Phoenix)
cmake_minimum_required(VERSION 2.8) cmake_minimum_required(VERSION 2.8)
set(CONFIG_GRAYSCOTT "10000, 20000, 30000, 40000, 50000, 60000, 80000, 100000") set(CONFIG_GRAYSCOTT "5, 7, 10, 20, 30, 40, 50, 60")
set(EXTRA_DEPENDENCIES gray_scott_data_format tensor_alloc data_stream TBB::tbb) set(EXTRA_DEPENDENCIES gray_scott_data_format tensor_alloc data_stream TBB::tbb)
...@@ -25,3 +25,15 @@ phoenix_plotPerf("grayscott_seqBase" perf_grayscott_seq_O0 perf_grayscott_seq_O1 ...@@ -25,3 +25,15 @@ phoenix_plotPerf("grayscott_seqBase" perf_grayscott_seq_O0 perf_grayscott_seq_O1
phoenix_plotPerf("grayscott_seqVectorize" perf_grayscott_seq_O3 perf_grayscott_seq_vectorize_O1 perf_grayscott_seq_vectorize_O2 perf_grayscott_seq_vectorize_O3 perf_grayscott_seq_vectorize_Ofast) phoenix_plotPerf("grayscott_seqVectorize" perf_grayscott_seq_O3 perf_grayscott_seq_vectorize_O1 perf_grayscott_seq_vectorize_O2 perf_grayscott_seq_vectorize_O3 perf_grayscott_seq_vectorize_Ofast)
# Sources shared by every intrinsics benchmark executable: the hand-written
# intrinsics kernel plus its dedicated benchmark driver.
set(progIntrinsicsSrc
	${CMAKE_CURRENT_SOURCE_DIR}/../Intrinsics/intrinsics_propagation.cpp
	main_intrinsics.cpp)

# One phoenix_compileAndRunExample call per optimisation level. The target
# suffix and the leading compiler flag are both derived from the level name,
# so the four calls only differ by that single token.
foreach(optLevel O1 O2 O3 Ofast)
	phoenix_compileAndRunExample(perf_grayscott_seq_intrinsics_${optLevel}
		"-${optLevel} -march=native -mtune=native -mavx2"
		"${CONFIG_GRAYSCOTT}"
		${progIntrinsicsSrc})
endforeach()

# Plot the intrinsics targets together with the slowest and fastest
# auto-vectorized targets.
phoenix_plotPerf("grayscott_seqIntrinsics"
	perf_grayscott_seq_vectorize_O1
	perf_grayscott_seq_vectorize_Ofast
	perf_grayscott_seq_intrinsics_O1
	perf_grayscott_seq_intrinsics_O2
	perf_grayscott_seq_intrinsics_O3
	perf_grayscott_seq_intrinsics_Ofast)
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
Licence : CeCILL-C Licence : CeCILL-C
****************************************/ ****************************************/
#include <cmath>
#include "micro_benchmark.h" #include "micro_benchmark.h"
#include "temporary_alloc.h" #include "temporary_alloc.h"
...@@ -13,10 +12,9 @@ ...@@ -13,10 +12,9 @@
///Get the number of nanoseconds per elements ///Get the number of nanoseconds per elements
/** @param nbElement : number of elements of the tables /** @param nbElement : number of elements of the tables
*/ */
void evaluateSaxpy(size_t nbElement){ void evaluateGrayScott(size_t nbElement){
size_t nbRow(nbElement*PLIB_VECTOR_SIZE_FLOAT/2lu);
size_t nbRow(std::sqrt(nbElement)); size_t nbCol(nbElement*PLIB_VECTOR_SIZE_FLOAT);
size_t nbCol(nbElement/nbRow);
nbElement = nbRow*nbCol; nbElement = nbRow*nbCol;
...@@ -31,14 +29,14 @@ void evaluateSaxpy(size_t nbElement){ ...@@ -31,14 +29,14 @@ void evaluateSaxpy(size_t nbElement){
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f}; 1.0f, 1.0f, 1.0f};
micro_benchmarkAutoNsPrint("evaluate GrayScott reaction", nbElement, grayscott_propagation, micro_benchmarkAutoNsPrint("evaluate GrayScott reaction, scalar", nbElement, grayscott_propagation,
tmpU2, tmpV2, tmpU1, tmpV1, nbRow, nbCol, tmpU2, tmpV2, tmpU1, tmpV1, nbRow, nbCol,
matDeltaSquare, nbStencilRow, nbStencilCol, matDeltaSquare, nbStencilRow, nbStencilCol,
diffudionRateU, diffusionRateV, feedRate, killRate, dt); diffudionRateU, diffusionRateV, feedRate, killRate, dt);
} }
int main(int argc, char** argv){ int main(int argc, char** argv){
return micro_benchmarkParseArg(argc, argv, evaluateSaxpy); return micro_benchmarkParseArg(argc, argv, evaluateGrayScott);
} }
/***************************************
Auteur : Pierre Aubert
Mail : aubertp7@gmail.com
Licence : CeCILL-C
****************************************/
#include "micro_benchmark.h"
#include "temporary_alloc.h"
#include "phoenix_intrinsics.h"
#include "intrinsics_propagation.h"
///Benchmark the intrinsics Gray-Scott propagation and print the time per element
/**	@param nbElement : size parameter of the benchmark; the tested image has
 * 		nbElement*PLIB_VECTOR_SIZE_FLOAT/2 rows and nbElement*PLIB_VECTOR_SIZE_FLOAT columns
*/
void evaluateGrayScott(size_t nbElement){
	size_t nbRow(nbElement*PLIB_VECTOR_SIZE_FLOAT/2lu);
	size_t nbCol(nbElement*PLIB_VECTOR_SIZE_FLOAT);
	//The benchmark is normalised by the real number of processed cells
	nbElement = nbRow*nbCol;

	//Scalar temporaries: they only serve as the source of the vectorial layout below
	PTensor<float> tmpInU, tmpInV, tmpOutU, tmpOutV;
	float *tmpU1 = NULL, *tmpU2 = NULL, *tmpV1 = NULL, *tmpV2 = NULL;
	allocate_temporary(tmpU1, tmpU2, tmpV1, tmpV2, tmpInU, tmpInV, tmpOutU, tmpOutV, nbRow, nbCol);

	float diffusionRateU(0.1f), diffusionRateV(0.05f);
	float killRate(0.054f), feedRate(0.014f), dt(1.0f);

	long nbStencilRow(3l), nbStencilCol(3l);
	float matDeltaSquare[] = {	1.0f, 1.0f, 1.0f,
					1.0f, 1.0f, 1.0f,
					1.0f, 1.0f, 1.0f};

	//Let's convert these temporaries into intrinsics temporaries
	//(presumably a vector-neighbour layout, judging by the helper name -- confirm in PTensor)
	PTensor<float> tmpVecInU(AllocMode::ALIGNED), tmpVecInV(AllocMode::ALIGNED), tmpVecOutU(AllocMode::ALIGNED), tmpVecOutV(AllocMode::ALIGNED);
	tmpVecInU.fromScalToVecNeigbhour(tmpInU, PLIB_VECTOR_SIZE_FLOAT);
	tmpVecInV.fromScalToVecNeigbhour(tmpInV, PLIB_VECTOR_SIZE_FLOAT);
	tmpVecOutU.fromScalToVecNeigbhour(tmpOutU, PLIB_VECTOR_SIZE_FLOAT);
	tmpVecOutV.fromScalToVecNeigbhour(tmpOutV, PLIB_VECTOR_SIZE_FLOAT);

	//Stencil weights, one vector-wide slot per stencil coefficient
	PTensor<float> vecMatDeltaSquare(AllocMode::ALIGNED, nbStencilRow, nbStencilCol*PLIB_VECTOR_SIZE_FLOAT);
	reshuffle_broadcastTensor(vecMatDeltaSquare.getData(), matDeltaSquare, nbStencilRow, nbStencilCol, 0lu, PLIB_VECTOR_SIZE_FLOAT);

	//Pointer naming follows the call below: *1 is an input, *2 is an output.
	//Each pointer must address the tensor of its own species, otherwise the kernel
	//reads V from the U output buffer and writes U into the V input buffer.
	tmpU1 = tmpVecInU.getData();
	tmpU2 = tmpVecOutU.getData();
	tmpV1 = tmpVecInV.getData();
	tmpV2 = tmpVecOutV.getData();
	float * ptrVecMatStencil = vecMatDeltaSquare.getData();

	//Argument order: outU, outV, inU, inV, nbRow, nbCol, padding, stencil...
	micro_benchmarkAutoNsPrint("evaluate GrayScott reaction, intrinsics", nbElement, grayscott_propagation,
					tmpU2, tmpV2, tmpU1, tmpV1, nbRow, nbCol, 0l,
					ptrVecMatStencil, nbStencilRow, nbStencilCol,
					diffusionRateU, diffusionRateV, feedRate, killRate, dt);
}
///Entry point of the intrinsics Gray-Scott benchmark
/**	@param argc : number of arguments passed to the program
 * 	@param argv : table of arguments passed to the program
 * 	@return value returned by micro_benchmarkParseArg
*/
int main(int argc, char** argv){
	//The command line is forwarded unchanged, with evaluateGrayScott as the callback
	const int exitStatus = micro_benchmarkParseArg(argc, argv, evaluateGrayScott);
	return exitStatus;
}
...@@ -13,9 +13,9 @@ ...@@ -13,9 +13,9 @@
///Get the number of nanoseconds per elements ///Get the number of nanoseconds per elements
/** @param nbElement : number of elements of the tables /** @param nbElement : number of elements of the tables
*/ */
void evaluateSaxpy(size_t nbElement){ void evaluateGrayScott(size_t nbElement){
size_t nbRow(std::sqrt(nbElement)); size_t nbRow(nbElement*PLIB_VECTOR_SIZE_FLOAT/2lu);
size_t nbCol(nbElement/nbRow); size_t nbCol(nbElement*PLIB_VECTOR_SIZE_FLOAT);
nbElement = nbRow*nbCol; nbElement = nbRow*nbCol;
...@@ -30,14 +30,14 @@ void evaluateSaxpy(size_t nbElement){ ...@@ -30,14 +30,14 @@ void evaluateSaxpy(size_t nbElement){
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f}; 1.0f, 1.0f, 1.0f};
micro_benchmarkAutoNsPrint("evaluate GrayScott reaction", nbElement, grayscott_propagation, micro_benchmarkAutoNsPrint("evaluate GrayScott reaction, vectorized", nbElement, grayscott_propagation,
tmpU2, tmpV2, tmpU1, tmpV1, nbRow, nbCol, tmpU2, tmpV2, tmpU1, tmpV1, nbRow, nbCol,
matDeltaSquare, nbStencilRow, nbStencilCol, matDeltaSquare, nbStencilRow, nbStencilCol,
diffudionRateU, diffusionRateV, feedRate, killRate, dt); diffudionRateU, diffusionRateV, feedRate, killRate, dt);
} }
int main(int argc, char** argv){ int main(int argc, char** argv){
return micro_benchmarkParseArg(argc, argv, evaluateSaxpy); return micro_benchmarkParseArg(argc, argv, evaluateGrayScott);
} }
...@@ -3,10 +3,12 @@ cmake_minimum_required(VERSION 2.8) ...@@ -3,10 +3,12 @@ cmake_minimum_required(VERSION 2.8)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/Naive include_directories(${CMAKE_CURRENT_SOURCE_DIR}/Naive
${CMAKE_CURRENT_SOURCE_DIR}/Vectorized ${CMAKE_CURRENT_SOURCE_DIR}/Vectorized
${CMAKE_CURRENT_SOURCE_DIR}/Intrinsics
${CMAKE_CURRENT_SOURCE_DIR}/GrayScottDataFormat) ${CMAKE_CURRENT_SOURCE_DIR}/GrayScottDataFormat)
add_subdirectory(Naive) add_subdirectory(Naive)
add_subdirectory(Vectorized) add_subdirectory(Vectorized)
add_subdirectory(Intrinsics)
add_subdirectory(GrayScottDataFormat) add_subdirectory(GrayScottDataFormat)
add_subdirectory(Program) add_subdirectory(Program)
add_subdirectory(GrayScott2Pic) add_subdirectory(GrayScott2Pic)
......
...@@ -32,7 +32,7 @@ void allocate_temporary(float *& tmpU1, float *& tmpU2, float *& tmpV1, float *& ...@@ -32,7 +32,7 @@ void allocate_temporary(float *& tmpU1, float *& tmpU2, float *& tmpV1, float *&
tmpInV.fill(0.0f); tmpInV.fill(0.0f);
tmpOutV.fill(0.0f); tmpOutV.fill(0.0f);
size_t frac(9lu), numBegin(4lu), numEnd(5lu), rowShift(-25lu); size_t frac(9lu), numBegin(4lu), numEnd(5lu), rowShift(0lu);
for(size_t i(rowShift + (numBegin*nbRow)/frac); i < rowShift + (numEnd*nbRow)/frac; ++i){ for(size_t i(rowShift + (numBegin*nbRow)/frac); i < rowShift + (numEnd*nbRow)/frac; ++i){
for(size_t j((numBegin*nbCol)/frac); j < (numEnd*nbCol)/frac; ++j){ for(size_t j((numBegin*nbCol)/frac); j < (numEnd*nbCol)/frac; ++j){
tmpInU.setValue(i, j, 0.0f); tmpInU.setValue(i, j, 0.0f);
......
...@@ -5,12 +5,13 @@ ...@@ -5,12 +5,13 @@
****************************************/ ****************************************/
#include "phoenix_intrinsics.h" #include "phoenix_intrinsics.h"
#include <algorithm> #include <algorithm>
#include "intrinsics_propagation.h" #include "intrinsics_propagation.h"
///Propagate the U and V species in the matVecVecU and matVecV ///Propagate the U and V species in the matVecVecU and matVecV
/** @param[out] outMatVecVecU : updated matrix U version (with vectorial neighbours) /** @param[out] outMatVecU : updated matrix U version (with vectorial neighbours)
* @param[out] outMatVecV : updated matrix V version (with vectorial neighbours) * @param[out] outMatVecV : updated matrix V version (with vectorial neighbours)
* @param matVecVecU : input of matrix U (with vectorial neighbours) * @param matVecVecU : input of matrix U (with vectorial neighbours)
* @param matVecV : input of matrix V (with vectorial neighbours) * @param matVecV : input of matrix V (with vectorial neighbours)
...@@ -20,48 +21,94 @@ ...@@ -20,48 +21,94 @@
* @param matBroadcastDeltaSquare : matrix of the delta square values (with broadcast neighbours) * @param matBroadcastDeltaSquare : matrix of the delta square values (with broadcast neighbours)
* @param nbStencilRow : number of rows of the matrix matBroadcastDeltaSquare * @param nbStencilRow : number of rows of the matrix matBroadcastDeltaSquare
* @param nbStencilCol : number of columns of the matrix matBroadcastDeltaSquare * @param nbStencilCol : number of columns of the matrix matBroadcastDeltaSquare
* @param diffudionRateU : diffusion rate of the U specie * @param diffusionRateU : diffusion rate of the U specie
* @param diffudionRateV : diffusion rate of the V specie * @param diffudionRateV : diffusion rate of the V specie
* @param feedRate : rate of the process which feeds U and drains U, V and P * @param feedRate : rate of the process which feeds U and drains U, V and P
* @param killRate : rate of the process which converts V into P * @param killRate : rate of the process which converts V into P
* @param dt : time interval between two steps * @param dt : time interval between two steps
*/ */
void grayscott_propagation(float * outMatVecVecU, float * outMatVecV, const float * matVecVecU, const float * matVecV, long nbRow, long nbCol, long padding, void grayscott_propagation(float * outMatVecU, float * outMatVecV, const float * matVecVecU, const float * matVecVecV, long nbRow, long nbCol, long padding,
const float * matBroadcastDeltaSquare, long nbStencilRow, long nbStencilCol, const float * matBroadcastDeltaSquare, long nbStencilRow, long nbStencilCol,
float diffudionRateU, float diffusionRateV, float feedRate, float killRate, float dt) float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt)
{ {
long offsetStencilRow((nbStencilRow - 1l)/2l); long offsetStencilRow((nbStencilRow - 1l)/2l);
long offsetStencilCol((nbStencilCol - 1l)/2l); long offsetStencilCol((nbStencilCol - 1l)/2l);
size_t nbVecCol(nbCol/PLIB_VECTOR_SIZE_FLOAT); long nbVecCol(nbCol/PLIB_VECTOR_SIZE_FLOAT);
PRegVecf vecOne(plib_broadcast_ss(1.0f));
PRegVecf vecFeedRate(plib_broadcast_ss(feedRate));
PRegVecf vecKillRate(plib_broadcast_ss(killRate));
PRegVecf vecDiffudionRateU(plib_broadcast_ss(diffusionRateU));
PRegVecf vecDiffudionRateV(plib_broadcast_ss(diffusionRateV));
PRegVecf vecDt(plib_broadcast_ss(dt));
for(long i(0l); i < nbRow; ++i){ for(long i(0l); i < nbRow; ++i){
long firstRowStencil(std::max(i - offsetStencilRow, 0l)); long firstRowStencil(std::max(i - offsetStencilRow, 0l));
long lastRowStencil(std::min(i + offsetStencilRow + 1l, nbRow)); long lastRowStencil(std::min(i + offsetStencilRow + 1l, nbRow));
for(long j(0l); j < nbCol; ++j){ for(long j(0l); j < nbVecCol; ++j){
long firstColStencil(std::max(j - offsetStencilCol, 0l)); long firstColStencil(std::max(j - offsetStencilCol, 0l));
long lastColStencil(std::min(j + offsetStencilCol + 1l, nbCol)); long lastColStencil(std::min(j + offsetStencilCol + 1l, nbVecCol));
long stencilIndexRow(0l); long stencilIndexRow(0l);
float u(matVecVecU[i*nbCol + j]), v(matVecV[i*nbCol + j]);
float fullU(0.0f), fullV(0.0f); PRegVecf vecU(plib_load_ps(matVecVecU + (i*nbVecCol + j)*PLIB_VECTOR_SIZE_FLOAT));
PRegVecf vecV(plib_load_ps(matVecVecV + (i*nbVecCol + j)*PLIB_VECTOR_SIZE_FLOAT));
PRegVecf vecFullU(plib_broadcast_ss(0.0f)), vecFullV(plib_broadcast_ss(0.0f));
for(long k(firstRowStencil); k < lastRowStencil; ++k){ for(long k(firstRowStencil); k < lastRowStencil; ++k){
long stencilIndexCol(0l); long stencilIndexCol(0l);
for(long l(firstColStencil); l < lastColStencil; ++l){ for(long l(firstColStencil); l < lastColStencil; ++l){
float deltaSquare(matBroadcastDeltaSquare[stencilIndexRow*nbStencilCol + stencilIndexCol]); PRegVecf vecDeltaSquare(plib_load_ps(matBroadcastDeltaSquare +
fullU += (matVecVecU[k*nbCol + l] - u)*deltaSquare; (stencilIndexRow*nbStencilCol + stencilIndexCol)*PLIB_VECTOR_SIZE_FLOAT));
fullV += (matVecV[k*nbCol + l] - v)*deltaSquare;
PRegVecf vecKLU(plib_load_ps(matVecVecU + (k*nbVecCol + l)*PLIB_VECTOR_SIZE_FLOAT));
PRegVecf vecKLV(plib_load_ps(matVecVecV + (k*nbVecCol + l)*PLIB_VECTOR_SIZE_FLOAT));
PRegVecf vecKLUminU(plib_sub_ps(vecKLU, vecU));
PRegVecf vecKLVminV(plib_sub_ps(vecKLV, vecV));
PRegVecf vecKLUminUdMultDeltaSquare(plib_sub_ps(vecKLUminU, vecDeltaSquare));
PRegVecf vecKLVminVdMultDeltaSquare(plib_sub_ps(vecKLVminV, vecDeltaSquare));
vecFullU = plib_add_ps(vecFullU, vecKLUminUdMultDeltaSquare);
vecFullV = plib_add_ps(vecFullV, vecKLVminVdMultDeltaSquare);
++stencilIndexCol; ++stencilIndexCol;
} }
++stencilIndexRow; ++stencilIndexRow;
} }
float uvSquare(u*v*v); PRegVecf vecUVSquare(plib_mul_ps(vecU, plib_mul_ps(vecV, vecV)));
float du(diffudionRateU*fullU - uvSquare + feedRate*(1.0f - u));
float dv(diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v); PRegVecf vecOneMinusU(plib_sub_ps(vecOne, vecU));
PRegVecf vecFeedPlusKill(plib_add_ps(vecFeedRate, vecKillRate));
PRegVecf vecDiffFullU(plib_mul_ps(vecDiffudionRateU, vecFullU));
PRegVecf vecDiffFullV(plib_mul_ps(vecDiffudionRateV, vecFullV));
PRegVecf vecFeedRateMultOneMinusU(plib_mul_ps(vecFeedRate, vecOneMinusU));
PRegVecf vecFeedPlusKillMultV(plib_mul_ps(vecFeedPlusKill, vecV));
PRegVecf vecDiffFullUMinusUVSquare(plib_sub_ps(vecDiffFullU, vecUVSquare));
PRegVecf vecDiffFullVPlusUVSquare(plib_add_ps(vecDiffFullV, vecUVSquare));
PRegVecf vecDu(plib_add_ps(vecDiffFullUMinusUVSquare, vecFeedRateMultOneMinusU));
PRegVecf vecDv(plib_sub_ps(vecDiffFullVPlusUVSquare, vecFeedPlusKillMultV));
PRegVecf vecDuDt(plib_mul_ps(vecDu, vecDt));
PRegVecf vecDvDt(plib_mul_ps(vecDv, vecDt));
PRegVecf vecUPlusDuDt(plib_add_ps(vecU, vecDuDt));
PRegVecf vecVPluDvDt(plib_add_ps(vecV, vecDvDt));
plib_store_ps(outMatVecU + (i*nbVecCol + j)*PLIB_VECTOR_SIZE_FLOAT, vecUPlusDuDt);
plib_store_ps(outMatVecV + (i*nbVecCol + j)*PLIB_VECTOR_SIZE_FLOAT, vecVPluDvDt);
// float uvSquare(u*v*v);
// float du(diffusionRateU*fullU - uvSquare + feedRate*(1.0f - u));
// float dv(diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v);
outMatVecVecU[i*nbCol + j] = u + du*dt; // outMatVecU[i*nbCol + j] = u + du*dt;
outMatVecV[i*nbCol + j] = v + dv*dt; // outMatVecV[i*nbCol + j] = v + dv*dt;
} }
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment