Commit 741b2034 authored by Pierre Aubert's avatar Pierre Aubert
Browse files

C++20 not sufficient for small tables ~2000 elements clearly faster...

C++20 alone is not sufficient for small tables (~2000 elements): the computation is clearly faster with explicit vectorisation hints such as __builtin_assume_aligned and __restrict__
parent de286542
......@@ -15,6 +15,12 @@ phoenix_compileAndRunExample(perf_hadamard_seq_vectorize_Ofast "-Ofast -ftree-ve
# Plot named "hadamard_seqBase": lists the sequential targets built from -O0 up to -Ofast.
phoenix_plotPerf("hadamard_seqBase" perf_hadamard_seq_O0 perf_hadamard_seq_O1 perf_hadamard_seq_O2 perf_hadamard_seq_O3 perf_hadamard_seq_Ofast)
# NOTE(review): "hadamard_seqVectorize" is declared here and again further down with two
# extra targets; this looks like the old/new pair of a diff - confirm only one call is kept.
phoenix_plotPerf("hadamard_seqVectorize" perf_hadamard_seq_O3 perf_hadamard_seq_vectorize_O3 perf_hadamard_seq_vectorize_Ofast)
# Sources of the variant that carries explicit vectorisation hints.
set(progVectorizedSrc hadamard_product_vectorized.cpp main_vectorized.cpp)
# Two targets built from those sources, differing only by -O3 versus -Ofast;
# both also pass -ftree-vectorize -march=native -mtune=native -mavx2.
phoenix_compileAndRunExample(perf_hadamard_seq_explicit_vectorize_O3 "-O3 -ftree-vectorize -march=native -mtune=native -mavx2" "${CONFIG_HADAMARD}" ${progVectorizedSrc})
phoenix_compileAndRunExample(perf_hadamard_seq_explicit_vectorize_Ofast "-Ofast -ftree-vectorize -march=native -mtune=native -mavx2" "${CONFIG_HADAMARD}" ${progVectorizedSrc})
# Same plot name as above, now also listing the two explicit-vectorisation targets.
phoenix_plotPerf("hadamard_seqVectorize" perf_hadamard_seq_O3 perf_hadamard_seq_vectorize_O3 perf_hadamard_seq_vectorize_Ofast
perf_hadamard_seq_explicit_vectorize_O3 perf_hadamard_seq_explicit_vectorize_Ofast)
/***************************************
Auteur : Pierre Aubert
Mail : aubertp7@gmail.com
Licence : CeCILL-C
****************************************/
#include "hadamard_product_vectorized.h"
#define FLOAT_VECTOR_ALIGNEMENT 32
///Compute the element-wise (Hadamard) product of two tables of float
/**	@param[out] ptabResult : table which receives ptabX[i]*ptabY[i] for each i
 * 	@param ptabX : first input table
 * 	@param ptabY : second input table
 * 	@param nbElement : number of elements to process in each table
 *
 * 	Preconditions taken from the hints given to the compiler :
 * 	- the three pointers are declared __restrict__, so the tables must not overlap
 * 	- each pointer is passed to __builtin_assume_aligned with FLOAT_VECTOR_ALIGNEMENT,
 * 	  so each table must start on a 32-byte boundary
*/
void hadamard_product(float* __restrict__ ptabResult, const float* __restrict__ ptabX, const float* __restrict__ ptabY, long unsigned int nbElement){
	//Re-declare every table through the alignment hint so the loop below works on the hinted pointers
	float* const alignedResult = static_cast<float*>(__builtin_assume_aligned(ptabResult, FLOAT_VECTOR_ALIGNEMENT));
	const float* const alignedX = static_cast<const float*>(__builtin_assume_aligned(ptabX, FLOAT_VECTOR_ALIGNEMENT));
	const float* const alignedY = static_cast<const float*>(__builtin_assume_aligned(ptabY, FLOAT_VECTOR_ALIGNEMENT));

	//One independent multiplication per element
	for(long unsigned int idx = 0lu; idx != nbElement; ++idx){
		alignedResult[idx] = alignedX[idx]*alignedY[idx];
	}
}
/***************************************
Auteur : Pierre Aubert
Mail : aubertp7@gmail.com
Licence : CeCILL-C
****************************************/
//Guard named after this header (hadamard_product_vectorized.h) and without the double
//underscores, which are reserved for the implementation
#ifndef HADAMARD_PRODUCT_VECTORIZED_H
#define HADAMARD_PRODUCT_VECTORIZED_H
///Compute the element-wise (Hadamard) product of two tables of float
/**	@param[out] ptabResult : table which receives ptabX[i]*ptabY[i] for each i
 * 	@param ptabX : first input table
 * 	@param ptabY : second input table
 * 	@param nbElement : number of elements to process in each table
 *
 * 	The pointers are __restrict__ : the three tables must not overlap.
 * 	The implementation also assumes that each table starts on a 32-byte boundary.
*/
void hadamard_product(float* __restrict__ ptabResult, const float* __restrict__ ptabX, const float* __restrict__ ptabY, long unsigned int nbElement);
#endif
/***************************************
Auteur : Pierre Aubert
Mail : aubertp7@gmail.com
Licence : CeCILL-C
****************************************/
#include <iostream>
#include "micro_benchmark.h"
#include "PTensor.h"
#include "hadamard_product_vectorized.h"
///Get the number of nanoseconds per elements
/** @param nbElement : number of elements of the tables
*/
void evaluateHadamardProduct(size_t nbElement){
PTensor<float> tabX(AllocMode::ALIGNED, nbElement);
PTensor<float> tabY(AllocMode::ALIGNED, nbElement);
PTensor<float> tabRes(AllocMode::ALIGNED, nbElement);
for(size_t i(0lu); i < nbElement; ++i){
tabX[i] = i*19lu%11;
tabY[i] = i*27lu%19;
}
micro_benchmarkAutoNsPrint("evaluate hadamard vectorized", nbElement, hadamard_product, tabRes.getData(), tabX.getData(), tabY.getData(), nbElement);
}
///Entry point of the vectorized Hadamard product benchmark
/**	@param argc : number of arguments passed to the program
 * 	@param argv : table of arguments passed to the program
 * 	@return value returned by micro_benchmarkParseArg
*/
int main(int argc, char** argv){
	//The command line and the function to evaluate are both forwarded to the benchmark helper
	const int exitStatus = micro_benchmarkParseArg(argc, argv, evaluateHadamardProduct);
	return exitStatus;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment