Commit 331069eb authored by dino's avatar dino
Browse files

First implementation of a PSA Grid Search with two interactions per hit segment.

This feature is enabled by the keyword "GridSearch AdaptiveTwoHits" in PSSFilter.conf

git-svn-id: svn://gal-serv.lnl.infn.it/agata/trunk/narval_emulator@1015 170316e4-aea8-4b27-aad4-0380ec0519c9
parent af670de4
......@@ -270,7 +270,7 @@ int main(int argc, char **argv)
ADF::CrystalInterface::kDefaultLength = defTraceLengthRaw;
// Because of the static variables used to register and configure the actors in the Narval environment,
// a flat emulator that creates multiple instances of the same class in the same process, has t2 options:
// a flat emulator that creates multiple instances of the same class in the same process, has 2 options:
// 1) call process_config immediately before creating each of the objects so that the constructors can
// move the globals to local storage (essentially gConfPath --> fConfPath)
// 2) create the objects and call process_config and process_initialise in sequence for all of them; in this
......
......@@ -45,7 +45,7 @@
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..\PRISMA\src\lib_prisma\include;"C:\Program Files (x86)\boost\boost_1_40";..\myADF0.2;..\myADF0.2\standalone;..\common;..\producers\Crystal;..\producers\Crystal\includeATCA;..\producers\AncillaryTCP;..\filters\Preprocessing;..\filters\Preprocessing\includePrePSA;..\filters\Ancillary;..\filters\Ancillary\includeVME;..\filters\PSA;..\filters\PSA\includePSA;..\filters\Tracking;..\filters\Tracking\includeOFT;..\filters\Tracking\includeMGT;..\builders;C:\root\include"
PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;NRV_TYPE=NRV_OFFLINE;TF_ROOTTREE;USE_SSE_VERSION"
PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;NRV_TYPE=NRV_OFFLINE;TF_ROOTTREE"
MinimalRebuild="true"
BasicRuntimeChecks="3"
RuntimeLibrary="3"
......
......@@ -479,7 +479,7 @@ void EventBuilder::process_block( int ichain,
cout << " **";
cout << endl;
}
//cServer.Check();
cServer.Check();
if(oevCount > 0)
noBuild1 = noBuild2 = 0;
......
......@@ -113,7 +113,7 @@ const int defTriggerSample = 10; // 60-10 useful samples passed to the PS
/////////////////////////////////////
//#define PPF_NO_ADF // PreprocessingFilter does not use ADF to decode/code the IO buffers (still working??)
//#define PPF_CHECK_EE_ES // Discard event if CC energy is too different from energy derived from trace amplitude
#define PPF_CHECK_EE_ES // Discard event if CC energy is too different from energy derived from trace amplitude
#define PPF_LOCALSPECTRA // enable nDhist spectra
//#define PPF_FromGRU_ // enable GRU/ViGRU
......@@ -121,7 +121,7 @@ const int defTriggerSample = 10; // 60-10 useful samples passed to the PS
//////// PSAFilter ////////
///////////////////////////
#define PSA_THREADED // enable splitting the PSA algorithm into parallel threads
#define PSA_THREADED // enable running the PSA algorithm in parallel threads; their actual number is given in the set-up (zero --> no threads)
#define PSA_LOCALSPECTRA // enable nDhist spectra
//#define PSA_FromGRU_ // enable GRU/ViGRU
......@@ -144,7 +144,7 @@ const int defTriggerSample = 10; // 60-10 useful samples passed to the PS
//#define TF_ROOTTREE // enable the root tree in this actor (ENABLED IN THE MAKEFILE)
//#define TF_WRITE_TRACKED // enable writing the tracked gammas into the Oft_TrackedEnergies ascii file
//#define TF_WRITE_GSORT // enable writing input and tracked data in gasp format
//#define TF_WRITE_MGT // enable writing the input hits in mgt format
#define TF_WRITE_MGT // enable writing the input hits in mgt format
#define TF_LOCALSPECTRA // enable nDhist spectra
//#define TF_FromGRU_ // enable GRU/ViGRU
......
......@@ -66,7 +66,7 @@ public:
class cycleServer
{
unsigned long long tstStep; // first few times is faster 1, 2, 4, ..., tStep
unsigned long long tstStart; // tst-seconds or the first timestamp
unsigned long long tstStart; // tst-seconds of the first timestamp
unsigned long long tstCheck; // tst-seconds of last check
unsigned long long tstWrite; // tst-seconds of last write
unsigned long long tstThis; // tst-seconds of this call
......
......@@ -314,8 +314,8 @@ void AncillaryFilter::GetParameters(UInt_t *error_code)
}
}
// this file contains the definition of the Converters
// and calibration coefficients. The method is invoked by the daughter class
// This file contains the definition of the Converters and the Calibration Coefficients.
// The method is invoked by the daughter class
bool AncillaryFilter::Decodesetup(bool verbose)
{
if(!setupFile.size()) {
......@@ -560,7 +560,7 @@ UInt_t AncillaryFilter::ProcessBlock (ADF::FrameBlock &in, ADF::FrameBlock &out)
LOCK_COUT;
cServer.Prompt(-1, nevs, fBlockOut.GetSize());
//cServer.Check();
cServer.Check();
#ifdef AF_ROOTTREE
if(fWriteRootTree && stop_called) {
......
......@@ -66,6 +66,7 @@ PSAFilter::PSAFilter() :
#endif
fUseAdaptive = true;
fTryTwoHits = false;
fCoarseOnly = false;
fPsaSegCenter = false;
fFullSearch = false;
......@@ -170,7 +171,7 @@ void PSAFilter::process_initialise (UInt_t *error_code)
fFrameIO.SetStatus(BaseFrameIO::kIdle);
// generate the interface to the grid search algorithm
if(fPsaCount < 1 || fPsaModulo < 1) {
if(fPsaCount < 1 || fPsaModulo < 1 || fPsaSegCenter) {
fPsaCount = 0;
fPsaModulo = 1;
fPsaSlots = 1;
......@@ -259,27 +260,33 @@ void PSAFilter::GetParameters(UInt_t *error_code)
ok = true;
}
else if( stringEq(keyw, "GridSearch") || stringEq(keyw, "GridSearchType") ) {
if( stringEq(data, "Center") || stringEq(data, "SegCenter") ) {
fUseAdaptive = false; fCoarseOnly = false; fPsaSegCenter = true; fFullSearch = false;
}
else if( stringEq(data, "Full") ) {
fUseAdaptive = false; fCoarseOnly = false; fPsaSegCenter = false; fFullSearch = true;
if( stringEq(data, "Full") ) {
fUseAdaptive = false; fTryTwoHits = false; fCoarseOnly = false; fPsaSegCenter = false; fFullSearch = true;
}
else if( stringEq(data, "Adaptive") ) {
fUseAdaptive = true; fCoarseOnly = false; fPsaSegCenter = false; fFullSearch = false;
fUseAdaptive = true; fTryTwoHits = false; fCoarseOnly = false; fPsaSegCenter = false; fFullSearch = false;
}
else if( stringEq(data, "AdaptiveTwoHits") ) {
fUseAdaptive = true; fTryTwoHits = true ; fCoarseOnly = false; fPsaSegCenter = false; fFullSearch = false;
}
else if(stringEq(data, "CoarseOnly") ) {
fUseAdaptive = true; fCoarseOnly = true; fPsaSegCenter = false; fFullSearch = false;
fUseAdaptive = true; fTryTwoHits = false; fCoarseOnly = true; fPsaSegCenter = false; fFullSearch = false;
}
//else if(stringEq(data, "CoarseOnlyTwoHits") ) {
// fUseAdaptive = true; fTryTwoHits = true ; fCoarseOnly = true; fPsaSegCenter = false; fFullSearch = false;
//}
else if( stringEq(data, "Center") || stringEq(data, "SegCenter") ) {
fUseAdaptive = false; fTryTwoHits = false; fCoarseOnly = false; fPsaSegCenter = true; fFullSearch = false;
}
else {
fUseAdaptive = true; fCoarseOnly = false; fPsaSegCenter = false; fFullSearch = false; // Adaptive
fUseAdaptive = true; fTryTwoHits = false; fCoarseOnly = false; fPsaSegCenter = false; fFullSearch = false; // defaults to Adaptive
}
ok = true;
}
else if( stringEq(keyw, "PlaceAtSegCenter") ) { // keyword to be removed
fUseAdaptive = false; fCoarseOnly = false; fPsaSegCenter = true; fFullSearch = false;
ok = true;
}
//else if( stringEq(keyw, "PlaceAtSegCenter") ) { // keyword to be removed
// fUseAdaptive = false; fTryTwoHits = false; fCoarseOnly = false; fPsaSegCenter = true; fFullSearch = false;
// ok = true;
//}
else if( stringEq(keyw, "Threads") ) {
#ifdef PSA_THREADED
ok = 2 == sscanf(data.c_str(), "%d %d", &fPsaCount, &fPsaModulo);
......@@ -384,11 +391,20 @@ Int_t PSAFilter::SetOutput(int slot)
for(UInt_t nh = 0; nh < pD->numHits; nh++, pOut++) {
ADF::PSAHit *hit = (PSAHit*)fFramePSA->Data()->NewHit();
hit->Reset();
hit->SetE(pOut->enerSG);
hit->SetXYZ(pOut->fx, pOut->fy, pOut->fz);
hit->SetE(pOut->enerSG*pOut->bestFactor);
hit->SetXYZ(pOut->fx1, pOut->fy1, pOut->fz1);
hit->SetID(pOut->netChargeSeg);
hit->SetT(pOut->dTns);
hit->SetDT(pOut->chiSq); // chi2 temporarily stored in the error DT
if(pOut->bestPoint2 >= 0) {
ADF::PSAHit *hit = (PSAHit*)fFramePSA->Data()->NewHit();
hit->Reset();
hit->SetE(pOut->enerSG*(1.f-pOut->bestFactor));
hit->SetXYZ(pOut->fx2, pOut->fy2, pOut->fz2);
hit->SetID(pOut->netChargeSeg);
hit->SetT(pOut->dTns);
hit->SetDT(pOut->chiSq); // chi2 temporarily stored in the error DT
}
}
((AgataKey *)fFramePSA->GetKey())->SetEventNumber(pD->evnumber);
......@@ -502,7 +518,7 @@ UInt_t PSAFilter::ProcessBlockNoThreads(ADF::FrameBlock &in, ADF::FrameBlock &ou
LOCK_COUT;
cServer.Prompt(crystal_id, nevs, fBlockOut.GetSize() );
//cServer.Check();
cServer.Check();
return error_code;
}
......@@ -712,7 +728,7 @@ UInt_t PSAFilter::ProcessBlockThreads(ADF::FrameBlock &in, ADF::FrameBlock &out)
LOCK_COUT;
cServer.Prompt(crystal_id, nevs, fBlockOut.GetSize() );
//cServer.Check();
cServer.Check();
return error_code;
}
......
......@@ -54,26 +54,33 @@ struct CSlot
struct PsaOut_t
{
PsaOut_t() : enerSG(0), enerCC(0),
fx(0), fy(0), fz(0),
fx1(0), fy1(0), fz1(0), fx2(0), fy2(0), fz2(0),
dTns(0), chiSq(0),
netChargeSeg(0), bestPoint(0),
corrCC(1.f), corrSG(1.f)
netChargeSeg(-1), bestPoint1(-1), bestPoint2(-1), bestFactor(1.f),
corrCC1(1.f), corrSG1(1.f), corrCC2(1.f), corrSG2(1.f)
{;}
void Reset() {
enerSG = enerCC = 0;
fx = fy = fz = 0;
fx1 = fy1 = fz1 = 0;
fx2 = fy2 = fz2 = 0;
dTns = chiSq = 0;
netChargeSeg = bestPoint = 0;
corrCC = corrSG = 1.f;
netChargeSeg = bestPoint1 = bestPoint2 = -1;
bestFactor = 1.f;
corrCC1 = corrSG1 = 1.f;
corrCC2 = corrSG2 = 1.f;
}
float enerSG; // segment energy
float enerCC; // fractional core energy
float fx, fy, fz; // hit position (mm)
float fx1, fy1, fz1; // hit position (mm) of first hit
float fx2, fy2, fz2; // hit position (mm) of second hit
float dTns; // time shift of this interaction <== unused
float chiSq; // chi square of the best point <== what is the meaning of this ?
int netChargeSeg; // the index of the segment of this hit
int bestPoint; // used in PostProcess() to extract information stored in the signal basis
float corrCC, corrSG; // charge-trapping correction factors
int bestPoint1; // used in PostProcess() to extract information stored in the signal basis
int bestPoint2; // used in PostProcess() to extract information stored in the signal basis
float bestFactor; // relative amplitude of bestPoint1
float corrCC1, corrSG1; // charge-trapping correction factors of first hit
float corrCC2, corrSG2; // charge-trapping correction factors of second hit
};
// Data interface to PSAGridSearch.
......@@ -187,6 +194,7 @@ public:
#endif
Bool_t fUseAdaptive;
Bool_t fTryTwoHits;
Bool_t fCoarseOnly;
Bool_t fPsaSegCenter;
Bool_t fFullSearch;
......
This diff is collapsed.
......@@ -12,6 +12,11 @@
#include "GridSearchParams.h"
#include "SignalBasis.h"
#ifdef USE_SSE_VERSION // defined in GridSearchParams.h
#include <xmmintrin.h>
#include <tmmintrin.h>
#endif
//#define SPECMETRICS // to produce specMetrics in SearchFullGrid and SearchAdaptive ==> will be very slow and is not thread-safe
#ifdef PSA_FromGRU_
......@@ -74,6 +79,7 @@ private:
char hmask[NSEGS][NSEGS+4]; // Some padding to have a "good" length
bool gUseAdaptive;
bool gTryTwoHits;
bool gCoarseOnly;
bool gPsaSegCenter;
bool gFullSearch;
......@@ -117,7 +123,8 @@ private:
void InitialSolution(pointFull &pS);
void PreSearchCoarse(pointFull &pS);
int SearchFullGrid (pointFull *pS, int netChSeg);
int SearchAdaptive (pointFull *pS, int netChSeg, bool bCoarseOnly = false, bool bFirstOnly = false);
int SearchAdaptive1(pointFull *pS, int netChSeg, bool bCoarseOnly = false, bool bFirstOnly = false); // 1 hit
int SearchAdaptive2(pointFull *pS, int netChSeg, bool bCoarseOnly = false, bool bFirstOnly = false); // 2 hits
void MakeLocalMask (pointFull &pS);
bool WriteBaseAverPt(float scale, std::string ext);
bool WriteBasePoint (float scale, pointPsa &pt, FILE *FP);
......@@ -126,7 +133,7 @@ private:
int RandomSplit2(pointFull &pS, int ntry);
protected:
// this is written in a thread-safe mode and can be called in parallel, using different data slots
// this is written in a thread-safe mode and can be called in parallel, using different data slots
Int_t Process(int slot = 0, int nslots = 1);
// this is not thread-safe and must be called sequentially
Int_t PostProcess(int slot = 0);
......@@ -145,6 +152,7 @@ public:
int WriteTraces (PsaData *pD);
int WritePsaHits (PsaOut_t *pOut, int index = 0);
void AddToSolution (pointFull &pS, pointPsa &bestPoint, int numsamp, float scaleFact);
void AddToSolution (pointFull &pS, pointPsa &bestPoint1, pointPsa &bestPoint2, float bestFac, int numsamp, float scaleFact);
void SaveTotalTrace(pointFull &pS, PsaData *pD);
Float_t FitT0FromCore (pointFull &pS, int tsamp);
......@@ -157,9 +165,24 @@ private:
void MatChi2(float chi, pointPsa* pPsa);
private:
float Chi2InnerLoop(const float *pReal, const float *pBase, const int uSamples);
float Chi2InnerLoop(const int *pReal, const int *pBase, const int uSamples);
float Chi2InnerLoop(const short *pReal, const short *pBase, const int uSamples);
float Chi2InnerLoop(const float *pReal, const float *pBase);
float Chi2InnerLoop(const int *pReal, const int *pBase);
float Chi2InnerLoop(const short *pReal, const short *pBase);
float Chi2InnerLoop(const float *pReal, const float *pBase1, const float *pBase2);
float Chi2OuterLoop(const float **pAmpliReal, const float **pAmpliBase, const int nss, const float chi2min);
float Chi2OuterLoop(const float **pAmpliReal, const float **pAmpliBase1, const float **pAmpliBase2, const int nss, const float chi2min);
int loopSamples;
void SetLoopSamples(int nn) {loopSamples = nn;}
#ifdef USE_SSE_VERSION
__m128 loopFac1, loopFac2;
void SetLoopFactors(float ff1, float ff2) {loopFac1 = _mm_set_ps(ff1, ff1, ff1, ff1); loopFac2 = _mm_set_ps(ff2, ff2, ff2, ff2);}
#else
float loopFac1, loopFac2;
void SetLoopFactors(float ff1, float ff2) {loopFac1 = ff1; loopFac2 = ff2;}
#endif
};
#if defined(__GNUC__)
......@@ -170,12 +193,13 @@ private:
#ifdef USE_SSE_VERSION // defined in GridSearchParams.h
#include <xmmintrin.h>
#include <tmmintrin.h>
// The vectorized version of the FOM loop does not use the mapped metrics, but a fixed power of 2, 1, 0.5 or 0.25, by chosing one of the following macros
#define COMPUTE_VALUE0(nn) ( diff = _mm_sub_ps(realTrace[nn], baseTrace[nn]), chis = _mm_add_ps(chis, _mm_mul_ps(diff, diff)) ) // pow( d ,2)
//////////////////////////////////////
///////////// 1 interaction //////////
//////////////////////////////////////
#define COMPUTE_VALUE0(nn) ( diff = _mm_sub_ps(realTrace[nn], baseTrace[nn]), chis = _mm_add_ps(chis, _mm_mul_ps(diff, diff)) ) // pow( d ,2 )
#define COMPUTE_VALUE1(nn) ( chis = _mm_add_ps(chis, _mm_andnot_ps(masks, _mm_sub_ps(realTrace[nn], baseTrace[nn])) ) ) // pow(|d|,1 )
#define COMPUTE_VALUE2(nn) ( chis = _mm_add_ps(chis, _mm_sqrt_ps(_mm_andnot_ps(masks, _mm_sub_ps(realTrace[nn], baseTrace[nn])) ) ) ) // pow(|d|,1/2)
#define COMPUTE_VALUE3(nn) ( chis = _mm_add_ps(chis, _mm_sqrt_ps(_mm_sqrt_ps(_mm_andnot_ps(masks, _mm_sub_ps(realTrace[nn], baseTrace[nn])) )) ) ) // pow(|d|,1/4)
......@@ -184,7 +208,7 @@ private:
// As realTrace is used repeatedly, it will be in the cache after the first point. Therefore it is not worth prefetching it (actually it seems to be a bit slower)
// The basis is used only once ==> we should prefetch it with _MM_HINT_NTA (non temporal with minimal cache pollution). But _MM_HINT_T0 seems to be a bit faster.
// Anyway none of them brings real improvements (at least with my laptop) and can therefore be safely commented-out
// Anyway none of them brings real improvements (at least with my laptop) and could therefore be safely commented-out
#if defined(__GNUC__)
# define PREFETCH_REAL(nn) //( __builtin_prefetch (&realTrace[nn], 0, 1) )
# define PREFETCH_BASE(nn) ( __builtin_prefetch (&baseTrace[nn], 0, 0) )
......@@ -232,67 +256,200 @@ private:
#define COMPUTE_VALUE_11_
#endif
INLINE_ALWAYS float PSAFilterGridSearch::Chi2InnerLoop(const float *pReal, const float *pBase, const int/* uSamples*/)
// the number of used samples is fixed to SFIXEDLEN
INLINE_ALWAYS float PSAFilterGridSearch::Chi2InnerLoop(const float *pReal, const float *pBase)
{
const __m128 masks = _mm_set1_ps(-0.0f);
const __m128 zeros = _mm_setzero_ps();
__m128* realTrace = (__m128*)pReal;
__m128* baseTrace = (__m128*)pBase;
__m128* realTrace = (__m128*)pReal;
__m128* baseTrace = (__m128*)pBase;
__m128 diff = _mm_setzero_ps();
__m128 chis = _mm_setzero_ps();
//// intermix the prefetch and compute lines to get the desired prefetch lag
//PREFETCH_REAL( 0); PREFETCH_REAL( 1); PREFETCH_REAL( 2); PREFETCH_REAL( 3);
//PREFETCH_REAL( 4); PREFETCH_REAL( 5); PREFETCH_REAL( 6); PREFETCH_REAL( 7);
//PREFETCH_REAL_08_; PREFETCH_REAL_09_; PREFETCH_REAL_10_; PREFETCH_REAL_11_;
//PREFETCH_BASE( 0); PREFETCH_BASE( 1); PREFETCH_BASE( 2); PREFETCH_BASE( 3);
//PREFETCH_BASE( 4); PREFETCH_BASE( 5); PREFETCH_BASE( 6); PREFETCH_BASE( 7);
//PREFETCH_BASE_08_; PREFETCH_BASE_09_; PREFETCH_BASE_10_; PREFETCH_BASE_11_;
COMPUTE_VALUE( 0); COMPUTE_VALUE( 1); COMPUTE_VALUE( 2); COMPUTE_VALUE( 3);
COMPUTE_VALUE( 4); COMPUTE_VALUE( 5); COMPUTE_VALUE( 6); COMPUTE_VALUE( 7);
COMPUTE_VALUE_08_; COMPUTE_VALUE_09_; COMPUTE_VALUE_10_; COMPUTE_VALUE_11_;
//for(int ii = 0; ii < SSE_NUM; ii++) // slightly slower than than the unrolled upper version
// COMPUTE_VALUE(ii);
//for(int ii = 0; ii < loopSamples/4; ii++) // 5% slower if the number of iteration is not a compile-time constant.
// COMPUTE_VALUE(ii);
float chi2 = _mm_cvtss_f32(_mm_hadd_ps(_mm_hadd_ps(chis, zeros), zeros)); // sum the 4 values and save in chi2
return chi2;
}
///////////////////////////////////////
///////////// 2 interactions //////////
///////////////////////////////////////
#define C2_WAVE(nn) ( wave = _mm_add_ps(_mm_mul_ps(loopFac1, baseTrace1[nn]), _mm_mul_ps(loopFac2, baseTrace2[nn]))) // ff1*ww1+ff2*ww2
#define COMPUTE2_VALUE0(nn) ( diff = _mm_sub_ps(realTrace[nn], C2_WAVE(nn)), chis = _mm_add_ps(chis, _mm_mul_ps(diff, diff)) ) // pow( d ,2 )
#define COMPUTE2_VALUE1(nn) ( chis = _mm_add_ps(chis, _mm_andnot_ps(masks, _mm_sub_ps(realTrace[nn], C2_WAVE(nn))) ) ) // pow(|d|,1 )
#define COMPUTE2_VALUE2(nn) ( chis = _mm_add_ps(chis, _mm_sqrt_ps(_mm_andnot_ps(masks, _mm_sub_ps(realTrace[nn], C2_WAVE(nn))) ) ) ) // pow(|d|,1/2)
#define COMPUTE2_VALUE3(nn) ( chis = _mm_add_ps(chis, _mm_sqrt_ps(_mm_sqrt_ps(_mm_andnot_ps(masks, _mm_sub_ps(realTrace[nn], C2_WAVE(nn))) )) ) ) // pow(|d|,1/4)
#define COMPUTE2_VALUE(nn) COMPUTE2_VALUE0(nn)
#if defined(__GNUC__)
# define PREFETCH_BASE1(nn) ( __builtin_prefetch (&baseTrace1[nn], 0, 0) )
# define PREFETCH_BASE2(nn) ( __builtin_prefetch (&baseTrace2[nn], 0, 0) )
#elif defined(_MSC_VER)
# define PREFETCH_BASE1(nn) ( _mm_prefetch ((const char *)&baseTrace1[ nn], _MM_HINT_NTA) )
# define PREFETCH_BASE2(nn) ( _mm_prefetch ((const char *)&baseTrace2[ nn], _MM_HINT_NTA) )
#endif
#if SSE_NUM > 8
#define PREFETCH_BASE1_08_ PREFETCH_BASE1( 8)
#define PREFETCH_BASE2_08_ PREFETCH_BASE2( 8)
#define COMPUTE2_VALUE_08_ COMPUTE2_VALUE( 8)
#else
#define PREFETCH_BASE1_08_
#define PREFETCH_BASE2_08_
#define COMPUTE2_VALUE_08_
#endif
#if SSE_NUM > 9
#define PREFETCH_BASE1_09_ PREFETCH_BASE1( 9)
#define PREFETCH_BASE2_09_ PREFETCH_BASE2( 9)
#define COMPUTE2_VALUE_09_ COMPUTE2_VALUE( 9)
#else
#define PREFETCH_BASE1_09_
#define PREFETCH_BASE2_09_
#define COMPUTE2_VALUE_09_
#endif
#if SSE_NUM > 10
#define PREFETCH_BASE1_10_ PREFETCH_BASE1(10)
#define PREFETCH_BASE2_10_ PREFETCH_BASE2(10)
#define COMPUTE2_VALUE_10_ COMPUTE2_VALUE(10)
#else
#define PREFETCH_BASE1_10_
#define PREFETCH_BASE2_10_
#define COMPUTE2_VALUE_10_
#endif
#if SSE_NUM > 11
#define PREFETCH_BASE1_11_ PREFETCH_BASE1(11)
#define PREFETCH_BASE2_11_ PREFETCH_BASE2(11)
#define COMPUTE2_VALUE_11_ COMPUTE2_VALUE(11)
#else
#define PREFETCH_BASE1_11_
#define PREFETCH_BASE2_11_
#define COMPUTE2_VALUE_11_
#endif
// the number of used samples is fixed to SFIXEDLEN
// the weight of the two components must be set by a preceeding call to SetLoopFactors(ff1, ff2)
INLINE_ALWAYS float PSAFilterGridSearch::Chi2InnerLoop(const float *pReal, const float *pBase1, const float *pBase2)
{
const __m128 masks = _mm_set1_ps(-0.0f);
const __m128 zeros = _mm_setzero_ps();
__m128* realTrace = (__m128*)pReal;
__m128* baseTrace1 = (__m128*)pBase1;
__m128* baseTrace2 = (__m128*)pBase2;
__m128 wave = _mm_setzero_ps();
__m128 diff = _mm_setzero_ps();
__m128 chis = _mm_setzero_ps();
// intermix the prefetch and compute lines to get the desired prefetch lag
PREFETCH_REAL( 0); PREFETCH_REAL( 1); PREFETCH_REAL( 2); PREFETCH_REAL( 3);
PREFETCH_REAL( 4); PREFETCH_REAL( 5); PREFETCH_REAL( 6); PREFETCH_REAL( 7);
PREFETCH_REAL_08_; PREFETCH_REAL_09_; PREFETCH_REAL_10_; PREFETCH_REAL_11_;
PREFETCH_BASE( 0); PREFETCH_BASE( 1); PREFETCH_BASE( 2); PREFETCH_BASE( 3);
PREFETCH_BASE( 4); PREFETCH_BASE( 5); PREFETCH_BASE( 6); PREFETCH_BASE( 7);
PREFETCH_BASE_08_; PREFETCH_BASE_09_; PREFETCH_BASE_10_; PREFETCH_BASE_11_;
PREFETCH_BASE1( 0); PREFETCH_BASE1( 1); PREFETCH_BASE1( 2); PREFETCH_BASE1( 3);
PREFETCH_BASE1( 4); PREFETCH_BASE1( 5); PREFETCH_BASE1( 6); PREFETCH_BASE1( 7);
PREFETCH_BASE1_08_; PREFETCH_BASE1_09_; PREFETCH_BASE1_10_; PREFETCH_BASE1_11_;
COMPUTE_VALUE( 0); COMPUTE_VALUE( 1); COMPUTE_VALUE( 2); COMPUTE_VALUE( 3);
COMPUTE_VALUE( 4); COMPUTE_VALUE( 5); COMPUTE_VALUE( 6); COMPUTE_VALUE( 7);
COMPUTE_VALUE_08_; COMPUTE_VALUE_09_; COMPUTE_VALUE_10_; COMPUTE_VALUE_11_;
PREFETCH_BASE2( 0); PREFETCH_BASE2( 1); PREFETCH_BASE2( 2); PREFETCH_BASE2( 3);
PREFETCH_BASE2( 4); PREFETCH_BASE2( 5); PREFETCH_BASE2( 6); PREFETCH_BASE2( 7);
PREFETCH_BASE2_08_; PREFETCH_BASE2_09_; PREFETCH_BASE2_10_; PREFETCH_BASE2_11_;
COMPUTE2_VALUE( 0); COMPUTE2_VALUE( 1); COMPUTE2_VALUE( 2); COMPUTE2_VALUE( 3);
COMPUTE2_VALUE( 4); COMPUTE2_VALUE( 5); COMPUTE2_VALUE( 6); COMPUTE2_VALUE( 7);
COMPUTE2_VALUE_08_; COMPUTE2_VALUE_09_; COMPUTE2_VALUE_10_; COMPUTE2_VALUE_11_;
float chi2 = _mm_cvtss_f32(_mm_hadd_ps(_mm_hadd_ps(chis, zeros), zeros)); // sum the 4 values and save in chi2
return chi2;
}
INLINE_ALWAYS float PSAFilterGridSearch::Chi2OuterLoop(const float **pAmpliReal, const float **pAmpliBase, const int nss, const float chi2min)
{
const __m128 masks = _mm_set1_ps(-0.0f);
const __m128 zeros = _mm_setzero_ps();
__m128 diff = _mm_setzero_ps();
__m128 chis = _mm_setzero_ps();
float chi2 = 0;
for(int iss = 0; iss < nss; iss++) {
__m128* realTrace = (__m128*)pAmpliReal[iss];
__m128* baseTrace = (__m128*)pAmpliBase[iss];
COMPUTE_VALUE( 0); COMPUTE_VALUE( 1); COMPUTE_VALUE( 2); COMPUTE_VALUE( 3);
COMPUTE_VALUE( 4); COMPUTE_VALUE( 5); COMPUTE_VALUE( 6); COMPUTE_VALUE( 7);
COMPUTE_VALUE_08_; COMPUTE_VALUE_09_; COMPUTE_VALUE_10_; COMPUTE_VALUE_11_;
chi2 = _mm_cvtss_f32(_mm_hadd_ps(_mm_hadd_ps(chis, zeros), zeros)); // sum the 4 values add to chi2
if(chi2 > chi2min)
break;
}
return chi2;
}
#else // #ifdef USE_SSE_VERSION
// the standard, non-vectorized versions
INLINE_ALWAYS float PSAFilterGridSearch::Chi2InnerLoop(const float *realTrace, const float *baseTrace, const int uSamples)
// loopSamples is defined by a preceeding cal to SetLoopSamples(uSamples)
INLINE_ALWAYS float PSAFilterGridSearch::Chi2InnerLoop(const float *pReal, const float *pBase)
{
const float *metrics = fMetrics + RMETRIC;
float chi2 = 0;
for(int nn = 0; nn < uSamples; nn++) {
float diff = realTrace[nn] - baseTrace[nn];
for(int nn = 0; nn < loopSamples; nn++) {
float diff = pReal[nn] - pBase[nn];
#ifdef SPECMETRICS // be sure that specMetrics is defined and that you are NOT USING THREADS
int idiff = int(diff + RMETRIC);
specMetrics->incr(0, idiff);
if(idiff<=0 || idiff>=2*RMETRIC-1) {
int nerrr = 1; // just to catch the outliers with the debugger
int nerrr = 1; // to catch the outliers with the debugger
}
#endif
chi2 += metrics[int(diff)];
//chi2 += sqrt(fabs(diff)); // to test direct calculation of the metrics
//chi2 += metrics[int(diff)];
//chi2 += pow(fabs(diff), float(METRIC)); // direct use of the pow function --> very slow
chi2 += diff*diff; // direct calculation of pow( d , 2 )
//chi2 += fabs(diff); // direct calculation of pow(|d|, 1 )
//chi2 += sqrt(fabs(diff)); // direct calculation of pow(|d|, 1/2)
//chi2 += sqrt(sqrt(fabs(diff))); // direct calculation of pow(|d|, 1/4)
}
return chi2;
}
INLINE_ALWAYS float PSAFilterGridSearch::Chi2InnerLoop(const int *realTrace, const int *baseTrace, const int uSamples)
INLINE_ALWAYS float PSAFilterGridSearch::Chi2InnerLoop(const int *pReal, const int *pBase)
{
const float *metrics = fMetrics + RMETRIC;
float chi2 = 0;
for(int nn = 0; nn < uSamples; nn++) {
int diff = realTrace[nn] - baseTrace[nn];
for(int nn = 0; nn < loopSamples; nn++) {
int diff = pReal[nn] - pBase[nn];
diff >>= RESCALE;
#ifdef SPECMETRICS // be sure that specMetrics is defined and that you are NOT USING THREADS
int idiff = int(diff + RMETRIC);
specMetrics->incr(0, idiff);
if(idiff<=0 || idiff>=2*RMETRIC-1) {
int nerrr = 1; // just to catch the outliers with the debugger
int nerrr = 1; // to catch the outliers with the debugger
}
#endif
chi2 += metrics[diff];
......@@ -300,17 +457,17 @@ INLINE_ALWAYS float PSAFilterGridSearch::Chi2InnerLoop(const int *realTrace, con
return chi2;
}
INLINE_ALWAYS float PSAFilterGridSearch::Chi2InnerLoop(const short *realTrace, const short *baseTrace, const int uSamples)
INLINE_ALWAYS float PSAFilterGridSearch::Chi2InnerLoop(const short *pReal, const short *pBase)
{
const float *metrics = fMetrics + RMETRIC;
float chi2 = 0;
for(int nn = 0; nn < uSamples; nn++) {
short diff = realTrace[nn] - baseTrace[nn];
for(int nn = 0; nn < loopSamples; nn++) {
short diff = pReal[nn] - pBase[nn];
#ifdef SPECMETRICS // be sure that specMetrics is defined and that you are NOT USING THREADS
int idiff = int(diff + RMETRIC);
specMetrics->incr(0, idiff);
if(idiff<=0 || idiff>=2*RMETRIC-1) {
int nerrr = 1; // just to catch the outliers with the debugger
int nerrr = 1; // to catch the outliers with the debugger
}
#endif
chi2 += metrics[diff];
......@@ -318,6 +475,46 @@ INLINE_ALWAYS float PSAFilterGridSearch::Chi2InnerLoop(const short *realTrace, c
return chi2;
}
// loop with 2 hits
// the weight of the two components must be set by a preceeding call to SetLoopFactors(ff1, ff2)
INLINE_ALWAYS float PSAFilterGridSearch::Chi2InnerLoop(const float *pReal, const float *pBase1, const float *pBase2)
{
const float *metrics = fMetrics + RMETRIC;
float chi2 = 0;
for(int nn = 0; nn < loopSamples; nn++) {
float diff = pReal[nn] - (loopFac1*pBase1[nn] + loopFac2*pBase2[nn]);
#ifdef SPECMETRICS // be sure that specMetrics is defined and that you are NOT USING THREADS
int idiff = int(diff + RMETRIC);