32#include "PBBS/radixSort.h"
33#include "Tommy/tommyhashdyn.h"
56template <
class SR,
class IT,
class NUM,
class IVT,
class OVT>
57void SpImpl<SR,IT,NUM,IVT,OVT>::SpMXSpV(
const Dcsc<IT,NUM> &
Adcsc,
int32_t mA,
const int32_t *
indx,
const IVT * numx,
int32_t veclen,
58 std::vector<int32_t> &
indy, std::vector< OVT > &
numy)
65 if(
sizeof(
NUM) >
sizeof(
OVT))
73 if(SR::returnedSAID())
103 if(!SR::returnedSAID())
134 if (!SR::returnedSAID())
167template <
class SR,
class IT,
class IVT,
class OVT>
168void SpImpl<SR,IT,bool,IVT,OVT>::SpMXSpV(
const Dcsc<IT,bool> &
Adcsc,
int32_t mA,
const int32_t *
indx,
const IVT * numx,
int32_t veclen,
169 std::vector<int32_t> &
indy, std::vector<OVT> &
numy)
171 IT inf = std::numeric_limits<IT>::min();
172 IT sup = std::numeric_limits<IT>::max();
194 if(
sHeap.getSize() > 0)
198 numy.push_back( num );
200 while(
sHeap.getSize() > 0)
205 numy.back() = SR::add(
numy.back(), num);
222template <
typename SR,
typename IT,
typename IVT,
class OVT>
223void SpImpl<SR,IT,bool,IVT,OVT>::SpMXSpV(
const Dcsc<IT,bool> &
Adcsc,
int32_t mA,
const int32_t *
indx,
const IVT * numx,
int32_t veclen,
228 std::vector< std::vector<int32_t> >
nzinds(p_c);
259 for(
int p = 0; p< p_c; ++p)
265 for(
int i=0; i<
cnts[p]; ++i)
276template <
typename SR,
typename IT,
typename IVT,
typename OVT>
277void SpImpl<SR,IT,bool,IVT,OVT>::SpMXSpV_ForThreading(
const Dcsc<IT,bool> &
Adcsc,
int32_t mA,
const int32_t *
indx,
const IVT * numx,
int32_t veclen,
282 std::vector<uint32_t>
nzinds;
284 SpMXSpV_ForThreading(
Adcsc,
mA,
indx, numx,
veclen,
indy,
numy,
offset,
localy, isthere,
nzinds);
290template <
typename SR,
typename IT,
typename IVT,
typename OVT>
291void SpImpl<SR,IT,bool,IVT,OVT>::SpMXSpV_ForThreading(
const Dcsc<IT,bool> &
Adcsc,
int32_t mA,
const int32_t *
indx,
const IVT * numx,
int32_t veclen, std::vector<int32_t> &
indy, std::vector<OVT> &
numy,
int32_t offset, std::vector<OVT> &
localy,
BitMap & isthere, std::vector<uint32_t> &
nzinds)
323 for(
int i=0; i<
nnzy; ++i)
344template <
typename SR,
typename IT,
typename NT,
typename IVT,
typename OVT>
347 IT inf = std::numeric_limits<IT>::min();
348 IT sup = std::numeric_limits<IT>::max();
357 OVT val = SR::multiply(
Acsc.num[
j], numx[k]);
364 if(
sHeap.getSize() > 0)
369 numy.push_back( num );
371 while(
sHeap.getSize() > 0)
377 numy.back() = SR::add(
numy.back(), num);
389template <
typename SR,
typename IT,
typename NT,
typename IVT,
typename OVT>
407 std::ostringstream
outs;
408 outs <<
"Warning in SpMXSpV_Bucket: " <<
rowSplits <<
" buckets are supplied for " <<
nthreads <<
" threads\n";
409 outs <<
"4 times the number of threads are recommended when creating PreAllocatedSPA\n";
426#ifdef BENCHMARK_SPMSPV
431#pragma omp parallel for schedule(dynamic, 1)
463#ifdef BENCHMARK_SPMSPV
493#ifdef BENCHMARK_SPMSPV
502#define L2_CACHE_SIZE 256000
514#ifdef BENCHMARK_SPMSPV
527#pragma omp for schedule(dynamic,1)
544 OVT val = SR::multiply(
Acsc.num[
j], numx[i]);
578#ifdef BENCHMARK_SPMSPV
586#pragma omp parallel for schedule(dynamic,1)
591 for(
int i=disp[
rs]; i<disp[
rs+1] ; i++)
597 for(
int i=disp[
rs]; i<disp[
rs+1] ; i++)
603 SPA.V_localy[0][rowid] =
SPA.numSplitA[i];
609 SPA.V_localy[0][rowid] = SR::add(
SPA.V_localy[0][rowid],
SPA.numSplitA[i]);
618#ifdef BENCHMARK_SPMSPV
629#ifdef BENCHMARK_SPMSPV
635#ifdef BENCHMARK_SPMSPV
650#pragma omp for schedule(dynamic,1)
683#ifdef BENCHMARK_SPMSPV
691#ifdef BENCHMARK_SPMSPV
693 std::ostringstream
outs1;
694 outs1 <<
"Time breakdown of SpMSpV-bucket." << std::endl;
695 outs1 <<
"Estimate buckets: "<<
t1 <<
" Bucketing: " <<
t2 <<
" SPA-merge: " <<
t3 <<
" Output: " <<
t4 <<
" Total: "<<
tall << std::endl;
bool get_bit(uint64_t pos)
void set_bit(uint64_t pos)
static void Print(const std::string &s)
void integerSort(std::pair< uint32_t, T > *A, int n)
void SpMXSpV_Bucket(const Csc< IT, NT > &Acsc, int32_t mA, const int32_t *indx, const IVT *numx, int32_t veclen, std::vector< int32_t > &indy, std::vector< OVT > &numy, PreAllocatedSPA< OVT > &SPA)
void SpMXSpV_ForThreading(const Dcsc< IT, NUM > &Adcsc, int32_t mA, const int32_t *indx, const IVT *numx, int32_t veclen, std::vector< int32_t > &indy, std::vector< OVT > &numy, int32_t offset)
Overload #3: DCSC.
void SpMXSpV_HeapSort(const Csc< IT, NT > &Acsc, int32_t mA, const int32_t *indx, const IVT *numx, int32_t veclen, std::vector< int32_t > &indy, std::vector< OVT > &numy, int32_t offset)
static void SpMXSpV(const Dcsc< IT, NUM > &Adcsc, int32_t mA, const int32_t *indx, const IVT *numx, int32_t veclen, std::vector< int32_t > &indy, std::vector< OVT > &numy)
static void SpMXSpV_ForThreading(const Dcsc< IT, NUM > &Adcsc, int32_t mA, const int32_t *indx, const IVT *numx, int32_t veclen, std::vector< int32_t > &indy, std::vector< OVT > &numy, int32_t offset)