47template <
class IU,
class NU>
50template <
class IU,
class NU>
53template <
class IU,
class NU>
63template <
typename SR,
typename IU,
typename NU,
typename RHS,
typename LHS>
68 for(
IU j =0;
j<
A.dcsc->nzc; ++
j)
70 IU colid =
A.dcsc->jc[
j];
71 for(
IU i =
A.dcsc->cp[
j]; i<
A.dcsc->cp[
j+1]; ++i)
73 IU rowid =
A.dcsc->ir[i];
74 SR::axpy(
A.dcsc->numx[i],
x[colid],
y[rowid]);
81template <
typename SR,
typename IU,
typename NU,
typename RHS,
typename LHS>
103 #pragma omp parallel for
104 for(
IU j =0;
j<
A.dcsc->nzc; ++
j)
113 IU colid =
A.dcsc->jc[
j];
114 for(
IU i =
A.dcsc->cp[
j]; i<
A.dcsc->cp[
j+1]; ++i)
116 IU rowid =
A.dcsc->ir[i];
117 SR::axpy(
A.dcsc->numx[i],
x[colid],
loc2merge[rowid]);
121 #pragma omp parallel for
139 template <
typename SR,
typename IU,
typename NU,
typename RHS,
typename LHS>
144 int splits =
A.getnsplit();
149 std::vector<int> disp(splits, 0);
150 for(
int i=1; i<splits; ++i)
153#pragma omp parallel for
155 for(
int s=0; s<splits; ++s)
158 for(
IU j =0;
j<dcsc->nzc; ++
j)
160 IU colid = dcsc->jc[
j];
161 for(
IU i = dcsc->cp[
j]; i< dcsc->cp[
j+1]; ++i)
163 IU rowid = dcsc->ir[i] + disp[s];
164 SR::axpy(dcsc->numx[i],
x[colid],
y[rowid]);
181template <
typename SR,
typename IU,
typename NUM,
typename DER,
typename IVT,
typename OVT>
189 sdispls =
new int[p_c]();
190 if(
A.getnnz() > 0 &&
nnzx > 0)
192 int splits =
A.getnsplit();
197 std::vector< std::vector< int32_t > >
indy(splits);
198 std::vector< std::vector< OVT > >
numy(splits);
202 #pragma omp parallel for
204 for(
int i=0; i<splits; ++i)
209 SpMXSpV_ForThreading<SR>(*(
A.GetInternal(i)),
perpiece,
indx, numx,
nnzx,
indy[i],
numy[i], i*
perpiece,
SPA.V_localy[i],
SPA.V_isthere[i],
SPA.V_inds[i]);
211 SpMXSpV_ForThreading<SR>(*(
A.GetInternal(i)),
nlocrows -
perpiece*i,
indx, numx,
nnzx,
indy[i],
numy[i], i*
perpiece,
SPA.V_localy[i],
SPA.V_isthere[i],
SPA.V_inds[i]);
222 std::vector<int>
accum(splits+1, 0);
223 for(
int i=0; i<splits; ++i)
233 std::vector<int32_t>
end_recs(splits);
234 for(
int i=0; i<splits; ++i)
242 #pragma omp parallel for
244 for(
int i=0; i<splits; ++i)
257 while (k >= 0 &&
end_recs[k] == -1) k--;
275 int end =
indy[i].size();
287 std::vector<int32_t>().swap(
indy[i]);
288 std::vector<OVT>().swap(
numy[i]);
290 for(
int k=i+1; k < splits; ++k)
299 return accum[splits];
303 std::cout <<
"Something is wrong, splits should be nonzero for multithreaded execution" << std::endl;
322template <
typename SR,
typename IU,
typename NUM,
typename DER,
typename IVT,
typename OVT>
326 if(
A.getnnz() > 0 &&
nnzx > 0)
328 int splits =
A.getnsplit();
331 std::vector< std::vector<int32_t> >
indy(splits);
332 std::vector< std::vector< OVT > >
numy(splits);
337 #pragma omp parallel for
339 for(
int i=0; i<splits; ++i)
352 std::vector<int32_t>
end_recs(splits);
353 for(
int i=0; i<splits; ++i)
363 #pragma omp parallel for
365 for(
int i=0; i<splits; ++i)
372 for(
typename std::vector<int32_t>::iterator
it =
indy[i].begin();
it !=
indy[i].end(); ++
it)
385 #pragma omp parallel for
387 for(
int i=0; i<splits; ++i)
408 for(
typename std::vector<int32_t>::iterator
it =
indy[i].begin();
it !=
indy[i].end(); ++
it)
426 for(
int i=0; i< splits; ++i)
428 for(
int j=0;
j< p_c; ++
j)
436 std::cout <<
"Something is wrong, splits should be nonzero for multithreaded execution" << std::endl;
444template <
typename SR,
typename MIND,
typename VIND,
typename DER,
typename NUM,
typename IVT,
typename OVT>
447 if(
A.getnnz() > 0 &&
nnzx > 0)
449 if(
A.getnsplit() > 0)
451 std::cout <<
"Call dcsc_gespmv_threaded instead" << std::endl;
463template <
typename SR,
typename IU,
typename DER,
typename NUM,
typename IVT,
typename OVT>
467 if(
A.getnnz() > 0 &&
nnzx > 0)
469 if(
A.getnsplit() > 0)
486 std::cerr<<
"Warning: Matrix is too small to be splitted for multithreading" << std::endl;
492 std::vector<IU>
nzcs(
A.splits, 0);
493 std::vector<IU>
nnzs(
A.splits, 0);
494 std::vector < std::vector < std::pair<IU,IU> > >
colrowpairs(
A.splits);
495 if(
A.nnz > 0 &&
A.dcsc !=
NULL)
497 for(
IU i=0; i<
A.dcsc->nzc; ++i)
499 for(
IU j =
A.dcsc->cp[i];
j<
A.dcsc->cp[i+1]; ++
j)
501 IU colid =
A.dcsc->jc[i];
502 IU rowid =
A.dcsc->ir[
j];
521 for(
int i=0; i<
A.splits; ++i)
527 std::fill(
A.dcscarr[i]->numx,
A.dcscarr[i]->numx+
nnzs[i],
static_cast<bool>(1));
534 A.dcscarr[i]->cp[
curnzc++] = 0;
564template<
class SR,
class NUO,
class IU,
class NU1,
class NU2>
573 if(
A.isZero() ||
B.isZero())
607template<
class SR,
class NUO,
class IU,
class NU1,
class NU2>
615 if(
A.isZero() ||
B.isZero())
631template<
class SR,
class NUO,
class IU,
class NU1,
class NU2>
639 std::cout <<
"Tuples_AtXBt function has not been implemented yet !" << std::endl;
644template<
class SR,
class NUO,
class IU,
class NU1,
class NU2>
652 std::cout <<
"Tuples_AtXBn function has not been implemented yet !" << std::endl;
659template<
class SR,
class IU,
class NU>
672 for(
int i=1; i<
hsize; ++i)
676 std::cerr <<
"Dimensions do not match on MergeAll()" << std::endl;
683 std::tuple<IU, IU, int> *
heap =
new std::tuple<IU, IU, int> [
hsize];
688 for(
int i=0; i<
hsize; ++i)
695 std::tuple<IU, IU, NU> *
ntuples =
new std::tuple<IU,IU,NU>[
estnnz];
747template <
typename IU,
typename NU1,
typename NU2>
765 if(
A.jc[i] >
B->jc[
j]) ++
j;
766 else if(
A.jc[i] <
B->jc[
j])
769 for(
IU k =
A.cp[i-1]; k<
A.cp[i]; k++)
784 else if (
A.ir[
ii] <
B->ir[
jj])
795 while (
ii <
A.cp[i+1])
813 for(
IU k =
A.cp[i-1]; k<
A.cp[i]; ++k)
833template <
typename IU,
typename NU1,
typename NU2>
857 if(
A.jc[i] >
B->jc[
j]) ++
j;
858 else if(
A.jc[i] <
B->jc[
j]) ++i;
867 else if (
A.ir[
ii] >
B->ir[
jj]) ++
jj;
889template <
typename N_promote,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation>
917 if(
A.jc[i] >
B->jc[
j]) ++
j;
918 else if(
A.jc[i] <
B->jc[
j]) ++i;
927 else if (
A.ir[
ii] >
B->ir[
jj]) ++
jj;
948 if(
A.jc[i] >
B->jc[
j]) ++
j;
949 else if(
A.jc[i] <
B->jc[
j])
952 for(
IU k =
A.cp[i-1]; k<
A.cp[i]; k++)
967 else if (
A.ir[
ii] <
B->ir[
jj])
978 while (
ii <
A.cp[i+1])
996 for(
IU k =
A.cp[i-1]; k<
A.cp[i]; ++k)
1010template<
typename IU,
typename NU1,
typename NU2>
1018 if(
A.nnz > 0 &&
B.nnz > 0)
1035template<
typename N_promote,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation>
1043 if(
A.nnz > 0 &&
B.nnz > 0)
1048 else if (
A.nnz > 0 &&
notB)
1068template <
typename RETT,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
1069Dcsc<IU, RETT> EWiseApply(
const Dcsc<IU,NU1> *
Ap,
const Dcsc<IU,NU2> *
Bp,
_BinaryOperation __binary_op,
_BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1&
ANullVal,
const NU2&
BNullVal,
const bool allowIntersect)
1095 for(
IU k =
B.cp[
j-1]; k<
B.cp[
j]; ++k)
1130 for(
IU k =
A.cp[i-1]; k<
A.cp[i]; k++)
1158 while(i<
A.nzc &&
j<
B.nzc)
1160 if(
A.jc[i] >
B.jc[
j])
1167 for(
IU k =
B.cp[
j-1]; k<
B.cp[
j]; ++k)
1179 else if(
A.jc[i] <
B.jc[
j])
1186 for(
IU k =
A.cp[i-1]; k<
A.cp[i]; k++)
1204 while (
ii <
A.cp[i+1] &&
jj <
B.cp[
j+1])
1216 else if (
A.ir[
ii] >
B.ir[
jj])
1240 while (
ii <
A.cp[i+1])
1250 while (
jj <
B.cp[
j+1])
1269 for(
IU k =
A.cp[i-1]; k<
A.cp[i]; ++k)
1284 for(
IU k =
B.cp[
j-1]; k<
B.cp[
j]; ++k)
1299template <
typename RETT,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
1300SpDCCols<IU,RETT> EWiseApply (
const SpDCCols<IU,NU1> &
A,
const SpDCCols<IU,NU2> &
B,
_BinaryOperation __binary_op,
_BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1&
ANullVal,
const NU2&
BNullVal,
const bool allowIntersect)
1305 Dcsc<IU, RETT> *
tdcsc =
new Dcsc<IU, RETT>(
EWiseApply<RETT>(
A.dcsc,
B.dcsc,
__binary_op,
do_op,
allowANulls,
allowBNulls,
ANullVal,
BNullVal,
allowIntersect));
static void SpIntersect(const Dcsc< IT, NT1 > &Adcsc, const Dcsc< IT, NT2 > &Bdcsc, Isect< IT > *&cols, Isect< IT > *&rows, Isect< IT > *&isect1, Isect< IT > *&isect2, Isect< IT > *&itr1, Isect< IT > *&itr2)
static void deallocate2D(T **array, I m)
static void ShrinkArray(NT *&array, IT newsize)
static void Print(const std::string &s)
void generic_gespmv_threaded_setbuffers(const SpMat< IU, NUM, DER > &A, const int32_t *indx, const IVT *numx, int32_t nnzx, int32_t *sendindbuf, OVT *sendnumbuf, int *cnts, int *dspls, int p_c)
void dcsc_gespmv_threaded(const SpDCCols< IU, NU > &A, const RHS *x, LHS *y)
SpTuples< IU, NUO > * Tuples_AtXBt(const SpDCCols< IU, NU1 > &A, const SpDCCols< IU, NU2 > &B, bool clearA=false, bool clearB=false)
void BooleanRowSplit(SpDCCols< IU, bool > &A, int numsplits)
Dcsc< IU, typename promote_trait< NU1, NU2 >::T_promote > SetDifference(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B)
SpTuples< IU, NUO > * Tuples_AtXBn(const SpDCCols< IU, NU1 > &A, const SpDCCols< IU, NU2 > &B, bool clearA=false, bool clearB=false)
SpTuples< IU, NUO > * Tuples_AnXBn(const SpDCCols< IU, NU1 > &A, const SpDCCols< IU, NU2 > &B, bool clearA=false, bool clearB=false)
int generic_gespmv_threaded(const SpMat< IU, NUM, DER > &A, const int32_t *indx, const IVT *numx, int32_t nnzx, int32_t *&sendindbuf, OVT *&sendnumbuf, int *&sdispls, int p_c, PreAllocatedSPA< OVT > &SPA)
Dcsc< IU, N_promote > EWiseApply(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, _BinaryOperation __binary_op, bool notB, const NU2 &defaultBVal)
void dcsc_gespmv_threaded_nosplit(const SpDCCols< IU, NU > &A, const RHS *x, LHS *y)
SpMV with dense vector (multithreaded version)
SpTuples< IU, NUO > * Tuples_AnXBt(const SpDCCols< IU, NU1 > &A, const SpDCCols< IU, NU2 > &B, bool clearA=false, bool clearB=false)
void generic_gespmv(const SpMat< MIND, NUM, DER > &A, const VIND *indx, const IVT *numx, VIND nnzx, std::vector< VIND > &indy, std::vector< OVT > &numy, PreAllocatedSPA< OVT > &SPA)
SpTuples< IU, NU > MergeAll(const std::vector< SpTuples< IU, NU > * > &ArrSpTups, IU mstar=0, IU nstar=0, bool delarrs=false)
void dcsc_gespmv(const SpDCCols< IU, NU > &A, const RHS *x, LHS *y)
SpMV with dense vector.
Dcsc< IU, typename promote_trait< NU1, NU2 >::T_promote > EWiseMult(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, bool exclude)