47template <
class IU,
class NU>
50template <
class IU,
class NU>
53template <
class IU,
class NU>
63template <
typename SR,
typename IU,
typename NU,
typename RHS,
typename LHS>
64void dcsc_gespmv (
const SpDCCols<IU, NU> &
A,
const RHS * x, LHS * y)
68 for(IU j =0; j<
A.dcsc->nzc; ++j)
70 IU colid =
A.dcsc->jc[j];
71 for(IU i =
A.dcsc->cp[j]; i<
A.dcsc->cp[j+1]; ++i)
73 IU rowid =
A.dcsc->ir[i];
74 SR::axpy(
A.dcsc->numx[i], x[colid], y[rowid]);
81template <
typename SR,
typename IU,
typename NU,
typename RHS,
typename LHS>
90 nthreads = omp_get_num_threads();
94 IU nlocrows =
A.getnrow();
95 LHS ** tomerge = SpHelper::allocate2D<LHS>(nthreads, nlocrows);
98 for(
int i=0; i<nthreads; ++i)
100 std::fill_n(tomerge[i], nlocrows,
id);
103 #pragma omp parallel for
104 for(IU j =0; j<
A.dcsc->nzc; ++j)
108 curthread = omp_get_thread_num();
111 LHS * loc2merge = tomerge[curthread];
113 IU colid =
A.dcsc->jc[j];
114 for(IU i =
A.dcsc->cp[j]; i<
A.dcsc->cp[j+1]; ++i)
116 IU rowid =
A.dcsc->ir[i];
117 SR::axpy(
A.dcsc->numx[i], x[colid], loc2merge[rowid]);
121 #pragma omp parallel for
122 for(IU j=0; j < nlocrows; ++j)
124 for(
int i=0; i< nthreads; ++i)
126 y[j] = SR::add(y[j], tomerge[i][j]);
139 template <
typename SR,
typename IU,
typename NU,
typename RHS,
typename LHS>
144 int splits =
A.getnsplit();
147 IU nlocrows =
A.getnrow();
148 IU perpiece = nlocrows / splits;
149 std::vector<int> disp(splits, 0);
150 for(
int i=1; i<splits; ++i)
151 disp[i] = disp[i-1] + perpiece;
153#pragma omp parallel for
155 for(
int s=0; s<splits; ++s)
157 Dcsc<IU, NU> * dcsc =
A.GetInternal(s);
158 for(IU j =0; j<dcsc->nzc; ++j)
160 IU colid = dcsc->jc[j];
161 for(IU i = dcsc->cp[j]; i< dcsc->cp[j+1]; ++i)
163 IU rowid = dcsc->ir[i] + disp[s];
164 SR::axpy(dcsc->numx[i], x[colid], y[rowid]);
171 dcsc_gespmv_threaded_nosplit<SR>(
A,x,y);
181template <
typename SR,
typename IU,
typename NUM,
typename DER,
typename IVT,
typename OVT>
183 int32_t * & sendindbuf, OVT * & sendnumbuf,
int * & sdispls,
int p_c, PreAllocatedSPA<OVT> & SPA)
189 sdispls =
new int[p_c]();
190 if(
A.getnnz() > 0 && nnzx > 0)
192 int splits =
A.getnsplit();
196 int32_t perpiece = nlocrows / splits;
197 std::vector< std::vector< int32_t > > indy(splits);
198 std::vector< std::vector< OVT > > numy(splits);
202 #pragma omp parallel for
204 for(
int i=0; i<splits; ++i)
209 SpMXSpV_ForThreading<SR>(*(
A.GetInternal(i)), perpiece, indx, numx, nnzx, indy[i], numy[i], i*perpiece, SPA.V_localy[i], SPA.V_isthere[i], SPA.V_inds[i]);
211 SpMXSpV_ForThreading<SR>(*(
A.GetInternal(i)), nlocrows - perpiece*i, indx, numx, nnzx, indy[i], numy[i], i*perpiece, SPA.V_localy[i], SPA.V_isthere[i], SPA.V_inds[i]);
216 SpMXSpV_ForThreading<SR>(*(
A.GetInternal(i)), perpiece, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
218 SpMXSpV_ForThreading<SR>(*(
A.GetInternal(i)), nlocrows - perpiece*i, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
222 std::vector<int> accum(splits+1, 0);
223 for(
int i=0; i<splits; ++i)
224 accum[i+1] = accum[i] + indy[i].
size();
226 sendindbuf =
new int32_t[accum[splits]];
227 sendnumbuf =
new OVT[accum[splits]];
228 int32_t perproc = nlocrows / p_c;
233 std::vector<int32_t> end_recs(splits);
234 for(
int i=0; i<splits; ++i)
239 end_recs[i] = std::min(indy[i].back() / perproc, last_rec);
242 #pragma omp parallel for
244 for(
int i=0; i<splits; ++i)
250 int32_t beg_rec = std::min( indy[i].front() / perproc, last_rec);
257 while (k >= 0 && end_recs[k] == -1) k--;
260 std::fill(sdispls+end_recs[k]+1, sdispls+beg_rec+1, accum[i]);
265 if(beg_rec == end_recs[i])
267 std::transform(indy[i].begin(), indy[i].end(), indy[i].begin(), std::bind2nd(std::minus<int32_t>(), perproc*beg_rec));
268 std::copy(indy[i].begin(), indy[i].end(), sendindbuf+accum[i]);
269 std::copy(numy[i].begin(), numy[i].end(), sendnumbuf+accum[i]);
275 int end = indy[i].size();
276 for(
int cur=0; cur< end; ++cur)
278 int32_t cur_rec = std::min( indy[i][cur] / perproc, last_rec);
279 while(beg_rec != cur_rec)
281 sdispls[++beg_rec] = accum[i] + cur;
283 sendindbuf[ accum[i] + cur ] = indy[i][cur] - perproc*beg_rec;
284 sendnumbuf[ accum[i] + cur ] = numy[i][cur];
287 std::vector<int32_t>().swap(indy[i]);
288 std::vector<OVT>().swap(numy[i]);
289 bool lastnonzero =
true;
290 for(
int k=i+1; k < splits; ++k)
292 if(end_recs[k] != -1)
296 std::fill(sdispls+end_recs[i]+1, sdispls+p_c, accum[i+1]);
299 return accum[splits];
303 std::cout <<
"Something is wrong, splits should be nonzero for multithreaded execution" << std::endl;
322template <
typename SR,
typename IU,
typename NUM,
typename DER,
typename IVT,
typename OVT>
324 int32_t * sendindbuf, OVT * sendnumbuf,
int * cnts,
int * dspls,
int p_c)
326 if(
A.getnnz() > 0 && nnzx > 0)
328 int splits =
A.getnsplit();
331 std::vector< std::vector<int32_t> > indy(splits);
332 std::vector< std::vector< OVT > > numy(splits);
334 int32_t perpiece = nlocrows / splits;
337 #pragma omp parallel for
339 for(
int i=0; i<splits; ++i)
342 SpMXSpV_ForThreading<SR>(*(
A.GetInternal(i)), perpiece, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
344 SpMXSpV_ForThreading<SR>(*(
A.GetInternal(i)), nlocrows - perpiece*i, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
347 int32_t perproc = nlocrows / p_c;
352 std::vector<int32_t> end_recs(splits);
353 for(
int i=0; i<splits; ++i)
358 end_recs[i] = std::min(indy[i].back() / perproc, last_rec);
361 int ** loc_rec_cnts =
new int *[splits];
363 #pragma omp parallel for
365 for(
int i=0; i<splits; ++i)
367 loc_rec_cnts[i] =
new int[p_c]();
370 int32_t cur_rec = std::min( indy[i].front() / perproc, last_rec);
371 int32_t lastdata = (cur_rec+1) * perproc;
372 for(
typename std::vector<int32_t>::iterator it = indy[i].begin(); it != indy[i].end(); ++it)
375 if( ( (*it) >= lastdata ) && cur_rec != last_rec )
377 cur_rec = std::min( (*it) / perproc, last_rec);
378 lastdata = (cur_rec+1) * perproc;
380 ++loc_rec_cnts[i][cur_rec];
385 #pragma omp parallel for
387 for(
int i=0; i<splits; ++i)
393 int32_t beg_rec = std::min( indy[i].front() / perproc, last_rec);
395 for(
int before = i-1; before >= 0; before--)
396 alreadysent += loc_rec_cnts[before][beg_rec];
398 if(beg_rec == end_recs[i])
400 std::transform(indy[i].begin(), indy[i].end(), indy[i].begin(), std::bind2nd(std::minus<int32_t>(), perproc*beg_rec));
401 std::copy(indy[i].begin(), indy[i].end(), sendindbuf + dspls[beg_rec] + alreadysent);
402 std::copy(numy[i].begin(), numy[i].end(), sendnumbuf + dspls[beg_rec] + alreadysent);
407 int32_t lastdata = (cur_rec+1) * perproc;
408 for(
typename std::vector<int32_t>::iterator it = indy[i].begin(); it != indy[i].end(); ++it)
410 if( ( (*it) >= lastdata ) && cur_rec != last_rec )
412 cur_rec = std::min( (*it) / perproc, last_rec);
413 lastdata = (cur_rec+1) * perproc;
419 sendindbuf[ dspls[cur_rec] + alreadysent ] = (*it) - perproc*cur_rec;
420 sendnumbuf[ dspls[cur_rec] + (alreadysent++) ] = *(numy[i].begin() + (it-indy[i].begin()));
426 for(
int i=0; i< splits; ++i)
428 for(
int j=0; j< p_c; ++j)
429 cnts[j] += loc_rec_cnts[i][j];
430 delete [] loc_rec_cnts[i];
432 delete [] loc_rec_cnts;
436 std::cout <<
"Something is wrong, splits should be nonzero for multithreaded execution" << std::endl;
444template <
typename SR,
typename MIND,
typename VIND,
typename DER,
typename NUM,
typename IVT,
typename OVT>
445void generic_gespmv (
const SpMat<MIND,NUM,DER> &
A,
const VIND * indx,
const IVT * numx, VIND nnzx, std::vector<VIND> & indy, std::vector<OVT> & numy, PreAllocatedSPA<OVT> & SPA)
447 if(
A.getnnz() > 0 && nnzx > 0)
449 if(
A.getnsplit() > 0)
451 std::cout <<
"Call dcsc_gespmv_threaded instead" << std::endl;
455 SpMXSpV<SR>(*(
A.GetInternal()), (VIND)
A.getnrow(), indx, numx, nnzx, indy, numy, SPA);
463template <
typename SR,
typename IU,
typename DER,
typename NUM,
typename IVT,
typename OVT>
465 int32_t * indy, OVT * numy,
int * cnts,
int * dspls,
int p_c,
bool indexisvalue)
467 if(
A.getnnz() > 0 && nnzx > 0)
469 if(
A.getnsplit() > 0)
475 SpMXSpV<SR>(*(
A.GetInternal()), (
int32_t)
A.getnrow(), indx, numx, nnzx, indy, numy, cnts, dspls, p_c);
486 std::cerr<<
"Warning: Matrix is too small to be splitted for multithreading" << std::endl;
489 A.splits = numsplits;
490 IU perpiece =
A.m /
A.splits;
491 std::vector<IU> prevcolids(
A.splits, -1);
492 std::vector<IU> nzcs(
A.splits, 0);
493 std::vector<IU> nnzs(
A.splits, 0);
494 std::vector < std::vector < std::pair<IU,IU> > > colrowpairs(
A.splits);
495 if(
A.nnz > 0 &&
A.dcsc != NULL)
497 for(IU i=0; i<
A.dcsc->nzc; ++i)
499 for(IU j =
A.dcsc->cp[i]; j<
A.dcsc->cp[i+1]; ++j)
501 IU colid =
A.dcsc->jc[i];
502 IU rowid =
A.dcsc->ir[j];
503 IU owner = std::min(rowid / perpiece,
static_cast<IU
>(
A.splits-1));
504 colrowpairs[owner].push_back(std::make_pair(colid, rowid - owner*perpiece));
506 if(prevcolids[owner] != colid)
508 prevcolids[owner] = colid;
518 A.dcscarr =
new Dcsc<IU,bool>*[
A.splits];
521 for(
int i=0; i<
A.splits; ++i)
523 sort(colrowpairs[i].begin(), colrowpairs[i].end());
526 A.dcscarr[i] =
new Dcsc<IU,bool>(nnzs[i],nzcs[i]);
527 std::fill(
A.dcscarr[i]->numx,
A.dcscarr[i]->numx+nnzs[i],
static_cast<bool>(1));
529 IU cindex = colrowpairs[i][0].first;
530 IU rindex = colrowpairs[i][0].second;
532 A.dcscarr[i]->ir[0] = rindex;
533 A.dcscarr[i]->jc[curnzc] = cindex;
534 A.dcscarr[i]->cp[curnzc++] = 0;
536 for(IU j=1; j<nnzs[i]; ++j)
538 cindex = colrowpairs[i][j].first;
539 rindex = colrowpairs[i][j].second;
541 A.dcscarr[i]->ir[j] = rindex;
542 if(cindex !=
A.dcscarr[i]->jc[curnzc-1])
544 A.dcscarr[i]->jc[curnzc] = cindex;
545 A.dcscarr[i]->cp[curnzc++] = j;
548 A.dcscarr[i]->cp[curnzc] = nnzs[i];
552 A.dcscarr[i] =
new Dcsc<IU,bool>();
564template<
class SR,
class NUO,
class IU,
class NU1,
class NU2>
566 (
const SpDCCols<IU, NU1> &
A,
567 const SpDCCols<IU, NU2> &
B,
568 bool clearA =
false,
bool clearB =
false)
573 if(
A.isZero() ||
B.isZero())
575 if(clearA)
delete const_cast<SpDCCols<IU, NU1> *
>(&
A);
576 if(clearB)
delete const_cast<SpDCCols<IU, NU2> *
>(&
B);
577 return new SpTuples< IU, NUO >(0, mdim, ndim);
579 Isect<IU> *isect1, *isect2, *itr1, *itr2, *cols, *rows;
582 IU kisect =
static_cast<IU
>(itr1-isect1);
585 if(clearA)
delete const_cast<SpDCCols<IU, NU1> *
>(&
A);
586 if(clearB)
delete const_cast<SpDCCols<IU, NU2> *
>(&
B);
588 return new SpTuples< IU, NUO >(0, mdim, ndim);
591 StackEntry< NUO, std::pair<IU,IU> > * multstack;
593 IU cnz = SpHelper::SpCartesian< SR > (*(
A.dcsc), *(
B.dcsc), kisect, isect1, isect2, multstack);
596 if(clearA)
delete const_cast<SpDCCols<IU, NU1> *
>(&
A);
597 if(clearB)
delete const_cast<SpDCCols<IU, NU2> *
>(&
B);
598 return new SpTuples<IU, NUO> (cnz, mdim, ndim, multstack);
607template<
class SR,
class NUO,
class IU,
class NU1,
class NU2>
609 (
const SpDCCols<IU, NU1> &
A,
610 const SpDCCols<IU, NU2> &
B,
611 bool clearA =
false,
bool clearB =
false)
615 if(
A.isZero() ||
B.isZero())
617 return new SpTuples<IU, NUO>(0, mdim, ndim);
619 StackEntry< NUO, std::pair<IU,IU> > * multstack;
620 IU cnz = SpHelper::SpColByCol< SR > (*(
A.dcsc), *(
B.dcsc),
A.n, multstack);
623 delete const_cast<SpDCCols<IU, NU1> *
>(&
A);
625 delete const_cast<SpDCCols<IU, NU2> *
>(&
B);
627 return new SpTuples<IU, NUO> (cnz, mdim, ndim, multstack);
631template<
class SR,
class NUO,
class IU,
class NU1,
class NU2>
633 (
const SpDCCols<IU, NU1> &
A,
634 const SpDCCols<IU, NU2> &
B,
635 bool clearA =
false,
bool clearB =
false)
639 std::cout <<
"Tuples_AtXBt function has not been implemented yet !" << std::endl;
641 return new SpTuples<IU, NUO> (0, mdim, ndim);
644template<
class SR,
class NUO,
class IU,
class NU1,
class NU2>
646 (
const SpDCCols<IU, NU1> &
A,
647 const SpDCCols<IU, NU2> &
B,
648 bool clearA =
false,
bool clearB =
false)
652 std::cout <<
"Tuples_AtXBn function has not been implemented yet !" << std::endl;
654 return new SpTuples<IU, NUO> (0, mdim, ndim);
659template<
class SR,
class IU,
class NU>
660SpTuples<IU,NU>
MergeAll(
const std::vector<SpTuples<IU,NU> *> & ArrSpTups, IU mstar = 0, IU nstar = 0,
bool delarrs =
false )
662 int hsize = ArrSpTups.size();
665 return SpTuples<IU,NU>(0, mstar,nstar);
669 mstar = ArrSpTups[0]->m;
670 nstar = ArrSpTups[0]->n;
672 for(
int i=1; i< hsize; ++i)
674 if((mstar != ArrSpTups[i]->m) || nstar != ArrSpTups[i]->n)
676 std::cerr <<
"Dimensions do not match on MergeAll()" << std::endl;
677 return SpTuples<IU,NU>(0,0,0);
682 ColLexiCompare<IU,int> heapcomp;
683 std::tuple<IU, IU, int> * heap =
new std::tuple<IU, IU, int> [hsize];
684 IU * curptr =
new IU[hsize];
685 std::fill_n(curptr, hsize,
static_cast<IU
>(0));
688 for(
int i=0; i< hsize; ++i)
690 estnnz += ArrSpTups[i]->getnnz();
691 heap[i] = std::make_tuple(std::get<0>(ArrSpTups[i]->tuples[0]), std::get<1>(ArrSpTups[i]->tuples[0]), i);
693 std::make_heap(heap, heap+hsize, std::not2(heapcomp));
695 std::tuple<IU, IU, NU> * ntuples =
new std::tuple<IU,IU,NU>[estnnz];
700 std::pop_heap(heap, heap + hsize, std::not2(heapcomp));
701 int source = std::get<2>(heap[hsize-1]);
704 ((std::get<0>(ntuples[cnz-1]) == std::get<0>(heap[hsize-1])) && (std::get<1>(ntuples[cnz-1]) == std::get<1>(heap[hsize-1]))) )
706 std::get<2>(ntuples[cnz-1]) = SR::add(std::get<2>(ntuples[cnz-1]), ArrSpTups[source]->numvalue(curptr[source]++));
710 ntuples[cnz++] = ArrSpTups[source]->tuples[curptr[source]++];
713 if(curptr[source] != ArrSpTups[source]->getnnz())
715 heap[hsize-1] = std::make_tuple(std::get<0>(ArrSpTups[source]->tuples[curptr[source]]),
716 std::get<1>(ArrSpTups[source]->tuples[curptr[source]]), source);
717 std::push_heap(heap, heap+hsize, std::not2(heapcomp));
729 for(
size_t i=0; i<ArrSpTups.size(); ++i)
732 return SpTuples<IU,NU> (cnz, mstar, nstar, ntuples);
736 SpTuples<IU,NU> ret = *ArrSpTups[0];
747template <
typename IU,
typename NU1,
typename NU2>
748Dcsc<IU, typename promote_trait<NU1,NU2>::T_promote>
SetDifference(
const Dcsc<IU,NU1> &
A,
const Dcsc<IU,NU2> *
B)
755 Dcsc<IU,N_promote> temp(estnz, estnzc);
763 while(i<
A.nzc &&
B != NULL && j< B->nzc)
765 if(
A.jc[i] >
B->jc[j]) ++j;
766 else if(
A.jc[i] <
B->jc[j])
768 temp.jc[curnzc++] =
A.jc[i++];
769 for(IU k =
A.cp[i-1]; k<
A.cp[i]; k++)
771 temp.ir[curnz] =
A.ir[k];
772 temp.numx[curnz++] =
A.numx[k];
774 temp.cp[curnzc] = temp.cp[curnzc-1] + (
A.cp[i] -
A.cp[i-1]);
781 while (ii <
A.cp[i+1] && jj < B->cp[j+1])
783 if (
A.ir[ii] >
B->ir[jj]) ++jj;
784 else if (
A.ir[ii] <
B->ir[jj])
786 temp.ir[curnz] =
A.ir[ii];
787 temp.numx[curnz++] =
A.numx[ii++];
795 while (ii <
A.cp[i+1])
797 temp.ir[curnz] =
A.ir[ii];
798 temp.numx[curnz++] =
A.numx[ii++];
803 temp.jc[curnzc++] =
A.jc[i];
804 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
812 temp.jc[curnzc++] =
A.jc[i++];
813 for(IU k =
A.cp[i-1]; k<
A.cp[i]; ++k)
815 temp.ir[curnz] =
A.ir[k];
816 temp.numx[curnz++] =
A.numx[k];
818 temp.cp[curnzc] = temp.cp[curnzc-1] + (
A.cp[i] -
A.cp[i-1]);
821 temp.Resize(curnzc, curnz);
833template <
typename IU,
typename NU1,
typename NU2>
834Dcsc<IU, typename promote_trait<NU1,NU2>::T_promote>
EWiseMult(
const Dcsc<IU,NU1> &
A,
const Dcsc<IU,NU2> *
B,
bool exclude)
844 estnzc = std::min(
A.nzc,
B->nzc);
845 estnz = std::min(
A.nz,
B->nz);
847 Dcsc<IU,N_promote> temp(estnz, estnzc);
855 while(i<
A.nzc &&
B != NULL && j<B->nzc)
857 if(
A.jc[i] >
B->jc[j]) ++j;
858 else if(
A.jc[i] <
B->jc[j]) ++i;
864 while (ii <
A.cp[i+1] && jj < B->cp[j+1])
866 if (
A.ir[ii] <
B->ir[jj]) ++ii;
867 else if (
A.ir[ii] >
B->ir[jj]) ++jj;
870 temp.ir[curnz] =
A.ir[ii];
871 temp.numx[curnz++] =
A.numx[ii++] *
B->numx[jj++];
876 temp.jc[curnzc++] =
A.jc[i];
877 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
884 temp.Resize(curnzc, curnz);
889template <
typename N_promote,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation>
890Dcsc<IU, N_promote>
EWiseApply(
const Dcsc<IU,NU1> &
A,
const Dcsc<IU,NU2> *
B, _BinaryOperation __binary_op,
bool notB,
const NU2& defaultBVal)
901 estnzc = std::min(
A.nzc,
B->nzc);
902 estnz = std::min(
A.nz,
B->nz);
905 Dcsc<IU,N_promote> temp(estnz, estnzc);
915 while(i<
A.nzc &&
B != NULL && j<B->nzc)
917 if(
A.jc[i] >
B->jc[j]) ++j;
918 else if(
A.jc[i] <
B->jc[j]) ++i;
924 while (ii <
A.cp[i+1] && jj < B->cp[j+1])
926 if (
A.ir[ii] <
B->ir[jj]) ++ii;
927 else if (
A.ir[ii] >
B->ir[jj]) ++jj;
930 temp.ir[curnz] =
A.ir[ii];
931 temp.numx[curnz++] = __binary_op(
A.numx[ii++],
B->numx[jj++]);
936 temp.jc[curnzc++] =
A.jc[i];
937 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
946 while(i<
A.nzc &&
B != NULL && j< B->nzc)
948 if(
A.jc[i] >
B->jc[j]) ++j;
949 else if(
A.jc[i] <
B->jc[j])
951 temp.jc[curnzc++] =
A.jc[i++];
952 for(IU k =
A.cp[i-1]; k<
A.cp[i]; k++)
954 temp.ir[curnz] =
A.ir[k];
955 temp.numx[curnz++] = __binary_op(
A.numx[k], defaultBVal);
957 temp.cp[curnzc] = temp.cp[curnzc-1] + (
A.cp[i] -
A.cp[i-1]);
964 while (ii <
A.cp[i+1] && jj < B->cp[j+1])
966 if (
A.ir[ii] >
B->ir[jj]) ++jj;
967 else if (
A.ir[ii] <
B->ir[jj])
969 temp.ir[curnz] =
A.ir[ii];
970 temp.numx[curnz++] = __binary_op(
A.numx[ii++], defaultBVal);
978 while (ii <
A.cp[i+1])
980 temp.ir[curnz] =
A.ir[ii];
981 temp.numx[curnz++] = __binary_op(
A.numx[ii++], defaultBVal);
986 temp.jc[curnzc++] =
A.jc[i];
987 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
995 temp.jc[curnzc++] =
A.jc[i++];
996 for(IU k =
A.cp[i-1]; k<
A.cp[i]; ++k)
998 temp.ir[curnz] =
A.ir[k];
999 temp.numx[curnz++] = __binary_op(
A.numx[k], defaultBVal);
1001 temp.cp[curnzc] = temp.cp[curnzc-1] + (
A.cp[i] -
A.cp[i-1]);
1005 temp.Resize(curnzc, curnz);
1010template<
typename IU,
typename NU1,
typename NU2>
1011SpDCCols<IU, typename promote_trait<NU1,NU2>::T_promote >
EWiseMult (
const SpDCCols<IU,NU1> &
A,
const SpDCCols<IU,NU2> &
B,
bool exclude)
1017 Dcsc<IU, N_promote> * tdcsc = NULL;
1018 if(
A.nnz > 0 &&
B.nnz > 0)
1020 tdcsc =
new Dcsc<IU, N_promote>(
EWiseMult(*(
A.dcsc),
B.dcsc, exclude));
1021 return SpDCCols<IU, N_promote> (
A.m ,
A.n, tdcsc);
1023 else if (
A.nnz > 0 && exclude)
1025 tdcsc =
new Dcsc<IU, N_promote>(
EWiseMult(*(
A.dcsc), (
const Dcsc<IU,NU2>*)NULL, exclude));
1026 return SpDCCols<IU, N_promote> (
A.m ,
A.n, tdcsc);
1030 return SpDCCols<IU, N_promote> (
A.m ,
A.n, tdcsc);
1035template<
typename N_promote,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation>
1036SpDCCols<IU, N_promote>
EWiseApply (
const SpDCCols<IU,NU1> &
A,
const SpDCCols<IU,NU2> &
B, _BinaryOperation __binary_op,
bool notB,
const NU2& defaultBVal)
1042 Dcsc<IU, N_promote> * tdcsc = NULL;
1043 if(
A.nnz > 0 &&
B.nnz > 0)
1045 tdcsc =
new Dcsc<IU, N_promote>(EWiseApply<N_promote>(*(
A.dcsc),
B.dcsc, __binary_op, notB, defaultBVal));
1046 return SpDCCols<IU, N_promote> (
A.m ,
A.n, tdcsc);
1048 else if (
A.nnz > 0 && notB)
1050 tdcsc =
new Dcsc<IU, N_promote>(EWiseApply<N_promote>(*(
A.dcsc), (
const Dcsc<IU,NU2>*)NULL, __binary_op, notB, defaultBVal));
1051 return SpDCCols<IU, N_promote> (
A.m ,
A.n, tdcsc);
1055 return SpDCCols<IU, N_promote> (
A.m ,
A.n, tdcsc);
1068template <
typename RETT,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
1069Dcsc<IU, RETT>
EWiseApply(
const Dcsc<IU,NU1> * Ap,
const Dcsc<IU,NU2> * Bp, _BinaryOperation __binary_op, _BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1& ANullVal,
const NU2& BNullVal,
const bool allowIntersect)
1071 if (Ap == NULL && Bp == NULL)
1072 return Dcsc<IU,RETT>(0, 0);
1074 if (Ap == NULL && Bp != NULL)
1077 return Dcsc<IU,RETT>(0, 0);
1079 const Dcsc<IU,NU2> &
B = *Bp;
1082 Dcsc<IU,RETT> temp(estnz, estnzc);
1094 temp.jc[curnzc++] =
B.jc[j-1];
1095 for(IU k =
B.cp[j-1]; k<
B.cp[j]; ++k)
1097 if (do_op(ANullVal,
B.numx[k],
true,
false))
1099 temp.ir[curnz] =
B.ir[k];
1100 temp.numx[curnz++] = __binary_op(ANullVal,
B.numx[k],
true,
false);
1104 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1106 temp.Resize(curnzc, curnz);
1110 if (Ap != NULL && Bp == NULL)
1113 return Dcsc<IU,RETT>(0, 0);
1115 const Dcsc<IU,NU1> &
A = *Ap;
1118 Dcsc<IU,RETT> temp(estnz, estnzc);
1129 temp.jc[curnzc++] =
A.jc[i-1];
1130 for(IU k =
A.cp[i-1]; k<
A.cp[i]; k++)
1132 if (do_op(
A.numx[k], BNullVal,
false,
true))
1134 temp.ir[curnz] =
A.ir[k];
1135 temp.numx[curnz++] = __binary_op(
A.numx[k], BNullVal,
false,
true);
1139 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1141 temp.Resize(curnzc, curnz);
1146 const Dcsc<IU,NU1> &
A = *Ap;
1147 const Dcsc<IU,NU2> &
B = *Bp;
1149 IU estnzc =
A.nzc +
B.nzc;
1150 IU estnz =
A.nz +
B.nz;
1151 Dcsc<IU,RETT> temp(estnz, estnzc);
1158 while(i<
A.nzc && j<
B.nzc)
1160 if(
A.jc[i] >
B.jc[j])
1166 temp.jc[curnzc++] =
B.jc[j-1];
1167 for(IU k =
B.cp[j-1]; k<
B.cp[j]; ++k)
1169 if (do_op(ANullVal,
B.numx[k],
true,
false))
1171 temp.ir[curnz] =
B.ir[k];
1172 temp.numx[curnz++] = __binary_op(ANullVal,
B.numx[k],
true,
false);
1176 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1179 else if(
A.jc[i] <
B.jc[j])
1185 temp.jc[curnzc++] =
A.jc[i-1];
1186 for(IU k =
A.cp[i-1]; k<
A.cp[i]; k++)
1188 if (do_op(
A.numx[k], BNullVal,
false,
true))
1190 temp.ir[curnz] =
A.ir[k];
1191 temp.numx[curnz++] = __binary_op(
A.numx[k], BNullVal,
false,
true);
1195 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1200 temp.jc[curnzc++] =
A.jc[i];
1204 while (ii <
A.cp[i+1] && jj <
B.cp[j+1])
1206 if (
A.ir[ii] <
B.ir[jj])
1208 if (allowBNulls && do_op(
A.numx[ii], BNullVal,
false,
true))
1210 temp.ir[curnz] =
A.ir[ii];
1211 temp.numx[curnz++] = __binary_op(
A.numx[ii++], BNullVal,
false,
true);
1216 else if (
A.ir[ii] >
B.ir[jj])
1218 if (allowANulls && do_op(ANullVal,
B.numx[jj],
true,
false))
1220 temp.ir[curnz] =
B.ir[jj];
1221 temp.numx[curnz++] = __binary_op(ANullVal,
B.numx[jj++],
true,
false);
1228 if (allowIntersect && do_op(
A.numx[ii],
B.numx[jj],
false,
false))
1230 temp.ir[curnz] =
A.ir[ii];
1231 temp.numx[curnz++] = __binary_op(
A.numx[ii++],
B.numx[jj++],
false,
false);
1240 while (ii <
A.cp[i+1])
1242 if (allowBNulls && do_op(
A.numx[ii], BNullVal,
false,
true))
1244 temp.ir[curnz] =
A.ir[ii];
1245 temp.numx[curnz++] = __binary_op(
A.numx[ii++], BNullVal,
false,
true);
1250 while (jj <
B.cp[j+1])
1252 if (allowANulls && do_op(ANullVal,
B.numx[jj],
true,
false))
1254 temp.ir[curnz] =
B.ir[jj];
1255 temp.numx[curnz++] = __binary_op(ANullVal,
B.numx[jj++],
true,
false);
1260 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1265 while(allowBNulls && i<
A.nzc)
1268 temp.jc[curnzc++] =
A.jc[i++];
1269 for(IU k =
A.cp[i-1]; k<
A.cp[i]; ++k)
1271 if (do_op(
A.numx[k], BNullVal,
false,
true))
1273 temp.ir[curnz] =
A.ir[k];
1274 temp.numx[curnz++] = __binary_op(
A.numx[k], BNullVal,
false,
true);
1278 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1280 while(allowANulls && j <
B.nzc)
1283 temp.jc[curnzc++] =
B.jc[j++];
1284 for(IU k =
B.cp[j-1]; k<
B.cp[j]; ++k)
1286 if (do_op(ANullVal,
B.numx[k],
true,
false))
1288 temp.ir[curnz] =
B.ir[k];
1289 temp.numx[curnz++] = __binary_op(ANullVal,
B.numx[k],
true,
false);
1293 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1295 temp.Resize(curnzc, curnz);
1299template <
typename RETT,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
1300SpDCCols<IU,RETT>
EWiseApply (
const SpDCCols<IU,NU1> &
A,
const SpDCCols<IU,NU2> &
B, _BinaryOperation __binary_op, _BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1& ANullVal,
const NU2& BNullVal,
const bool allowIntersect)
1305 Dcsc<IU, RETT> * tdcsc =
new Dcsc<IU, RETT>(EWiseApply<RETT>(
A.dcsc,
B.dcsc, __binary_op, do_op, allowANulls, allowBNulls, ANullVal, BNullVal, allowIntersect));
1306 return SpDCCols<IU, RETT> (
A.m ,
A.n, tdcsc);
static void SpIntersect(const Dcsc< IT, NT1 > &Adcsc, const Dcsc< IT, NT2 > &Bdcsc, Isect< IT > *&cols, Isect< IT > *&rows, Isect< IT > *&isect1, Isect< IT > *&isect2, Isect< IT > *&itr1, Isect< IT > *&itr2)
static void deallocate2D(T **array, I m)
static void ShrinkArray(NT *&array, IT newsize)
static void Print(const std::string &s)
void generic_gespmv_threaded_setbuffers(const SpMat< IU, NUM, DER > &A, const int32_t *indx, const IVT *numx, int32_t nnzx, int32_t *sendindbuf, OVT *sendnumbuf, int *cnts, int *dspls, int p_c)
void dcsc_gespmv_threaded(const SpDCCols< IU, NU > &A, const RHS *x, LHS *y)
SpTuples< IU, NUO > * Tuples_AtXBt(const SpDCCols< IU, NU1 > &A, const SpDCCols< IU, NU2 > &B, bool clearA=false, bool clearB=false)
void BooleanRowSplit(SpDCCols< IU, bool > &A, int numsplits)
Dcsc< IU, typename promote_trait< NU1, NU2 >::T_promote > SetDifference(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B)
SpTuples< IU, NUO > * Tuples_AtXBn(const SpDCCols< IU, NU1 > &A, const SpDCCols< IU, NU2 > &B, bool clearA=false, bool clearB=false)
SpTuples< IU, NUO > * Tuples_AnXBn(const SpDCCols< IU, NU1 > &A, const SpDCCols< IU, NU2 > &B, bool clearA=false, bool clearB=false)
int generic_gespmv_threaded(const SpMat< IU, NUM, DER > &A, const int32_t *indx, const IVT *numx, int32_t nnzx, int32_t *&sendindbuf, OVT *&sendnumbuf, int *&sdispls, int p_c, PreAllocatedSPA< OVT > &SPA)
Dcsc< IU, N_promote > EWiseApply(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, _BinaryOperation __binary_op, bool notB, const NU2 &defaultBVal)
void dcsc_gespmv_threaded_nosplit(const SpDCCols< IU, NU > &A, const RHS *x, LHS *y)
SpMV with dense vector (multithreaded version)
SpTuples< IU, NUO > * Tuples_AnXBt(const SpDCCols< IU, NU1 > &A, const SpDCCols< IU, NU2 > &B, bool clearA=false, bool clearB=false)
void generic_gespmv(const SpMat< MIND, NUM, DER > &A, const VIND *indx, const IVT *numx, VIND nnzx, std::vector< VIND > &indy, std::vector< OVT > &numy, PreAllocatedSPA< OVT > &SPA)
SpTuples< IU, NU > MergeAll(const std::vector< SpTuples< IU, NU > * > &ArrSpTups, IU mstar=0, IU nstar=0, bool delarrs=false)
void dcsc_gespmv(const SpDCCols< IU, NU > &A, const RHS *x, LHS *y)
SpMV with dense vector.
Dcsc< IU, typename promote_trait< NU1, NU2 >::T_promote > EWiseMult(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, bool exclude)