30#ifndef _PAR_FRIENDS_H_
31#define _PAR_FRIENDS_H_
49template <
class IT,
class NT,
class DER>
60template <
typename IT,
typename NT>
61FullyDistVec<IT,NT>
Concatenate ( std::vector< FullyDistVec<IT,NT> > & vecs)
66 return FullyDistVec<IT,NT>();
68 else if (vecs.size() < 2)
75 typename std::vector< FullyDistVec<IT,NT> >::iterator it = vecs.begin();
76 std::shared_ptr<CommGrid> commGridPtr = it->getcommgrid();
77 MPI_Comm World = commGridPtr->GetWorld();
79 IT nglen = it->TotalLength();
80 IT cumloclen = it->MyLocLength();
82 for(; it != vecs.end(); ++it)
84 if(*(commGridPtr) != *(it->getcommgrid()))
89 nglen += it->TotalLength();
90 cumloclen += it->MyLocLength();
92 FullyDistVec<IT,NT> ConCat (commGridPtr, nglen,
NT());
93 int nprocs = commGridPtr->GetSize();
95 std::vector< std::vector< NT > > data(
nprocs);
96 std::vector< std::vector< IT > > inds(
nprocs);
98 for(it = vecs.begin(); it != vecs.end(); ++it)
100 IT loclen = it->LocArrSize();
101 for(
IT i=0; i < loclen; ++i)
104 IT loffset = it->LengthUntil();
105 int owner = ConCat.Owner(gloffset+loffset+i, locind);
106 data[owner].push_back(it->arr[i]);
107 inds[owner].push_back(locind);
109 gloffset += it->TotalLength();
112 int * sendcnt =
new int[
nprocs];
113 int * sdispls =
new int[
nprocs];
114 for(
int i=0; i<
nprocs; ++i)
115 sendcnt[i] = (
int) data[i].size();
117 int * rdispls =
new int[
nprocs];
118 int * recvcnt =
new int[
nprocs];
119 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
122 for(
int i=0; i<
nprocs-1; ++i)
124 sdispls[i+1] = sdispls[i] + sendcnt[i];
125 rdispls[i+1] = rdispls[i] + recvcnt[i];
127 IT totrecv = std::accumulate(recvcnt,recvcnt+
nprocs,
static_cast<IT>(0));
128 NT * senddatabuf =
new NT[cumloclen];
129 for(
int i=0; i<
nprocs; ++i)
131 std::copy(data[i].begin(), data[i].end(), senddatabuf+sdispls[i]);
132 std::vector<NT>().swap(data[i]);
134 NT * recvdatabuf =
new NT[totrecv];
135 MPI_Alltoallv(senddatabuf, sendcnt, sdispls, MPIType<NT>(), recvdatabuf, recvcnt, rdispls, MPIType<NT>(), World);
136 delete [] senddatabuf;
138 IT * sendindsbuf =
new IT[cumloclen];
139 for(
int i=0; i<
nprocs; ++i)
141 std::copy(inds[i].begin(), inds[i].end(), sendindsbuf+sdispls[i]);
142 std::vector<IT>().swap(inds[i]);
144 IT * recvindsbuf =
new IT[totrecv];
145 MPI_Alltoallv(sendindsbuf, sendcnt, sdispls, MPIType<IT>(), recvindsbuf, recvcnt, rdispls, MPIType<IT>(), World);
146 DeleteAll(sendindsbuf, sendcnt, sdispls);
148 for(
int i=0; i<
nprocs; ++i)
150 for(
int j = rdispls[i]; j < rdispls[i] + recvcnt[i]; ++j)
152 ConCat.arr[recvindsbuf[j]] = recvdatabuf[j];
155 DeleteAll(recvindsbuf, recvcnt, rdispls);
// Compliance check for matrix-matrix multiply (function signature elided in
// this chunk): verifies inner dimensions match and that the two operands are
// not the same object, since the SUMMA kernels below cannot multiply a matrix
// by itself in place.
160template <
typename MATRIXA,
typename MATRIXB>
163 if(
A.getncol() !=
B.getnrow())
// Dimension mismatch: build an error message (the code that prints/aborts
// is elided here — confirm against the full source).
165 std::ostringstream outs;
166 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
167 outs <<
A.getncol() <<
" != " <<
B.getnrow() << std::endl;
// Aliasing check by address: A*A would corrupt the in-place SUMMA broadcasts.
172 if((
void*) &
A == (
void*) &
B)
174 std::ostringstream outs;
175 outs <<
"Can not multiply, inputs alias (make a temporary copy of one of them first)"<< std::endl;
// MCL-style column pruning with recovery/selection (fragment; many lines
// elided). Pipeline as visible here:
//   1. hard-threshold prune (entries <= hardThreshold removed),
//   2. "recovery": columns left with fewer than recoverNum entries or less
//      than recoverPct mass get their top-recoverNum entries restored,
//   3. "selection": remaining dense columns are cut down to selectNum entries,
//   4. a second recovery pass for columns that lost too much in selection.
// NOTE(review): std::bind2nd is deprecated (removed in C++17) — candidates
// for replacement with lambdas when the full file can be edited.
185template <
typename IT,
typename NT,
typename DER>
189 MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
196 SpParMat<IT,NT,DER> PrunedA =
A.Prune(std::bind2nd(std::less_equal<NT>(), hardThreshold),
false);
// Column statistics of the pruned and unpruned matrices: sums and nnz counts
// (nnz obtained by reducing 1.0 per stored entry).
198 FullyDistVec<IT,NT> colSums = PrunedA.Reduce(
Column, std::plus<NT>(), 0.0);
199 FullyDistVec<IT,NT> nnzPerColumnUnpruned =
A.Reduce(
Column, std::plus<NT>(), 0.0, [](
NT val){
return 1.0;});
200 FullyDistVec<IT,NT> nnzPerColumn = PrunedA.Reduce(
Column, std::plus<NT>(), 0.0, [](
NT val){
return 1.0;});
// pruneCols carries the per-column threshold that will finally be applied.
202 FullyDistVec<IT,NT> pruneCols(nnzPerColumn);
203 pruneCols = hardThreshold;
205 PrunedA.FreeMemory();
// Columns with fewer than recoverNum surviving entries are recovery candidates.
207 FullyDistSpVec<IT,NT> recoverCols(nnzPerColumn, std::bind2nd(std::less<NT>(), recoverNum));
// Keep only candidates that actually had more entries before pruning
// (otherwise there is nothing to recover).
210 recoverCols = EWiseApply<NT>(recoverCols, nnzPerColumnUnpruned,
211 [](
NT spval,
NT dval){
return spval;},
212 [](
NT spval,
NT dval){
return dval > spval;},
216 recoverCols = recoverPct;
// ...and whose surviving column mass is below recoverPct.
218 recoverCols = EWiseApply<NT>(recoverCols, colSums,
219 [](
NT spval,
NT dval){
return spval;},
220 [](
NT spval,
NT dval){
return dval < spval;},
223 IT nrecover = recoverCols.getnnz();
// Kselect writes the recoverNum-th largest value of each flagged column into
// recoverCols; that becomes the column's prune threshold.
230 A.Kselect(recoverCols, recoverNum, kselectVersion);
237 pruneCols.Set(recoverCols);
240 std::ostringstream outs;
241 outs <<
"Number of columns needing recovery: " << nrecover << std::endl;
// Selection phase: columns NOT marked for recovery (spval==-1 sentinel).
250 FullyDistSpVec<IT,NT> selectCols = EWiseApply<NT>(recoverCols, colSums,
251 [](
NT spval,
NT dval){
return spval;},
252 [](
NT spval,
NT dval){
return spval==-1;},
253 true,
static_cast<NT>(-1));
255 selectCols = selectNum;
// Only columns denser than selectNum need selection.
256 selectCols = EWiseApply<NT>(selectCols, nnzPerColumn,
257 [](
NT spval,
NT dval){
return spval;},
258 [](
NT spval,
NT dval){
return dval > spval;},
260 IT nselect = selectCols.getnnz();
267 A.Kselect(selectCols, selectNum, kselectVersion);
273 pruneCols.Set(selectCols);
275 std::ostringstream outs;
276 outs <<
"Number of columns needing selection: " << nselect << std::endl;
// Simulate the selection prune to see which columns fell below the recovery
// criteria again (selectedA is a scratch copy, freed immediately after).
282 SpParMat<IT,NT,DER> selectedA =
A.PruneColumn(pruneCols, std::less<NT>(),
false);
290 FullyDistVec<IT,NT> nnzPerColumn1 = selectedA.Reduce(
Column, std::plus<NT>(), 0.0, [](
NT val){
return 1.0;});
291 FullyDistVec<IT,NT> colSums1 = selectedA.Reduce(
Column, std::plus<NT>(), 0.0);
292 selectedA.FreeMemory();
295 selectCols = recoverNum;
296 selectCols = EWiseApply<NT>(selectCols, nnzPerColumn1,
297 [](
NT spval,
NT dval){
return spval;},
298 [](
NT spval,
NT dval){
return dval < spval;},
302 selectCols = recoverPct;
303 selectCols = EWiseApply<NT>(selectCols, colSums1,
304 [](
NT spval,
NT dval){
return spval;},
305 [](
NT spval,
NT dval){
return dval < spval;},
308 IT n_recovery_after_select = selectCols.getnnz();
309 if(n_recovery_after_select>0)
316 A.Kselect(selectCols, recoverNum, kselectVersion);
321 pruneCols.Set(selectCols);
324 std::ostringstream outs1;
325 outs1 <<
// NOTE(review): this message reports nselect, but the surrounding code
// computes n_recovery_after_select — likely the intended variable. Confirm
// against upstream before changing.
"Number of columns needing recovery after selection: " << nselect << std::endl;
// Final in-place prune of A with the per-column thresholds assembled above.
338 A.PruneColumn(pruneCols, std::less<NT>(),
true);
// Detect columns emptied by the prune (handling of emptyColumns is elided).
346 FullyDistVec<IT,NT> nnzPerColumnA =
A.Reduce(
Column, std::plus<NT>(), 0.0, [](
NT val){
return 1.0;});
347 FullyDistSpVec<IT,NT> emptyColumns(nnzPerColumnA, std::bind2nd(std::equal_to<NT>(), 0.0));
// FLOP estimation for a SUMMA multiply A*B (fragment; signature body lines
// elided). Walks the SUMMA stages, broadcasting/receiving pieces of A and B
// and accumulating a local FLOP estimate, then MPI_Allreduce-sums it into a
// global count. clearA/clearB free the inputs' local storage afterwards.
355template <
typename SR,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
357 (SpParMat<IU,NU1,UDERA> &
A, SpParMat<IU,NU2,UDERB> &
B,
bool clearA =
false,
bool clearB =
false)
361 MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
// ProductGrid builds the (stages x stages) communication grid for SUMMA.
363 std::shared_ptr<CommGrid> GridC =
ProductGrid((
A.
commGrid).get(), (
B.commGrid).get(), stages, dummy, dummy);
364 IU C_m =
A.spSeq->getnrow();
365 IU C_n =
B.spSeq->getncol();
// Essential-element sizes (esscount per matrix) for every stage's broadcast.
369 IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
370 IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
380 int Aself = (
A.
commGrid)->GetRankInProcRow();
381 int Bself = (
B.commGrid)->GetRankInProcCol();
383 for(
int i = 0; i < stages; ++i)
392 ess.resize(UDERA::esscount);
393 for(
int j=0; j< UDERA::esscount; ++j)
395 ess[j] = ARecvSizes[j][i];
408 ess.resize(UDERB::esscount);
409 for(
int j=0; j< UDERB::esscount; ++j)
411 ess[j] = BRecvSizes[j][i];
// Per-stage local FLOP contribution under semiring SR (call args elided).
417 local_flops += EstimateLocalFLOP<SR>
423 if(clearA &&
A.spSeq != NULL) {
427 if(clearB &&
B.spSeq != NULL) {
439 MPI_Allreduce(&local_flops, &global_flops, 1, MPI_LONG_LONG_INT, MPI_SUM,
A.getcommgrid()->GetWorld());
// Memory-efficient SpGEMM (MCL-style): computes C = A*B in 'phases' column
// slabs of B so that intermediate triples never exceed per-process memory.
// Each slab goes through SUMMA (broadcast A/B pieces, local SpGEMM, multiway
// merge) and is immediately pruned via MCLPruneRecoverySelect before the
// slabs are concatenated column-wise into the final C.
// Fragment: braces and several declarations (stages, dummy, ARecv, BRecv,
// gannz, asquareNNZ, ...) are elided in this chunk.
449template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
450SpParMat<IU,NUO,UDERO>
MemEfficientSpGEMM (SpParMat<IU,NU1,UDERA> &
A, SpParMat<IU,NU2,UDERB> &
B,
451 int phases, NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct,
int kselectVersion,
int computationKernel,
int64_t perProcessMemory)
453 typedef typename UDERA::LocalIT LIA;
454 typedef typename UDERB::LocalIT LIB;
455 typedef typename UDERO::LocalIT LIC;
458 MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
459 if(
A.getncol() !=
B.getnrow())
461 std::ostringstream outs;
462 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
463 outs <<
A.getncol() <<
" != " <<
B.getnrow() << std::endl;
466 return SpParMat< IU,NUO,UDERO >();
// phases must be in [1, ncol); out-of-range values are reset to 1.
468 if(phases <1 || phases >=
A.getncol())
470 SpParHelper::Print(
"MemEfficientSpGEMM: The value of phases is too small or large. Resetting to 1.\n");
475 std::shared_ptr<CommGrid> GridC =
ProductGrid((
A.
commGrid).get(), (
B.commGrid).get(), stages, dummy, dummy);
477 double t0, t1, t2, t3, t4, t5;
479 MPI_Barrier(
A.getcommgrid()->GetWorld());
// If a memory budget is given, derive 'phases' from estimated footprints:
// input copies, the unreduced A^2 triples, the k-select workspace and the
// pruned output. All estimates are in bytes; perProcessMemory is in GB.
482 if(perProcessMemory>0)
485 MPI_Comm World = GridC->GetWorld();
486 MPI_Comm_size(World,&p);
488 int64_t perNNZMem_in =
sizeof(IU)*2 +
sizeof(NU1);
489 int64_t perNNZMem_out =
sizeof(IU)*2 +
sizeof(NUO);
495 int64_t inputMem = gannz * perNNZMem_in * 4;
499 int64_t asquareMem = asquareNNZ * perNNZMem_out * 2;
// d: expected nnz per column of A^2 on this process row/column.
503 int64_t d = ceil( (asquareNNZ * sqrt(p))/
B.getlocalcols() );
505 int64_t k = std::min(
int64_t(std::max(selectNum, recoverNum)), d );
506 int64_t kselectmem =
B.getlocalcols() * k * 8 * 3;
509 int64_t outputNNZ = (
B.getlocalcols() * k)/sqrt(p);
510 int64_t outputMem = outputNNZ * perNNZMem_in * 2;
513 int64_t remainingMem = perProcessMemory*1000000000 - inputMem - outputMem;
516 phases = 1 + (asquareMem+kselectmem) / remainingMem;
// If remainingMem went non-positive, the phase count is left as supplied
// (warning below) — the elided branch presumably guards this. Confirm.
524 std::cout <<
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n Warning: input and output memory requirement is greater than per-process avaiable memory. Keeping phase to the value supplied at the command line. The program may go out of memory and crash! \n !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl;
526#ifdef SHOW_MEMORY_USAGE
527 int64_t maxMemory = kselectmem/phases + inputMem + outputMem + asquareMem / phases;
528 if(maxMemory>1000000000)
529 std::cout <<
"phases: " << phases <<
": per process memory: " << perProcessMemory <<
" GB asquareMem: " << asquareMem/1000000000.00 <<
" GB" <<
" inputMem: " << inputMem/1000000000.00 <<
" GB" <<
" outputMem: " << outputMem/1000000000.00 <<
" GB" <<
" kselectmem: " << kselectmem/1000000000.00 <<
" GB" << std::endl;
531 std::cout <<
"phases: " << phases <<
": per process memory: " << perProcessMemory <<
" GB asquareMem: " << asquareMem/1000000.00 <<
" MB" <<
" inputMem: " << inputMem/1000000.00 <<
" MB" <<
" outputMem: " << outputMem/1000000.00 <<
" MB" <<
" kselectmem: " << kselectmem/1000000.00 <<
" MB" << std::endl;
542 MPI_Barrier(
A.getcommgrid()->GetWorld());
547 LIA C_m =
A.spSeq->getnrow();
548 LIB C_n =
B.spSeq->getncol();
// B is copied and column-split into 'phases' slabs; each phase multiplies
// A by one slab.
550 std::vector< UDERB > PiecesOfB;
551 UDERB CopyB = *(
B.spSeq);
553 CopyB.ColSplit(phases, PiecesOfB);
554 MPI_Barrier(GridC->GetWorld());
556 LIA ** ARecvSizes = SpHelper::allocate2D<LIA>(UDERA::esscount, stages);
557 LIB ** BRecvSizes = SpHelper::allocate2D<LIB>(UDERB::esscount, stages);
559 static_assert(std::is_same<LIA, LIB>::value,
"local index types for both input matrices should be the same");
560 static_assert(std::is_same<LIA, LIC>::value,
"local index types for input and output matrices should be the same");
569 std::vector< UDERO > toconcatenate;
571 int Aself = (
A.
commGrid)->GetRankInProcRow();
572 int Bself = (
B.commGrid)->GetRankInProcCol();
// Outer loop over column slabs of B; inner loop is a standard SUMMA sweep.
574 for(
int p = 0; p< phases; ++p)
577 std::vector< SpTuples<LIC,NUO> *> tomerge;
578 for(
int i = 0; i < stages; ++i)
580 std::vector<LIA> ess;
// The broadcast root (stage i) sends its own piece; others receive into
// a fresh object sized by the pre-exchanged essentials.
581 if(i == Aself) ARecv =
A.spSeq;
584 ess.resize(UDERA::esscount);
585 for(
int j=0; j< UDERA::esscount; ++j)
586 ess[j] = ARecvSizes[j][i];
591 MPI_Barrier(
A.getcommgrid()->GetWorld());
596 MPI_Barrier(
A.getcommgrid()->GetWorld());
602 if(i == Bself) BRecv = &(PiecesOfB[p]);
605 ess.resize(UDERB::esscount);
606 for(
int j=0; j< UDERB::esscount; ++j)
607 ess[j] = BRecvSizes[j][i];
611 MPI_Barrier(
A.getcommgrid()->GetWorld());
612 double t2=MPI_Wtime();
616 MPI_Barrier(
A.getcommgrid()->GetWorld());
617 double t3=MPI_Wtime();
623 MPI_Barrier(
A.getcommgrid()->GetWorld());
624 double t4=MPI_Wtime();
// Local multiply kernel is selectable: 1 = hash-based, 2 = heap/column.
// The bool args ask the kernel to free ARecv/BRecv when they were received
// (i.e. not this process's own piece).
626 SpTuples<LIC,NUO> * C_cont;
627 if(computationKernel == 1) C_cont = LocalSpGEMMHash<SR, NUO>(*ARecv, *BRecv,i != Aself, i != Bself,
false);
628 else if(computationKernel == 2) C_cont=LocalSpGEMM<SR, NUO>(*ARecv, *BRecv,i != Aself, i != Bself);
631 MPI_Barrier(
A.getcommgrid()->GetWorld());
632 double t5=MPI_Wtime();
636 if(!C_cont->isZero())
637 tomerge.push_back(C_cont);
643#ifdef SHOW_MEMORY_USAGE
644 int64_t gcnnz_unmerged, lcnnz_unmerged = 0;
645 for(
size_t i = 0; i < tomerge.size(); ++i)
647 lcnnz_unmerged += tomerge[i]->getnnz();
649 MPI_Allreduce(&lcnnz_unmerged, &gcnnz_unmerged, 1,
MPIType<int64_t>(), MPI_MAX, MPI_COMM_WORLD);
650 int64_t summa_memory = gcnnz_unmerged*20;
654 if(summa_memory>1000000000)
655 std::cout << p+1 <<
". unmerged: " << summa_memory/1000000000.00 <<
"GB " ;
657 std::cout << p+1 <<
". unmerged: " << summa_memory/1000000.00 <<
" MB " ;
663 MPI_Barrier(
A.getcommgrid()->GetWorld());
664 double t6=MPI_Wtime();
// Merge the per-stage triple lists for this slab (kernel-matched merger).
667 SpTuples<LIC,NUO> * OnePieceOfC_tuples;
668 if(computationKernel == 1) OnePieceOfC_tuples = MultiwayMergeHash<SR>(tomerge, C_m, PiecesOfB[p].getncol(),
true,
false);
669 else if(computationKernel == 2) OnePieceOfC_tuples = MultiwayMerge<SR>(tomerge, C_m, PiecesOfB[p].getncol(),
true);
671#ifdef SHOW_MEMORY_USAGE
672 int64_t gcnnz_merged, lcnnz_merged ;
673 lcnnz_merged = OnePieceOfC_tuples->getnnz();
674 MPI_Allreduce(&lcnnz_merged, &gcnnz_merged, 1,
MPIType<int64_t>(), MPI_MAX, MPI_COMM_WORLD);
677 int64_t merge_memory = gcnnz_merged*2*20;
681 if(merge_memory>1000000000)
682 std::cout <<
" merged: " << merge_memory/1000000000.00 <<
"GB " ;
684 std::cout <<
" merged: " << merge_memory/1000000.00 <<
" MB " ;
690 MPI_Barrier(
A.getcommgrid()->GetWorld());
691 double t7=MPI_Wtime();
// Wrap the merged slab as a matrix and prune it immediately (this is what
// keeps the peak memory bounded across phases).
694 UDERO * OnePieceOfC =
new UDERO(* OnePieceOfC_tuples,
false);
695 delete OnePieceOfC_tuples;
697 SpParMat<IU,NUO,UDERO> OnePieceOfC_mat(OnePieceOfC, GridC);
698 MCLPruneRecoverySelect(OnePieceOfC_mat, hardThreshold, selectNum, recoverNum, recoverPct, kselectVersion);
700#ifdef SHOW_MEMORY_USAGE
701 int64_t gcnnz_pruned, lcnnz_pruned ;
702 lcnnz_pruned = OnePieceOfC_mat.getlocalnnz();
703 MPI_Allreduce(&lcnnz_pruned, &gcnnz_pruned, 1,
MPIType<int64_t>(), MPI_MAX, MPI_COMM_WORLD);
707 int64_t prune_memory = gcnnz_pruned*2*20;
712 if(prune_memory>1000000000)
713 std::cout <<
"Prune: " << prune_memory/1000000000.00 <<
"GB " << std::endl ;
715 std::cout <<
"Prune: " << prune_memory/1000000.00 <<
" MB " << std::endl ;
721 toconcatenate.push_back(OnePieceOfC_mat.seq());
// Stitch the pruned slabs back together column-wise into the result.
724 UDERO *
C =
new UDERO(0,C_m, C_n,0);
725 C->ColConcatenate(toconcatenate);
729 return SpParMat<IU,NUO,UDERO> (
C, GridC);
// Phase-count estimation helper (fragment; signature and return are elided).
// Mirrors the memory model inside MemEfficientSpGEMM: from the per-process
// memory budget and estimated nnz of A^2, derive how many column slabs
// ('phases') are needed so the unreduced triples fit in memory.
732template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
734 NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct,
int kselectVersion,
int64_t perProcessMemory){
738 typedef typename UDERA::LocalIT LIA;
739 typedef typename UDERB::LocalIT LIB;
740 typedef typename UDERO::LocalIT LIC;
743 MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
746 std::shared_ptr<CommGrid> GridC =
ProductGrid((
A.
commGrid).get(), (
B.commGrid).get(), stages, dummy, dummy);
748 double t0, t1, t2, t3, t4, t5;
750 MPI_Comm World = GridC->GetWorld();
751 MPI_Comm_size(World,&p);
// Bytes per stored nonzero for input and output value types.
753 int64_t perNNZMem_in =
sizeof(IU)*2 +
sizeof(NU1);
754 int64_t perNNZMem_out =
sizeof(IU)*2 +
sizeof(NUO);
760 int64_t inputMem = gannz * perNNZMem_in * 4;
764 int64_t asquareMem = asquareNNZ * perNNZMem_out * 2;
768 int64_t d = ceil( (asquareNNZ * sqrt(p))/
B.getlocalcols() );
770 int64_t k = std::min(
int64_t(std::max(selectNum, recoverNum)), d );
771 int64_t kselectmem =
B.getlocalcols() * k * 8 * 3;
// NOTE(review): here outputNNZ uses d (not k as in MemEfficientSpGEMM) and
// remainingMem subtracts only inputMem — confirm this asymmetry is intended.
774 int64_t outputNNZ = (
B.getlocalcols() * d)/sqrt(p);
776 int64_t outputMem = outputNNZ * perNNZMem_in * 2;
780 int64_t remainingMem = perProcessMemory*1000000000 - inputMem;
785 phases = 1 + asquareMem / remainingMem;
// Double-buffered SUMMA multiply (fragment): A and B are each split in two
// halves so that only half of each matrix is broadcast per stage, reducing
// peak communication memory. B is transposed before/after so its split is
// along the right dimension; inputs are restored by Merge at the end unless
// clearA/clearB requested their destruction.
798template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
800 (SpParMat<IU,NU1,UDERA> &
A, SpParMat<IU,NU2,UDERB> &
B,
bool clearA =
false,
bool clearB =
false )
805 return SpParMat< IU,NUO,UDERO >();
807 typedef typename UDERA::LocalIT LIA;
808 typedef typename UDERB::LocalIT LIB;
809 typedef typename UDERO::LocalIT LIC;
811 static_assert(std::is_same<LIA, LIB>::value,
"local index types for both input matrices should be the same");
812 static_assert(std::is_same<LIA, LIC>::value,
"local index types for input and output matrices should be the same");
815 std::shared_ptr<CommGrid> GridC =
ProductGrid((
A.
commGrid).get(), (
B.commGrid).get(), stages, dummy, dummy);
816 LIA C_m =
A.spSeq->getnrow();
817 LIB C_n =
B.spSeq->getncol();
// Split both inputs into two halves (the double buffers).
819 UDERA * A1seq =
new UDERA();
820 UDERA * A2seq =
new UDERA();
821 UDERB * B1seq =
new UDERB();
822 UDERB * B2seq =
new UDERB();
823 (
A.spSeq)->Split( *A1seq, *A2seq);
// Transpose B so Split partitions it along the compatible dimension, then
// transpose the halves back for the multiply.
824 const_cast< UDERB*
>(
B.spSeq)->Transpose();
825 (
B.spSeq)->Split( *B1seq, *B2seq);
828 const_cast< UDERB*
>(B1seq)->Transpose();
829 const_cast< UDERB*
>(B2seq)->Transpose();
831 LIA ** ARecvSizes = SpHelper::allocate2D<LIA>(UDERA::esscount, stages);
832 LIB ** BRecvSizes = SpHelper::allocate2D<LIB>(UDERB::esscount, stages);
840 std::vector< SpTuples<LIC,NUO> *> tomerge;
842 int Aself = (
A.
commGrid)->GetRankInProcRow();
843 int Bself = (
B.commGrid)->GetRankInProcCol();
// First SUMMA sweep over the (A1, B1) halves.
845 for(
int i = 0; i < stages; ++i)
847 std::vector<LIA> ess;
854 ess.resize(UDERA::esscount);
855 for(
int j=0; j< UDERA::esscount; ++j)
857 ess[j] = ARecvSizes[j][i];
869 ess.resize(UDERB::esscount);
870 for(
int j=0; j< UDERB::esscount; ++j)
872 ess[j] = BRecvSizes[j][i];
888 SpTuples<LIC,NUO> * C_cont = LocalHybridSpGEMM<SR, NUO>
896 if(!C_cont->isZero())
897 tomerge.push_back(C_cont);
// First halves are no longer needed if the caller allows clearing.
901 if(clearA)
delete A1seq;
902 if(clearB)
delete B1seq;
// Second SUMMA sweep over the (A2, B2) halves.
909 for(
int i = 0; i < stages; ++i)
911 std::vector<LIA> ess;
918 ess.resize(UDERA::esscount);
919 for(
int j=0; j< UDERA::esscount; ++j)
921 ess[j] = ARecvSizes[j][i];
935 ess.resize(UDERB::esscount);
936 for(
int j=0; j< UDERB::esscount; ++j)
938 ess[j] = BRecvSizes[j][i];
955 SpTuples<LIC,NUO> * C_cont = LocalHybridSpGEMM<SR, NUO>
960 if(!C_cont->isZero())
961 tomerge.push_back(C_cont);
// Restore the inputs from their halves (skipped when cleared above) and
// undo B's transpose.
975 (
A.spSeq)->Merge(*A1seq, *A2seq);
989 (
B.spSeq)->Merge(*B1seq, *B2seq);
992 const_cast< UDERB*
>(
B.spSeq)->Transpose();
995 UDERO *
C =
new UDERO(MergeAll<SR>(tomerge, C_m, C_n,
true),
false);
996 return SpParMat<IU,NUO,UDERO> (
C, GridC);
// Synchronous SUMMA multiply (fragment): the classic stage loop — broadcast a
// piece of A along process rows and a piece of B along process columns, do a
// local hybrid SpGEMM, collect nonzero results, multiway-merge at the end.
1004template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
1006 (SpParMat<IU,NU1,UDERA> &
A, SpParMat<IU,NU2,UDERB> &
B,
bool clearA =
false,
bool clearB =
false )
1010 MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
// Early exit (condition elided — presumably the compliance check failed).
1013 return SpParMat< IU,NUO,UDERO >();
1016 std::shared_ptr<CommGrid> GridC =
ProductGrid((
A.
commGrid).get(), (
B.commGrid).get(), stages, dummy, dummy);
1017 IU C_m =
A.spSeq->getnrow();
1018 IU C_n =
B.spSeq->getncol();
1022 IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
1023 IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
1031 std::vector< SpTuples<IU,NUO> *> tomerge;
1033 int Aself = (
A.
commGrid)->GetRankInProcRow();
1034 int Bself = (
B.commGrid)->GetRankInProcCol();
1036 for(
int i = 0; i < stages; ++i)
1038 std::vector<IU> ess;
// Non-root ranks allocate an empty receive object shaped by the essentials.
1045 ess.resize(UDERA::esscount);
1046 for(
int j=0; j< UDERA::esscount; ++j)
1048 ess[j] = ARecvSizes[j][i];
1050 ARecv =
new UDERA();
1061 ess.resize(UDERB::esscount);
1062 for(
int j=0; j< UDERB::esscount; ++j)
1064 ess[j] = BRecvSizes[j][i];
1066 BRecv =
new UDERB();
1070 SpTuples<IU,NUO> * C_cont = LocalHybridSpGEMM<SR, NUO>
1075 if(!C_cont->isZero())
1076 tomerge.push_back(C_cont);
1078#ifdef COMBBLAS_DEBUG
1079 std::ostringstream outs;
1080 outs << i <<
"th SUMMA iteration"<< std::endl;
1085 if(clearA &&
A.spSeq != NULL)
1090 if(clearB &&
B.spSeq != NULL)
// Final merge of per-stage triples into the output matrix.
1100 SpTuples<IU,NUO> * C_tuples = MultiwayMerge<SR>(tomerge, C_m, C_n,
false);
1101 UDERO *
C =
new UDERO(*C_tuples,
false);
1107 return SpParMat<IU,NUO,UDERO> (
C, GridC);
// Communication-overlapped SUMMA multiply (fragment): issues non-blocking
// broadcasts (tracked via MPI_Request vectors) for stage i while computing on
// the already-arrived pieces of stage i-1, and waits with MPI_Waitall before
// each local multiply. Merges intermediate results incrementally to cap the
// size of 'tomerge'.
1110template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
1112 (SpParMat<IU,NU1,UDERA> &
A, SpParMat<IU,NU2,UDERB> &
B,
bool clearA =
false,
bool clearB =
false )
1115 MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
1118 return SpParMat< IU,NUO,UDERO >();
1121 std::shared_ptr<CommGrid> GridC =
ProductGrid((
A.
commGrid).get(), (
B.commGrid).get(), stages, dummy, dummy);
1122 IU C_m =
A.spSeq->getnrow();
1123 IU C_n =
B.spSeq->getncol();
1127 IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
1128 IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
// Unlike the synchronous version, all stage pieces are kept alive at once so
// computation can lag one stage behind communication.
1134 UDERA ** ARecv =
new UDERA* [stages];
1135 UDERB ** BRecv =
new UDERB* [stages];
1137 Arr<IU,NU1> Aarrinfo =
A.seqptr()->GetArrays();
1138 Arr<IU,NU2> Barrinfo =
B.seqptr()->GetArrays();
// One request vector per stage per (matrix x index/value array) broadcast.
1139 std::vector< std::vector<MPI_Request> > ABCastIndarrayReq;
1140 std::vector< std::vector<MPI_Request> > ABCastNumarrayReq;
1141 std::vector< std::vector<MPI_Request> > BBCastIndarrayReq;
1142 std::vector< std::vector<MPI_Request> > BBCastNumarrayReq;
1143 for(
int i = 0; i < stages; i++){
1144 ABCastIndarrayReq.push_back( std::vector<MPI_Request>(Aarrinfo.indarrs.size(), MPI_REQUEST_NULL) );
1145 ABCastNumarrayReq.push_back( std::vector<MPI_Request>(Aarrinfo.numarrs.size(), MPI_REQUEST_NULL) );
1146 BBCastIndarrayReq.push_back( std::vector<MPI_Request>(Barrinfo.indarrs.size(), MPI_REQUEST_NULL) );
1147 BBCastNumarrayReq.push_back( std::vector<MPI_Request>(Barrinfo.numarrs.size(), MPI_REQUEST_NULL) );
1150 int Aself = (
A.
commGrid)->GetRankInProcRow();
1151 int Bself = (
B.commGrid)->GetRankInProcCol();
1153 std::vector< SpTuples<IU,NUO> *> tomerge;
1155 for(
int i = 0; i < stages; ++i){
1156 std::vector<IU> ess;
1157 if(i == Aself) ARecv[i] =
A.spSeq;
1159 ess.resize(UDERA::esscount);
1160 for(
int j=0; j< UDERA::esscount; ++j) ess[j] = ARecvSizes[j][i];
1161 ARecv[i] =
new UDERA();
1167 if(i == Bself) BRecv[i] =
B.spSeq;
1169 ess.resize(UDERB::esscount);
1170 for(
int j=0; j< UDERB::esscount; ++j) ess[j] = BRecvSizes[j][i];
1171 BRecv[i] =
new UDERB();
// Wait for stage i-1's broadcasts to complete, then multiply that stage
// while stage i's broadcasts progress in the background.
1176 MPI_Waitall(ABCastIndarrayReq[i-1].
size(), ABCastIndarrayReq[i-1].data(), MPI_STATUSES_IGNORE);
1177 MPI_Waitall(ABCastNumarrayReq[i-1].
size(), ABCastNumarrayReq[i-1].data(), MPI_STATUSES_IGNORE);
1178 MPI_Waitall(BBCastIndarrayReq[i-1].
size(), BBCastIndarrayReq[i-1].data(), MPI_STATUSES_IGNORE);
1179 MPI_Waitall(BBCastNumarrayReq[i-1].
size(), BBCastNumarrayReq[i-1].data(), MPI_STATUSES_IGNORE);
1181 SpTuples<IU,NUO> * C_cont = LocalHybridSpGEMM<SR, NUO>
1182 (*(ARecv[i-1]), *(BRecv[i-1]),
1185 if(!C_cont->isZero()) tomerge.push_back(C_cont);
// Incremental merge: collapse accumulated triples into one list so memory
// doesn't grow with the number of stages.
1187 SpTuples<IU,NUO> * C_tuples = MultiwayMerge<SR>(tomerge, C_m, C_n,
true);
1188 std::vector< SpTuples<IU,NUO> *>().swap(tomerge);
1189 tomerge.push_back(C_tuples);
1191 #ifdef COMBBLAS_DEBUG
1192 std::ostringstream outs;
1193 outs << i <<
"th SUMMA iteration"<< std::endl;
// Drain the last stage after the loop (the loop computed up to stage
// stages-2's multiply).
1198 MPI_Waitall(ABCastIndarrayReq[stages-1].
size(), ABCastIndarrayReq[stages-1].data(), MPI_STATUSES_IGNORE);
1199 MPI_Waitall(ABCastNumarrayReq[stages-1].
size(), ABCastNumarrayReq[stages-1].data(), MPI_STATUSES_IGNORE);
1200 MPI_Waitall(BBCastIndarrayReq[stages-1].
size(), BBCastIndarrayReq[stages-1].data(), MPI_STATUSES_IGNORE);
1201 MPI_Waitall(BBCastNumarrayReq[stages-1].
size(), BBCastNumarrayReq[stages-1].data(), MPI_STATUSES_IGNORE);
1203 SpTuples<IU,NUO> * C_cont = LocalHybridSpGEMM<SR, NUO>
1204 (*(ARecv[stages-1]), *(BRecv[stages-1]),
1207 if(!C_cont->isZero()) tomerge.push_back(C_cont);
1209 if(clearA &&
A.spSeq != NULL) {
1213 if(clearB &&
B.spSeq != NULL) {
1225 SpTuples<IU,NUO> * C_tuples = MultiwayMerge<SR>(tomerge, C_m, C_n,
true);
1226 std::vector< SpTuples<IU,NUO> *>().swap(tomerge);
1228 UDERO *
C =
new UDERO(*C_tuples,
false);
1234 return SpParMat<IU,NUO,UDERO> (
C, GridC);
// Estimate nnz of C = A*B via a symbolic SUMMA sweep (fragment): per stage,
// broadcast pieces of A and B, count per-column flops/nnz (flopC / colnnzC,
// allocation elided), accumulate nnzC_SUMMA, and MPI_Allreduce the per-process
// maximum. Used by MemEfficientSpGEMM's memory model.
1242template <
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
1245 typedef typename UDERA::LocalIT LIA;
1246 typedef typename UDERB::LocalIT LIB;
1247 static_assert(std::is_same<LIA, LIB>::value,
"local index types for both input matrices should be the same");
1253 if(
A.getncol() !=
B.getnrow())
1255 std::ostringstream outs;
1256 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
1257 outs <<
A.getncol() <<
" != " <<
B.getnrow() << std::endl;
1264 std::shared_ptr<CommGrid> GridC =
ProductGrid((
A.
commGrid).get(), (
B.commGrid).get(), stages, dummy, dummy);
1266 MPI_Barrier(GridC->GetWorld());
1268 LIA ** ARecvSizes = SpHelper::allocate2D<LIA>(UDERA::esscount, stages);
1269 LIB ** BRecvSizes = SpHelper::allocate2D<LIB>(UDERB::esscount, stages);
1277 int Aself = (
A.
commGrid)->GetRankInProcRow();
1278 int Bself = (
B.commGrid)->GetRankInProcCol();
1281 for(
int i = 0; i < stages; ++i)
1283 std::vector<LIA> ess;
1290 ess.resize(UDERA::esscount);
1291 for(
int j=0; j< UDERA::esscount; ++j)
1293 ess[j] = ARecvSizes[j][i];
1295 ARecv =
new UDERA();
1307 ess.resize(UDERB::esscount);
1308 for(
int j=0; j< UDERB::esscount; ++j)
1310 ess[j] = BRecvSizes[j][i];
1312 BRecv =
new UDERB();
// nzc: number of nonempty columns in the received B piece (DCSC layout).
1323 LIB nzc = BRecv->GetDCSC()->nzc;
1325 if (flopC)
delete [] flopC;
1326 if(colnnzC)
delete [] colnnzC;
// Reduce with MPI_MAX: the estimate is the worst-case per-process nnz.
1344 MPI_Allreduce(&nnzC_SUMMA, &nnzC_SUMMA_max, 1,
MPIType<int64_t>(), MPI_MAX, GridC->GetWorld());
1346 return nnzC_SUMMA_max;
// SpMV compliance check (fragment; signature elided): the matrix's column
// count must equal the vector's length, and both operands must live on the
// same communication grid.
1350template <
typename MATRIX,
typename VECTOR>
1353 if(
A.getncol() != x.TotalLength())
1355 std::ostringstream outs;
1356 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
1357 outs <<
A.getncol() <<
" != " << x.TotalLength() << std::endl;
1361 if(! ( *(
A.getcommgrid()) == *(x.getcommgrid())) )
1363 std::cout <<
"Grids are not comparable for SpMV" << std::endl;
// SpMV overloads (fragment): a declaration, followed by the convenience
// overload that forwards to the optimized-buffer version with a fresh,
// empty OptBuf (i.e. no pre-allocated communication buffers).
1369template <
typename SR,
typename IU,
typename NUM,
typename UDER>
1370FullyDistVec<IU,typename promote_trait<NUM,IU>::T_promote>
SpMV
1373template <
typename SR,
typename IU,
typename NUM,
typename UDER>
1374FullyDistVec<IU,typename promote_trait<NUM,IU>::T_promote>
SpMV
1375 (
const SpParMat<IU,NUM,UDER> &
A,
const FullyDistSpVec<IU,IU> & x,
bool indexisvalue)
// T_promote is the promoted value type (typedef elided in this chunk).
1378 OptBuf<int32_t, T_promote > optbuf = OptBuf<int32_t, T_promote >();
1379 return SpMV<SR>(
A, x, indexisvalue, optbuf);
1387template<
typename IU,
typename NV>
1388void TransposeVector(MPI_Comm & World,
const FullyDistSpVec<IU,NV> & x,
int32_t & trxlocnz, IU & lenuntil,
int32_t * & trxinds, NV * & trxnums,
bool indexisvalue)
1393 IU luntil = x.LengthUntil();
1394 int diagneigh = x.
commGrid->GetComplementRank();
1397 MPI_Sendrecv(&roffst, 1,
MPIType<int32_t>(), diagneigh,
TROST, &roffset, 1,
MPIType<int32_t>(), diagneigh,
TROST, World, &status);
1398 MPI_Sendrecv(&xlocnz, 1,
MPIType<int32_t>(), diagneigh,
TRNNZ, &trxlocnz, 1,
MPIType<int32_t>(), diagneigh,
TRNNZ, World, &status);
1399 MPI_Sendrecv(&luntil, 1, MPIType<IU>(), diagneigh,
TRLUT, &lenuntil, 1, MPIType<IU>(), diagneigh,
TRLUT, World, &status);
1403 trxinds =
new int32_t[trxlocnz];
1406#pragma omp parallel for
1408 for(
int i=0; i< xlocnz; ++i)
1409 temp_xind[i] = (
int32_t) x.ind[i];
1410 MPI_Sendrecv(temp_xind, xlocnz,
MPIType<int32_t>(), diagneigh,
TRI, trxinds, trxlocnz,
MPIType<int32_t>(), diagneigh,
TRI, World, &status);
1411 delete [] temp_xind;
1414 trxnums =
new NV[trxlocnz];
1415 MPI_Sendrecv(
const_cast<NV*
>(
SpHelper::p2a(x.num)), xlocnz, MPIType<NV>(), diagneigh,
TRX, trxnums, trxlocnz, MPIType<NV>(), diagneigh,
TRX, World, &status);
1418 std::transform(trxinds, trxinds+trxlocnz, trxinds, std::bind2nd(std::plus<int32_t>(), roffset));
// Column-world allgather (fragment): collect every process-column member's
// transposed vector piece into contiguous indacc/numacc arrays. When
// indexisvalue is true, values are reconstructed as global indices
// (index + lenuntil of column head) instead of being gathered.
1429template<
typename IU,
typename NV>
1431 int32_t * & indacc, NV * & numacc,
int & accnz,
bool indexisvalue)
1433 int colneighs, colrank;
1434 MPI_Comm_size(ColWorld, &colneighs);
1435 MPI_Comm_rank(ColWorld, &colrank);
// Gather per-process nnz counts, then build displacements and total size.
1436 int * colnz =
new int[colneighs];
1437 colnz[colrank] = trxlocnz;
1438 MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colnz, 1, MPI_INT, ColWorld);
1439 int * dpls =
new int[colneighs]();
1440 std::partial_sum(colnz, colnz+colneighs-1, dpls+1);
1441 accnz = std::accumulate(colnz, colnz+colneighs, 0);
1443 numacc =
new NV[accnz];
1453 double t0=MPI_Wtime();
// indexisvalue path: broadcast the column head's LengthUntil and derive
// each value from its gathered index — avoids gathering numerical values.
1461 if(colrank == 0) lenuntilcol = lenuntil;
1462 MPI_Bcast(&lenuntilcol, 1, MPIType<IU>(), 0, ColWorld);
1463 for(
int i=0; i< accnz; ++i)
1465 numacc[i] = indacc[i] + lenuntilcol;
// General path: gather the actual values.
1470 MPI_Allgatherv(trxnums, trxlocnz, MPIType<NV>(), numacc, colnz, dpls, MPIType<NV>(), ColWorld);
1474 double t1=MPI_Wtime();
// LocalSpMV: multiply the local sparse matrix piece by the gathered vector
// entries (indacc/numacc), producing per-destination-row-neighbor send
// buffers. Three paths: (1) pre-allocated OptBuf buffers, (2) threaded
// split-matrix kernel, (3) single-threaded kernel followed by manual
// bucketing of results by destination process.
1488template<
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1489void LocalSpMV(
const SpParMat<IU,NUM,UDER> &
A,
int rowneighs, OptBuf<int32_t, OVT > & optbuf,
int32_t * & indacc, IVT * & numacc,
1490 int32_t * & sendindbuf, OVT * & sendnumbuf,
int * & sdispls,
int * sendcnt,
int accnz,
bool indexisvalue, PreAllocatedSPA<OVT> & SPA)
// Path 1: buffers were pre-allocated (optbuf.totmax > 0) — kernels write
// directly into optbuf.inds/nums with per-neighbor displacements.
1492 if(optbuf.totmax > 0)
1494 if(
A.spSeq->getnsplit() > 0)
1497 generic_gespmv_threaded_setbuffers<SR> (*(
A.spSeq), indacc, numacc, accnz, optbuf.inds, optbuf.nums, sendcnt, optbuf.dspls, rowneighs);
1501 generic_gespmv<SR> (*(
A.spSeq), indacc, numacc, accnz, optbuf.inds, optbuf.nums, sendcnt, optbuf.dspls, rowneighs, indexisvalue);
// Path 2: no optbuf, but the matrix is split for threading — the threaded
// kernel allocates sendindbuf/sendnumbuf/sdispls itself and returns the
// total count, from which per-neighbor send counts are derived.
1507 if(
A.spSeq->getnsplit() > 0)
1510 int totalsent = generic_gespmv_threaded<SR> (*(
A.spSeq), indacc, numacc, accnz, sendindbuf, sendnumbuf, sdispls, rowneighs, SPA);
1513 for(
int i=0; i<rowneighs-1; ++i)
1514 sendcnt[i] = sdispls[i+1] - sdispls[i];
1515 sendcnt[rowneighs-1] = totalsent - sdispls[rowneighs-1];
// Path 3: single-threaded kernel into vectors, then bucket by destination.
1520 std::vector< int32_t > indy;
1521 std::vector< OVT > numy;
1522 generic_gespmv<SR>(*(
A.spSeq), indacc, numacc, accnz, indy, numy, SPA);
1526 int32_t bufsize = indy.size();
1527 sendindbuf =
new int32_t[bufsize];
1528 sendnumbuf =
new OVT[bufsize];
// Results (indy) are sorted by row index, so a single forward sweep
// assigns each entry to its owner and rebases the index locally.
1529 int32_t perproc =
A.getlocalrows() / rowneighs;
1532 for(
int i=0; i<rowneighs; ++i)
1534 int32_t end_this = (i==rowneighs-1) ?
A.getlocalrows(): (i+1)*perproc;
1535 while(k < bufsize && indy[k] < end_this)
1537 sendindbuf[k] = indy[k] - i*perproc;
1538 sendnumbuf[k] = numy[k];
1543 sdispls =
new int[rowneighs]();
1544 std::partial_sum(sendcnt, sendcnt+rowneighs-1, sdispls+1);
// MergeContributions: k-way merge of nlists sorted (index, value) lists into
// a single sorted list, combining duplicate indices with the semiring's
// SR::add. Single-list input is a straight copy; multi-list input uses a
// min-heap (sHeap — declaration elided in this chunk) keyed on index.
1556template <
typename SR,
typename IU,
typename OVT>
1557void MergeContributions(
int* listSizes, std::vector<int32_t *> & indsvec, std::vector<OVT *> & numsvec, std::vector<IU>& mergedind, std::vector<OVT>& mergednum)
1560 int nlists = indsvec.size();
// Fast path: one list — copy it verbatim.
1566 int veclen = listSizes[0];
1567 mergedind.resize(veclen);
1568 mergednum.resize(veclen);
1569 for(
int i=0; i<veclen; i++)
1571 mergedind[i] = indsvec[0][i];
1572 mergednum[i] = numsvec[0][i];
1578 int32_t inf = std::numeric_limits<int32_t>::min();
1579 int32_t sup = std::numeric_limits<int32_t>::max();
// processed[i]: how many entries of list i have been consumed.
1581 int * processed =
new int[nlists]();
1582 for(
int i=0; i<nlists; ++i)
1584 if(listSizes[i] > 0)
1587 sHeap.insert(indsvec[i][0], i);
// Seed the output with the global minimum, then repeatedly pop; equal
// indices are folded into the tail element with SR::add, new indices are
// appended.
1594 sHeap.deleteMin(&key, &locv);
1595 mergedind.push_back(
static_cast<IU
>(key));
1596 mergednum.push_back(numsvec[locv][0]);
1598 if( (++(processed[locv])) < listSizes[locv] )
1599 sHeap.insert(indsvec[locv][processed[locv]], locv);
1605 sHeap.deleteMin(&key, &locv);
1606 if(mergedind.back() ==
static_cast<IU
>(key))
1608 mergednum.back() = SR::add(mergednum.back(), numsvec[locv][processed[locv]]);
1614 mergedind.push_back(
static_cast<IU
>(key));
1615 mergednum.push_back(numsvec[locv][processed[locv]]);
1618 if( (++(processed[locv])) < listSizes[locv] )
1619 sHeap.insert(indsvec[locv][processed[locv]], locv);
// MergeContributions_threaded: OpenMP-parallel version of MergeContributions.
// Splits the index domain [0, maxindex) into ~4*nthreads ranges via binary
// search (lower_bound) on each sorted list, merges each range independently
// with the serial MergeContributions, then concatenates the per-range
// results using a prefix-sum of output sizes.
1628template <
typename SR,
typename IU,
typename OVT>
1629void MergeContributions_threaded(
int * & listSizes, std::vector<int32_t *> & indsvec, std::vector<OVT *> & numsvec, std::vector<IU> & mergedind, std::vector<OVT> & mergednum, IU maxindex)
1632 int nlists = indsvec.size();
// Fast path: one list — parallel copy.
1638 int veclen = listSizes[0];
1639 mergedind.resize(veclen);
1640 mergednum.resize(veclen);
1643#pragma omp parallel for
1645 for(
int i=0; i<veclen; i++)
1647 mergedind[i] = indsvec[0][i];
1648 mergednum[i] = numsvec[0][i];
1657 nthreads = omp_get_num_threads();
// Oversubscribe 4x for load balance; never more splits than indices.
1660 int nsplits = 4*nthreads;
1661 nsplits = std::min(nsplits, (
int)maxindex);
// splitters[k][i]: offset into list k where range i begins.
1662 std::vector< std::vector<int32_t> > splitters(nlists);
1663 for(
int k=0; k< nlists; k++)
1665 splitters[k].resize(nsplits+1);
1666 splitters[k][0] =
static_cast<int32_t>(0);
1667#pragma omp parallel for
1668 for(
int i=1; i< nsplits; i++)
1670 IU cur_idx = i * (maxindex/nsplits);
1671 auto it = std::lower_bound (indsvec[k], indsvec[k] + listSizes[k], cur_idx);
1672 splitters[k][i] = (
int32_t) (it - indsvec[k]);
1674 splitters[k][nsplits] = listSizes[k];
// Merge each index range independently into per-range buffers.
1678 std::vector<std::vector<IU>> indsBuf(nsplits);
1679 std::vector<std::vector<OVT>> numsBuf(nsplits);
1681#pragma omp parallel for schedule(dynamic)
1682 for(
int i=0; i< nsplits; i++)
1684 std::vector<int32_t *> tIndsVec(nlists);
1685 std::vector<OVT *> tNumsVec(nlists);
1686 std::vector<int> tLengths(nlists);
1687 for(
int j=0; j< nlists; ++j)
1689 tIndsVec[j] = indsvec[j] + splitters[j][i];
1690 tNumsVec[j] = numsvec[j] + splitters[j][i];
1691 tLengths[j]= splitters[j][i+1] - splitters[j][i];
1694 MergeContributions<SR>(tLengths.data(), tIndsVec, tNumsVec, indsBuf[i], numsBuf[i]);
// Prefix-sum of per-range output sizes gives each range's write offset.
1698 std::vector<IU> tdisp(nsplits+1);
1700 for(
int i=0; i<nsplits; ++i)
1702 tdisp[i+1] = tdisp[i] + indsBuf[i].size();
1705 mergedind.resize(tdisp[nsplits]);
1706 mergednum.resize(tdisp[nsplits]);
// Parallel scatter of per-range results into the final output.
1709#pragma omp parallel for schedule(dynamic)
1710 for(
int i=0; i< nsplits; i++)
1712 std::copy(indsBuf[i].data() , indsBuf[i].data() + indsBuf[i].
size(), mergedind.data() + tdisp[i]);
1713 std::copy(numsBuf[i].data() , numsBuf[i].data() + numsBuf[i].
size(), mergednum.data() + tdisp[i]);
// NOTE(review): extraction-damaged region — several declarations (trxlocnz,
// lenuntil, indacc, recvindbuf, rowneighs, the #ifdef THREADED guards, etc.)
// and braces are missing from this view. Comments describe only visible code.
//
// Main parallel y = A * x for a sparse vector x, with a communication-
// optimization buffer (optbuf) and a pre-allocated SPA. Phases visible here:
//  1) transpose x to the diagonal neighbor, 2) allgather along the processor
//  column, 3) local SpMV, 4) alltoall(v) along the processor row,
//  5) merge received contributions into y.
1724template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1725void SpMV (
const SpParMat<IU,NUM,UDER> &
A,
const FullyDistSpVec<IU,IVT> & x, FullyDistSpVec<IU,OVT> & y,
1726 bool indexisvalue, OptBuf<int32_t, OVT > & optbuf, PreAllocatedSPA<OVT> & SPA)
1730 y.glen =
A.getnrow();
1732 MPI_Comm World = x.
commGrid->GetWorld();
1733 MPI_Comm ColWorld = x.commGrid->GetColWorld();
1734 MPI_Comm RowWorld = x.commGrid->GetRowWorld();
1740 IVT *trxnums, *numacc;
1743 double t0=MPI_Wtime();
// Phase 1: exchange x's local chunk with the transpose (complement-rank) process.
1746 TransposeVector(World, x, trxlocnz, lenuntil, trxinds, trxnums, indexisvalue);
1749 double t1=MPI_Wtime();
// Phase 2: replicate the transposed piece down the processor column (skipped
// for a 1-row grid where no replication is needed).
1753 if(x.commGrid->GetGridRows() > 1)
1755 AllGatherVector(ColWorld, trxlocnz, lenuntil, trxinds, trxnums, indacc, numacc, accnz, indexisvalue);
1765 MPI_Comm_size(RowWorld, &rowneighs);
1766 int * sendcnt =
new int[rowneighs]();
1772 double t2=MPI_Wtime();
// Phase 3: local multiply; results are bucketed per row-neighbor into
// sendindbuf/sendnumbuf (or into optbuf when the optimization is active).
1775 LocalSpMV<SR>(
A, rowneighs, optbuf, indacc, numacc, sendindbuf, sendnumbuf, sdispls, sendcnt, accnz, indexisvalue, SPA);
1778 double t3=MPI_Wtime();
// Single-column grid: no row exchange needed, copy local results straight to y.
1783 if(x.commGrid->GetGridCols() == 1)
1785 y.ind.resize(sendcnt[0]);
1786 y.num.resize(sendcnt[0]);
// optbuf.totmax > 0 means the preallocated communication buffer was used.
1789 if(optbuf.totmax > 0 )
1792#pragma omp parallel for
1794 for(
int i=0; i<sendcnt[0]; i++)
1796 y.ind[i] = optbuf.inds[i];
1797 y.num[i] = optbuf.nums[i];
1803#pragma omp parallel for
1805 for(
int i=0; i<sendcnt[0]; i++)
1807 y.ind[i] = sendindbuf[i];
1808 y.num[i] = sendnumbuf[i];
1810 DeleteAll(sendindbuf, sendnumbuf,sdispls);
// Phase 4: exchange counts, build displacements, then alltoallv the values.
1815 int * rdispls =
new int[rowneighs];
1816 int * recvcnt =
new int[rowneighs];
1817 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, RowWorld);
1821 for(
int i=0; i<rowneighs-1; ++i)
1823 rdispls[i+1] = rdispls[i] + recvcnt[i];
1826 int totrecv = std::accumulate(recvcnt,recvcnt+rowneighs,0);
1828 OVT * recvnumbuf =
new OVT[totrecv];
1831 double t4=MPI_Wtime();
1833 if(optbuf.totmax > 0 )
1836 MPI_Alltoallv(optbuf.nums, sendcnt, optbuf.dspls, MPIType<OVT>(), recvnumbuf, recvcnt, rdispls, MPIType<OVT>(), RowWorld);
1842 MPI_Alltoallv(sendnumbuf, sendcnt, sdispls, MPIType<OVT>(), recvnumbuf, recvcnt, rdispls, MPIType<OVT>(), RowWorld);
1843 DeleteAll(sendindbuf, sendnumbuf, sendcnt, sdispls);
1846 double t5=MPI_Wtime();
1851 double t6=MPI_Wtime();
// Phase 5: release y's old storage, then merge the per-neighbor sorted runs.
1855 std::vector<IU>().swap(y.ind);
1856 std::vector<OVT>().swap(y.num);
1858 std::vector<int32_t *> indsvec(rowneighs);
1859 std::vector<OVT *> numsvec(rowneighs);
1862#pragma omp parallel for
1864 for(
int i=0; i<rowneighs; i++)
1866 indsvec[i] = recvindbuf+rdispls[i];
1867 numsvec[i] = recvnumbuf+rdispls[i];
// Threaded vs serial merge — presumably selected by a (missing) #ifdef THREADED.
1870 MergeContributions_threaded<SR>(recvcnt, indsvec, numsvec, y.ind, y.num, y.MyLocLength());
1872 MergeContributions<SR>(recvcnt, indsvec, numsvec, y.ind, y.num);
1875 DeleteAll(recvcnt, rdispls,recvindbuf, recvnumbuf);
1877 double t7=MPI_Wtime();
1884template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1885void SpMV (
const SpParMat<IU,NUM,UDER> &
A,
const FullyDistSpVec<IU,IVT> & x, FullyDistSpVec<IU,OVT> & y,
bool indexisvalue, PreAllocatedSPA<OVT> & SPA)
1887 OptBuf< int32_t, OVT > optbuf = OptBuf< int32_t,OVT >();
1888 SpMV<SR>(
A, x, y, indexisvalue, optbuf, SPA);
1891template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1892void SpMV (
const SpParMat<IU,NUM,UDER> &
A,
const FullyDistSpVec<IU,IVT> & x, FullyDistSpVec<IU,OVT> & y,
bool indexisvalue)
1894 OptBuf< int32_t, OVT > optbuf = OptBuf< int32_t,OVT >();
1895 PreAllocatedSPA<OVT> SPA;
1896 SpMV<SR>(
A, x, y, indexisvalue, optbuf, SPA);
1899template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1900void SpMV (
const SpParMat<IU,NUM,UDER> &
A,
const FullyDistSpVec<IU,IVT> & x, FullyDistSpVec<IU,OVT> & y,
bool indexisvalue, OptBuf<int32_t, OVT > & optbuf)
1902 PreAllocatedSPA<OVT> SPA;
1903 SpMV<SR>(
A, x, y, indexisvalue, optbuf, SPA);
// NOTE(review): extraction-damaged — the parameter list and the T_promote
// typedef line are missing from this view; presumably the signature is
// (A, x, indexisvalue, optbuf) returning the y it constructs. TODO confirm
// against the original header.
//
// Value-returning SpMV wrapper: builds the result vector sized to A's row
// dimension on x's grid, then delegates to the in-place overload.
1911template <
typename SR,
typename IU,
typename NUM,
typename UDER>
1912FullyDistSpVec<IU,typename promote_trait<NUM,IU>::T_promote>
SpMV
1916 FullyDistSpVec<IU, T_promote> y ( x.getcommgrid(),
A.getnrow());
1917 SpMV<SR>(
A, x, y, indexisvalue, optbuf);
// NOTE(review): extraction-damaged — declarations such as trxsize, status,
// rowneighs, begptr/endptr, the T_promote typedef, #ifdef guards, and the
// final return are missing from this view. Comments describe visible code only.
//
// Dense-vector SpMV: y = A * x for a FullyDistVec x. Pattern: transpose x to
// the diagonal neighbor, allgather along the processor column, multiply
// locally into a dense buffer, then MPI_Reduce row-segments to their owners.
1924template <
typename SR,
typename IU,
typename NUM,
typename NUV,
typename UDER>
1925FullyDistVec<IU,typename promote_trait<NUM,NUV>::T_promote>
SpMV
1926 (
const SpParMat<IU,NUM,UDER> &
A,
const FullyDistVec<IU,NUV> & x )
1931 MPI_Comm World = x.commGrid->GetWorld();
1932 MPI_Comm ColWorld = x.commGrid->GetColWorld();
1933 MPI_Comm RowWorld = x.commGrid->GetRowWorld();
1935 int xsize = (int) x.LocArrSize();
// Exchange sizes, then data, with the transpose (complement-rank) process.
1938 int diagneigh = x.commGrid->GetComplementRank();
1940 MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh,
TRX, &trxsize, 1, MPI_INT, diagneigh,
TRX, World, &status);
1942 NUV * trxnums =
new NUV[trxsize];
1943 MPI_Sendrecv(
const_cast<NUV*
>(
SpHelper::p2a(x.arr)), xsize, MPIType<NUV>(), diagneigh,
TRX, trxnums, trxsize, MPIType<NUV>(), diagneigh,
TRX, World, &status);
// Allgather the transposed piece down the processor column into numacc.
1945 int colneighs, colrank;
1946 MPI_Comm_size(ColWorld, &colneighs);
1947 MPI_Comm_rank(ColWorld, &colrank);
1948 int * colsize =
new int[colneighs];
1949 colsize[colrank] = trxsize;
1950 MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize, 1, MPI_INT, ColWorld);
1951 int * dpls =
new int[colneighs]();
1952 std::partial_sum(colsize, colsize+colneighs-1, dpls+1);
1953 int accsize = std::accumulate(colsize, colsize+colneighs, 0);
1954 NUV * numacc =
new NUV[accsize];
1956 MPI_Allgatherv(trxnums, trxsize, MPIType<NUV>(), numacc, colsize, dpls, MPIType<NUV>(), ColWorld);
// Dense local output initialized to the semiring identity.
1960 T_promote
id = SR::id();
1961 IU ysize =
A.getlocalrows();
1962 T_promote * localy =
new T_promote[ysize];
1963 std::fill_n(localy, ysize,
id);
// Threaded vs serial local multiply — presumably guarded by a missing #ifdef.
1966 dcsc_gespmv_threaded<SR>(*(
A.spSeq), numacc, localy);
1968 dcsc_gespmv<SR>(*(
A.spSeq), numacc, localy);
1975 FullyDistVec<IU, T_promote> y ( x.commGrid,
A.getnrow(),
id);
1978 MPI_Comm_size(RowWorld, &rowneighs);
// Reduce each row-neighbor's segment of localy to its owner with SR's MPI op.
1981 for(
int i=0; i< rowneighs; ++i)
1983 begptr = y.RowLenUntil(i);
1984 if(i == rowneighs-1)
1990 endptr = y.RowLenUntil(i+1);
1992 MPI_Reduce(localy+begptr,
SpHelper::p2a(y.arr), endptr-begptr, MPIType<T_promote>(), SR::mpi_op(), i, RowWorld);
// NOTE(review): extraction-damaged — missing from this view: trxlocnz/offset/
// status declarations, the T_promote typedef, tmpindacc's allocation, locind,
// y.ind/y.num resizing before the final fill, deletes, and the return.
// Comments describe visible code only.
//
// Sparse-vector SpMV (older NUV-typed variant): transpose x, allgather along
// the column, local multiply producing (indy, numy), redistribute results to
// row owners with alltoallv, then accumulate duplicates into y via a dense
// scratch array + "isthere" bitmap.
2004template <
typename SR,
typename IU,
typename NUM,
typename NUV,
typename UDER>
2005FullyDistSpVec<IU,typename promote_trait<NUM,NUV>::T_promote>
SpMV
2006 (
const SpParMat<IU,NUM,UDER> &
A,
const FullyDistSpVec<IU,NUV> & x)
2011 MPI_Comm World = x.commGrid->GetWorld();
2012 MPI_Comm ColWorld = x.commGrid->GetColWorld();
2013 MPI_Comm RowWorld = x.commGrid->GetRowWorld();
2015 int xlocnz = (int) x.getlocnnz();
2017 int roffst = x.RowLenUntil();
// Exchange nnz count, row offset, then index and value arrays with the
// transpose (complement-rank) process.
2020 int diagneigh = x.commGrid->GetComplementRank();
2022 MPI_Sendrecv(&xlocnz, 1, MPI_INT, diagneigh,
TRX, &trxlocnz, 1, MPI_INT, diagneigh,
TRX, World, &status);
2023 MPI_Sendrecv(&roffst, 1, MPI_INT, diagneigh,
TROST, &offset, 1, MPI_INT, diagneigh,
TROST, World, &status);
2025 IU * trxinds =
new IU[trxlocnz];
2026 NUV * trxnums =
new NUV[trxlocnz];
2027 MPI_Sendrecv(
const_cast<IU*
>(
SpHelper::p2a(x.ind)), xlocnz, MPIType<IU>(), diagneigh,
TRX, trxinds, trxlocnz, MPIType<IU>(), diagneigh,
TRX, World, &status);
2028 MPI_Sendrecv(
const_cast<NUV*
>(
SpHelper::p2a(x.num)), xlocnz, MPIType<NUV>(), diagneigh,
TRX, trxnums, trxlocnz, MPIType<NUV>(), diagneigh,
TRX, World, &status);
// Shift received indices by the transpose partner's row offset to make them
// column-global. (std::bind2nd is removed in C++17 — flag for modernization.)
2029 std::transform(trxinds, trxinds+trxlocnz, trxinds, std::bind2nd(std::plus<IU>(), offset));
// Allgather indices and values down the processor column.
2031 int colneighs, colrank;
2032 MPI_Comm_size(ColWorld, &colneighs);
2033 MPI_Comm_rank(ColWorld, &colrank);
2034 int * colnz =
new int[colneighs];
2035 colnz[colrank] = trxlocnz;
2036 MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colnz, 1, MPI_INT, ColWorld);
2037 int * dpls =
new int[colneighs]();
2038 std::partial_sum(colnz, colnz+colneighs-1, dpls+1);
2039 int accnz = std::accumulate(colnz, colnz+colneighs, 0);
2040 IU * indacc =
new IU[accnz];
2041 NUV * numacc =
new NUV[accnz];
2045 MPI_Allgatherv(trxinds, trxlocnz, MPIType<IU>(), indacc, colnz, dpls, MPIType<IU>(), ColWorld);
2046 MPI_Allgatherv(trxnums, trxlocnz, MPIType<NUV>(), numacc, colnz, dpls, MPIType<NUV>(), ColWorld);
// Local multiply emits 32-bit indices; narrow the gathered indices first.
2050 std::vector< int32_t > indy;
2051 std::vector< T_promote > numy;
2054 for(
int i=0; i< accnz; ++i) tmpindacc[i] = indacc[i];
2057 dcsc_gespmv<SR>(*(
A.spSeq), tmpindacc, numacc, accnz, indy, numy);
// Bucket local results by owning process within the row.
2062 FullyDistSpVec<IU, T_promote> y ( x.commGrid,
A.getnrow());
2063 IU yintlen = y.MyRowLength();
2066 MPI_Comm_size(RowWorld,&rowneighs);
2067 std::vector< std::vector<IU> > sendind(rowneighs);
2068 std::vector< std::vector<T_promote> > sendnum(rowneighs);
2069 typename std::vector<int32_t>::size_type outnz = indy.size();
2070 for(
typename std::vector<IU>::size_type i=0; i< outnz; ++i)
2073 int rown = y.OwnerWithinRow(yintlen,
static_cast<IU
>(indy[i]), locind);
2074 sendind[rown].push_back(locind);
2075 sendnum[rown].push_back(numy[i]);
// Flatten buckets into contiguous send buffers and exchange counts.
2078 IU * sendindbuf =
new IU[outnz];
2079 T_promote * sendnumbuf =
new T_promote[outnz];
2080 int * sendcnt =
new int[rowneighs];
2081 int * sdispls =
new int[rowneighs];
2082 for(
int i=0; i<rowneighs; ++i)
2083 sendcnt[i] = sendind[i].
size();
2085 int * rdispls =
new int[rowneighs];
2086 int * recvcnt =
new int[rowneighs];
2087 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, RowWorld);
2091 for(
int i=0; i<rowneighs-1; ++i)
2093 sdispls[i+1] = sdispls[i] + sendcnt[i];
2094 rdispls[i+1] = rdispls[i] + recvcnt[i];
2096 int totrecv = std::accumulate(recvcnt,recvcnt+rowneighs,0);
2097 IU * recvindbuf =
new IU[totrecv];
2098 T_promote * recvnumbuf =
new T_promote[totrecv];
// Copy then immediately free each bucket (swap trick) to bound peak memory.
2100 for(
int i=0; i<rowneighs; ++i)
2102 std::copy(sendind[i].begin(), sendind[i].end(), sendindbuf+sdispls[i]);
2103 std::vector<IU>().swap(sendind[i]);
2105 for(
int i=0; i<rowneighs; ++i)
2107 std::copy(sendnum[i].begin(), sendnum[i].end(), sendnumbuf+sdispls[i]);
2108 std::vector<T_promote>().swap(sendnum[i]);
2110 MPI_Alltoallv(sendindbuf, sendcnt, sdispls, MPIType<IU>(), recvindbuf, recvcnt, rdispls, MPIType<IU>(), RowWorld);
2111 MPI_Alltoallv(sendnumbuf, sendcnt, sdispls, MPIType<T_promote>(), recvnumbuf, recvcnt, rdispls, MPIType<T_promote>(), RowWorld);
2114 DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
// Accumulate duplicates: first occurrence stores, repeats combine via SR::add;
// isthere marks which local slots have been written.
2117 IU ysize = y.MyLocLength();
2118 T_promote * localy =
new T_promote[ysize];
2119 bool * isthere =
new bool[ysize];
2120 std::vector<IU> nzinds;
2121 std::fill_n(isthere, ysize,
false);
2123 for(
int i=0; i< totrecv; ++i)
2125 if(!isthere[recvindbuf[i]])
2127 localy[recvindbuf[i]] = recvnumbuf[i];
2128 nzinds.push_back(recvindbuf[i]);
2129 isthere[recvindbuf[i]] =
true;
2133 localy[recvindbuf[i]] = SR::add(localy[recvindbuf[i]], recvnumbuf[i]);
2136 DeleteAll(isthere, recvindbuf, recvnumbuf);
// Emit nonzeros in sorted index order.
2137 sort(nzinds.begin(), nzinds.end());
2138 int nnzy = nzinds.size();
2141 for(
int i=0; i< nnzy; ++i)
2143 y.ind[i] = nzinds[i];
2144 y.num[i] = localy[nzinds[i]];
// NOTE(review): extraction-damaged — the grid-equality check and the local
// computation that produces `result` are missing from this view. Presumably it
// mirrors the EWiseMult pattern below: compare commGrids, compute a sequential
// set difference of the local pieces, wrap in a SpParMat. TODO confirm.
//
// Returns A \ B (elements of A whose positions are not in B); on grid
// mismatch, prints a diagnostic and returns an empty matrix.
2156template <
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
2157SpParMat<IU,NU1,UDERA>
SetDifference(
const SpParMat<IU,NU1,UDERA> &
A,
const SpParMat<IU,NU2,UDERB> &
B)
2162 return SpParMat<IU, NU1, UDERA> (result,
A.
commGrid);
// Error path: incompatible process grids.
2166 std::cout <<
"Grids are not comparable for set difference" << std::endl;
2168 return SpParMat< IU,NU1,UDERA >();
// NOTE(review): extraction-damaged — the return-type line and the
// N_promote/DER_promote typedefs are missing from this view, as is the
// grid-equality check guarding the success path.
//
// Elementwise multiply of two distributed sparse matrices; when `exclude` is
// true the sequential EWiseMult keeps entries of A *not* present in B.
// On grid mismatch, prints a diagnostic and returns an empty matrix.
2173template <
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
2175 (
const SpParMat<IU,NU1,UDERA> &
A,
const SpParMat<IU,NU2,UDERB> &
B ,
bool exclude)
// Delegate to the sequential kernel on the local submatrices; the returned
// local matrix is wrapped with A's communication grid.
2182 DER_promote * result =
new DER_promote(
EWiseMult(*(
A.spSeq),*(
B.spSeq),exclude) );
2183 return SpParMat<IU, N_promote, DER_promote> (result,
A.
commGrid);
2187 std::cout <<
"Grids are not comparable elementwise multiplication" << std::endl;
2189 return SpParMat< IU,N_promote,DER_promote >();
// NOTE(review): extraction-damaged — the return-type line and the
// grid-equality check are missing from this view.
//
// Elementwise apply (notB variant) over two distributed sparse matrices:
// applies __binary_op to aligned local pieces; `notB` restricts to entries of
// A absent from B, with `defaultBVal` used as B's value there. On grid
// mismatch, prints a diagnostic and returns an empty matrix.
2193template <
typename RETT,
typename RETDER,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB,
typename _BinaryOperation>
2195 (
const SpParMat<IU,NU1,UDERA> &
A,
const SpParMat<IU,NU2,UDERB> &
B, _BinaryOperation __binary_op,
bool notB,
const NU2& defaultBVal)
// Sequential kernel on the local submatrices, wrapped with A's grid.
2199 RETDER * result =
new RETDER( EWiseApply<RETT>(*(
A.spSeq),*(
B.spSeq), __binary_op, notB, defaultBVal) );
2200 return SpParMat<IU, RETT, RETDER> (result,
A.
commGrid);
2204 std::cout <<
"Grids are not comparable elementwise apply" << std::endl;
2206 return SpParMat< IU,RETT,RETDER >();
// NOTE(review): extraction-damaged — the return-type line and grid-equality
// check are missing from this view.
//
// Full elementwise apply over two distributed sparse matrices with a
// per-entry predicate (do_op) deciding whether an output entry is produced,
// null-handling flags for entries present in only one operand, and an
// allowIntersect switch. useExtendedBinOp is accepted for overload
// disambiguation but not forwarded to the sequential kernel.
2210template <
typename RETT,
typename RETDER,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB,
typename _BinaryOperation,
typename _BinaryPredicate>
2212 (
const SpParMat<IU,NU1,UDERA> &
A,
const SpParMat<IU,NU2,UDERB> &
B, _BinaryOperation __binary_op, _BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1& ANullVal,
const NU2& BNullVal,
const bool allowIntersect,
const bool useExtendedBinOp)
// Sequential kernel on the local submatrices, wrapped with A's grid.
2216 RETDER * result =
new RETDER( EWiseApply<RETT>(*(
A.spSeq),*(
B.spSeq), __binary_op, do_op, allowANulls, allowBNulls, ANullVal, BNullVal, allowIntersect) );
2217 return SpParMat<IU, RETT, RETDER> (result,
A.
commGrid);
2221 std::cout <<
"Grids are not comparable elementwise apply" << std::endl;
2223 return SpParMat< IU,RETT,RETDER >();
// NOTE(review): extraction-damaged — the lines forwarding __binary_op and
// do_op (presumably wrapped for the extended-binary-op calling convention)
// are missing between the argument lists below.
//
// User-facing EWiseApply wrapper (matrix version): defaults allowIntersect to
// true and forwards to the full implementation with useExtendedBinOp = true.
2228template <
typename RETT,
typename RETDER,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB,
typename _BinaryOperation,
typename _BinaryPredicate>
2229SpParMat<IU,RETT,RETDER>
2230EWiseApply (
const SpParMat<IU,NU1,UDERA> &
A,
const SpParMat<IU,NU2,UDERB> &
B, _BinaryOperation __binary_op, _BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1& ANullVal,
const NU2& BNullVal,
const bool allowIntersect =
true)
2232 return EWiseApply<RETT, RETDER>(
A,
B,
2235 allowANulls, allowBNulls, ANullVal, BNullVal, allowIntersect,
true);
// NOTE(review): extraction-damaged — the T_promote typedef, the return after
// the dimension-mismatch message, `actual_splits`, tlosizes updates, and
// several braces/#else markers are missing from this view.
//
// Elementwise multiply of a sparse vector V by a dense vector W.
//  exclude == true : keep V's entries where W is `zero` (value passes through).
//  exclude == false: keep entries where W is nonzero; value is V.num * W.arr.
// An experimental OpenMP path (CBLAS_EXPERIMENTAL) splits V's nonzeros into
// per-thread buckets and concatenates them via a prefix sum.
2243template <
typename IU,
typename NU1,
typename NU2>
2244FullyDistSpVec<IU,typename promote_trait<NU1,NU2>::T_promote>
EWiseMult
2245 (
const FullyDistSpVec<IU,NU1> & V,
const FullyDistVec<IU,NU2> & W ,
bool exclude, NU2 zero)
2249 if(*(V.commGrid) == *(W.commGrid))
2251 FullyDistSpVec< IU, T_promote> Product(V.commGrid);
2252 if(V.glen != W.glen)
2254 std::cerr <<
"Vector dimensions don't match for EWiseMult\n";
2259 Product.glen = V.glen;
2260 IU
size= V.getlocnnz();
// Experimental threaded exclude-path: per-thread buckets, then prefix-sum copy.
2263 #if defined(_OPENMP) && defined(CBLAS_EXPERIMENTAL)
2265 std::vector <IU> tlosizes (actual_splits, 0);
2266 std::vector < std::vector<IU> > tlinds(actual_splits);
2267 std::vector < std::vector<T_promote> > tlnums(actual_splits);
2268 IU tlsize =
size / actual_splits;
2269 #pragma omp parallel for
2270 for(IU t = 0; t < actual_splits; ++t)
2272 IU tlbegin = t*tlsize;
2273 IU tlend = (t==actual_splits-1)?
size : (t+1)*tlsize;
2274 for(IU i=tlbegin; i<tlend; ++i)
2276 if(W.arr[V.ind[i]] == zero)
2278 tlinds[t].push_back(V.ind[i]);
2279 tlnums[t].push_back(V.num[i]);
2284 std::vector<IU> prefix_sum(actual_splits+1,0);
2285 std::partial_sum(tlosizes.begin(), tlosizes.end(), prefix_sum.begin()+1);
2286 Product.ind.resize(prefix_sum[actual_splits]);
2287 Product.num.resize(prefix_sum[actual_splits]);
2289 #pragma omp parallel for
2290 for(IU t=0; t< actual_splits; ++t)
2292 std::copy(tlinds[t].begin(), tlinds[t].end(), Product.ind.begin()+prefix_sum[t]);
2293 std::copy(tlnums[t].begin(), tlnums[t].end(), Product.num.begin()+prefix_sum[t]);
// Serial exclude-path: keep entries of V where W equals `zero`.
2296 for(IU i=0; i<
size; ++i)
2298 if(W.arr[V.ind[i]] == zero)
2300 Product.ind.push_back(V.ind[i]);
2301 Product.num.push_back(V.num[i]);
// Non-exclude path: keep entries where W is nonzero, multiply values.
2308 for(IU i=0; i<
size; ++i)
2310 if(W.arr[V.ind[i]] != zero)
2312 Product.ind.push_back(V.ind[i]);
2313 Product.num.push_back(V.num[i] * W.arr[V.ind[i]]);
2322 std::cout <<
"Grids are not comparable elementwise multiplication" << std::endl;
2324 return FullyDistSpVec< IU,T_promote>();
// NOTE(review): extraction-damaged — missing here: the return-type line,
// SpMVTimer-style lines, the nthreads/curthread/perthread declarations, the
// error-return after the dimension message, several braces, and the final
// `return Product`. Comments describe visible code only.
//
// Threaded elementwise apply of sparse V against dense W. Each thread handles
// a contiguous index sub-range: when allowVNulls (dense-driven) the thread
// walks its dense range and advances a sparse cursor found via lower_bound;
// otherwise (sparse-driven) it walks a slice of V's nonzeros. Per-thread
// results are concatenated via a prefix sum of sizes.
2332template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2334 (
const FullyDistSpVec<IU,NU1> & V,
const FullyDistVec<IU,NU2> & W , _BinaryOperation _binary_op, _BinaryPredicate _doOp,
bool allowVNulls, NU1 Vzero,
const bool useExtendedBinOp)
2336 typedef RET T_promote;
2337 if(*(V.commGrid) == *(W.commGrid))
2339 FullyDistSpVec< IU, T_promote> Product(V.commGrid);
2340 if(V.TotalLength() != W.TotalLength())
2342 std::ostringstream outs;
2343 outs <<
"Vector dimensions don't match (" << V.TotalLength() <<
" vs " << W.TotalLength() <<
") for EWiseApply (short version)\n";
2353 nthreads = omp_get_num_threads();
2357 Product.glen = V.glen;
2358 IU
size= W.LocArrSize();
2359 IU spsize = V.getlocnnz();
// Per-thread output buckets; stride depends on which array drives iteration.
2362 std::vector<std::vector<IU>> tProductInd(nthreads);
2363 std::vector<std::vector<T_promote>> tProductVal(nthreads);
2366 perthread =
size/nthreads;
2368 perthread = spsize/nthreads;
2376 curthread = omp_get_thread_num();
2378 IU tStartIdx = perthread * curthread;
2379 IU tNextIdx = perthread * (curthread+1);
// Last thread absorbs the remainder of the range.
2383 if(curthread == nthreads-1) tNextIdx =
size;
// Dense-driven path: locate this thread's first sparse entry by binary search.
2386 auto it = std::lower_bound (V.ind.begin(), V.ind.end(), tStartIdx);
2387 IU tSpIdx = (IU) std::distance(V.ind.begin(), it);
2390 for(IU tIdx=tStartIdx; tIdx < tNextIdx; ++tIdx)
2392 if(tSpIdx < spsize && V.ind[tSpIdx] < tNextIdx && V.ind[tSpIdx] == tIdx)
2394 if (_doOp(V.num[tSpIdx], W.arr[tIdx],
false,
false))
2396 tProductInd[curthread].push_back(tIdx);
2397 tProductVal[curthread].push_back (_binary_op(V.num[tSpIdx], W.arr[tIdx],
false,
false));
// Position not in V: apply with the V-null placeholder Vzero.
2403 if (_doOp(Vzero, W.arr[tIdx],
true,
false))
2405 tProductInd[curthread].push_back(tIdx);
2406 tProductVal[curthread].push_back (_binary_op(Vzero, W.arr[tIdx],
true,
false));
// Sparse-driven path: each thread walks its slice of V's nonzeros directly.
2413 if(curthread == nthreads-1) tNextIdx = spsize;
2414 for(IU tSpIdx=tStartIdx; tSpIdx < tNextIdx; ++tSpIdx)
2416 if (_doOp(V.num[tSpIdx], W.arr[V.ind[tSpIdx]],
false,
false))
2419 tProductInd[curthread].push_back( V.ind[tSpIdx]);
2420 tProductVal[curthread].push_back (_binary_op(V.num[tSpIdx], W.arr[V.ind[tSpIdx]],
false,
false));
// Concatenate per-thread buckets using a prefix sum of their sizes.
2426 std::vector<IU> tdisp(nthreads+1);
2428 for(
int i=0; i<nthreads; ++i)
2430 tdisp[i+1] = tdisp[i] + tProductInd[i].size();
2434 Product.ind.resize(tdisp[nthreads]);
2435 Product.num.resize(tdisp[nthreads]);
2443 curthread = omp_get_thread_num();
2445 std::copy(tProductInd[curthread].begin(), tProductInd[curthread].end(), Product.ind.data() + tdisp[curthread]);
2446 std::copy(tProductVal[curthread].begin() , tProductVal[curthread].end(), Product.num.data() + tdisp[curthread]);
2453 std::cout <<
"Grids are not comparable for EWiseApply" << std::endl;
2455 return FullyDistSpVec< IU,T_promote>();
// NOTE(review): extraction-damaged — missing here: the return-type line, the
// #ifdef guard around the threaded dispatch, sp_iter's declaration, ++sp_iter
// advances, the error-return, and the final `return Product`.
//
// Serial elementwise apply of sparse V against dense W (with a visible
// delegation to EWiseApply_threaded, presumably under a THREADED guard —
// TODO confirm). Walks the dense range with a sparse cursor when allowVNulls;
// otherwise walks only V's nonzeros.
2478template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2480(
const FullyDistSpVec<IU,NU1> & V,
const FullyDistVec<IU,NU2> & W , _BinaryOperation _binary_op, _BinaryPredicate _doOp,
bool allowVNulls, NU1 Vzero,
const bool useExtendedBinOp)
2484 return EWiseApply_threaded<RET>(V, W, _binary_op, _doOp, allowVNulls, Vzero, useExtendedBinOp);
2487 typedef RET T_promote;
2488 if(*(V.commGrid) == *(W.commGrid))
2490 FullyDistSpVec< IU, T_promote> Product(V.commGrid);
2492 if(V.TotalLength() != W.TotalLength())
2494 std::ostringstream outs;
2495 outs <<
"Vector dimensions don't match (" << V.TotalLength() <<
" vs " << W.TotalLength() <<
") for EWiseApply (short version)\n";
2501 Product.glen = V.glen;
2502 IU
size= W.LocArrSize();
2503 IU spsize = V.getlocnnz();
// Dense-driven scan: sp_iter tracks the current position in V's nonzeros.
2508 for(IU i=0; i<
size; ++i)
2510 if(sp_iter < spsize && V.ind[sp_iter] == i)
2512 if (_doOp(V.num[sp_iter], W.arr[i],
false,
false))
2514 Product.ind.push_back(i);
2515 Product.num.push_back(_binary_op(V.num[sp_iter], W.arr[i],
false,
false));
// Position absent from V: use the Vzero placeholder.
2521 if (_doOp(Vzero, W.arr[i],
true,
false))
2523 Product.ind.push_back(i);
2524 Product.num.push_back(_binary_op(Vzero, W.arr[i],
true,
false));
// Sparse-driven scan (allowVNulls == false): visit only V's nonzeros.
2532 for(sp_iter = 0; sp_iter < spsize; ++sp_iter)
2534 if (_doOp(V.num[sp_iter], W.arr[V.ind[sp_iter]],
false,
false))
2536 Product.ind.push_back(V.ind[sp_iter]);
2537 Product.num.push_back(_binary_op(V.num[sp_iter], W.arr[V.ind[sp_iter]],
false,
false));
2547 std::cout <<
"Grids are not comparable for EWiseApply" << std::endl;
2549 return FullyDistSpVec< IU,T_promote>();
// NOTE(review): extraction-damaged — missing here: the return-type line, the
// error-return after the dimension message, the *indV == *indW equality test
// opening the first branch, iterator increments, allowIntersect handling, and
// the final `return Product`.
//
// Sparse-sparse elementwise apply: classic two-pointer sweep over the sorted
// index arrays of V and W. Matching indices invoke the op with both values;
// unmatched indices use the Vzero/Wzero placeholders subject to the
// allowVNulls/allowWNulls flags; trailing tails are drained afterwards.
2579template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2581 (
const FullyDistSpVec<IU,NU1> & V,
const FullyDistSpVec<IU,NU2> & W , _BinaryOperation _binary_op, _BinaryPredicate _doOp,
bool allowVNulls,
bool allowWNulls, NU1 Vzero, NU2 Wzero,
const bool allowIntersect,
const bool useExtendedBinOp)
2584 typedef RET T_promote;
2585 if(*(V.commGrid) == *(W.commGrid))
2587 FullyDistSpVec< IU, T_promote> Product(V.commGrid);
2588 if(V.glen != W.glen)
2590 std::ostringstream outs;
2591 outs <<
"Vector dimensions don't match (" << V.glen <<
" vs " << W.glen <<
") for EWiseApply (full version)\n";
2597 Product.glen = V.glen;
2598 typename std::vector< IU >::const_iterator indV = V.ind.begin();
2599 typename std::vector< NU1 >::const_iterator numV = V.num.begin();
2600 typename std::vector< IU >::const_iterator indW = W.ind.begin();
2601 typename std::vector< NU2 >::const_iterator numW = W.num.begin();
2603 while (indV < V.ind.end() && indW < W.ind.end())
// Case: index present in both V and W.
2610 if (_doOp(*numV, *numW,
false,
false))
2612 Product.ind.push_back(*indV);
2613 Product.num.push_back(_binary_op(*numV, *numW,
false,
false));
// Case: index only in V — apply with W-null placeholder.
2619 else if (*indV < *indW)
2624 if (_doOp(*numV, Wzero,
false,
true))
2626 Product.ind.push_back(*indV);
2627 Product.num.push_back(_binary_op(*numV, Wzero,
false,
true));
// Case: index only in W — apply with V-null placeholder.
2637 if (_doOp(Vzero, *numW,
true,
false))
2639 Product.ind.push_back(*indW);
2640 Product.num.push_back(_binary_op(Vzero, *numW,
true,
false));
// Drain leftover tail of V (W exhausted) when W-nulls are allowed.
2647 while (allowWNulls && indV < V.ind.end())
2649 if (_doOp(*numV, Wzero,
false,
true))
2651 Product.ind.push_back(*indV);
2652 Product.num.push_back(_binary_op(*numV, Wzero,
false,
true));
// Drain leftover tail of W (V exhausted) when V-nulls are allowed.
2656 while (allowVNulls && indW < W.ind.end())
2658 if (_doOp(Vzero, *numW,
true,
false))
2660 Product.ind.push_back(*indW);
2661 Product.num.push_back(_binary_op(Vzero, *numW,
true,
false));
2670 std::cout <<
"Grids are not comparable for EWiseApply" << std::endl;
2672 return FullyDistSpVec< IU,T_promote>();
// NOTE(review): extraction-damaged — the return-type line and the lines that
// forward _binary_op/_doOp (presumably adapted to the extended calling
// convention) are missing between the visible argument lines.
//
// Public sparse-vs-dense EWiseApply wrapper: forwards to the extended
// implementation with useExtendedBinOp = true.
2677template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2679 (
const FullyDistSpVec<IU,NU1> & V,
const FullyDistVec<IU,NU2> & W , _BinaryOperation _binary_op, _BinaryPredicate _doOp,
bool allowVNulls, NU1 Vzero)
2683 return EWiseApply<RET>(V, W,
2686 allowVNulls, Vzero,
true);
// NOTE(review): extraction-damaged — the return-type line and the
// op-forwarding lines are missing between the visible argument lines.
//
// Public sparse-vs-sparse EWiseApply wrapper: defaults allowIntersect to true
// and forwards to the extended implementation with useExtendedBinOp = true.
2691template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2693 (
const FullyDistSpVec<IU,NU1> & V,
const FullyDistSpVec<IU,NU2> & W , _BinaryOperation _binary_op, _BinaryPredicate _doOp,
bool allowVNulls,
bool allowWNulls, NU1 Vzero, NU2 Wzero,
const bool allowIntersect =
true)
2695 return EWiseApply<RET>(V, W,
2698 allowVNulls, allowWNulls, Vzero, Wzero, allowIntersect,
true);
// NOTE(review): the following span is a heavily damaged fragment — the bodies
// of the struct/function templates here (presumably the SelectMinxSR semiring
// and MPI reduction helpers used by the nnz estimator below — TODO confirm)
// are almost entirely missing. Only isolated loops and declarations survive.
//
// samparr_t: per-element array of NROUNDS exponential samples used by the
// sampling-based nnz estimator.
2714typedef std::array<float, NROUNDS>
samparr_t;
2716template <
typename NZT>
2727 template<
typename c,
typename t,
typename V>
2739template<
typename NZT>
// Visible fragment: initialize a sample array to +infinity sentinels
// (float max) — presumably the semiring identity for element-wise min.
2745 for (
auto it = arr.begin();
it != arr.end(); ++
it)
2746 *
it = std::numeric_limits<float>::max();
2761 for (
int i = 0; i <
NROUNDS; ++i)
// Visible fragment: one-time registration flag (presumably for a custom
// MPI_Op / datatype — TODO confirm).
2783 static bool exists =
false;
2800 for (
int i = 0; i < *
len; ++i)
// NOTE(review): extraction-damaged — the function's own name/return line
// (original lines 2809-2810) is missing, as are myrank/nthds/sa/tmp
// declarations, the transpose calls, the per-sample min extraction feeding
// `tmp`, the allreduce producing nnzC_tot, and the return. Comments describe
// visible code only.
//
// Sampling-based estimator for nnz(A*B): propagates arrays of NROUNDS
// exponential(lambda) random samples through two min-semiring SpMVs
// (B then A, on the transposed matrices), then converts the per-column
// minimum samples into an nnz estimate via the standard
// (NROUNDS-1)/sum-of-minima estimator.
2807template <
typename IU,
typename NU1,
typename NU2,
2808 typename UDERA,
typename UDERB>
2811 SpParMat<IU, NU1, UDERA> &
A, SpParMat<IU, NU2, UDERB> &
B
2815 MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
2816 float lambda = 1.0f;
2820 #pragma omp parallel
2823 nthds = omp_get_num_threads();
2827 std::cout <<
"taking transposes." << std::endl;
2833 std::cout <<
"setting initial samples." << std::endl;
// One samparr_t per column of A, distributed like A's columns.
2836 FullyDistVec<IU, samparr_t> samples_init(
A.getcommgrid(),
A.getncol(), sa);
2839 #pragma omp parallel
// Draw NROUNDS exponential samples per local element.
2842 std::default_random_engine gen;
2843 std::exponential_distribution<float> exp_dist(lambda);
2846 #pragma omp parallel for
2848 for (IU i = 0; i < samples_init.LocArrSize(); ++i)
2851 for (
auto it = tmp.begin(); it != tmp.end(); ++it)
2852 *it = exp_dist(gen);
2853 samples_init.SetLocalElement(i, tmp);
2861 std::cout <<
"computing mid samples." << std::endl;
// Propagate minima through A, then through B (min-select semiring).
2863 FullyDistVec<IU, samparr_t> samples_mid =
2864 SpMV<SelectMinxSR<NU1> > (
A, samples_init);
2870 std::cout <<
"computing final samples." << std::endl;
2872 FullyDistVec<IU, samparr_t> samples_final =
2873 SpMV<SelectMinxSR<NU2> > (
B, samples_mid);
2879 std::cout <<
"computing nnz estimation." << std::endl;
2881 float nnzest = 0.0f;
2883 std::cout << myrank <<
"samples_final loc size: "
2884 << samples_final.LocArrSize() << std::endl;
2886 const samparr_t *lsamples = samples_final.GetLocArr();
// Accumulate the per-column estimates; `tmp` is presumably the sum of the
// NROUNDS minima for column i (its computation is missing from this view).
2889 #pragma omp parallel for reduction (+:nnzest)
2891 for (IU i = 0; i < samples_final.LocArrSize(); ++i)
2894 for (
auto it = lsamples[i].begin(); it != lsamples[i].end(); ++it)
2896 nnzest +=
static_cast<float>(
NROUNDS - 1) / tmp;
2900 std::cout <<
"taking transposes again." << std::endl;
// Global total presumably via an allreduce over B's world (call missing).
2905 (
B.commGrid)->GetWorld());
2908 std::cout <<
"sampling-based spmv est tot: " << nnzC_tot << std::endl;
// NOTE(review): extraction-damaged — missing from this view: myrank/stages/
// dummy/ARecv/BRecv declarations, SpecialExchangeData-style calls for the
// essentials, timer assignments, the LocalSpGEMMHash argument list, several
// deletes/brace closures, and the final `return C`. Comments describe
// visible code only.
//
// 3D SUMMA sparse matrix-matrix multiply: C = A*B over semiring SR on a
// process grid replicated into layers. Per layer: standard 2D SUMMA with
// stage-wise broadcasts of A (row world) and B (col world) and hash-based
// local SpGEMM; then results are split by column ranges and exchanged across
// the fiber (inter-layer) world, and finally merged per process.
2918template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDER1,
typename UDER2>
2919SpParMat3D<IU,NUO,UDERO>
Mult_AnXBn_SUMMA3D(SpParMat3D<IU,NU1,UDER1> &
A, SpParMat3D<IU,NU2,UDER2> &
B){
2921 MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
2922 typedef typename UDERO::LocalIT LIC;
2923 typedef typename UDER1::LocalIT LIA;
2924 typedef typename UDER2::LocalIT LIB;
2927 double t0, t1, t2, t3;
// Conformability check on the global dimensions.
2933 if(
A.getncol() !=
B.getnrow()){
2934 std::ostringstream outs;
2935 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
2936 outs <<
A.getncol() <<
" != " <<
B.getnrow() << std::endl;
// divisions3d: how C's columns will be split among the grid layers.
2944 vector<LIB> divisions3d;
2947 B.CalculateColSplitDistributionOfLayer(divisions3d);
// Per-layer 2D product grid and output dimensions.
2957 std::shared_ptr<CommGrid> GridC =
ProductGrid((
A.GetLayerMat()->getcommgrid()).get(),
2958 (
B.GetLayerMat()->getcommgrid()).get(),
2959 stages, dummy, dummy);
2960 IU C_m =
A.GetLayerMat()->seqptr()->getnrow();
2961 IU C_n =
B.GetLayerMat()->seqptr()->getncol();
2963 IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERO::esscount, stages);
2964 IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERO::esscount, stages);
2972 std::vector< SpTuples<IU,NUO> *> tomerge;
2974 int Aself = (
A.GetLayerMat()->getcommgrid())->GetRankInProcRow();
2975 int Bself = (
B.GetLayerMat()->getcommgrid())->GetRankInProcCol();
2977 double Abcast_time = 0;
2978 double Bbcast_time = 0;
2979 double Local_multiplication_time = 0;
// --- 2D SUMMA stages within this layer ---
2981 for(
int i = 0; i < stages; ++i) {
2982 std::vector<IU> ess;
// Stage owner reuses its local piece; others allocate an empty receiver
// sized from the pre-exchanged essentials.
2985 ARecv =
A.GetLayerMat()->seqptr();
2988 ess.resize(UDER1::esscount);
2989 for(
int j=0; j<UDER1::esscount; ++j) {
2990 ess[j] = ARecvSizes[j][i];
2992 ARecv =
new UDER1();
// Broadcast A's index and value arrays along the processor row.
3001 Arr<IU,NU1> Aarrinfo = ARecv->GetArrays();
3003 for(
unsigned int idx = 0; idx < Aarrinfo.indarrs.size(); ++idx) {
3004 MPI_Bcast(Aarrinfo.indarrs[idx].addr, Aarrinfo.indarrs[idx].count, MPIType<IU>(), i, GridC->GetRowWorld());
3007 for(
unsigned int idx = 0; idx < Aarrinfo.numarrs.size(); ++idx) {
3008 MPI_Bcast(Aarrinfo.numarrs[idx].addr, Aarrinfo.numarrs[idx].count, MPIType<NU1>(), i, GridC->GetRowWorld());
3012 Abcast_time += (t3-t2);
// Same pattern for B along the processor column.
3016 BRecv =
B.GetLayerMat()->seqptr();
3019 ess.resize(UDER2::esscount);
3020 for(
int j=0; j<UDER2::esscount; ++j) {
3021 ess[j] = BRecvSizes[j][i];
3023 BRecv =
new UDER2();
3026 MPI_Barrier(
A.GetLayerMat()->getcommgrid()->GetWorld());
3033 Arr<IU,NU2> Barrinfo = BRecv->GetArrays();
3035 for(
unsigned int idx = 0; idx < Barrinfo.indarrs.size(); ++idx) {
3036 MPI_Bcast(Barrinfo.indarrs[idx].addr, Barrinfo.indarrs[idx].count, MPIType<IU>(), i, GridC->GetColWorld());
3038 for(
unsigned int idx = 0; idx < Barrinfo.numarrs.size(); ++idx) {
3039 MPI_Bcast(Barrinfo.numarrs[idx].addr, Barrinfo.numarrs[idx].count, MPIType<NU2>(), i, GridC->GetColWorld());
3043 Bbcast_time += (t3-t2);
// Hash-based local SpGEMM for this stage; keep nonzero partial products.
3049 SpTuples<IU,NUO> * C_cont = LocalSpGEMMHash<SR, NUO>
3056 Local_multiplication_time += (t3-t2);
3059 if(!C_cont->isZero()) tomerge.push_back(C_cont);
// Merge this layer's partial products into one sorted tuple list.
3069 SpTuples<IU,NUO> * C_tuples = MultiwayMergeHash<SR>(tomerge, C_m, C_n,
true,
false);
3077 fprintf(stderr,
"[SUMMA3D]\tAbcast_time: %lf\n", Abcast_time);
3078 fprintf(stderr,
"[SUMMA3D]\tBbcast_time: %lf\n", Bbcast_time);
3079 fprintf(stderr,
"[SUMMA3D]\tLocal_multiplication_time: %lf\n", Local_multiplication_time);
3080 fprintf(stderr,
"[SUMMA3D]\tMerge_layer_time: %lf\n", (t3-t2));
3088 if(myrank == 0) fprintf(stderr,
"[SUMMA3D]\tSUMMA time: %lf\n", (t1-t0));
// --- Inter-layer (fiber) redistribution of C by column ranges ---
// Tuples are sent as raw bytes via a contiguous MPI datatype.
3097 MPI_Datatype MPI_tuple;
3098 MPI_Type_contiguous(
sizeof(std::tuple<LIC,LIC,NUO>), MPI_CHAR, &MPI_tuple);
3099 MPI_Type_commit(&MPI_tuple);
3106 int * sendcnt =
new int[
A.getcommgrid3D()->GetGridLayers()];
3107 int * sendprfl =
new int[
A.getcommgrid3D()->GetGridLayers()*3];
3108 int * sdispls =
new int[
A.getcommgrid3D()->GetGridLayers()]();
3109 int * recvcnt =
new int[
A.getcommgrid3D()->GetGridLayers()];
3110 int * recvprfl =
new int[
A.getcommgrid3D()->GetGridLayers()*3];
3111 int * rdispls =
new int[
A.getcommgrid3D()->GetGridLayers()]();
3113 vector<IU> divisions3dPrefixSum(divisions3d.size());
3114 divisions3dPrefixSum[0] = 0;
3115 std::partial_sum(divisions3d.begin(), divisions3d.end()-1, divisions3dPrefixSum.begin()+1);
3116 ColLexiCompare<IU,NUO> comp;
3117 IU totsend = C_tuples->getnnz();
// Find each layer's column-range [start_col, end_col) in the column-sorted
// tuple array by binary search; sendprfl also carries local dims.
3119#pragma omp parallel for
3120 for(
int i=0; i <
A.getcommgrid3D()->GetGridLayers(); ++i){
3121 IU start_col = divisions3dPrefixSum[i];
3122 IU end_col = divisions3dPrefixSum[i] + divisions3d[i];
3123 std::tuple<IU, IU, NUO> search_tuple_start(0, start_col, NUO());
3124 std::tuple<IU, IU, NUO> search_tuple_end(0, end_col, NUO());
3125 std::tuple<IU, IU, NUO>* start_it = std::lower_bound(C_tuples->tuples, C_tuples->tuples + C_tuples->getnnz(), search_tuple_start, comp);
3126 std::tuple<IU, IU, NUO>* end_it = std::lower_bound(C_tuples->tuples, C_tuples->tuples + C_tuples->getnnz(), search_tuple_end, comp);
3128 sendcnt[i] = (int)(end_it - start_it);
3129 sendprfl[i*3+0] = (int)(sendcnt[i]);
3130 sendprfl[i*3+1] = (int)(
A.GetLayerMat()->seqptr()->getnrow());
3131 sendprfl[i*3+2] = (int)(divisions3d[i]);
3133 std::partial_sum(sendcnt, sendcnt+
A.getcommgrid3D()->GetGridLayers()-1, sdispls+1);
// Rebase column indices to be local to the destination layer's range.
3136 for(
int i=0; i <
A.getcommgrid3D()->GetGridLayers(); ++i){
3137#pragma omp parallel for schedule(static)
3138 for(
int j = 0; j < sendcnt[i]; j++){
3139 std::get<1>(C_tuples->tuples[sdispls[i]+j]) = std::get<1>(C_tuples->tuples[sdispls[i]+j]) - divisions3dPrefixSum[i];
// Exchange {count, nrow, ncol} triples, then the tuple payloads.
3143 MPI_Alltoall(sendprfl, 3, MPI_INT, recvprfl, 3, MPI_INT,
A.getcommgrid3D()->GetFiberWorld());
3145 for(
int i = 0; i <
A.getcommgrid3D()->GetGridLayers(); i++) recvcnt[i] = recvprfl[i*3];
3146 std::partial_sum(recvcnt, recvcnt+
A.getcommgrid3D()->GetGridLayers()-1, rdispls+1);
3147 IU totrecv = std::accumulate(recvcnt,recvcnt+
A.getcommgrid3D()->GetGridLayers(),
static_cast<IU
>(0));
// Raw storage (no construction) — chunks below view into this buffer.
3148 std::tuple<LIC,LIC,NUO>* recvTuples =
static_cast<std::tuple<LIC,LIC,NUO>*
> (::operator
new (
sizeof(std::tuple<LIC,LIC,NUO>[totrecv])));
3153 MPI_Alltoallv(C_tuples->tuples, sendcnt, sdispls, MPI_tuple, recvTuples, recvcnt, rdispls, MPI_tuple,
A.getcommgrid3D()->GetFiberWorld());
3157 if(myrank == 0) fprintf(stderr,
"[SUMMA3D]\tAlltoallv: %lf\n", (t3-t2));
// Wrap each layer's received range as a non-owning SpTuples view.
3159 vector<SpTuples<IU, NUO>*> recvChunks(
A.getcommgrid3D()->GetGridLayers());
3160#pragma omp parallel for
3161 for (
int i = 0; i <
A.getcommgrid3D()->GetGridLayers(); i++){
3162 recvChunks[i] =
new SpTuples<LIC, NUO>(recvcnt[i], recvprfl[i*3+1], recvprfl[i*3+2], recvTuples + rdispls[i],
true,
false);
3168 MPI_Type_free(&MPI_tuple);
3175 if(myrank == 0) fprintf(stderr,
"[SUMMA3D]\tReduction time: %lf\n", (t1-t0));
// Final per-process merge of the fiber-received chunks.
3183 SpTuples<IU, NUO> * merged_tuples = MultiwayMergeHash<SR, IU, NUO>(recvChunks, recvChunks[0]->getnrow(), recvChunks[0]->getncol(),
false,
false);
3186 if(myrank == 0) fprintf(stderr,
"[SUMMA3D]\tMerge_fiber_time: %lf\n", (t1-t0));
3189 UDERO * localResultant =
new UDERO(*merged_tuples,
false);
3190 delete merged_tuples;
// Free the raw tuple buffer; mark chunks as non-owning before deleting so
// their destructors skip the shared storage.
3194 ::operator
delete(recvTuples);
3195 for(
int i = 0; i < recvChunks.size(); i++){
3196 recvChunks[i]->tuples_deleted =
true;
3197 delete recvChunks[i];
3199 vector<SpTuples<IU,NUO>*>().swap(recvChunks);
// Assemble the 3D result on a fresh (equivalent) 3D grid.
3204 std::shared_ptr<CommGrid3D> grid3d;
3205 grid3d.reset(
new CommGrid3D(
A.getcommgrid3D()->GetWorld(),
A.getcommgrid3D()->GetGridLayers(),
A.getcommgrid3D()->GetGridRows(),
A.getcommgrid3D()->GetGridCols(),
A.isSpecial()));
3206 SpParMat3D<IU, NUO, UDERO>
C(localResultant, grid3d,
A.isColSplit(),
A.isSpecial());
3214template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
3215SpParMat3D<IU, NUO, UDERO>
MemEfficientSpGEMM3D(SpParMat3D<IU, NU1, UDERA> &
A, SpParMat3D<IU, NU2, UDERB> &
B,
3216 int phases, NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct,
int kselectVersion,
int computationKernel,
int64_t perProcessMemory){
3218 MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
3219 typedef typename UDERA::LocalIT LIA;
3220 typedef typename UDERB::LocalIT LIB;
3221 typedef typename UDERO::LocalIT LIC;
3226 if(
A.getncol() !=
B.getnrow()){
3227 std::ostringstream outs;
3228 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
3229 outs <<
A.getncol() <<
" != " <<
B.getnrow() << std::endl;
3237 if(phases < 1 || phases >=
B.getncol()){
3238 SpParHelper::Print(
"[MemEfficientSpGEMM3D]\tThe value of phases is too small or large. Resetting to 1.\n");
3241 double t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
3243 MPI_Barrier(
B.getcommgrid3D()->GetWorld());
3250 if(perProcessMemory > 0) {
3251 int p, calculatedPhases;
3252 MPI_Comm_size(
A.getcommgrid3D()->GetLayerWorld(),&p);
3253 int64_t perNNZMem_in =
sizeof(IU)*2 +
sizeof(NU1);
3254 int64_t perNNZMem_out =
sizeof(IU)*2 +
sizeof(NUO);
3256 int64_t lannz =
A.GetLayerMat()->getlocalnnz();
3259 MPI_Allreduce(&lannz, &gannz, 1,
MPIType<int64_t>(), MPI_MAX,
A.getcommgrid3D()->GetWorld());
3261 int64_t ginputMem = gannz * perNNZMem_in * 5;
3267 MPI_Allreduce(&asquareNNZ, &gasquareNNZ, 1,
MPIType<int64_t>(), MPI_MAX,
A.getcommgrid3D()->GetFiberWorld());
3270 int64_t gasquareMem = gasquareNNZ * perNNZMem_out * 2;
3272 int64_t d = ceil( ( ( gasquareNNZ /
B.getcommgrid3D()->GetGridLayers() ) * sqrt(p) ) /
B.GetLayerMat()->getlocalcols() );
3274 int64_t k = std::min(
int64_t(std::max(selectNum, recoverNum)), d );
3277 int64_t postKselectOutputNNZ = ceil(( (
B.GetLayerMat()->getlocalcols() /
B.getcommgrid3D()->GetGridLayers() ) * k)/sqrt(p));
3278 int64_t postKselectOutputMem = postKselectOutputNNZ * perNNZMem_out * 2;
3279 double remainingMem = perProcessMemory*1000000000 - ginputMem - postKselectOutputMem;
3280 int64_t kselectMem =
B.GetLayerMat()->getlocalcols() * k *
sizeof(NUO) * 3;
3283 if(remainingMem > 0){
3284 calculatedPhases = ceil( (gasquareMem + kselectMem) / remainingMem );
3286 else calculatedPhases = -1;
3288 int gCalculatedPhases;
3289 MPI_Allreduce(&calculatedPhases, &gCalculatedPhases, 1, MPI_INT, MPI_MAX,
A.getcommgrid3D()->GetFiberWorld());
3290 if(gCalculatedPhases > phases) phases = gCalculatedPhases;
3296 MPI_Barrier(
B.getcommgrid3D()->GetWorld());
3306 vector<LIB> divisions3d;
3309 B.CalculateColSplitDistributionOfLayer(divisions3d);
3315 vector<UDERB*> PiecesOfB;
3316 vector<UDERB*> tempPiecesOfB;
3317 UDERB CopyB = *(
B.GetLayerMat()->seqptr());
3318 CopyB.ColSplit(divisions3d, tempPiecesOfB);
3319 for(
int i = 0; i < tempPiecesOfB.size(); i++){
3320 vector<UDERB*> temp;
3321 tempPiecesOfB[i]->ColSplit(phases, temp);
3322 for(
int j = 0; j < temp.size(); j++){
3323 PiecesOfB.push_back(temp[j]);
3327 vector<UDERO> toconcatenate;
3332 for(
int p = 0; p < phases; p++){
3337 vector<LIB> lbDivisions3d;
3338 LIB totalLocalColumnInvolved = 0;
3339 vector<UDERB*> targetPiecesOfB;
3340 for(
int i = 0; i < PiecesOfB.size(); i++){
3341 if(i % phases == p){
3342 targetPiecesOfB.push_back(
new UDERB(*(PiecesOfB[i])));
3343 lbDivisions3d.push_back(PiecesOfB[i]->getncol());
3344 totalLocalColumnInvolved += PiecesOfB[i]->getncol();
3351 UDERB * OnePieceOfB =
new UDERB(0, (
B.GetLayerMat())->seqptr()->getnrow(), totalLocalColumnInvolved, 0);
3352 OnePieceOfB->ColConcatenate(targetPiecesOfB);
3353 vector<UDERB*>().swap(targetPiecesOfB);
3359 SpParMat<IU, NU2, UDERB> OnePieceOfBLayer(OnePieceOfB,
A.getcommgrid3D()->GetLayerWorld());
3368 std::shared_ptr<CommGrid> GridC =
ProductGrid((
A.GetLayerMat()->getcommgrid()).get(),
3369 (OnePieceOfBLayer.getcommgrid()).get(),
3370 stages, dummy, dummy);
3371 LIA C_m =
A.GetLayerMat()->seqptr()->getnrow();
3372 LIB C_n = OnePieceOfBLayer.seqptr()->getncol();
3374 LIA ** ARecvSizes = SpHelper::allocate2D<LIA>(UDERA::esscount, stages);
3375 LIB ** BRecvSizes = SpHelper::allocate2D<LIB>(UDERB::esscount, stages);
3378 SpParHelper::GetSetSizes( *(OnePieceOfBLayer.seqptr()), BRecvSizes, (OnePieceOfBLayer.getcommgrid())->GetColWorld() );
3383 std::vector< SpTuples<LIC,NUO> *> tomerge;
3385 int Aself = (
A.GetLayerMat()->getcommgrid())->GetRankInProcRow();
3386 int Bself = (OnePieceOfBLayer.getcommgrid())->GetRankInProcCol();
3388 double Abcast_time = 0;
3389 double Bbcast_time = 0;
3390 double Local_multiplication_time = 0;
3392 for(
int i = 0; i < stages; ++i) {
3393 std::vector<LIA> ess;
3396 ARecv =
A.GetLayerMat()->seqptr();
3399 ess.resize(UDERA::esscount);
3400 for(
int j=0; j<UDERA::esscount; ++j) {
3401 ess[j] = ARecvSizes[j][i];
3403 ARecv =
new UDERA();
3412 Arr<LIA,NU1> Aarrinfo = ARecv->GetArrays();
3414 for(
unsigned int idx = 0; idx < Aarrinfo.indarrs.size(); ++idx) {
3415 MPI_Bcast(Aarrinfo.indarrs[idx].addr, Aarrinfo.indarrs[idx].count, MPIType<IU>(), i, GridC->GetRowWorld());
3418 for(
unsigned int idx = 0; idx < Aarrinfo.numarrs.size(); ++idx) {
3419 MPI_Bcast(Aarrinfo.numarrs[idx].addr, Aarrinfo.numarrs[idx].count, MPIType<NU1>(), i, GridC->GetRowWorld());
3424 Abcast_time += (t3-t2);
3428 BRecv = OnePieceOfBLayer.seqptr();
3431 ess.resize(UDERB::esscount);
3432 for(
int j=0; j<UDERB::esscount; ++j) {
3433 ess[j] = BRecvSizes[j][i];
3435 BRecv =
new UDERB();
3438 MPI_Barrier(
A.GetLayerMat()->getcommgrid()->GetWorld());
3445 Arr<LIB,NU2> Barrinfo = BRecv->GetArrays();
3447 for(
unsigned int idx = 0; idx < Barrinfo.indarrs.size(); ++idx) {
3448 MPI_Bcast(Barrinfo.indarrs[idx].addr, Barrinfo.indarrs[idx].count, MPIType<IU>(), i, GridC->GetColWorld());
3450 for(
unsigned int idx = 0; idx < Barrinfo.numarrs.size(); ++idx) {
3451 MPI_Bcast(Barrinfo.numarrs[idx].addr, Barrinfo.numarrs[idx].count, MPIType<NU2>(), i, GridC->GetColWorld());
3456 Bbcast_time += (t3-t2);
3462 SpTuples<LIC,NUO> * C_cont;
3463 if(computationKernel == 1){
3464 C_cont = LocalSpGEMMHash<SR, NUO>
3470 else if(computationKernel == 2){
3471 C_cont = LocalSpGEMM<SR, NUO>
3481 Local_multiplication_time += (t3-t2);
3484 if(!C_cont->isZero()) tomerge.push_back(C_cont);
3493 SpTuples<LIC,NUO> * C_tuples;
3494 if(computationKernel == 1) C_tuples = MultiwayMergeHash<SR>(tomerge, C_m, C_n,
true,
true);
3495 else if(computationKernel == 2) C_tuples = MultiwayMerge<SR>(tomerge, C_m, C_n,
true);
3504 fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tAbcast_time: %lf\n", p, Abcast_time);
3505 fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tBbcast_time: %lf\n", p, Bbcast_time);
3506 fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tLocal_multiplication_time: %lf\n", p, Local_multiplication_time);
3507 fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tSUMMA Merge time: %lf\n", p, (t3-t2));
3516 if(myrank == 0) fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tSUMMA time: %lf\n", p, (t1-t0));
3526 MPI_Datatype MPI_tuple;
3527 MPI_Type_contiguous(
sizeof(std::tuple<LIC,LIC,NUO>), MPI_CHAR, &MPI_tuple);
3528 MPI_Type_commit(&MPI_tuple);
3535 int * sendcnt =
new int[
A.getcommgrid3D()->GetGridLayers()];
3536 int * sendprfl =
new int[
A.getcommgrid3D()->GetGridLayers()*3];
3537 int * sdispls =
new int[
A.getcommgrid3D()->GetGridLayers()]();
3538 int * recvcnt =
new int[
A.getcommgrid3D()->GetGridLayers()];
3539 int * recvprfl =
new int[
A.getcommgrid3D()->GetGridLayers()*3];
3540 int * rdispls =
new int[
A.getcommgrid3D()->GetGridLayers()]();
3542 vector<LIC> lbDivisions3dPrefixSum(lbDivisions3d.size());
3543 lbDivisions3dPrefixSum[0] = 0;
3544 std::partial_sum(lbDivisions3d.begin(), lbDivisions3d.end()-1, lbDivisions3dPrefixSum.begin()+1);
3545 ColLexiCompare<LIC,NUO> comp;
3546 LIC totsend = C_tuples->getnnz();
3549 if(myrank == 0) fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tAllocation of alltoall information: %lf\n", p, (t3-t2));
3555#pragma omp parallel for
3556 for(
int i=0; i <
A.getcommgrid3D()->GetGridLayers(); ++i){
3557 LIC start_col = lbDivisions3dPrefixSum[i];
3558 LIC end_col = lbDivisions3dPrefixSum[i] + lbDivisions3d[i];
3559 std::tuple<LIC, LIC, NUO> search_tuple_start(0, start_col, NUO());
3560 std::tuple<LIC, LIC, NUO> search_tuple_end(0, end_col, NUO());
3561 std::tuple<LIC, LIC, NUO>* start_it = std::lower_bound(C_tuples->tuples, C_tuples->tuples + C_tuples->getnnz(), search_tuple_start, comp);
3562 std::tuple<LIC, LIC, NUO>* end_it = std::lower_bound(C_tuples->tuples, C_tuples->tuples + C_tuples->getnnz(), search_tuple_end, comp);
3564 sendcnt[i] = (int)(end_it - start_it);
3565 sendprfl[i*3+0] = (int)(sendcnt[i]);
3566 sendprfl[i*3+1] = (int)(
A.GetLayerMat()->seqptr()->getnrow());
3567 sendprfl[i*3+2] = (int)(lbDivisions3d[i]);
3569 std::partial_sum(sendcnt, sendcnt+
A.getcommgrid3D()->GetGridLayers()-1, sdispls+1);
3572 if(myrank == 0) fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tGetting Alltoall data ready: %lf\n", p, (t3-t2));
3579 for(
int i=0; i <
A.getcommgrid3D()->GetGridLayers(); ++i){
3580#pragma omp parallel for schedule(static)
3581 for(
int j = 0; j < sendcnt[i]; j++){
3582 std::get<1>(C_tuples->tuples[sdispls[i]+j]) = std::get<1>(C_tuples->tuples[sdispls[i]+j]) - lbDivisions3dPrefixSum[i];
3587 if(myrank == 0) fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tGetting Alltoallv data ready: %lf\n", p, (t3-t2));
3593 MPI_Alltoall(sendprfl, 3, MPI_INT, recvprfl, 3, MPI_INT,
A.getcommgrid3D()->GetFiberWorld());
3596 if(myrank == 0) fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tAlltoall: %lf\n", p, (t3-t2));
3601 for(
int i = 0; i <
A.getcommgrid3D()->GetGridLayers(); i++) recvcnt[i] = recvprfl[i*3];
3602 std::partial_sum(recvcnt, recvcnt+
A.getcommgrid3D()->GetGridLayers()-1, rdispls+1);
3603 LIC totrecv = std::accumulate(recvcnt,recvcnt+
A.getcommgrid3D()->GetGridLayers(),
static_cast<IU
>(0));
3604 std::tuple<LIC,LIC,NUO>* recvTuples =
static_cast<std::tuple<LIC,LIC,NUO>*
> (::operator
new (
sizeof(std::tuple<LIC,LIC,NUO>[totrecv])));
3607 if(myrank == 0) fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tAllocation of receive data: %lf\n", p, (t3-t2));
3613 MPI_Alltoallv(C_tuples->tuples, sendcnt, sdispls, MPI_tuple, recvTuples, recvcnt, rdispls, MPI_tuple,
A.getcommgrid3D()->GetFiberWorld());
3617 if(myrank == 0) fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tAlltoallv: %lf\n", p, (t3-t2));
3622 vector<SpTuples<LIC, NUO>*> recvChunks(
A.getcommgrid3D()->GetGridLayers());
3623#pragma omp parallel for
3624 for (
int i = 0; i <
A.getcommgrid3D()->GetGridLayers(); i++){
3625 recvChunks[i] =
new SpTuples<LIC, NUO>(recvcnt[i], recvprfl[i*3+1], recvprfl[i*3+2], recvTuples + rdispls[i],
true,
false);
3629 if(myrank == 0) fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\trecvChunks creation: %lf\n", p, (t3-t2));
3638 MPI_Type_free(&MPI_tuple);
3641 if(myrank == 0) fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tMemory freeing: %lf\n", p, (t3-t2));
3650 if(myrank == 0) fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tReduction time: %lf\n", p, (t1-t0));
3658 SpTuples<LIC, NUO> * merged_tuples;
3660 if(computationKernel == 1) merged_tuples = MultiwayMergeHash<SR, LIC, NUO>(recvChunks, recvChunks[0]->getnrow(), recvChunks[0]->getncol(),
false,
false);
3661 else if(computationKernel == 2) merged_tuples = MultiwayMerge<SR, LIC, NUO>(recvChunks, recvChunks[0]->getnrow(), recvChunks[0]->getncol(),
false);
3665 if(myrank == 0) fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\t3D Merge time: %lf\n", p, (t1-t0));
3674 ::operator
delete(recvTuples);
3675 for(
int i = 0; i < recvChunks.size(); i++){
3676 recvChunks[i]->tuples_deleted =
true;
3677 delete recvChunks[i];
3679 vector<SpTuples<LIC,NUO>*>().swap(recvChunks);
3683 UDERO * phaseResultant =
new UDERO(*merged_tuples,
false);
3684 delete merged_tuples;
3685 SpParMat<IU, NUO, UDERO> phaseResultantLayer(phaseResultant,
A.getcommgrid3D()->GetLayerWorld());
3686 MCLPruneRecoverySelect(phaseResultantLayer, hardThreshold, selectNum, recoverNum, recoverPct, kselectVersion);
3690 if(myrank == 0) fprintf(stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tMCLPruneRecoverySelect time: %lf\n",p, (t1-t0));
3692 toconcatenate.push_back(phaseResultantLayer.seq());
3694 if(myrank == 0) fprintf(stderr,
"***\n");
3697 for(
int i = 0; i < PiecesOfB.size(); i++)
delete PiecesOfB[i];
3699 std::shared_ptr<CommGrid3D> grid3d;
3700 grid3d.reset(
new CommGrid3D(
A.getcommgrid3D()->GetWorld(),
A.getcommgrid3D()->GetGridLayers(),
A.getcommgrid3D()->GetGridRows(),
A.getcommgrid3D()->GetGridCols(),
A.isSpecial()));
3701 UDERO * localResultant =
new UDERO(0,
A.GetLayerMat()->seqptr()->getnrow(), divisions3d[
A.getcommgrid3D()->GetRankInFiber()], 0);
3702 localResultant->ColConcatenate(toconcatenate);
3703 SpParMat3D<IU, NUO, UDERO> C3D(localResultant, grid3d,
A.isColSplit(),
A.isSpecial());
double cblas_transvectime
double cblas_localspmvtime
double cblas_allgathertime
double cblas_alltoalltime
double cblas_mergeconttime
double mcl_localspgemmtime
double mcl_multiwaymergetime
double mcl_prunecolumntime
double mcl3d_symbolictime
double mcl3d_SUMMAmergetime
double mcl3d_reductiontime
double mcl3d_localspgemmtime
std::shared_ptr< CommGrid > commGrid
void save(std::basic_ostream< c, t > &os, std::array< V, NROUNDS > &sample_vec, int64_t index)
static void deallocate2D(T **array, I m)
static const T * p2a(const std::vector< T > &v)
static void GetSetSizes(const SpMat< IT, NT, DER > &Matrix, IT **&sizes, MPI_Comm &comm1d)
static void BCastMatrix(MPI_Comm &comm1d, SpMat< IT, NT, DER > &Matrix, const std::vector< IT > &essentials, int root)
static void Print(const std::string &s)
static void IBCastMatrix(MPI_Comm &comm1d, SpMat< IT, NT, DER > &Matrix, const std::vector< IT > &essentials, int root, std::vector< MPI_Request > &indarrayReq, std::vector< MPI_Request > &numarrayReq)
SpParMat3D< IU, NUO, UDERO > Mult_AnXBn_SUMMA3D(SpParMat3D< IU, NU1, UDER1 > &A, SpParMat3D< IU, NU2, UDER2 > &B)
IT * estimateFLOP(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, IT *aux=nullptr)
int64_t EstPerProcessNnzSUMMA(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool hashEstimate)
void MCLPruneRecoverySelect(SpParMat< IT, NT, DER > &A, NT hardThreshold, IT selectNum, IT recoverNum, NT recoverPct, int kselectVersion)
MPI_Datatype MPIType< int32_t >(void)
SpParMat3D< IU, NUO, UDERO > MemEfficientSpGEMM3D(SpParMat3D< IU, NU1, UDERA > &A, SpParMat3D< IU, NU2, UDERB > &B, int phases, NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct, int kselectVersion, int computationKernel, int64_t perProcessMemory)
void MergeContributions(FullyDistSpVec< IU, VT > &y, int *&recvcnt, int *&rdispls, int32_t *&recvindbuf, VT *&recvnumbuf, int rowneighs)
FullyDistSpVec< IU, RET > EWiseApply_threaded(const FullyDistSpVec< IU, NU1 > &V, const FullyDistVec< IU, NU2 > &W, _BinaryOperation _binary_op, _BinaryPredicate _doOp, bool allowVNulls, NU1 Vzero, const bool useExtendedBinOp)
void CheckSpMVCompliance(const MATRIX &A, const VECTOR &x)
Dcsc< IU, typename promote_trait< NU1, NU2 >::T_promote > SetDifference(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B)
int64_t EstPerProcessNnzSpMV(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B)
int CalculateNumberOfPhases(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct, int kselectVersion, int64_t perProcessMemory)
SpParMat< IU, NUO, UDERO > Mult_AnXBn_Overlap(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
SpParMat< IU, NUO, UDERO > MemEfficientSpGEMM(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, int phases, NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct, int kselectVersion, int computationKernel, int64_t perProcessMemory)
FullyDistVec< IT, NT > Concatenate(std::vector< FullyDistVec< IT, NT > > &vecs)
SpParMat< IU, NUO, UDERO > Mult_AnXBn_Synch(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
MPI_Datatype MPIType< int64_t >(void)
void AllGatherVector(MPI_Comm &ColWorld, int trxlocnz, IU lenuntil, int32_t *&trxinds, NV *&trxnums, int32_t *&indacc, NV *&numacc, int &accnz, bool indexisvalue)
IU EstimateFLOP(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
SpParMat< IU, NUO, UDERO > Mult_AnXBn_DoubleBuff(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
Dcsc< IU, N_promote > EWiseApply(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, _BinaryOperation __binary_op, bool notB, const NU2 &defaultBVal)
void TransposeVector(MPI_Comm &World, const FullyDistSpVec< IU, NV > &x, int32_t &trxlocnz, IU &lenuntil, int32_t *&trxinds, NV *&trxnums, bool indexisvalue)
void LocalSpMV(const SpParMat< IT, bool, UDER > &A, int rowneighs, OptBuf< int32_t, VT > &optbuf, int32_t *&indacc, VT *&numacc, int *sendcnt, int accnz)
std::array< float, NROUNDS > samparr_t
void MergeContributions_threaded(int *&listSizes, std::vector< int32_t * > &indsvec, std::vector< OVT * > &numsvec, std::vector< IU > &mergedind, std::vector< OVT > &mergednum, IU maxindex)
bool CheckSpGEMMCompliance(const MATRIXA &A, const MATRIXB &B)
Dcsc< IU, typename promote_trait< NU1, NU2 >::T_promote > EWiseMult(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, bool exclude)
shared_ptr< CommGrid > ProductGrid(CommGrid *gridA, CommGrid *gridB, int &innerdim, int &Aoffset, int &Boffset)
IT * estimateNNZ_Hash(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, IT *flopC, IT *aux=nullptr)
static void MPI_func(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype)
static void axpy(const NZT a, const samparr_t &x, samparr_t &y)
static samparr_t multiply(const NZT arg1, const samparr_t &arg2)
static samparr_t add(const samparr_t &arg1, const samparr_t &arg2)
static bool returnedSAID()