COMBINATORIAL_BLAS 1.6
 
Loading...
Searching...
No Matches
ParFriends.h
Go to the documentation of this file.
1/****************************************************************/
2/* Parallel Combinatorial BLAS Library (for Graph Computations) */
3/* version 1.6 -------------------------------------------------*/
4/* date: 6/15/2017 ---------------------------------------------*/
5/* authors: Ariful Azad, Aydin Buluc --------------------------*/
6/****************************************************************/
7/*
8 Copyright (c) 2010-2017, The Regents of the University of California
9
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 THE SOFTWARE.
27 */
28
29
30#ifndef _PAR_FRIENDS_H_
31#define _PAR_FRIENDS_H_
32
33#include "mpi.h"
34#include <iostream>
35#include <cstdarg>
36#include "SpParMat.h"
37#include "SpParMat3D.h"
38#include "SpParHelper.h"
39#include "MPIType.h"
40#include "Friends.h"
41#include "OptBuf.h"
42#include "mtSpGEMM.h"
43#include "MultiwayMerge.h"
44#include <unistd.h>
45#include <type_traits>
46
47namespace combblas {
48
49template <class IT, class NT, class DER>
50class SpParMat;
51
52/*************************************************************************************************/
53/**************************** FRIEND FUNCTIONS FOR PARALLEL CLASSES ******************************/
54/*************************************************************************************************/
55
56
60template <typename IT, typename NT>
62{
63 if(vecs.size() < 1)
64 {
65 SpParHelper::Print("Warning: Nothing to concatenate, returning empty ");
66 return FullyDistVec<IT,NT>();
67 }
68 else if (vecs.size() < 2)
69 {
70 return vecs[1];
71
72 }
73 else
74 {
75 typename std::vector< FullyDistVec<IT,NT> >::iterator it = vecs.begin();
76 std::shared_ptr<CommGrid> commGridPtr = it->getcommgrid();
77 MPI_Comm World = commGridPtr->GetWorld();
78
79 IT nglen = it->TotalLength(); // new global length
80 IT cumloclen = it->MyLocLength(); // existing cumulative local lengths
81 ++it;
82 for(; it != vecs.end(); ++it)
83 {
84 if(*(commGridPtr) != *(it->getcommgrid()))
85 {
86 SpParHelper::Print("Grids are not comparable for FullyDistVec<IT,NT>::EWiseApply\n");
88 }
89 nglen += it->TotalLength();
90 cumloclen += it->MyLocLength();
91 }
93 int nprocs = commGridPtr->GetSize();
94
95 std::vector< std::vector< NT > > data(nprocs);
96 std::vector< std::vector< IT > > inds(nprocs);
97 IT gloffset = 0;
98 for(it = vecs.begin(); it != vecs.end(); ++it)
99 {
100 IT loclen = it->LocArrSize();
101 for(IT i=0; i < loclen; ++i)
102 {
103 IT locind;
104 IT loffset = it->LengthUntil();
105 int owner = ConCat.Owner(gloffset+loffset+i, locind);
106 data[owner].push_back(it->arr[i]);
107 inds[owner].push_back(locind);
108 }
109 gloffset += it->TotalLength();
110 }
111
112 int * sendcnt = new int[nprocs];
113 int * sdispls = new int[nprocs];
114 for(int i=0; i<nprocs; ++i)
115 sendcnt[i] = (int) data[i].size();
116
117 int * rdispls = new int[nprocs];
118 int * recvcnt = new int[nprocs];
119 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World); // share the request counts
120 sdispls[0] = 0;
121 rdispls[0] = 0;
122 for(int i=0; i<nprocs-1; ++i)
123 {
124 sdispls[i+1] = sdispls[i] + sendcnt[i];
125 rdispls[i+1] = rdispls[i] + recvcnt[i];
126 }
127 IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs,static_cast<IT>(0));
128 NT * senddatabuf = new NT[cumloclen];
129 for(int i=0; i<nprocs; ++i)
130 {
131 std::copy(data[i].begin(), data[i].end(), senddatabuf+sdispls[i]);
132 std::vector<NT>().swap(data[i]); // delete data vectors
133 }
134 NT * recvdatabuf = new NT[totrecv];
135 MPI_Alltoallv(senddatabuf, sendcnt, sdispls, MPIType<NT>(), recvdatabuf, recvcnt, rdispls, MPIType<NT>(), World); // send data
136 delete [] senddatabuf;
137
138 IT * sendindsbuf = new IT[cumloclen];
139 for(int i=0; i<nprocs; ++i)
140 {
141 std::copy(inds[i].begin(), inds[i].end(), sendindsbuf+sdispls[i]);
142 std::vector<IT>().swap(inds[i]); // delete inds vectors
143 }
144 IT * recvindsbuf = new IT[totrecv];
145 MPI_Alltoallv(sendindsbuf, sendcnt, sdispls, MPIType<IT>(), recvindsbuf, recvcnt, rdispls, MPIType<IT>(), World); // send new inds
146 DeleteAll(sendindsbuf, sendcnt, sdispls);
147
148 for(int i=0; i<nprocs; ++i)
149 {
150 for(int j = rdispls[i]; j < rdispls[i] + recvcnt[i]; ++j)
151 {
153 }
154 }
155 DeleteAll(recvindsbuf, recvcnt, rdispls);
156 return ConCat;
157 }
158}
159
/// Pre-flight checks for a distributed SpGEMM C = A*B.
/// Verifies (1) inner dimensions agree (ncol(A) == nrow(B)) and
/// (2) A and B are not the same object (the SUMMA kernels cannot run in-place).
/// @return true when multiplication may proceed, false otherwise.
/// NOTE(review): the declaration line and the statements that emit the
/// diagnostic (likely SpParHelper::Print and/or MPI_Abort) were lost in the
/// HTML-listing extraction; the signature is reconstructed from upstream
/// CombBLAS — confirm against the repository.
template <typename MATRIXA, typename MATRIXB>
bool CheckSpGEMMCompliance(const MATRIXA & A, const MATRIXB & B)
{
	if(A.getncol() != B.getnrow())
	{
		std::ostringstream outs;
		outs << "Can not multiply, dimensions does not match"<< std::endl;
		outs << A.getncol() << " != " << B.getnrow() << std::endl;
		// NOTE(review): 'outs' is built but the printing call is missing
		// from this extract — restore SpParHelper::Print(outs.str()) upstream.
		return false;
	}
	if((void*) &A == (void*) &B)
	{
		std::ostringstream outs;
		outs << "Can not multiply, inputs alias (make a temporary copy of one of them first)"<< std::endl;
		// NOTE(review): same as above — the emit call is missing from this extract.
		return false;
	}
	return true;
}
182
183
184// Combined logic for prune, recovery, and select
185template <typename IT, typename NT, typename DER>
187{
188 int myrank;
190
191#ifdef TIMING
192 double t0, t1;
193#endif
194
195 // Prune and create a new pruned matrix
196 SpParMat<IT,NT,DER> PrunedA = A.Prune(std::bind2nd(std::less_equal<NT>(), hardThreshold), false);
197 // column-wise statistics of the pruned matrix
198 FullyDistVec<IT,NT> colSums = PrunedA.Reduce(Column, std::plus<NT>(), 0.0);
199 FullyDistVec<IT,NT> nnzPerColumnUnpruned = A.Reduce(Column, std::plus<NT>(), 0.0, [](NT val){return 1.0;});
200 FullyDistVec<IT,NT> nnzPerColumn = PrunedA.Reduce(Column, std::plus<NT>(), 0.0, [](NT val){return 1.0;});
201 //FullyDistVec<IT,NT> pruneCols(A.getcommgrid(), A.getncol(), hardThreshold);
204
205 PrunedA.FreeMemory();
206
207 FullyDistSpVec<IT,NT> recoverCols(nnzPerColumn, std::bind2nd(std::less<NT>(), recoverNum));
208
209 // recover only when nnzs in unprunned columns are greater than nnzs in pruned column
211 [](NT spval, NT dval){return spval;},
212 [](NT spval, NT dval){return dval > spval;},
213 false, NT());
214
215
217 // columns with nnz < r AND sum < recoverPct (pct)
219 [](NT spval, NT dval){return spval;},
220 [](NT spval, NT dval){return dval < spval;},
221 false, NT());
222
223 IT nrecover = recoverCols.getnnz();
224
225 if(nrecover > 0)
226 {
227#ifdef TIMING
228 t0=MPI_Wtime();
229#endif
230 A.Kselect(recoverCols, recoverNum, kselectVersion);
231
232#ifdef TIMING
233 t1=MPI_Wtime();
234 mcl_kselecttime += (t1-t0);
235#endif
236
238
239#ifdef COMBBLAS_DEBUG
240 std::ostringstream outs;
241 outs << "Number of columns needing recovery: " << nrecover << std::endl;
243#endif
244
245 }
246
247 if(selectNum>0)
248 {
249 // remaining columns will be up for selection
251 [](NT spval, NT dval){return spval;},
252 [](NT spval, NT dval){return spval==-1;},
253 true, static_cast<NT>(-1));
254
257 [](NT spval, NT dval){return spval;},
258 [](NT spval, NT dval){return dval > spval;},
259 false, NT());
260 IT nselect = selectCols.getnnz();
261
262 if(nselect > 0 )
263 {
264#ifdef TIMING
265 t0=MPI_Wtime();
266#endif
267 A.Kselect(selectCols, selectNum, kselectVersion); // PrunedA would also work
268#ifdef TIMING
269 t1=MPI_Wtime();
270 mcl_kselecttime += (t1-t0);
271#endif
272
274#ifdef COMBBLAS_DEBUG
275 std::ostringstream outs;
276 outs << "Number of columns needing selection: " << nselect << std::endl;
278#endif
279#ifdef TIMING
280 t0=MPI_Wtime();
281#endif
282 SpParMat<IT,NT,DER> selectedA = A.PruneColumn(pruneCols, std::less<NT>(), false);
283#ifdef TIMING
284 t1=MPI_Wtime();
286#endif
287 if(recoverNum>0 ) // recovery can be attempted after selection
288 {
289
290 FullyDistVec<IT,NT> nnzPerColumn1 = selectedA.Reduce(Column, std::plus<NT>(), 0.0, [](NT val){return 1.0;});
291 FullyDistVec<IT,NT> colSums1 = selectedA.Reduce(Column, std::plus<NT>(), 0.0);
292 selectedA.FreeMemory();
293
294 // slected columns with nnz < recoverNum (r)
297 [](NT spval, NT dval){return spval;},
298 [](NT spval, NT dval){return dval < spval;},
299 false, NT());
300
301 // selected columns with sum < recoverPct (pct)
304 [](NT spval, NT dval){return spval;},
305 [](NT spval, NT dval){return dval < spval;},
306 false, NT());
307
310 {
311 // mclExpandVector2 does it on the original vector
312 // mclExpandVector1 does it one pruned vector
313#ifdef TIMING
314 t0=MPI_Wtime();
315#endif
316 A.Kselect(selectCols, recoverNum, kselectVersion); // Kselect on PrunedA might give different result
317#ifdef TIMING
318 t1=MPI_Wtime();
319 mcl_kselecttime += (t1-t0);
320#endif
322
323#ifdef COMBBLAS_DEBUG
324 std::ostringstream outs1;
325 outs1 << "Number of columns needing recovery after selection: " << nselect << std::endl;
327#endif
328 }
329
330 }
331 }
332 }
333
334 // final prune
335#ifdef TIMING
337#endif
338 A.PruneColumn(pruneCols, std::less<NT>(), true);
339#ifdef TIMING
340 t1=MPI_Wtime();
342#endif
343 // Add loops for empty columns
344 if(recoverNum<=0 ) // if recoverNum>0, recovery would have added nonzeros in empty columns
345 {
346 FullyDistVec<IT,NT> nnzPerColumnA = A.Reduce(Column, std::plus<NT>(), 0.0, [](NT val){return 1.0;});
347 FullyDistSpVec<IT,NT> emptyColumns(nnzPerColumnA, std::bind2nd(std::equal_to<NT>(), 0.0));
348 emptyColumns = 1.00;
349 //Ariful: We need a selective AddLoops function with a sparse vector
350 //A.AddLoops(emptyColumns);
351 }
352
353}
354
355template <typename SR, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
357 (SpParMat<IU,NU1,UDERA> & A, SpParMat<IU,NU2,UDERB> & B, bool clearA = false, bool clearB = false)
358
360 int myrank;
362 int stages, dummy; // last two parameters of ProductGrid are ignored for Synch multiplication
363 std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy);
364 IU C_m = A.spSeq->getnrow();
365 IU C_n = B.spSeq->getncol();
366
367 //const_cast< UDERB* >(B.spSeq)->Transpose(); // do not transpose for colum-by-column multiplication
369 IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
370 IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
371
372 SpParHelper::GetSetSizes( *(A.spSeq), ARecvSizes, (A.commGrid)->GetRowWorld());
373 SpParHelper::GetSetSizes( *(B.spSeq), BRecvSizes, (B.commGrid)->GetColWorld());
374
375 // Remotely fetched matrices are stored as pointers
377 UDERB * BRecv;
378 IU local_flops = 0;
380 int Aself = (A.commGrid)->GetRankInProcRow();
381 int Bself = (B.commGrid)->GetRankInProcCol();
382
383 for(int i = 0; i < stages; ++i)
384 {
385 std::vector<IU> ess;
386 if(i == Aself)
387 {
388 ARecv = A.spSeq; // shallow-copy
389 }
390 else
392 ess.resize(UDERA::esscount);
393 for(int j=0; j< UDERA::esscount; ++j)
395 ess[j] = ARecvSizes[j][i]; // essentials of the ith matrix in this row
396 }
397 ARecv = new UDERA(); // first, create the object
398 }
399 SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i); // then, receive its elements
400 ess.clear();
401
402 if(i == Bself)
403 {
404 BRecv = B.spSeq; // shallow-copy
405 }
406 else
407 {
408 ess.resize(UDERB::esscount);
409 for(int j=0; j< UDERB::esscount; ++j)
410 {
411 ess[j] = BRecvSizes[j][i];
412 }
413 BRecv = new UDERB();
414 }
415 SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i); // then, receive its elements
416
417 local_flops += EstimateLocalFLOP<SR>
418 (*ARecv, *BRecv, // parameters themselves
419 i != Aself, // 'delete A' condition
420 i != Bself); // 'delete B' condition
421 }
422
423 if(clearA && A.spSeq != NULL) {
424 delete A.spSeq;
425 A.spSeq = NULL;
426 }
427 if(clearB && B.spSeq != NULL) {
428 delete B.spSeq;
429 B.spSeq = NULL;
430 }
431
432 SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
433 SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);
434
435 //if(!clearB)
436 // const_cast< UDERB* >(B.spSeq)->Transpose(); // transpose back to original
437
438 IU global_flops = 0;
439 MPI_Allreduce(&local_flops, &global_flops, 1, MPI_LONG_LONG_INT, MPI_SUM, A.getcommgrid()->GetWorld());
440 return global_flops;
441}
442
449template <typename SR, typename NUO, typename UDERO, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
452{
453 typedef typename UDERA::LocalIT LIA;
454 typedef typename UDERB::LocalIT LIB;
455 typedef typename UDERO::LocalIT LIC;
456
457 int myrank;
459 if(A.getncol() != B.getnrow())
460 {
461 std::ostringstream outs;
462 outs << "Can not multiply, dimensions does not match"<< std::endl;
463 outs << A.getncol() << " != " << B.getnrow() << std::endl;
467 }
468 if(phases <1 || phases >= A.getncol())
469 {
470 SpParHelper::Print("MemEfficientSpGEMM: The value of phases is too small or large. Resetting to 1.\n");
471 phases = 1;
472 }
473
474 int stages, dummy; // last two parameters of ProductGrid are ignored for Synch multiplication
475 std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy);
476
477 double t0, t1, t2, t3, t4, t5;
478#ifdef TIMING
479 MPI_Barrier(A.getcommgrid()->GetWorld());
480 t0 = MPI_Wtime();
481#endif
482 if(perProcessMemory>0) // estimate the number of phases permitted by memory
483 {
484 int p;
485 MPI_Comm World = GridC->GetWorld();
487
488 int64_t perNNZMem_in = sizeof(IU)*2 + sizeof(NU1);
489 int64_t perNNZMem_out = sizeof(IU)*2 + sizeof(NUO);
490
491 // max nnz(A) in a porcess
492 int64_t lannz = A.getlocalnnz();
495 int64_t inputMem = gannz * perNNZMem_in * 4; // for four copies (two for SUMMA)
496
497 // max nnz(A^2) stored by SUMMA in a porcess
499 int64_t asquareMem = asquareNNZ * perNNZMem_out * 2; // an extra copy in multiway merge and in selection/recovery step
500
501
502 // estimate kselect memory
503 int64_t d = ceil( (asquareNNZ * sqrt(p))/ B.getlocalcols() ); // average nnz per column in A^2 (it is an overestimate because asquareNNZ is estimated based on unmerged matrices)
504 // this is equivalent to (asquareNNZ * p) / B.getcol()
505 int64_t k = std::min(int64_t(std::max(selectNum, recoverNum)), d );
506 int64_t kselectmem = B.getlocalcols() * k * 8 * 3;
507
508 // estimate output memory
509 int64_t outputNNZ = (B.getlocalcols() * k)/sqrt(p);
511
512 //inputMem + outputMem + asquareMem/phases + kselectmem/phases < memory
514 if(remainingMem > 0)
515 {
516 phases = 1 + (asquareMem+kselectmem) / remainingMem;
517 }
518
519
520 if(myrank==0)
521 {
522 if(remainingMem < 0)
523 {
524 std::cout << "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n Warning: input and output memory requirement is greater than per-process avaiable memory. Keeping phase to the value supplied at the command line. The program may go out of memory and crash! \n !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl;
525 }
526#ifdef SHOW_MEMORY_USAGE
528 if(maxMemory>1000000000)
529 std::cout << "phases: " << phases << ": per process memory: " << perProcessMemory << " GB asquareMem: " << asquareMem/1000000000.00 << " GB" << " inputMem: " << inputMem/1000000000.00 << " GB" << " outputMem: " << outputMem/1000000000.00 << " GB" << " kselectmem: " << kselectmem/1000000000.00 << " GB" << std::endl;
530 else
531 std::cout << "phases: " << phases << ": per process memory: " << perProcessMemory << " GB asquareMem: " << asquareMem/1000000.00 << " MB" << " inputMem: " << inputMem/1000000.00 << " MB" << " outputMem: " << outputMem/1000000.00 << " MB" << " kselectmem: " << kselectmem/1000000.00 << " MB" << std::endl;
532#endif
533
534 }
535 }
536
537 //if(myrank == 0){
538 //fprintf(stderr, "[MemEfficientSpGEMM] Running with phase: %d\n", phases);
539 //}
540
541#ifdef TIMING
542 MPI_Barrier(A.getcommgrid()->GetWorld());
543 t1 = MPI_Wtime();
545#endif
546
547 LIA C_m = A.spSeq->getnrow();
548 LIB C_n = B.spSeq->getncol();
549
550 std::vector< UDERB > PiecesOfB;
551 UDERB CopyB = *(B.spSeq); // we allow alias matrices as input because of this local copy
552
553 CopyB.ColSplit(phases, PiecesOfB); // CopyB's memory is destroyed at this point
554 MPI_Barrier(GridC->GetWorld());
555
556 LIA ** ARecvSizes = SpHelper::allocate2D<LIA>(UDERA::esscount, stages);
557 LIB ** BRecvSizes = SpHelper::allocate2D<LIB>(UDERB::esscount, stages);
558
559 static_assert(std::is_same<LIA, LIB>::value, "local index types for both input matrices should be the same");
560 static_assert(std::is_same<LIA, LIC>::value, "local index types for input and output matrices should be the same");
561
562
563 SpParHelper::GetSetSizes( *(A.spSeq), ARecvSizes, (A.commGrid)->GetRowWorld());
564
565 // Remotely fetched matrices are stored as pointers
566 UDERA * ARecv;
567 UDERB * BRecv;
568
569 std::vector< UDERO > toconcatenate;
570
571 int Aself = (A.commGrid)->GetRankInProcRow();
572 int Bself = (B.commGrid)->GetRankInProcCol();
573
574 for(int p = 0; p< phases; ++p)
575 {
576 SpParHelper::GetSetSizes( PiecesOfB[p], BRecvSizes, (B.commGrid)->GetColWorld());
577 std::vector< SpTuples<LIC,NUO> *> tomerge;
578 for(int i = 0; i < stages; ++i)
579 {
580 std::vector<LIA> ess;
581 if(i == Aself) ARecv = A.spSeq; // shallow-copy
582 else
583 {
584 ess.resize(UDERA::esscount);
585 for(int j=0; j< UDERA::esscount; ++j)
586 ess[j] = ARecvSizes[j][i]; // essentials of the ith matrix in this row
587 ARecv = new UDERA(); // first, create the object
588 }
589
590#ifdef TIMING
591 MPI_Barrier(A.getcommgrid()->GetWorld());
592 t0 = MPI_Wtime();
593#endif
594 SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i); // then, receive its elements
595#ifdef TIMING
596 MPI_Barrier(A.getcommgrid()->GetWorld());
597 t1 = MPI_Wtime();
598 mcl_Abcasttime += (t1-t0);
599#endif
600 ess.clear();
601
602 if(i == Bself) BRecv = &(PiecesOfB[p]); // shallow-copy
603 else
604 {
605 ess.resize(UDERB::esscount);
606 for(int j=0; j< UDERB::esscount; ++j)
607 ess[j] = BRecvSizes[j][i];
608 BRecv = new UDERB();
609 }
610#ifdef TIMING
611 MPI_Barrier(A.getcommgrid()->GetWorld());
612 double t2=MPI_Wtime();
613#endif
614 SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i); // then, receive its elements
615#ifdef TIMING
616 MPI_Barrier(A.getcommgrid()->GetWorld());
617 double t3=MPI_Wtime();
618 mcl_Bbcasttime += (t3-t2);
619#endif
620
621
622#ifdef TIMING
623 MPI_Barrier(A.getcommgrid()->GetWorld());
624 double t4=MPI_Wtime();
625#endif
627 if(computationKernel == 1) C_cont = LocalSpGEMMHash<SR, NUO>(*ARecv, *BRecv,i != Aself, i != Bself, false); // Hash SpGEMM without per-column sorting
629
630#ifdef TIMING
631 MPI_Barrier(A.getcommgrid()->GetWorld());
632 double t5=MPI_Wtime();
634#endif
635
636 if(!C_cont->isZero())
637 tomerge.push_back(C_cont);
638 else
639 delete C_cont;
640
641 } // all stages executed
642
643#ifdef SHOW_MEMORY_USAGE
645 for(size_t i = 0; i < tomerge.size(); ++i)
646 {
647 lcnnz_unmerged += tomerge[i]->getnnz();
648 }
650 int64_t summa_memory = gcnnz_unmerged*20;//(gannz*2 + phase_nnz + gcnnz_unmerged + gannz + gannz/phases) * 20; // last two for broadcasts
651
652 if(myrank==0)
653 {
654 if(summa_memory>1000000000)
655 std::cout << p+1 << ". unmerged: " << summa_memory/1000000000.00 << "GB " ;
656 else
657 std::cout << p+1 << ". unmerged: " << summa_memory/1000000.00 << " MB " ;
658
659 }
660#endif
661
662#ifdef TIMING
663 MPI_Barrier(A.getcommgrid()->GetWorld());
664 double t6=MPI_Wtime();
665#endif
666 // TODO: MultiwayMerge can directly return UDERO inorder to avoid the extra copy
668 if(computationKernel == 1) OnePieceOfC_tuples = MultiwayMergeHash<SR>(tomerge, C_m, PiecesOfB[p].getncol(), true, false);
669 else if(computationKernel == 2) OnePieceOfC_tuples = MultiwayMerge<SR>(tomerge, C_m, PiecesOfB[p].getncol(), true);
670
671#ifdef SHOW_MEMORY_USAGE
675
676 // TODO: we can remove gcnnz_merged memory here because we don't need to concatenate anymore
677 int64_t merge_memory = gcnnz_merged*2*20;//(gannz*2 + phase_nnz + gcnnz_unmerged + gcnnz_merged*2) * 20;
678
679 if(myrank==0)
680 {
681 if(merge_memory>1000000000)
682 std::cout << " merged: " << merge_memory/1000000000.00 << "GB " ;
683 else
684 std::cout << " merged: " << merge_memory/1000000.00 << " MB " ;
685 }
686#endif
687
688
689#ifdef TIMING
690 MPI_Barrier(A.getcommgrid()->GetWorld());
691 double t7=MPI_Wtime();
693#endif
694 UDERO * OnePieceOfC = new UDERO(* OnePieceOfC_tuples, false);
695 delete OnePieceOfC_tuples;
696
699
700#ifdef SHOW_MEMORY_USAGE
702 lcnnz_pruned = OnePieceOfC_mat.getlocalnnz();
704
705
706 // TODO: we can remove gcnnz_merged memory here because we don't need to concatenate anymore
707 int64_t prune_memory = gcnnz_pruned*2*20;//(gannz*2 + phase_nnz + gcnnz_pruned*2) * 20 + kselectmem; // 3 extra copies of OnePieceOfC_mat, we can make it one extra copy!
708 //phase_nnz += gcnnz_pruned;
709
710 if(myrank==0)
711 {
712 if(prune_memory>1000000000)
713 std::cout << "Prune: " << prune_memory/1000000000.00 << "GB " << std::endl ;
714 else
715 std::cout << "Prune: " << prune_memory/1000000.00 << " MB " << std::endl ;
716
717 }
718#endif
719
720 // ABAB: Change this to accept pointers to objects
721 toconcatenate.push_back(OnePieceOfC_mat.seq());
722 }
723
724 UDERO * C = new UDERO(0,C_m, C_n,0);
725 C->ColConcatenate(toconcatenate); // ABAB: Change this to accept a vector of pointers to pointers to DER objects
726
727 SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
728 SpHelper::deallocate2D(BRecvSizes, UDERA::esscount);
730}
731
732template <typename SR, typename NUO, typename UDERO, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
735
736 int phases;
737
738 typedef typename UDERA::LocalIT LIA;
739 typedef typename UDERB::LocalIT LIB;
740 typedef typename UDERO::LocalIT LIC;
741
742 int myrank;
744
745 int stages, dummy; // last two parameters of ProductGrid are ignored for Synch multiplication
746 std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy);
747
748 double t0, t1, t2, t3, t4, t5;
749 int p;
750 MPI_Comm World = GridC->GetWorld();
752
753 int64_t perNNZMem_in = sizeof(IU)*2 + sizeof(NU1);
754 int64_t perNNZMem_out = sizeof(IU)*2 + sizeof(NUO);
755
756 // max nnz(A) in a porcess
757 int64_t lannz = A.getlocalnnz();
760 int64_t inputMem = gannz * perNNZMem_in * 4; // for four copies (two for SUMMA)
761
762 // max nnz(A^2) stored by SUMMA in a porcess
764 int64_t asquareMem = asquareNNZ * perNNZMem_out * 2; // an extra copy in multiway merge and in selection/recovery step
765
766
767 // estimate kselect memory
768 int64_t d = ceil( (asquareNNZ * sqrt(p))/ B.getlocalcols() ); // average nnz per column in A^2 (it is an overestimate because asquareNNZ is estimated based on unmerged matrices)
769 // this is equivalent to (asquareNNZ * p) / B.getcol()
770 int64_t k = std::min(int64_t(std::max(selectNum, recoverNum)), d );
771 int64_t kselectmem = B.getlocalcols() * k * 8 * 3;
772
773 // estimate output memory
774 int64_t outputNNZ = (B.getlocalcols() * d)/sqrt(p);
775 //int64_t outputNNZ = (B.getlocalcols() * k)/sqrt(p); // if kselect is used
777
778 //inputMem + outputMem + asquareMem/phases + kselectmem/phases < memory
779 //int64_t remainingMem = perProcessMemory*1000000000 - inputMem - outputMem;
780 int64_t remainingMem = perProcessMemory*1000000000 - inputMem; // if each phase result is discarded
781 //if(remainingMem > 0)
782 //{
783 //phases = 1 + (asquareMem+kselectmem) / remainingMem;
784 //}
785 phases = 1 + asquareMem / remainingMem;
786 return phases;
787}
788
789
798template <typename SR, typename NUO, typename UDERO, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
800 (SpParMat<IU,NU1,UDERA> & A, SpParMat<IU,NU2,UDERB> & B, bool clearA = false, bool clearB = false )
801
802{
804 {
806 }
807 typedef typename UDERA::LocalIT LIA;
808 typedef typename UDERB::LocalIT LIB;
809 typedef typename UDERO::LocalIT LIC;
810
811 static_assert(std::is_same<LIA, LIB>::value, "local index types for both input matrices should be the same");
812 static_assert(std::is_same<LIA, LIC>::value, "local index types for input and output matrices should be the same");
813
814 int stages, dummy; // last two parameters of ProductGrid are ignored for Synch multiplication
815 std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy);
816 LIA C_m = A.spSeq->getnrow();
817 LIB C_n = B.spSeq->getncol();
818
819 UDERA * A1seq = new UDERA();
820 UDERA * A2seq = new UDERA();
821 UDERB * B1seq = new UDERB();
822 UDERB * B2seq = new UDERB();
823 (A.spSeq)->Split( *A1seq, *A2seq);
824 const_cast< UDERB* >(B.spSeq)->Transpose();
825 (B.spSeq)->Split( *B1seq, *B2seq);
826
827 // Transpose back for the column-by-column algorithm
828 const_cast< UDERB* >(B1seq)->Transpose();
829 const_cast< UDERB* >(B2seq)->Transpose();
830
831 LIA ** ARecvSizes = SpHelper::allocate2D<LIA>(UDERA::esscount, stages);
832 LIB ** BRecvSizes = SpHelper::allocate2D<LIB>(UDERB::esscount, stages);
833
834 SpParHelper::GetSetSizes( *A1seq, ARecvSizes, (A.commGrid)->GetRowWorld());
835 SpParHelper::GetSetSizes( *B1seq, BRecvSizes, (B.commGrid)->GetColWorld());
836
837 // Remotely fetched matrices are stored as pointers
838 UDERA * ARecv;
839 UDERB * BRecv;
840 std::vector< SpTuples<LIC,NUO> *> tomerge;
841
842 int Aself = (A.commGrid)->GetRankInProcRow();
843 int Bself = (B.commGrid)->GetRankInProcCol();
844
845 for(int i = 0; i < stages; ++i)
846 {
847 std::vector<LIA> ess;
848 if(i == Aself)
849 {
850 ARecv = A1seq; // shallow-copy
851 }
852 else
853 {
854 ess.resize(UDERA::esscount);
855 for(int j=0; j< UDERA::esscount; ++j)
856 {
857 ess[j] = ARecvSizes[j][i]; // essentials of the ith matrix in this row
858 }
859 ARecv = new UDERA(); // first, create the object
860 }
861 SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i); // then, receive its elements
862 ess.clear();
863 if(i == Bself)
864 {
865 BRecv = B1seq; // shallow-copy
866 }
867 else
868 {
869 ess.resize(UDERB::esscount);
870 for(int j=0; j< UDERB::esscount; ++j)
871 {
872 ess[j] = BRecvSizes[j][i];
873 }
874 BRecv = new UDERB();
875 }
876 SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i); // then, receive its elements
877
878 // before activating this remove transposing B1seq
879 /*
880 SpTuples<LIC,NUO> * C_cont = MultiplyReturnTuples<SR, NUO>
881 (*ARecv, *BRecv, // parameters themselves
882 false, true, // transpose information (B is transposed)
883 i != Aself, // 'delete A' condition
884 i != Bself); // 'delete B' condition
885
886 */
887
889 (*ARecv, *BRecv, // parameters themselves
890 i != Aself, // 'delete A' condition
891 i != Bself); // 'delete B' condition
892
893
894
895
896 if(!C_cont->isZero())
897 tomerge.push_back(C_cont);
898 else
899 delete C_cont;
900 }
901 if(clearA) delete A1seq;
902 if(clearB) delete B1seq;
903
904 // Set the new dimensions
905 SpParHelper::GetSetSizes( *A2seq, ARecvSizes, (A.commGrid)->GetRowWorld());
906 SpParHelper::GetSetSizes( *B2seq, BRecvSizes, (B.commGrid)->GetColWorld());
907
908 // Start the second round
909 for(int i = 0; i < stages; ++i)
910 {
911 std::vector<LIA> ess;
912 if(i == Aself)
913 {
914 ARecv = A2seq; // shallow-copy
915 }
916 else
917 {
918 ess.resize(UDERA::esscount);
919 for(int j=0; j< UDERA::esscount; ++j)
920 {
921 ess[j] = ARecvSizes[j][i]; // essentials of the ith matrix in this row
922 }
923 ARecv = new UDERA(); // first, create the object
924 }
925
926 SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i); // then, receive its elements
927 ess.clear();
928
929 if(i == Bself)
930 {
931 BRecv = B2seq; // shallow-copy
932 }
933 else
934 {
935 ess.resize(UDERB::esscount);
936 for(int j=0; j< UDERB::esscount; ++j)
937 {
938 ess[j] = BRecvSizes[j][i];
939 }
940 BRecv = new UDERB();
941 }
942 SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i); // then, receive its elements
943
944 // before activating this remove transposing B2seq
945 /*
946 SpTuples<LIC,NUO> * C_cont = MultiplyReturnTuples<SR, NUO>
947 (*ARecv, *BRecv, // parameters themselves
948 false, true, // transpose information (B is transposed)
949 i != Aself, // 'delete A' condition
950 i != Bself); // 'delete B' condition
951
952
953 */
954
956 (*ARecv, *BRecv, // parameters themselves
957 i != Aself, // 'delete A' condition
958 i != Bself); // 'delete B' condition
959
960 if(!C_cont->isZero())
961 tomerge.push_back(C_cont);
962 else
963 delete C_cont;
964 }
965 SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
966 SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);
967 if(clearA)
968 {
969 delete A2seq;
970 delete A.spSeq;
971 A.spSeq = NULL;
972 }
973 else
974 {
975 (A.spSeq)->Merge(*A1seq, *A2seq);
976 delete A1seq;
977 delete A2seq;
978 }
979 if(clearB)
980 {
981 delete B2seq;
982 delete B.spSeq;
983 B.spSeq = NULL;
984 }
985 else
986 {
987 B1seq->Transpose();
988 B2seq->Transpose();
989 (B.spSeq)->Merge(*B1seq, *B2seq);
990 delete B1seq;
991 delete B2seq;
992 const_cast< UDERB* >(B.spSeq)->Transpose(); // transpose back to original
993 }
994
995 UDERO * C = new UDERO(MergeAll<SR>(tomerge, C_m, C_n,true), false);
996 return SpParMat<IU,NUO,UDERO> (C, GridC); // return the result object
997}
998
1004template <typename SR, typename NUO, typename UDERO, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
// Synchronous SUMMA-style parallel SpGEMM over a 2D process grid: C = A * B
// under semiring SR. Each of the 'stages' iterations broadcasts one block of A
// along the processor row and one block of B along the processor column, then
// multiplies them locally; partial results are merged at the end.
// NOTE(review): this listing was extracted from generated documentation and
// several original lines are missing (the function-name line, the dimension
// mismatch check around the early return, the local-multiply call producing
// C_cont, and the multiway-merge call producing C_tuples). Code left verbatim.
1006 (SpParMat<IU,NU1,UDERA> & A, SpParMat<IU,NU2,UDERB> & B, bool clearA = false, bool clearB = false )
1007
1008{
1009 int myrank;
1012 {
// Dimensions incompatible (check on the dropped line above): return empty C.
1013 return SpParMat< IU,NUO,UDERO >();
1014 }
1015 int stages, dummy; // last two parameters of ProductGrid are ignored for Synch multiplication
1016 std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy);
1017 IU C_m = A.spSeq->getnrow();
1018 IU C_n = B.spSeq->getncol();
1019
1020 //const_cast< UDERB* >(B.spSeq)->Transpose(); // do not transpose for colum-by-column multiplication
1021
// Per-stage "essential" sizes (esscount scalars per matrix) needed to size
// the receive buffers before each broadcast.
1022 IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
1023 IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
1024
1025 SpParHelper::GetSetSizes( *(A.spSeq), ARecvSizes, (A.commGrid)->GetRowWorld());
1026 SpParHelper::GetSetSizes( *(B.spSeq), BRecvSizes, (B.commGrid)->GetColWorld());
1027
1028 // Remotely fetched matrices are stored as pointers
1029 UDERA * ARecv;
1030 UDERB * BRecv;
1031 std::vector< SpTuples<IU,NUO> *> tomerge; // nonzero partial products, merged after the loop
1032
1033 int Aself = (A.commGrid)->GetRankInProcRow();
1034 int Bself = (B.commGrid)->GetRankInProcCol();
1035
1036 for(int i = 0; i < stages; ++i)
1037 {
1038 std::vector<IU> ess;
1039 if(i == Aself)
1040 {
// This rank is the broadcast root for stage i: alias the local block
// (no copy). The 'delete A' condition below guards against freeing it.
1041 ARecv = A.spSeq; // shallow-copy
1042 }
1043 else
1044 {
1045 ess.resize(UDERA::esscount);
1046 for(int j=0; j< UDERA::esscount; ++j)
1047 {
1048 ess[j] = ARecvSizes[j][i]; // essentials of the ith matrix in this row
1049 }
1050 ARecv = new UDERA(); // first, create the object
1051 }
1052 SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i); // then, receive its elements
1053 ess.clear();
1054
1055 if(i == Bself)
1056 {
1057 BRecv = B.spSeq; // shallow-copy
1058 }
1059 else
1060 {
1061 ess.resize(UDERB::esscount);
1062 for(int j=0; j< UDERB::esscount; ++j)
1063 {
1064 ess[j] = BRecvSizes[j][i];
1065 }
1066 BRecv = new UDERB();
1067 }
1068 SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i); // then, receive its elements
1069
// NOTE(review): the dropped line 1070 was the local multiply producing
// C_cont (presumably a LocalSpGEMM/MultiplyReturnTuples call) — confirm
// against the original header.
1071 (*ARecv, *BRecv, // parameters themselves
1072 i != Aself, // 'delete A' condition
1073 i != Bself); // 'delete B' condition
1074
// Keep only nonempty partial products to reduce merge work.
1075 if(!C_cont->isZero())
1076 tomerge.push_back(C_cont);
1077
1078#ifdef COMBBLAS_DEBUG
1079 std::ostringstream outs;
1080 outs << i << "th SUMMA iteration"<< std::endl;
1081 SpParHelper::Print(outs.str());
1082#endif
1083 }
1084
// Optionally release inputs early to lower peak memory.
1085 if(clearA && A.spSeq != NULL)
1086 {
1087 delete A.spSeq;
1088 A.spSeq = NULL;
1089 }
1090 if(clearB && B.spSeq != NULL)
1091 {
1092 delete B.spSeq;
1093 B.spSeq = NULL;
1094 }
1095
1096 SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
1097 SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);
1098
1099
// NOTE(review): dropped line 1100 produced C_tuples (a multiway merge of
// 'tomerge' that also deletes its inputs, per the pattern elsewhere in file).
1101 UDERO * C = new UDERO(*C_tuples, false);
1102 delete C_tuples;
1103
1104 //if(!clearB)
1105 // const_cast< UDERB* >(B.spSeq)->Transpose(); // transpose back to original
1106
1107 return SpParMat<IU,NUO,UDERO> (C, GridC); // return the result object
1108}
1109
1110template <typename SR, typename NUO, typename UDERO, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
// Communication/computation-overlapped SUMMA SpGEMM: stage i's nonblocking
// broadcasts (IBCastMatrix) are posted before stage i-1's local multiply is
// performed, hiding broadcast latency behind computation. Partial results are
// merged incrementally each stage to bound memory growth.
// NOTE(review): several original lines were dropped by the documentation
// extractor (function name, dimension check, the Waitall on the previous
// stage's requests, the local multiply producing C_cont, and the per-stage /
// final merge producing C_tuples). Code left verbatim.
1112 (SpParMat<IU,NU1,UDERA> & A, SpParMat<IU,NU2,UDERB> & B, bool clearA = false, bool clearB = false )
1113{
1114 int myrank;
1117 {
1118 return SpParMat< IU,NUO,UDERO >();
1119 }
1120 int stages, dummy; // last two parameters of ProductGrid are ignored for Synch multiplication
1121 std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy);
1122 IU C_m = A.spSeq->getnrow();
1123 IU C_n = B.spSeq->getncol();
1124
1125 //const_cast< UDERB* >(B.spSeq)->Transpose(); // do not transpose for colum-by-column multiplication
1126
1127 IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
1128 IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
1129
1130 SpParHelper::GetSetSizes( *(A.spSeq), ARecvSizes, (A.commGrid)->GetRowWorld());
1131 SpParHelper::GetSetSizes( *(B.spSeq), BRecvSizes, (B.commGrid)->GetColWorld());
1132
1133 // Remotely fetched matrices are stored as pointers
// One slot per stage so in-flight broadcasts can coexist with the multiply
// of the previous stage's blocks.
1134 UDERA ** ARecv = new UDERA* [stages];
1135 UDERB ** BRecv = new UDERB* [stages];
1136
// Per-stage request arrays for the nonblocking broadcasts of the index and
// numerical arrays of each matrix block.
1137 Arr<IU,NU1> Aarrinfo = A.seqptr()->GetArrays();
1138 Arr<IU,NU2> Barrinfo = B.seqptr()->GetArrays();
1139 std::vector< std::vector<MPI_Request> > ABCastIndarrayReq;
1140 std::vector< std::vector<MPI_Request> > ABCastNumarrayReq;
1141 std::vector< std::vector<MPI_Request> > BBCastIndarrayReq;
1142 std::vector< std::vector<MPI_Request> > BBCastNumarrayReq;
1143 for(int i = 0; i < stages; i++){
1144 ABCastIndarrayReq.push_back( std::vector<MPI_Request>(Aarrinfo.indarrs.size(), MPI_REQUEST_NULL) );
1145 ABCastNumarrayReq.push_back( std::vector<MPI_Request>(Aarrinfo.numarrs.size(), MPI_REQUEST_NULL) );
1146 BBCastIndarrayReq.push_back( std::vector<MPI_Request>(Barrinfo.indarrs.size(), MPI_REQUEST_NULL) );
1147 BBCastNumarrayReq.push_back( std::vector<MPI_Request>(Barrinfo.numarrs.size(), MPI_REQUEST_NULL) );
1148 }
1149
1150 int Aself = (A.commGrid)->GetRankInProcRow();
1151 int Bself = (B.commGrid)->GetRankInProcCol();
1152
1153 std::vector< SpTuples<IU,NUO> *> tomerge;
1154
1155 for(int i = 0; i < stages; ++i){
1156 std::vector<IU> ess;
1157 if(i == Aself) ARecv[i] = A.spSeq; // shallow-copy
1158 else{
1159 ess.resize(UDERA::esscount);
1160 for(int j=0; j< UDERA::esscount; ++j) ess[j] = ARecvSizes[j][i]; // essentials of the ith matrix in this row
1161 ARecv[i] = new UDERA(); // first, create the object
1162 }
// Post (not complete) the broadcast for stage i; completion is awaited
// on a line dropped from this listing before the multiply below uses it.
1163 SpParHelper::IBCastMatrix(GridC->GetRowWorld(), *(ARecv[i]), ess, i, ABCastIndarrayReq[i], ABCastNumarrayReq[i]); // then, receive its elements
1164
1165 ess.clear();
1166
1167 if(i == Bself) BRecv[i] = B.spSeq; // shallow-copy
1168 else{
1169 ess.resize(UDERB::esscount);
1170 for(int j=0; j< UDERB::esscount; ++j) ess[j] = BRecvSizes[j][i];
1171 BRecv[i] = new UDERB();
1172 }
1173 SpParHelper::IBCastMatrix(GridC->GetColWorld(), *(BRecv[i]), ess, i, BBCastIndarrayReq[i], BBCastNumarrayReq[i]); // then, receive its elements
1174
// Overlap: while stage i's broadcasts are in flight, multiply the
// already-received blocks of stage i-1 (multiply call dropped here).
1175 if(i > 0){
1180
1182 (*(ARecv[i-1]), *(BRecv[i-1]), // parameters themselves
1183 i-1 != Aself, // 'delete A' condition
1184 i-1 != Bself); // 'delete B' condition
1185 if(!C_cont->isZero()) tomerge.push_back(C_cont);
1186
// Incremental merge: collapse 'tomerge' into a single C_tuples each
// stage (merge call dropped), then restart the list with it.
1188 std::vector< SpTuples<IU,NUO> *>().swap(tomerge);
1189 tomerge.push_back(C_tuples);
1190 }
1191 #ifdef COMBBLAS_DEBUG
1192 std::ostringstream outs;
1193 outs << i << "th SUMMA iteration"<< std::endl;
1194 SpParHelper::Print(outs.str());
1195 #endif
1196 }
1197
// Epilogue: multiply the final stage's blocks (call dropped from listing).
1202
1204 (*(ARecv[stages-1]), *(BRecv[stages-1]), // parameters themselves
1205 stages-1 != Aself, // 'delete A' condition
1206 stages-1 != Bself); // 'delete B' condition
1207 if(!C_cont->isZero()) tomerge.push_back(C_cont);
1208
1209 if(clearA && A.spSeq != NULL) {
1210 delete A.spSeq;
1211 A.spSeq = NULL;
1212 }
1213 if(clearB && B.spSeq != NULL) {
1214 delete B.spSeq;
1215 B.spSeq = NULL;
1216 }
1217
// NOTE(review): ARecv/BRecv were allocated with new UDERA*[stages] /
// new UDERB*[stages]; releasing them with scalar 'delete' is undefined
// behavior — these should be 'delete [] ARecv;' and 'delete [] BRecv;'.
1218 delete ARecv;
1219 delete BRecv;
1220
1221 SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
1222 SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);
1223
1224 // the last parameter to MergeAll deletes tomerge arrays
1226 std::vector< SpTuples<IU,NUO> *>().swap(tomerge);
1227
1228 UDERO * C = new UDERO(*C_tuples, false);
1229 delete C_tuples;
1230
1231 //if(!clearB)
1232 // const_cast< UDERB* >(B.spSeq)->Transpose(); // transpose back to original
1233
1234 return SpParMat<IU,NUO,UDERO> (C, GridC); // return the result object
1235}
1236
1237
1242template <typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
// Estimates the number of nonzeros of C = A * B per process by walking the
// SUMMA stages (same broadcast pattern as the multiply) WITHOUT performing the
// actual multiplication — only nnz/flop estimation per stage.
// NOTE(review): extractor dropped the function-name line, the per-stage
// estimation calls (flopC/colnnzC, per the @OGUZ-EDIT hash-spgemm comment),
// and the final reduction producing nnzC_SUMMA_max. Code left verbatim.
1244{
1245 typedef typename UDERA::LocalIT LIA;
1246 typedef typename UDERB::LocalIT LIB;
1247 static_assert(std::is_same<LIA, LIB>::value, "local index types for both input matrices should be the same");
1248
1249 double t0, t1;
1250
1251 int64_t nnzC_SUMMA = 0;
1252
1253 if(A.getncol() != B.getnrow())
1254 {
1255 std::ostringstream outs;
1256 outs << "Can not multiply, dimensions does not match"<< std::endl;
1257 outs << A.getncol() << " != " << B.getnrow() << std::endl;
1258 SpParHelper::Print(outs.str());
// Dimension mismatch: nnzC_SUMMA is still 0 here.
1260 return nnzC_SUMMA;
1261 }
1262
1263 int stages, dummy; // last two parameters of ProductGrid are ignored for Synch multiplication
1264 std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy);
1265
1266 MPI_Barrier(GridC->GetWorld());
1267
1268 LIA ** ARecvSizes = SpHelper::allocate2D<LIA>(UDERA::esscount, stages);
1269 LIB ** BRecvSizes = SpHelper::allocate2D<LIB>(UDERB::esscount, stages);
1270 SpParHelper::GetSetSizes( *(A.spSeq), ARecvSizes, (A.commGrid)->GetRowWorld());
1271 SpParHelper::GetSetSizes( *(B.spSeq), BRecvSizes, (B.commGrid)->GetColWorld());
1272
1273 // Remotely fetched matrices are stored as pointers
1274 UDERA * ARecv;
1275 UDERB * BRecv;
1276
1277 int Aself = (A.commGrid)->GetRankInProcRow();
1278 int Bself = (B.commGrid)->GetRankInProcCol();
1279
1280
// Same broadcast choreography as the multiply; only estimation is done.
1281 for(int i = 0; i < stages; ++i)
1282 {
1283 std::vector<LIA> ess;
1284 if(i == Aself)
1285 {
1286 ARecv = A.spSeq; // shallow-copy
1287 }
1288 else
1289 {
1290 ess.resize(UDERA::esscount);
1291 for(int j=0; j< UDERA::esscount; ++j)
1292 {
1293 ess[j] = ARecvSizes[j][i]; // essentials of the ith matrix in this row
1294 }
1295 ARecv = new UDERA(); // first, create the object
1296 }
1297
1298 SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i); // then, receive its elements
1299 ess.clear();
1300
1301 if(i == Bself)
1302 {
1303 BRecv = B.spSeq; // shallow-copy
1304 }
1305 else
1306 {
1307 ess.resize(UDERB::esscount);
1308 for(int j=0; j< UDERB::esscount; ++j)
1309 {
1310 ess[j] = BRecvSizes[j][i];
1311 }
1312 BRecv = new UDERB();
1313 }
1314
1315 SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i); // then, receive its elements
1316
1317 // no need to keep entries of colnnzC in larger precision
1318 // because colnnzC is of length nzc and estimates nnzs per column
1319 // @OGUZ-EDIT Using hash spgemm for estimation
1320 //LIB * colnnzC = estimateNNZ(*ARecv, *BRecv);
// NOTE(review): the dropped lines here computed flopC/colnnzC and
// accumulated nnzC_SUMMA for this stage — confirm against the original.
1323 LIB nzc = BRecv->GetDCSC()->nzc;
1324
1325 if (flopC) delete [] flopC;
1326 if(colnnzC) delete [] colnnzC;
1327
1328 // sampling-based estimation (comment the estimation above, and
1329 // comment out below to use)
1330 // int64_t nnzC_stage = estimateNNZ_sampling(*ARecv, *BRecv);
1331 // nnzC_SUMMA += nnzC_stage;
1332
1333 // delete received data
1334 if(i != Aself)
1335 delete ARecv;
1336 if(i != Bself)
1337 delete BRecv;
1338 }
1339
1340 SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
1341 SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);
1342
// NOTE(review): dropped lines presumably reduced nnzC_SUMMA to the maximum
// across processes (nnzC_SUMMA_max), e.g. via MPI_Allreduce — verify.
1345
1346 return nnzC_SUMMA_max;
1347}
1348
1349
1350template <typename MATRIX, typename VECTOR>
// Sanity checks for SpMV: matrix column count must equal vector length, and
// both operands must live on the same communication grid. Errors are printed;
// the dropped lines (1359, 1364) presumably aborted, e.g. via MPI_Abort —
// TODO confirm against the original header.
1351void CheckSpMVCompliance(const MATRIX & A, const VECTOR & x)
1352{
1353 if(A.getncol() != x.TotalLength())
1354 {
1355 std::ostringstream outs;
1356 outs << "Can not multiply, dimensions does not match"<< std::endl;
1357 outs << A.getncol() << " != " << x.TotalLength() << std::endl;
1358 SpParHelper::Print(outs.str());
1360 }
1361 if(! ( *(A.getcommgrid()) == *(x.getcommgrid())) )
1362 {
1363 std::cout << "Grids are not comparable for SpMV" << std::endl;
1365 }
1366}
1367
1368
1369template <typename SR, typename IU, typename NUM, typename UDER>
1370FullyDistSpVec<IU,typename promote_trait<NUM,IU>::T_promote> SpMV
1371 (const SpParMat<IU,NUM,UDER> & A, const FullyDistSpVec<IU,IU> & x, bool indexisvalue, OptBuf<int32_t, typename promote_trait<NUM,IU>::T_promote > & optbuf);
1372
1373template <typename SR, typename IU, typename NUM, typename UDER>
1381
1387template<typename IU, typename NV>
// Exchanges the local piece of sparse vector x with the diagonal-complement
// process, effectively transposing the vector distribution so that it aligns
// with the matrix's column distribution. Indices are downcast to int32_t to
// halve off-node bandwidth (see ABAB comment below).
// NOTE(review): the function-name line and several declarations (MPI_Status
// status, the trxlocnz/lenuntil/roffset outputs, the temp_xind buffer and its
// index Sendrecv) were dropped from this listing. Code left verbatim.
1389{
1390 int32_t xlocnz = (int32_t) x.getlocnnz();
1391 int32_t roffst = (int32_t) x.RowLenUntil(); // since trxinds is int32_t
1393 IU luntil = x.LengthUntil();
1394 int diagneigh = x.commGrid->GetComplementRank();
1395
// Pairwise swap of sizes/offsets with the diagonal neighbor.
1397 MPI_Sendrecv(&roffst, 1, MPIType<int32_t>(), diagneigh, TROST, &roffset, 1, MPIType<int32_t>(), diagneigh, TROST, World, &status);
1398 MPI_Sendrecv(&xlocnz, 1, MPIType<int32_t>(), diagneigh, TRNNZ, &trxlocnz, 1, MPIType<int32_t>(), diagneigh, TRNNZ, World, &status);
1399 MPI_Sendrecv(&luntil, 1, MPIType<IU>(), diagneigh, TRLUT, &lenuntil, 1, MPIType<IU>(), diagneigh, TRLUT, World, &status);
1400
1401 // ABAB: Important observation is that local indices (given by x.ind) is 32-bit addressible
1402 // Copy them to 32 bit integers and transfer that to save 50% of off-node bandwidth
1403 trxinds = new int32_t[trxlocnz];
1405#ifdef THREADED
1406#pragma omp parallel for
1407#endif
1408 for(int i=0; i< xlocnz; ++i)
1409 temp_xind[i] = (int32_t) x.ind[i];
// (the Sendrecv of temp_xind into trxinds was on a dropped line)
1411 delete [] temp_xind;
1412 if(!indexisvalue)
1413 {
// Numerical values only need exchanging when indices don't double as values.
1414 trxnums = new NV[trxlocnz];
1415 MPI_Sendrecv(const_cast<NV*>(SpHelper::p2a(x.num)), xlocnz, MPIType<NV>(), diagneigh, TRX, trxnums, trxlocnz, MPIType<NV>(), diagneigh, TRX, World, &status);
1416 }
1417
1418 std::transform(trxinds, trxinds+trxlocnz, trxinds, std::bind2nd(std::plus<int32_t>(), roffset)); // fullydist indexing (p pieces) -> matrix indexing (sqrt(p) pieces)
1419}
1420
1421
1429template<typename IU, typename NV>
// Gathers the transposed subvector pieces from every process in the same
// processor column, producing concatenated index (indacc) and value (numacc)
// arrays of total length accnz. When indexisvalue, values are reconstructed
// locally from the indices instead of being communicated.
// NOTE(review): the function-name line and the actual MPI_Allgather /
// MPI_Allgatherv / broadcast-of-lenuntil calls were dropped from this
// listing. Code left verbatim.
1431 int32_t * & indacc, NV * & numacc, int & accnz, bool indexisvalue)
1432{
1433 int colneighs, colrank;
1434 MPI_Comm_size(ColWorld, &colneighs);
1435 MPI_Comm_rank(ColWorld, &colrank);
1436 int * colnz = new int[colneighs];
1437 colnz[colrank] = trxlocnz;
// (dropped line: allgather of per-process counts into colnz)
1439 int * dpls = new int[colneighs](); // displacements (zero initialized pid)
1440 std::partial_sum(colnz, colnz+colneighs-1, dpls+1);
1441 accnz = std::accumulate(colnz, colnz+colneighs, 0);
1442 indacc = new int32_t[accnz];
1443 numacc = new NV[accnz];
1444
1445 // ABAB: Future issues here, colnz is of type int (MPI limitation)
1446 // What if the aggregate vector size along the processor row/column is not 32-bit addressible?
1447 // This will happen when n/sqrt(p) > 2^31
1448 // Currently we can solve a small problem (scale 32) with 4096 processor
1449 // For a medium problem (scale 35), we'll need 32K processors which gives sqrt(p) ~ 180
1450 // 2^35 / 180 ~ 2^29 / 3 which is not an issue !
1451
1452#ifdef TIMING
1453 double t0=MPI_Wtime();
1454#endif
// (dropped line: allgatherv of trxinds into indacc)
1456
1457 delete [] trxinds;
1458 if(indexisvalue)
1459 {
// Values are global indices: reconstruct from indacc + the column head's
// length-until offset (broadcast on a dropped line).
1461 if(colrank == 0) lenuntilcol = lenuntil;
1463 for(int i=0; i< accnz; ++i) // fill numerical values from indices
1464 {
1465 numacc[i] = indacc[i] + lenuntilcol;
1466 }
1467 }
1468 else
1469 {
// (dropped line: allgatherv of trxnums into numacc)
1471 delete [] trxnums;
1472 }
1473#ifdef TIMING
1474 double t1=MPI_Wtime();
1476#endif
1478}
1479
1480
1481
1488template<typename SR, typename IVT, typename OVT, typename IU, typename NUM, typename UDER>
// Performs the node-local SpMSpV and packages the result into per-destination
// send buffers for the subsequent row-wise exchange. Dispatches on (a) the
// graph500 optimization buffer (optbuf.totmax > 0) and (b) whether the local
// matrix is split for threading (getnsplit() > 0).
// NOTE(review): the signature's first line and the concrete dcsc_gespmv /
// dcsc_gespmv_threaded calls were dropped from this listing; 'rowneighs' and
// 'totalsent' referenced below come from those dropped lines. Code verbatim.
1490 int32_t * & sendindbuf, OVT * & sendnumbuf, int * & sdispls, int * sendcnt, int accnz, bool indexisvalue, PreAllocatedSPA<OVT> & SPA)
1491{
1492 if(optbuf.totmax > 0) // graph500 optimization enabled
1493 {
1494 if(A.spSeq->getnsplit() > 0)
1495 {
1496 // optbuf.{inds/nums/dspls} and sendcnt are all pre-allocated and only filled by dcsc_gespmv_threaded
1498 }
1499 else
1500 {
// (dropped line: non-threaded gespmv into optbuf)
1502 }
1504 }
1505 else
1506 {
1507 if(A.spSeq->getnsplit() > 0)
1508 {
1509 // sendindbuf/sendnumbuf/sdispls are all allocated and filled by dcsc_gespmv_threaded
1511
// Derive per-destination counts from consecutive displacements.
1513 for(int i=0; i<rowneighs-1; ++i)
1514 sendcnt[i] = sdispls[i+1] - sdispls[i];
1515 sendcnt[rowneighs-1] = totalsent - sdispls[rowneighs-1];
1516 }
1517 else
1518 {
1519 // default SpMSpV
1520 std::vector< int32_t > indy;
1521 std::vector< OVT > numy;
1523
1525
// Partition the (sorted-by-index) result among row neighbors by index
// range; indices are rebased to each destination's local numbering.
1526 int32_t bufsize = indy.size(); // as compact as possible
1527 sendindbuf = new int32_t[bufsize];
1528 sendnumbuf = new OVT[bufsize];
1529 int32_t perproc = A.getlocalrows() / rowneighs;
1530
1531 int k = 0; // index to buffer
1532 for(int i=0; i<rowneighs; ++i)
1533 {
1534 int32_t end_this = (i==rowneighs-1) ? A.getlocalrows(): (i+1)*perproc;
1535 while(k < bufsize && indy[k] < end_this)
1536 {
1537 sendindbuf[k] = indy[k] - i*perproc;
1538 sendnumbuf[k] = numy[k];
1539 ++sendcnt[i];
1540 ++k;
1541 }
1542 }
1543 sdispls = new int[rowneighs]();
1544 std::partial_sum(sendcnt, sendcnt+rowneighs-1, sdispls+1);
1545
1546//#endif
1547
1548 }
1549 }
1550
1551}
1552
1553
1554
1555// non threaded
1556template <typename SR, typename IU, typename OVT>
// Serial k-way merge of 'nlists' sorted (index, value) lists into mergedind /
// mergednum, combining duplicate indices with the semiring's SR::add.
// Single-list input is fast-pathed as a plain copy.
// NOTE(review): the declaration of the min-heap 'sHeap' (original line 1580)
// and the final cleanup of 'processed' (line 1623) were dropped from this
// listing. Code left verbatim.
1557void MergeContributions(int* listSizes, std::vector<int32_t *> & indsvec, std::vector<OVT *> & numsvec, std::vector<IU>& mergedind, std::vector<OVT>& mergednum)
1558{
1559
1560 int nlists = indsvec.size();
1561 // this condition is checked in the caller SpMV function.
1562 // I am still putting it here for completeness
1563 if(nlists == 1)
1564 {
1565 // simply copy data
1566 int veclen = listSizes[0];
1567 mergedind.resize(veclen);
1568 mergednum.resize(veclen);
1569 for(int i=0; i<veclen; i++)
1570 {
1571 mergedind[i] = indsvec[0][i];
1572 mergednum[i] = numsvec[0][i];
1573 }
1574 return;
1575 }
1576
1577 int32_t hsize = 0;
1578 int32_t inf = std::numeric_limits<int32_t>::min();
1579 int32_t sup = std::numeric_limits<int32_t>::max();
// processed[i] = number of entries of list i consumed so far.
1581 int * processed = new int[nlists]();
// Seed the heap with the head of every nonempty list.
1582 for(int i=0; i<nlists; ++i)
1583 {
1584 if(listSizes[i] > 0)
1585 {
1586 // key, list_id
1587 sHeap.insert(indsvec[i][0], i);
1588 ++hsize;
1589 }
1590 }
1591 int32_t key, locv;
// First pop is handled separately so the loop below can always compare
// against mergedind.back().
1592 if(hsize > 0)
1593 {
1594 sHeap.deleteMin(&key, &locv);
1595 mergedind.push_back( static_cast<IU>(key));
1596 mergednum.push_back(numsvec[locv][0]); // nothing is processed yet
1597
1598 if( (++(processed[locv])) < listSizes[locv] )
1599 sHeap.insert(indsvec[locv][processed[locv]], locv);
1600 else
1601 --hsize;
1602 }
1603 while(hsize > 0)
1604 {
1605 sHeap.deleteMin(&key, &locv);
1606 if(mergedind.back() == static_cast<IU>(key))
1607 {
// Duplicate index: combine under the semiring instead of appending.
1608 mergednum.back() = SR::add(mergednum.back(), numsvec[locv][processed[locv]]);
1609 // ABAB: Benchmark actually allows us to be non-deterministic in terms of parent selection
1610 // We can just skip this addition operator (if it's a max/min select)
1611 }
1612 else
1613 {
1614 mergedind.push_back(static_cast<IU>(key));
1615 mergednum.push_back(numsvec[locv][processed[locv]]);
1616 }
1617
// Refill from the list we just consumed, or retire it.
1618 if( (++(processed[locv])) < listSizes[locv] )
1619 sHeap.insert(indsvec[locv][processed[locv]], locv);
1620 else
1621 --hsize;
1622 }
1624}
1625
1626
1627
1628template <typename SR, typename IU, typename OVT>
// Threaded k-way merge: partitions the global index range [0, maxindex) into
// nsplits chunks (oversplit 4x per thread for load balance), merges each chunk
// independently in parallel, then concatenates the per-chunk results.
// NOTE(review): the per-chunk call into the serial MergeContributions
// (original line 1694) was dropped from this listing. Code left verbatim.
1629void MergeContributions_threaded(int * & listSizes, std::vector<int32_t *> & indsvec, std::vector<OVT *> & numsvec, std::vector<IU> & mergedind, std::vector<OVT> & mergednum, IU maxindex)
1630{
1631
1632 int nlists = indsvec.size();
1633 // this condition is checked in the caller SpMV function.
1634 // I am still putting it here for completeness
1635 if(nlists == 1)
1636 {
1637 // simply copy data
1638 int veclen = listSizes[0];
1639 mergedind.resize(veclen);
1640 mergednum.resize(veclen);
1641
1642#ifdef THREADED
1643#pragma omp parallel for
1644#endif
1645 for(int i=0; i<veclen; i++)
1646 {
1647 mergedind[i] = indsvec[0][i];
1648 mergednum[i] = numsvec[0][i];
1649 }
1650 return;
1651 }
1652
1653 int nthreads=1;
1654#ifdef THREADED
1655#pragma omp parallel
1656 {
1657 nthreads = omp_get_num_threads();
1658 }
1659#endif
1660 int nsplits = 4*nthreads; // oversplit for load balance
1661 nsplits = std::min(nsplits, (int)maxindex);
// splitters[k][i] = position in list k where chunk i begins; found by
// binary search since each input list is sorted by index.
1662 std::vector< std::vector<int32_t> > splitters(nlists);
1663 for(int k=0; k< nlists; k++)
1664 {
1665 splitters[k].resize(nsplits+1);
1666 splitters[k][0] = static_cast<int32_t>(0);
1667#pragma omp parallel for
1668 for(int i=1; i< nsplits; i++)
1669 {
1670 IU cur_idx = i * (maxindex/nsplits);
1671 auto it = std::lower_bound (indsvec[k], indsvec[k] + listSizes[k], cur_idx);
1672 splitters[k][i] = (int32_t) (it - indsvec[k]);
1673 }
1674 splitters[k][nsplits] = listSizes[k];
1675 }
1676
1677 // ------ perform merge in parallel ------
1678 std::vector<std::vector<IU>> indsBuf(nsplits);
1679 std::vector<std::vector<OVT>> numsBuf(nsplits);
1680 //TODO: allocate these vectors here before calling MergeContributions
1681#pragma omp parallel for schedule(dynamic)
1682 for(int i=0; i< nsplits; i++)
1683 {
// Build chunk-local views (pointer offsets + lengths) of every list,
// then merge them (the merge call itself was on a dropped line).
1684 std::vector<int32_t *> tIndsVec(nlists);
1685 std::vector<OVT *> tNumsVec(nlists);
1686 std::vector<int> tLengths(nlists);
1687 for(int j=0; j< nlists; ++j)
1688 {
1689 tIndsVec[j] = indsvec[j] + splitters[j][i];
1690 tNumsVec[j] = numsvec[j] + splitters[j][i];
1691 tLengths[j]= splitters[j][i+1] - splitters[j][i];
1692
1693 }
1695 }
1696
1697 // ------ concatenate merged tuples processed by threads ------
// Prefix-sum of chunk sizes gives each chunk's offset in the output.
1698 std::vector<IU> tdisp(nsplits+1);
1699 tdisp[0] = 0;
1700 for(int i=0; i<nsplits; ++i)
1701 {
1702 tdisp[i+1] = tdisp[i] + indsBuf[i].size();
1703 }
1704
1705 mergedind.resize(tdisp[nsplits]);
1706 mergednum.resize(tdisp[nsplits]);
1707
1708
1709#pragma omp parallel for schedule(dynamic)
1710 for(int i=0; i< nsplits; i++)
1711 {
1712 std::copy(indsBuf[i].data() , indsBuf[i].data() + indsBuf[i].size(), mergedind.data() + tdisp[i]);
1713 std::copy(numsBuf[i].data() , numsBuf[i].data() + numsBuf[i].size(), mergednum.data() + tdisp[i]);
1714 }
1715}
1716
1717
1724template <typename SR, typename IVT, typename OVT, typename IU, typename NUM, typename UDER>
// Distributed sparse-matrix sparse-vector multiply (y = A x) driver:
// 1) exchange x with the diagonal neighbor (transpose distribution),
// 2) allgather pieces along the processor column,
// 3) local multiply (LocalSpMV), 4) row-wise alltoallv of partial results,
// 5) merge received contributions into y.
// NOTE(review): the signature lines, several local declarations (trxlocnz,
// trxinds, indacc, sendindbuf, recvindbuf, ...), the TransposeVector call,
// the MPI_Comm_size for rowneighs, and the Alltoallv exchange lines were
// dropped from this listing. Code left verbatim.
1727{
1729 optbuf.MarkEmpty();
1730 y.glen = A.getnrow(); // in case it is not set already
1731
1732 MPI_Comm World = x.commGrid->GetWorld();
1733 MPI_Comm ColWorld = x.commGrid->GetColWorld();
1734 MPI_Comm RowWorld = x.commGrid->GetRowWorld();
1735
1736 int accnz;
1738 IU lenuntil;
1740 IVT *trxnums, *numacc;
1741
1742#ifdef TIMING
1743 double t0=MPI_Wtime();
1744#endif
1745
// (dropped line: TransposeVector exchanging x with the diagonal neighbor)
1747
1748#ifdef TIMING
1749 double t1=MPI_Wtime();
1751#endif
1752
1753 if(x.commGrid->GetGridRows() > 1)
1754 {
1755 AllGatherVector(ColWorld, trxlocnz, lenuntil, trxinds, trxnums, indacc, numacc, accnz, indexisvalue); // trxindS/trxnums deallocated, indacc/numacc allocated, accnz set
1756 }
1757 else
1758 {
// Single process row: no gather needed, reuse the transposed buffers.
1759 accnz = trxlocnz;
1760 indacc = trxinds; // aliasing ptr
1761 numacc = trxnums; // aliasing ptr
1762 }
1763
1764 int rowneighs;
1766 int * sendcnt = new int[rowneighs]();
1768 OVT * sendnumbuf;
1769 int * sdispls;
1770
1771#ifdef TIMING
1772 double t2=MPI_Wtime();
1773#endif
1774
1775 LocalSpMV<SR>(A, rowneighs, optbuf, indacc, numacc, sendindbuf, sendnumbuf, sdispls, sendcnt, accnz, indexisvalue, SPA); // indacc/numacc deallocated, sendindbuf/sendnumbuf/sdispls allocated
1776
1777#ifdef TIMING
1778 double t3=MPI_Wtime();
1780#endif
1781
1782
// Single process column: the local result IS the final result; copy out
// and skip the row-wise exchange entirely.
1783 if(x.commGrid->GetGridCols() == 1)
1784 {
1785 y.ind.resize(sendcnt[0]);
1786 y.num.resize(sendcnt[0]);
1787
1788
1789 if(optbuf.totmax > 0 ) // graph500 optimization enabled
1790 {
1791#ifdef THREADED
1792#pragma omp parallel for
1793#endif
1794 for(int i=0; i<sendcnt[0]; i++)
1795 {
1796 y.ind[i] = optbuf.inds[i];
1797 y.num[i] = optbuf.nums[i];
1798 }
1799 }
1800 else
1801 {
1802#ifdef THREADED
1803#pragma omp parallel for
1804#endif
1805 for(int i=0; i<sendcnt[0]; i++)
1806 {
1807 y.ind[i] = sendindbuf[i];
1808 y.num[i] = sendnumbuf[i];
1809 }
1811 }
1812 delete [] sendcnt;
1813 return;
1814 }
1815 int * rdispls = new int[rowneighs];
1816 int * recvcnt = new int[rowneighs];
1817 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, RowWorld); // share the request counts
1818
1819 // receive displacements are exact whereas send displacements have slack
1820 rdispls[0] = 0;
1821 for(int i=0; i<rowneighs-1; ++i)
1822 {
1823 rdispls[i+1] = rdispls[i] + recvcnt[i];
1824 }
1825
1826 int totrecv = std::accumulate(recvcnt,recvcnt+rowneighs,0);
1828 OVT * recvnumbuf = new OVT[totrecv];
1829
1830#ifdef TIMING
1831 double t4=MPI_Wtime();
1832#endif
// The actual Alltoallv calls were on dropped lines in both branches.
1833 if(optbuf.totmax > 0 ) // graph500 optimization enabled
1834 {
1837 delete [] sendcnt;
1838 }
1839 else
1840 {
1844 }
1845#ifdef TIMING
1846 double t5=MPI_Wtime();
1848#endif
1849
1850#ifdef TIMING
1851 double t6=MPI_Wtime();
1852#endif
1853 //MergeContributions<SR>(y,recvcnt, rdispls, recvindbuf, recvnumbuf, rowneighs);
1854 // free memory of y, in case it was aliased
1855 std::vector<IU>().swap(y.ind);
1856 std::vector<OVT>().swap(y.num);
1857
// Build per-sender list views into the contiguous receive buffers.
1858 std::vector<int32_t *> indsvec(rowneighs);
1859 std::vector<OVT *> numsvec(rowneighs);
1860
1861#ifdef THREADED
1862#pragma omp parallel for
1863#endif
1864 for(int i=0; i<rowneighs; i++)
1865 {
1866 indsvec[i] = recvindbuf+rdispls[i];
1867 numsvec[i] = recvnumbuf+rdispls[i];
1868 }
1869#ifdef THREADED
1870 MergeContributions_threaded<SR>(recvcnt, indsvec, numsvec, y.ind, y.num, y.MyLocLength());
1871#else
1873#endif
1874
1876#ifdef TIMING
1877 double t7=MPI_Wtime();
1879#endif
1880
1881}
1882
1883
1884template <typename SR, typename IVT, typename OVT, typename IU, typename NUM, typename UDER>
1890
1891template <typename SR, typename IVT, typename OVT, typename IU, typename NUM, typename UDER>
1898
1899template <typename SR, typename IVT, typename OVT, typename IU, typename NUM, typename UDER>
1905
1906
1911template <typename SR, typename IU, typename NUM, typename UDER>
// Convenience SpMV overload for x of type FullyDistSpVec<IU,IU>: constructs
// the output vector and delegates (the signature line and the delegating call
// on original lines 1912-1913 and 1917 were dropped from this listing).
1914{
1915 typedef typename promote_trait<NUM,IU>::T_promote T_promote;
1916 FullyDistSpVec<IU, T_promote> y ( x.getcommgrid(), A.getnrow()); // identity doesn't matter for sparse vectors
1918 return y;
1919}
1920
1924template <typename SR, typename IU, typename NUM, typename NUV, typename UDER>
// Distributed SpMV with a DENSE vector: y = A * x over semiring SR.
// Pattern: transpose-exchange x with the diagonal neighbor, allgather along
// the processor column, serial local multiply into a dense accumulator, then
// row-wise reduction into the distributed output vector.
// NOTE(review): the extractor dropped the MPI_Status declaration, the
// allgather lines, the threaded gespmv branch, the post-multiply cleanup, the
// MPI_Comm_size for rowneighs, and the per-segment reduce call (line 1992).
1926 (const SpParMat<IU,NUM,UDER> & A, const FullyDistVec<IU,NUV> & x )
1927{
1928 typedef typename promote_trait<NUM,NUV>::T_promote T_promote;
1930
1931 MPI_Comm World = x.commGrid->GetWorld();
1932 MPI_Comm ColWorld = x.commGrid->GetColWorld();
1933 MPI_Comm RowWorld = x.commGrid->GetRowWorld();
1934
1935 int xsize = (int) x.LocArrSize();
1936 int trxsize = 0;
1937
// Swap local sizes, then local data, with the diagonal-complement rank.
1938 int diagneigh = x.commGrid->GetComplementRank();
1940 MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh, TRX, &trxsize, 1, MPI_INT, diagneigh, TRX, World, &status);
1941
1942 NUV * trxnums = new NUV[trxsize];
1943 MPI_Sendrecv(const_cast<NUV*>(SpHelper::p2a(x.arr)), xsize, MPIType<NUV>(), diagneigh, TRX, trxnums, trxsize, MPIType<NUV>(), diagneigh, TRX, World, &status);
1944
1945 int colneighs, colrank;
1946 MPI_Comm_size(ColWorld, &colneighs);
1947 MPI_Comm_rank(ColWorld, &colrank);
1948 int * colsize = new int[colneighs];
1949 colsize[colrank] = trxsize;
// (dropped line: allgather of per-process sizes into colsize)
1951 int * dpls = new int[colneighs](); // displacements (zero initialized pid)
1952 std::partial_sum(colsize, colsize+colneighs-1, dpls+1);
1953 int accsize = std::accumulate(colsize, colsize+colneighs, 0);
1954 NUV * numacc = new NUV[accsize];
1955
// (dropped line: allgatherv of trxnums into numacc)
1957 delete [] trxnums;
1958
1959 // serial SpMV with dense vector
1960 T_promote id = SR::id();
1961 IU ysize = A.getlocalrows();
1962 T_promote * localy = new T_promote[ysize];
// Initialize the accumulator with the semiring identity.
1963 std::fill_n(localy, ysize, id);
1964
1965#ifdef THREADED
1967#else
1968 dcsc_gespmv<SR>(*(A.spSeq), numacc, localy);
1969#endif
1970
1971
1973
1974 // FullyDistVec<IT,NT>(shared_ptr<CommGrid> grid, IT globallen, NT initval, NT id)
1975 FullyDistVec<IU, T_promote> y ( x.commGrid, A.getnrow(), id);
1976
1977 int rowneighs;
1979
// Reduce each row-neighbor's segment of localy into its owner's piece of y
// (the reduce call itself was on the dropped line 1992).
1980 IU begptr, endptr;
1981 for(int i=0; i< rowneighs; ++i)
1982 {
1983 begptr = y.RowLenUntil(i);
1984 if(i == rowneighs-1)
1985 {
1986 endptr = ysize;
1987 }
1988 else
1989 {
1990 endptr = y.RowLenUntil(i+1);
1991 }
1993 }
1994 delete [] localy;
1995 return y;
1996}
1997
1998
2004template <typename SR, typename IU, typename NUM, typename NUV, typename UDER>
// Legacy distributed SpMV with a SPARSE vector (pre-OptBuf path): transpose-
// exchange x, allgather along the processor column, local multiply, bucket
// results by destination row neighbor, alltoallv, then a SPA-style (dense
// accumulator + nonzero-index list) reduction into y.
// NOTE(review): the extractor dropped the signature line, MPI_Status, the
// allgather/allgatherv lines, the post-multiply cleanup, MPI_Comm_size for
// rowneighs, and the data Alltoallv lines (2110-2113). Code left verbatim.
2007{
2008 typedef typename promote_trait<NUM,NUV>::T_promote T_promote;
2010
2011 MPI_Comm World = x.commGrid->GetWorld();
2012 MPI_Comm ColWorld = x.commGrid->GetColWorld();
2013 MPI_Comm RowWorld = x.commGrid->GetRowWorld();
2014
2015 int xlocnz = (int) x.getlocnnz();
2016 int trxlocnz = 0;
2017 int roffst = x.RowLenUntil();
2018 int offset;
2019
2020 int diagneigh = x.commGrid->GetComplementRank();
2022 MPI_Sendrecv(&xlocnz, 1, MPI_INT, diagneigh, TRX, &trxlocnz, 1, MPI_INT, diagneigh, TRX, World, &status);
2023 MPI_Sendrecv(&roffst, 1, MPI_INT, diagneigh, TROST, &offset, 1, MPI_INT, diagneigh, TROST, World, &status);
2024
2025 IU * trxinds = new IU[trxlocnz];
2026 NUV * trxnums = new NUV[trxlocnz];
2027 MPI_Sendrecv(const_cast<IU*>(SpHelper::p2a(x.ind)), xlocnz, MPIType<IU>(), diagneigh, TRX, trxinds, trxlocnz, MPIType<IU>(), diagneigh, TRX, World, &status);
2028 MPI_Sendrecv(const_cast<NUV*>(SpHelper::p2a(x.num)), xlocnz, MPIType<NUV>(), diagneigh, TRX, trxnums, trxlocnz, MPIType<NUV>(), diagneigh, TRX, World, &status);
2029 std::transform(trxinds, trxinds+trxlocnz, trxinds, std::bind2nd(std::plus<IU>(), offset)); // fullydist indexing (n pieces) -> matrix indexing (sqrt(p) pieces)
2030
2031 int colneighs, colrank;
2032 MPI_Comm_size(ColWorld, &colneighs);
2033 MPI_Comm_rank(ColWorld, &colrank);
2034 int * colnz = new int[colneighs];
2035 colnz[colrank] = trxlocnz;
// (dropped line: allgather of per-process counts into colnz)
2037 int * dpls = new int[colneighs](); // displacements (zero initialized pid)
2038 std::partial_sum(colnz, colnz+colneighs-1, dpls+1);
2039 int accnz = std::accumulate(colnz, colnz+colneighs, 0);
2040 IU * indacc = new IU[accnz];
2041 NUV * numacc = new NUV[accnz];
2042
2043 // ABAB: Future issues here, colnz is of type int (MPI limitation)
2044 // What if the aggregate vector size along the processor row/column is not 32-bit addressible?
2048
2049 // serial SpMV with sparse vector
2050 std::vector< int32_t > indy;
2051 std::vector< T_promote > numy;
2052
// dcsc_gespmv takes 32-bit indices; downcast the gathered IU indices.
2053 int32_t * tmpindacc = new int32_t[accnz];
2054 for(int i=0; i< accnz; ++i) tmpindacc[i] = indacc[i];
2055 delete [] indacc;
2056
2057 dcsc_gespmv<SR>(*(A.spSeq), tmpindacc, numacc, accnz, indy, numy); // actual multiplication
2058
2061
2062 FullyDistSpVec<IU, T_promote> y ( x.commGrid, A.getnrow()); // identity doesn't matter for sparse vectors
2063 IU yintlen = y.MyRowLength();
2064
2065 int rowneighs;
2067 std::vector< std::vector<IU> > sendind(rowneighs);
2068 std::vector< std::vector<T_promote> > sendnum(rowneighs);
// Bucket each output nonzero by its owning row neighbor, converting to the
// owner's local index on the way.
2069 typename std::vector<int32_t>::size_type outnz = indy.size();
2070 for(typename std::vector<IU>::size_type i=0; i< outnz; ++i)
2071 {
2072 IU locind;
2073 int rown = y.OwnerWithinRow(yintlen, static_cast<IU>(indy[i]), locind);
2074 sendind[rown].push_back(locind);
2075 sendnum[rown].push_back(numy[i]);
2076 }
2077
2078 IU * sendindbuf = new IU[outnz];
2079 T_promote * sendnumbuf = new T_promote[outnz];
2080 int * sendcnt = new int[rowneighs];
2081 int * sdispls = new int[rowneighs];
2082 for(int i=0; i<rowneighs; ++i)
2083 sendcnt[i] = sendind[i].size();
2084
2085 int * rdispls = new int[rowneighs];
2086 int * recvcnt = new int[rowneighs];
2087 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, RowWorld); // share the request counts
2088
2089 sdispls[0] = 0;
2090 rdispls[0] = 0;
2091 for(int i=0; i<rowneighs-1; ++i)
2092 {
2093 sdispls[i+1] = sdispls[i] + sendcnt[i];
2094 rdispls[i+1] = rdispls[i] + recvcnt[i];
2095 }
2096 int totrecv = std::accumulate(recvcnt,recvcnt+rowneighs,0);
2097 IU * recvindbuf = new IU[totrecv];
2098 T_promote * recvnumbuf = new T_promote[totrecv];
2099
// Flatten the per-destination vectors into contiguous send buffers,
// releasing each vector's memory as soon as it is copied.
2100 for(int i=0; i<rowneighs; ++i)
2101 {
2102 std::copy(sendind[i].begin(), sendind[i].end(), sendindbuf+sdispls[i]);
2103 std::vector<IU>().swap(sendind[i]);
2104 }
2105 for(int i=0; i<rowneighs; ++i)
2106 {
2107 std::copy(sendnum[i].begin(), sendnum[i].end(), sendnumbuf+sdispls[i]);
2108 std::vector<T_promote>().swap(sendnum[i]);
2109 }
// (dropped lines 2110-2113: the index/value Alltoallv exchanges)
2114 DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
2115
2116 // define a SPA-like data structure
2117 IU ysize = y.MyLocLength();
2118 T_promote * localy = new T_promote[ysize];
2119 bool * isthere = new bool[ysize];
2120 std::vector<IU> nzinds; // nonzero indices
2121 std::fill_n(isthere, ysize, false);
2122
// First occurrence assigns; later occurrences combine via the semiring.
2123 for(int i=0; i< totrecv; ++i)
2124 {
2125 if(!isthere[recvindbuf[i]])
2126 {
2127 localy[recvindbuf[i]] = recvnumbuf[i]; // initial assignment
2128 nzinds.push_back(recvindbuf[i]);
2129 isthere[recvindbuf[i]] = true;
2130 }
2131 else
2132 {
2133 localy[recvindbuf[i]] = SR::add(localy[recvindbuf[i]], recvnumbuf[i]);
2134 }
2135 }
2136 DeleteAll(isthere, recvindbuf, recvnumbuf);
// Sort the nonzero indices so y's internal arrays stay index-ordered.
2137 sort(nzinds.begin(), nzinds.end());
2138 int nnzy = nzinds.size();
2139 y.ind.resize(nnzy);
2140 y.num.resize(nnzy);
2141 for(int i=0; i< nnzy; ++i)
2142 {
2143 y.ind[i] = nzinds[i];
2144 y.num[i] = localy[nzinds[i]];
2145 }
2146 delete [] localy;
2147 return y;
2148}
2149
2150// Aydin (June 2021):
2151// This currently duplicates the work of EWiseMult with exclude = true
2152// However, this is the right way of implementing it because it allows set difference when
2153// the types of two matrices do not have a valid multiplication operator defined
2154// set difference should not require such an operator so we will move all code
2155// bases that use EWiseMult(..., exclude=true) to this one
2156template <typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
// Distributed set difference A \ B (entries of A not present in B). Per the
// block comment above in the file, this duplicates EWiseMult(exclude=true)
// but needs no multiplication operator between NU1 and NU2.
// NOTE(review): the function-name line, the wrapping return (line 2162) and
// the error-path abort (line 2167) were dropped from this listing.
2158{
2159 if(*(A.commGrid) == *(B.commGrid))
2160 {
// Delegate to the sequential SetDifference on the local submatrices.
2161 UDERA * result = new UDERA( SetDifference(*(A.spSeq),*(B.spSeq)));
2163 }
2164 else
2165 {
2166 std::cout << "Grids are not comparable for set difference" << std::endl;
2168 return SpParMat< IU,NU1,UDERA >();
2169 }
2170
2171}
2172
2173template <typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
// Distributed elementwise multiply of A and B; when 'exclude' is true, keeps
// entries of A whose position is NOT occupied in B (set-difference semantics).
// NOTE(review): the promoted-type typedef lines (2177-2178), the wrapping
// return (2183), and the error-path return/abort (2188-2189) were dropped
// from this listing.
2175 (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B , bool exclude)
2176{
2179
2180 if(*(A.commGrid) == *(B.commGrid))
2181 {
// Delegate to the sequential EWiseMult on the local submatrices.
2182 DER_promote * result = new DER_promote( EWiseMult(*(A.spSeq),*(B.spSeq),exclude) );
2184 }
2185 else
2186 {
2187 std::cout << "Grids are not comparable elementwise multiplication" << std::endl;
2190 }
2191}
2192
2193template <typename RETT, typename RETDER, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB, typename _BinaryOperation>
// Distributed elementwise apply: combines co-located entries of A and B with
// __binary_op; 'notB'/'defaultBVal' control behavior where B has no entry.
// NOTE(review): the signature line (2194-2195), wrapping return (2200), and
// error-path return/abort (2205-2206) were dropped from this listing.
2196{
2197 if(*(A.commGrid) == *(B.commGrid))
2198 {
2199 RETDER * result = new RETDER( EWiseApply<RETT>(*(A.spSeq),*(B.spSeq), __binary_op, notB, defaultBVal) );
2201 }
2202 else
2203 {
2204 std::cout << "Grids are not comparable elementwise apply" << std::endl;
2207 }
2208}
2209
2210template <typename RETT, typename RETDER, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB, typename _BinaryOperation, typename _BinaryPredicate>
// Predicated variant of EWiseApply (adds a _BinaryPredicate template
// parameter; presumably intersection-controlling — see the sequential
// EWiseApply overloads).
// NOTE(review): the signature lines, the delegating call and return
// (2216-2217), and the error-path return/abort (2222-2223) were dropped from
// this listing; only the grid check and error message remain visible.
2213{
2214 if(*(A.commGrid) == *(B.commGrid))
2215 {
2218 }
2219 else
2220 {
2221 std::cout << "Grids are not comparable elementwise apply" << std::endl;
2224 }
2225}
2226
2227// plain adapter
2228template <typename RETT, typename RETDER, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB, typename _BinaryOperation, typename _BinaryPredicate>
2237// end adapter
2238
// EWiseMult (sparse vector V x dense vector W):
//   exclude=false: Product[i] = V[i] * W[i] for each nonzero i of V where W[i] != zero.
//   exclude=true : keep V's nonzeros where W[i] == zero (values copied, not multiplied).
// NOTE(review): hyperlinked lines were dropped by extraction — the signature piece at 2244,
// the declaration of the returned 'Product' vector at 2251, the tlsize computation at 2268,
// and the early returns at 2255 and 2323-2324. Consult the original ParFriends.h.
2243template <typename IU, typename NU1, typename NU2>
2245 (const FullyDistSpVec<IU,NU1> & V, const FullyDistVec<IU,NU2> & W , bool exclude, NU2 zero)
2246{
2247 typedef typename promote_trait<NU1,NU2>::T_promote T_promote;
2248
 // Vectors must live on the same process grid and have matching global lengths.
2249 if(*(V.commGrid) == *(W.commGrid))
2250 {
2252 if(V.glen != W.glen)
2253 {
2254 std::cerr << "Vector dimensions don't match for EWiseMult\n";
2256 }
2257 else
2258 {
2259 Product.glen = V.glen;
2260 IU size= V.getlocnnz();
2261 if(exclude)
2262 {
 // Experimental OpenMP path: each split collects matches into thread-local
 // buffers, then a prefix sum over per-split counts places them contiguously.
2263 #if defined(_OPENMP) && defined(CBLAS_EXPERIMENTAL) // not faster than serial
2264 int actual_splits = cblas_splits * 1; // 1 is the parallel slackness
2265 std::vector <IU> tlosizes (actual_splits, 0);
2266 std::vector < std::vector<IU> > tlinds(actual_splits);
2267 std::vector < std::vector<T_promote> > tlnums(actual_splits);
2269 #pragma omp parallel for //schedule(dynamic, 1)
2270 for(IU t = 0; t < actual_splits; ++t)
2271 {
2272 IU tlbegin = t*tlsize;
2273 IU tlend = (t==actual_splits-1)? size : (t+1)*tlsize;
2274 for(IU i=tlbegin; i<tlend; ++i)
2275 {
2276 if(W.arr[V.ind[i]] == zero) // keep only those
2277 {
2278 tlinds[t].push_back(V.ind[i]);
2279 tlnums[t].push_back(V.num[i]);
2280 tlosizes[t]++;
2281 }
2282 }
2283 }
2284 std::vector<IU> prefix_sum(actual_splits+1,0);
2285 std::partial_sum(tlosizes.begin(), tlosizes.end(), prefix_sum.begin()+1);
2286 Product.ind.resize(prefix_sum[actual_splits]);
2287 Product.num.resize(prefix_sum[actual_splits]);
2288
2289 #pragma omp parallel for //schedule(dynamic, 1)
2290 for(IU t=0; t< actual_splits; ++t)
2291 {
2292 std::copy(tlinds[t].begin(), tlinds[t].end(), Product.ind.begin()+prefix_sum[t]);
2293 std::copy(tlnums[t].begin(), tlnums[t].end(), Product.num.begin()+prefix_sum[t]);
2294 }
2295 #else
 // Serial exclude path: keep V's entries wherever the dense vector holds 'zero'.
2296 for(IU i=0; i<size; ++i)
2297 {
2298 if(W.arr[V.ind[i]] == zero) // keep only those
2299 {
2300 Product.ind.push_back(V.ind[i]);
2301 Product.num.push_back(V.num[i]);
2302 }
2303 }
2304 #endif
2305 }
2306 else
2307 {
 // Multiply path: emit V[i]*W[i] wherever the dense entry is not 'zero'.
2308 for(IU i=0; i<size; ++i)
2309 {
2310 if(W.arr[V.ind[i]] != zero) // keep only those
2311 {
2312 Product.ind.push_back(V.ind[i]);
2313 Product.num.push_back(V.num[i] * W.arr[V.ind[i]]);
2314 }
2315 }
2316 }
2317 }
2318 return Product;
2319 }
2320 else
2321 {
2322 std::cout << "Grids are not comparable elementwise multiplication" << std::endl;
2325 }
2326}
2327
2328
// EWiseApply_threaded (sparse V, dense W): applies _binary_op at every index selected
// by _doOp, producing a sparse RET-valued vector. Work is partitioned among OpenMP
// threads; each thread appends to a private (tProductInd, tProductVal) pair, and the
// pairs are concatenated into Product using the tdisp prefix sums at the end.
// NOTE(review): extraction dropped hyperlinked lines — the signature (2333-2334),
// Product's declaration (2339), the per-thread chunk setup (2366/2368 and 2378-2379
// defining tStartIdx/tNextIdx), the sparse-iteration loop header (2414), and the
// returns (2345, 2454-2455). Consult the original header before editing.
2332template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
2335{
2336 typedef RET T_promote; //typedef typename promote_trait<NU1,NU2>::T_promote T_promote;
2337 if(*(V.commGrid) == *(W.commGrid))
2338 {
2340 if(V.TotalLength() != W.TotalLength())
2341 {
2342 std::ostringstream outs;
2343 outs << "Vector dimensions don't match (" << V.TotalLength() << " vs " << W.TotalLength() << ") for EWiseApply (short version)\n";
2344 SpParHelper::Print(outs.str());
2346 }
2347 else
2348 {
 // Discover the OpenMP team size up front so per-thread buffers can be sized.
2349 int nthreads=1;
2350#ifdef _OPENMP
2351#pragma omp parallel
2352 {
2353 nthreads = omp_get_num_threads();
2354 }
2355#endif
2356
2357 Product.glen = V.glen;
2358 IU size= W.LocArrSize();
2359 IU spsize = V.getlocnnz();
2360
2361 // temporary result vectors per thread
2362 std::vector<std::vector<IU>> tProductInd(nthreads);
2363 std::vector<std::vector<T_promote>> tProductVal(nthreads);
2364 IU perthread; //chunk of tProductInd or tProductVal allocated to each thread
2365 if (allowVNulls)
2367 else
2369
2370#ifdef _OPENMP
2371#pragma omp parallel
2372#endif
2373 {
2374 int curthread = 0;
2375#ifdef _OPENMP
2376 curthread = omp_get_thread_num();
2377#endif
2380
2381 if (allowVNulls)
2382 {
 // Last thread absorbs the remainder of the dense range.
2383 if(curthread == nthreads-1) tNextIdx = size;
2384
2385 // get sparse part for the current thread
2386 auto it = std::lower_bound (V.ind.begin(), V.ind.end(), tStartIdx);
2387 IU tSpIdx = (IU) std::distance(V.ind.begin(), it);
2388
2389 // iterate over the dense vector
2390 for(IU tIdx=tStartIdx; tIdx < tNextIdx; ++tIdx)
2391 {
 // Sparse entry present at this index: apply op on (V[i], W[i]).
2392 if(tSpIdx < spsize && V.ind[tSpIdx] < tNextIdx && V.ind[tSpIdx] == tIdx)
2393 {
2394 if (_doOp(V.num[tSpIdx], W.arr[tIdx], false, false))
2395 {
2396 tProductInd[curthread].push_back(tIdx);
2397 tProductVal[curthread].push_back (_binary_op(V.num[tSpIdx], W.arr[tIdx], false, false))
2398 }
2399 tSpIdx++;
2400 }
2401 else
2402 {
 // V is null here: substitute Vzero (third argument 'true' flags the null).
2403 if (_doOp(Vzero, W.arr[tIdx], true, false))
2404 {
2405 tProductInd[curthread].push_back(tIdx);
2406 tProductVal[curthread].push_back (_binary_op(Vzero, W.arr[tIdx], true, false));
2407 }
2408 }
2409 }
2410 }
2411 else // iterate over the sparse vector
2412 {
2413 if(curthread == nthreads-1) tNextIdx = spsize;
2415 {
2416 if (_doOp(V.num[tSpIdx], W.arr[V.ind[tSpIdx]], false, false))
2417 {
2418
2419 tProductInd[curthread].push_back( V.ind[tSpIdx]);
2420 tProductVal[curthread].push_back (_binary_op(V.num[tSpIdx], W.arr[V.ind[tSpIdx]], false, false));
2421 }
2422 }
2423 }
2424 }
2425
 // Exclusive prefix sum of per-thread output counts -> placement offsets.
2426 std::vector<IU> tdisp(nthreads+1);
2427 tdisp[0] = 0;
2428 for(int i=0; i<nthreads; ++i)
2429 {
2430 tdisp[i+1] = tdisp[i] + tProductInd[i].size();
2431 }
2432
2433 // copy results from temporary vectors
2434 Product.ind.resize(tdisp[nthreads]);
2435 Product.num.resize(tdisp[nthreads]);
2436
2437#ifdef _OPENMP
2438#pragma omp parallel
2439#endif
2440 {
2441 int curthread = 0;
2442#ifdef _OPENMP
2443 curthread = omp_get_thread_num();
2444#endif
2445 std::copy(tProductInd[curthread].begin(), tProductInd[curthread].end(), Product.ind.data() + tdisp[curthread]);
2446 std::copy(tProductVal[curthread].begin() , tProductVal[curthread].end(), Product.num.data() + tdisp[curthread]);
2447 }
2448 }
2449 return Product;
2450 }
2451 else
2452 {
2453 std::cout << "Grids are not comparable for EWiseApply" << std::endl;
2456 }
2457}
2458
2459
2460
// EWiseApply (sparse V, dense W — dispatching version): when built with OpenMP this
// forwards to EWiseApply_threaded (the forwarding call at 2484-2485 was dropped from
// this listing); otherwise it runs the serial merge below. allowVNulls selects whether
// to walk the full dense vector (substituting Vzero where V has no entry) or only V's
// nonzeros. _doOp filters, _binary_op computes each emitted value.
// NOTE(review): the signature (2479-2480), Product's declaration (2490), and the
// returns (2497, 2548-2549) are also missing from this extracted listing.
2478template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
2481{
2482
2483#ifdef _OPENMP
2485
2486#else
2487 typedef RET T_promote; //typedef typename promote_trait<NU1,NU2>::T_promote T_promote;
2488 if(*(V.commGrid) == *(W.commGrid))
2489 {
2491 //FullyDistVec< IU, NU1> DV (V); // Ariful: I am not sure why it was there??
2492 if(V.TotalLength() != W.TotalLength())
2493 {
2494 std::ostringstream outs;
2495 outs << "Vector dimensions don't match (" << V.TotalLength() << " vs " << W.TotalLength() << ") for EWiseApply (short version)\n";
2496 SpParHelper::Print(outs.str());
2498 }
2499 else
2500 {
2501 Product.glen = V.glen;
2502 IU size= W.LocArrSize();
2503 IU spsize = V.getlocnnz();
2504 IU sp_iter = 0;
2505 if (allowVNulls)
2506 {
2507 // iterate over the dense vector
2508 for(IU i=0; i<size; ++i)
2509 {
 // sp_iter tracks V's nonzeros in lockstep with the dense scan.
2510 if(sp_iter < spsize && V.ind[sp_iter] == i)
2511 {
2512 if (_doOp(V.num[sp_iter], W.arr[i], false, false))
2513 {
2514 Product.ind.push_back(i);
2515 Product.num.push_back(_binary_op(V.num[sp_iter], W.arr[i], false, false));
2516 }
2517 sp_iter++;
2518 }
2519 else
2520 {
 // No sparse entry at i: substitute Vzero ('true' flags the V-null).
2521 if (_doOp(Vzero, W.arr[i], true, false))
2522 {
2523 Product.ind.push_back(i);
2524 Product.num.push_back(_binary_op(Vzero, W.arr[i], true, false));
2525 }
2526 }
2527 }
2528 }
2529 else
2530 {
2531 // iterate over the sparse vector
2532 for(sp_iter = 0; sp_iter < spsize; ++sp_iter)
2533 {
2534 if (_doOp(V.num[sp_iter], W.arr[V.ind[sp_iter]], false, false))
2535 {
2536 Product.ind.push_back(V.ind[sp_iter]);
2537 Product.num.push_back(_binary_op(V.num[sp_iter], W.arr[V.ind[sp_iter]], false, false));
2538 }
2539 }
2540
2541 }
2542 }
2543 return Product;
2544 }
2545 else
2546 {
2547 std::cout << "Grids are not comparable for EWiseApply" << std::endl;
2550 }
2551#endif
2552}
2553
2554
2555
// EWiseApply (sparse V, sparse W — full version): an ordered merge over the two sorted
// index lists. Three cases per step: indices match (emit if allowIntersect), V-only
// (emit with Wzero if allowWNulls), W-only (emit with Vzero if allowVNulls). The two
// trailing loops drain whichever vector still has entries. _doOp filters candidate
// positions; _binary_op computes the emitted value; the bool flags mark which side is null.
// NOTE(review): the signature (2580-2581), Product's declaration (2587), and the
// returns (2593, 2671-2672) were dropped from this extracted listing.
2579template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
2582{
2583
2584 typedef RET T_promote; // typename promote_trait<NU1,NU2>::T_promote T_promote;
2585 if(*(V.commGrid) == *(W.commGrid))
2586 {
2588 if(V.glen != W.glen)
2589 {
2590 std::ostringstream outs;
2591 outs << "Vector dimensions don't match (" << V.glen << " vs " << W.glen << ") for EWiseApply (full version)\n";
2592 SpParHelper::Print(outs.str());
2594 }
2595 else
2596 {
2597 Product.glen = V.glen;
2598 typename std::vector< IU >::const_iterator indV = V.ind.begin();
2599 typename std::vector< NU1 >::const_iterator numV = V.num.begin();
2600 typename std::vector< IU >::const_iterator indW = W.ind.begin();
2601 typename std::vector< NU2 >::const_iterator numW = W.num.begin();
2602
2603 while (indV < V.ind.end() && indW < W.ind.end())
2604 {
2605 if (*indV == *indW)
2606 {
2607 // overlap
2608 if (allowIntersect)
2609 {
2610 if (_doOp(*numV, *numW, false, false))
2611 {
2612 Product.ind.push_back(*indV);
2613 Product.num.push_back(_binary_op(*numV, *numW, false, false));
2614 }
2615 }
2616 indV++; numV++;
2617 indW++; numW++;
2618 }
2619 else if (*indV < *indW)
2620 {
2621 // V has value but W does not
2622 if (allowWNulls)
2623 {
2624 if (_doOp(*numV, Wzero, false, true))
2625 {
2626 Product.ind.push_back(*indV);
2627 Product.num.push_back(_binary_op(*numV, Wzero, false, true));
2628 }
2629 }
2630 indV++; numV++;
2631 }
2632 else //(*indV > *indW)
2633 {
2634 // W has value but V does not
2635 if (allowVNulls)
2636 {
2637 if (_doOp(Vzero, *numW, true, false))
2638 {
2639 Product.ind.push_back(*indW);
2640 Product.num.push_back(_binary_op(Vzero, *numW, true, false));
2641 }
2642 }
2643 indW++; numW++;
2644 }
2645 }
2646 // clean up
2647 while (allowWNulls && indV < V.ind.end())
2648 {
2649 if (_doOp(*numV, Wzero, false, true))
2650 {
2651 Product.ind.push_back(*indV);
2652 Product.num.push_back(_binary_op(*numV, Wzero, false, true));
2653 }
2654 indV++; numV++;
2655 }
2656 while (allowVNulls && indW < W.ind.end())
2657 {
2658 if (_doOp(Vzero, *numW, true, false))
2659 {
2660 Product.ind.push_back(*indW);
2661 Product.num.push_back(_binary_op(Vzero, *numW, true, false));
2662 }
2663 indW++; numW++;
2664 }
2665 }
2666 return Product;
2667 }
2668 else
2669 {
2670 std::cout << "Grids are not comparable for EWiseApply" << std::endl;
2673 }
2674}
2675
2676// plain callback versions
2677template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
2688
2689
2690
2691template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
2700
2701
2702
2703
2704
2708// sampling-based nnz estimation via SpMV
2709// @OGUZ-NOTE This is not based on SUMMA, do not use. Estimates the number of
2710// nonzeros in the final output matrix.
2711
2712
2713#define NROUNDS 5
2714typedef std::array<float, NROUNDS> samparr_t;
2715
// NOTE(review): a template specialization whose declaration lines (2717, 2719) were
// dropped from this extracted listing — likely a promote_trait<> mapping involving
// samparr_t for the sampling SpMV below; confirm against the original header.
2716template <typename NZT>
2718{
2720};
2721
2722
2723
2725{
2726public:
2727 template<typename c, typename t, typename V>
2728 void save(std::basic_ostream<c, t> &os,
2729 std::array<V, NROUNDS> &sample_vec,
2730 int64_t index)
2731 {
2732 for (auto it = sample_vec.begin(); it != sample_vec.end(); ++it)
2733 os << *it << " ";
2734 }
2735};
2736
2737
2738
2739template<typename NZT>
2741{
2742 static samparr_t id()
2743 {
2744 samparr_t arr;
2745 for (auto it = arr.begin(); it != arr.end(); ++it)
2746 *it = std::numeric_limits<float>::max();
2747 return arr;
2748 }
2749
2750
/// Always false: this operation never requests early termination from the caller.
static bool returnedSAID() { return false; }
2755
2756
 // Semiring "add": elementwise minimum of the two NROUNDS-long sample arrays.
 // NOTE(review): the parameter list line (2758: presumably
 // "add (const samparr_t &arg1, const samparr_t &arg2)") was dropped from this listing.
2757 static samparr_t
2759 {
2760 samparr_t out;
2761 for (int i = 0; i < NROUNDS; ++i)
2762 out[i] = std::min(arg1[i], arg2[i]);
2763 return out;
2764 }
2765
2766
 // Semiring "multiply": ignores the matrix value and forwards the sample array
 // unchanged, so SpMV propagates sample minima along the sparsity structure.
 // NOTE(review): the parameter list line (2768) was dropped from this listing.
2767 static samparr_t
2769 {
2770 return arg2;
2771 }
2772
2773
2774 static void axpy (const NZT a, const samparr_t &x, samparr_t &y)
2775 {
2776 y = add(y, multiply(a, x));
2777 }
2778
2779
 // Lazily creates (once) and returns the custom MPI reduction operator backed by
 // MPI_func. NOTE(review): the function header line (2780) was dropped from this
 // listing. The function-local statics make creation happen only on the first call;
 // note the created MPI_Op is never freed with MPI_Op_free.
2781 {
2782 static MPI_Op mpiop;
2783 static bool exists = false;
2784 if (exists)
2785 return mpiop;
2786 else
2787 {
 // 'true' declares the operation commutative to MPI.
2788 MPI_Op_create(MPI_func, true, &mpiop);
2789 exists = true;
2790 return mpiop;
2791 }
2792 }
2793
2794
 // MPI user-defined reduction callback: combines each pair of sample arrays with
 // add() (elementwise min), writing the result back into inoutvec as MPI requires.
 // NOTE(review): the parameter list line (2796: the standard MPI_User_function
 // signature taking invec/inoutvec/len/datatype) was dropped from this listing.
2795 static void
2797 {
2798 samparr_t *in = static_cast<samparr_t *>(invec);
2799 samparr_t *inout = static_cast<samparr_t *>(inoutvec);
2800 for (int i = 0; i < *len; ++i)
2801 inout[i] = add(inout[i], in[i]);
2802 }
2803};
2804
2805
2806
// Sampling-based estimator for nnz(A*B) via SpMV over the min-of-exponentials
// semiring above: each column starts with NROUNDS exponential(lambda) samples, two
// SpMV passes propagate elementwise minima through B^T then A^T, and each surviving
// minimum m contributes (NROUNDS-1)/sum(m) to the estimate — the classic minimum-of-
// exponentials cardinality sketch. Flagged above as not SUMMA-based: do not use.
// NOTE(review): extraction dropped the function name/parameters (2810-2811), the two
// SpMV calls producing samples_mid/samples_final (2863-2864, 2872-2873), and the
// MPI_Allreduce that totals nnzest into nnzC_tot (2902/2904). Consult the original.
2807template <typename IU, typename NU1, typename NU2,
2808 typename UDERA, typename UDERB>
2809int64_t
2812 )
2813{
2814 int myrank;
2815 MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
2816 float lambda = 1.0f;
2817
2818 int nthds = 1;
2819 #ifdef THREADED
2820 #pragma omp parallel
2821 #endif
2822 {
2823 nthds = omp_get_num_threads();
2824 }
2825
2826 if (myrank == 0)
2827 std::cout << "taking transposes." << std::endl;
2828
 // The propagation below runs on the transposed operands; reverted at the end.
2829 A.Transpose();
2830 B.Transpose();
2831
2832 if (myrank == 0)
2833 std::cout << "setting initial samples." << std::endl;
2834
2835 samparr_t sa;
2836 FullyDistVec<IU, samparr_t> samples_init(A.getcommgrid(), A.getncol(), sa);
2837
2838 #ifdef THREADED
2839 #pragma omp parallel
2840 #endif
2841 {
 // Per-thread RNG; each local element gets NROUNDS fresh exponential draws.
2842 std::default_random_engine gen;
2843 std::exponential_distribution<float> exp_dist(lambda);
2844
2845 #ifdef THREADED
2846 #pragma omp parallel for
2847 #endif
2848 for (IU i = 0; i < samples_init.LocArrSize(); ++i)
2849 {
2850 samparr_t tmp;
2851 for (auto it = tmp.begin(); it != tmp.end(); ++it)
2852 *it = exp_dist(gen);
2853 samples_init.SetLocalElement(i, tmp);
2854 }
2855 }
2856
2857 // std::string fname("samples_init");
2858 // samples_init.ParallelWrite(fname, 1, SamplesSaveHandler(), true);
2859
2860 if (myrank == 0)
2861 std::cout << "computing mid samples." << std::endl;
2862
2865
2866 // fname = "samples_mid";
2867 // samples_mid.ParallelWrite(fname, 1, SamplesSaveHandler(), true);
2868
2869 if (myrank == 0)
2870 std::cout << "computing final samples." << std::endl;
2871
2874
2875 // fname = "samples_final";
2876 // samples_final.ParallelWrite(fname, 1, SamplesSaveHandler(), true);
2877
2878 if (myrank == 0)
2879 std::cout << "computing nnz estimation." << std::endl;
2880
2881 float nnzest = 0.0f;
2882
2883 std::cout << myrank << "samples_final loc size: "
2884 << samples_final.LocArrSize() << std::endl;
2885
2886 const samparr_t *lsamples = samples_final.GetLocArr();
2887
2888 #ifdef THREADED
2889 #pragma omp parallel for reduction (+:nnzest)
2890 #endif
2891 for (IU i = 0; i < samples_final.LocArrSize(); ++i)
2892 {
 // Unbiased estimator from NROUNDS exponential minima: (NROUNDS-1)/sum(samples).
2893 float tmp = 0.0f;
2894 for (auto it = lsamples[i].begin(); it != lsamples[i].end(); ++it)
2895 tmp += *it;
2896 nnzest += static_cast<float>(NROUNDS - 1) / tmp;
2897 }
2898
2899 if (myrank == 0)
2900 std::cout << "taking transposes again." << std::endl;
2901
2903 int64_t nnzC_tot = 0;
2905 (B.commGrid)->GetWorld());
2906
2907 if (myrank == 0)
2908 std::cout << "sampling-based spmv est tot: " << nnzC_tot << std::endl;
2909
2910 // revert back
2911 A.Transpose();
2912 B.Transpose();
2913
2914 return nnzC_tot;
2915
2916}
2917
// 3D sparse matrix-matrix multiply: runs 2D SUMMA inside each layer (stage-wise
// broadcasts of A along process rows and B along process columns, local SpGEMM,
// multiway hash merge), then reduces partial results across the fiber dimension via
// MPI_Alltoallv of column-split tuple chunks, merges the received chunks, and wraps
// the local result in a new SpParMat3D.
// NOTE(review): this extracted listing dropped several hyperlinked lines — the
// function signature (2919), the error-path return (2938), the divisions3d vector
// declaration (2944), the C_cont declaration for the local multiply (3049), the
// MPI_tuple declaration/commit (3097/3099), the divisions3dPrefixSum declaration
// (3113) and the comparator/start_col/end_col setup (3116, 3121-3122), and the
// recvChunks-related cleanup line (3168/3199). Consult the original ParFriends.h.
2918template <typename SR, typename NUO, typename UDERO, typename IU, typename NU1, typename NU2, typename UDER1, typename UDER2>
2920 int myrank;
2921 MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
2922 typedef typename UDERO::LocalIT LIC;
2923 typedef typename UDER1::LocalIT LIA;
2924 typedef typename UDER2::LocalIT LIB;
2925
2926#ifdef TIMING
2927 double t0, t1, t2, t3;
2928#endif
2929
2930 /*
2931 * Check if A and B are multipliable
2932 * */
2933 if(A.getncol() != B.getnrow()){
2934 std::ostringstream outs;
2935 outs << "Can not multiply, dimensions does not match"<< std::endl;
2936 outs << A.getncol() << " != " << B.getnrow() << std::endl;
2937 SpParHelper::Print(outs.str());
2939 }
2940
2941 /*
2942 * Calculate, across fibers, which process should get how many columns after redistribution
2943 * */
2945 // Calculate split boundaries as if all contents of the layer is being re-distributed along fiber
2946 // These boundaries will be used later on
2947 B.CalculateColSplitDistributionOfLayer(divisions3d);
2948
2949#ifdef TIMING
2950 t0 = MPI_Wtime();
2951#endif
2952 /*
2953 * SUMMA Starts
2954 * */
2955
2956 int stages, dummy; // last two parameters of ProductGrid are ignored for this multiplication
2957 std::shared_ptr<CommGrid> GridC = ProductGrid((A.GetLayerMat()->getcommgrid()).get(),
2958 (B.GetLayerMat()->getcommgrid()).get(),
2959 stages, dummy, dummy);
2960 IU C_m = A.GetLayerMat()->seqptr()->getnrow();
2961 IU C_n = B.GetLayerMat()->seqptr()->getncol();
2962
2963 IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERO::esscount, stages);
2964 IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERO::esscount, stages);
2965
2966 SpParHelper::GetSetSizes( *(A.GetLayerMat()->seqptr()), ARecvSizes, (A.GetLayerMat()->getcommgrid())->GetRowWorld() );
2967 SpParHelper::GetSetSizes( *(B.GetLayerMat()->seqptr()), BRecvSizes, (B.GetLayerMat()->getcommgrid())->GetColWorld() );
2968
2969 // Remotely fetched matrices are stored as pointers
2970 UDERO * ARecv;
2971 UDER2 * BRecv;
2972 std::vector< SpTuples<IU,NUO> *> tomerge;
2973
2974 int Aself = (A.GetLayerMat()->getcommgrid())->GetRankInProcRow();
2975 int Bself = (B.GetLayerMat()->getcommgrid())->GetRankInProcCol();
2976
2977 double Abcast_time = 0;
2978 double Bbcast_time = 0;
2979 double Local_multiplication_time = 0;
2980
 // Stage-wise SUMMA: at stage i, the i-th row/column processes broadcast their pieces.
2981 for(int i = 0; i < stages; ++i) {
2982 std::vector<IU> ess;
2983
2984 if(i == Aself){
2985 ARecv = A.GetLayerMat()->seqptr(); // shallow-copy
2986 }
2987 else{
2988 ess.resize(UDER1::esscount);
2989 for(int j=0; j<UDER1::esscount; ++j) {
2990 ess[j] = ARecvSizes[j][i]; // essentials of the ith matrix in this row
2991 }
2992 ARecv = new UDER1(); // first, create the object
2993 }
2994#ifdef TIMING
2995 t2 = MPI_Wtime();
2996#endif
2997 if (Aself != i) {
2998 ARecv->Create(ess);
2999 }
3000
3001 Arr<IU,NU1> Aarrinfo = ARecv->GetArrays();
3002
3003 for(unsigned int idx = 0; idx < Aarrinfo.indarrs.size(); ++idx) {
3004 MPI_Bcast(Aarrinfo.indarrs[idx].addr, Aarrinfo.indarrs[idx].count, MPIType<IU>(), i, GridC->GetRowWorld());
3005 }
3006
3007 for(unsigned int idx = 0; idx < Aarrinfo.numarrs.size(); ++idx) {
3008 MPI_Bcast(Aarrinfo.numarrs[idx].addr, Aarrinfo.numarrs[idx].count, MPIType<NU1>(), i, GridC->GetRowWorld());
3009 }
3010#ifdef TIMING
3011 t3 = MPI_Wtime();
3012 Abcast_time += (t3-t2);
3013#endif
3014 ess.clear();
3015 if(i == Bself){
3016 BRecv = B.GetLayerMat()->seqptr(); // shallow-copy
3017 }
3018 else{
3019 ess.resize(UDER2::esscount);
3020 for(int j=0; j<UDER2::esscount; ++j) {
3021 ess[j] = BRecvSizes[j][i];
3022 }
3023 BRecv = new UDER2();
3024 }
3025
3026 MPI_Barrier(A.GetLayerMat()->getcommgrid()->GetWorld());
3027#ifdef TIMING
3028 t2 = MPI_Wtime();
3029#endif
3030 if (Bself != i) {
3031 BRecv->Create(ess);
3032 }
3033 Arr<IU,NU2> Barrinfo = BRecv->GetArrays();
3034
3035 for(unsigned int idx = 0; idx < Barrinfo.indarrs.size(); ++idx) {
3036 MPI_Bcast(Barrinfo.indarrs[idx].addr, Barrinfo.indarrs[idx].count, MPIType<IU>(), i, GridC->GetColWorld());
3037 }
3038 for(unsigned int idx = 0; idx < Barrinfo.numarrs.size(); ++idx) {
3039 MPI_Bcast(Barrinfo.numarrs[idx].addr, Barrinfo.numarrs[idx].count, MPIType<NU2>(), i, GridC->GetColWorld());
3040 }
3041#ifdef TIMING
3042 t3 = MPI_Wtime();
3043 Bbcast_time += (t3-t2);
3044#endif
3045
3046#ifdef TIMING
3047 t2 = MPI_Wtime();
3048#endif
 // Local SpGEMM of the two broadcast pieces (the C_cont declaration at 3049 was dropped).
3050 (*ARecv, *BRecv, // parameters themselves
3051 i != Aself, // 'delete A' condition
3052 i != Bself, // 'delete B' condition
3053 false); // not to sort each column
3054#ifdef TIMING
3055 t3 = MPI_Wtime();
3057#endif
3058
3059 if(!C_cont->isZero()) tomerge.push_back(C_cont);
3060
3061 }
3062
3063 SpHelper::deallocate2D(ARecvSizes, UDER1::esscount);
3064 SpHelper::deallocate2D(BRecvSizes, UDER2::esscount);
3065
3066#ifdef TIMING
3067 t2 = MPI_Wtime();
3068#endif
3069 SpTuples<IU,NUO> * C_tuples = MultiwayMergeHash<SR>(tomerge, C_m, C_n, true, false); // Delete input arrays and do not sort
3070 //SpTuples<IU,NUO> * C_tuples = MultiwayMergeHashSliding<SR>(tomerge, C_m, C_n, true, false); // Delete input arrays and do not sort
3071#ifdef TIMING
3072 t3 = MPI_Wtime();
3073#endif
3074
3075#ifdef TIMING
3076 if(myrank == 0){
3077 fprintf(stderr, "[SUMMA3D]\tAbcast_time: %lf\n", Abcast_time);
3078 fprintf(stderr, "[SUMMA3D]\tBbcast_time: %lf\n", Bbcast_time);
3079 fprintf(stderr, "[SUMMA3D]\tLocal_multiplication_time: %lf\n", Local_multiplication_time);
3080 fprintf(stderr, "[SUMMA3D]\tMerge_layer_time: %lf\n", (t3-t2));
3081 }
3082#endif
3083 /*
3084 * SUMMA Ends
3085 * */
3086#ifdef TIMING
3087 t1 = MPI_Wtime();
3088 if(myrank == 0) fprintf(stderr, "[SUMMA3D]\tSUMMA time: %lf\n", (t1-t0));
3089#endif
3090 /*
3091 * 3d-reduction starts
3092 * */
3093#ifdef TIMING
3094 //MPI_Barrier(getcommgrid3D()->GetWorld());
3095 t0 = MPI_Wtime();
3096#endif
3098 MPI_Type_contiguous(sizeof(std::tuple<LIC,LIC,NUO>), MPI_CHAR, &MPI_tuple);
3100
3101 /*
3102 * Create a profile with information regarding data to be sent and received between layers
3103 * These allocations need to be `int` specifically because some of these arrays are used in communication
3104 * This requirement comes from MPI, as MPI_Alltoallv takes pointers to int exclusively as count and displacement
3105 * */
3106 int * sendcnt = new int[A.getcommgrid3D()->GetGridLayers()];
3107 int * sendprfl = new int[A.getcommgrid3D()->GetGridLayers()*3];
3108 int * sdispls = new int[A.getcommgrid3D()->GetGridLayers()]();
3109 int * recvcnt = new int[A.getcommgrid3D()->GetGridLayers()];
3110 int * recvprfl = new int[A.getcommgrid3D()->GetGridLayers()*3];
3111 int * rdispls = new int[A.getcommgrid3D()->GetGridLayers()]();
3112
3114 divisions3dPrefixSum[0] = 0;
3115 std::partial_sum(divisions3d.begin(), divisions3d.end()-1, divisions3dPrefixSum.begin()+1);
3117 IU totsend = C_tuples->getnnz();
3118
 // Locate each layer's column range in the (column-sorted) merged tuples by binary search.
3119#pragma omp parallel for
3120 for(int i=0; i < A.getcommgrid3D()->GetGridLayers(); ++i){
3123 std::tuple<IU, IU, NUO> search_tuple_start(0, start_col, NUO());
3124 std::tuple<IU, IU, NUO> search_tuple_end(0, end_col, NUO());
3125 std::tuple<IU, IU, NUO>* start_it = std::lower_bound(C_tuples->tuples, C_tuples->tuples + C_tuples->getnnz(), search_tuple_start, comp);
3126 std::tuple<IU, IU, NUO>* end_it = std::lower_bound(C_tuples->tuples, C_tuples->tuples + C_tuples->getnnz(), search_tuple_end, comp);
3127 // This type casting is important from semantic point of view
3128 sendcnt[i] = (int)(end_it - start_it);
3129 sendprfl[i*3+0] = (int)(sendcnt[i]); // Number of nonzeros in ith chunk
3130 sendprfl[i*3+1] = (int)(A.GetLayerMat()->seqptr()->getnrow()); // Number of rows in ith chunk
3131 sendprfl[i*3+2] = (int)(divisions3d[i]); // Number of columns in ith chunk
3132 }
3133 std::partial_sum(sendcnt, sendcnt+A.getcommgrid3D()->GetGridLayers()-1, sdispls+1);
3134
3135 // Send profile ready. Now need to update the tuples to reflect correct column id after column split.
3136 for(int i=0; i < A.getcommgrid3D()->GetGridLayers(); ++i){
3137#pragma omp parallel for schedule(static)
3138 for(int j = 0; j < sendcnt[i]; j++){
3139 std::get<1>(C_tuples->tuples[sdispls[i]+j]) = std::get<1>(C_tuples->tuples[sdispls[i]+j]) - divisions3dPrefixSum[i];
3140 }
3141 }
3142
3143 MPI_Alltoall(sendprfl, 3, MPI_INT, recvprfl, 3, MPI_INT, A.getcommgrid3D()->GetFiberWorld());
3144
3145 for(int i = 0; i < A.getcommgrid3D()->GetGridLayers(); i++) recvcnt[i] = recvprfl[i*3];
3146 std::partial_sum(recvcnt, recvcnt+A.getcommgrid3D()->GetGridLayers()-1, rdispls+1);
3147 IU totrecv = std::accumulate(recvcnt,recvcnt+A.getcommgrid3D()->GetGridLayers(), static_cast<IU>(0));
3148 std::tuple<LIC,LIC,NUO>* recvTuples = static_cast<std::tuple<LIC,LIC,NUO>*> (::operator new (sizeof(std::tuple<LIC,LIC,NUO>[totrecv])));
3149
3150#ifdef TIMING
3151 t2 = MPI_Wtime();
3152#endif
3153 MPI_Alltoallv(C_tuples->tuples, sendcnt, sdispls, MPI_tuple, recvTuples, recvcnt, rdispls, MPI_tuple, A.getcommgrid3D()->GetFiberWorld());
3154 delete C_tuples;
3155#ifdef TIMING
3156 t3 = MPI_Wtime();
3157 if(myrank == 0) fprintf(stderr, "[SUMMA3D]\tAlltoallv: %lf\n", (t3-t2));
3158#endif
 // Wrap each received chunk as an SpTuples view over recvTuples (no copy, no ownership).
3159 vector<SpTuples<IU, NUO>*> recvChunks(A.getcommgrid3D()->GetGridLayers());
3160#pragma omp parallel for
3161 for (int i = 0; i < A.getcommgrid3D()->GetGridLayers(); i++){
3162 recvChunks[i] = new SpTuples<LIC, NUO>(recvcnt[i], recvprfl[i*3+1], recvprfl[i*3+2], recvTuples + rdispls[i], true, false);
3163 }
3164
3165 // Free all memory except tempTuples; Because that memory is holding data of newly created local matrices after receiving.
3166 DeleteAll(sendcnt, sendprfl, sdispls);
3167 DeleteAll(recvcnt, recvprfl, rdispls);
3169 /*
3170 * 3d-reduction ends
3171 * */
3172
3173#ifdef TIMING
3174 t1 = MPI_Wtime();
3175 if(myrank == 0) fprintf(stderr, "[SUMMA3D]\tReduction time: %lf\n", (t1-t0));
3176#endif
3177#ifdef TIMING
3178 t0 = MPI_Wtime();
3179#endif
3180 /*
3181 * 3d-merge starts
3182 * */
3183 SpTuples<IU, NUO> * merged_tuples = MultiwayMergeHash<SR, IU, NUO>(recvChunks, recvChunks[0]->getnrow(), recvChunks[0]->getncol(), false, false); // Do not delete
3184#ifdef TIMING
3185 t1 = MPI_Wtime();
3186 if(myrank == 0) fprintf(stderr, "[SUMMA3D]\tMerge_fiber_time: %lf\n", (t1-t0));
3187#endif
3188 //Create SpDCCol and delete merged_tuples;
3189 UDERO * localResultant = new UDERO(*merged_tuples, false);
3190 delete merged_tuples;
3191
3192 // Do not delete elements of recvChunks, because that would give segmentation fault due to double free
3193 //delete [] recvTuples;
3194 ::operator delete(recvTuples);
3195 for(int i = 0; i < recvChunks.size(); i++){
3196 recvChunks[i]->tuples_deleted = true; // Temporary patch to avoid memory leak and segfault
3197 delete recvChunks[i];
3198 }
3200 /*
3201 * 3d-merge ends
3202 * */
3203
3204 std::shared_ptr<CommGrid3D> grid3d;
3205 grid3d.reset(new CommGrid3D(A.getcommgrid3D()->GetWorld(), A.getcommgrid3D()->GetGridLayers(), A.getcommgrid3D()->GetGridRows(), A.getcommgrid3D()->GetGridCols(), A.isSpecial()));
3206 SpParMat3D<IU, NUO, UDERO> C(localResultant, grid3d, A.isColSplit(), A.isSpecial());
3207 return C;
3208}
3209
3210/*
3211 * Parameters:
3212 * - computationKernel: 1 for hash-based, 2 for heap-based
3213 * */
3214template <typename SR, typename NUO, typename UDERO, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
3217 int myrank;
3219 typedef typename UDERA::LocalIT LIA;
3220 typedef typename UDERB::LocalIT LIB;
3221 typedef typename UDERO::LocalIT LIC;
3222
3223 /*
3224 * Check if A and B are multipliable
3225 * */
3226 if(A.getncol() != B.getnrow()){
3227 std::ostringstream outs;
3228 outs << "Can not multiply, dimensions does not match"<< std::endl;
3229 outs << A.getncol() << " != " << B.getnrow() << std::endl;
3230 SpParHelper::Print(outs.str());
3232 }
3233
3234 /*
3235 * If provided number of phase is too low or too high then reset value of phase as 1
3236 * */
3237 if(phases < 1 || phases >= B.getncol()){
3238 SpParHelper::Print("[MemEfficientSpGEMM3D]\tThe value of phases is too small or large. Resetting to 1.\n");
3239 phases = 1;
3240 }
3241 double t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; // To time different parts of the function
3242#ifdef TIMING
3243 MPI_Barrier(B.getcommgrid3D()->GetWorld());
3244 t0 = MPI_Wtime();
3245#endif
3246 /*
3247 * If per process memory is provided then calculate number of phases
3248 * Otherwise, proceed to multiplication.
3249 * */
3250 if(perProcessMemory > 0) {
3251 int p, calculatedPhases;
3252 MPI_Comm_size(A.getcommgrid3D()->GetLayerWorld(),&p);
3253 int64_t perNNZMem_in = sizeof(IU)*2 + sizeof(NU1);
3254 int64_t perNNZMem_out = sizeof(IU)*2 + sizeof(NUO);
3255
3256 int64_t lannz = A.GetLayerMat()->getlocalnnz();
3257 int64_t gannz = 0;
3258 // Get maximum number of nnz owned by one process
3259 MPI_Allreduce(&lannz, &gannz, 1, MPIType<int64_t>(), MPI_MAX, A.getcommgrid3D()->GetWorld());
3260 //int64_t ginputMem = gannz * perNNZMem_in * 4; // Four pieces per process: one piece of own A and B, one piece of received A and B
3261 int64_t ginputMem = gannz * perNNZMem_in * 5; // One extra copy for safety
3262
3263 // Estimate per layer nnz after multiplication. After this estimation each process would know an estimation of
3264 // how many nnz the corresponding layer will have after the layerwise operation.
3265 int64_t asquareNNZ = EstPerProcessNnzSUMMA(*(A.GetLayerMat()), *(B.GetLayerMat()), true);
3267 MPI_Allreduce(&asquareNNZ, &gasquareNNZ, 1, MPIType<int64_t>(), MPI_MAX, A.getcommgrid3D()->GetFiberWorld());
3268
3269 // Atmost two copies, one of a process's own, another received from fiber reduction
3271 // Calculate estimated average degree after multiplication
3272 int64_t d = ceil( ( ( gasquareNNZ / B.getcommgrid3D()->GetGridLayers() ) * sqrt(p) ) / B.GetLayerMat()->getlocalcols() );
3273 // Calculate per column nnz how left after k-select. Minimum of average degree and k-select parameters.
3274 int64_t k = std::min(int64_t(std::max(selectNum, recoverNum)), d );
3275
3276 //estimate output memory
3277 int64_t postKselectOutputNNZ = ceil(( (B.GetLayerMat()->getlocalcols() / B.getcommgrid3D()->GetGridLayers() ) * k)/sqrt(p)); // If kselect is run
3280 int64_t kselectMem = B.GetLayerMat()->getlocalcols() * k * sizeof(NUO) * 3;
3281
3282 //inputMem + outputMem + asquareMem/phases + kselectmem/phases < memory
3283 if(remainingMem > 0){
3284 calculatedPhases = ceil( (gasquareMem + kselectMem) / remainingMem ); // If kselect is run
3285 }
3286 else calculatedPhases = -1;
3287
3289 MPI_Allreduce(&calculatedPhases, &gCalculatedPhases, 1, MPI_INT, MPI_MAX, A.getcommgrid3D()->GetFiberWorld());
3290 if(gCalculatedPhases > phases) phases = gCalculatedPhases;
3291 }
3292 else{
3293 // Do nothing
3294 }
3295#ifdef TIMING
3296 MPI_Barrier(B.getcommgrid3D()->GetWorld());
3297 t1 = MPI_Wtime();
3299 //if(myrank == 0) fprintf(stderr, "[MemEfficientSpGEMM3D]\tSymbolic stage time: %lf\n", (t1-t0));
3300#endif
3301
3302
3303 /*
3304 * Calculate, accross fibers, which process should get how many columns after redistribution
3305 * */
3307 // Calculate split boundaries as if all contents of the layer is being re-distributed along fiber
3308 // These boundaries will be used later on
3309 B.CalculateColSplitDistributionOfLayer(divisions3d);
3310
3311 /*
3312 * Split B according to calculated number of phases
3313 * For better load balancing split B into nlayers*phases chunks
3314 * */
3317 UDERB CopyB = *(B.GetLayerMat()->seqptr());
3318 CopyB.ColSplit(divisions3d, tempPiecesOfB); // Split B into `nlayers` chunks at first
3319 for(int i = 0; i < tempPiecesOfB.size(); i++){
3321 tempPiecesOfB[i]->ColSplit(phases, temp); // Split each chunk of B into `phases` chunks
3322 for(int j = 0; j < temp.size(); j++){
3323 PiecesOfB.push_back(temp[j]);
3324 }
3325 }
3326
3328 //if(myrank == 0){
3329 //fprintf(stderr, "[MemEfficientSpGEMM3D]\tRunning with phase: %d\n", phases);
3330 //}
3331
3332 for(int p = 0; p < phases; p++){
3333 /*
3334 * At the start of each phase take appropriate pieces from previously created pieces of local B matrix
3335 * Appropriate means correct pieces so that 3D-merge can be properly load balanced.
3336 * */
3337 vector<LIB> lbDivisions3d; // load balance friendly division
3339 vector<UDERB*> targetPiecesOfB; // Pieces of B involved in current phase
3340 for(int i = 0; i < PiecesOfB.size(); i++){
3341 if(i % phases == p){
3342 targetPiecesOfB.push_back(new UDERB(*(PiecesOfB[i])));
3343 lbDivisions3d.push_back(PiecesOfB[i]->getncol());
3344 totalLocalColumnInvolved += PiecesOfB[i]->getncol();
3345 }
3346 }
3347
3348 /*
3349 * Create new local matrix by concatenating appropriately picked pieces
3350 * */
3351 UDERB * OnePieceOfB = new UDERB(0, (B.GetLayerMat())->seqptr()->getnrow(), totalLocalColumnInvolved, 0);
3352 OnePieceOfB->ColConcatenate(targetPiecesOfB);
3354
3355 /*
3356 * Create a new layer-wise distributed matrix with the newly created local matrix for this phase
3357 * This matrix is used in SUMMA multiplication of respective layer
3358 * */
3359 SpParMat<IU, NU2, UDERB> OnePieceOfBLayer(OnePieceOfB, A.getcommgrid3D()->GetLayerWorld());
3360#ifdef TIMING
3361 t0 = MPI_Wtime();
3362#endif
3363 /*
3364 * SUMMA Starts
3365 * */
3366
3367 int stages, dummy; // last two parameters of ProductGrid are ignored for this multiplication
3368 std::shared_ptr<CommGrid> GridC = ProductGrid((A.GetLayerMat()->getcommgrid()).get(),
3369 (OnePieceOfBLayer.getcommgrid()).get(),
3370 stages, dummy, dummy);
3371 LIA C_m = A.GetLayerMat()->seqptr()->getnrow();
3372 LIB C_n = OnePieceOfBLayer.seqptr()->getncol();
3373
3374 LIA ** ARecvSizes = SpHelper::allocate2D<LIA>(UDERA::esscount, stages);
3375 LIB ** BRecvSizes = SpHelper::allocate2D<LIB>(UDERB::esscount, stages);
3376
3377 SpParHelper::GetSetSizes( *(A.GetLayerMat()->seqptr()), ARecvSizes, (A.GetLayerMat()->getcommgrid())->GetRowWorld() );
3378 SpParHelper::GetSetSizes( *(OnePieceOfBLayer.seqptr()), BRecvSizes, (OnePieceOfBLayer.getcommgrid())->GetColWorld() );
3379
3380 // Remotely fetched matrices are stored as pointers
3381 UDERA * ARecv;
3382 UDERB * BRecv;
3383 std::vector< SpTuples<LIC,NUO> *> tomerge;
3384
3385 int Aself = (A.GetLayerMat()->getcommgrid())->GetRankInProcRow();
3386 int Bself = (OnePieceOfBLayer.getcommgrid())->GetRankInProcCol();
3387
3388 double Abcast_time = 0;
3389 double Bbcast_time = 0;
3390 double Local_multiplication_time = 0;
3391
3392 for(int i = 0; i < stages; ++i) {
3393 std::vector<LIA> ess;
3394
3395 if(i == Aself){
3396 ARecv = A.GetLayerMat()->seqptr(); // shallow-copy
3397 }
3398 else{
3399 ess.resize(UDERA::esscount);
3400 for(int j=0; j<UDERA::esscount; ++j) {
3401 ess[j] = ARecvSizes[j][i]; // essentials of the ith matrix in this row
3402 }
3403 ARecv = new UDERA(); // first, create the object
3404 }
3405#ifdef TIMING
3406 t2 = MPI_Wtime();
3407#endif
3408 if (Aself != i) {
3409 ARecv->Create(ess);
3410 }
3411
3412 Arr<LIA,NU1> Aarrinfo = ARecv->GetArrays();
3413
3414 for(unsigned int idx = 0; idx < Aarrinfo.indarrs.size(); ++idx) {
3415 MPI_Bcast(Aarrinfo.indarrs[idx].addr, Aarrinfo.indarrs[idx].count, MPIType<IU>(), i, GridC->GetRowWorld());
3416 }
3417
3418 for(unsigned int idx = 0; idx < Aarrinfo.numarrs.size(); ++idx) {
3419 MPI_Bcast(Aarrinfo.numarrs[idx].addr, Aarrinfo.numarrs[idx].count, MPIType<NU1>(), i, GridC->GetRowWorld());
3420 }
3421#ifdef TIMING
3422 t3 = MPI_Wtime();
3423 mcl3d_Abcasttime += (t3-t2);
3424 Abcast_time += (t3-t2);
3425#endif
3426 ess.clear();
3427 if(i == Bself){
3428 BRecv = OnePieceOfBLayer.seqptr(); // shallow-copy
3429 }
3430 else{
3431 ess.resize(UDERB::esscount);
3432 for(int j=0; j<UDERB::esscount; ++j) {
3433 ess[j] = BRecvSizes[j][i];
3434 }
3435 BRecv = new UDERB();
3436 }
3437
3438 MPI_Barrier(A.GetLayerMat()->getcommgrid()->GetWorld());
3439#ifdef TIMING
3440 t2 = MPI_Wtime();
3441#endif
3442 if (Bself != i) {
3443 BRecv->Create(ess);
3444 }
3445 Arr<LIB,NU2> Barrinfo = BRecv->GetArrays();
3446
3447 for(unsigned int idx = 0; idx < Barrinfo.indarrs.size(); ++idx) {
3448 MPI_Bcast(Barrinfo.indarrs[idx].addr, Barrinfo.indarrs[idx].count, MPIType<IU>(), i, GridC->GetColWorld());
3449 }
3450 for(unsigned int idx = 0; idx < Barrinfo.numarrs.size(); ++idx) {
3451 MPI_Bcast(Barrinfo.numarrs[idx].addr, Barrinfo.numarrs[idx].count, MPIType<NU2>(), i, GridC->GetColWorld());
3452 }
3453#ifdef TIMING
3454 t3 = MPI_Wtime();
3455 mcl3d_Bbcasttime += (t3-t2);
3456 Bbcast_time += (t3-t2);
3457#endif
3458
3459#ifdef TIMING
3460 t2 = MPI_Wtime();
3461#endif
3463 if(computationKernel == 1){
3465 (*ARecv, *BRecv, // parameters themselves
3466 i != Aself, // 'delete A' condition
3467 i != Bself, // 'delete B' condition
3468 false); // not to sort each column
3469 }
3470 else if(computationKernel == 2){
3472 (*ARecv, *BRecv, // parameters themselves
3473 i != Aself, // 'delete A' condition
3474 i != Bself); // 'delete B' condition
3475
3476 }
3477
3478#ifdef TIMING
3479 t3 = MPI_Wtime();
3482#endif
3483
3484 if(!C_cont->isZero()) tomerge.push_back(C_cont);
3485 }
3486
3487 SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
3488 SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);
3489
3490#ifdef TIMING
3491 t2 = MPI_Wtime();
3492#endif
3494 if(computationKernel == 1) C_tuples = MultiwayMergeHash<SR>(tomerge, C_m, C_n, true, true); // Delete input arrays and sort
3495 else if(computationKernel == 2) C_tuples = MultiwayMerge<SR>(tomerge, C_m, C_n, true); // Delete input arrays and sort
3496
3497#ifdef TIMING
3498 t3 = MPI_Wtime();
3500#endif
3501
3502#ifdef TIMING
3503 if(myrank == 0){
3504 fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tAbcast_time: %lf\n", p, Abcast_time);
3505 fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tBbcast_time: %lf\n", p, Bbcast_time);
3506 fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tLocal_multiplication_time: %lf\n", p, Local_multiplication_time);
3507 fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tSUMMA Merge time: %lf\n", p, (t3-t2));
3508 }
3509#endif
3510 /*
3511 * SUMMA Ends
3512 * */
3513#ifdef TIMING
3514 t1 = MPI_Wtime();
3515 mcl3d_SUMMAtime += (t1-t0);
3516 if(myrank == 0) fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tSUMMA time: %lf\n", p, (t1-t0));
3517#endif
3518
3519 /*
3520 * 3d-reduction starts
3521 * */
3522#ifdef TIMING
3523 t0 = MPI_Wtime();
3524 t2 = MPI_Wtime();
3525#endif
3527 MPI_Type_contiguous(sizeof(std::tuple<LIC,LIC,NUO>), MPI_CHAR, &MPI_tuple);
3529
3530 /*
3531 * Create a profile with information regarding data to be sent and received between layers
3532 * These memory allocation needs to be `int` specifically because some of these arrays would be used in communication
3533 * This is requirement is for MPI as MPI_Alltoallv takes pointer to integer exclusively as count and displacement
3534 * */
3535 int * sendcnt = new int[A.getcommgrid3D()->GetGridLayers()];
3536 int * sendprfl = new int[A.getcommgrid3D()->GetGridLayers()*3];
3537 int * sdispls = new int[A.getcommgrid3D()->GetGridLayers()]();
3538 int * recvcnt = new int[A.getcommgrid3D()->GetGridLayers()];
3539 int * recvprfl = new int[A.getcommgrid3D()->GetGridLayers()*3];
3540 int * rdispls = new int[A.getcommgrid3D()->GetGridLayers()]();
3541
3544 std::partial_sum(lbDivisions3d.begin(), lbDivisions3d.end()-1, lbDivisions3dPrefixSum.begin()+1);
3546 LIC totsend = C_tuples->getnnz();
3547#ifdef TIMING
3548 t3 = MPI_Wtime();
3549 if(myrank == 0) fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tAllocation of alltoall information: %lf\n", p, (t3-t2));
3550#endif
3551
3552#ifdef TIMING
3553 t2 = MPI_Wtime();
3554#endif
3555#pragma omp parallel for
3556 for(int i=0; i < A.getcommgrid3D()->GetGridLayers(); ++i){
3559 std::tuple<LIC, LIC, NUO> search_tuple_start(0, start_col, NUO());
3560 std::tuple<LIC, LIC, NUO> search_tuple_end(0, end_col, NUO());
3561 std::tuple<LIC, LIC, NUO>* start_it = std::lower_bound(C_tuples->tuples, C_tuples->tuples + C_tuples->getnnz(), search_tuple_start, comp);
3562 std::tuple<LIC, LIC, NUO>* end_it = std::lower_bound(C_tuples->tuples, C_tuples->tuples + C_tuples->getnnz(), search_tuple_end, comp);
3563 // This type casting is important from semantic point of view
3564 sendcnt[i] = (int)(end_it - start_it);
3565 sendprfl[i*3+0] = (int)(sendcnt[i]); // Number of nonzeros in ith chunk
3566 sendprfl[i*3+1] = (int)(A.GetLayerMat()->seqptr()->getnrow()); // Number of rows in ith chunk
3567 sendprfl[i*3+2] = (int)(lbDivisions3d[i]); // Number of columns in ith chunk
3568 }
3569 std::partial_sum(sendcnt, sendcnt+A.getcommgrid3D()->GetGridLayers()-1, sdispls+1);
3570#ifdef TIMING
3571 t3 = MPI_Wtime();
3572 if(myrank == 0) fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tGetting Alltoall data ready: %lf\n", p, (t3-t2));
3573#endif
3574
3575 // Send profile ready. Now need to update the tuples to reflect correct column id after column split.
3576#ifdef TIMING
3577 t2 = MPI_Wtime();
3578#endif
3579 for(int i=0; i < A.getcommgrid3D()->GetGridLayers(); ++i){
3580#pragma omp parallel for schedule(static)
3581 for(int j = 0; j < sendcnt[i]; j++){
3582 std::get<1>(C_tuples->tuples[sdispls[i]+j]) = std::get<1>(C_tuples->tuples[sdispls[i]+j]) - lbDivisions3dPrefixSum[i];
3583 }
3584 }
3585#ifdef TIMING
3586 t3 = MPI_Wtime();
3587 if(myrank == 0) fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tGetting Alltoallv data ready: %lf\n", p, (t3-t2));
3588#endif
3589
3590#ifdef TIMING
3591 t2 = MPI_Wtime();
3592#endif
3593 MPI_Alltoall(sendprfl, 3, MPI_INT, recvprfl, 3, MPI_INT, A.getcommgrid3D()->GetFiberWorld());
3594#ifdef TIMING
3595 t3 = MPI_Wtime();
3596 if(myrank == 0) fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tAlltoall: %lf\n", p, (t3-t2));
3597#endif
3598#ifdef TIMING
3599 t2 = MPI_Wtime();
3600#endif
3601 for(int i = 0; i < A.getcommgrid3D()->GetGridLayers(); i++) recvcnt[i] = recvprfl[i*3];
3602 std::partial_sum(recvcnt, recvcnt+A.getcommgrid3D()->GetGridLayers()-1, rdispls+1);
3603 LIC totrecv = std::accumulate(recvcnt,recvcnt+A.getcommgrid3D()->GetGridLayers(), static_cast<IU>(0));
3604 std::tuple<LIC,LIC,NUO>* recvTuples = static_cast<std::tuple<LIC,LIC,NUO>*> (::operator new (sizeof(std::tuple<LIC,LIC,NUO>[totrecv])));
3605#ifdef TIMING
3606 t3 = MPI_Wtime();
3607 if(myrank == 0) fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tAllocation of receive data: %lf\n", p, (t3-t2));
3608#endif
3609
3610#ifdef TIMING
3611 t2 = MPI_Wtime();
3612#endif
3613 MPI_Alltoallv(C_tuples->tuples, sendcnt, sdispls, MPI_tuple, recvTuples, recvcnt, rdispls, MPI_tuple, A.getcommgrid3D()->GetFiberWorld());
3614 delete C_tuples;
3615#ifdef TIMING
3616 t3 = MPI_Wtime();
3617 if(myrank == 0) fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tAlltoallv: %lf\n", p, (t3-t2));
3618#endif
3619#ifdef TIMING
3620 t2 = MPI_Wtime();
3621#endif
3622 vector<SpTuples<LIC, NUO>*> recvChunks(A.getcommgrid3D()->GetGridLayers());
3623#pragma omp parallel for
3624 for (int i = 0; i < A.getcommgrid3D()->GetGridLayers(); i++){
3625 recvChunks[i] = new SpTuples<LIC, NUO>(recvcnt[i], recvprfl[i*3+1], recvprfl[i*3+2], recvTuples + rdispls[i], true, false);
3626 }
3627#ifdef TIMING
3628 t3 = MPI_Wtime();
3629 if(myrank == 0) fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\trecvChunks creation: %lf\n", p, (t3-t2));
3630#endif
3631
3632#ifdef TIMING
3633 t2 = MPI_Wtime();
3634#endif
3635 // Free all memory except tempTuples; Because that is holding data of newly created local matrices after receiving.
3636 DeleteAll(sendcnt, sendprfl, sdispls);
3637 DeleteAll(recvcnt, recvprfl, rdispls);
3639#ifdef TIMING
3640 t3 = MPI_Wtime();
3641 if(myrank == 0) fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tMemory freeing: %lf\n", p, (t3-t2));
3642#endif
3643 /*
3644 * 3d-reduction ends
3645 * */
3646
3647#ifdef TIMING
3648 t1 = MPI_Wtime();
3650 if(myrank == 0) fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tReduction time: %lf\n", p, (t1-t0));
3651#endif
3652#ifdef TIMING
3653 t0 = MPI_Wtime();
3654#endif
3655 /*
3656 * 3d-merge starts
3657 * */
3659
3660 if(computationKernel == 1) merged_tuples = MultiwayMergeHash<SR, LIC, NUO>(recvChunks, recvChunks[0]->getnrow(), recvChunks[0]->getncol(), false, false); // Do not delete
3661 else if(computationKernel == 2) merged_tuples = MultiwayMerge<SR, LIC, NUO>(recvChunks, recvChunks[0]->getnrow(), recvChunks[0]->getncol(), false); // Do not delete
3662#ifdef TIMING
3663 t1 = MPI_Wtime();
3664 mcl3d_3dmergetime += (t1-t0);
3665 if(myrank == 0) fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\t3D Merge time: %lf\n", p, (t1-t0));
3666#endif
3667 /*
3668 * 3d-merge ends
3669 * */
3670#ifdef TIMING
3671 t0 = MPI_Wtime();
3672#endif
3673 // Do not delete elements of recvChunks, because that would give segmentation fault due to double free
3674 ::operator delete(recvTuples);
3675 for(int i = 0; i < recvChunks.size(); i++){
3676 recvChunks[i]->tuples_deleted = true; // Temporary patch to avoid memory leak and segfault
3677 delete recvChunks[i]; // As the patch is used, now delete each element of recvChunks
3678 }
3679 vector<SpTuples<LIC,NUO>*>().swap(recvChunks); // As the patch is used, now delete recvChunks
3680
3681 // This operation is not needed if result can be used and discareded right away
3682 // This operation is being done because it is needed by MCLPruneRecoverySelect
3683 UDERO * phaseResultant = new UDERO(*merged_tuples, false);
3684 delete merged_tuples;
3685 SpParMat<IU, NUO, UDERO> phaseResultantLayer(phaseResultant, A.getcommgrid3D()->GetLayerWorld());
3687#ifdef TIMING
3688 t1 = MPI_Wtime();
3689 mcl3d_kselecttime += (t1-t0);
3690 if(myrank == 0) fprintf(stderr, "[MemEfficientSpGEMM3D]\tPhase: %d\tMCLPruneRecoverySelect time: %lf\n",p, (t1-t0));
3691#endif
3692 toconcatenate.push_back(phaseResultantLayer.seq());
3693#ifdef TIMING
3694 if(myrank == 0) fprintf(stderr, "***\n");
3695#endif
3696 }
3697 for(int i = 0; i < PiecesOfB.size(); i++) delete PiecesOfB[i];
3698
3699 std::shared_ptr<CommGrid3D> grid3d;
3700 grid3d.reset(new CommGrid3D(A.getcommgrid3D()->GetWorld(), A.getcommgrid3D()->GetGridLayers(), A.getcommgrid3D()->GetGridRows(), A.getcommgrid3D()->GetGridCols(), A.isSpecial()));
3701 UDERO * localResultant = new UDERO(0, A.GetLayerMat()->seqptr()->getnrow(), divisions3d[A.getcommgrid3D()->GetRankInFiber()], 0);
3702 localResultant->ColConcatenate(toconcatenate);
3703 SpParMat3D<IU, NUO, UDERO> C3D(localResultant, grid3d, A.isColSplit(), A.isSpecial());
3704 return C3D;
3705}
3706
3707
3708}
3709
3710
3711#endif
3712
int cblas_splits
Definition BcastTest.cpp:26
int64_t IT
double NT
double cblas_transvectime
Definition DirOptBFS.cpp:60
double cblas_localspmvtime
Definition DirOptBFS.cpp:61
double cblas_allgathertime
Definition DirOptBFS.cpp:58
double cblas_alltoalltime
Definition DirOptBFS.cpp:57
double cblas_mergeconttime
Definition DirOptBFS.cpp:59
double mcl_localspgemmtime
Definition MCL.cpp:60
double mcl_symbolictime
Definition MCL.cpp:57
double mcl_multiwaymergetime
Definition MCL.cpp:61
double mcl_prunecolumntime
Definition MCL.cpp:63
double mcl3d_symbolictime
Definition MCL.cpp:67
double mcl_Bbcasttime
Definition MCL.cpp:59
double mcl3d_SUMMAmergetime
Definition MCL.cpp:72
double mcl3d_Bbcasttime
Definition MCL.cpp:69
double mcl_kselecttime
Definition MCL.cpp:62
double mcl3d_reductiontime
Definition MCL.cpp:73
double mcl3d_kselecttime
Definition MCL.cpp:75
double mcl_Abcasttime
Definition MCL.cpp:58
double mcl3d_Abcasttime
Definition MCL.cpp:68
double mcl3d_localspgemmtime
Definition MCL.cpp:71
double mcl3d_3dmergetime
Definition MCL.cpp:74
double mcl3d_SUMMAtime
Definition MCL.cpp:70
Definition test.cpp:53
std::shared_ptr< CommGrid > commGrid
void save(std::basic_ostream< c, t > &os, std::array< V, NROUNDS > &sample_vec, int64_t index)
static void deallocate2D(T **array, I m)
Definition SpHelper.h:249
static const T * p2a(const std::vector< T > &v)
Definition SpHelper.h:187
static void GetSetSizes(const SpMat< IT, NT, DER > &Matrix, IT **&sizes, MPI_Comm &comm1d)
static void BCastMatrix(MPI_Comm &comm1d, SpMat< IT, NT, DER > &Matrix, const std::vector< IT > &essentials, int root)
static void Print(const std::string &s)
static void IBCastMatrix(MPI_Comm &comm1d, SpMat< IT, NT, DER > &Matrix, const std::vector< IT > &essentials, int root, std::vector< MPI_Request > &indarrayReq, std::vector< MPI_Request > &numarrayReq)
int size
Definition common.h:20
int nprocs
Definition comms.cpp:55
long int64_t
Definition compat.h:21
#define NROUNDS
#define DIMMISMATCH
Definition SpDefs.h:73
#define TROST
Definition SpDefs.h:106
#define TRNNZ
Definition SpDefs.h:105
#define TRLUT
Definition SpDefs.h:107
#define TRX
Definition SpDefs.h:103
#define GRIDMISMATCH
Definition SpDefs.h:72
#define MATRIXALIAS
Definition SpDefs.h:76
#define TRI
Definition SpDefs.h:104
@ Column
Definition SpDefs.h:115
SpParMat3D< IU, NUO, UDERO > Mult_AnXBn_SUMMA3D(SpParMat3D< IU, NU1, UDER1 > &A, SpParMat3D< IU, NU2, UDER2 > &B)
IT * estimateFLOP(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, IT *aux=nullptr)
Definition mtSpGEMM.h:1058
int64_t EstPerProcessNnzSUMMA(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool hashEstimate)
void MCLPruneRecoverySelect(SpParMat< IT, NT, DER > &A, NT hardThreshold, IT selectNum, IT recoverNum, NT recoverPct, int kselectVersion)
Definition ParFriends.h:186
SpParMat3D< IU, NUO, UDERO > MemEfficientSpGEMM3D(SpParMat3D< IU, NU1, UDERA > &A, SpParMat3D< IU, NU2, UDERB > &B, int phases, NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct, int kselectVersion, int computationKernel, int64_t perProcessMemory)
void MergeContributions(FullyDistSpVec< IU, VT > &y, int *&recvcnt, int *&rdispls, int32_t *&recvindbuf, VT *&recvnumbuf, int rowneighs)
Definition BFSFriends.h:224
FullyDistSpVec< IU, RET > EWiseApply_threaded(const FullyDistSpVec< IU, NU1 > &V, const FullyDistVec< IU, NU2 > &W, _BinaryOperation _binary_op, _BinaryPredicate _doOp, bool allowVNulls, NU1 Vzero, const bool useExtendedBinOp)
void CheckSpMVCompliance(const MATRIX &A, const VECTOR &x)
Dcsc< IU, typename promote_trait< NU1, NU2 >::T_promote > SetDifference(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B)
Definition Friends.h:748
int64_t EstPerProcessNnzSpMV(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B)
int CalculateNumberOfPhases(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct, int kselectVersion, int64_t perProcessMemory)
Definition ParFriends.h:733
SpParMat< IU, NUO, UDERO > Mult_AnXBn_Overlap(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
SpParMat< IU, NUO, UDERO > MemEfficientSpGEMM(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, int phases, NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct, int kselectVersion, int computationKernel, int64_t perProcessMemory)
Definition ParFriends.h:450
FullyDistVec< IT, NT > Concatenate(std::vector< FullyDistVec< IT, NT > > &vecs)
Definition ParFriends.h:61
SpParMat< IU, NUO, UDERO > Mult_AnXBn_Synch(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
void AllGatherVector(MPI_Comm &ColWorld, int trxlocnz, IU lenuntil, int32_t *&trxinds, NV *&trxnums, int32_t *&indacc, NV *&numacc, int &accnz, bool indexisvalue)
IU EstimateFLOP(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
Definition ParFriends.h:357
SpParMat< IU, NUO, UDERO > Mult_AnXBn_DoubleBuff(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
Definition ParFriends.h:800
void DeleteAll(A arr1)
Definition Deleter.h:48
Dcsc< IU, N_promote > EWiseApply(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, _BinaryOperation __binary_op, bool notB, const NU2 &defaultBVal)
Definition Friends.h:890
void TransposeVector(MPI_Comm &World, const FullyDistSpVec< IU, NV > &x, int32_t &trxlocnz, IU &lenuntil, int32_t *&trxinds, NV *&trxnums, bool indexisvalue)
void LocalSpMV(const SpParMat< IT, bool, UDER > &A, int rowneighs, OptBuf< int32_t, VT > &optbuf, int32_t *&indacc, VT *&numacc, int *sendcnt, int accnz)
Definition BFSFriends.h:184
std::array< float, NROUNDS > samparr_t
void MergeContributions_threaded(int *&listSizes, std::vector< int32_t * > &indsvec, std::vector< OVT * > &numsvec, std::vector< IU > &mergedind, std::vector< OVT > &mergednum, IU maxindex)
bool CheckSpGEMMCompliance(const MATRIXA &A, const MATRIXB &B)
Definition ParFriends.h:161
Dcsc< IU, typename promote_trait< NU1, NU2 >::T_promote > EWiseMult(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, bool exclude)
Definition Friends.h:834
shared_ptr< CommGrid > ProductGrid(CommGrid *gridA, CommGrid *gridB, int &innerdim, int &Aoffset, int &Boffset)
Definition CommGrid.cpp:164
IT * estimateNNZ_Hash(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, IT *flopC, IT *aux=nullptr)
Definition mtSpGEMM.h:807
double A
double C
Definition options.h:15
signed int int32_t
Definition stdint.h:77
static MPI_Op mpi_op()
static void MPI_func(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype)
static void axpy(const NZT a, const samparr_t &x, samparr_t &y)
static samparr_t id()
static samparr_t multiply(const NZT arg1, const samparr_t &arg2)
static samparr_t add(const samparr_t &arg1, const samparr_t &arg2)
static bool returnedSAID()