COMBINATORIAL_BLAS 1.6
 
BFSFriends.h
1/****************************************************************/
2/* Parallel Combinatorial BLAS Library (for Graph Computations) */
3/* version 1.4 -------------------------------------------------*/
4/* date: 1/17/2014 ---------------------------------------------*/
5/* authors: Aydin Buluc (abuluc@lbl.gov), Adam Lugowski --------*/
6/****************************************************************/
7/*
8 Copyright (c) 2010-2014, The Regents of the University of California
9
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 THE SOFTWARE.
27 */
28
29#ifndef _BFS_FRIENDS_H_
30#define _BFS_FRIENDS_H_
31
32#include "mpi.h"
33#include <iostream>
34#include "SpParMat.h"
35#include "SpParHelper.h"
36#include "MPIType.h"
37#include "Friends.h"
38#include "OptBuf.h"
39#include "ParFriends.h"
40#include "BitMap.h"
41#include "BitMapCarousel.h"
42#include "BitMapFringe.h"
43
44namespace combblas {
45
46template <class IT, class NT, class DER>
47class SpParMat;
48
49/*************************************************************************************************/
50/*********************** FRIEND FUNCTIONS FOR BFS ONLY (NO SEMIRINGS) RUNS **********************/
51/***************************** BOTH PARALLEL AND SEQUENTIAL FUNCTIONS ****************************/
52/*************************************************************************************************/
53
58template <typename IT, typename VT>
59void dcsc_gespmv_threaded_setbuffers (const SpDCCols<IT, bool> & A, const int32_t * indx, const VT * numx, int32_t nnzx,
60 int32_t * sendindbuf, VT * sendnumbuf, int * cnts, int * dspls, int p_c)
61{
62 Select2ndSRing<bool, VT, VT> BFSsring;
63 if(A.getnnz() > 0 && nnzx > 0)
64 {
65 int splits = A.getnsplit();
66 if(splits > 0)
67 {
68 std::vector< std::vector<int32_t> > indy(splits);
69 std::vector< std::vector< VT > > numy(splits);
70 int32_t nlocrows = static_cast<int32_t>(A.getnrow());
71 int32_t perpiece = nlocrows / splits;
72
73 #ifdef _OPENMP
74 #pragma omp parallel for
75 #endif
76 for(int i=0; i<splits; ++i)
77 {
78 if(i != splits-1)
79 SpMXSpV_ForThreading<BFSsring>(*(A.GetDCSC(i)), perpiece, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
80 else
81 SpMXSpV_ForThreading<BFSsring>(*(A.GetDCSC(i)), nlocrows - perpiece*i, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
82 }
83
84 int32_t perproc = nlocrows / p_c;
85 int32_t last_rec = p_c-1;
86
87 // keep recipients of last entries in each split (-1 for an empty split)
88 // so that we can delete indy[] and numy[] contents as soon as they are processed
89 std::vector<int32_t> end_recs(splits);
90 for(int i=0; i<splits; ++i)
91 {
92 if(indy[i].empty())
93 end_recs[i] = -1;
94 else
95 end_recs[i] = std::min(indy[i].back() / perproc, last_rec);
96 }
97
98 int ** loc_rec_cnts = new int *[splits];
99 #ifdef _OPENMP
100 #pragma omp parallel for
101 #endif
102 for(int i=0; i<splits; ++i)
103 {
104 loc_rec_cnts[i] = new int[p_c](); // thread-local recipient data
105 if(!indy[i].empty()) // guarantee that .begin() and .end() are not null
106 {
107 int32_t cur_rec = std::min( indy[i].front() / perproc, last_rec);
108 int32_t lastdata = (cur_rec+1) * perproc; // one past last entry that goes to this current recipient
109 for(typename std::vector<int32_t>::iterator it = indy[i].begin(); it != indy[i].end(); ++it)
110 {
111 if( ( (*it) >= lastdata ) && cur_rec != last_rec)
112 {
113 cur_rec = std::min( (*it) / perproc, last_rec);
114 lastdata = (cur_rec+1) * perproc;
115 }
116 ++loc_rec_cnts[i][cur_rec];
117 }
118 }
119 }
120 #ifdef _OPENMP
121 #pragma omp parallel for
122 #endif
123 for(int i=0; i<splits; ++i)
124 {
125 if(!indy[i].empty()) // guarantee that .begin() and .end() are not null
126 {
127 // FACT: the data is sorted, so if the recipient of the first entry is the same as the recipient of the last,
128 // then the whole run goes to the same processor
129 int32_t beg_rec = std::min( indy[i].front() / perproc, last_rec);
130 int32_t alreadysent = 0; // already sent per recipient
131 for(int before = i-1; before >= 0; before--)
132 alreadysent += loc_rec_cnts[before][beg_rec];
133
134 if(beg_rec == end_recs[i]) // fast case
135 {
136 std::transform(indy[i].begin(), indy[i].end(), indy[i].begin(), std::bind2nd(std::minus<int32_t>(), perproc*beg_rec));
137 std::copy(indy[i].begin(), indy[i].end(), sendindbuf + dspls[beg_rec] + alreadysent);
138 std::copy(numy[i].begin(), numy[i].end(), sendnumbuf + dspls[beg_rec] + alreadysent);
139 }
140 else // slow case
141 {
142 int32_t cur_rec = beg_rec;
143 int32_t lastdata = (cur_rec+1) * perproc; // one past last entry that goes to this current recipient
144 for(typename std::vector<int32_t>::iterator it = indy[i].begin(); it != indy[i].end(); ++it)
145 {
146 if( ( (*it) >= lastdata ) && cur_rec != last_rec )
147 {
148 cur_rec = std::min( (*it) / perproc, last_rec);
149 lastdata = (cur_rec+1) * perproc;
150
151 // if this split switches to a new recipient after sending some data,
152 // then no data can have been sent to that new recipient yet
153 alreadysent = 0;
154 }
155 sendindbuf[ dspls[cur_rec] + alreadysent ] = (*it) - perproc*cur_rec; // convert to receiver's local index
156 sendnumbuf[ dspls[cur_rec] + (alreadysent++) ] = *(numy[i].begin() + (it-indy[i].begin()));
157 }
158 }
159 }
160 }
161 // Accumulate the per-thread recipient counts into cnts and deallocate them serially once all threads complete
162 for(int i=0; i< splits; ++i)
163 {
164 for(int j=0; j< p_c; ++j)
165 cnts[j] += loc_rec_cnts[i][j];
166 delete [] loc_rec_cnts[i];
167 }
168 delete [] loc_rec_cnts;
169 }
170 else
171 {
172 std::cout << "Something is wrong, splits should be nonzero for multithreaded execution" << std::endl;
173 }
174 }
175}
176
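The send-side packing above relies on the fact that each split produces its output indices in sorted order, so the indices destined for one recipient processor form a contiguous run. A minimal standalone sketch of the bucketing rule (hypothetical names, plain STL, independent of the CombBLAS types used above):

#include <algorithm>
#include <cstdint>
#include <vector>

// Count how many of the (sorted) local output indices go to each of the p_c
// recipient processors: recipient r owns rows [r*perproc, (r+1)*perproc), and
// the last recipient additionally absorbs the remainder rows.
std::vector<int> CountRecipients(const std::vector<int32_t>& sorted_inds,
                                 int32_t perproc, int p_c)
{
    std::vector<int> counts(p_c, 0);
    const int32_t last_rec = p_c - 1;
    for (int32_t ind : sorted_inds)
        ++counts[std::min(ind / perproc, last_rec)];
    return counts;
}

For example, CountRecipients({0, 3, 7, 11}, 4, 3) yields {2, 1, 1}; the per-thread loc_rec_cnts arrays above apply the same rule, just without allocating a temporary vector.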
183template<typename VT, typename IT, typename UDER>
184void LocalSpMV(const SpParMat<IT,bool,UDER> & A, int rowneighs, OptBuf<int32_t, VT > & optbuf, int32_t * & indacc, VT * & numacc, int * sendcnt, int accnz)
185{
186
187#ifdef TIMING
188 double t0=MPI_Wtime();
189#endif
190 if(optbuf.totmax > 0) // graph500 optimization enabled
191 {
192 if(A.spSeq->getnsplit() > 0)
193 {
194 // optbuf.{inds/nums/dspls} and sendcnt are all pre-allocated and only filled by dcsc_gespmv_threaded
195
196 generic_gespmv_threaded_setbuffers< Select2ndSRing<bool, VT, VT> > (*(A.spSeq), indacc, numacc, (int32_t) accnz, optbuf.inds, optbuf.nums, sendcnt, optbuf.dspls, rowneighs);
197 }
198 else
199 {
200 // by-pass dcsc_gespmv call
201 if(A.getlocalnnz() > 0 && accnz > 0)
202 {
203 // ABAB: ignoring optbuf.isthere here
204 // \TODO: Remove .isthere from optbuf definition
205 SpMXSpV< Select2ndSRing<bool, VT, VT> >(*((A.spSeq)->GetInternal()), (int32_t) A.getlocalrows(), indacc, numacc,
206 accnz, optbuf.inds, optbuf.nums, sendcnt, optbuf.dspls, rowneighs);
207 }
208 }
209 DeleteAll(indacc,numacc);
210 }
211 else
212 {
213 SpParHelper::Print("BFS-only (no semiring) functions work only with optimization buffers\n");
214 }
215
216#ifdef TIMING
217 double t1=MPI_Wtime();
218 cblas_localspmvtime += (t1-t0);
219#endif
220}
221
222
223template <typename IU, typename VT>
224void MergeContributions(FullyDistSpVec<IU,VT> & y, int * & recvcnt, int * & rdispls, int32_t * & recvindbuf, VT * & recvnumbuf, int rowneighs)
225{
226#ifdef TIMING
227 double t0=MPI_Wtime();
228#endif
229 // free memory of y, in case it was aliased
230 std::vector<IU>().swap(y.ind);
231 std::vector<VT>().swap(y.num);
232
233#ifndef HEAPMERGE
234 IU ysize = y.MyLocLength(); // my local length is only O(n/p)
235 bool * isthere = new bool[ysize];
236 std::vector< std::pair<IU,VT> > ts_pairs;
237 std::fill_n(isthere, ysize, false);
238
239 // We don't need to keep a "merger" because the minimum will always come from the processor
240 // with the smallest rank, so a linear sweep over the received buffers is enough
241 for(int i=0; i<rowneighs; ++i)
242 {
243 for(int j=0; j< recvcnt[i]; ++j)
244 {
245 int32_t index = recvindbuf[rdispls[i] + j];
246 if(!isthere[index])
247 ts_pairs.push_back(std::make_pair(index, recvnumbuf[rdispls[i] + j]));
248
249 }
250 }
251 DeleteAll(recvcnt, rdispls);
252 DeleteAll(isthere, recvindbuf, recvnumbuf);
253 __gnu_parallel::sort(ts_pairs.begin(), ts_pairs.end());
254 int nnzy = ts_pairs.size();
255 y.ind.resize(nnzy);
256 y.num.resize(nnzy);
257 for(int i=0; i< nnzy; ++i)
258 {
259 y.ind[i] = ts_pairs[i].first;
260 y.num[i] = ts_pairs[i].second;
261 }
262
263#else
264 // Alternative 2: Heap-merge
265 int32_t hsize = 0;
266 int32_t inf = std::numeric_limits<int32_t>::min();
267 int32_t sup = std::numeric_limits<int32_t>::max();
268 KNHeap< int32_t, int32_t > sHeap(sup, inf);
269 int * processed = new int[rowneighs]();
270 for(int32_t i=0; i<rowneighs; ++i)
271 {
272 if(recvcnt[i] > 0)
273 {
274 // key, proc_id
275 sHeap.insert(recvindbuf[rdispls[i]], i);
276 ++hsize;
277 }
278 }
279 int32_t key, locv;
280 if(hsize > 0)
281 {
282 sHeap.deleteMin(&key, &locv);
283 y.ind.push_back( static_cast<IU>(key));
284 y.num.push_back(recvnumbuf[rdispls[locv]]); // nothing is processed yet
285
286 if( (++(processed[locv])) < recvcnt[locv] )
287 sHeap.insert(recvindbuf[rdispls[locv]+processed[locv]], locv);
288 else
289 --hsize;
290 }
291
292// ofstream oput;
293// y.commGrid->OpenDebugFile("Merge", oput);
294// oput << "From displacements: "; copy(rdispls, rdispls+rowneighs, ostream_iterator<int>(oput, " ")); oput << endl;
295// oput << "From counts: "; copy(recvcnt, recvcnt+rowneighs, ostream_iterator<int>(oput, " ")); oput << endl;
296 while(hsize > 0)
297 {
298 sHeap.deleteMin(&key, &locv);
299 IU deref = rdispls[locv] + processed[locv];
300 if(y.ind.back() != static_cast<IU>(key)) // y.ind is surely not empty
301 {
302 y.ind.push_back(static_cast<IU>(key));
303 y.num.push_back(recvnumbuf[deref]);
304 }
305
306 if( (++(processed[locv])) < recvcnt[locv] )
307 sHeap.insert(recvindbuf[rdispls[locv]+processed[locv]], locv);
308 else
309 --hsize;
310 }
311 DeleteAll(recvcnt, rdispls,processed);
312 DeleteAll(recvindbuf, recvnumbuf);
313#endif
314
315#ifdef TIMING
316 double t1=MPI_Wtime();
317 cblas_mergeconttime += (t1-t0);
318#endif
319}
320
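The HEAPMERGE path above is a k-way merge of the per-processor receive buffers, each of which arrives sorted by index; only the first occurrence of every index is kept (any of the tied contributions is a valid BFS parent). A minimal sketch of the same merge with std::priority_queue in place of KNHeap (hypothetical names, not the library API):

#include <cstdint>
#include <queue>
#include <utility>
#include <vector>

// Merge k individually sorted index buffers, dropping duplicates; at equal
// keys the entry from the lower buffer id pops first and is the one kept.
std::vector<int32_t> KWayMergeUnique(const std::vector<std::vector<int32_t>>& bufs)
{
    using Entry = std::pair<int32_t, int>;                     // (key, buffer id)
    std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> heap;
    std::vector<std::size_t> pos(bufs.size(), 0);
    for (int b = 0; b < static_cast<int>(bufs.size()); ++b)
        if (!bufs[b].empty()) heap.push({bufs[b][0], b});

    std::vector<int32_t> merged;
    while (!heap.empty()) {
        auto [key, b] = heap.top();
        heap.pop();
        if (merged.empty() || merged.back() != key)            // keep first occurrence only
            merged.push_back(key);
        if (++pos[b] < bufs[b].size())
            heap.push({bufs[b][pos[b]], b});
    }
    return merged;
}

The real code additionally carries the numerical values alongside the indices and writes directly into y.ind and y.num, but the merge order is the same.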
327template <typename VT, typename IT, typename UDER>
328FullyDistSpVec<IT,VT> SpMV (const SpParMat<IT,bool,UDER> & A, const FullyDistSpVec<IT,VT> & x, OptBuf<int32_t, VT > & optbuf)
329{
330 CheckSpMVCompliance(A,x);
331 optbuf.MarkEmpty();
332
333 MPI_Comm World = x.commGrid->GetWorld();
334 MPI_Comm ColWorld = x.commGrid->GetColWorld();
335 MPI_Comm RowWorld = x.commGrid->GetRowWorld();
336
337 int accnz;
338 int32_t trxlocnz;
339 IT lenuntil;
340 int32_t *trxinds, *indacc;
341 VT *trxnums, *numacc;
342
343
344#ifdef TIMING
345 double t0=MPI_Wtime();
346#endif
347 TransposeVector(World, x, trxlocnz, lenuntil, trxinds, trxnums, true); // trxinds (and potentially trxnums) is allocated
348#ifdef TIMING
349 double t1=MPI_Wtime();
350 cblas_transvectime += (t1-t0);
351#endif
352 AllGatherVector(ColWorld, trxlocnz, lenuntil, trxinds, trxnums, indacc, numacc, accnz, true); // trxinds (and potentially trxnums) is deallocated, indacc/numacc allocated
353
354 FullyDistSpVec<IT, VT> y ( x.commGrid, A.getnrow()); // identity doesn't matter for sparse vectors
355 int rowneighs; MPI_Comm_size(RowWorld,&rowneighs);
356 int * sendcnt = new int[rowneighs]();
357
358 LocalSpMV(A, rowneighs, optbuf, indacc, numacc, sendcnt, accnz); // indacc/numacc deallocated
359
360 int * rdispls = new int[rowneighs];
361 int * recvcnt = new int[rowneighs];
362 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, RowWorld); // share the request counts
363
364 // receive displacements are exact whereas send displacements have slack
365 rdispls[0] = 0;
366 for(int i=0; i<rowneighs-1; ++i)
367 {
368 rdispls[i+1] = rdispls[i] + recvcnt[i];
369 }
370 int totrecv = std::accumulate(recvcnt,recvcnt+rowneighs,0);
371 int32_t * recvindbuf = new int32_t[totrecv];
372 VT * recvnumbuf = new VT[totrecv];
373
374#ifdef TIMING
375 double t2=MPI_Wtime();
376#endif
377 if(optbuf.totmax > 0 ) // graph500 optimization enabled
378 {
379 MPI_Alltoallv(optbuf.inds, sendcnt, optbuf.dspls, MPIType<int32_t>(), recvindbuf, recvcnt, rdispls, MPIType<int32_t>(), RowWorld);
380 MPI_Alltoallv(optbuf.nums, sendcnt, optbuf.dspls, MPIType<VT>(), recvnumbuf, recvcnt, rdispls, MPIType<VT>(), RowWorld);
381 delete [] sendcnt;
382 }
383 else
384 {
385 SpParHelper::Print("BFS-only (no semiring) functions work only with optimization buffers\n");
386 }
387#ifdef TIMING
388 double t3=MPI_Wtime();
389 cblas_alltoalltime += (t3-t2);
390#endif
391
392 MergeContributions(y,recvcnt, rdispls, recvindbuf, recvnumbuf, rowneighs);
393 return y;
394}
395
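Taken together, TransposeVector, AllGatherVector, LocalSpMV and MergeContributions implement one frontier expansion. A hedged sketch of how a top-down BFS loop might drive this SpMV overload, loosely following the Graph500 applications shipped with CombBLAS; the OptimizeForGraph500, setNumToInd, EWiseMult and Set calls are assumed to exist with these signatures and are not defined in this header:

#include "CombBLAS/CombBLAS.h"   // assumed umbrella header

using namespace combblas;

// Hypothetical driver: expand the BFS fringe with the buffer-optimized SpMV
// defined above until no new vertices are discovered.
void RunTopDownBFS(SpParMat<int64_t, bool, SpDCCols<int64_t, bool>>& A, int64_t source)
{
    OptBuf<int32_t, int64_t> optbuf;
    A.OptimizeForGraph500(optbuf);                       // pre-allocates optbuf.inds/nums/dspls

    FullyDistVec<int64_t, int64_t> parents(A.getcommgrid(), A.getncol(), (int64_t)-1);
    FullyDistSpVec<int64_t, int64_t> fringe(A.getcommgrid(), A.getncol());
    fringe.SetElement(source, source);
    parents.SetElement(source, source);

    while (fringe.getnnz() > 0)
    {
        fringe.setNumToInd();                            // value of each entry := its global index
        fringe = SpMV(A, fringe, optbuf);                // the friend function defined above
        fringe = EWiseMult(fringe, parents, true, (int64_t)-1);  // drop already-discovered vertices
        parents.Set(fringe);                             // record the newly found parents
    }
}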
396template <typename VT, typename IT, typename UDER>
397SpDCCols<int,bool>::SpColIter* CalcSubStarts(SpParMat<IT,bool,UDER> & A, FullyDistSpVec<IT,VT> & x, BitMapCarousel<IT,VT> &done) {
398 std::shared_ptr<CommGrid> cg = A.getcommgrid();
399 IT rowuntil = x.LengthUntil();
400 MPI_Comm RowWorld = cg->GetRowWorld();
401 MPI_Bcast(&rowuntil, 1, MPIType<IT>(), 0, RowWorld);
402 int numcols = cg->GetGridCols();
403 SpDCCols<int,bool>::SpColIter colit = A.seq().begcol();
404#ifdef THREADED
405 SpDCCols<int,bool>::SpColIter* starts = new SpDCCols<int,bool>::SpColIter[numcols*cblas_splits+1];
406 for(int c=0; c<numcols; c++) {
407 IT curr_sub_start = done.GetGlobalStartOfLocal(c) - rowuntil;
408 IT next_sub_start = done.GetGlobalEndOfLocal(c) - rowuntil;
409 IT sub_range = next_sub_start - curr_sub_start;
410 IT per_thread = (sub_range + cblas_splits - 1) / cblas_splits;
411 IT curr_thread_start = curr_sub_start;
412 for (int t=0; t<cblas_splits; t++) {
413 while(colit.colid() < curr_thread_start) {
414 ++colit;
415 }
416 starts[c*cblas_splits + t] = colit;
417 curr_thread_start = std::min(curr_thread_start + per_thread, next_sub_start);
418 }
419 }
420 starts[numcols*cblas_splits] = A.seq().endcol();
421#else
422 SpDCCols<int,bool>::SpColIter* starts = new SpDCCols<int,bool>::SpColIter[numcols+1];
423 for(int c=0; c<numcols; c++) {
424 IT next_start = done.GetGlobalStartOfLocal(c) - rowuntil;
425 while(colit.colid() < next_start) {
426 ++colit;
427 }
428 starts[c] = colit;
429 }
430 starts[numcols] = A.seq().endcol();
431#endif
432 return starts;
433}
434
435template <typename VT, typename IT>
436void UpdateParents(MPI_Comm & RowWorld, std::pair<IT,IT> *updates, int num_updates, FullyDistVec<IT,VT> &parents, int source, int dest, BitMapFringe<int64_t,int64_t> &bm_fringe) {
437 int send_words = num_updates<<1, recv_words;
438 MPI_Status status;
439 MPI_Sendrecv(&send_words, 1, MPI_INT, dest, PUPSIZE,
440 &recv_words, 1, MPI_INT, source, PUPSIZE, RowWorld, &status);
441 std::pair<IT,IT>* recv_buff = new std::pair<IT,IT>[recv_words>>1];
442 MPI_Sendrecv(updates, send_words, MPIType<IT>(), dest, PUPDATA,
443 recv_buff, recv_words, MPIType<IT>(), source, PUPDATA, RowWorld, &status);
444
445#ifdef THREADED
446#pragma omp parallel for
447#endif
448 for (int i=0; i<recv_words>>1; i++) {
449 parents.SetLocalElement(recv_buff[i].first, recv_buff[i].second);
450 }
451
452 bm_fringe.IncrementNumSet((recv_words>>1));
453 delete[] recv_buff;
454}
455
456
457template <typename VT, typename IT, typename UDER>
458void BottomUpStep(SpParMat<IT,bool,UDER> & A, FullyDistSpVec<IT,VT> & x, BitMapFringe<int64_t,int64_t> &bm_fringe, FullyDistVec<IT,VT> & parents, BitMapCarousel<IT,VT> &done, SpDCCols<int,bool>::SpColIter* starts)
459{
460 std::shared_ptr<CommGrid> cg = A.getcommgrid();
461 MPI_Comm World = cg->GetWorld();
462 MPI_Comm ColWorld = cg->GetColWorld();
463 MPI_Comm RowWorld = cg->GetRowWorld();
464 MPI_Status status;
465
466 // get row and column offsets
467 IT rowuntil = x.LengthUntil(), my_coluntil = x.LengthUntil(), coluntil;
468 int diagneigh = cg->GetComplementRank();
469 MPI_Sendrecv(&my_coluntil, 1, MPIType<IT>(), diagneigh, TROST, &coluntil, 1, MPIType<IT>(), diagneigh, TROST, World, &status);
470 MPI_Bcast(&coluntil, 1, MPIType<IT>(), 0, ColWorld);
471 MPI_Bcast(&rowuntil, 1, MPIType<IT>(), 0, RowWorld);
472
473 BitMap* frontier = bm_fringe.TransposeGather();
474 done.SaveOld();
475
476#ifdef THREADED
477 const int buff_size = 8192;
478 std::pair<IT,IT>* local_update_heads[cblas_splits];
479 for (int t=0; t<cblas_splits; t++)
480 local_update_heads[t] = new std::pair<IT,IT>[buff_size];
481#endif
482
483 // do bottom up work
484 int numcols = cg->GetGridCols();
485 int mycol = cg->GetRankInProcRow();
486 std::pair<IT,IT>* parent_updates = new std::pair<IT,IT>[done.SizeOfChunk()<<1]; // over-allocated
487
488 for (int sub_step=0; sub_step<numcols; sub_step++) {
489 int num_updates = 0;
490 IT sub_start = done.GetGlobalStartOfLocal();
491 int dest_slice = (mycol + sub_step) % numcols;
492 int source_slice = (mycol - sub_step + numcols) % numcols;
493#ifdef BOTTOMUPTIME
494 double t1 = MPI_Wtime();
495#endif
496#ifdef THREADED
497#pragma omp parallel
498 {
499 int id = omp_get_thread_num();
500 int num_locals=0;
501 SpDCCols<int,bool>::SpColIter::NzIter nzit, nzit_end;
502 SpDCCols<int,bool>::SpColIter colit, colit_end;
503 std::pair<IT,IT>* local_updates = local_update_heads[id];
504 // vector<pair<IT,IT> > local_updates;
505 colit_end = starts[dest_slice*cblas_splits + id + 1];
506 for(colit = starts[dest_slice*cblas_splits + id]; colit != colit_end; ++colit) {
507 int32_t local_row_ind = colit.colid();
508 IT row = local_row_ind + rowuntil;
509 if (!done.GetBit(row)) {
510 nzit_end = A.seq().endnz(colit);
511 for(nzit = A.seq().begnz(colit); nzit != nzit_end; ++nzit) {
512 int32_t local_col_ind = nzit.rowid();
513 IT col = local_col_ind + coluntil;
514 if (frontier->get_bit(local_col_ind)) {
515 // local_updates.push_back(make_pair(row-sub_start, col));
516 if (num_locals == buff_size) {
517 int copy_start = __sync_fetch_and_add(&num_updates, buff_size);
518 std::copy(local_updates, local_updates + buff_size, parent_updates + copy_start);
519 num_locals = 0;
520 }
521 local_updates[num_locals++] = std::make_pair(row-sub_start, col);
522 done.SetBit(row);
523 break;
524 }
525 }
526 }
527 }
528 int copy_start = __sync_fetch_and_add(&num_updates, num_locals);
529 std::copy(local_updates, local_updates + num_locals, parent_updates + copy_start);
530 }
531#else
532 SpDCCols<int,bool>::SpColIter::NzIter nzit, nzit_end;
533 SpDCCols<int,bool>::SpColIter colit, colit_end;
534 colit_end = starts[dest_slice+1];
535 for(colit = starts[dest_slice]; colit != colit_end; ++colit)
536 {
537 int32_t local_row_ind = colit.colid();
538 IT row = local_row_ind + rowuntil;
539 if (!done.GetBit(row))
540 {
541 nzit_end = A.seq().endnz(colit);
542 for(nzit = A.seq().begnz(colit); nzit != nzit_end; ++nzit)
543 {
544 int32_t local_col_ind = nzit.rowid();
545 IT col = local_col_ind + coluntil;
546 if (frontier->get_bit(local_col_ind))
547 {
548 parent_updates[num_updates++] = std::make_pair(row-sub_start, col);
549 done.SetBit(row);
550 break;
551 }
552 } // end_for
553 } // end_if
554 } // end_for
555#endif
556
557#ifdef BOTTOMUPTIME
558 double t2 = MPI_Wtime();
559 bu_local += (t2-t1);
560 t1 = MPI_Wtime();
561#endif
562 done.RotateAlongRow();
563
564#ifdef BOTTOMUPTIME
565 t2 = MPI_Wtime();
566 bu_rotate += (t2-t1);
567 t1 = MPI_Wtime();
568#endif
569 UpdateParents(RowWorld, parent_updates, num_updates, parents, source_slice, dest_slice, bm_fringe);
570#ifdef BOTTOMUPTIME
571 t2 = MPI_Wtime();
572 bu_update += (t2-t1);
573#endif
574 }
575 bm_fringe.LoadFromNext();
576 done.UpdateFringe(bm_fringe);
577#ifdef THREADED
578 for (int t=0; t<cblas_splits; t++)
579 delete[] local_update_heads[t];
580#endif
581 delete[] parent_updates;
582}
583
584}
585
586#endif
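The heart of BottomUpStep above is the direction-optimizing rule: instead of expanding the frontier outward, every still-undiscovered vertex scans its incoming edges and adopts the first neighbor it finds in the current frontier as its parent. A serial, shared-memory sketch of that rule with hypothetical names (no 2D decomposition, bitmap rotation, or MPI exchange):

#include <cstdint>
#include <utility>
#include <vector>

// One bottom-up sweep: returns (child, parent) pairs for every vertex that was
// undiscovered on entry and has at least one neighbor in the frontier.
std::vector<std::pair<int64_t, int64_t>>
BottomUpSweep(const std::vector<std::vector<int64_t>>& in_neighbors,
              const std::vector<bool>& frontier,
              std::vector<bool>& done)
{
    std::vector<std::pair<int64_t, int64_t>> parent_updates;
    for (int64_t v = 0; v < static_cast<int64_t>(in_neighbors.size()); ++v)
    {
        if (done[v]) continue;                           // already has a parent
        for (int64_t u : in_neighbors[v])
        {
            if (frontier[u])
            {
                parent_updates.emplace_back(v, u);       // u becomes v's parent
                done[v] = true;
                break;                                   // one parent is enough
            }
        }
    }
    return parent_updates;
}

In the distributed version each processor only sees a slice of the rows, so the role of done is played by the rotating BitMapCarousel, the frontier bitmap is gathered along the processor column, and the collected parent_updates are shipped to their owners by UpdateParents.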