COMBINATORIAL_BLAS 1.6
 
CC.h
1/****************************************************************/
2/* Parallel Combinatorial BLAS Library (for Graph Computations) */
3/* version 1.6 -------------------------------------------------*/
4/* date: 6/15/2017 ---------------------------------------------*/
5/* authors: Ariful Azad, Aydin Buluc --------------------------*/
6/****************************************************************/
7/*
8 Copyright (c) 2010-2017, The Regents of the University of California
9
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 THE SOFTWARE.
27 */
28
29
30#include <mpi.h>
31
32// These macros should be defined before stdint.h is included
33#ifndef __STDC_CONSTANT_MACROS
34#define __STDC_CONSTANT_MACROS
35#endif
36#ifndef __STDC_LIMIT_MACROS
37#define __STDC_LIMIT_MACROS
38#endif
39#include <stdint.h>
40
41#include <sys/time.h>
42#include <iostream>
43#include <fstream>
44#include <string>
45#include <sstream>
46#include <ctime>
47#include <cmath>
48#include "CombBLAS/CombBLAS.h"
49//#define CC_TIMING 1
50
51#define NONSTAR 0
52#define STAR 1
53#define CONVERGED 2
54using namespace std;
55
60namespace combblas {
61
62 template <typename T1, typename T2>
63 struct Select2ndMinSR
64 {
65 typedef typename promote_trait<T1,T2>::T_promote T_promote;
66 static T_promote id(){ return std::numeric_limits<T_promote>::max(); };
67 static bool returnedSAID() { return false; }
68 static MPI_Op mpi_op() { return MPI_MIN; };
69
70 static T_promote add(const T_promote & arg1, const T_promote & arg2)
71 {
72 return std::min(arg1, arg2);
73 }
74
75 static T_promote multiply(const T1 & arg1, const T2 & arg2)
76 {
77 return static_cast<T_promote> (arg2);
78 }
79
80 static void axpy(const T1 a, const T2 & x, T_promote & y)
81 {
82 y = add(y, multiply(a, x));
83 }
84 };
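// Example (a minimal serial sketch, not part of the library): an SpMV with this
// semiring computes y[i] = min over nonzeros A(i,j) of x[j], i.e. the smallest
// value held by any neighbor; multiply() selects the second operand and add() takes the min.
/*
#include <algorithm>
#include <limits>
#include <vector>
std::vector<int64_t> minNeighborSerial(const std::vector<std::vector<bool>>& adj,
                                       const std::vector<int64_t>& x)
{
    std::vector<int64_t> y(adj.size(), std::numeric_limits<int64_t>::max()); // id()
    for(size_t i = 0; i < adj.size(); ++i)
        for(size_t j = 0; j < adj[i].size(); ++j)
            if(adj[i][j]) y[i] = std::min(y[i], x[j]); // axpy: y = add(y, multiply(a,x))
    return y;
}
*/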
85
86
87
88 template <class T, class I>
89 void omp_par_scan(T* A, T* B,I cnt)
90 {
91 int p=omp_get_max_threads();
92 if(cnt<100*p){
93 for(I i=1;i<cnt;i++)
94 B[i]=B[i-1]+A[i-1];
95 return;
96 }
97 I step_size=cnt/p;
98
99#pragma omp parallel for
100 for(int i=0; i<p; i++){
101 int start=i*step_size;
102 int end=start+step_size;
103 if(i==p-1) end=cnt;
104 if(i!=0)B[start]=0;
105 for(I j=start+1; j<end; j++)
106 B[j]=B[j-1]+A[j-1];
107 }
108
109 T* sum=new T[p];
110 sum[0]=0;
111 for(int i=1;i<p;i++)
112 sum[i]=sum[i-1]+B[i*step_size-1]+A[i*step_size-1];
113
114#pragma omp parallel for
115 for(int i=1; i<p; i++){
116 int start=i*step_size;
117 int end=start+step_size;
118 if(i==p-1) end=cnt;
119 T sum_=sum[i];
120 for(I j=start; j<end; j++)
121 B[j]+=sum_;
122 }
123 delete[] sum;
124 }
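// Usage note (an illustrative sketch): this is an exclusive prefix sum,
// B[i] = A[0] + ... + A[i-1], and every caller in this file seeds B[0] itself
// (e.g. sdisp[0]=0 below) since the routine never writes B[0] on thread 0.
/*
int A[4] = {3, 1, 4, 1};
int B[4]; B[0] = 0;
omp_par_scan(A, B, 4); // B becomes {0, 3, 4, 8}
*/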
125
126
127
128
129 // copied from usort so that we can select k
130 // an increased value of k reduces the bandwidth cost, but increases the latency cost
131 // this does not work when p is not a power of two and a processor is not sending data.
132 template <typename T>
133 int Mpi_Alltoallv_kway(T* sbuff_, int* s_cnt_, int* sdisp_,
134 T* rbuff_, int* r_cnt_, int* rdisp_, MPI_Comm c, int kway=2)
135 {
136 int np, pid;
137 MPI_Comm_size(c, &np);
138 MPI_Comm_rank(c, &pid);
139
140 if(np==1 || kway==1)
141 {
142 return Mpi_Alltoallv_sparse(sbuff_, s_cnt_, sdisp_, rbuff_, r_cnt_, rdisp_, c);
143 }
144
145 int range[2]={0,np};
146
147 std::vector<int> s_cnt(np);
148#pragma omp parallel for
149 for(int i=0;i<np;i++){
150 s_cnt[i]=s_cnt_[i]*sizeof(T)+2*sizeof(int);
151 }
152 std::vector<int> sdisp(np); sdisp[0]=0;
153 omp_par_scan(&s_cnt[0],&sdisp[0],np);
154
155 char* sbuff=new char[sdisp[np-1]+s_cnt[np-1]];
156#pragma omp parallel for
157 for(int i=0;i<np;i++){
158 ((int*)&sbuff[sdisp[i]])[0]=s_cnt[i];
159 ((int*)&sbuff[sdisp[i]])[1]=pid;
160 memcpy(&sbuff[sdisp[i]]+2*sizeof(int),&sbuff_[sdisp_[i]],s_cnt[i]-2*sizeof(int));
161 }
162
163 //int t_indx=0;
164 int iter_cnt=0;
165 while(range[1]-range[0]>1){
166 iter_cnt++;
167 if(kway>range[1]-range[0])
168 kway=range[1]-range[0];
169
170 std::vector<int> new_range(kway+1);
171 for(int i=0;i<=kway;i++)
172 new_range[i]=(range[0]*(kway-i)+range[1]*i)/kway;
173 int p_class=(std::upper_bound(&new_range[0],&new_range[kway],pid)-&new_range[0]-1);
174 int new_np =new_range[p_class+1]-new_range[p_class];
175 int new_pid=pid-new_range[p_class];
176
177 //Communication.
178 {
179 std::vector<int> r_cnt (new_np*kway, 0);
180 std::vector<int> r_cnt_ext(new_np*kway, 0);
181 //Exchange send sizes.
182 for(int i=0;i<kway;i++){
183 MPI_Status status;
184 int cmp_np=new_range[i+1]-new_range[i];
185 int partner=(new_pid<cmp_np? new_range[i]+new_pid: new_range[i+1]-1) ;
186 assert( (new_pid<cmp_np? true: new_range[i]+new_pid==new_range[i+1] )); //Remove this.
187 MPI_Sendrecv(&s_cnt[new_range[i]-new_range[0]], new_np, MPI_INT, partner, 0,
188 &r_cnt[new_np *i ], new_np, MPI_INT, partner, 0, c, &status);
189
190 //Handle extra communication.
191 if(new_pid==new_np-1 && cmp_np>new_np){
192 int partner=new_range[i+1]-1;
193 std::vector<int> s_cnt_ext(cmp_np, 0);
194 MPI_Sendrecv(&s_cnt_ext[0], cmp_np, MPI_INT, partner, 0,
195 &r_cnt_ext[new_np *i ], new_np, MPI_INT, partner, 0, c, &status);
196 }
197 }
198
199 //Allocate receive buffer.
200 std::vector<int> rdisp (new_np*kway, 0);
201 std::vector<int> rdisp_ext(new_np*kway, 0);
202 int rbuff_size, rbuff_size_ext;
203 char *rbuff, *rbuff_ext;
204 {
205 omp_par_scan(&r_cnt [0], &rdisp [0],new_np*kway);
206 omp_par_scan(&r_cnt_ext[0], &rdisp_ext[0],new_np*kway);
207 rbuff_size = rdisp [new_np*kway-1]+r_cnt [new_np*kway-1];
208 rbuff_size_ext = rdisp_ext[new_np*kway-1]+r_cnt_ext[new_np*kway-1];
209 rbuff = new char[rbuff_size ];
210 rbuff_ext = new char[rbuff_size_ext];
211 }
212
213 //Sendrecv data.
214 //*
215 int my_block=kway;
216 while(pid<new_range[my_block]) my_block--;
217 // MPI_Barrier(c);
218 for(int i_=0;i_<=kway/2;i_++){
219 int i1=(my_block+i_)%kway;
220 int i2=(my_block+kway-i_)%kway;
221
222 for(int j=0;j<(i_==0 || i_==kway/2?1:2);j++){
223 int i=(i_==0?i1:((j+my_block/i_)%2?i1:i2));
224 MPI_Status status;
225 int cmp_np=new_range[i+1]-new_range[i];
226 int partner=(new_pid<cmp_np? new_range[i]+new_pid: new_range[i+1]-1) ;
227
228 int send_dsp =sdisp[new_range[i ]-new_range[0] ];
229 int send_dsp_last=sdisp[new_range[i+1]-new_range[0]-1];
230 int send_cnt =s_cnt[new_range[i+1]-new_range[0]-1]+send_dsp_last-send_dsp;
231
232 // ttt=omp_get_wtime();
233 MPI_Sendrecv(&sbuff[send_dsp], send_cnt, MPI_BYTE, partner, 0,
234 &rbuff[rdisp[new_np * i ]], r_cnt[new_np *(i+1)-1]+rdisp[new_np *(i+1)-1]-rdisp[new_np * i ], MPI_BYTE, partner, 0, c, &status);
235
236 //Handle extra communication.
237 if(pid==new_np-1 && cmp_np>new_np){
238 int partner=new_range[i+1]-1;
239 std::vector<int> s_cnt_ext(cmp_np, 0);
240 MPI_Sendrecv(NULL, 0, MPI_BYTE, partner, 0,
241 &rbuff_ext[rdisp_ext[new_np * i ]], r_cnt_ext[new_np *(i+1)-1]+rdisp_ext[new_np *(i+1)-1]-rdisp_ext[new_np * i ], MPI_BYTE, partner, 0, c, &status);
242 }
243 }
244 }
245
246 //Rearrange received data.
247 {
248 if(sbuff!=NULL) delete[] sbuff;
249 sbuff=new char[rbuff_size+rbuff_size_ext];
250
251 std::vector<int> cnt_new(2*new_np*kway, 0);
252 std::vector<int> disp_new(2*new_np*kway, 0);
253 for(int i=0;i<new_np;i++)
254 for(int j=0;j<kway;j++){
255 cnt_new[(i*2 )*kway+j]=r_cnt [j*new_np+i];
256 cnt_new[(i*2+1)*kway+j]=r_cnt_ext[j*new_np+i];
257 }
258 omp_par_scan(&cnt_new[0], &disp_new[0], 2*new_np*kway);
259
260#pragma omp parallel for
261 for(int i=0;i<new_np;i++)
262 for(int j=0;j<kway;j++){
263 memcpy(&sbuff[disp_new[(i*2 )*kway+j]], &rbuff [rdisp [j*new_np+i]], r_cnt [j*new_np+i]);
264 memcpy(&sbuff[disp_new[(i*2+1)*kway+j]], &rbuff_ext[rdisp_ext[j*new_np+i]], r_cnt_ext[j*new_np+i]);
265 }
266
267 //Free memory.
268 if(rbuff !=NULL) delete[] rbuff ;
269 if(rbuff_ext!=NULL) delete[] rbuff_ext;
270
271 s_cnt.clear();
272 s_cnt.resize(new_np,0);
273 sdisp.resize(new_np);
274 for(int i=0;i<new_np;i++){
275 for(int j=0;j<2*kway;j++)
276 s_cnt[i]+=cnt_new[i*2*kway+j];
277 sdisp[i]=disp_new[i*2*kway];
278 }
279 }
280 }
281
282 range[0]=new_range[p_class ];
283 range[1]=new_range[p_class+1];
284 }
285
286 //Copy data to rbuff_.
287 std::vector<char*> buff_ptr(np);
288 char* tmp_ptr=sbuff;
289 for(int i=0;i<np;i++){
290 int& blk_size=((int*)tmp_ptr)[0];
291 buff_ptr[i]=tmp_ptr;
292 tmp_ptr=tmp_ptr+blk_size;
293 }
294#pragma omp parallel for
295 for(int i=0;i<np;i++){
296 int& blk_size=((int*)buff_ptr[i])[0];
297 int& src_pid=((int*)buff_ptr[i])[1];
298 assert(blk_size-2*sizeof(int)<=r_cnt_[src_pid]*sizeof(T));
299 memcpy(&rbuff_[rdisp_[src_pid]],buff_ptr[i]+2*sizeof(int),blk_size-2*sizeof(int));
300 }
301
302 //Free memory.
303 if(sbuff !=NULL) delete[] sbuff;
304 return 1;
305
306 }
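// Usage (a hedged sketch): the k-way exchange is intended as a drop-in for
// MPI_Alltoallv, with counts and displacements given in elements of T:
/*
int np; MPI_Comm_size(MPI_COMM_WORLD, &np);
std::vector<int> scnt(np, 2), sdsp(np, 0), rcnt(np, 2), rdsp(np, 0);
for(int i = 1; i < np; i++){ sdsp[i] = sdsp[i-1] + scnt[i-1]; rdsp[i] = rdsp[i-1] + rcnt[i-1]; }
std::vector<double> sbuf(2*np, 1.0), rbuf(2*np);
Mpi_Alltoallv_kway(sbuf.data(), scnt.data(), sdsp.data(),
                   rbuf.data(), rcnt.data(), rdsp.data(), MPI_COMM_WORLD, 4); // kway=4
*/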
307
308
309
310 template <typename T>
311 int Mpi_Alltoallv(T* sbuff, int* s_cnt, int* sdisp,
312 T* rbuff, int* r_cnt, int* rdisp, MPI_Comm comm)
313 {
314 int nprocs, rank;
315 MPI_Comm_size(comm, &nprocs);
316 MPI_Comm_rank(comm, &rank);
317
318 int commCnt = 0;
319 for(int i = 0; i < nprocs; i++)
320 {
321 if(i==rank) continue;
322 if(s_cnt[i] > 0) commCnt++;
323 if(r_cnt[i] > 0) commCnt++;
324 }
325 int totalCommCnt = 0;
326 MPI_Allreduce(&commCnt, &totalCommCnt, 1, MPI_INT, MPI_SUM, comm);
327
328 if(totalCommCnt < 2*log2(nprocs))
329 {
330 Mpi_Alltoallv_sparse(sbuff, s_cnt, sdisp, rbuff, r_cnt, rdisp, comm);
331 }
332 else if((nprocs & (nprocs - 1)) == 0) // processor count is power of 2
333 {
335 }
336 else
337 {
338 MPI_Alltoallv(sbuff, s_cnt, sdisp, MPIType<T>(), rbuff, r_cnt, rdisp, MPIType<T>(), comm);
339 }
340
341 return 1;
342
343 }
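// The dispatch rule in numbers (illustrative): with nprocs = 1024, the sparse
// point-to-point path is taken only if fewer than 2*log2(1024) = 20 nonzero
// send/recv pairs exist in total; otherwise the k-way exchange is used because
// 1024 is a power of two, and plain MPI_Alltoallv is the fallback for
// non-power-of-two process counts.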
344
345
346 template <class IT, class NT>
347 int replicate(const FullyDistVec<IT,NT> dense, FullyDistSpVec<IT,IT> ri, std::vector<std::vector<NT>> & bcastBuffer)
348 {
349 auto commGrid = dense.getcommgrid();
350 MPI_Comm World = commGrid->GetWorld();
351 int nprocs = commGrid->GetSize();
352
353 std::vector<int> sendcnt(nprocs, 0);
354 std::vector<int> recvcnt(nprocs);
355 std::vector<IT> rinum = ri.GetLocalNum();
356 IT riloclen = rinum.size();
357 for(IT i=0; i < riloclen; ++i)
358 {
359 IT locind;
360 int owner = dense.Owner(rinum[i], locind);
361 sendcnt[owner]++;
362 }
363
364 MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
365 IT totrecv = std::accumulate(recvcnt.begin(),recvcnt.end(), static_cast<IT>(0));
366
367 double broadcast_cost = dense.LocArrSize() * log2(nprocs); // bandwidth cost
368 IT bcastsize = 0;
370
371 int nbcast = 0;
372 if(broadcast_cost < totrecv)
373 {
374 bcastsize = dense.LocArrSize();
375 }
376 std::vector<IT> bcastcnt(nprocs); MPI_Allgather(&bcastsize, 1, MPIType<IT>(), bcastcnt.data(), 1, MPIType<IT>(), World);
377
378 for(int i=0; i<nprocs; i++)
379 {
380 if(bcastcnt[i]>0) nbcast++;
381 }
382
383 if(nbcast > 0)
384 {
385 MPI_Request* requests = new MPI_Request[nbcast];
386 MPI_Status* statuses = new MPI_Status[nbcast];
387
390
391 int ibcast = 0;
392 const NT * arr = dense.GetLocArr();
393 for(int i=0; i<nprocs; i++)
394 {
395 if(bcastcnt[i]>0)
396 {
397 bcastBuffer[i].resize(bcastcnt[i]);
398 std::copy(arr, arr+bcastcnt[i], bcastBuffer[i].begin());
399 MPI_Ibcast(bcastBuffer[i].data(), bcastcnt[i], MPIType<NT>(), i, World, &requests[ibcast++]);
400 }
401 }
402
403 MPI_Waitall(nbcast, requests, statuses);
404 delete [] requests;
405 delete [] statuses;
406 }
407 return nbcast;
408 }
409
410 // SubRef using a sparse vector
411 // given a dense vector dv and a sparse vector sv
412 // sv_out[i]=dv[sv[i]] for all nonzero index i in sv
413 // return sv_out
414 // If sv has repeated entries, many processes may request the same entries of dv from the same processes
415 // (usually from the low rank processes in LACC)
416 // In this case, it may be beneficial to broadcast some entries of dv so that dv[sv[i]] can be obtained locally.
417 // This logic is implemented in this function: replicate(dense, ri, bcastBuffer)
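// Example (illustrative): with dv = [5, 6, 7, 8] and sv = {0 -> 2, 3 -> 1}
// (index -> value), the result is sv_out = {0 -> dv[2] = 7, 3 -> dv[1] = 6}.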
418
419 template <class IT, class NT>
420 FullyDistSpVec<IT,NT> Extract(const FullyDistVec<IT,NT> dense, FullyDistSpVec<IT,IT> ri)
421 {
422
423#ifdef CC_TIMING
424 double ts = MPI_Wtime();
425 std::ostringstream outs;
426 outs.str("");
427 outs.clear();
428 outs<< " Extract timing: ";
429#endif
430 auto commGrid = ri.getcommgrid();
431 MPI_Comm World = commGrid->GetWorld();
432 int nprocs = commGrid->GetSize();
433
434 if(!(commGrid == dense.getcommgrid()))
435 {
436 std::cout << "Grids are not comparable for dense vector subsref" << std::endl;
437 return FullyDistSpVec<IT,NT>();
438 }
439
440
441
442 std::vector<std::vector<NT>> bcastBuffer(nprocs);
443#ifdef CC_TIMING
444 double t1 = MPI_Wtime();
445#endif
446 int nbcast = replicate(dense, ri, bcastBuffer);
447#ifdef CC_TIMING
448 double bcast = MPI_Wtime() - t1;
449 outs << "bcast ( " << nbcast << " ): " << bcast << " ";
450#endif
451
452 std::vector< std::vector< IT > > data_req(nprocs);
453 std::vector< std::vector< IT > > revr_map(nprocs); // to put the incoming data to the correct location
454 const NT * arr = dense.GetLocArr();
455
456 std::vector<IT> rinum = ri.GetLocalNum();
457 IT riloclen = rinum.size();
458 std::vector<NT> num(riloclen); // final output
459 for(IT i=0; i < riloclen; ++i)
460 {
461 IT locind;
462 int owner = dense.Owner(rinum[i], locind);
463 if(bcastBuffer[owner].size() == 0)
464 {
465 data_req[owner].push_back(locind);
466 revr_map[owner].push_back(i);
467 }
468 else
469 {
470 num[i] =bcastBuffer[owner][locind];
471 }
472 }
473
474 int * sendcnt = new int[nprocs];
475 int * sdispls = new int[nprocs];
476 for(int i=0; i<nprocs; ++i)
477 sendcnt[i] = (int) data_req[i].size();
478
479 int * rdispls = new int[nprocs];
480 int * recvcnt = new int[nprocs];
481
482#ifdef CC_TIMING
483 t1 = MPI_Wtime();
484#endif
485 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World); // share the request counts
486#ifdef CC_TIMING
487 double all2ll1 = MPI_Wtime() - t1;
488 outs << "all2ll1: " << all2ll1 << " ";
489
490#endif
491 sdispls[0] = 0;
492 rdispls[0] = 0;
493 for(int i=0; i<nprocs-1; ++i)
494 {
495 sdispls[i+1] = sdispls[i] + sendcnt[i];
496 rdispls[i+1] = rdispls[i] + recvcnt[i];
497 }
498 IT totsend = std::accumulate(sendcnt,sendcnt+nprocs, static_cast<IT>(0));
499 IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
500
501
502 IT * sendbuf = new IT[totsend];
503 for(int i=0; i<nprocs; ++i)
504 {
505 std::copy(data_req[i].begin(), data_req[i].end(), sendbuf+sdispls[i]);
506 std::vector<IT>().swap(data_req[i]);
507 }
508
509
510 IT * reversemap = new IT[totsend];
511 for(int i=0; i<nprocs; ++i)
512 {
513 std::copy(revr_map[i].begin(), revr_map[i].end(), reversemap+sdispls[i]); // reversemap array is unique
514 std::vector<IT>().swap(revr_map[i]);
515 }
516
517 IT * recvbuf = new IT[totrecv];
518#ifdef CC_TIMING
519 t1 = MPI_Wtime();
520#endif
521
522 Mpi_Alltoallv(sendbuf, sendcnt, sdispls, recvbuf, recvcnt, rdispls, World);
523
524#ifdef CC_TIMING
525 double all2ll2 = MPI_Wtime() - t1;
526 outs << "all2ll2: " << all2ll2 << " ";
527#endif
528 delete [] sendbuf;
529
530 // access requested data
531 NT * databack = new NT[totrecv];
532
533#ifdef THREADED
534#pragma omp parallel for
535#endif
536 for(int i=0; i<totrecv; ++i)
537 databack[i] = arr[recvbuf[i]];
538 delete [] recvbuf;
539
540 // communicate requested data
541 NT * databuf = new NT[totsend];
542 // the response counts are the same as the request counts
543#ifdef CC_TIMING
544 t1 = MPI_Wtime();
545#endif
546 //Mpi_Alltoallv_sparse(databack, recvcnt, rdispls,databuf, sendcnt, sdispls, World);
547
548 Mpi_Alltoallv(databack, recvcnt, rdispls,databuf, sendcnt, sdispls, World);
549
550
551#ifdef CC_TIMING
552 double all2ll3 = MPI_Wtime() - t1;
553 outs << "all2ll3: " << all2ll3 << " ";
554#endif
555
556 // Create the output from databuf
557 for(int i=0; i<totsend; ++i)
558 num[reversemap[i]] = databuf[i];
559
560 DeleteAll(rdispls, recvcnt, databack);
561 DeleteAll(sdispls, sendcnt, databuf, reversemap);
562 std::vector<IT> ind = ri.GetLocalInd ();
563 IT globallen = ri.TotalLength();
564 FullyDistSpVec<IT, NT> indexed(commGrid, globallen, ind, num, true, true);
565
566#ifdef CC_TIMING
567 double total = MPI_Wtime() - ts;
568 outs << "others: " << total - (bcast + all2ll1 + all2ll2 + all2ll3) << " ";
569 outs<< endl;
570 SpParHelper::Print(outs.str());
571#endif
572
573 return indexed;
574
575 }
576
577
578
579 template <class IT, class NT>
580 int ReduceAssign(FullyDistSpVec<IT,IT> & ind, FullyDistSpVec<IT,NT> & val, std::vector<std::vector<NT>> & reduceBuffer, NT MAX_FOR_REDUCE)
581 {
582 auto commGrid = ind.getcommgrid();
583 MPI_Comm World = commGrid->GetWorld();
584 int nprocs = commGrid->GetSize();
585 int myrank;
586 MPI_Comm_rank(World,&myrank);
587
588 std::vector<int> sendcnt(nprocs, 0);
589 std::vector<int> recvcnt(nprocs);
590 std::vector<std::vector<IT>> indBuf(nprocs);
591 std::vector<std::vector<NT>> valBuf(nprocs);
592 std::vector<IT> indices = ind.GetLocalNum();
593 std::vector<NT> values = val.GetLocalNum();
594
595 IT riloclen = indices.size();
596 for(IT i=0; i < riloclen; ++i)
597 {
598 IT locind;
599 int owner = ind.Owner(indices[i], locind);
600 indBuf[owner].push_back(locind);
601 valBuf[owner].push_back(values[i]);
602 sendcnt[owner]++;
603 }
604
605
606 MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
607 IT totrecv = std::accumulate(recvcnt.begin(),recvcnt.end(), static_cast<IT>(0));
608 double reduceCost = ind.MyLocLength() * log2(nprocs); // bandwidth cost
609 IT reducesize = 0;
610 std::vector<IT> reducecnt(nprocs, 0);
611
612 int nreduce = 0;
613 if(reduceCost < totrecv)
614 {
615 reducesize = ind.MyLocLength();
616 }
617 MPI_Allgather(&reducesize, 1, MPIType<IT>(), reducecnt.data(), 1, MPIType<IT>(), World);
618
619
620 for(int i=0; i<nprocs; ++i)
621 {
622 if(reducecnt[i]>0) nreduce++;
623 }
624
625 if(nreduce > 0)
626 {
627 MPI_Request* requests = new MPI_Request[nreduce];
628 MPI_Status* statuses = new MPI_Status[nreduce];
629
632
633 int ireduce = 0;
634 for(int i=0; i<nprocs; ++i)
635 {
636 if(reducecnt[i]>0)
637 {
638 reduceBuffer[i].resize(reducecnt[i], MAX_FOR_REDUCE); // this is specific to LACC
639 for(int j=0; j<sendcnt[i]; j++)
640 reduceBuffer[i][indBuf[i][j]] = std::min(reduceBuffer[i][indBuf[i][j]], valBuf[i][j]);
641 if(myrank==i)
642 MPI_Ireduce(MPI_IN_PLACE, reduceBuffer[i].data(), reducecnt[i], MPIType<NT>(), MPI_MIN, i, World, &requests[ireduce++]);
643 else
644 MPI_Ireduce(reduceBuffer[i].data(), NULL, reducecnt[i], MPIType<NT>(), MPI_MIN, i, World, &requests[ireduce++]);
645
646 }
647
648 MPI_Waitall(ireduce, requests, statuses);
649 //MPI_Barrier(World);
650 delete [] requests;
651 delete [] statuses;
652 }
653
654
655
656
657 return nreduce;
658 }
659
660 // for fixed value
661 template <class IT, class NT>
662 int ReduceAssign(FullyDistSpVec<IT,IT> & ind, std::vector<std::vector<NT>> & reduceBuffer, NT MAX_FOR_REDUCE, NT val)
663 {
664 auto commGrid = ind.getcommgrid();
665 MPI_Comm World = commGrid->GetWorld();
666 int nprocs = commGrid->GetSize();
667 int myrank;
668 MPI_Comm_rank(World,&myrank);
669
670 std::vector<int> sendcnt(nprocs, 0);
671 std::vector<int> recvcnt(nprocs);
672 std::vector<std::vector<IT>> indBuf(nprocs);
673 std::vector<IT> indices = ind.GetLocalNum();
674
675 IT riloclen = indices.size();
676 for(IT i=0; i < riloclen; ++i)
677 {
678 IT locind;
679 int owner = ind.Owner(indices[i], locind);
680 indBuf[owner].push_back(locind);
681 sendcnt[owner]++;
682 }
683
684
685 MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
686 IT totrecv = std::accumulate(recvcnt.begin(),recvcnt.end(), static_cast<IT>(0));
687 double reduceCost = ind.MyLocLength() * log2(nprocs); // bandwidth cost
688 IT reducesize = 0;
689 std::vector<IT> reducecnt(nprocs, 0);
690
691 int nreduce = 0;
692 if(reduceCost < totrecv)
693 {
694 reducesize = ind.MyLocLength();
695 }
696 MPI_Allgather(&reducesize, 1, MPIType<IT>(), reducecnt.data(), 1, MPIType<IT>(), World);
697
698
699 for(int i=0; i<nprocs; ++i)
700 {
701 if(reducecnt[i]>0) nreduce++;
702 }
703
704 if(nreduce > 0)
705 {
706 MPI_Request* requests = new MPI_Request[nreduce];
707 MPI_Status* statuses = new MPI_Status[nreduce];
708
711
712 int ireduce = 0;
713 for(int i=0; i<nprocs; ++i)
714 {
715 if(reducecnt[i]>0)
716 {
717 reduceBuffer[i].resize(reducecnt[i], MAX_FOR_REDUCE); // this is specific to LACC
718 for(int j=0; j<sendcnt[i]; j++)
719 reduceBuffer[i][indBuf[i][j]] = val;
720 if(myrank==i)
721 MPI_Ireduce(MPI_IN_PLACE, reduceBuffer[i].data(), reducecnt[i], MPIType<NT>(), MPI_MIN, i, World, &requests[ireduce++]);
722 else
723 MPI_Ireduce(reduceBuffer[i].data(), NULL, reducecnt[i], MPIType<NT>(), MPI_MIN, i, World, &requests[ireduce++]);
724
725 }
726
727 MPI_Waitall(ireduce, requests, statuses);
728 //MPI_Barrier(World);
729 delete [] requests;
730 delete [] statuses;
731 }
732
733
734
735
736 return nreduce;
737 }
738
739
740 // given two sparse vectors sv and val
741 // sv_out[sv[i]] = val[i] for all nonzero index i in sv, where sv_out is the output sparse vector
742 // If sv has repeated entries, a process may receive the same values of sv from different processes
743 // In this case, it may be beneficial to reduce some entries of sv so that sv_out[sv[i]] can be updated locally.
744 // This logic is implemented in this function: ReduceAssign
745
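// Example (illustrative): with sv = {0 -> 4, 2 -> 1} and val = {0 -> 9, 2 -> 8}
// (index -> value), the result is sv_out = {4 -> 9, 1 -> 8}.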
746 template <class IT, class NT>
747 FullyDistSpVec<IT,NT> Assign(FullyDistSpVec<IT,IT> & ind, FullyDistSpVec<IT,NT> & val)
748 {
749 IT ploclen = ind.getlocnnz();
750 if(ploclen != val.getlocnnz())
751 {
752 SpParHelper::Print("Assign error: Index and value vectors have different size !!!\n");
753 return FullyDistSpVec<IT,NT>(ind.getcommgrid());
754 }
755
756 IT globallen = ind.TotalLength();
757 IT maxInd = ind.Reduce(maximum<IT>(), (IT) 0 ) ;
758 if(maxInd >= globallen)
759 {
760 std::cout << "At least one requested index is larger than the global length" << std::endl;
761 return FullyDistSpVec<IT,NT>(ind.getcommgrid());
762 }
763
764#ifdef CC_TIMING
765 double ts = MPI_Wtime();
766 std::ostringstream outs;
767 outs.str("");
768 outs.clear();
769 outs<< " Assign timing: ";
770#endif
771 auto commGrid = ind.getcommgrid();
772 MPI_Comm World = commGrid->GetWorld();
773 int nprocs = commGrid->GetSize();
774 int * rdispls = new int[nprocs+1];
775 int * recvcnt = new int[nprocs];
776 int * sendcnt = new int[nprocs](); // initialize to 0
777 int * sdispls = new int[nprocs+1];
778
779 std::vector<std::vector<NT>> reduceBuffer(nprocs);
780
781
782#ifdef CC_TIMING
783 double t1 = MPI_Wtime();
784#endif
785 NT MAX_FOR_REDUCE = static_cast<NT>(globallen);
786 int nreduce = ReduceAssign(ind, val, reduceBuffer, MAX_FOR_REDUCE);
787
788#ifdef CC_TIMING
789 double reduce = MPI_Wtime() - t1;
790 outs << "reduce (" << nreduce << "): " << reduce << " ";
791#endif
792
793
794
795 std::vector<std::vector<IT>> indBuf(nprocs);
796 std::vector<std::vector<NT>> valBuf(nprocs);
797 std::vector<IT> indices = ind.GetLocalNum();
798 std::vector<NT> values = val.GetLocalNum();
799 IT riloclen = indices.size();
800 for(IT i=0; i < riloclen; ++i)
801 {
802 IT locind;
803 int owner = ind.Owner(indices[i], locind);
804 if(reduceBuffer[owner].size() == 0)
805 {
806 indBuf[owner].push_back(locind);
807 valBuf[owner].push_back(values[i]);
808 sendcnt[owner]++;
809 }
810 }
811
812
813#ifdef CC_TIMING
814 t1 = MPI_Wtime();
815#endif
816 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
817#ifdef CC_TIMING
818 double all2ll1 = MPI_Wtime() - t1;
819 outs << "all2ll1: " << all2ll1 << " ";
820#endif
821 sdispls[0] = 0;
822 rdispls[0] = 0;
823 for(int i=0; i<nprocs; ++i)
824 {
825 sdispls[i+1] = sdispls[i] + sendcnt[i];
826 rdispls[i+1] = rdispls[i] + recvcnt[i];
827 }
828 IT totsend = sdispls[nprocs];
829 IT totrecv = rdispls[nprocs];
830
831
832 std::vector<IT> sendInd(totsend);
833 std::vector<NT> sendVal(totsend);
834 for(int i=0; i<nprocs; ++i)
835 {
836 std::copy(indBuf[i].begin(), indBuf[i].end(), sendInd.begin()+sdispls[i]);
837 std::vector<IT>().swap(indBuf[i]);
838 std::copy(valBuf[i].begin(), valBuf[i].end(), sendVal.begin()+sdispls[i]);
839 std::vector<NT>().swap(valBuf[i]);
840 }
841
842 std::vector<IT> recvInd(totrecv);
843 std::vector<NT> recvVal(totrecv);
844#ifdef CC_TIMING
845 t1 = MPI_Wtime();
846#endif
847
848
849 Mpi_Alltoallv(sendInd.data(), sendcnt, sdispls, recvInd.data(), recvcnt, rdispls, World);
850 //MPI_Alltoallv(sendInd.data(), sendcnt, sdispls, MPIType<IT>(), recvInd.data(), recvcnt, rdispls, MPIType<IT>(), World);
851#ifdef CC_TIMING
852 double all2ll2 = MPI_Wtime() - t1;
853 outs << "all2ll2: " << all2ll2 << " ";
854#endif
855#ifdef CC_TIMING
856 t1 = MPI_Wtime();
857#endif
858
859 Mpi_Alltoallv(sendVal.data(), sendcnt, sdispls, recvVal.data(), recvcnt, rdispls, World);
860
861#ifdef CC_TIMING
862 double all2ll3 = MPI_Wtime() - t1;
863 outs << "all2ll3: " << all2ll3 << " ";
864#endif
865 DeleteAll(sdispls, rdispls, sendcnt, recvcnt);
866
867
868 int myrank;
869 MPI_Comm_rank(World,&myrank);
870 if(reduceBuffer[myrank].size()>0)
871 {
872 //cout << myrank << " : " << recvInd.size() << endl;
873 for(int i=0; i<reduceBuffer[myrank].size(); i++)
874 {
875
876 if(reduceBuffer[myrank][i] < MAX_FOR_REDUCE)
877 {
878 recvInd.push_back(i);
879 recvVal.push_back(reduceBuffer[myrank][i]);
880 }
881 }
882 }
883
884 FullyDistSpVec<IT, NT> indexed(commGrid, globallen, recvInd, recvVal, false, false);
885
886
887
888#ifdef CC_TIMING
889 double total = MPI_Wtime() - ts;
890 outs << "others: " << total - (reduce + all2ll1 + all2ll2 + all2ll3) << " ";
891 outs<< endl;
892 SpParHelper::Print(outs.str());
893#endif
894 return indexed;
895
896 }
897
898
899 // given a sparse vector sv
900 // sv_out[sv[i]] = val for all nonzero index i in sv, where sv_out is the output sparse vector
901 // If sv has repeated entries, a process may receive the same values of sv from different processes
902 // In this case, it may be beneficial to reduce some entries of sv so that sv_out[sv[i]] can be updated locally.
903 // This logic is implemented in this function: ReduceAssign
904 template <class IT, class NT>
905 FullyDistSpVec<IT,NT> Assign(FullyDistSpVec<IT,IT> & ind, NT val)
906 {
907 IT globallen = ind.TotalLength();
908 IT maxInd = ind.Reduce(maximum<IT>(), (IT) 0 ) ;
909 if(maxInd >= globallen)
910 {
911 std::cout << "At least one requested index is larger than the global length" << std::endl;
912 return FullyDistSpVec<IT,NT>(ind.getcommgrid());
913 }
914
915#ifdef CC_TIMING
916 double ts = MPI_Wtime();
917 std::ostringstream outs;
918 outs.str("");
919 outs.clear();
920 outs<< " Assign timing: ";
921#endif
922 auto commGrid = ind.getcommgrid();
923 MPI_Comm World = commGrid->GetWorld();
924 int nprocs = commGrid->GetSize();
925 int * rdispls = new int[nprocs+1];
926 int * recvcnt = new int[nprocs];
927 int * sendcnt = new int[nprocs](); // initialize to 0
928 int * sdispls = new int[nprocs+1];
929
930 std::vector<std::vector<NT>> reduceBuffer(nprocs);
931
932
933#ifdef CC_TIMING
934 double t1 = MPI_Wtime();
935#endif
936 NT MAX_FOR_REDUCE = static_cast<NT>(globallen);
937 int nreduce = ReduceAssign(ind, reduceBuffer, MAX_FOR_REDUCE, val);
938
939#ifdef CC_TIMING
940 double reduce = MPI_Wtime() - t1;
941 outs << "reduce ( " << nreduce << " ): " << reduce << " ";
942#endif
943
944
945
946 std::vector<std::vector<IT>> indBuf(nprocs);
947 std::vector<IT> indices = ind.GetLocalNum();
948 IT riloclen = indices.size();
949 for(IT i=0; i < riloclen; ++i)
950 {
951 IT locind;
952 int owner = ind.Owner(indices[i], locind);
953 if(reduceBuffer[owner].size() == 0)
954 {
955 indBuf[owner].push_back(locind);
956 sendcnt[owner]++;
957 }
958 }
959
960
961#ifdef CC_TIMING
962 t1 = MPI_Wtime();
963#endif
964 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
965#ifdef CC_TIMING
966 double all2ll1 = MPI_Wtime() - t1;
967 outs << "all2ll1: " << all2ll1 << " ";
968#endif
969 sdispls[0] = 0;
970 rdispls[0] = 0;
971 for(int i=0; i<nprocs; ++i)
972 {
973 sdispls[i+1] = sdispls[i] + sendcnt[i];
974 rdispls[i+1] = rdispls[i] + recvcnt[i];
975 }
976 IT totsend = sdispls[nprocs];
977 IT totrecv = rdispls[nprocs];
978
979
980 std::vector<IT> sendInd(totsend);
981 for(int i=0; i<nprocs; ++i)
982 {
983 std::copy(indBuf[i].begin(), indBuf[i].end(), sendInd.begin()+sdispls[i]);
984 std::vector<IT>().swap(indBuf[i]);
985 }
986
987 std::vector<IT> recvInd(totrecv);
988#ifdef CC_TIMING
989 t1 = MPI_Wtime();
990#endif
991
992
993 Mpi_Alltoallv(sendInd.data(), sendcnt, sdispls, recvInd.data(), recvcnt, rdispls, World);
994 //MPI_Alltoallv(sendInd.data(), sendcnt, sdispls, MPIType<IT>(), recvInd.data(), recvcnt, rdispls, MPIType<IT>(), World);
995#ifdef CC_TIMING
996 double all2ll2 = MPI_Wtime() - t1;
997 outs << "all2ll2: " << all2ll2 << " ";
998 outs << "all2ll3: " << 0 << " ";
999#endif
1000 DeleteAll(sdispls, rdispls, sendcnt, recvcnt);
1001
1002
1003 int myrank;
1004 MPI_Comm_rank(World,&myrank);
1005 std::vector<NT> recvVal(totrecv, val);
1006 if(reduceBuffer[myrank].size()>0)
1007 {
1008 //cout << myrank << " : " << recvInd.size() << endl;
1009 for(int i=0; i<reduceBuffer[myrank].size(); i++)
1010 {
1011 if(reduceBuffer[myrank][i] < MAX_FOR_REDUCE)
1012 {
1013 recvInd.push_back(i);
1014 recvVal.push_back(val);
1015 }
1016 }
1017 }
1018 FullyDistSpVec<IT, NT> indexed(commGrid, globallen, recvInd, recvVal, false, false);
1019
1020#ifdef CC_TIMING
1021 double total = MPI_Wtime() - ts;
1022 outs << "others: " << total - (reduce + all2ll1 + all2ll2) << " ";
1023 outs<< endl;
1024 SpParHelper::Print(outs.str());
1025#endif
1026 return indexed;
1027
1028 }
1029
1030
1031
1032
1033 // special starcheck after conditional and unconditional hooking
1034 template <typename IT, typename NT, typename DER>
1035 void StarCheckAfterHooking(const SpParMat<IT,NT,DER> & A, FullyDistVec<IT,IT> & parent, FullyDistVec<IT,short> & star, FullyDistSpVec<IT,IT> condhooks, bool isStar2StarHookPossible)
1036 {
1037 // hooks are nonstars
1038 star.EWiseApply(condhooks, [](short isStar, IT x){return static_cast<short>(NONSTAR);},
1039 false, static_cast<IT>(NONSTAR));
1040
1041 if(isStar2StarHookPossible)
1042 {
1043 // this is not needed in the first iteration; see the complicated proof in the paper
1044 // parents of hooks are nonstars
1045 // needed only after conditional hooking because in that case star can hook to a star
1046 FullyDistSpVec<IT,short> pNonStar = Assign(condhooks, static_cast<short>(NONSTAR));
1047 star.Set(pNonStar);
1048 }
1049
1050 //star(parent)
1051 // If I am a star, I would like to know the star information of my parent
1052 // children of hooks and parents of hooks are nonstars
1053 // NOTE: they are not needed in the first iteration
1054
1055 FullyDistSpVec<IT,short> spStars(star, [](short isStar){return isStar==STAR;});
1056 FullyDistSpVec<IT,IT> pOfStars = EWiseApply<IT>(spStars, parent,
1057 [](short isStar, IT p){return p;},
1058 [](short isStar, IT p){return true;},
1059 false, static_cast<short>(0));
1060 FullyDistSpVec<IT,short> isParentStar = Extract(star, pOfStars);
1061 star.Set(isParentStar);
1062 }
1063
1064 /*
1065 // In iteration 1: "stars" has both vertices belonging to stars and nonstars (no converged)
1066 // we only process nonstars and identify stars among them
1067 // After iteration 1: "stars" has vertices belonging to converged and nonstars (no stars)
1068 // we only process nonstars and identify stars among them
1069 template <typename IT>
1070 void StarCheck(FullyDistVec<IT, IT> & parents, FullyDistVec<IT,short>& stars)
1071 {
1072
1073 // this is done here so that in the first iteration, we don't process STAR vertices
1074 FullyDistSpVec<IT,short> nonStars(stars, [](short isStar){return isStar==NONSTAR;});
1075 // initialize all nonstars to stars
1076 stars.Apply([](short isStar){return isStar==NONSTAR? STAR: isStar;});
1077
1078 // identify vertices at level >= 2 (grandchildren of roots)
1079 FullyDistSpVec<IT, IT> pOfNonStars = EWiseApply<IT>(nonStars, parents,
1080 [](short isStar, IT p){return p;},
1081 [](short isStar, IT p){return true;},
1082 false, static_cast<short>(0));
1083 FullyDistSpVec<IT,IT> gpOfNonStars = Extract(parents, pOfNonStars);
1084
1085 FullyDistSpVec<IT,short> keptNonStars = EWiseApply<short>(pOfNonStars, gpOfNonStars,
1086 [](IT p, IT gp){return static_cast<short>(NONSTAR);},
1087 [](IT p, IT gp){return p!=gp;},
1088 false, false, static_cast<IT>(0), static_cast<IT>(0));
1089 stars.Set(keptNonStars); // setting level > 2 vertices as nonstars
1090
1091 // identify grand parents of kept nonstars
1092 FullyDistSpVec<IT,IT> gpOfKeptNonStars = EWiseApply<IT>(pOfNonStars, gpOfNonStars,
1093 [](IT p, IT gp){return gp;},
1094 [](IT p, IT gp){return p!=gp;},
1095 false, false, static_cast<IT>(0), static_cast<IT>(0));
1096
1097 //FullyDistSpVec<IT, short> fixedNS = gpOfKeptNonStars;
1098 //fixedNS = NONSTAR;
1099 FullyDistSpVec<IT, short> gpNonStar= Assign(gpOfKeptNonStars, NONSTAR);
1100 stars.Set(gpNonStar);
1101
1102
1103 // remaining vertices: level-1 leaves of nonstars and any vertices in previous stars (iteration 1 only)
1104 FullyDistSpVec<IT,short> spStars(stars, [](short isStar){return isStar==STAR;});
1105 // further optimization can be done to remove previous stars
1106
1107 FullyDistSpVec<IT, IT> pOfStars = EWiseApply<IT>(spStars, parents,
1108 [](short isStar, IT p){return p;},
1109 [](short isStar, IT p){return true;},
1110 false, static_cast<short>(0));
1111
1112 FullyDistSpVec<IT,short> isParentStar = Extract(stars, pOfStars);
1113 stars.Set(isParentStar);
1114 }
1115 */
1116
1117 // In iteration>1:
1118 // We have only CONVERGED or NONSTAR vertices
1119 // some of the NONSTAR vertices may become STAR in the last shortcut operation
1120 // We would like to identify those new stars
1121 // In iteration 1:
1122 // we have STAR and NONSTAR vertices
1123 // every hooked vertex is marked as NONSTAR
1124 // roots are marked as STAR (including singletons)
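// Example (illustrative): parent = [1, 1, 1, 4, 4] is two stars (roots 1 and 4);
// parent = [1, 2, 2, 4, 4] is not: vertex 0 sits at level 2, so 0, its parent 1,
// and the root 2 are all marked NONSTAR, while the tree {3, 4} remains a STAR.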
1125 template <typename IT>
1126 void StarCheck(FullyDistVec<IT,IT> & parents, FullyDistVec<IT,short> & stars)
1127 {
1128
1129 // this is done here so that in the first iteration, we don't process STAR vertices
1130 // all current nonstars
1131 FullyDistSpVec<IT,short> nonStars(stars, [](short isStar){return isStar==NONSTAR;});
1132
1133 // initialize all nonstars to stars
1134 stars.Apply([](short isStar){return isStar==NONSTAR? STAR: isStar;});
1135
1136 // parents of all current nonstars
1137 FullyDistSpVec<IT,IT> pOfNonStars = EWiseApply<IT>(nonStars, parents,
1138 [](short isStar, IT p){return p;},
1139 [](short isStar, IT p){return true;},
1140 false, static_cast<short>(0));
1141
1142 // parents of all current nonstars indexed by parent
1143 // any vertex with a child should be here
1144 // leaves are not present as indices, but roots are present
1145 FullyDistSpVec<IT,short> pOfNonStars_indexed = Assign(pOfNonStars, static_cast<short>(NONSTAR));
1146 // copy parent information (the values are grandparents)
1147 FullyDistSpVec<IT,IT> temp = EWiseApply<IT>(pOfNonStars_indexed, parents,
1148 [](short isStar, IT p){return p;},
1149 [](short isStar, IT p){return true;},
1150 false, static_cast<short>(0));
1151 // identify if they are parents/grandparents of a vertex with level > 2
1153 temp.setNumToInd();
1154 FullyDistSpVec<IT,IT> gpOfNonStars_pindexed = EWiseApply<IT>(temp, parents,
1155 [](IT p, IT gp){return gp;},
1156 [](IT p, IT gp){return p!=gp;},
1157 false, false, static_cast<IT>(0), static_cast<IT>(0));
1158
1159 // index has parents of vertices with level > 2
1160 // value has grand parents of vertices with level > 2
1161 // update parents
1162 // All vertices (except the root and leaves) in a non-star tree will be updated
1163 stars.EWiseApply(gpOfNonStars_pindexed, [](short isStar, IT idx){return static_cast<short>(NONSTAR);},
1164 false, static_cast<IT>(NONSTAR));
1165
1166 // now everything is updated except the root and leaves of nonstars
1167 // identify roots (indexed by level-1 vertices)
1168 FullyDistSpVec<IT,IT> rootsOfNonStars = EWiseApply<IT>(pOfNonStars, stars,
1169 [](IT p, short isStar){return p;},
1170 [](IT p, short isStar){return isStar==NONSTAR;},
1171 false, static_cast<IT>(0));
1172
1173
1174 FullyDistSpVec<IT,short> rootNonStar = Assign(rootsOfNonStars, static_cast<short>(NONSTAR));
1175 stars.Set(rootNonStar);
1176
1177
1178 // remaining vertices
1179 // they must be stars (created after the shortcut) or level-1 leaves of a non-star
1180 FullyDistSpVec<IT,IT> spStars = EWiseApply<IT>(nonStars, stars,
1181 [](short s, short isStar){return static_cast<IT> (s);},
1182 [](short s, short isStar){return isStar==STAR;},
1183 false, static_cast<short>(0));
1184 FullyDistSpVec<IT,IT> pOfStars = EWiseApply<IT>(spStars, parents,
1185 [](IT s, IT p){return p;},
1186 [](IT s, IT p){return true;},
1187 false, static_cast<IT>(0));
1188
1189 FullyDistSpVec<IT,short> isParentStar = Extract(stars, pOfStars);
1190 stars.Set(isParentStar);
1191 }
1192
1193
1194 template <typename IT, typename NT, typename DER>
1195 FullyDistSpVec<IT,IT> ConditionalHook(const SpParMat<IT,NT,DER> & A, FullyDistVec<IT,IT> & parent, FullyDistVec<IT,short> stars, int iteration)
1196 {
1197
1198#ifdef CC_TIMING
1199 double t1 = MPI_Wtime();
1200#endif
1201 FullyDistVec<IT, IT> minNeighborparent ( A.getcommgrid());
1202 minNeighborparent = SpMV<Select2ndMinSR<NT, IT>>(A, parent); // value is the minimum of all neighbors' parents
1203
1204#ifdef CC_TIMING
1205 double tspmv = MPI_Wtime() - t1;
1206#endif
1207
1208 FullyDistSpVec<IT,IT> hooksMNP(stars, [](short isStar){return isStar==STAR;});
1209 hooksMNP = EWiseApply<IT>(hooksMNP, minNeighborparent, [](IT x, IT mnp){return mnp;},
1210 [](IT x, IT mnp){return true;}, false, static_cast<IT> (0));
1211 hooksMNP = EWiseApply<IT>(hooksMNP, parent, [](IT mnp, IT p){return mnp;},
1212 [](IT mnp, IT p){return p > mnp;}, false, static_cast<IT> (0));
1213
1214 FullyDistSpVec<IT, IT> finalhooks (A.getcommgrid());
1215 if(iteration == 1)
1216 {
1217 finalhooks = hooksMNP;
1218 }
1219 else
1220 {
1221 FullyDistSpVec<IT,IT> hooksP = EWiseApply<IT>(hooksMNP, parent, [](IT mnp, IT p){return p;},
1222 [](IT mnp, IT p){return true;}, false, static_cast<IT> (0));
1223
1224 finalhooks = Assign(hooksP, hooksMNP);
1225
1226 }
1227 parent.Set(finalhooks);
1228
1229#ifdef CC_TIMING
1230 double tall = MPI_Wtime() - t1;
1231 std::ostringstream outs;
1232 outs.str("");
1233 outs.clear();
1234 outs << " Conditional Hooking Time: SpMV: " << tspmv << " Other: "<< tall-tspmv;
1235 outs<< endl;
1236 SpParHelper::Print(outs.str());
1237#endif
1238 return finalhooks;
1239 }
1240
1241
1242 template <typename IT, typename NT, typename DER>
1243 FullyDistSpVec<IT,IT> UnconditionalHook2(const SpParMat<IT,NT,DER> & A, FullyDistVec<IT,IT> & parents, FullyDistVec<IT,short> stars)
1244 {
1245
1246#ifdef CC_TIMING
1247 double ts = MPI_Wtime();
1248 double t1, tspmv;
1249#endif
1250 string spmv = "dense";
1251 IT nNonStars = stars.Reduce(std::plus<IT>(), static_cast<IT>(0), [](short isStar){return static_cast<IT>(isStar==NONSTAR);});
1252 IT nv = A.getnrow();
1253
1254 FullyDistSpVec<IT, IT> hooks(A.getcommgrid(), nv);
1255
1256 if(nNonStars * 50 < nv) // use SpMSpV
1257 {
1258 spmv = "sparse";
1259 FullyDistSpVec<IT,short> nonStars(stars, [](short isStar){return isStar==NONSTAR;});
1260 FullyDistSpVec<IT,IT> pOfNonStars = EWiseApply<IT>(nonStars, parents,
1261 [](short isStar, IT p){return p;},
1262 [](short isStar, IT p){return true;},
1263 false, static_cast<IT>(0));
1264 //hooks = SpMV<Select2ndMinSR<NT, IT>>(A, pOfNonStars);
1265#ifdef CC_TIMING
1266 t1 = MPI_Wtime();
1267#endif
1268 hooks = SpMV<Select2ndMinSR<NT, IT>>(A, pOfNonStars, false);
1269#ifdef CC_TIMING
1270 tspmv = MPI_Wtime() - t1;
1271#endif
1272 hooks = EWiseApply<IT>(hooks, stars, [](IT mnp, short isStar){return mnp;},
1273 [](IT mnp, short isStar){return isStar==STAR;},
1274 false, static_cast<IT> (0));
1275 }
1276 else // use SpMV
1277 {
1278 FullyDistVec<IT,IT> parents1 = parents;
1279 parents1.EWiseApply(stars, [nv](IT p, short isStar){return isStar == STAR? nv: p;});
1280
1281
1282 FullyDistVec<IT, IT> minNeighborParent ( A.getcommgrid());
1283#ifdef CC_TIMING
1284 t1 = MPI_Wtime();
1285#endif
1286 minNeighborParent = SpMV<Select2ndMinSR<NT, IT>>(A, parents1); // value is the minimum of all neighbors' parents
1287#ifdef CC_TIMING
1288 tspmv = MPI_Wtime() - t1;
1289#endif
1290 hooks = minNeighborParent.Find([nv](IT mnf){return mnf != nv;});
1291 hooks = EWiseApply<IT>(hooks, stars, [](IT mnp, short isStar){return mnp;},
1292 [](IT mnp, short isStar){return isStar==STAR;},
1293 false, static_cast<IT> (0));
1294 }
1295
1296
1297
1298
1299 FullyDistSpVec<IT,IT> finalHooks = EWiseApply<IT>(hooks, parents, [](IT mnp, IT p){return mnp;},
1300 [](IT mnp, IT p){return true;}, false, static_cast<IT> (0));
1301
1303 parents.Set(finalHooks);
1304
1305#ifdef CC_TIMING
1306 double tall = MPI_Wtime() - ts;
1307 std::ostringstream outs;
1308 outs.str("");
1309 outs.clear();
1310 outs << " Unconditional Hooking Time " << spmv << " : " << tspmv << " Other: "<< tall-tspmv;
1311 outs<< endl;
1312 SpParHelper::Print(outs.str());
1313#endif
1314
1315 return finalHooks;
1316
1317 }
1318
1319
1320
1321 template <typename IT>
1322 void Shortcut(FullyDistVec<IT,IT> & parent)
1323 {
1324 FullyDistVec<IT, IT> grandparent = parent(parent);
1325 parent = grandparent; // we can do it unconditionally because it is trivially true for stars
1326 }
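// A minimal serial sketch of the same pointer-jumping step on a plain array:
/*
#include <vector>
void ShortcutSerial(std::vector<int64_t>& parent)
{
    std::vector<int64_t> gp(parent.size());
    for(size_t i = 0; i < parent.size(); ++i) gp[i] = parent[parent[i]]; // grandparent
    parent.swap(gp); // e.g. [2,2,3,3] becomes [3,3,3,3]
}
*/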
1327
1328 // before shortcut, we will mark all remaining stars as inactive
1329 // shortcut only on nonstar vertices
1330 // then find stars on nonstar vertices
1331 template <typename IT>
1332 void Shortcut(FullyDistVec<IT,IT> & parent, FullyDistVec<IT,short> & stars)
1333 {
1334 FullyDistSpVec<IT,short> nonStars(stars, [](short isStar){return isStar==NONSTAR;});
1335 FullyDistSpVec<IT,IT> pOfNonStars = EWiseApply<IT>(nonStars, parent,
1336 [](short isStar, IT p){return p;},
1337 [](short isStar, IT p){return true;},
1338 false, static_cast<short>(0));
1339 FullyDistSpVec<IT,IT> gpOfNonStars = Extract(parent, pOfNonStars);
1340 parent.Set(gpOfNonStars);
1341 }
1342
1343
1344
1345
1346 template <typename IT, typename NT, typename DER>
1347 bool neigborsInSameCC(const SpParMat<IT,NT,DER> & A, FullyDistVec<IT,IT> & cclabel)
1348 {
1349 FullyDistVec<IT,IT> minNeighborCCLabel ( A.getcommgrid());
1350 minNeighborCCLabel = SpMV<Select2ndMinSR<NT, IT>>(A, cclabel);
1351 return (minNeighborCCLabel == cclabel);
1352 }
1353
1354
1355 // works only on P=1
1356 template <typename IT, typename NT, typename DER>
1357 void Correctness(const SpParMat<IT,NT,DER> & A, FullyDistVec<IT,IT> & cclabel, IT nCC, FullyDistVec<IT,IT> parent)
1358 {
1359 DER* spSeq = A.seqptr(); // local submatrix
1360
1361 for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit) // iterate over columns
1362 {
1363 IT j = colit.colid(); // local numbering
1364 for(auto nzit = spSeq->begnz(colit); nzit < spSeq->endnz(colit); ++nzit)
1365 {
1366 IT i = nzit.rowid();
1367 if( cclabel[i] != cclabel[j])
1368 {
1369 std::cout << i << " (" << parent[i] << ", "<< cclabel[i] << ") & "<< j << "("<< parent[j] << ", " << cclabel[j] << ")\n";
1370 }
1371 }
1372
1373 }
1374 }
1375
1376 // Input:
1377 // parent: parent of each vertex. parent is essentially the root of the star to which a vertex belongs.
1378 // parent of the root is itself
1379 // Output:
1380 // cclabel: connected components are incrementally labeled
1381 // returns the number of connected components
1382 // Example: input = [0, 0, 2, 3, 0, 2], output = (0, 0, 1, 2, 0, 1), return 3
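// A minimal serial sketch of the same relabeling on plain arrays (illustrative only):
/*
#include <vector>
int64_t LabelCCSerial(const std::vector<int64_t>& parent, std::vector<int64_t>& label)
{
    label.assign(parent.size(), -1);
    int64_t ncc = 0;
    for(size_t i = 0; i < parent.size(); ++i)   // number the roots 0,1,2,...
        if(parent[i] == (int64_t)i) label[i] = ncc++;
    for(size_t i = 0; i < parent.size(); ++i)   // leaves copy their root's label
        if(label[i] == -1) label[i] = label[parent[i]];
    return ncc; // input [0,0,2,3,0,2] gives (0,0,1,2,0,1) and returns 3
}
*/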
1383 template <typename IT>
1384 IT LabelCC(FullyDistVec<IT,IT> & parent, FullyDistVec<IT,IT> & cclabel)
1385 {
1386 cclabel = parent;
1387 cclabel.ApplyInd([](IT val, IT ind){return val==ind ? -1 : val;});
1388
1389 FullyDistSpVec<IT, IT> roots (cclabel, bind2nd(std::equal_to<IT>(), -1));
1390 // parents of leaves are still correct
1391 FullyDistSpVec<IT, IT> pOfLeaves (cclabel, bind2nd(std::not_equal_to<IT>(), -1));
1392
1393 roots.nziota(0);
1394 cclabel.Set(roots);
1395
1396
1397 FullyDistSpVec<IT,IT> rootLabels = Extract(cclabel, pOfLeaves);
1398 cclabel.Set(rootLabels);
1399 //cclabel = cclabel(parent);
1400 return roots.getnnz();
1401 }
1402
1403
1404 template <typename IT, typename NT, typename DER>
1405 FullyDistVec<IT,IT> CC(SpParMat<IT,NT,DER> & A, IT & nCC)
1406 {
1407 IT nrows = A.getnrow();
1408 //A.AddLoops(1); // needed for isolated vertices: not needed anymore
1409 FullyDistVec<IT,IT> parent(A.getcommgrid());
1410 parent.iota(nrows, 0); // parent(i)=i initially
1411 FullyDistVec<IT,short> stars(A.getcommgrid(), nrows, STAR);// initially every vertex belongs to a star
1412 int iteration = 1;
1413 std::ostringstream outs;
1414
1415 // isolated vertices are marked as converged
1416 FullyDistVec<int64_t,double> degree = A.Reduce(Column, plus<double>(), 0.0, [](double val){return 1.0;});
1417 stars.EWiseApply(degree, [](short isStar, double degree){return degree == 0.0? CONVERGED: isStar;});
1418
1419 int nthreads = 1;
1420#ifdef THREADED
1421#pragma omp parallel
1422 {
1423 nthreads = omp_get_num_threads();
1424 }
1425#endif
1426 SpParMat<IT, bool, SpDCCols<IT,bool>> Abool = A;
1427 Abool.ActivateThreading(nthreads*4);
1428
1429
1430 while (true)
1431 {
1432#ifdef CC_TIMING
1433 double t1 = MPI_Wtime();
1434#endif
1435 FullyDistSpVec<IT,IT> condhooks = ConditionalHook(Abool, parent, stars, iteration);
1436#ifdef CC_TIMING
1437 double t_cond_hook = MPI_Wtime() - t1;
1438 t1 = MPI_Wtime();
1439#endif
1440 // In any iteration other than the first,
1441 // a non-star is formed after a conditional hooking
1442 // In the first iteration, we can hook two vertices to create a star
1443 // After the first iteration, only singleton CCs remain isolated
1444 // Here, we are ignoring the first iteration (still correct, but may miss a few possible
1445 // unconditional hookings in the first iteration)
1446 // remove cond hooks from stars
1447
1448 if(iteration > 1)
1449 {
1450 StarCheckAfterHooking(Abool, parent, stars, condhooks, true);
1451 }
1452 else
1453 {
1454 // explain
1455 stars.EWiseApply(condhooks, [](short isStar, IT x){return static_cast<short>(NONSTAR);},
1456 false, static_cast<IT>(NONSTAR));
1457 FullyDistSpVec<IT,short> pNonStar = Assign(condhooks, static_cast<short>(NONSTAR));
1458 stars.Set(pNonStar);
1459 // it does not create any cycle in the unconditional hooking, see the proof in the paper
1460 }
1461
1462#ifdef CC_TIMING
1463 double t_starcheck1 = MPI_Wtime() - t1;
1464 t1 = MPI_Wtime();
1465#endif
1466
1467 FullyDistSpVec<IT,IT> uncondHooks = UnconditionalHook2(Abool, parent, stars);
1468#ifdef CC_TIMING
1469 double t_uncond_hook = MPI_Wtime() - t1;
1470 t1 = MPI_Wtime();
1471#endif
1472
1473 if(iteration > 1)
1474 {
1475 StarCheckAfterHooking(Abool, parent, stars, uncondHooks, false);
1476 stars.Apply([](short isStar){return isStar==STAR? CONVERGED: isStar;});
1477 }
1478 else
1479 {
1480 // explain
1481 stars.EWiseApply(uncondHooks, [](short isStar, IT x){return static_cast<short>(NONSTAR);},
1482 false, static_cast<IT>(NONSTAR));
1483 }
1484
1485 IT nconverged = stars.Reduce(std::plus<IT>(), static_cast<IT>(0), [](short isStar){return static_cast<IT>(isStar==CONVERGED);});
1486
1487 if(nconverged==nrows)
1488 {
1489 outs.clear();
1490 outs << "Iteration: " << iteration << " converged: " << nrows << " stars: 0" << " nonstars: 0" ;
1491 outs<< endl;
1492 SpParHelper::Print(outs.str());
1493 break;
1494 }
1495
1496#ifdef CC_TIMING
1497 double t_starcheck2 = MPI_Wtime() - t1;
1498 t1 = MPI_Wtime();
1499#endif
1500 Shortcut(parent, stars);
1501#ifdef CC_TIMING
1502 double t_shortcut = MPI_Wtime() - t1;
1503 t1 = MPI_Wtime();
1504#endif
1505
1506
1507 StarCheck(parent, stars);
1508#ifdef CC_TIMING
1509 double t_starcheck = MPI_Wtime() - t1;
1510 t1 = MPI_Wtime();
1511#endif
1512 IT nonstars = stars.Reduce(std::plus<IT>(), static_cast<IT>(0), [](short isStar){return static_cast<IT>(isStar==NONSTAR);});
1513 IT nstars = stars.Reduce(std::plus<IT>(), static_cast<IT>(0), [](short isStar){return static_cast<IT>(isStar==STAR);});
1514
1515
1516
1517
1518
1519
1520
1521 double t2 = MPI_Wtime();
1522 outs.str("");
1523 outs.clear();
1524 outs << "Iteration: " << iteration << " converged: " << nconverged << " stars: " << nstars << " nonstars: " << nonstars;
1525#ifdef CC_TIMING
1526 //outs << " Time: t_cond_hook: " << t_cond_hook << " t_starcheck1: " << t_starcheck1 << " t_uncond_hook: " << t_uncond_hook << " t_starcheck2: " << t_starcheck2 << " t_shortcut: " << t_shortcut << " t_starcheck: " << t_starcheck;
1527#endif
1528 outs<< endl;
1529 SpParHelper::Print(outs.str());
1530
1531 iteration++;
1532
1533
1534 }
1535
1536 FullyDistVec<IT, IT> cc(parent.getcommgrid());
1537 nCC = LabelCC(parent, cc);
1538
1539 // TODO: Print to file
1540 //PrintCC(cc, nCC);
1541 //Correctness(A, cc, nCC, parent);
1542
1543 return cc;
1544 }
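// Usage (a hedged driver sketch; the file name and scalar types are illustrative only):
/*
#include <memory>
int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);
    {
        std::shared_ptr<combblas::CommGrid> fullWorld(new combblas::CommGrid(MPI_COMM_WORLD, 0, 0));
        combblas::SpParMat<int64_t, double, combblas::SpDCCols<int64_t, double>> A(fullWorld);
        A.ParallelReadMM("graph.mtx", true, combblas::maximum<double>()); // symmetric input assumed
        int64_t nCC = 0;
        combblas::FullyDistVec<int64_t, int64_t> labels = combblas::CC(A, nCC);
    }
    MPI_Finalize();
    return 0;
}
*/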
1545
1546
1547 template <typename IT>
1548 void PrintCC(FullyDistVec<IT,IT> CC, IT nCC)
1549 {
1550 for(IT i=0; i< nCC; i++)
1551 {
1552 FullyDistVec<IT, IT> ith = CC.FindInds(bind2nd(std::equal_to<IT>(), i));
1553 ith.DebugPrint();
1554 }
1555 }
1556
1557 // Print the size of the first 4 clusters
1558 template <typename IT>
1559 void First4Clust(FullyDistVec<IT,IT> & cc)
1560 {
1561 FullyDistSpVec<IT, IT> cc1 = cc.Find([](IT label){return label==0;});
1562 FullyDistSpVec<IT, IT> cc2 = cc.Find([](IT label){return label==1;});
1563 FullyDistSpVec<IT, IT> cc3 = cc.Find([](IT label){return label==2;});
1564 FullyDistSpVec<IT, IT> cc4 = cc.Find([](IT label){return label==3;});
1565
1566 std::ostringstream outs;
1567 outs.str("");
1568 outs.clear();
1569 outs << "Size of the first component: " << cc1.getnnz() << std::endl;
1570 outs << "Size of the second component: " << cc2.getnnz() << std::endl;
1571 outs << "Size of the third component: " << cc3.getnnz() << std::endl;
1572 outs << "Size of the fourth component: " << cc4.getnnz() << std::endl;
1573 }
1574
1575
1576 template <typename IT>
1577 void HistCC(FullyDistVec<IT,IT> CC, IT nCC)
1578 {
1579 FullyDistVec<IT, IT> ccSizes(CC.getcommgrid(), nCC, 0);
1580 for(IT i=0; i< nCC; i++)
1581 {
1582 FullyDistSpVec<IT, IT> ith = CC.Find(bind2nd(std::equal_to<IT>(), i));
1583 ccSizes.SetElement(i, ith.getnnz());
1584 }
1585
1586 IT largestCCSize = ccSizes.Reduce(maximum<IT>(), static_cast<IT>(0));
1587
1588
1589 const IT * locCCSizes = ccSizes.GetLocArr();
1590 int numBins = 200;
1591 std::vector<IT> localHist(numBins,0);
1592 for(IT i=0; i< ccSizes.LocArrSize(); i++)
1593 {
1594 IT bin = (locCCSizes[i]*(numBins-1))/largestCCSize;
1595 localHist[bin]++;
1596 }
1597
1598 std::vector<IT> globalHist(numBins,0);
1599 MPI_Comm world = CC.getcommgrid()->GetWorld();
1600 MPI_Reduce(localHist.data(), globalHist.data(), numBins, MPIType<IT>(), MPI_SUM, 0, world);
1601
1602
1603 int myrank;
1604 MPI_Comm_rank(world,&myrank);
1605 if(myrank==0)
1606 {
1607 std::cout << "The largest component size: " << largestCCSize << std::endl;
1608 std::ofstream output;
1609 output.open("hist.txt", std::ios_base::app );
1610 std::copy(globalHist.begin(), globalHist.end(), std::ostream_iterator<IT> (output, " "));
1611 output << std::endl;
1612 output.close();
1613 }
1614
1615
1616 //ccSizes.PrintToFile("histCC.txt");
1617 }
1618
1619}
1620
#define NONSTAR
Definition CC.h:51
#define STAR
Definition CC.h:52
#define CONVERGED
Definition CC.h:53
static void Print(const std::string &s)
@ Column
Definition SpDefs.h:115
void omp_par_scan(T *A, T *B, I cnt)
Definition CC.h:89
int Mpi_Alltoallv(T *sbuff, int *s_cnt, int *sdisp, T *rbuff, int *r_cnt, int *rdisp, MPI_Comm comm)
Definition CC.h:311
int replicate(const FullyDistVec< IT, NT > dense, FullyDistSpVec< IT, IT > ri, vector< vector< NT > > &bcastBuffer)
Definition CC.h:347
FullyDistSpVec< IT, IT > UnconditionalHook2(const SpParMat< IT, NT, DER > &A, FullyDistVec< IT, IT > &parents, FullyDistVec< IT, short > stars)
Definition CC.h:1243
bool neigborsInSameCC(const SpParMat< IT, NT, DER > &A, FullyDistVec< IT, IT > &cclabel)
Definition CC.h:1347
void Correctness(const SpParMat< IT, NT, DER > &A, FullyDistVec< IT, IT > &cclabel, IT nCC, FullyDistVec< IT, IT > parent)
Definition CC.h:1357
void Shortcut(FullyDistVec< IT, IT > &parent)
Definition CC.h:1322
IT LabelCC(FullyDistVec< IT, IT > &parent, FullyDistVec< IT, IT > &cclabel)
Definition CC.h:1384
void First4Clust(FullyDistVec< IT, IT > &cc)
Definition CC.h:1559
void StarCheck(FullyDistVec< IT, IT > &parents, FullyDistVec< IT, short > &stars)
Definition CC.h:1126
FullyDistSpVec< IT, NT > Assign(FullyDistSpVec< IT, IT > &ind, FullyDistSpVec< IT, NT > &val)
Definition CC.h:747
FullyDistVec< IT, IT > CC(SpParMat< IT, NT, DER > &A, IT &nCC)
Definition CC.h:1405
int ReduceAssign(FullyDistSpVec< IT, IT > &ind, FullyDistSpVec< IT, NT > &val, vector< vector< NT > > &reduceBuffer, NT MAX_FOR_REDUCE)
Definition CC.h:580
void PrintCC(FullyDistVec< IT, IT > CC, IT nCC)
Definition CC.h:1548
FullyDistSpVec< IT, NT > Extract(const FullyDistVec< IT, NT > dense, FullyDistSpVec< IT, IT > ri)
Definition CC.h:420
void DeleteAll(A arr1)
Definition Deleter.h:48
void HistCC(FullyDistVec< IT, IT > CC, IT nCC)
Definition CC.h:1577
int Mpi_Alltoallv_kway(T *sbuff_, int *s_cnt_, int *sdisp_, T *rbuff_, int *r_cnt_, int *rdisp_, MPI_Comm c, int kway=2)
Definition CC.h:133
void StarCheckAfterHooking(const SpParMat< IT, NT, DER > &A, FullyDistVec< IT, IT > &parent, FullyDistVec< IT, short > &star, FullyDistSpVec< IT, IT > condhooks, bool isStar2StarHookPossible)
Definition CC.h:1035
FullyDistSpVec< IT, IT > ConditionalHook(const SpParMat< IT, NT, DER > &A, FullyDistVec< IT, IT > &parent, FullyDistVec< IT, short > stars, int iteration)
Definition CC.h:1195
int Mpi_Alltoallv_sparse(T *sendbuf, int *sendcnts, int *sdispls, T *recvbuf, int *recvcnts, int *rdispls, MPI_Comm comm)
static MPI_Op mpi_op()
Definition CC.h:68
static bool returnedSAID()
Definition CC.h:67
promote_trait< T1, T2 >::T_promote T_promote
Definition CC.h:65
static T_promote add(const T_promote &arg1, const T_promote &arg2)
Definition CC.h:70
static T_promote id()
Definition CC.h:66
static T_promote multiply(const T1 &arg1, const T2 &arg2)
Definition CC.h:75
static void axpy(const T1 a, const T2 &x, T_promote &y)
Definition CC.h:80