COMBINATORIAL_BLAS 1.6
 
Loading...
Searching...
No Matches
ApproxWeightPerfectMatching.h
Go to the documentation of this file.
1//
2// ApproxWeightPerfectMatching.h
3//
4//
5// Created by Ariful Azad on 8/22/17.
6//
7//
8
9#ifndef ApproxWeightPerfectMatching_h
10#define ApproxWeightPerfectMatching_h
11
12#include "CombBLAS/CombBLAS.h"
13#include "BPMaximalMatching.h"
14#include "BPMaximumMatching.h"
15#include <parallel/algorithm>
16#include <parallel/numeric>
17#include <memory>
18#include <limits>
19
20
21using namespace std;
22
23namespace combblas {
24
25
26template <class IT>
27struct AWPM_param
28{
29 int nprocs;
30 int myrank;
31 int pr;
32 int pc;
33 IT lncol;
34 IT lnrow;
39 std::shared_ptr<CommGrid> commGrid;
40};
41
42
43template <class IT, class NT>
44std::vector<std::tuple<IT,IT,NT>> ExchangeData(std::vector<std::vector<std::tuple<IT,IT,NT>>> & tempTuples, MPI_Comm World)
45{
46
47 /* Create/allocate variables for vector assignment */
48 MPI_Datatype MPI_tuple;
49 MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
50 MPI_Type_commit(&MPI_tuple);
51
52 int nprocs;
53 MPI_Comm_size(World, &nprocs);
54
55 int * sendcnt = new int[nprocs];
56 int * recvcnt = new int[nprocs];
57 int * sdispls = new int[nprocs]();
58 int * rdispls = new int[nprocs]();
59
60 // Set the newly found vector entries
61 IT totsend = 0;
62 for(IT i=0; i<nprocs; ++i)
63 {
64 sendcnt[i] = tempTuples[i].size();
65 totsend += tempTuples[i].size();
66 }
67
68 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
69
70 std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
71 std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
72 IT totrecv = accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
73
74
75 std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
76 for(int i=0; i<nprocs; ++i)
77 {
78 copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
79 std::vector< std::tuple<IT,IT,NT> >().swap(tempTuples[i]); // clear memory
80 }
81 std::vector< std::tuple<IT,IT,NT> > recvTuples(totrecv);
82 MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
83 DeleteAll(sendcnt, recvcnt, sdispls, rdispls); // free all memory
84 MPI_Type_free(&MPI_tuple);
85 return recvTuples;
86
87}
88
89
90
91template <class IT, class NT>
92std::vector<std::tuple<IT,IT,IT,NT>> ExchangeData1(std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> & tempTuples, MPI_Comm World)
93{
94
95 /* Create/allocate variables for vector assignment */
96 MPI_Datatype MPI_tuple;
97 MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
98 MPI_Type_commit(&MPI_tuple);
99
100 int nprocs;
101 MPI_Comm_size(World, &nprocs);
102
103 int * sendcnt = new int[nprocs];
104 int * recvcnt = new int[nprocs];
105 int * sdispls = new int[nprocs]();
106 int * rdispls = new int[nprocs]();
107
108 // Set the newly found vector entries
109 IT totsend = 0;
110 for(IT i=0; i<nprocs; ++i)
111 {
112 sendcnt[i] = tempTuples[i].size();
113 totsend += tempTuples[i].size();
114 }
115
116 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
117
118 std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
119 std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
120 IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
121
122 std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
123 for(int i=0; i<nprocs; ++i)
124 {
125 copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
126 std::vector< std::tuple<IT,IT,IT,NT> >().swap(tempTuples[i]); // clear memory
127 }
128 std::vector< std::tuple<IT,IT,IT,NT> > recvTuples(totrecv);
129 MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
130 DeleteAll(sendcnt, recvcnt, sdispls, rdispls); // free all memory
131 MPI_Type_free(&MPI_tuple);
132 return recvTuples;
133}
134
135
136
137
138// remember that getnrow() and getncol() require collectives
139// Hence, we save them once and pass them to this function
140template <class IT, class NT,class DER>
141int OwnerProcs(SpParMat < IT, NT, DER > & A, IT grow, IT gcol, IT nrows, IT ncols)
142{
143 auto commGrid = A.getcommgrid();
144 int procrows = commGrid->GetGridRows();
145 int proccols = commGrid->GetGridCols();
146
147 IT m_perproc = nrows / procrows;
148 IT n_perproc = ncols / proccols;
149 int pr, pc;
150 if(m_perproc != 0)
151 pr = std::min(static_cast<int>(grow / m_perproc), procrows-1);
152 else // all owned by the last processor row
153 pr = procrows -1;
154 if(n_perproc != 0)
155 pc = std::min(static_cast<int>(gcol / n_perproc), proccols-1);
156 else
157 pc = proccols-1;
158 return commGrid->GetRank(pr, pc);
159}
160
161
162/*
163// Hence, we save them once and pass them to this function
164template <class IT, class NT,class DER>
165int OwnerProcs(SpParMat < IT, NT, DER > & A, IT grow, IT gcol, IT nrows, IT ncols)
166{
167
168
169
170 int pr1, pc1;
171 if(m_perproc != 0)
172 pr1 = std::min(static_cast<int>(grow / m_perproc), pr-1);
173 else // all owned by the last processor row
174 pr1 = pr -1;
175 if(n_perproc != 0)
176 pc1 = std::min(static_cast<int>(gcol / n_perproc), pc-1);
177 else
178 pc1 = pc-1;
179 return commGrid->GetRank(pr1, pc1);
180}
181 */
182
183
184template <class IT>
185std::vector<std::tuple<IT,IT>> MateBcast(std::vector<std::tuple<IT,IT>> sendTuples, MPI_Comm World)
186{
187
188 /* Create/allocate variables for vector assignment */
189 MPI_Datatype MPI_tuple;
190 MPI_Type_contiguous(sizeof(std::tuple<IT,IT>) , MPI_CHAR, &MPI_tuple);
191 MPI_Type_commit(&MPI_tuple);
192
193
194 int nprocs;
195 MPI_Comm_size(World, &nprocs);
196
197 int * recvcnt = new int[nprocs];
198 int * rdispls = new int[nprocs]();
199 int sendcnt = sendTuples.size();
200
201
202 MPI_Allgather(&sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
203
204 std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
205 IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
206
207 std::vector< std::tuple<IT,IT> > recvTuples(totrecv);
208
209
210 MPI_Allgatherv(sendTuples.data(), sendcnt, MPI_tuple,
211 recvTuples.data(), recvcnt, rdispls,MPI_tuple,World );
212
213 DeleteAll(recvcnt, rdispls); // free all memory
214 MPI_Type_free(&MPI_tuple);
215 return recvTuples;
216
217}
218
219
220// -----------------------------------------------------------
221// replicate weights of mates
222// Can be improved by removing AllReduce by All2All
223// -----------------------------------------------------------
224
// -----------------------------------------------------------
// replicate weights of mates
// Can be improved by removing AllReduce by All2All
// -----------------------------------------------------------
// Fills RepMateWC2R[lj] with the weight of the matched edge (mate(j), j) for
// every local column j, and RepMateWR2C with the mirror entries for local
// rows. Each process only finds the entries whose matched row happens to be
// stored locally; the two Allreduce calls then replicate the full vectors
// along the process column / row (summing works because every position is
// written by at most one process and all others contribute zero).
template <class IT, class NT>
void ReplicateMateWeights( const AWPM_param<IT>& param, Dcsc<IT, NT>*dcsc, const std::vector<IT>& colptr, std::vector<IT>& RepMateC2R, std::vector<NT>& RepMateWR2C, std::vector<NT>& RepMateWC2R)
{

    // Zero both outputs so the SUM-Allreduce below acts as a replication.
    fill(RepMateWC2R.begin(), RepMateWC2R.end(), static_cast<NT>(0));
    fill(RepMateWR2C.begin(), RepMateWR2C.end(), static_cast<NT>(0));


#ifdef THREADED
#pragma omp parallel for
#endif
    // NOTE(review): loop index is int while param.lncol is IT — assumes the
    // local column count fits in int; confirm for very large matrices.
    for(int k=0; k<param.lncol; ++k)
    {

        IT lj = k; // local numbering
        IT mj = RepMateC2R[lj]; // mate of j

        // Only scan columns whose matched row falls inside this process's
        // local row range; otherwise another process finds the weight.
        if(mj >= param.localRowStart && mj < (param.localRowStart+param.lnrow) )
        {
            for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
            {
                IT li = dcsc->ir[cp];
                IT i = li + param.localRowStart; // global row of this nonzero
                // TODO: use binary search to directly go to mj-th entry if more than 32 nonzero in this column
                if( i == mj)
                {
                    // Record the matched edge's weight in both orientations.
                    RepMateWC2R[lj] = dcsc->numx[cp];
                    RepMateWR2C[mj-param.localRowStart] = dcsc->numx[cp];
                    //break;
                }
            }
        }
    }



    MPI_Comm ColWorld = param.commGrid->GetColWorld();
    MPI_Comm RowWorld = param.commGrid->GetRowWorld();

    // Replicate along the grid: every position was set by at most one rank,
    // the rest hold zero, so SUM yields the written value everywhere.
    MPI_Allreduce(MPI_IN_PLACE, RepMateWC2R.data(), RepMateWC2R.size(), MPIType<NT>(), MPI_SUM, ColWorld);
    MPI_Allreduce(MPI_IN_PLACE, RepMateWR2C.data(), RepMateWR2C.size(), MPIType<NT>(), MPI_SUM, RowWorld);
}
268
269
270
271
272template <class IT, class NT,class DER>
273NT Trace( SpParMat < IT, NT, DER > & A, IT& rettrnnz=0)
274{
275
276 IT nrows = A.getnrow();
277 IT ncols = A.getncol();
278 auto commGrid = A.getcommgrid();
279 MPI_Comm World = commGrid->GetWorld();
280 int myrank=commGrid->GetRank();
281 int pr = commGrid->GetGridRows();
282 int pc = commGrid->GetGridCols();
283
284
285 //Information about the matrix distribution
286 //Assume that A is an nrow x ncol matrix
287 //The local submatrix is an lnrow x lncol matrix
288 int rowrank = commGrid->GetRankInProcRow();
289 int colrank = commGrid->GetRankInProcCol();
290 IT m_perproc = nrows / pr;
291 IT n_perproc = ncols / pc;
292 DER* spSeq = A.seqptr(); // local submatrix
293 IT localRowStart = colrank * m_perproc; // first row in this process
294 IT localColStart = rowrank * n_perproc; // first col in this process
295
296
297 IT trnnz = 0;
298 NT trace = 0.0;
299 for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit) // iterate over columns
300 {
301 IT lj = colit.colid(); // local numbering
302 IT j = lj + localColStart;
303
304 for(auto nzit = spSeq->begnz(colit); nzit < spSeq->endnz(colit); ++nzit)
305 {
306
307 IT li = nzit.rowid();
308 IT i = li + localRowStart;
309 if( i == j)
310 {
311 trnnz ++;
312 trace += nzit.value();
313
314 }
315 }
316
317 }
318 MPI_Allreduce(MPI_IN_PLACE, &trnnz, 1, MPIType<IT>(), MPI_SUM, World);
319 MPI_Allreduce(MPI_IN_PLACE, &trace, 1, MPIType<NT>(), MPI_SUM, World);
320 rettrnnz = trnnz;
321 /*
322 if(myrank==0)
323 cout <<"nrows: " << nrows << " Nnz in the diag: " << trnnz << " sum of diag: " << trace << endl;
324 */
325 return trace;
326
327}
328
329
330template <class NT>
331NT MatchingWeight( std::vector<NT>& RepMateWC2R, MPI_Comm RowWorld, NT& minw)
332{
333 NT w = 0;
334 minw = 99999999999999.0;
335 for(int i=0; i<RepMateWC2R.size(); i++)
336 {
337 //w += fabs(RepMateWC2R[i]);
338 //w += exp(RepMateWC2R[i]);
339 //minw = min(minw, exp(RepMateWC2R[i]));
340
341 w += RepMateWC2R[i];
342 minw = std::min(minw, RepMateWC2R[i]);
343 }
344
345 MPI_Allreduce(MPI_IN_PLACE, &w, 1, MPIType<NT>(), MPI_SUM, RowWorld);
346 MPI_Allreduce(MPI_IN_PLACE, &minw, 1, MPIType<NT>(), MPI_MIN, RowWorld);
347 return w;
348}
349
350
351
352
353
// update the distributed mate vectors from replicated mate vectors
// Writes the row-replicated RepMateR2C back into the distributed mateRow2Col
// (a purely local copy), then scatters the column-replicated RepMateC2R from
// the diagonal process of each row to rebuild the distributed mateCol2Row.
template <class IT>
void UpdateMatching(FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, std::vector<IT>& RepMateR2C, std::vector<IT>& RepMateC2R)
{

    auto commGrid = mateRow2Col.getcommgrid();
    MPI_Comm RowWorld = commGrid->GetRowWorld();
    int rowroot = commGrid->GetDiagOfProcRow();
    int pc = commGrid->GetGridCols();

    // mateRow2Col is easy
    // Each process's slice of mateRow2Col starts at offset RowLenUntil()
    // inside the row-replicated vector, so this is a local copy.
    IT localLenR2C = mateRow2Col.LocArrSize();
    //IT* localR2C = mateRow2Col.GetLocArr();
    for(IT i=0, j = mateRow2Col.RowLenUntil(); i<localLenR2C; i++, j++)
    {
        mateRow2Col.SetLocalElement(i, RepMateR2C[j]);
        //localR2C[i] = RepMateR2C[j];
    }


    // mateCol2Row requires communication
    // Build Scatterv counts/displacements: dpls[i] is where process-column
    // i's slice begins inside RepMateC2R; sendcnts are the slice lengths.
    std::vector <int> sendcnts(pc);
    std::vector <int> dpls(pc);
    dpls[0] = 0;
    for(int i=1; i<pc; i++)
    {
        dpls[i] = mateCol2Row.RowLenUntil(i);
        sendcnts[i-1] = dpls[i] - dpls[i-1];
    }
    sendcnts[pc-1] = RepMateC2R.size() - dpls[pc-1]; // last slice takes the remainder

    IT localLenC2R = mateCol2Row.LocArrSize();
    // NOTE(review): C-style cast discards const from GetLocArr() so Scatterv
    // can write into the vector's local storage in place.
    IT* localC2R = (IT*) mateCol2Row.GetLocArr();
    // The diagonal process of this row (rowroot) owns the authoritative copy
    // and scatters each process's slice back to it.
    MPI_Scatterv(RepMateC2R.data(),sendcnts.data(), dpls.data(), MPIType<IT>(), localC2R, localLenC2R, MPIType<IT>(),rowroot, RowWorld);
}
389
390
391
/// Chooses the per-thread, per-bin staging-buffer length used by the binning
/// code in Phase1/Phase2 so that one thread's full set of buffers
/// (THREAD_BUF_LEN * nbins * itemsize bytes) fits in the L2 cache.
///
/// @param itemsize size in bytes of one buffered item
/// @param nbins    number of bins (destination processors)
/// @return         buffer length per bin, always >= 1
inline int ThreadBuffLenForBinning(int itemsize, int nbins)
{
    // 1MB shared cache (per 2 cores) in KNL
#ifndef L2_CACHE_SIZE
#define L2_CACHE_SIZE 256000
#endif
    int THREAD_BUF_LEN = 256;
    // Halve the buffer until the total footprint fits in L2. The product is
    // computed in 64-bit: itemsize * nbins * 256 overflows int for large
    // inputs. Stopping at 1 (rather than letting the length reach 0) keeps
    // the result usable by the binning loops.
    while(THREAD_BUF_LEN > 1)
    {
        long long bufferMem = static_cast<long long>(THREAD_BUF_LEN) * nbins * itemsize;
        if(bufferMem > L2_CACHE_SIZE) THREAD_BUF_LEN/=2;
        else break;
    }
    // Never exceed nbins+1 slots per bin, and never return less than 1.
    THREAD_BUF_LEN = std::max(1, std::min(nbins+1, THREAD_BUF_LEN));

    return THREAD_BUF_LEN;
}
409
410
411
// Phase 1 of the 2/3-approximation: for every local nonzero (i, j) with
// i > mate(j), form the candidate gain w = a_ij - W[mate(j)] - W[j] and ship
// the triple (mate(i), mate(j), w) to the processor owning that coordinate.
// Uses per-thread staging buffers (ThreadBuffLenForBinning) flushed into a
// shared send buffer with atomic fetch-and-add, then a single Alltoallv.
template <class IT, class NT>
std::vector< std::tuple<IT,IT,NT> > Phase1(const AWPM_param<IT>& param, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
{



    double tstart = MPI_Wtime();


    MPI_Comm World = param.commGrid->GetWorld();



    //Step 1: Count the amount of data to be sent to different processors
    std::vector<int> sendcnt(param.nprocs,0); // number items to be sent to each processor


#ifdef THREADED
#pragma omp parallel
#endif
    {
        // Thread-local counts, merged into the shared counts atomically below.
        std::vector<int> tsendcnt(param.nprocs,0);
#ifdef THREADED
#pragma omp for
#endif
        for(int k=0; k<param.lncol; ++k)
        {
            IT mj = RepMateC2R[k]; // lj = k
            IT j = k + param.localColStart;

            for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
            {
                IT li = dcsc->ir[cp];
                IT i = li + param.localRowStart;
                IT mi = RepMateR2C[li];
                if( i > mj) // TODO : stop when first come to this, may be use <
                {

                    // Destination: the owner of global coordinate (mj, mi)
                    // under the block distribution (last row/col takes the
                    // remainder when the quotient is zero).
                    int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                    int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
                    int owner = param.commGrid->GetRank(rrank , crank);
                    tsendcnt[owner]++;
                }
            }
        }
        // Merge thread-local counts into the shared array atomically.
        for(int i=0; i<param.nprocs; i++)
        {
            __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);
        }
    }




    IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
    std::vector<int> sdispls (param.nprocs, 0);
    std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);

    std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
    // transferCount[p]: how many slots of p's region are already claimed.
    std::vector<int> transferCount(param.nprocs,0);
    // 24 = sizeof(tuple<IT,IT,NT>) for 64-bit IT/NT; sizes the staging buffer.
    int THREAD_BUF_LEN = ThreadBuffLenForBinning(24, param.nprocs);


    //Step 2: Compile data to be sent to different processors
#ifdef THREADED
#pragma omp parallel
#endif
    {
        // Per-thread staging area: THREAD_BUF_LEN consecutive slots per owner,
        // flushed into sendTuples when a bin fills up.
        std::vector<int> tsendcnt(param.nprocs,0);
        std::vector<std::tuple<IT,IT, NT>> tsendTuples (param.nprocs*THREAD_BUF_LEN);
#ifdef THREADED
#pragma omp for
#endif
        for(int k=0; k<param.lncol; ++k)
        {
            IT mj = RepMateC2R[k];
            IT lj = k;
            IT j = k + param.localColStart;

            for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
            {
                IT li = dcsc->ir[cp];
                IT i = li + param.localRowStart;
                IT mi = RepMateR2C[li];
                if( i > mj) // TODO : stop when first come to this, may be use <
                {
                    // Candidate gain of swapping this edge into the matching.
                    double w = dcsc->numx[cp]- RepMateWR2C[li] - RepMateWC2R[lj];
                    int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                    int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
                    int owner = param.commGrid->GetRank(rrank , crank);

                    if (tsendcnt[owner] < THREAD_BUF_LEN)
                    {
                        // Room in the staging bin: buffer locally.
                        tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mi, mj, w);
                        tsendcnt[owner]++;
                    }
                    else
                    {
                        // Bin full: atomically claim a range in the shared
                        // send buffer, flush the bin, and restart it.
                        int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
                        copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1) , sendTuples.data() + sdispls[owner]+ tt);

                        tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mi, mj, w);
                        tsendcnt[owner] = 1;
                    }
                }
            }
        }
        // Flush whatever remains in each staging bin.
        for(int owner=0; owner < param.nprocs; owner++)
        {
            if (tsendcnt[owner] >0)
            {
                int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
                copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner]+ tt);
            }
        }
    }

    // NOTE(review): t1Comp/t1Comm are computed but not reported here;
    // presumably kept for the (commented-out) per-phase timing stats.
    double t1Comp = MPI_Wtime() - tstart;
    tstart = MPI_Wtime();

    // Step 3: Communicate data

    std::vector<int> recvcnt (param.nprocs);
    std::vector<int> rdispls (param.nprocs, 0);

    MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
    std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
    IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));


    // Tuples travel as opaque byte blobs (same layout on all ranks).
    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    std::vector< std::tuple<IT,IT,NT> > recvTuples1(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
    MPI_Type_free(&MPI_tuple);
    double t1Comm = MPI_Wtime() - tstart;
    return recvTuples1;
}
552
553
554
555
556
// Phase 2 of the 2/3-approximation: receives the Phase-1 candidate triples
// (mi, mj, w), sorts them so triples sharing the first index are contiguous,
// and merges each group against the local column mi-localColStart of the
// matrix (a sorted-merge over row indices, cheaper than per-entry binary
// search since 40-50% of the nonzeros are visited). For every hit with
// positive cumulative gain cw, a quadruple (mj, mi, i, cw) is sent to the
// owner of (mj, j). Same count-then-fill, thread-binned pattern as Phase1.
template <class IT, class NT>
std::vector< std::tuple<IT,IT,IT,NT> > Phase2(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
{

    MPI_Comm World = param.commGrid->GetWorld();

    double tstart = MPI_Wtime();

    // Step 1: Sort for effecient searching of indices
    __gnu_parallel::sort(recvTuples.begin(), recvTuples.end());
    // NOTE(review): tempTuples1 is declared but never used in this function.
    std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (param.nprocs);

    std::vector<int> sendcnt(param.nprocs,0); // number items to be sent to each processor

    //Step 2: Count the amount of data to be sent to different processors
    // Instead of binary search in each column, I am doing linear search
    // Linear search is faster here because, we need to search 40%-50% of nnz
    int nBins = 1;
#ifdef THREADED
#pragma omp parallel
    {
        // Oversubscribe bins 4x relative to threads for load balance.
        nBins = omp_get_num_threads() * 4;
    }
#endif

#ifdef THREADED
#pragma omp parallel for
#endif
    for(int i=0; i<nBins; i++)
    {
        // Each bin processes a contiguous slice of the sorted triples; the
        // last bin absorbs the remainder.
        int perBin = recvTuples.size()/nBins;
        int startBinIndex = perBin * i;
        int endBinIndex = perBin * (i+1);
        if(i==nBins-1) endBinIndex = recvTuples.size();


        std::vector<int> tsendcnt(param.nprocs,0);
        for(int k=startBinIndex; k<endBinIndex;)
        {

            // Merge the run of triples sharing first index mi against the
            // (sorted) row indices of local column mi - localColStart.
            IT mi = std::get<0>(recvTuples[k]);
            IT lcol = mi - param.localColStart;
            IT i = RepMateC2R[lcol];
            IT idx1 = k;
            IT idx2 = colptr[lcol];

            for(; std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];) //**
            {

                IT mj = std::get<1>(recvTuples[idx1]) ;
                IT lrow = mj - param.localRowStart;
                IT j = RepMateR2C[lrow];
                IT lrowMat = dcsc->ir[idx2];
                if(lrowMat == lrow)
                {
                    // Matrix entry (mj, mi) exists: total gain of the swap.
                    NT weight = std::get<2>(recvTuples[idx1]);
                    NT cw = weight + RepMateWR2C[lrow]; //w+W[M'[j],M[i]];
                    if (cw > 0)
                    {
                        // Count one quadruple for the owner of (mj, j).
                        int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                        int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
                        int owner = param.commGrid->GetRank(rrank , crank);
                        tsendcnt[owner]++;
                    }

                    idx1++; idx2++;
                }
                else if(lrowMat > lrow)
                    idx1 ++;
                else
                    idx2 ++;
            }

            // Skip any leftover triples of this mi-run.
            // NOTE(review): no bounds check on idx1 here — if the run ends
            // exactly at recvTuples.size() this reads one element past the
            // end before stopping; verify recvTuples layout guarantees this
            // cannot happen, or add an idx1 < size() guard.
            for(;std::get<0>(recvTuples[idx1]) == mi ; idx1++);
            k = idx1;

        }
        // Merge thread-local counts atomically.
        for(int i=0; i<param.nprocs; i++)
        {
            __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);
        }
    }




    IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
    std::vector<int> sdispls (param.nprocs, 0);
    std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);

    std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
    std::vector<int> transferCount(param.nprocs,0);
    // 32 = sizeof(tuple<IT,IT,IT,NT>) for 64-bit IT/NT.
    int THREAD_BUF_LEN = ThreadBuffLenForBinning(32, param.nprocs);


    //Step 3: Compile data to be sent to different processors
#ifdef THREADED
#pragma omp parallel for
#endif
    for(int i=0; i<nBins; i++)
    {
        // Identical traversal to Step 2, now materializing the quadruples
        // through per-thread staging bins (see Phase1 for the pattern).
        int perBin = recvTuples.size()/nBins;
        int startBinIndex = perBin * i;
        int endBinIndex = perBin * (i+1);
        if(i==nBins-1) endBinIndex = recvTuples.size();


        std::vector<int> tsendcnt(param.nprocs,0);
        std::vector<std::tuple<IT,IT, IT, NT>> tsendTuples (param.nprocs*THREAD_BUF_LEN);
        for(int k=startBinIndex; k<endBinIndex;)
        {
            IT mi = std::get<0>(recvTuples[k]);
            IT lcol = mi - param.localColStart;
            IT i = RepMateC2R[lcol];
            IT idx1 = k;
            IT idx2 = colptr[lcol];

            for(; std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];) //**
            {

                IT mj = std::get<1>(recvTuples[idx1]) ;
                IT lrow = mj - param.localRowStart;
                IT j = RepMateR2C[lrow];
                IT lrowMat = dcsc->ir[idx2];
                if(lrowMat == lrow)
                {
                    NT weight = std::get<2>(recvTuples[idx1]);
                    NT cw = weight + RepMateWR2C[lrow]; //w+W[M'[j],M[i]];
                    if (cw > 0)
                    {
                        int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                        int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
                        int owner = param.commGrid->GetRank(rrank , crank);

                        if (tsendcnt[owner] < THREAD_BUF_LEN)
                        {
                            tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mj, mi, i, cw);
                            tsendcnt[owner]++;
                        }
                        else
                        {
                            // Bin full: claim a shared-buffer range and flush.
                            int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
                            std::copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1) , sendTuples.data() + sdispls[owner]+ tt);

                            tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mj, mi, i, cw);
                            tsendcnt[owner] = 1;
                        }

                    }

                    idx1++; idx2++;
                }
                else if(lrowMat > lrow)
                    idx1 ++;
                else
                    idx2 ++;
            }

            // Skip the rest of this mi-run (same unbounded scan as Step 2).
            for(;std::get<0>(recvTuples[idx1]) == mi ; idx1++);
            k = idx1;
        }

        // Flush remaining staged quadruples.
        for(int owner=0; owner < param.nprocs; owner++)
        {
            if (tsendcnt[owner] >0)
            {
                int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
                std::copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner]+ tt);
            }
        }
    }

    // Step 4: Communicate data

    double t2Comp = MPI_Wtime() - tstart;
    tstart = MPI_Wtime();

    std::vector<int> recvcnt (param.nprocs);
    std::vector<int> rdispls (param.nprocs, 0);

    MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
    std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
    IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));


    // Quadruples travel as opaque byte blobs (same layout on all ranks).
    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    std::vector< std::tuple<IT,IT,IT,NT> > recvTuples1(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
    MPI_Type_free(&MPI_tuple);
    // NOTE(review): t2Comp/t2Comm computed but unreported (timing residue).
    double t2Comm = MPI_Wtime() - tstart;
    return recvTuples1;
}
752
753
754// Old version of Phase 2
755// Not multithreaded (uses binary search)
// Old version of Phase 2
// Not multithreaded (uses binary search)
// Kept for reference: sequential variant that, for each received triple
// (mi, mj, w), binary-searches local column mi for row mj and, on a hit with
// positive cumulative gain, buckets (mj, mi, i, cw) by destination owner.
// Returns the per-destination buckets (caller exchanges them).
template <class IT, class NT>
std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> Phase2_old(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
{

    std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (param.nprocs);
    for(int k=0; k<recvTuples.size(); ++k)
    {
        IT mi = std::get<0>(recvTuples[k]) ;
        IT mj = std::get<1>(recvTuples[k]) ;
        IT i = RepMateC2R[mi - param.localColStart];
        NT weight = std::get<2>(recvTuples[k]);

        // Only search non-empty local columns.
        if(colptr[mi- param.localColStart+1] > colptr[mi- param.localColStart] )
        {
            // Linear find over the column's sorted row indices for row mj.
            IT * ele = find(dcsc->ir+colptr[mi - param.localColStart], dcsc->ir+colptr[mi - param.localColStart+1], mj - param.localRowStart);

            // TODO: Add a function that returns the edge weight directly
            if (ele != dcsc->ir+colptr[mi - param.localColStart+1])
            {
                NT cw = weight + RepMateWR2C[mj - param.localRowStart]; //w+W[M'[j],M[i]];
                if (cw > 0)
                {
                    // Route the winning quadruple to the owner of (mj, j).
                    IT j = RepMateR2C[mj - param.localRowStart];
                    int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                    int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
                    int owner = param.commGrid->GetRank(rrank , crank);
                    tempTuples1[owner].push_back(make_tuple(mj, mi, i, cw));
                }
            }
        }
    }

    return tempTuples1;
}
790
791template <class IT, class NT, class DER>
792void TwoThirdApprox(SpParMat < IT, NT, DER > & A, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row)
793{
794
795 // Information about CommGrid and matrix layout
796 // Assume that processes are laid in (pr x pc) process grid
797 auto commGrid = A.getcommgrid();
798 int myrank=commGrid->GetRank();
799 MPI_Comm World = commGrid->GetWorld();
800 MPI_Comm ColWorld = commGrid->GetColWorld();
801 MPI_Comm RowWorld = commGrid->GetRowWorld();
802 int nprocs = commGrid->GetSize();
803 int pr = commGrid->GetGridRows();
804 int pc = commGrid->GetGridCols();
805 int rowrank = commGrid->GetRankInProcRow();
806 int colrank = commGrid->GetRankInProcCol();
807 int diagneigh = commGrid->GetComplementRank();
808
809 //Information about the matrix distribution
810 //Assume that A is an nrow x ncol matrix
811 //The local submatrix is an lnrow x lncol matrix
812 IT nrows = A.getnrow();
813 IT ncols = A.getncol();
814 IT nnz = A.getnnz();
815 IT m_perproc = nrows / pr;
816 IT n_perproc = ncols / pc;
817 DER* spSeq = A.seqptr(); // local submatrix
818 Dcsc<IT, NT>* dcsc = spSeq->GetDCSC();
819 IT lnrow = spSeq->getnrow();
820 IT lncol = spSeq->getncol();
821 IT localRowStart = colrank * m_perproc; // first row in this process
822 IT localColStart = rowrank * n_perproc; // first col in this process
823
824 AWPM_param<IT> param;
825 param.nprocs = nprocs;
826 param.pr = pr;
827 param.pc = pc;
828 param.lncol = lncol;
829 param.lnrow = lnrow;
830 param.m_perproc = m_perproc;
831 param.n_perproc = n_perproc;
832 param.localRowStart = localRowStart;
833 param.localColStart = localColStart;
834 param.myrank = myrank;
835 param.commGrid = commGrid;
836
837 //double t1CompAll = 0, t1CommAll = 0, t2CompAll = 0, t2CommAll = 0, t3CompAll = 0, t3CommAll = 0, t4CompAll = 0, t4CommAll = 0, t5CompAll = 0, t5CommAll = 0, tUpdateMateCompAll = 0, tUpdateWeightAll = 0;
838
839 double tPhase1 = 0, tPhase2 = 0, tPhase3 = 0, tPhase4 = 0, tPhase5 = 0, tUpdate = 0;
840
841
842 // -----------------------------------------------------------
843 // replicate mate vectors for mateCol2Row
844 // Communication cost: same as the first communication of SpMV
845 // -----------------------------------------------------------
846 int xsize = (int) mateCol2Row.LocArrSize();
847 int trxsize = 0;
848 MPI_Status status;
849 MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh, TRX, &trxsize, 1, MPI_INT, diagneigh, TRX, World, &status);
850 std::vector<IT> trxnums(trxsize);
851 MPI_Sendrecv(mateCol2Row.GetLocArr(), xsize, MPIType<IT>(), diagneigh, TRX, trxnums.data(), trxsize, MPIType<IT>(), diagneigh, TRX, World, &status);
852
853
854 std::vector<int> colsize(pc);
855 colsize[colrank] = trxsize;
856 MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize.data(), 1, MPI_INT, ColWorld);
857 std::vector<int> dpls(pc,0); // displacements (zero initialized pid)
858 std::partial_sum(colsize.data(), colsize.data()+pc-1, dpls.data()+1);
859 int accsize = std::accumulate(colsize.data(), colsize.data()+pc, 0);
860 std::vector<IT> RepMateC2R(accsize);
861 MPI_Allgatherv(trxnums.data(), trxsize, MPIType<IT>(), RepMateC2R.data(), colsize.data(), dpls.data(), MPIType<IT>(), ColWorld);
862 // -----------------------------------------------------------
863
864
865 // -----------------------------------------------------------
866 // replicate mate vectors for mateRow2Col
867 // Communication cost: same as the first communication of SpMV
868 // (minus the cost of tranposing vector)
869 // -----------------------------------------------------------
870
871
872 xsize = (int) mateRow2Col.LocArrSize();
873
874 std::vector<int> rowsize(pr);
875 rowsize[rowrank] = xsize;
876 MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, rowsize.data(), 1, MPI_INT, RowWorld);
877 std::vector<int> rdpls(pr,0); // displacements (zero initialized pid)
878 std::partial_sum(rowsize.data(), rowsize.data()+pr-1, rdpls.data()+1);
879 accsize = std::accumulate(rowsize.data(), rowsize.data()+pr, 0);
880 std::vector<IT> RepMateR2C(accsize);
881 MPI_Allgatherv(mateRow2Col.GetLocArr(), xsize, MPIType<IT>(), RepMateR2C.data(), rowsize.data(), rdpls.data(), MPIType<IT>(), RowWorld);
882 // -----------------------------------------------------------
883
884
885
886 // Getting column pointers for all columns (for CSC-style access)
887 std::vector<IT> colptr (lncol+1,-1);
888 for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit) // iterate over all columns
889 {
890 IT lj = colit.colid(); // local numbering
891
892 colptr[lj] = colit.colptr();
893 }
894 colptr[lncol] = spSeq->getnnz();
895 for(IT k=lncol-1; k>=0; k--)
896 {
897 if(colptr[k] == -1)
898 {
899 colptr[k] = colptr[k+1];
900 }
901 }
902 // TODO: will this fail empty local matrix where every entry of colptr will be zero
903
904
905 // -----------------------------------------------------------
906 // replicate weights of mates
907 // -----------------------------------------------------------
908 std::vector<NT> RepMateWR2C(lnrow);
909 std::vector<NT> RepMateWC2R(lncol);
910 ReplicateMateWeights(param, dcsc, colptr, RepMateC2R, RepMateWR2C, RepMateWC2R);
911
912
913 int iterations = 0;
914 NT minw;
915 NT weightCur = MatchingWeight(RepMateWC2R, RowWorld, minw);
916 NT weightPrev = weightCur - 999999999999;
917 while(weightCur > weightPrev && iterations++ < 10)
918 {
919
920
921#ifdef DETAIL_STATS
922 if(myrank==0) std::cout << "Iteration " << iterations << ". matching weight: sum = "<< weightCur << " min = " << minw << std::endl;
923#endif
924 // C requests
925 // each row is for a processor where C requests will be sent to
926 double tstart = MPI_Wtime();
927 std::vector<std::tuple<IT,IT,NT>> recvTuples = Phase1(param, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R );
928 tPhase1 += (MPI_Wtime() - tstart);
929 tstart = MPI_Wtime();
930
931 std::vector<std::tuple<IT,IT,IT,NT>> recvTuples1 = Phase2(param, recvTuples, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R );
932 std::vector< std::tuple<IT,IT,NT> >().swap(recvTuples);
933 tPhase2 += (MPI_Wtime() - tstart);
934 tstart = MPI_Wtime();
935
936
937 std::vector<std::tuple<IT,IT,IT,NT>> bestTuplesPhase3 (lncol);
938#ifdef THREADED
939#pragma omp parallel for
940#endif
941 for(int k=0; k<lncol; ++k)
942 {
943 bestTuplesPhase3[k] = std::make_tuple(-1,-1,-1,0); // fix this
944 }
945
946 for(int k=0; k<recvTuples1.size(); ++k)
947 {
948 IT mj = std::get<0>(recvTuples1[k]) ;
949 IT mi = std::get<1>(recvTuples1[k]) ;
950 IT i = std::get<2>(recvTuples1[k]) ;
951 NT weight = std::get<3>(recvTuples1[k]);
952 IT j = RepMateR2C[mj - localRowStart];
953 IT lj = j - localColStart;
954
955 // we can get rid of the first check if edge weights are non negative
956 if( (std::get<0>(bestTuplesPhase3[lj]) == -1) || (weight > std::get<3>(bestTuplesPhase3[lj])) )
957 {
958 bestTuplesPhase3[lj] = std::make_tuple(i,mi,mj,weight);
959 }
960 }
961
962 std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (nprocs);
963 for(int k=0; k<lncol; ++k)
964 {
965 if( std::get<0>(bestTuplesPhase3[k]) != -1)
966 {
967 //IT j = RepMateR2C[mj - localRowStart]; /// fix me
968
969 IT i = std::get<0>(bestTuplesPhase3[k]) ;
970 IT mi = std::get<1>(bestTuplesPhase3[k]) ;
971 IT mj = std::get<2>(bestTuplesPhase3[k]) ;
972 IT j = RepMateR2C[mj - localRowStart];
973 NT weight = std::get<3>(bestTuplesPhase3[k]);
974 int owner = OwnerProcs(A, i, mi, nrows, ncols);
975
976 tempTuples1[owner].push_back(std::make_tuple(i, j, mj, weight));
977 }
978 }
979 //vector< tuple<IT,IT,IT, NT> >().swap(recvTuples1);
980 recvTuples1 = ExchangeData1(tempTuples1, World);
981
982 tPhase3 += (MPI_Wtime() - tstart);
983 tstart = MPI_Wtime();
984
985 std::vector<std::tuple<IT,IT,IT,IT, NT>> bestTuplesPhase4 (lncol);
986 // we could have used lnrow in both bestTuplesPhase3 and bestTuplesPhase4
987
988 // Phase 4
989 // at the owner of (i,mi)
990#ifdef THREADED
991#pragma omp parallel for
992#endif
993 for(int k=0; k<lncol; ++k)
994 {
995 bestTuplesPhase4[k] = std::make_tuple(-1,-1,-1,-1,0);
996 }
997
998 for(int k=0; k<recvTuples1.size(); ++k)
999 {
1000 IT i = std::get<0>(recvTuples1[k]) ;
1001 IT j = std::get<1>(recvTuples1[k]) ;
1002 IT mj = std::get<2>(recvTuples1[k]) ;
1003 IT mi = RepMateR2C[i-localRowStart];
1004 NT weight = std::get<3>(recvTuples1[k]);
1005 IT lmi = mi - localColStart;
1006 //IT lj = j - localColStart;
1007
1008 // cout <<"****" << i << " " << mi << " "<< j << " " << mj << " " << get<0>(bestTuplesPhase4[lj]) << endl;
1009 // we can get rid of the first check if edge weights are non negative
1010 if( ((std::get<0>(bestTuplesPhase4[lmi]) == -1) || (weight > std::get<4>(bestTuplesPhase4[lmi]))) && std::get<0>(bestTuplesPhase3[lmi])==-1 )
1011 {
1012 bestTuplesPhase4[lmi] = std::make_tuple(i,j,mi,mj,weight);
1013 //cout << "(("<< i << " " << mi << " "<< j << " " << mj << "))"<< endl;
1014 }
1015 }
1016
1017
1018 std::vector<std::vector<std::tuple<IT,IT,IT, IT>>> winnerTuples (nprocs);
1019
1020
1021 for(int k=0; k<lncol; ++k)
1022 {
1023 if( std::get<0>(bestTuplesPhase4[k]) != -1)
1024 {
1025 //int owner = OwnerProcs(A, get<0>(bestTuples[k]), get<1>(bestTuples[k]), nrows, ncols); // (i,mi)
1026 //tempTuples[owner].push_back(bestTuples[k]);
1027 IT i = std::get<0>(bestTuplesPhase4[k]) ;
1028 IT j = std::get<1>(bestTuplesPhase4[k]) ;
1029 IT mi = std::get<2>(bestTuplesPhase4[k]) ;
1030 IT mj = std::get<3>(bestTuplesPhase4[k]) ;
1031
1032
1033 int owner = OwnerProcs(A, mj, j, nrows, ncols);
1034 winnerTuples[owner].push_back(std::make_tuple(i, j, mi, mj));
1035
1037 // passing the opposite of the matching to the owner of (i,mi)
1038 owner = OwnerProcs(A, i, mi, nrows, ncols);
1039 winnerTuples[owner].push_back(std::make_tuple(mj, mi, j, i));
1040 }
1041 }
1042 std::vector<std::tuple<IT,IT,IT,IT>> recvWinnerTuples = ExchangeData1(winnerTuples, World);
1043 tPhase4 += (MPI_Wtime() - tstart);
1044 tstart = MPI_Wtime();
1045
1046 // at the owner of (mj,j)
1047 std::vector<std::tuple<IT,IT>> rowBcastTuples(recvWinnerTuples.size()); //(mi,mj)
1048 std::vector<std::tuple<IT,IT>> colBcastTuples(recvWinnerTuples.size()); //(j,i)
1049#ifdef THREADED
1050#pragma omp parallel for
1051#endif
1052 for(int k=0; k<recvWinnerTuples.size(); ++k)
1053 {
1054 IT i = std::get<0>(recvWinnerTuples[k]) ;
1055 IT j = std::get<1>(recvWinnerTuples[k]) ;
1056 IT mi = std::get<2>(recvWinnerTuples[k]) ;
1057 IT mj = std::get<3>(recvWinnerTuples[k]);
1058
1059 colBcastTuples[k] = std::make_tuple(j,i);
1060 rowBcastTuples[k] = std::make_tuple(mj,mi);
1061 }
1062
1063 std::vector<std::tuple<IT,IT>> updatedR2C = MateBcast(rowBcastTuples, RowWorld);
1064 std::vector<std::tuple<IT,IT>> updatedC2R = MateBcast(colBcastTuples, ColWorld);
1065
1066 tPhase5 += (MPI_Wtime() - tstart);
1067 tstart = MPI_Wtime();
1068
1069
1070#ifdef THREADED
1071#pragma omp parallel for
1072#endif
1073 for(int k=0; k<updatedR2C.size(); k++)
1074 {
1075 IT row = std::get<0>(updatedR2C[k]);
1076 IT mate = std::get<1>(updatedR2C[k]);
1077 RepMateR2C[row-localRowStart] = mate;
1078 }
1079
1080#ifdef THREADED
1081#pragma omp parallel for
1082#endif
1083 for(int k=0; k<updatedC2R.size(); k++)
1084 {
1085 IT col = std::get<0>(updatedC2R[k]);
1086 IT mate = std::get<1>(updatedC2R[k]);
1087 RepMateC2R[col-localColStart] = mate;
1088 }
1089
1090 // update weights of matched edges
1091 // we can do better than this since we are doing sparse updates
1092 ReplicateMateWeights(param, dcsc, colptr, RepMateC2R, RepMateWR2C, RepMateWC2R);
1093 weightPrev = weightCur;
1094 weightCur = MatchingWeight(RepMateWC2R, RowWorld, minw);
1095
1096 tUpdate += (MPI_Wtime() - tstart);
1097
1098 //UpdateMatching(mateRow2Col, mateCol2Row, RepMateR2C, RepMateC2R);
1099 //CheckMatching(mateRow2Col,mateCol2Row);
1100
1101 }
1102
1103#ifdef TIMING
1104 if(myrank==0)
1105 {
1106 std::cout << "------------- overal timing (HWPM) -------------" << std::endl;
1107 //std::cout << t1CompAll << " " << t1CommAll << " " << t2CompAll << " " << t2CommAll << " " << t3CompAll << " " << t3CommAll << " " << t4CompAll << " " << t4CommAll << " " << t5CompAll << " " << t5CommAll << " " << tUpdateMateCompAll << " " << tUpdateWeightAll << std::endl;
1108 std::cout <<"Phase1: "<< tPhase1 << "\nPhase2: " << tPhase2 << "\nPhase3: " << tPhase3 << "\nPhase4: " << tPhase4 << "\nPhase5: " << tPhase5 << "\nUpdate: " << tUpdate << std::endl;
1109 std::cout << "-------------------------------------------------" << std::endl;
1110 }
1111#endif
1112 // update the distributed mate vectors from replicated mate vectors
1113 UpdateMatching(mateRow2Col, mateCol2Row, RepMateR2C, RepMateC2R);
1114 //weightCur = MatchingWeight(RepMateWC2R, RowWorld);
1115
1116
1117
1118}
1119
1120
1121 template <class IT, class NT, class DER>
1122 void TransformWeight(SpParMat < IT, NT, DER > & A, bool applylog)
1123 {
1124 //A.Apply([](NT val){return log(1+abs(val));});
1125 // if the matrix has explicit zero entries, we can still have problem.
1126 // One solution is to remove explicit zero entries before cardinality matching (to be tested)
1127 //A.Apply([](NT val){if(val==0) return log(numeric_limits<NT>::min()); else return log(fabs(val));});
1128 A.Apply([](NT val){return (fabs(val));});
1129
1130 FullyDistVec<IT, NT> maxvRow(A.getcommgrid());
1131 A.Reduce(maxvRow, Row, maximum<NT>(), static_cast<NT>(numeric_limits<NT>::lowest()));
1132 A.DimApply(Row, maxvRow, [](NT val, NT maxval){return val/maxval;});
1133
1134 FullyDistVec<IT, NT> maxvCol(A.getcommgrid());
1135 A.Reduce(maxvCol, Column, maximum<NT>(), static_cast<NT>(numeric_limits<NT>::lowest()));
1136 A.DimApply(Column, maxvCol, [](NT val, NT maxval){return val/maxval;});
1137
1138 if(applylog)
1139 A.Apply([](NT val){return log(val);});
1140
1141 }
1142
1143 template <class IT, class NT>
1144 void AWPM(SpParMat < IT, NT, SpDCCols<IT, NT> > & A1, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, bool optimizeProd=true, bool weightedCard=true)
1145 {
1146 SpParMat < IT, NT, SpDCCols<IT, NT> > A(A1); // creating a copy because it is being transformed
1147
1148 if(optimizeProd)
1149 TransformWeight(A, true);
1150 else
1151 TransformWeight(A, false);
1152 SpParMat < IT, NT, SpCCols<IT, NT> > Acsc(A);
1153 SpParMat < IT, NT, SpDCCols<IT, bool> > Abool(A);
1154 SpParMat < IT, NT, SpCCols<IT, bool> > ABoolCSC(MPI_COMM_WORLD);
1155 if(weightedCard)
1156 ABoolCSC = A;
1157
1158 FullyDistVec<IT, IT> degCol(A.getcommgrid());
1159 Abool.Reduce(degCol, Column, plus<IT>(), static_cast<IT>(0));
1160 double ts;
1161
1162 // Compute the initial trace
1163 IT diagnnz;
1164 double origWeight = Trace(A, diagnnz);
1165 bool isOriginalPerfect = diagnnz==A.getnrow();
1166
1167 //--------------------------------------------------------
1168 // Compute the maximal cardinality matching
1169 //--------------------------------------------------------
1170 if(weightedCard)
1171 WeightedGreedy(Acsc, mateRow2Col, mateCol2Row, degCol);
1172 else
1173 WeightedGreedy(ABoolCSC, mateRow2Col, mateCol2Row, degCol);
1174
1175 double mclWeight = MatchingWeight( A, mateRow2Col, mateCol2Row);
1176 bool isPerfectMCL = CheckMatching(mateRow2Col,mateCol2Row);
1177
1178 // if the original matrix has a perfect matching and better weight
1179 if(isOriginalPerfect && mclWeight<=origWeight)
1180 {
1181
1182#ifdef DETAIL_STATS
1183 SpParHelper::Print("Maximal matching is not better that the natural ordering. Hence, keeping the natural ordering.\n");
1184#endif
1185 mateRow2Col.iota(A.getnrow(), 0);
1186 mateCol2Row.iota(A.getncol(), 0);
1187 mclWeight = origWeight;
1188 isPerfectMCL = true;
1189 }
1190
1191
1192 //--------------------------------------------------------
1193 // Compute the maximum cardinality matching
1194 //--------------------------------------------------------
1195 double tmcm = 0;
1196 double mcmWeight = mclWeight;
1197 bool isPerfectMCM = isPerfectMCL;
1198 if(!isPerfectMCL) // run MCM only if we don't have a perfect matching
1199 {
1200 ts = MPI_Wtime();
1201 if(weightedCard)
1202 maximumMatching(Acsc, mateRow2Col, mateCol2Row, true, false, true);
1203 else
1204 maximumMatching(ABoolCSC, mateRow2Col, mateCol2Row, true, false, false);
1205
1206
1207 tmcm = MPI_Wtime() - ts;
1208 mcmWeight = MatchingWeight( A, mateRow2Col, mateCol2Row) ;
1209 isPerfectMCM = CheckMatching(mateRow2Col,mateCol2Row);
1210 }
1211
1212 if(!isPerfectMCM)
1213 SpParHelper::Print("Warning: The Maximum Cardinality Matching did not return a perfect matching! Need to check the input matrix.\n");
1214
1215 //--------------------------------------------------------
1216 // Increase the weight of the perfect matching
1217 //--------------------------------------------------------
1218 ts = MPI_Wtime();
1219 TwoThirdApprox(A, mateRow2Col, mateCol2Row);
1220 double tawpm = MPI_Wtime() - ts;
1221
1222 double awpmWeight = MatchingWeight( A, mateRow2Col, mateCol2Row) ;
1223
1224 bool isPerfectAWPM = CheckMatching(mateRow2Col,mateCol2Row);
1225 if(!isPerfectAWPM)
1226 SpParHelper::Print("Warning: The HWPM code did not return a perfect matching! Need to check the input matrix.\n");
1227
1228 if(isOriginalPerfect && awpmWeight<origWeight) // keep original
1229 {
1230#ifdef DETAIL_STATS
1231 SpParHelper::Print("AWPM is not better that the natural ordering. Hence, keeping the natural ordering.\n");
1232#endif
1233 mateRow2Col.iota(A.getnrow(), 0);
1234 mateCol2Row.iota(A.getncol(), 0);
1235 awpmWeight = origWeight;
1236 }
1237 }
1238
1239}
1240
1241#endif /* ApproxWeightPerfectMatching_h */
#define L2_CACHE_SIZE
static void Print(const std::string &s)
int nprocs
Definition comms.cpp:55
#define TRX
Definition SpDefs.h:103
@ Column
Definition SpDefs.h:115
bool CheckMatching(FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row)
Definition Utility.h:113
int OwnerProcs(SpParMat< IT, NT, DER > &A, IT grow, IT gcol, IT nrows, IT ncols)
std::vector< std::tuple< IT, IT, NT > > Phase1(const AWPM_param< IT > &param, Dcsc< IT, NT > *dcsc, const std::vector< IT > &colptr, const std::vector< IT > &RepMateR2C, const std::vector< IT > &RepMateC2R, const std::vector< NT > &RepMateWR2C, const std::vector< NT > &RepMateWC2R)
std::vector< std::tuple< IT, IT, IT, NT > > Phase2(const AWPM_param< IT > &param, std::vector< std::tuple< IT, IT, NT > > &recvTuples, Dcsc< IT, NT > *dcsc, const std::vector< IT > &colptr, const std::vector< IT > &RepMateR2C, const std::vector< IT > &RepMateC2R, const std::vector< NT > &RepMateWR2C, const std::vector< NT > &RepMateWC2R)
NT MatchingWeight(std::vector< NT > &RepMateWC2R, MPI_Comm RowWorld, NT &minw)
void TransformWeight(SpParMat< IT, NT, DER > &A, bool applylog)
std::vector< std::tuple< IT, IT, NT > > ExchangeData(std::vector< std::vector< std::tuple< IT, IT, NT > > > &tempTuples, MPI_Comm World)
std::vector< std::tuple< IT, IT, IT, NT > > ExchangeData1(std::vector< std::vector< std::tuple< IT, IT, IT, NT > > > &tempTuples, MPI_Comm World)
NT Trace(SpParMat< IT, NT, DER > &A, IT &rettrnnz=0)
void UpdateMatching(FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row, std::vector< IT > &RepMateR2C, std::vector< IT > &RepMateC2R)
void maximumMatching(SpParMat< IT, NT, DER > &A, FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row, bool prune=true, bool randMM=false, bool maximizeWeight=false)
int ThreadBuffLenForBinning(int itemsize, int nbins)
std::vector< std::tuple< IT, IT > > MateBcast(std::vector< std::tuple< IT, IT > > sendTuples, MPI_Comm World)
void ReplicateMateWeights(const AWPM_param< IT > &param, Dcsc< IT, NT > *dcsc, const std::vector< IT > &colptr, std::vector< IT > &RepMateC2R, std::vector< NT > &RepMateWR2C, std::vector< NT > &RepMateWC2R)
std::vector< std::vector< std::tuple< IT, IT, IT, NT > > > Phase2_old(const AWPM_param< IT > &param, std::vector< std::tuple< IT, IT, NT > > &recvTuples, Dcsc< IT, NT > *dcsc, const std::vector< IT > &colptr, const std::vector< IT > &RepMateR2C, const std::vector< IT > &RepMateC2R, const std::vector< NT > &RepMateWR2C, const std::vector< NT > &RepMateWC2R)
void TwoThirdApprox(SpParMat< IT, NT, DER > &A, FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row)
void DeleteAll(A arr1)
Definition Deleter.h:48
void WeightedGreedy(Par_MAT_Double &A, FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row, FullyDistVec< IT, IT > &degCol)
void AWPM(SpParMat< IT, NT, SpDCCols< IT, NT > > &A1, FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row, bool optimizeProd=true, bool weightedCard=true)
double A
std::shared_ptr< CommGrid > commGrid