COMBINATORIAL_BLAS  1.6
ApproxWeightPerfectMatching.h
1 //
2 // ApproxWeightPerfectMatching.h
3 //
4 //
5 // Created by Ariful Azad on 8/22/17.
6 //
7 //
8 
9 #ifndef ApproxWeightPerfectMatching_h
10 #define ApproxWeightPerfectMatching_h
11 
12 #include "CombBLAS.h"
13 #include "BPMaximalMatching.h"
14 #include "BPMaximumMatching.h"
15 #include <parallel/algorithm>
16 #include <parallel/numeric>
17 #include <memory>
18 #include <limits>
19 
20 
21 using namespace std;
22 
23 namespace combblas {
24 
25 
26 template <class IT>
27 struct AWPM_param
28 {
29  int nprocs;
30  int myrank;
31  int pr;
32  int pc;
33  IT lncol;
34  IT lnrow;
35  IT m_perproc;
36  IT n_perproc;
37  IT localRowStart;
38  IT localColStart;
39  std::shared_ptr<CommGrid> commGrid;
40 };
41 
42 template <class IT, class NT>
43 std::vector<std::tuple<IT,IT,NT>> ExchangeData(std::vector<std::vector<std::tuple<IT,IT,NT>>> & tempTuples, MPI_Comm World)
44 {
45 
46  /* Create/allocate variables for vector assignment */
47  MPI_Datatype MPI_tuple;
48  MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
49  MPI_Type_commit(&MPI_tuple);
50 
51  int nprocs;
52  MPI_Comm_size(World, &nprocs);
53 
54  int * sendcnt = new int[nprocs];
55  int * recvcnt = new int[nprocs];
56  int * sdispls = new int[nprocs]();
57  int * rdispls = new int[nprocs]();
58 
59  // Set the newly found vector entries
60  IT totsend = 0;
61  for(int i=0; i<nprocs; ++i)
62  {
63  sendcnt[i] = tempTuples[i].size();
64  totsend += tempTuples[i].size();
65  }
66 
67  MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
68 
69  std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
70  std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
71  IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
72 
73 
74  std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
75  for(int i=0; i<nprocs; ++i)
76  {
77  copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
78  std::vector< std::tuple<IT,IT,NT> >().swap(tempTuples[i]); // clear memory
79  }
80  std::vector< std::tuple<IT,IT,NT> > recvTuples(totrecv);
81  MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
82  DeleteAll(sendcnt, recvcnt, sdispls, rdispls); // free all memory
83  MPI_Type_free(&MPI_tuple);
84  return recvTuples;
85 
86 }
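ExchangeData is the personalized all-to-all building block used throughout this file: the caller buckets tuples by destination rank, counts are exchanged with MPI_Alltoall, and the payload follows with MPI_Alltoallv over a contiguous byte datatype. Shipping std::tuple as raw MPI_CHAR bytes relies on the tuple being trivially copyable with identical layout on every rank. A minimal calling sketch (toy payload; the include path is an assumption):

#include <mpi.h>
#include <cstdint>
#include <tuple>
#include <vector>
#include "CombBLAS/ApproxWeightPerfectMatching.h" // include path is an assumption

int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);
    int nprocs, myrank;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    // One bucket per destination rank; tuple = (row, col, weight).
    std::vector<std::vector<std::tuple<int64_t,int64_t,double>>> buckets(nprocs);
    for (int p = 0; p < nprocs; ++p)
        buckets[p].emplace_back(int64_t(myrank), int64_t(p), 1.0); // toy payload

    // recv holds every tuple addressed to this rank, grouped by sender;
    // ExchangeData empties the input buckets to release memory early.
    std::vector<std::tuple<int64_t,int64_t,double>> recv =
        combblas::ExchangeData(buckets, MPI_COMM_WORLD);

    MPI_Finalize();
}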
87 
88 
89 
90 template <class IT, class NT>
91 std::vector<std::tuple<IT,IT,IT,NT>> ExchangeData1(std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> & tempTuples, MPI_Comm World)
92 {
93 
94  /* Create/allocate variables for vector assignment */
95  MPI_Datatype MPI_tuple;
96  MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
97  MPI_Type_commit(&MPI_tuple);
98 
99  int nprocs;
100  MPI_Comm_size(World, &nprocs);
101 
102  int * sendcnt = new int[nprocs];
103  int * recvcnt = new int[nprocs];
104  int * sdispls = new int[nprocs]();
105  int * rdispls = new int[nprocs]();
106 
107  // Set the newly found vector entries
108  IT totsend = 0;
109  for(int i=0; i<nprocs; ++i)
110  {
111  sendcnt[i] = tempTuples[i].size();
112  totsend += tempTuples[i].size();
113  }
114 
115  MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
116 
117  std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
118  std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
119  IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
120 
121  std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
122  for(int i=0; i<nprocs; ++i)
123  {
124  copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
125  std::vector< std::tuple<IT,IT,IT,NT> >().swap(tempTuples[i]); // clear memory
126  }
127  std::vector< std::tuple<IT,IT,IT,NT> > recvTuples(totrecv);
128  MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
129  DeleteAll(sendcnt, recvcnt, sdispls, rdispls); // free all memory
130  MPI_Type_free(&MPI_tuple);
131  return recvTuples;
132 }
133 
134 
135 
136 
137 // remember that getnrow() and getncol() require collectives
138 // Hence, we save them once and pass them to this function
139 template <class IT, class NT,class DER>
140 int OwnerProcs(SpParMat < IT, NT, DER > & A, IT grow, IT gcol, IT nrows, IT ncols)
141 {
142  auto commGrid = A.getcommgrid();
143  int procrows = commGrid->GetGridRows();
144  int proccols = commGrid->GetGridCols();
145 
146  IT m_perproc = nrows / procrows;
147  IT n_perproc = ncols / proccols;
148  int pr, pc;
149  if(m_perproc != 0)
150  pr = std::min(static_cast<int>(grow / m_perproc), procrows-1);
151  else // all owned by the last processor row
152  pr = procrows -1;
153  if(n_perproc != 0)
154  pc = std::min(static_cast<int>(gcol / n_perproc), proccols-1);
155  else
156  pc = proccols-1;
157  return commGrid->GetRank(pr, pc);
158 }
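OwnerProcs maps a global (row, column) index to its owner in the 2D block distribution: each process row owns m_perproc = nrows/procrows consecutive rows, with the remainder spilling to the last process row, and likewise for columns. A worked example of the row half of that arithmetic, with made-up sizes:

#include <algorithm>
#include <cassert>

// Mirrors the row computation inside OwnerProcs, for illustration only.
int owner_row(long grow, long nrows, int procrows)
{
    long m_perproc = nrows / procrows;        // rows per process row (floor)
    if (m_perproc == 0) return procrows - 1;  // tiny matrix: last process row owns all
    return std::min(static_cast<int>(grow / m_perproc), procrows - 1);
}

int main()
{
    // nrows = 10 on procrows = 4 gives m_perproc = 2, so the process rows
    // own rows {0,1}, {2,3}, {4,5} and the last one takes {6,7,8,9}.
    assert(owner_row(3, 10, 4) == 1);
    assert(owner_row(5, 10, 4) == 2);
    assert(owner_row(9, 10, 4) == 3); // remainder rows land on the last process row
}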
159 
160 
161 /*
162 // Hence, we save them once and pass them to this function
163 template <class IT, class NT,class DER>
164 int OwnerProcs(SpParMat < IT, NT, DER > & A, IT grow, IT gcol, IT nrows, IT ncols)
165 {
166 
167 
168 
169  int pr1, pc1;
170  if(m_perproc != 0)
171  pr1 = std::min(static_cast<int>(grow / m_perproc), pr-1);
172  else // all owned by the last processor row
173  pr1 = pr -1;
174  if(n_perproc != 0)
175  pc1 = std::min(static_cast<int>(gcol / n_perproc), pc-1);
176  else
177  pc1 = pc-1;
178  return commGrid->GetRank(pr1, pc1);
179 }
180  */
181 
182 
183 template <class IT>
184 std::vector<std::tuple<IT,IT>> MateBcast(std::vector<std::tuple<IT,IT>> sendTuples, MPI_Comm World)
185 {
186 
187  /* Create/allocate variables for vector assignment */
188  MPI_Datatype MPI_tuple;
189  MPI_Type_contiguous(sizeof(std::tuple<IT,IT>) , MPI_CHAR, &MPI_tuple);
190  MPI_Type_commit(&MPI_tuple);
191 
192 
193  int nprocs;
194  MPI_Comm_size(World, &nprocs);
195 
196  int * recvcnt = new int[nprocs];
197  int * rdispls = new int[nprocs]();
198  int sendcnt = sendTuples.size();
199 
200 
201  MPI_Allgather(&sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
202 
203  std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
204  IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
205 
206  std::vector< std::tuple<IT,IT> > recvTuples(totrecv);
207 
208 
209  MPI_Allgatherv(sendTuples.data(), sendcnt, MPI_tuple,
210  recvTuples.data(), recvcnt, rdispls,MPI_tuple,World );
211 
212  DeleteAll(recvcnt, rdispls); // free all memory
213  MPI_Type_free(&MPI_tuple);
214  return recvTuples;
215 
216 }
217 
218 
219 // -----------------------------------------------------------
220 // replicate weights of mates
221 // Can be improved by replacing the AllReduce with an All2All
222 // -----------------------------------------------------------
223 
224 template <class IT, class NT>
225 void ReplicateMateWeights( const AWPM_param<IT>& param, Dcsc<IT, NT>*dcsc, const std::vector<IT>& colptr, std::vector<IT>& RepMateC2R, std::vector<NT>& RepMateWR2C, std::vector<NT>& RepMateWC2R)
226 {
227 
228 
229  fill(RepMateWC2R.begin(), RepMateWC2R.end(), static_cast<NT>(0));
230  fill(RepMateWR2C.begin(), RepMateWR2C.end(), static_cast<NT>(0));
231 
232 
233 #ifdef THREADED
234 #pragma omp parallel for
235 #endif
236  for(int k=0; k<param.lncol; ++k)
237  {
238 
239  IT lj = k; // local numbering
240  IT mj = RepMateC2R[lj]; // mate of j
241 
242  if(mj >= param.localRowStart && mj < (param.localRowStart+param.lnrow) )
243  {
244  for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
245  {
246  IT li = dcsc->ir[cp];
247  IT i = li + param.localRowStart;
248  // TODO: use binary search to go directly to the mj-th entry if this column has more than 32 nonzeros
249  if( i == mj)
250  {
251  RepMateWC2R[lj] = dcsc->numx[cp];
252  RepMateWR2C[mj-param.localRowStart] = dcsc->numx[cp];
253  //break;
254  }
255  }
256  }
257  }
258 
259 
260 
261  MPI_Comm ColWorld = param.commGrid->GetColWorld();
262  MPI_Comm RowWorld = param.commGrid->GetRowWorld();
263 
264  MPI_Allreduce(MPI_IN_PLACE, RepMateWC2R.data(), RepMateWC2R.size(), MPIType<NT>(), MPI_SUM, ColWorld);
265  MPI_Allreduce(MPI_IN_PLACE, RepMateWR2C.data(), RepMateWR2C.size(), MPIType<NT>(), MPI_SUM, RowWorld);
266 }
267 
268 
269 
270 
271 template <class IT, class NT,class DER>
272 NT Trace( SpParMat < IT, NT, DER > & A, IT& rettrnnz)
273 {
274 
275  IT nrows = A.getnrow();
276  IT ncols = A.getncol();
277  auto commGrid = A.getcommgrid();
278  MPI_Comm World = commGrid->GetWorld();
279  int myrank=commGrid->GetRank();
280  int pr = commGrid->GetGridRows();
281  int pc = commGrid->GetGridCols();
282 
283 
284  //Information about the matrix distribution
285  //Assume that A is an nrow x ncol matrix
286  //The local submatrix is an lnrow x lncol matrix
287  int rowrank = commGrid->GetRankInProcRow();
288  int colrank = commGrid->GetRankInProcCol();
289  IT m_perproc = nrows / pr;
290  IT n_perproc = ncols / pc;
291  DER* spSeq = A.seqptr(); // local submatrix
292  IT localRowStart = colrank * m_perproc; // first row in this process
293  IT localColStart = rowrank * n_perproc; // first col in this process
294 
295 
296  IT trnnz = 0;
297  NT trace = 0.0;
298  for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit) // iterate over columns
299  {
300  IT lj = colit.colid(); // local numbering
301  IT j = lj + localColStart;
302 
303  for(auto nzit = spSeq->begnz(colit); nzit < spSeq->endnz(colit); ++nzit)
304  {
305 
306  IT li = nzit.rowid();
307  IT i = li + localRowStart;
308  if( i == j)
309  {
310  trnnz ++;
311  trace += nzit.value();
312 
313  }
314  }
315 
316  }
317  MPI_Allreduce(MPI_IN_PLACE, &trnnz, 1, MPIType<IT>(), MPI_SUM, World);
318  MPI_Allreduce(MPI_IN_PLACE, &trace, 1, MPIType<NT>(), MPI_SUM, World);
319  rettrnnz = trnnz;
320  /*
321  if(myrank==0)
322  cout <<"nrows: " << nrows << " Nnz in the diag: " << trnnz << " sum of diag: " << trace << endl;
323  */
324  return trace;
325 
326 }
327 
328 
329 template <class NT>
330 NT MatchingWeight( std::vector<NT>& RepMateWC2R, MPI_Comm RowWorld, NT& minw)
331 {
332  NT w = 0;
333  minw = std::numeric_limits<NT>::max();
334  for(int i=0; i<RepMateWC2R.size(); i++)
335  {
336  //w += fabs(RepMateWC2R[i]);
337  //w += exp(RepMateWC2R[i]);
338  //minw = min(minw, exp(RepMateWC2R[i]));
339 
340  w += RepMateWC2R[i];
341  minw = std::min(minw, RepMateWC2R[i]);
342  }
343 
344  MPI_Allreduce(MPI_IN_PLACE, &w, 1, MPIType<NT>(), MPI_SUM, RowWorld);
345  MPI_Allreduce(MPI_IN_PLACE, &minw, 1, MPIType<NT>(), MPI_MIN, RowWorld);
346  return w;
347 }
348 
349 
350 
351 
352 
353 // update the distributed mate vectors from replicated mate vectors
354 template <class IT>
355 void UpdateMatching(FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, std::vector<IT>& RepMateR2C, std::vector<IT>& RepMateC2R)
356 {
357 
358  auto commGrid = mateRow2Col.getcommgrid();
359  MPI_Comm RowWorld = commGrid->GetRowWorld();
360  int rowroot = commGrid->GetDiagOfProcRow();
361  int pc = commGrid->GetGridCols();
362 
363  // mateRow2Col is easy
364  IT localLenR2C = mateRow2Col.LocArrSize();
365  //IT* localR2C = mateRow2Col.GetLocArr();
366  for(IT i=0, j = mateRow2Col.RowLenUntil(); i<localLenR2C; i++, j++)
367  {
368  mateRow2Col.SetLocalElement(i, RepMateR2C[j]);
369  //localR2C[i] = RepMateR2C[j];
370  }
371 
372 
373  // mateCol2Row requires communication
374  std::vector <int> sendcnts(pc);
375  std::vector <int> dpls(pc);
376  dpls[0] = 0;
377  for(int i=1; i<pc; i++)
378  {
379  dpls[i] = mateCol2Row.RowLenUntil(i);
380  sendcnts[i-1] = dpls[i] - dpls[i-1];
381  }
382  sendcnts[pc-1] = RepMateC2R.size() - dpls[pc-1];
383 
384  IT localLenC2R = mateCol2Row.LocArrSize();
385  IT* localC2R = (IT*) mateCol2Row.GetLocArr();
386  MPI_Scatterv(RepMateC2R.data(),sendcnts.data(), dpls.data(), MPIType<IT>(), localC2R, localLenC2R, MPIType<IT>(),rowroot, RowWorld);
387 }
388 
389 
390 
391 inline int ThreadBuffLenForBinning(int itemsize, int nbins)
392 {
393 // default to a conservative 256KB budget; KNL has a 1MB L2 shared by every 2 cores
394 #ifndef L2_CACHE_SIZE
395 #define L2_CACHE_SIZE 256000
396 #endif
397  int THREAD_BUF_LEN = 256;
398  while(true)
399  {
400  int bufferMem = THREAD_BUF_LEN * nbins * itemsize ;
401  if(bufferMem>L2_CACHE_SIZE ) THREAD_BUF_LEN/=2;
402  else break;
403  }
404  THREAD_BUF_LEN = std::max(1, std::min(nbins+1,THREAD_BUF_LEN)); // keep at least one slot even when nbins*itemsize exceeds the cache budget
405 
406  return THREAD_BUF_LEN;
407 }
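A worked example of the buffer sizing: with 24-byte tuples and 1024 destination bins, 256*1024*24 bytes blows the 256,000-byte budget, so the length halves 256 -> 128 -> 64 -> 32 -> 16 -> 8 until 8*1024*24 = 196,608 bytes fits. A sketch, assuming the header builds in your include path:

#include <cassert>
#include "CombBLAS/ApproxWeightPerfectMatching.h" // include path is an assumption

int main()
{
    assert(combblas::ThreadBuffLenForBinning(24, 1024) == 8); // 8*1024*24 bytes fits in L2_CACHE_SIZE
    assert(combblas::ThreadBuffLenForBinning(24, 4) == 5);    // small bin counts are capped at nbins+1
}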
408 
409 
410 
411 template <class IT, class NT>
412 std::vector< std::tuple<IT,IT,NT> > Phase1(const AWPM_param<IT>& param, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
413 {
414 
415 
416 
417 
418 
419  MPI_Comm World = param.commGrid->GetWorld();
420 
421 
422 
423  //Step 1: Count the amount of data to be sent to different processors
424  std::vector<int> sendcnt(param.nprocs,0); // number of items to be sent to each processor
425 
426 
427 #ifdef THREADED
428 #pragma omp parallel
429 #endif
430  {
431  std::vector<int> tsendcnt(param.nprocs,0);
432 #ifdef THREADED
433 #pragma omp for
434 #endif
435  for(int k=0; k<param.lncol; ++k)
436  {
437  IT mj = RepMateC2R[k]; // lj = k
438  IT j = k + param.localColStart;
439 
440  for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
441  {
442  IT li = dcsc->ir[cp];
443  IT i = li + param.localRowStart;
444  IT mi = RepMateR2C[li];
445  if( i > mj) // TODO: stop when we first come to this; maybe use <
446  {
447 
448  int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
449  int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
450  int owner = param.commGrid->GetRank(rrank , crank);
451  tsendcnt[owner]++;
452  }
453  }
454  }
455  for(int i=0; i<param.nprocs; i++)
456  {
457  __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);
458  }
459  }
460 
461 
462 
463 
464  IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
465  std::vector<int> sdispls (param.nprocs, 0);
466  std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);
467 
468  std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
469  std::vector<int> transferCount(param.nprocs,0);
470  int THREAD_BUF_LEN = ThreadBuffLenForBinning(sizeof(std::tuple<IT,IT,NT>), param.nprocs);
471 
472 
473  //Step 2: Compile data to be sent to different processors
474 #ifdef THREADED
475 #pragma omp parallel
476 #endif
477  {
478  std::vector<int> tsendcnt(param.nprocs,0);
479  std::vector<std::tuple<IT,IT, NT>> tsendTuples (param.nprocs*THREAD_BUF_LEN);
480 #ifdef THREADED
481 #pragma omp for
482 #endif
483  for(int k=0; k<param.lncol; ++k)
484  {
485  IT mj = RepMateC2R[k];
486  IT lj = k;
487  IT j = k + param.localColStart;
488 
489  for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
490  {
491  IT li = dcsc->ir[cp];
492  IT i = li + param.localRowStart;
493  IT mi = RepMateR2C[li];
494  if( i > mj) // TODO: stop when we first come to this; maybe use <
495  {
496  NT w = dcsc->numx[cp] - RepMateWR2C[li] - RepMateWC2R[lj];
497  int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
498  int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
499  int owner = param.commGrid->GetRank(rrank , crank);
500 
501  if (tsendcnt[owner] < THREAD_BUF_LEN)
502  {
503  tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mi, mj, w);
504  tsendcnt[owner]++;
505  }
506  else
507  {
508  int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
509  copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1) , sendTuples.data() + sdispls[owner]+ tt);
510 
511  tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mi, mj, w);
512  tsendcnt[owner] = 1;
513  }
514  }
515  }
516  }
517  for(int owner=0; owner < param.nprocs; owner++)
518  {
519  if (tsendcnt[owner] >0)
520  {
521  int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
522  copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner]+ tt);
523  }
524  }
525  }
526 
527  // Step 3: Communicate data
528 
529  std::vector<int> recvcnt (param.nprocs);
530  std::vector<int> rdispls (param.nprocs, 0);
531 
532  MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
533  std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
534  IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));
535 
536 
537  MPI_Datatype MPI_tuple;
538  MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
539  MPI_Type_commit(&MPI_tuple);
540 
541  std::vector< std::tuple<IT,IT,NT> > recvTuples1(totrecv);
542  MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
543  MPI_Type_free(&MPI_tuple);
544  return recvTuples1;
545 }
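The pattern to note in Phase1 is the two-pass, thread-buffered binning: the first parallel pass only counts so that sendTuples and sdispls can be sized, then each thread stages up to THREAD_BUF_LEN tuples per destination in its own buffer and claims a range of the shared array with one __sync_fetch_and_add per flush instead of one atomic per tuple. A standalone toy sketch of that staging pattern (not the Phase1 code itself; the modulo bin function is arbitrary):

#include <algorithm>
#include <vector>

// Distribute items into nbins segments of a shared output array whose layout
// (displs) was fixed by a prior counting pass, as Phase1 does with sdispls.
void binned_fill(const std::vector<int>& items, int nbins,
                 const std::vector<int>& displs, // start of each bin in 'out'
                 std::vector<int>& out,          // pre-sized shared output
                 std::vector<int>& filled)       // shared per-bin fill counts
{
    const int BUF = 256; // staging slots per (thread, bin)
#ifdef THREADED
#pragma omp parallel
#endif
    {
        std::vector<int> cnt(nbins, 0);
        std::vector<int> buf(static_cast<size_t>(nbins) * BUF);
#ifdef THREADED
#pragma omp for
#endif
        for (long k = 0; k < static_cast<long>(items.size()); ++k)
        {
            int bin = items[k] % nbins; // arbitrary toy bin function
            if (cnt[bin] == BUF) // staging area full: reserve a range, flush in bulk
            {
                int at = __sync_fetch_and_add(filled.data() + bin, BUF);
                std::copy(buf.begin() + bin * BUF, buf.begin() + (bin + 1) * BUF,
                          out.begin() + displs[bin] + at);
                cnt[bin] = 0;
            }
            buf[bin * BUF + cnt[bin]++] = items[k];
        }
        for (int b = 0; b < nbins; ++b) // flush leftovers
            if (cnt[b] > 0)
            {
                int at = __sync_fetch_and_add(filled.data() + b, cnt[b]);
                std::copy(buf.begin() + b * BUF, buf.begin() + b * BUF + cnt[b],
                          out.begin() + displs[b] + at);
            }
    }
}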
546 
547 
548 
549 
550 
551 template <class IT, class NT>
552 std::vector< std::tuple<IT,IT,IT,NT> > Phase2(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
553 {
554 
555  MPI_Comm World = param.commGrid->GetWorld();
556 
557  double tstart = MPI_Wtime();
558 
559  // Step 1: Sort for efficient searching of indices
560  __gnu_parallel::sort(recvTuples.begin(), recvTuples.end());
561  std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (param.nprocs);
562 
563  std::vector<int> sendcnt(param.nprocs,0); // number of items to be sent to each processor
564 
565  //Step 2: Count the amount of data to be sent to different processors
566  // Instead of binary search in each column, I am doing linear search
567  // Linear search is faster here because we need to search 40%-50% of the nonzeros
568  int nBins = 1;
569 #ifdef THREADED
570 #pragma omp parallel
571  {
572  nBins = omp_get_num_threads() * 4;
573  }
574 #endif
575 
576 #ifdef THREADED
577 #pragma omp parallel for
578 #endif
579  for(int i=0; i<nBins; i++)
580  {
581  int perBin = recvTuples.size()/nBins;
582  int startBinIndex = perBin * i;
583  int endBinIndex = perBin * (i+1);
584  if(i==nBins-1) endBinIndex = recvTuples.size();
585 
586 
587  std::vector<int> tsendcnt(param.nprocs,0);
588  for(int k=startBinIndex; k<endBinIndex;)
589  {
590 
591  IT mi = std::get<0>(recvTuples[k]);
592  IT lcol = mi - param.localColStart;
593  IT i = RepMateC2R[lcol];
594  IT idx1 = k;
595  IT idx2 = colptr[lcol];
596 
597  for(; idx1 < static_cast<IT>(recvTuples.size()) && std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];) //**
598  {
599 
600  IT mj = std::get<1>(recvTuples[idx1]) ;
601  IT lrow = mj - param.localRowStart;
602  IT j = RepMateR2C[lrow];
603  IT lrowMat = dcsc->ir[idx2];
604  if(lrowMat == lrow)
605  {
606  NT weight = std::get<2>(recvTuples[idx1]);
607  NT cw = weight + RepMateWR2C[lrow]; //w+W[M'[j],M[i]];
608  if (cw > 0)
609  {
610  int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
611  int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
612  int owner = param.commGrid->GetRank(rrank , crank);
613  tsendcnt[owner]++;
614  }
615 
616  idx1++; idx2++;
617  }
618  else if(lrowMat > lrow)
619  idx1 ++;
620  else
621  idx2 ++;
622  }
623 
624  for(; idx1 < static_cast<IT>(recvTuples.size()) && std::get<0>(recvTuples[idx1]) == mi ; idx1++);
625  k = idx1;
626 
627  }
628  for(int i=0; i<param.nprocs; i++)
629  {
630  __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);
631  }
632  }
633 
634 
635 
636 
637  IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
638  std::vector<int> sdispls (param.nprocs, 0);
639  std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);
640 
641  std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
642  std::vector<int> transferCount(param.nprocs,0);
643  int THREAD_BUF_LEN = ThreadBuffLenForBinning(sizeof(std::tuple<IT,IT,IT,NT>), param.nprocs);
644 
645 
646  //Step 3: Compile data to be sent to different processors
647 #ifdef THREADED
648 #pragma omp parallel for
649 #endif
650  for(int i=0; i<nBins; i++)
651  {
652  int perBin = recvTuples.size()/nBins;
653  int startBinIndex = perBin * i;
654  int endBinIndex = perBin * (i+1);
655  if(i==nBins-1) endBinIndex = recvTuples.size();
656 
657 
658  std::vector<int> tsendcnt(param.nprocs,0);
659  std::vector<std::tuple<IT,IT, IT, NT>> tsendTuples (param.nprocs*THREAD_BUF_LEN);
660  for(int k=startBinIndex; k<endBinIndex;)
661  {
662  IT mi = std::get<0>(recvTuples[k]);
663  IT lcol = mi - param.localColStart;
664  IT i = RepMateC2R[lcol];
665  IT idx1 = k;
666  IT idx2 = colptr[lcol];
667 
668  for(; idx1 < static_cast<IT>(recvTuples.size()) && std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];) //**
669  {
670 
671  IT mj = std::get<1>(recvTuples[idx1]) ;
672  IT lrow = mj - param.localRowStart;
673  IT j = RepMateR2C[lrow];
674  IT lrowMat = dcsc->ir[idx2];
675  if(lrowMat == lrow)
676  {
677  NT weight = std::get<2>(recvTuples[idx1]);
678  NT cw = weight + RepMateWR2C[lrow]; //w+W[M'[j],M[i]];
679  if (cw > 0)
680  {
681  int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
682  int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
683  int owner = param.commGrid->GetRank(rrank , crank);
684 
685  if (tsendcnt[owner] < THREAD_BUF_LEN)
686  {
687  tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mj, mi, i, cw);
688  tsendcnt[owner]++;
689  }
690  else
691  {
692  int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
693  std::copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1) , sendTuples.data() + sdispls[owner]+ tt);
694 
695  tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mj, mi, i, cw);
696  tsendcnt[owner] = 1;
697  }
698 
699  }
700 
701  idx1++; idx2++;
702  }
703  else if(lrowMat > lrow)
704  idx1 ++;
705  else
706  idx2 ++;
707  }
708 
709  for(; idx1 < static_cast<IT>(recvTuples.size()) && std::get<0>(recvTuples[idx1]) == mi ; idx1++);
710  k = idx1;
711  }
712 
713  for(int owner=0; owner < param.nprocs; owner++)
714  {
715  if (tsendcnt[owner] >0)
716  {
717  int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
718  std::copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner]+ tt);
719  }
720  }
721  }
722 
723  // Step 4: Communicate data
724 
725  std::vector<int> recvcnt (param.nprocs);
726  std::vector<int> rdispls (param.nprocs, 0);
727 
728  MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
729  std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
730  IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));
731 
732 
733  MPI_Datatype MPI_tuple;
734  MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
735  MPI_Type_commit(&MPI_tuple);
736 
737  std::vector< std::tuple<IT,IT,IT,NT> > recvTuples1(totrecv);
738  MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
739  MPI_Type_free(&MPI_tuple);
740  return recvTuples1;
741 }
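The core of Phase2 is a merge join: recvTuples arrives sorted by (column, row) and every column's row indices in the DCSC are also sorted, so the idx1/idx2 walk above intersects two sorted sequences in linear time, which is why the comment prefers it to a per-tuple binary search. A self-contained sketch of the same walk on plain vectors:

#include <cstdint>
#include <utility>
#include <vector>

// Return the (request, nonzero) index pairs whose row ids match,
// mirroring the idx1/idx2 walk in Phase2.
std::vector<std::pair<size_t,size_t>>
merge_join(const std::vector<int64_t>& requests, // sorted row ids being queried
           const std::vector<int64_t>& colrows)  // sorted row ids of one column
{
    std::vector<std::pair<size_t,size_t>> hits;
    size_t i = 0, j = 0;
    while (i < requests.size() && j < colrows.size())
    {
        if (requests[i] == colrows[j]) { hits.emplace_back(i, j); ++i; ++j; }
        else if (requests[i] < colrows[j]) ++i; // requested entry is not stored
        else ++j;                               // stored entry nobody asked about
    }
    return hits;
}

int main()
{
    std::vector<int64_t> req = {2, 5, 7};
    std::vector<int64_t> col = {1, 2, 3, 7};
    return merge_join(req, col).size() == 2 ? 0 : 1; // matches rows 2 and 7
}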
742 
743 
744 // Old version of Phase 2
745 // Not multithreaded (uses binary search)
746 // Not multithreaded (searches each column with std::find)
747 std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> Phase2_old(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
748 {
749 
750  std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (param.nprocs);
751  for(int k=0; k<recvTuples.size(); ++k)
752  {
753  IT mi = std::get<0>(recvTuples[k]) ;
754  IT mj = std::get<1>(recvTuples[k]) ;
755  IT i = RepMateC2R[mi - param.localColStart];
756  NT weight = std::get<2>(recvTuples[k]);
757 
758  if(colptr[mi- param.localColStart+1] > colptr[mi- param.localColStart] )
759  {
760  IT * ele = find(dcsc->ir+colptr[mi - param.localColStart], dcsc->ir+colptr[mi - param.localColStart+1], mj - param.localRowStart);
761 
762  // TODO: Add a function that returns the edge weight directly
763  if (ele != dcsc->ir+colptr[mi - param.localColStart+1])
764  {
765  NT cw = weight + RepMateWR2C[mj - param.localRowStart]; //w+W[M'[j],M[i]];
766  if (cw > 0)
767  {
768  IT j = RepMateR2C[mj - param.localRowStart];
769  int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
770  int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
771  int owner = param.commGrid->GetRank(rrank , crank);
772  tempTuples1[owner].push_back(make_tuple(mj, mi, i, cw));
773  }
774  }
775  }
776  }
777 
778  return tempTuples1;
779 }
780 
781 template <class IT, class NT, class DER>
782 void TwoThirdApprox(SpParMat < IT, NT, DER > & A, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row)
783 {
784 
785  // Information about CommGrid and matrix layout
786  // Assume that processes are laid in (pr x pc) process grid
787  auto commGrid = A.getcommgrid();
788  int myrank=commGrid->GetRank();
789  MPI_Comm World = commGrid->GetWorld();
790  MPI_Comm ColWorld = commGrid->GetColWorld();
791  MPI_Comm RowWorld = commGrid->GetRowWorld();
792  int nprocs = commGrid->GetSize();
793  int pr = commGrid->GetGridRows();
794  int pc = commGrid->GetGridCols();
795  int rowrank = commGrid->GetRankInProcRow();
796  int colrank = commGrid->GetRankInProcCol();
797  int diagneigh = commGrid->GetComplementRank();
798 
799  //Information about the matrix distribution
800  //Assume that A is an nrow x ncol matrix
801  //The local submatrix is an lnrow x lncol matrix
802  IT nrows = A.getnrow();
803  IT ncols = A.getncol();
804  IT nnz = A.getnnz();
805  IT m_perproc = nrows / pr;
806  IT n_perproc = ncols / pc;
807  DER* spSeq = A.seqptr(); // local submatrix
808  Dcsc<IT, NT>* dcsc = spSeq->GetDCSC();
809  IT lnrow = spSeq->getnrow();
810  IT lncol = spSeq->getncol();
811  IT localRowStart = colrank * m_perproc; // first row in this process
812  IT localColStart = rowrank * n_perproc; // first col in this process
813 
814  AWPM_param<IT> param;
815  param.nprocs = nprocs;
816  param.pr = pr;
817  param.pc = pc;
818  param.lncol = lncol;
819  param.lnrow = lnrow;
820  param.m_perproc = m_perproc;
821  param.n_perproc = n_perproc;
822  param.localRowStart = localRowStart;
823  param.localColStart = localColStart;
824  param.myrank = myrank;
825  param.commGrid = commGrid;
826 
827 
828  // -----------------------------------------------------------
829  // replicate mate vectors for mateCol2Row
830  // Communication cost: same as the first communication of SpMV
831  // -----------------------------------------------------------
832  int xsize = (int) mateCol2Row.LocArrSize();
833  int trxsize = 0;
834  MPI_Status status;
835  MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh, TRX, &trxsize, 1, MPI_INT, diagneigh, TRX, World, &status);
836  std::vector<IT> trxnums(trxsize);
837  MPI_Sendrecv(mateCol2Row.GetLocArr(), xsize, MPIType<IT>(), diagneigh, TRX, trxnums.data(), trxsize, MPIType<IT>(), diagneigh, TRX, World, &status);
838 
839 
840  std::vector<int> colsize(pc);
841  colsize[colrank] = trxsize;
842  MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize.data(), 1, MPI_INT, ColWorld);
843  std::vector<int> dpls(pc,0); // displacements (zero initialized pid)
844  std::partial_sum(colsize.data(), colsize.data()+pc-1, dpls.data()+1);
845  int accsize = std::accumulate(colsize.data(), colsize.data()+pc, 0);
846  std::vector<IT> RepMateC2R(accsize);
847  MPI_Allgatherv(trxnums.data(), trxsize, MPIType<IT>(), RepMateC2R.data(), colsize.data(), dpls.data(), MPIType<IT>(), ColWorld);
848  // -----------------------------------------------------------
849 
850 
851  // -----------------------------------------------------------
852  // replicate mate vectors for mateRow2Col
853  // Communication cost: same as the first communication of SpMV
854  // (minus the cost of transposing the vector)
855  // -----------------------------------------------------------
856 
857 
858  xsize = (int) mateRow2Col.LocArrSize();
859 
860  std::vector<int> rowsize(pr);
861  rowsize[rowrank] = xsize;
862  MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, rowsize.data(), 1, MPI_INT, RowWorld);
863  std::vector<int> rdpls(pr,0); // displacements (zero initialized pid)
864  std::partial_sum(rowsize.data(), rowsize.data()+pr-1, rdpls.data()+1);
865  accsize = std::accumulate(rowsize.data(), rowsize.data()+pr, 0);
866  std::vector<IT> RepMateR2C(accsize);
867  MPI_Allgatherv(mateRow2Col.GetLocArr(), xsize, MPIType<IT>(), RepMateR2C.data(), rowsize.data(), rdpls.data(), MPIType<IT>(), RowWorld);
868  // -----------------------------------------------------------
869 
870 
871 
872  // Getting column pointers for all columns (for CSC-style access)
873  std::vector<IT> colptr (lncol+1,-1);
874  for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit) // iterate over all columns
875  {
876  IT lj = colit.colid(); // local numbering
877 
878  colptr[lj] = colit.colptr();
879  }
880  colptr[lncol] = spSeq->getnnz();
881  for(IT k=lncol-1; k>=0; k--)
882  {
883  if(colptr[k] == -1)
884  {
885  colptr[k] = colptr[k+1];
886  }
887  }
888  // TODO: will this fail for an empty local matrix, where every entry of colptr will be zero?
889 
890 
891  // -----------------------------------------------------------
892  // replicate weights of mates
893  // -----------------------------------------------------------
894  std::vector<NT> RepMateWR2C(lnrow);
895  std::vector<NT> RepMateWC2R(lncol);
896  ReplicateMateWeights(param, dcsc, colptr, RepMateC2R, RepMateWR2C, RepMateWC2R);
897 
898 
899  int iterations = 0;
900  NT minw;
901  NT weightCur = MatchingWeight(RepMateWC2R, RowWorld, minw);
902  NT weightPrev = std::numeric_limits<NT>::lowest();
903  while(weightCur > weightPrev && iterations++ < 10)
904  {
905 
906 
907  if(myrank==0) std::cout << "Iteration " << iterations << ". matching weight: sum = "<< weightCur << " min = " << minw << std::endl;
908  // C requests
909  // each row is for a processor where C requests will be sent to
910  double tstart;
911 
912 
913  std::vector<std::tuple<IT,IT,NT>> recvTuples = Phase1(param, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R );
914  std::vector<std::tuple<IT,IT,IT,NT>> recvTuples1 = Phase2(param, recvTuples, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R );
915  std::vector< std::tuple<IT,IT,NT> >().swap(recvTuples);
916 
917 
918 
919  tstart = MPI_Wtime();
920 
921  std::vector<std::tuple<IT,IT,IT,NT>> bestTuplesPhase3 (lncol);
922 #ifdef THREADED
923 #pragma omp parallel for
924 #endif
925  for(int k=0; k<lncol; ++k)
926  {
927  bestTuplesPhase3[k] = std::make_tuple(-1,-1,-1,0); // fix this
928  }
929 
930  for(int k=0; k<recvTuples1.size(); ++k)
931  {
932  IT mj = std::get<0>(recvTuples1[k]) ;
933  IT mi = std::get<1>(recvTuples1[k]) ;
934  IT i = std::get<2>(recvTuples1[k]) ;
935  NT weight = std::get<3>(recvTuples1[k]);
936  IT j = RepMateR2C[mj - localRowStart];
937  IT lj = j - localColStart;
938 
939  // we can get rid of the first check if edge weights are non-negative
940  if( (std::get<0>(bestTuplesPhase3[lj]) == -1) || (weight > std::get<3>(bestTuplesPhase3[lj])) )
941  {
942  bestTuplesPhase3[lj] = std::make_tuple(i,mi,mj,weight);
943  }
944  }
945 
946  std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (nprocs);
947  for(int k=0; k<lncol; ++k)
948  {
949  if( std::get<0>(bestTuplesPhase3[k]) != -1)
950  {
951  //IT j = RepMateR2C[mj - localRowStart]; /// fix me
952 
953  IT i = std::get<0>(bestTuplesPhase3[k]) ;
954  IT mi = std::get<1>(bestTuplesPhase3[k]) ;
955  IT mj = std::get<2>(bestTuplesPhase3[k]) ;
956  IT j = RepMateR2C[mj - localRowStart];
957  NT weight = std::get<3>(bestTuplesPhase3[k]);
958  int owner = OwnerProcs(A, i, mi, nrows, ncols);
959 
960  tempTuples1[owner].push_back(std::make_tuple(i, j, mj, weight));
961  }
962  }
963 
964  //vector< tuple<IT,IT,IT, NT> >().swap(recvTuples1);
965  recvTuples1 = ExchangeData1(tempTuples1, World);
966 
967  std::vector<std::tuple<IT,IT,IT,IT, NT>> bestTuplesPhase4 (lncol);
968  // we could have used lnrow in both bestTuplesPhase3 and bestTuplesPhase4
969 
970  // Phase 4
971  // at the owner of (i,mi)
972 #ifdef THREADED
973 #pragma omp parallel for
974 #endif
975  for(int k=0; k<lncol; ++k)
976  {
977  bestTuplesPhase4[k] = std::make_tuple(-1,-1,-1,-1,0);
978  }
979 
980  for(int k=0; k<recvTuples1.size(); ++k)
981  {
982  IT i = std::get<0>(recvTuples1[k]) ;
983  IT j = std::get<1>(recvTuples1[k]) ;
984  IT mj = std::get<2>(recvTuples1[k]) ;
985  IT mi = RepMateR2C[i-localRowStart];
986  NT weight = std::get<3>(recvTuples1[k]);
987  IT lmi = mi - localColStart;
988  //IT lj = j - localColStart;
989 
990  // cout <<"****" << i << " " << mi << " "<< j << " " << mj << " " << get<0>(bestTuplesPhase4[lj]) << endl;
991  // we can get rid of the first check if edge weights are non-negative
992  if( ((std::get<0>(bestTuplesPhase4[lmi]) == -1) || (weight > std::get<4>(bestTuplesPhase4[lmi]))) && std::get<0>(bestTuplesPhase3[lmi])==-1 )
993  {
994  bestTuplesPhase4[lmi] = std::make_tuple(i,j,mi,mj,weight);
995  //cout << "(("<< i << " " << mi << " "<< j << " " << mj << "))"<< endl;
996  }
997  }
998 
999 
1000  std::vector<std::vector<std::tuple<IT,IT,IT, IT>>> winnerTuples (nprocs);
1001 
1002 
1003  for(int k=0; k<lncol; ++k)
1004  {
1005  if( std::get<0>(bestTuplesPhase4[k]) != -1)
1006  {
1007  //int owner = OwnerProcs(A, get<0>(bestTuples[k]), get<1>(bestTuples[k]), nrows, ncols); // (i,mi)
1008  //tempTuples[owner].push_back(bestTuples[k]);
1009  IT i = std::get<0>(bestTuplesPhase4[k]) ;
1010  IT j = std::get<1>(bestTuplesPhase4[k]) ;
1011  IT mi = std::get<2>(bestTuplesPhase4[k]) ;
1012  IT mj = std::get<3>(bestTuplesPhase4[k]) ;
1013 
1014 
1015  int owner = OwnerProcs(A, mj, j, nrows, ncols);
1016  winnerTuples[owner].push_back(std::make_tuple(i, j, mi, mj));
1017 
1019  // passing the opposite of the matching to the owner of (i,mi)
1020  owner = OwnerProcs(A, i, mi, nrows, ncols);
1021  winnerTuples[owner].push_back(std::make_tuple(mj, mi, j, i));
1022  }
1023  }
1024 
1025 
1026  //vector< tuple<IT,IT,IT, NT> >().swap(recvTuples1);
1027 
1028  std::vector<std::tuple<IT,IT,IT,IT>> recvWinnerTuples = ExchangeData1(winnerTuples, World);
1029 
1030  // at the owner of (mj,j)
1031  std::vector<std::tuple<IT,IT>> rowBcastTuples(recvWinnerTuples.size()); //(mi,mj)
1032  std::vector<std::tuple<IT,IT>> colBcastTuples(recvWinnerTuples.size()); //(j,i)
1033 #ifdef THREADED
1034 #pragma omp parallel for
1035 #endif
1036  for(int k=0; k<recvWinnerTuples.size(); ++k)
1037  {
1038  IT i = std::get<0>(recvWinnerTuples[k]) ;
1039  IT j = std::get<1>(recvWinnerTuples[k]) ;
1040  IT mi = std::get<2>(recvWinnerTuples[k]) ;
1041  IT mj = std::get<3>(recvWinnerTuples[k]);
1042 
1043  colBcastTuples[k] = std::make_tuple(j,i);
1044  rowBcastTuples[k] = std::make_tuple(mj,mi);
1045  }
1046 
1047  std::vector<std::tuple<IT,IT>> updatedR2C = MateBcast(rowBcastTuples, RowWorld);
1048  std::vector<std::tuple<IT,IT>> updatedC2R = MateBcast(colBcastTuples, ColWorld);
1049 
1050 #ifdef THREADED
1051 #pragma omp parallel for
1052 #endif
1053  for(int k=0; k<updatedR2C.size(); k++)
1054  {
1055  IT row = std::get<0>(updatedR2C[k]);
1056  IT mate = std::get<1>(updatedR2C[k]);
1057  RepMateR2C[row-localRowStart] = mate;
1058  }
1059 
1060 #ifdef THREADED
1061 #pragma omp parallel for
1062 #endif
1063  for(int k=0; k<updatedC2R.size(); k++)
1064  {
1065  IT col = std::get<0>(updatedC2R[k]);
1066  IT mate = std::get<1>(updatedC2R[k]);
1067  RepMateC2R[col-localColStart] = mate;
1068  }
1069 
1070 
1071  // update weights of matched edges
1072  // we can do better than this since we are doing sparse updates
1073  ReplicateMateWeights(param, dcsc, colptr, RepMateC2R, RepMateWR2C, RepMateWC2R);
1074 
1075  weightPrev = weightCur;
1076  weightCur = MatchingWeight(RepMateWC2R, RowWorld, minw);
1077 
1078 
1079  //UpdateMatching(mateRow2Col, mateCol2Row, RepMateR2C, RepMateC2R);
1080  //CheckMatching(mateRow2Col,mateCol2Row);
1081 
1082  }
1083 
1084  // update the distributed mate vectors from replicated mate vectors
1085  UpdateMatching(mateRow2Col, mateCol2Row, RepMateR2C, RepMateC2R);
1086  //weightCur = MatchingWeight(RepMateWC2R, RowWorld);
1087 
1088 
1089 
1090 }
1091 
1092 
1093  template <class IT, class NT, class DER>
1094  void TransformWeight(SpParMat < IT, NT, DER > & A, bool applylog)
1095  {
1096  //A.Apply([](NT val){return log(1+abs(val));});
1097  // if the matrix has explicit zero entries, we can still have a problem.
1098  // One solution is to remove explicit zero entries before cardinality matching (to be tested)
1099  //A.Apply([](NT val){if(val==0) return log(numeric_limits<NT>::min()); else return log(fabs(val));});
1100  A.Apply([](NT val){return (fabs(val));});
1101 
1102  FullyDistVec<IT, NT> maxvRow(A.getcommgrid());
1103  A.Reduce(maxvRow, Row, maximum<NT>(), static_cast<NT>(numeric_limits<NT>::lowest()));
1104  A.DimApply(Row, maxvRow, [](NT val, NT maxval){return val/maxval;});
1105 
1106  FullyDistVec<IT, NT> maxvCol(A.getcommgrid());
1107  A.Reduce(maxvCol, Column, maximum<NT>(), static_cast<NT>(numeric_limits<NT>::lowest()));
1108  A.DimApply(Column, maxvCol, [](NT val, NT maxval){return val/maxval;});
1109 
1110  if(applylog)
1111  A.Apply([](NT val){return log(val);});
1112 
1113  }
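TransformWeight takes absolute values, divides each row by its maximum, then divides each column of the row-scaled matrix by its maximum, leaving every entry in (0,1]; with applylog the weights become non-positive, which turns maximizing the product of matched entries into maximizing a sum. A small dense worked example, outside CombBLAS:

#include <cmath>
#include <cstdio>

int main()
{
    double a[2][2] = { {2.0, 8.0}, {3.0, 0.5} };
    for (int i = 0; i < 2; ++i) {                 // row scaling
        double rmax = std::fmax(std::fabs(a[i][0]), std::fabs(a[i][1]));
        for (int j = 0; j < 2; ++j) a[i][j] = std::fabs(a[i][j]) / rmax;
    }
    for (int j = 0; j < 2; ++j) {                 // column scaling
        double cmax = std::fmax(a[0][j], a[1][j]);
        for (int i = 0; i < 2; ++i) a[i][j] /= cmax;
    }
    // Row maxima 8 and 3 give rows {0.25, 1} and {1, 1/6}; both column maxima
    // are already 1, so the log-weights are {-1.386, 0} and {0, -1.792}.
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
            std::printf("a[%d][%d] = %g, log = %g\n", i, j, a[i][j], std::log(a[i][j]));
}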
1114 
1115  template <class IT, class NT>
1116  void AWPM(SpParMat < IT, NT, SpDCCols<IT, NT> > & A1, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, bool optimizeProd=true)
1117  {
1118  SpParMat < IT, NT, SpDCCols<IT, NT> > A(A1); // creating a copy because it is being transformed
1119 
1120  if(optimizeProd)
1121  TransformWeight(A, true);
1122  else
1123  TransformWeight(A, false);
1124  SpParMat < IT, bool, SpDCCols<IT, bool> > Abool = A; // boolean copy for column-degree computation (declaration assumed; elided in the listing)
1125  SpParMat < IT, NT, SpCCols<IT, NT> > Acsc = A; // CSC copy fed to the matching routines (declaration assumed; elided in the listing)
1126  FullyDistVec<IT, IT> degCol(A.getcommgrid());
1127  Abool.Reduce(degCol, Column, plus<IT>(), static_cast<IT>(0));
1128  double ts;
1129 
1130  // Compute the initial trace
1131  IT diagnnz;
1132  double origWeight = Trace(A, diagnnz);
1133  bool isOriginalPerfect = diagnnz==A.getnrow();
1134 
1135  // compute the maximal matching
1136  WeightedGreedy(Acsc, mateRow2Col, mateCol2Row, degCol);
1137  double mclWeight = MatchingWeight( A, mateRow2Col, mateCol2Row);
1138  SpParHelper::Print("After Greedy sanity check\n");
1139  bool isPerfectMCL = CheckMatching(mateRow2Col,mateCol2Row);
1140 
1141  // if the original matrix has a perfect matching and better weight
1142  if(isOriginalPerfect && mclWeight<=origWeight)
1143  {
1144  SpParHelper::Print("Maximal is not better than the natural ordering. Hence, keeping the natural ordering.\n");
1145  mateRow2Col.iota(A.getnrow(), 0);
1146  mateCol2Row.iota(A.getncol(), 0);
1147  mclWeight = origWeight;
1148  isPerfectMCL = true;
1149  }
1150 
1151 
1152  // MCM
1153  double tmcm = 0;
1154  double mcmWeight = mclWeight;
1155  if(!isPerfectMCL) // run MCM only if we don't have a perfect matching
1156  {
1157  ts = MPI_Wtime();
1158  maximumMatching(Acsc, mateRow2Col, mateCol2Row, true, false, true);
1159  tmcm = MPI_Wtime() - ts;
1160  mcmWeight = MatchingWeight( A, mateRow2Col, mateCol2Row) ;
1161  SpParHelper::Print("After MCM sanity check\n");
1162  CheckMatching(mateRow2Col,mateCol2Row);
1163  }
1164 
1165 
1166  // AWPM
1167  ts = MPI_Wtime();
1168  TwoThirdApprox(A, mateRow2Col, mateCol2Row);
1169  double tawpm = MPI_Wtime() - ts;
1170 
1171  double awpmWeight = MatchingWeight( A, mateRow2Col, mateCol2Row) ;
1172  SpParHelper::Print("After AWPM sanity check\n");
1173  CheckMatching(mateRow2Col,mateCol2Row);
1174  if(isOriginalPerfect && awpmWeight<origWeight) // keep original
1175  {
1176  SpParHelper::Print("AWPM is not better than the natural ordering. Hence, keeping the natural ordering.\n");
1177  mateRow2Col.iota(A.getnrow(), 0);
1178  mateCol2Row.iota(A.getncol(), 0);
1179  awpmWeight = origWeight;
1180  }
1181 
1182  }
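AWPM is the driver a solver pipeline calls: transform the weights, run the weighted greedy maximal matching, upgrade to maximum cardinality only when the greedy result is not perfect, then improve the weight with TwoThirdApprox, falling back to the natural (diagonal) ordering whenever that is perfect and at least as heavy. A hypothetical end-to-end usage sketch; the include path, input file, and reader call are assumptions about a standard CombBLAS build:

#include <mpi.h>
#include "CombBLAS/CombBLAS.h" // include path is an assumption

int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);
    {
        std::shared_ptr<combblas::CommGrid> grid(
            new combblas::CommGrid(MPI_COMM_WORLD, 0, 0)); // 0,0: square grid
        combblas::SpParMat<int64_t, double,
                           combblas::SpDCCols<int64_t, double>> A(grid);
        A.ParallelReadMM("matrix.mtx", true, combblas::maximum<double>()); // hypothetical input

        combblas::FullyDistVec<int64_t, int64_t> mateRow2Col(grid, A.getnrow(), -1);
        combblas::FullyDistVec<int64_t, int64_t> mateCol2Row(grid, A.getncol(), -1);

        // optimizeProd=true maximizes the product of matched weights via log-scaling
        combblas::AWPM(A, mateRow2Col, mateCol2Row, true);
    }
    MPI_Finalize();
}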
1183 
1184 }
1185 
1186 #endif /* ApproxWeightPerfectMatching_h */