COMBINATORIAL_BLAS  1.6
BPMaximumMatching.h
Go to the documentation of this file.
1 #ifndef BP_MAXIMUM_MATCHING_H
2 #define BP_MAXIMUM_MATCHING_H
3 
4 #include "CombBLAS.h"
5 #include <mpi.h>
6 #include <sys/time.h>
7 #include <iostream>
8 #include <functional>
9 #include <algorithm>
10 #include <vector>
11 #include <string>
12 #include <sstream>
13 #include "MatchingDefs.h"
14 
15 namespace combblas {
16 
25 template <class IT, class DER>
26 SpParMat<IT, bool, DER> PermMat (const FullyDistVec<IT,IT> & ri, const IT ncol)
27 {
28 
29  IT procsPerRow = ri.commGrid->GetGridCols(); // the number of processor in a row of processor grid
30  IT procsPerCol = ri.commGrid->GetGridRows(); // the number of processor in a column of processor grid
31 
32 
33  IT global_nrow = ri.TotalLength();
34  IT global_ncol = ncol;
35  IT m_perprocrow = global_nrow / procsPerRow;
36  IT n_perproccol = global_ncol / procsPerCol;
37 
38 
39  // The indices for FullyDistVec are offset'd to 1/p pieces
40  // The matrix indices are offset'd to 1/sqrt(p) pieces
41  // Add the corresponding offset before sending the data
42 
43  std::vector< std::vector<IT> > rowid(procsPerRow); // rowid in the local matrix of each vector entry
44  std::vector< std::vector<IT> > colid(procsPerRow); // colid in the local matrix of each vector entry
45 
46  IT locvec = ri.arr.size(); // nnz in local vector
47  IT roffset = ri.RowLenUntil(); // the number of vector elements in this processor row before the current processor
48  for(typename std::vector<IT>::size_type i=0; i< (unsigned)locvec; ++i)
49  {
50  if(ri.arr[i]>=0 && ri.arr[i]<ncol) // this specialized for matching. TODO: make it general purpose by passing a function
51  {
52  IT rowrec = (n_perproccol!=0) ? std::min(ri.arr[i] / n_perproccol, procsPerRow-1) : (procsPerRow-1);
53  // ri's numerical values give the colids and its local indices give rowids
54  rowid[rowrec].push_back( i + roffset);
55  colid[rowrec].push_back(ri.arr[i] - (rowrec * n_perproccol));
56  }
57 
58  }
59 
60 
61 
62  int * sendcnt = new int[procsPerRow];
63  int * recvcnt = new int[procsPerRow];
64  for(IT i=0; i<procsPerRow; ++i)
65  {
66  sendcnt[i] = rowid[i].size();
67  }
68 
69  MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, ri.commGrid->GetRowWorld()); // share the counts
70 
71  int * sdispls = new int[procsPerRow]();
72  int * rdispls = new int[procsPerRow]();
73  partial_sum(sendcnt, sendcnt+procsPerRow-1, sdispls+1);
74  partial_sum(recvcnt, recvcnt+procsPerRow-1, rdispls+1);
75  IT p_nnz = accumulate(recvcnt,recvcnt+procsPerRow, static_cast<IT>(0));
76 
77 
78  IT * p_rows = new IT[p_nnz];
79  IT * p_cols = new IT[p_nnz];
80  IT * senddata = new IT[locvec];
81  for(int i=0; i<procsPerRow; ++i)
82  {
83  copy(rowid[i].begin(), rowid[i].end(), senddata+sdispls[i]);
84  std::vector<IT>().swap(rowid[i]); // clear memory of rowid
85  }
86  MPI_Alltoallv(senddata, sendcnt, sdispls, MPIType<IT>(), p_rows, recvcnt, rdispls, MPIType<IT>(), ri.commGrid->GetRowWorld());
87 
88  for(int i=0; i<procsPerRow; ++i)
89  {
90  copy(colid[i].begin(), colid[i].end(), senddata+sdispls[i]);
91  std::vector<IT>().swap(colid[i]); // clear memory of colid
92  }
93  MPI_Alltoallv(senddata, sendcnt, sdispls, MPIType<IT>(), p_cols, recvcnt, rdispls, MPIType<IT>(), ri.commGrid->GetRowWorld());
94  delete [] senddata;
95 
96  std::tuple<IT,IT,bool> * p_tuples = new std::tuple<IT,IT,bool>[p_nnz];
97  for(IT i=0; i< p_nnz; ++i)
98  {
99  p_tuples[i] = make_tuple(p_rows[i], p_cols[i], 1);
100  }
101  DeleteAll(p_rows, p_cols);
102 
103 
104  // Now create the local matrix
105  IT local_nrow = ri.MyRowLength();
106  int my_proccol = ri.commGrid->GetRankInProcRow();
107  IT local_ncol = (my_proccol<(procsPerCol-1))? (n_perproccol) : (global_ncol - (n_perproccol*(procsPerCol-1)));
108 
109  // infer the concrete type SpMat<IT,IT>
110  typedef typename create_trait<DER, IT, bool>::T_inferred DER_IT;
111  DER_IT * PSeq = new DER_IT();
112  PSeq->Create( p_nnz, local_nrow, local_ncol, p_tuples); // deletion of tuples[] is handled by SpMat::Create
113 
114  SpParMat<IT,bool,DER_IT> P (PSeq, ri.commGrid);
115  //Par_DCSC_Bool P (PSeq, ri.commGrid);
116  return P;
117 }
118 
119 
120 
121 
122 /***************************************************************************
123 // Augment a matching by a set of vertex-disjoint augmenting paths.
124 // The paths are explored level-by-level similar to the level-synchronous BFS
125 // This approach is more effecient when we have many short augmenting paths
126 ***************************************************************************/
127 
128 template <typename IT>
129 void AugmentLevel(FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, FullyDistVec<IT, IT>& parentsRow, FullyDistVec<IT, IT>& leaves)
130 {
131 
132  IT nrow = mateRow2Col.TotalLength();
133  IT ncol = mateCol2Row.TotalLength();
134  FullyDistSpVec<IT, IT> col(leaves, [](IT leaf){return leaf!=-1;});
135  FullyDistSpVec<IT, IT> row(mateRow2Col.getcommgrid(), nrow);
136  FullyDistSpVec<IT, IT> nextcol(col.getcommgrid(), ncol);
137 
138  while(col.getnnz()!=0)
139  {
140 
141  row = col.Invert(nrow);
142  row = EWiseApply<IT>(row, parentsRow,
143  [](IT root, IT parent){return parent;},
144  [](IT root, IT parent){return true;},
145  false, (IT)-1);
146 
147  col = row.Invert(ncol); // children array
148  nextcol = EWiseApply<IT>(col, mateCol2Row,
149  [](IT child, IT mate){return mate;},
150  [](IT child, IT mate){return mate!=-1;},
151  false, (IT)-1);
152  mateRow2Col.Set(row);
153  mateCol2Row.Set(col);
154  col = nextcol;
155  }
156 }
157 
158 
159 /***************************************************************************
160 // Augment a matching by a set of vertex-disjoint augmenting paths.
161 // An MPI processor is responsible for a complete path.
162 // This approach is more effecient when we have few long augmenting paths
163 // We used one-sided MPI. Any PGAS language should be fine as well.
164 // This function is not thread safe, hence multithreading is not used here
165  ***************************************************************************/
166 
// Augment the matching along vertex-disjoint paths using one-sided MPI (RMA):
// each process independently walks every path that ends at one of its local
// leaves, reading parents and swapping mate entries on remote ranks directly.
template <typename IT>
void AugmentPath(FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row,FullyDistVec<IT, IT>& parentsRow, FullyDistVec<IT, IT>& leaves)
{
    // one RMA window per distributed vector, exposing its local array to all ranks
    MPI_Win win_mateRow2Col, win_mateCol2Row, win_parentsRow;
    MPI_Win_create((IT*)mateRow2Col.GetLocArr(), mateRow2Col.LocArrSize() * sizeof(IT), sizeof(IT), MPI_INFO_NULL, mateRow2Col.commGrid->GetWorld(), &win_mateRow2Col);
    MPI_Win_create((IT*)mateCol2Row.GetLocArr(), mateCol2Row.LocArrSize() * sizeof(IT), sizeof(IT), MPI_INFO_NULL, mateCol2Row.commGrid->GetWorld(), &win_mateCol2Row);
    MPI_Win_create((IT*)parentsRow.GetLocArr(), parentsRow.LocArrSize() * sizeof(IT), sizeof(IT), MPI_INFO_NULL, parentsRow.commGrid->GetWorld(), &win_parentsRow);

    IT* leaves_ptr = (IT*) leaves.GetLocArr();
    //MPI_Win_fence(0, win_mateRow2Col);
    //MPI_Win_fence(0, win_mateCol2Row);
    //MPI_Win_fence(0, win_parentsRow);

    IT row, col=100, nextrow; // col=100 is a dummy initializer; overwritten by MPI_Get before any use
    int owner_row, owner_col;
    IT locind_row, locind_col;
    int myrank;

    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    // each process walks only the paths ending at leaves stored locally;
    // paths are vertex-disjoint, so different processes never touch the same entries
    for(IT i=0; i<leaves.LocArrSize(); i++)
    {
        int depth=0; // currently unused; kept for instrumentation
        row = *(leaves_ptr+i);
        while(row != - 1) // follow the path until the root's sentinel is reached
        {
            // col = parentsRow[row]: the column preceding 'row' on the augmenting path.
            // NOTE(review): the owner/local index come from mateRow2Col.Owner but are
            // applied to win_parentsRow — assumes both row vectors share the same
            // distribution; confirm against FullyDistVec's layout.
            owner_row = mateRow2Col.Owner(row, locind_row);
            MPI_Win_lock(MPI_LOCK_SHARED, owner_row, 0, win_parentsRow);
            MPI_Get(&col, 1, MPIType<IT>(), owner_row, locind_row, 1, MPIType<IT>(), win_parentsRow);
            MPI_Win_unlock(owner_row, win_parentsRow);

            // atomically swap: mateCol2Row[col] = row, fetching the previous mate
            owner_col = mateCol2Row.Owner(col, locind_col);
            MPI_Win_lock(MPI_LOCK_SHARED, owner_col, 0, win_mateCol2Row);
            MPI_Fetch_and_op(&row, &nextrow, MPIType<IT>(), owner_col, locind_col, MPI_REPLACE, win_mateCol2Row);
            MPI_Win_unlock(owner_col, win_mateCol2Row);

            // mateRow2Col[row] = col
            MPI_Win_lock(MPI_LOCK_SHARED, owner_row, 0, win_mateRow2Col);
            MPI_Put(&col, 1, MPIType<IT>(), owner_row, locind_row, 1, MPIType<IT>(), win_mateRow2Col);
            MPI_Win_unlock(owner_row, win_mateRow2Col); // we need this otherwise col might get overwritten before communication!
            row = nextrow; // continue with the row previously matched to 'col' (or -1 past the root)

        }
    }

    //MPI_Win_fence(0, win_mateRow2Col);
    //MPI_Win_fence(0, win_mateCol2Row);
    //MPI_Win_fence(0, win_parentsRow);

    MPI_Win_free(&win_mateRow2Col);
    MPI_Win_free(&win_mateCol2Row);
    MPI_Win_free(&win_parentsRow);
}
222 
223 
224 
225 
226 
227 // Maximum cardinality matching
228 // Output: mateRow2Col and mateRow2Col
229 template <typename IT, typename NT,typename DER>
230 void maximumMatching(SpParMat < IT, NT, DER > & A, FullyDistVec<IT, IT>& mateRow2Col,
231  FullyDistVec<IT, IT>& mateCol2Row, bool prune=true, bool randMM = false, bool maximizeWeight = false)
232 {
233  static MTRand GlobalMT(123); // for reproducible result
234  typedef VertexTypeMM <IT> VertexType;
235 
236  int nthreads=1;
237 #ifdef THREADED
238 #pragma omp parallel
239  {
240  nthreads = omp_get_num_threads();
241  }
242 #endif
243  PreAllocatedSPA<VertexType> SPA(A.seq(), nthreads*4);
244 
245  double tstart = MPI_Wtime();
246  int nprocs, myrank;
247  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
248  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
249 
250  IT nrow = A.getnrow();
251  IT ncol = A.getncol();
252 
253  FullyDistSpVec<IT, VertexType> fringeRow(A.getcommgrid(), nrow);
254  FullyDistSpVec<IT, IT> umFringeRow(A.getcommgrid(), nrow);
255  FullyDistVec<IT, IT> leaves ( A.getcommgrid(), ncol, (IT) -1);
256 
257  std::vector<std::vector<double> > timing;
258  std::vector<int> layers;
259  std::vector<int64_t> phaseMatched;
260  double t1, time_search, time_augment, time_phase;
261 
262  bool matched = true;
263  int phase = 0;
264  int totalLayer = 0;
265  IT numUnmatchedCol;
266 
267 
268  MPI_Win winLeaves;
269  MPI_Win_create((IT*)leaves.GetLocArr(), leaves.LocArrSize() * sizeof(IT), sizeof(IT), MPI_INFO_NULL, A.getcommgrid()->GetWorld(), &winLeaves);
270 
271 
272  while(matched)
273  {
274  time_phase = MPI_Wtime();
275 
276  std::vector<double> phase_timing(8,0);
277  leaves.Apply ( [](IT val){return (IT) -1;});
278  FullyDistVec<IT, IT> parentsRow ( A.getcommgrid(), nrow, (IT) -1);
279  FullyDistSpVec<IT, VertexType> fringeCol(A.getcommgrid(), ncol);
280  fringeCol = EWiseApply<VertexType>(fringeCol, mateCol2Row,
281  [](VertexType vtx, IT mate){return vtx;},
282  [](VertexType vtx, IT mate){return mate==-1;},
283  true, VertexType());
284 
285 
286  if(randMM) //select rand
287  {
288  fringeCol.ApplyInd([](VertexType vtx, IT idx){return VertexType(idx,idx,GlobalMT.rand());});
289  }
290  else
291  {
292  fringeCol.ApplyInd([](VertexType vtx, IT idx){return VertexType(idx,idx);});
293  }
294 
295  ++phase;
296  numUnmatchedCol = fringeCol.getnnz();
297  int layer = 0;
298 
299 
300  time_search = MPI_Wtime();
301  while(fringeCol.getnnz() > 0)
302  {
303  layer++;
304  t1 = MPI_Wtime();
305 
306  //TODO: think about this semiring
307  if(maximizeWeight)
308  SpMV<WeightMaxMMSR<NT, VertexType>>(A, fringeCol, fringeRow, false, SPA);
309  else
310  SpMV<Select2ndMinSR<NT, VertexType>>(A, fringeCol, fringeRow, false, SPA);
311  phase_timing[0] += MPI_Wtime()-t1;
312 
313 
314 
315  // remove vertices already having parents
316 
317  t1 = MPI_Wtime();
318  fringeRow = EWiseApply<VertexType>(fringeRow, parentsRow,
319  [](VertexType vtx, IT parent){return vtx;},
320  [](VertexType vtx, IT parent){return parent==-1;},
321  false, VertexType());
322 
323  // Set parent pointer
324  parentsRow.EWiseApply(fringeRow,
325  [](IT dval, VertexType svtx){return svtx.parent;},
326  [](IT dval, VertexType svtx){return true;},
327  false, VertexType());
328 
329 
330  umFringeRow = EWiseApply<IT>(fringeRow, mateRow2Col,
331  [](VertexType vtx, IT mate){return vtx.root;},
332  [](VertexType vtx, IT mate){return mate==-1;},
333  false, VertexType());
334 
335  phase_timing[1] += MPI_Wtime()-t1;
336 
337 
338  IT nnz_umFringeRow = umFringeRow.getnnz(); // careful about this timing
339 
340  t1 = MPI_Wtime();
341  if(nnz_umFringeRow >0)
342  {
343  /*
344  if(nnz_umFringeRow < 25*nprocs)
345  {
346  leaves.GSet(umFringeRow,
347  [](IT valRoot, IT idxLeaf){return valRoot;},
348  [](IT valRoot, IT idxLeaf){return idxLeaf;},
349  winLeaves);
350  // There might be a bug here. It does not return the same output for different number of processes
351  // e.g., check with g7jac200sc.mtx matrix
352  }
353  else*/
354  {
355  FullyDistSpVec<IT, IT> temp1(A.getcommgrid(), ncol);
356  temp1 = umFringeRow.Invert(ncol);
357  leaves.Set(temp1);
358  }
359  }
360 
361  phase_timing[2] += MPI_Wtime()-t1;
362 
363 
364 
365 
366  // matched row vertices in the the fringe
367  fringeRow = EWiseApply<VertexType>(fringeRow, mateRow2Col,
368  [](VertexType vtx, IT mate){return VertexType(mate, vtx.root);},
369  [](VertexType vtx, IT mate){return mate!=-1;},
370  false, VertexType());
371 
372  t1 = MPI_Wtime();
373  if(nnz_umFringeRow>0 && prune)
374  {
375  fringeRow.FilterByVal (umFringeRow,[](VertexType vtx){return vtx.root;}, false);
376  }
377  double tprune = MPI_Wtime()-t1;
378  phase_timing[3] += tprune;
379 
380 
381  // Go to matched column from matched row in the fringe. parent is automatically set to itself.
382  t1 = MPI_Wtime();
383  fringeCol = fringeRow.Invert(ncol,
384  [](VertexType& vtx, const IT & index){return vtx.parent;},
385  [](VertexType& vtx, const IT & index){return vtx;},
386  [](VertexType& vtx1, VertexType& vtx2){return vtx1;});
387  phase_timing[4] += MPI_Wtime()-t1;
388 
389 
390 
391 
392  }
393  time_search = MPI_Wtime() - time_search;
394  phase_timing[5] += time_search;
395 
396  IT numMatchedCol = leaves.Count([](IT leaf){return leaf!=-1;});
397  phaseMatched.push_back(numMatchedCol);
398  time_augment = MPI_Wtime();
399  if (numMatchedCol== 0) matched = false;
400  else
401  {
402 
403  if(numMatchedCol < (2* nprocs * nprocs))
404  AugmentPath(mateRow2Col, mateCol2Row,parentsRow, leaves);
405  else
406  AugmentLevel(mateRow2Col, mateCol2Row,parentsRow, leaves);
407  }
408  time_augment = MPI_Wtime() - time_augment;
409  phase_timing[6] += time_augment;
410 
411  time_phase = MPI_Wtime() - time_phase;
412  phase_timing[7] += time_phase;
413  timing.push_back(phase_timing);
414  totalLayer += layer;
415  layers.push_back(layer);
416 
417  }
418 
419 
420  MPI_Win_free(&winLeaves);
421 
422  //isMaximalmatching(A, mateRow2Col, mateCol2Row, unmatchedRow, unmatchedCol);
423  //isMatching(mateCol2Row, mateRow2Col); //todo there is a better way to check this
424 
425 
426  // print statistics
427  double combTime;
428  if(myrank == 0)
429  {
430  std::cout << "****** maximum matching runtime ********\n";
431  std::cout << std::endl;
432  std::cout << "========================================================================\n";
433  std::cout << " BFS Search \n";
434  std::cout << "===================== ==================================================\n";
435  std::cout << "Phase Layer Match SpMV EWOpp CmUqL Prun CmMC BFS Aug Total\n";
436  std::cout << "===================== ===================================================\n";
437 
438  std::vector<double> totalTimes(timing[0].size(),0);
439  int nphases = timing.size();
440  for(int i=0; i<timing.size(); i++)
441  {
442  printf(" %3d %3d %8lld ", i+1, layers[i], phaseMatched[i]);
443  for(int j=0; j<timing[i].size(); j++)
444  {
445  totalTimes[j] += timing[i][j];
446  //timing[i][j] /= timing[i].back();
447  printf("%.2lf ", timing[i][j]);
448  }
449 
450  printf("\n");
451  }
452 
453  std::cout << "-----------------------------------------------------------------------\n";
454  std::cout << "Phase Layer UnMat SpMV EWOpp CmUqL Prun CmMC BFS Aug Total \n";
455  std::cout << "-----------------------------------------------------------------------\n";
456 
457  combTime = totalTimes.back();
458  printf(" %3d %3d %8lld ", nphases, totalLayer/nphases, numUnmatchedCol);
459  for(int j=0; j<totalTimes.size()-1; j++)
460  {
461  printf("%.2lf ", totalTimes[j]);
462  }
463  printf("%.2lf\n", combTime);
464  }
465 
466  IT nrows=A.getnrow();
467  IT matchedRow = mateRow2Col.Count([](IT mate){return mate!=-1;});
468  if(myrank==0)
469  {
470  std::cout << "***Final Maximum Matching***\n";
471  std::cout << "***Total-Rows Matched-Rows Total Time***\n";
472  printf("%lld %lld %lf \n",nrows, matchedRow, combTime);
473  printf("matched rows: %lld , which is: %lf percent \n",matchedRow, 100*(double)matchedRow/(nrows));
474  std::cout << "-------------------------------------------------------\n\n";
475  }
476 
477 }
478 
479 }
480 
481 #endif
482 
MTRand GlobalMT
Definition: RestrictionOp.h:18
double rand()
int size
Definition: common.h:20
Definition: CCGrid.h:4
void AugmentPath(FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row, FullyDistVec< IT, IT > &parentsRow, FullyDistVec< IT, IT > &leaves)
void maximumMatching(SpParMat< IT, NT, DER > &A, FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row, bool prune=true, bool randMM=false, bool maximizeWeight=false)
void DeleteAll(A arr1)
Definition: Deleter.h:48
void AugmentLevel(FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row, FullyDistVec< IT, IT > &parentsRow, FullyDistVec< IT, IT > &leaves)
SpParMat< IT, bool, DER > PermMat(const FullyDistVec< IT, IT > &ri, const IT ncol)
double A