COMBINATORIAL_BLAS 1.6
BPMaximumMatching.h
#ifndef BP_MAXIMUM_MATCHING_H
#define BP_MAXIMUM_MATCHING_H

#include "CombBLAS/CombBLAS.h"
#include <mpi.h>
#include <sys/time.h>
#include <iostream>
#include <functional>
#include <algorithm>
#include <numeric>   // std::partial_sum, std::accumulate (used by PermMat below)
#include <vector>
#include <string>
#include <sstream>
#include <tuple>
#include "MatchingDefs.h"
namespace combblas {

// Build a boolean matrix P of size ri.TotalLength() x ncol from the
// distributed index vector ri: P(i, ri[i]) = 1 for every position i whose
// value falls in [0, ncol). With a mate vector as input, this produces the
// permutation-like matrix of the matching.
template <class IT, class DER>
SpParMat<IT, bool, DER> PermMat (const FullyDistVec<IT,IT> & ri, const IT ncol)
{
    IT procsPerRow = ri.commGrid->GetGridCols();    // number of processors in a row of the processor grid
    IT procsPerCol = ri.commGrid->GetGridRows();    // number of processors in a column of the processor grid

    IT global_nrow = ri.TotalLength();
    IT global_ncol = ncol;
    IT m_perprocrow = global_nrow / procsPerRow;
    IT n_perproccol = global_ncol / procsPerCol;

    // The indices of a FullyDistVec are offset in 1/p pieces, while the
    // matrix indices are offset in 1/sqrt(p) pieces; add the corresponding
    // offset before sending the data.

    std::vector< std::vector<IT> > rowid(procsPerRow);  // row id in the local matrix of each vector entry
    std::vector< std::vector<IT> > colid(procsPerRow);  // column id in the local matrix of each vector entry

    IT locvec = ri.arr.size();      // number of entries in the local vector
    IT roffset = ri.RowLenUntil();  // number of vector elements in this processor row before the current processor
    for(typename std::vector<IT>::size_type i=0; i< (unsigned)locvec; ++i)
    {
        if(ri.arr[i]>=0 && ri.arr[i]<ncol)  // specialized for matching. TODO: make it general purpose by passing a predicate
        {
            IT rowrec = (n_perproccol!=0) ? std::min(ri.arr[i] / n_perproccol, procsPerRow-1) : (procsPerRow-1);
            // ri's numerical values give the column ids and its local indices give the row ids
            rowid[rowrec].push_back( i + roffset);
            colid[rowrec].push_back(ri.arr[i] - (rowrec * n_perproccol));
        }
    }

    int * sendcnt = new int[procsPerRow];
    int * recvcnt = new int[procsPerRow];
    for(IT i=0; i<procsPerRow; ++i)
    {
        sendcnt[i] = rowid[i].size();
    }

    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, ri.commGrid->GetRowWorld()); // share the counts

    int * sdispls = new int[procsPerRow]();
    int * rdispls = new int[procsPerRow]();
    std::partial_sum(sendcnt, sendcnt+procsPerRow-1, sdispls+1);
    std::partial_sum(recvcnt, recvcnt+procsPerRow-1, rdispls+1);
    IT p_nnz = std::accumulate(recvcnt, recvcnt+procsPerRow, static_cast<IT>(0));

    IT * p_rows = new IT[p_nnz];
    IT * p_cols = new IT[p_nnz];
    IT * senddata = new IT[locvec];
    for(int i=0; i<procsPerRow; ++i)
    {
        std::copy(rowid[i].begin(), rowid[i].end(), senddata+sdispls[i]);
        std::vector<IT>().swap(rowid[i]);   // free the memory of rowid
    }
    MPI_Alltoallv(senddata, sendcnt, sdispls, MPIType<IT>(), p_rows, recvcnt, rdispls, MPIType<IT>(), ri.commGrid->GetRowWorld());

    for(int i=0; i<procsPerRow; ++i)
    {
        std::copy(colid[i].begin(), colid[i].end(), senddata+sdispls[i]);
        std::vector<IT>().swap(colid[i]);   // free the memory of colid
    }
    MPI_Alltoallv(senddata, sendcnt, sdispls, MPIType<IT>(), p_cols, recvcnt, rdispls, MPIType<IT>(), ri.commGrid->GetRowWorld());
    delete [] senddata;
    DeleteAll(sendcnt, recvcnt, sdispls, rdispls);  // counts/displacements no longer needed (fixes a small leak)

    std::tuple<IT,IT,bool> * p_tuples = new std::tuple<IT,IT,bool>[p_nnz];
    for(IT i=0; i< p_nnz; ++i)
    {
        p_tuples[i] = std::make_tuple(p_rows[i], p_cols[i], 1);
    }
    DeleteAll(p_rows, p_cols);

    // Now create the local matrix
    IT local_nrow = ri.MyRowLength();
    int my_proccol = ri.commGrid->GetRankInProcRow();
    IT local_ncol = (my_proccol<(procsPerCol-1)) ? (n_perproccol) : (global_ncol - (n_perproccol*(procsPerCol-1)));

    // infer the concrete type SpMat<IT,bool>
    typedef typename create_trait<DER, IT, bool>::T_inferred DER_IT;
    DER_IT * PSeq = new DER_IT();
    PSeq->Create( p_nnz, local_nrow, local_ncol, p_tuples);    // deletion of p_tuples is handled by SpMat::Create

    SpParMat<IT,bool,DER_IT> P (PSeq, ri.commGrid);
    return P;
}
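
// A minimal usage sketch (hypothetical, not part of the original header):
// given a mate vector whose entry i holds the matched column of row i (or -1),
// PermMat builds the boolean matrix with a 1 at (i, mateRow2Col[i]) for every
// matched row. The types below assume the usual CombBLAS DCSC storage.
//
//   typedef SpDCCols<int64_t, bool> DCColsBool;
//   SpParMat<int64_t, bool, DCColsBool> P =
//       PermMat<int64_t, DCColsBool>(mateRow2Col, A.getncol());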


/***************************************************************************
 * Augment a matching by a set of vertex-disjoint augmenting paths.
 * The paths are explored level by level, similar to a level-synchronous BFS.
 * This approach is more efficient when there are many short augmenting paths.
 ***************************************************************************/
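//
// A worked micro-example (hypothetical indices): suppose a phase found the
// alternating path c0 -> r0 -> c1 -> r1, where c0 is an unmatched root
// column, (r0,c1) is currently matched, r1 is an unmatched leaf row, and the
// search recorded leaves[c0] = r1. Flipping the path sets mate(r1)=c1 and
// mate(r0)=c0, growing the matching by one edge; the loop below performs all
// such flips simultaneously, one tree level per iteration.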

template <typename IT>
void AugmentLevel(FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, FullyDistVec<IT, IT>& parentsRow, FullyDistVec<IT, IT>& leaves)
{
    IT nrow = mateRow2Col.TotalLength();
    IT ncol = mateCol2Row.TotalLength();
    FullyDistSpVec<IT, IT> col(leaves, [](IT leaf){return leaf!=-1;});
    FullyDistSpVec<IT, IT> row(mateRow2Col.getcommgrid(), nrow);
    FullyDistSpVec<IT, IT> nextcol(col.getcommgrid(), ncol);

    while(col.getnnz()!=0)
    {
        row = col.Invert(nrow);
        row = EWiseApply<IT>(row, parentsRow,
                             [](IT root, IT parent){return parent;},
                             [](IT root, IT parent){return true;},
                             false, (IT)-1);

        col = row.Invert(ncol);     // children array
        nextcol = EWiseApply<IT>(col, mateCol2Row,
                                 [](IT child, IT mate){return mate;},
                                 [](IT child, IT mate){return mate!=-1;},
                                 false, (IT)-1);
        mateRow2Col.Set(row);
        mateCol2Row.Set(col);
        col = nextcol;
    }
}


/***************************************************************************
 * Augment a matching by a set of vertex-disjoint augmenting paths.
 * Each MPI process is responsible for a complete path.
 * This approach is more efficient when there are few long augmenting paths.
 * We use one-sided MPI; any PGAS language would work as well.
 * This function is not thread safe, hence multithreading is not used here.
 ***************************************************************************/

template <typename IT>
void AugmentPath(FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, FullyDistVec<IT, IT>& parentsRow, FullyDistVec<IT, IT>& leaves)
{
    MPI_Win win_mateRow2Col, win_mateCol2Row, win_parentsRow;
    MPI_Win_create((IT*)mateRow2Col.GetLocArr(), mateRow2Col.LocArrSize() * sizeof(IT), sizeof(IT), MPI_INFO_NULL, mateRow2Col.commGrid->GetWorld(), &win_mateRow2Col);
    MPI_Win_create((IT*)mateCol2Row.GetLocArr(), mateCol2Row.LocArrSize() * sizeof(IT), sizeof(IT), MPI_INFO_NULL, mateCol2Row.commGrid->GetWorld(), &win_mateCol2Row);
    MPI_Win_create((IT*)parentsRow.GetLocArr(), parentsRow.LocArrSize() * sizeof(IT), sizeof(IT), MPI_INFO_NULL, parentsRow.commGrid->GetWorld(), &win_parentsRow);

    IT* leaves_ptr = (IT*) leaves.GetLocArr();

    IT row, col, nextrow;
    int owner_row, owner_col;
    IT locind_row, locind_col;
    int myrank;

    MPI_Comm comm = mateRow2Col.getcommgrid()->GetWorld();
    MPI_Comm_rank(comm, &myrank);

    for(IT i=0; i<leaves.LocArrSize(); i++)
    {
        row = *(leaves_ptr+i);
        while(row != -1)    // walk from the unmatched leaf row up to the root column
        {
            // read the parent column of the current row
            owner_row = mateRow2Col.Owner(row, locind_row);
            MPI_Win_lock(MPI_LOCK_SHARED, owner_row, 0, win_parentsRow);
            MPI_Get(&col, 1, MPIType<IT>(), owner_row, locind_row, 1, MPIType<IT>(), win_parentsRow);
            MPI_Win_unlock(owner_row, win_parentsRow);

            // atomically replace the column's mate with the current row;
            // the fetched old mate is the next row on the alternating path
            owner_col = mateCol2Row.Owner(col, locind_col);
            MPI_Win_lock(MPI_LOCK_SHARED, owner_col, 0, win_mateCol2Row);
            MPI_Fetch_and_op(&row, &nextrow, MPIType<IT>(), owner_col, locind_col, MPI_REPLACE, win_mateCol2Row);
            MPI_Win_unlock(owner_col, win_mateCol2Row);

            // point the row's mate at its parent column
            MPI_Win_lock(MPI_LOCK_SHARED, owner_row, 0, win_mateRow2Col);
            MPI_Put(&col, 1, MPIType<IT>(), owner_row, locind_row, 1, MPIType<IT>(), win_mateRow2Col);
            MPI_Win_unlock(owner_row, win_mateRow2Col);     // needed, otherwise col might get overwritten before the communication completes!
            row = nextrow;
        }
    }

    MPI_Win_free(&win_mateRow2Col);
    MPI_Win_free(&win_mateCol2Row);
    MPI_Win_free(&win_parentsRow);
}
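
// Note: maximumMatching below picks between the two augmentation kernels by
// the number of augmenting paths a phase discovered: with few matched columns
// (fewer than 2*p^2 for p processes) the path-per-process AugmentPath is
// preferred; otherwise the level-synchronous AugmentLevel amortizes better.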


// Maximum cardinality matching.
// Output: mateRow2Col and mateCol2Row.
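//
// The algorithm runs in phases: each phase grows vertex-disjoint alternating
// BFS trees from all currently unmatched columns (the SpMV below advances one
// tree level at a time), records every unmatched row it reaches in `leaves`,
// and then augments the matching along the discovered paths. Phases repeat
// until a phase matches no new column.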
template <typename IT, typename NT, typename DER>
void maximumMatching(SpParMat < IT, NT, DER > & A, FullyDistVec<IT, IT>& mateRow2Col,
                     FullyDistVec<IT, IT>& mateCol2Row, bool prune=true, bool randMM = false, bool maximizeWeight = false)
{
    static MTRand GlobalMT(123);    // for reproducible results
    typedef VertexTypeMM <IT> VertexType;

    int nthreads=1;
#ifdef THREADED
#pragma omp parallel
    {
        nthreads = omp_get_num_threads();
    }
#endif
    PreAllocatedSPA<VertexType> SPA(A.seq(), nthreads*4);

    double tstart = MPI_Wtime();
    int nprocs, myrank;
    MPI_Comm comm = A.getcommgrid()->GetWorld();
    MPI_Comm_size(comm, &nprocs);
    MPI_Comm_rank(comm, &myrank);

    IT nrow = A.getnrow();
    IT ncol = A.getncol();

    FullyDistSpVec<IT, VertexType> fringeRow(A.getcommgrid(), nrow);
    FullyDistSpVec<IT, IT> umFringeRow(A.getcommgrid(), nrow);
    FullyDistVec<IT, IT> leaves ( A.getcommgrid(), ncol, (IT) -1);

    std::vector<std::vector<double> > timing;
    std::vector<int> layers;
    std::vector<int64_t> phaseMatched;
    double t1, time_search, time_augment, time_phase;

    bool matched = true;
    int phase = 0;
    int totalLayer = 0;
    IT numUnmatchedCol;

    MPI_Win winLeaves;
    MPI_Win_create((IT*)leaves.GetLocArr(), leaves.LocArrSize() * sizeof(IT), sizeof(IT), MPI_INFO_NULL, A.getcommgrid()->GetWorld(), &winLeaves);

    while(matched)
    {
        time_phase = MPI_Wtime();

        std::vector<double> phase_timing(8,0);
        leaves.Apply ( [](IT val){return (IT) -1;});
        FullyDistVec<IT, IT> parentsRow ( A.getcommgrid(), nrow, (IT) -1);
        FullyDistSpVec<IT, VertexType> fringeCol(A.getcommgrid(), ncol);
        // the fringe starts from all currently unmatched columns
        fringeCol = EWiseApply<VertexType>(fringeCol, mateCol2Row,
                                           [](VertexType vtx, IT mate){return vtx;},
                                           [](VertexType vtx, IT mate){return mate==-1;},
                                           true, VertexType());

        if(randMM)  // break ties by a random value per root
        {
            fringeCol.ApplyInd([](VertexType vtx, IT idx){return VertexType(idx,idx,GlobalMT.rand());});
        }
        else
        {
            fringeCol.ApplyInd([](VertexType vtx, IT idx){return VertexType(idx,idx);});
        }

        ++phase;
        numUnmatchedCol = fringeCol.getnnz();
        int layer = 0;

        time_search = MPI_Wtime();
        while(fringeCol.getnnz() > 0)
        {
            layer++;
            t1 = MPI_Wtime();

            // advance one BFS level: each fringe column passes its vertex
            // info to its row neighbors. TODO: think about this semiring
            if(maximizeWeight)
                SpMV<WeightMaxMMSR<NT, VertexType>>(A, fringeCol, fringeRow, false, SPA);
            else
                SpMV<Select2ndMinSR<NT, VertexType>>(A, fringeCol, fringeRow, false, SPA);
            phase_timing[0] += MPI_Wtime()-t1;

            // remove vertices that already have parents
            t1 = MPI_Wtime();
            fringeRow = EWiseApply<VertexType>(fringeRow, parentsRow,
                                               [](VertexType vtx, IT parent){return vtx;},
                                               [](VertexType vtx, IT parent){return parent==-1;},
                                               false, VertexType());

            // set parent pointers
            parentsRow.EWiseApply(fringeRow,
                                  [](IT dval, VertexType svtx){return svtx.parent;},
                                  [](IT dval, VertexType svtx){return true;},
                                  false, VertexType());

            // unmatched rows in the fringe terminate augmenting paths
            umFringeRow = EWiseApply<IT>(fringeRow, mateRow2Col,
                                         [](VertexType vtx, IT mate){return vtx.root;},
                                         [](VertexType vtx, IT mate){return mate==-1;},
                                         false, VertexType());

            phase_timing[1] += MPI_Wtime()-t1;

            IT nnz_umFringeRow = umFringeRow.getnnz();  // careful about this timing

            t1 = MPI_Wtime();
            if(nnz_umFringeRow > 0)
            {
                /*
                if(nnz_umFringeRow < 25*nprocs)
                {
                    leaves.GSet(umFringeRow,
                                [](IT valRoot, IT idxLeaf){return valRoot;},
                                [](IT valRoot, IT idxLeaf){return idxLeaf;},
                                winLeaves);
                    // There might be a bug here: it does not return the same output
                    // for different numbers of processes, e.g., with the g7jac200sc.mtx matrix
                }
                else*/
                {
                    FullyDistSpVec<IT, IT> temp1(A.getcommgrid(), ncol);
                    temp1 = umFringeRow.Invert(ncol);
                    leaves.Set(temp1);  // leaves[root] = the unmatched row reached from that root
                }
            }

            phase_timing[2] += MPI_Wtime()-t1;

            // matched row vertices in the fringe
            fringeRow = EWiseApply<VertexType>(fringeRow, mateRow2Col,
                                               [](VertexType vtx, IT mate){return VertexType(mate, vtx.root);},
                                               [](VertexType vtx, IT mate){return mate!=-1;},
                                               false, VertexType());

            t1 = MPI_Wtime();
            if(nnz_umFringeRow>0 && prune)
            {
                // drop fringe rows whose tree has already found an unmatched row
                fringeRow.FilterByVal (umFringeRow,[](VertexType vtx){return vtx.root;}, false);
            }
            double tprune = MPI_Wtime()-t1;
            phase_timing[3] += tprune;

            // go from a matched row to its matched column; the parent is automatically set to itself
            t1 = MPI_Wtime();
            fringeCol = fringeRow.Invert(ncol,
                                         [](VertexType& vtx, const IT & index){return vtx.parent;},
                                         [](VertexType& vtx, const IT & index){return vtx;},
                                         [](VertexType& vtx1, VertexType& vtx2){return vtx1;});
            phase_timing[4] += MPI_Wtime()-t1;
        }
        time_search = MPI_Wtime() - time_search;
        phase_timing[5] += time_search;

        IT numMatchedCol = leaves.Count([](IT leaf){return leaf!=-1;});
        phaseMatched.push_back(numMatchedCol);
        time_augment = MPI_Wtime();
        if (numMatchedCol == 0) matched = false;
        else
        {
            // few augmenting paths: one path per process; many paths: level-synchronous
            if(numMatchedCol < (2 * nprocs * nprocs))
                AugmentPath(mateRow2Col, mateCol2Row, parentsRow, leaves);
            else
                AugmentLevel(mateRow2Col, mateCol2Row, parentsRow, leaves);
        }
        time_augment = MPI_Wtime() - time_augment;
        phase_timing[6] += time_augment;

        time_phase = MPI_Wtime() - time_phase;
        phase_timing[7] += time_phase;
        timing.push_back(phase_timing);
        totalLayer += layer;
        layers.push_back(layer);
    }

    MPI_Win_free(&winLeaves);
    //isMaximalmatching(A, mateRow2Col, mateCol2Row, unmatchedRow, unmatchedCol);
    //isMatching(mateCol2Row, mateRow2Col); //TODO: there is a better way to check this

    // print statistics
    double combTime = 0;

#ifdef TIMING
    if(myrank == 0)
    {
        std::cout << "****** maximum matching runtime ********\n";
        std::cout << std::endl;
        std::cout << "========================================================================\n";
        std::cout << "                              BFS Search                                \n";
        std::cout << "========================================================================\n";
        std::cout << "Phase Layer    Match   SpMV EWOpp CmUqL  Prun  CmMC   BFS   Aug  Total\n";
        std::cout << "========================================================================\n";

        std::vector<double> totalTimes(timing[0].size(),0);
        int nphases = timing.size();
        for(int i=0; i<timing.size(); i++)
        {
            printf(" %3d  %3d  %8lld  ", i+1, layers[i], phaseMatched[i]);
            for(int j=0; j<timing[i].size(); j++)
            {
                totalTimes[j] += timing[i][j];
                printf("%.2lf  ", timing[i][j]);
            }
            printf("\n");
        }

        std::cout << "------------------------------------------------------------------------\n";
        std::cout << "Phase Layer    UnMat   SpMV EWOpp CmUqL  Prun  CmMC   BFS   Aug  Total\n";
        std::cout << "------------------------------------------------------------------------\n";

        combTime = totalTimes.back();
        printf(" %3d  %3d  %8lld  ", nphases, totalLayer/nphases, numUnmatchedCol);
        for(int j=0; j<totalTimes.size()-1; j++)
        {
            printf("%.2lf  ", totalTimes[j]);
        }
        printf("%.2lf\n", combTime);
    }
#endif

    IT nrows = A.getnrow();
    IT matchedRow = mateRow2Col.Count([](IT mate){return mate!=-1;});
#ifdef DETAIL_STATS
    if(myrank==0)
    {
        std::cout << "***Final Maximum Matching***\n";
        std::cout << "***Total-Rows Matched-Rows Total-Time***\n";
        printf("%lld %lld %lf \n", nrows, matchedRow, combTime);
        printf("matched rows: %lld , which is: %lf percent \n", matchedRow, 100*(double)matchedRow/(nrows));
        std::cout << "-------------------------------------------------------\n\n";
    }
#endif
}
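
// A minimal driver sketch (assumptions: MPI is already initialized, and
// "input.mtx" names a Matrix Market file; identifiers here are illustrative):
//
//   typedef SpParMat<int64_t, double, SpDCCols<int64_t, double>> Par_DCSC_Double;
//   std::shared_ptr<CommGrid> fullWorld(new CommGrid(MPI_COMM_WORLD, 0, 0));
//   Par_DCSC_Double A(fullWorld);
//   A.ParallelReadMM("input.mtx", true, maximum<double>());
//   FullyDistVec<int64_t, int64_t> mateRow2Col(A.getcommgrid(), A.getnrow(), (int64_t) -1);
//   FullyDistVec<int64_t, int64_t> mateCol2Row(A.getcommgrid(), A.getncol(), (int64_t) -1);
//   maximumMatching(A, mateRow2Col, mateCol2Row);   // prune=true, randMM=false by default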

} // namespace combblas

#endif