#ifndef ApproxWeightPerfectMatching_h
#define ApproxWeightPerfectMatching_h

#include "CombBLAS/CombBLAS.h"

#include <parallel/algorithm>
#include <parallel/numeric>
#include <algorithm>
#include <numeric>
#include <vector>
#include <tuple>
#include <limits>
#include <iostream>
#include <omp.h>
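
// Bundle of process-grid geometry passed to the phase routines below.
// The original definition is elided in this listing; this sketch is
// reconstructed from how the fields are used further down and may differ
// from the source in member order or in extra members.
template <class IT>
struct AWPM_param
{
    int nprocs;                             // number of processes in the grid
    int myrank;                             // rank in the world communicator
    int pr, pc;                             // process-grid rows and columns
    IT lnrow, lncol;                        // local numbers of rows and columns
    IT m_perproc, n_perproc;                // rows per process row, columns per process column
    IT localRowStart, localColStart;        // global indices of the first local row / column
    std::shared_ptr<CommGrid> commGrid;     // the 2D process grid
};

// ExchangeData: personalized all-to-all exchange of (IT,IT,NT) tuples.
// tempTuples[p] holds the tuples destined for process p; the input buffers are
// released as they are packed, and the tuples received from all processes are
// returned in a single vector.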
template <class IT, class NT>
std::vector<std::tuple<IT,IT,NT>> ExchangeData(std::vector<std::vector<std::tuple<IT,IT,NT>>> & tempTuples, MPI_Comm World)
{
    // create/commit an MPI datatype that moves one tuple as raw bytes
    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    int nprocs;
    MPI_Comm_size(World, &nprocs);

    int * sendcnt = new int[nprocs];
    int * recvcnt = new int[nprocs];
    int * sdispls = new int[nprocs]();
    int * rdispls = new int[nprocs]();

    IT totsend = 0;
    for(int i=0; i<nprocs; ++i)
    {
        sendcnt[i] = tempTuples[i].size();
        totsend += tempTuples[i].size();
    }

    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);

    std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
    std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
    IT totrecv = std::accumulate(recvcnt, recvcnt+nprocs, static_cast<IT>(0));

    std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
    for(int i=0; i<nprocs; ++i)
    {
        std::copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
        std::vector< std::tuple<IT,IT,NT> >().swap(tempTuples[i]);  // release the per-destination buffer
    }
    std::vector< std::tuple<IT,IT,NT> > recvTuples(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
    DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
    MPI_Type_free(&MPI_tuple);
    return recvTuples;
}
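
// ExchangeData1: same personalized all-to-all exchange as ExchangeData, but
// for 4-tuples (IT,IT,IT,NT).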
template <class IT, class NT>
std::vector<std::tuple<IT,IT,IT,NT>> ExchangeData1(std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> & tempTuples, MPI_Comm World)
{
    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    int nprocs;
    MPI_Comm_size(World, &nprocs);

    int * sendcnt = new int[nprocs];
    int * recvcnt = new int[nprocs];
    int * sdispls = new int[nprocs]();
    int * rdispls = new int[nprocs]();

    IT totsend = 0;
    for(int i=0; i<nprocs; ++i)
    {
        sendcnt[i] = tempTuples[i].size();
        totsend += tempTuples[i].size();
    }

    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);

    std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
    std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
    IT totrecv = std::accumulate(recvcnt, recvcnt+nprocs, static_cast<IT>(0));

    std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
    for(int i=0; i<nprocs; ++i)
    {
        std::copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
        std::vector< std::tuple<IT,IT,IT,NT> >().swap(tempTuples[i]);   // release the per-destination buffer
    }
    std::vector< std::tuple<IT,IT,IT,NT> > recvTuples(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
    DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
    MPI_Type_free(&MPI_tuple);
    return recvTuples;
}
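
// OwnerProcs: rank of the process that owns entry (grow, gcol) of an
// nrows x ncols matrix distributed on A's 2D process grid.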
template <class IT, class NT, class DER>
int OwnerProcs(SpParMat < IT, NT, DER > & A, IT grow, IT gcol, IT nrows, IT ncols)
{
    auto commGrid = A.getcommgrid();
    int procrows = commGrid->GetGridRows();
    int proccols = commGrid->GetGridCols();

    IT m_perproc = nrows / procrows;
    IT n_perproc = ncols / proccols;

    int pr, pc;
    if(m_perproc != 0)
        pr = std::min(static_cast<int>(grow / m_perproc), procrows-1);
    else    // all rows fall on the last processor row
        pr = procrows-1;
    if(n_perproc != 0)
        pc = std::min(static_cast<int>(gcol / n_perproc), proccols-1);
    else    // all columns fall on the last processor column
        pc = proccols-1;
    return commGrid->GetRank(pr, pc);
}
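
// MateBcast: allgather the (index, mate) pairs contributed by every process of
// the given row or column communicator, so that each member ends up with the
// complete list of updated mate entries for its processor row/column.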
template <class IT>
std::vector<std::tuple<IT,IT>> MateBcast(std::vector<std::tuple<IT,IT>> sendTuples, MPI_Comm World)
{
    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    int nprocs;
    MPI_Comm_size(World, &nprocs);

    int * recvcnt = new int[nprocs];
    int * rdispls = new int[nprocs]();
    int sendcnt = sendTuples.size();

    MPI_Allgather(&sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);

    std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
    IT totrecv = std::accumulate(recvcnt, recvcnt+nprocs, static_cast<IT>(0));
    std::vector< std::tuple<IT,IT> > recvTuples(totrecv);

    MPI_Allgatherv(sendTuples.data(), sendcnt, MPI_tuple,
                   recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);

    DeleteAll(recvcnt, rdispls);
    MPI_Type_free(&MPI_tuple);
    return recvTuples;
}
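
// ReplicateMateWeights: recompute the weights of the currently matched entries.
// For every local column j whose mate row mj is also stored locally, the value
// A(mj, j) is recorded; summing the partial results over the processor column
// and row then replicates the weight vectors RepMateWC2R and RepMateWR2C.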
template <class IT, class NT>
void ReplicateMateWeights( const AWPM_param<IT>& param, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, std::vector<IT>& RepMateC2R, std::vector<NT>& RepMateWR2C, std::vector<NT>& RepMateWC2R)
{
    std::fill(RepMateWC2R.begin(), RepMateWC2R.end(), static_cast<NT>(0));
    std::fill(RepMateWR2C.begin(), RepMateWR2C.end(), static_cast<NT>(0));

#pragma omp parallel for
    for(int k=0; k<param.lncol; ++k)
    {
        IT lj = k;                  // local column index
        IT mj = RepMateC2R[lj];     // (global) row matched to this column

        // only the process that also stores row mj records the matched weight
        if(mj >= param.localRowStart && mj < (param.localRowStart+param.lnrow) )
        {
            for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
            {
                IT li = dcsc->ir[cp];
                IT i = li + param.localRowStart;
                if(i == mj)         // this nonzero is the matched entry A(mj, j)
                {
                    RepMateWC2R[lj] = dcsc->numx[cp];
                    RepMateWR2C[mj-param.localRowStart] = dcsc->numx[cp];
                }
            }
        }
    }

    MPI_Comm ColWorld = param.commGrid->GetColWorld();
    MPI_Comm RowWorld = param.commGrid->GetRowWorld();

    // only one process per column/row contributed a nonzero weight, so summing replicates
    MPI_Allreduce(MPI_IN_PLACE, RepMateWC2R.data(), RepMateWC2R.size(), MPIType<NT>(), MPI_SUM, ColWorld);
    MPI_Allreduce(MPI_IN_PLACE, RepMateWR2C.data(), RepMateWR2C.size(), MPIType<NT>(), MPI_SUM, RowWorld);
}
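
// Trace: weight of the natural (diagonal) matching. Returns the sum of the
// diagonal entries of A and reports the number of nonzero diagonal entries in
// rettrnnz, so the caller can tell whether the natural matching is perfect.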
template <class IT, class NT, class DER>
NT Trace( SpParMat < IT, NT, DER > & A, IT& rettrnnz)
{
    IT nrows = A.getnrow();
    IT ncols = A.getncol();
    auto commGrid = A.getcommgrid();
    MPI_Comm World = commGrid->GetWorld();
    int myrank = commGrid->GetRank();
    int pr = commGrid->GetGridRows();
    int pc = commGrid->GetGridCols();

    int rowrank = commGrid->GetRankInProcRow();
    int colrank = commGrid->GetRankInProcCol();
    IT m_perproc = nrows / pr;
    IT n_perproc = ncols / pc;
    DER* spSeq = A.seqptr();
    IT localRowStart = colrank * m_perproc;
    IT localColStart = rowrank * n_perproc;

    IT trnnz = 0;                   // local count of nonzero diagonal entries
    NT trace = static_cast<NT>(0);  // local sum of diagonal entries
    for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
    {
        IT lj = colit.colid();
        IT j = lj + localColStart;

        for(auto nzit = spSeq->begnz(colit); nzit < spSeq->endnz(colit); ++nzit)
        {
            IT li = nzit.rowid();
            IT i = li + localRowStart;
            if(i == j)              // diagonal entry
            {
                trnnz++;
                trace += nzit.value();
            }
        }
    }
    MPI_Allreduce(MPI_IN_PLACE, &trnnz, 1, MPIType<IT>(), MPI_SUM, World);
    MPI_Allreduce(MPI_IN_PLACE, &trace, 1, MPIType<NT>(), MPI_SUM, World);
    rettrnnz = trnnz;
    return trace;
}
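
// MatchingWeight: total weight of the current matching, computed from the
// column-replicated matched weights and reduced over a processor row; the
// smallest matched weight is returned through minw.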
template <class NT>
NT MatchingWeight( std::vector<NT>& RepMateWC2R, MPI_Comm RowWorld, NT& minw)
{
    NT w = static_cast<NT>(0);
    minw = 99999999999999.0;
    for(int i=0; i<RepMateWC2R.size(); i++)
    {
        w += RepMateWC2R[i];
        minw = std::min(minw, RepMateWC2R[i]);
    }
    // the column weights are replicated along processor columns, so reducing
    // over one processor row yields the global matching weight
    MPI_Allreduce(MPI_IN_PLACE, &w, 1, MPIType<NT>(), MPI_SUM, RowWorld);
    MPI_Allreduce(MPI_IN_PLACE, &minw, 1, MPIType<NT>(), MPI_MIN, RowWorld);
    return w;
}
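
// UpdateMatching: write the replicated mate vectors back into the distributed
// FullyDistVec objects at the end of the algorithm.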
template <class IT>
void UpdateMatching(FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, std::vector<IT>& RepMateR2C, std::vector<IT>& RepMateC2R)
{
    auto commGrid = mateRow2Col.getcommgrid();
    MPI_Comm RowWorld = commGrid->GetRowWorld();
    int rowroot = commGrid->GetDiagOfProcRow();
    int pc = commGrid->GetGridCols();

    // mateRow2Col: every process already holds the row-replicated mates,
    // so it can update its local slice directly
    IT localLenR2C = mateRow2Col.LocArrSize();
    for(IT i=0, j = mateRow2Col.RowLenUntil(); i<localLenR2C; i++, j++)
    {
        mateRow2Col.SetLocalElement(i, RepMateR2C[j]);
    }

    // mateCol2Row: scatter the column-replicated mates from the diagonal
    // process of each processor row to the owners of the local slices
    std::vector<int> sendcnts(pc);
    std::vector<int> dpls(pc);
    for(int i=1; i<pc; i++)
    {
        dpls[i] = mateCol2Row.RowLenUntil(i);
        sendcnts[i-1] = dpls[i] - dpls[i-1];
    }
    sendcnts[pc-1] = RepMateC2R.size() - dpls[pc-1];

    IT localLenC2R = mateCol2Row.LocArrSize();
    IT* localC2R = (IT*) mateCol2Row.GetLocArr();
    MPI_Scatterv(RepMateC2R.data(), sendcnts.data(), dpls.data(), MPIType<IT>(), localC2R, localLenC2R, MPIType<IT>(), rowroot, RowWorld);
}
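
// ThreadBuffLenForBinning: per-thread, per-destination staging-buffer length,
// chosen so that nbins buffers of itemsize-byte items fit in the L2 cache.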
#define L2_CACHE_SIZE 256000
int ThreadBuffLenForBinning(int itemsize, int nbins)
{
    // halve the per-destination buffer length until all nbins per-thread
    // staging buffers fit in the L2 cache
    int THREAD_BUF_LEN = 256;
    int bufferMem = THREAD_BUF_LEN * nbins * itemsize;
    while(bufferMem > L2_CACHE_SIZE && THREAD_BUF_LEN > 2)
    {
        THREAD_BUF_LEN /= 2;
        bufferMem = THREAD_BUF_LEN * nbins * itemsize;
    }
    THREAD_BUF_LEN = std::min(nbins+1, THREAD_BUF_LEN);
    return THREAD_BUF_LEN;
}
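
// Phase 1 of an iteration of the 2/3-approximation: for every local nonzero
// A(i,j), propose replacing the matched edges (i,mi) and (mj,j); the tuple
// (mi, mj, w) with w = A(i,j) - W(i,mi) - W(mj,j) is sent to the owner of
// entry (mj, mi), which can check whether that entry exists. Counting and
// packing are done in two thread-parallel passes over the local columns.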
template <class IT, class NT>
std::vector< std::tuple<IT,IT,NT> > Phase1(const AWPM_param<IT>& param, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
{
    double tstart = MPI_Wtime();

    MPI_Comm World = param.commGrid->GetWorld();

    // ---- pass 1: count how many candidate tuples go to every destination process ----
    std::vector<int> sendcnt(param.nprocs, 0);
#pragma omp parallel
    {
        std::vector<int> tsendcnt(param.nprocs, 0);
#pragma omp for
        for(int k=0; k<param.lncol; ++k)
        {
            IT mj = RepMateC2R[k];              // row currently matched to column j
            IT j = k + param.localColStart;

            for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
            {
                IT li = dcsc->ir[cp];
                IT i = li + param.localRowStart;
                IT mi = RepMateR2C[li];         // column currently matched to row i

                // the proposal for nonzero A(i,j) goes to the owner of entry (mj, mi)
                int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
                int owner = param.commGrid->GetRank(rrank, crank);
                tsendcnt[owner]++;
            }
        }
        for(int i=0; i<param.nprocs; i++)
        {
            __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);
        }
    }

    IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
    std::vector<int> sdispls(param.nprocs, 0);
    std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);

    // ---- pass 2: fill the send buffer through per-thread staging buffers ----
    std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
    std::vector<int> transferCount(param.nprocs, 0);
    int THREAD_BUF_LEN = ThreadBuffLenForBinning(sizeof(std::tuple<IT,IT,NT>), param.nprocs);
#pragma omp parallel
    {
        std::vector<int> tsendcnt(param.nprocs, 0);
        std::vector<std::tuple<IT,IT,NT>> tsendTuples(param.nprocs*THREAD_BUF_LEN);
#pragma omp for
        for(int k=0; k<param.lncol; ++k)
        {
            IT lj = k;
            IT mj = RepMateC2R[lj];
            IT j = k + param.localColStart;

            for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
            {
                IT li = dcsc->ir[cp];
                IT i = li + param.localRowStart;
                IT mi = RepMateR2C[li];

                // gain of pairing (i,j) relative to the two matched edges touching i and j
                double w = dcsc->numx[cp] - RepMateWR2C[li] - RepMateWC2R[lj];
                int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
                int owner = param.commGrid->GetRank(rrank, crank);

                if (tsendcnt[owner] < THREAD_BUF_LEN)
                {
                    tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mi, mj, w);
                    tsendcnt[owner]++;
                }
                else    // thread buffer for this destination is full: flush it to the shared send buffer
                {
                    int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
                    std::copy(tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1), sendTuples.data() + sdispls[owner] + tt);
                    tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mi, mj, w);
                    tsendcnt[owner] = 1;
                }
            }
        }
        // flush the partially filled thread buffers
        for(int owner=0; owner < param.nprocs; owner++)
        {
            if (tsendcnt[owner] > 0)
            {
                int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
                std::copy(tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner] + tt);
            }
        }
    }
    double t1Comp = MPI_Wtime() - tstart;
    tstart = MPI_Wtime();

    // ---- exchange the candidate tuples ----
    std::vector<int> recvcnt(param.nprocs);
    std::vector<int> rdispls(param.nprocs, 0);

    MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
    std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
    IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));

    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    std::vector< std::tuple<IT,IT,NT> > recvTuples1(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
    MPI_Type_free(&MPI_tuple);
    double t1Comm = MPI_Wtime() - tstart;   // t1Comp / t1Comm: local timings (detailed reporting elided)
    return recvTuples1;
}
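
// Phase 2: sort the received proposals and merge-join them against the local
// columns to test whether the complementary entry A(mj, mi) exists. For every
// hit, the candidate (mj, mi, i, cw) is forwarded to the owner of entry
// (mj, j), which arbitrates between the candidates competing for the matched
// edge (mj, j).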
template <class IT, class NT>
std::vector< std::tuple<IT,IT,IT,NT> > Phase2(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
{
    MPI_Comm World = param.commGrid->GetWorld();
    double tstart = MPI_Wtime();

    // sort by (mi, mj) so that each local column can be merge-joined against the proposals
    __gnu_parallel::sort(recvTuples.begin(), recvTuples.end());

    std::vector<int> sendcnt(param.nprocs, 0);

    int nBins = 1;
#pragma omp parallel
    {
#pragma omp single
        nBins = omp_get_num_threads() * 4;
    }
    int THREAD_BUF_LEN = ThreadBuffLenForBinning(sizeof(std::tuple<IT,IT,IT,NT>), param.nprocs);
    IT nTuples = static_cast<IT>(recvTuples.size());    // guard: keep the merge-join cursors inside recvTuples

    // ---- pass 1: count the tuples to be sent to every destination ----
#pragma omp parallel for
    for(int i=0; i<nBins; i++)
    {
        int perBin = recvTuples.size()/nBins;
        int startBinIndex = perBin * i;
        int endBinIndex = perBin * (i+1);
        if(i==nBins-1) endBinIndex = recvTuples.size();

        std::vector<int> tsendcnt(param.nprocs, 0);
        for(int k=startBinIndex; k<endBinIndex;)
        {
            IT mi = std::get<0>(recvTuples[k]);
            IT lcol = mi - param.localColStart;
            IT i = RepMateC2R[lcol];    // row currently matched to column mi

            IT idx1 = k;                // cursor over the proposals for column mi
            IT idx2 = colptr[lcol];     // cursor over the nonzeros of local column mi
            for(; idx1 < nTuples && std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];)
            {
                IT mj = std::get<1>(recvTuples[idx1]);
                IT lrow = mj - param.localRowStart;
                IT j = RepMateR2C[lrow];
                IT lrowMat = dcsc->ir[idx2];

                if(lrowMat == lrow)     // the complementary entry A(mj, mi) exists
                {
                    NT weight = std::get<2>(recvTuples[idx1]);
                    NT cw = weight + RepMateWR2C[lrow];

                    int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                    int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
                    int owner = param.commGrid->GetRank(rrank, crank);
                    tsendcnt[owner]++;
                    idx1++; idx2++;
                }
                else if(lrowMat > lrow)
                    idx1++;             // no such nonzero: drop this proposal
                else
                    idx2++;             // advance in the matrix column
            }
            // skip any remaining proposals for the same column mi, then move to the next group
            for(; idx1 < nTuples && std::get<0>(recvTuples[idx1]) == mi; idx1++);
            k = idx1;
        }
        for(int i=0; i<param.nprocs; i++)
        {
            __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);
        }
    }

    IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
    std::vector<int> sdispls(param.nprocs, 0);
    std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);

    std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
    std::vector<int> transferCount(param.nprocs, 0);

    // ---- pass 2: same traversal, now writing tuples through per-thread staging buffers ----
#pragma omp parallel for
    for(int i=0; i<nBins; i++)
    {
        int perBin = recvTuples.size()/nBins;
        int startBinIndex = perBin * i;
        int endBinIndex = perBin * (i+1);
        if(i==nBins-1) endBinIndex = recvTuples.size();

        std::vector<int> tsendcnt(param.nprocs, 0);
        std::vector<std::tuple<IT,IT,IT,NT>> tsendTuples(param.nprocs*THREAD_BUF_LEN);
        for(int k=startBinIndex; k<endBinIndex;)
        {
            IT mi = std::get<0>(recvTuples[k]);
            IT lcol = mi - param.localColStart;
            IT i = RepMateC2R[lcol];

            IT idx1 = k;
            IT idx2 = colptr[lcol];
            for(; idx1 < nTuples && std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];)
            {
                IT mj = std::get<1>(recvTuples[idx1]);
                IT lrow = mj - param.localRowStart;
                IT j = RepMateR2C[lrow];
                IT lrowMat = dcsc->ir[idx2];

                if(lrowMat == lrow)
                {
                    NT weight = std::get<2>(recvTuples[idx1]);
                    NT cw = weight + RepMateWR2C[lrow];

                    int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                    int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
                    int owner = param.commGrid->GetRank(rrank, crank);

                    if (tsendcnt[owner] < THREAD_BUF_LEN)
                    {
                        tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mj, mi, i, cw);
                        tsendcnt[owner]++;
                    }
                    else    // flush the full thread buffer for this destination
                    {
                        int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
                        std::copy(tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1), sendTuples.data() + sdispls[owner] + tt);
                        tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mj, mi, i, cw);
                        tsendcnt[owner] = 1;
                    }
                    idx1++; idx2++;
                }
                else if(lrowMat > lrow)
                    idx1++;
                else
                    idx2++;
            }
            for(; idx1 < nTuples && std::get<0>(recvTuples[idx1]) == mi; idx1++);
            k = idx1;
        }
        // flush the partially filled thread buffers
        for(int owner=0; owner < param.nprocs; owner++)
        {
            if (tsendcnt[owner] > 0)
            {
                int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
                std::copy(tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner] + tt);
            }
        }
    }

    double t2Comp = MPI_Wtime() - tstart;
    tstart = MPI_Wtime();

    // ---- exchange ----
    std::vector<int> recvcnt(param.nprocs);
    std::vector<int> rdispls(param.nprocs, 0);

    MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
    std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
    IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));

    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    std::vector< std::tuple<IT,IT,IT,NT> > recvTuples1(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
    MPI_Type_free(&MPI_tuple);
    double t2Comm = MPI_Wtime() - tstart;   // t2Comp / t2Comm: local timings (detailed reporting elided)
    return recvTuples1;
}
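
// Phase2_old: earlier, simpler implementation of Phase 2 that looks every
// candidate up with std::find instead of sorting and merge-joining; kept for
// reference.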
template <class IT, class NT>
std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> Phase2_old(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
{
    std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> tempTuples1(param.nprocs);
    for(int k=0; k<recvTuples.size(); ++k)
    {
        IT mi = std::get<0>(recvTuples[k]);
        IT mj = std::get<1>(recvTuples[k]);
        IT i = RepMateC2R[mi - param.localColStart];
        NT weight = std::get<2>(recvTuples[k]);

        if(colptr[mi - param.localColStart+1] > colptr[mi - param.localColStart])  // the local column is not empty
        {
            // look for the complementary nonzero A(mj, mi) in local column mi
            IT * ele = std::find(dcsc->ir+colptr[mi - param.localColStart], dcsc->ir+colptr[mi - param.localColStart+1], mj - param.localRowStart);
            if (ele != dcsc->ir+colptr[mi - param.localColStart+1])
            {
                NT cw = weight + RepMateWR2C[mj - param.localRowStart]; // cumulative weight of the candidate swap
                IT j = RepMateR2C[mj - param.localRowStart];
                int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
                int owner = param.commGrid->GetRank(rrank, crank);
                tempTuples1[owner].push_back(std::make_tuple(mj, mi, i, cw));
            }
        }
    }
    return tempTuples1;
}
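
// TwoThirdApprox: iteratively improve a perfect matching by weight-increasing
// swaps of pairs of matched edges ((i,mi),(mj,j)) -> ((i,j),(mj,mi)). Each
// iteration generates candidates (Phase1), validates them (Phase2), picks the
// best candidate per column and resolves conflicts (phases 3-4), broadcasts the
// winning swaps along processor rows and columns (phase 5), and refreshes the
// replicated mate and weight vectors. Iteration stops when the total weight no
// longer increases, or after 10 iterations.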
template <class IT, class NT, class DER>
void TwoThirdApprox(SpParMat < IT, NT, DER > & A, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row)
{
    // process-grid bookkeeping
    auto commGrid = A.getcommgrid();
    int myrank = commGrid->GetRank();
    MPI_Comm World = commGrid->GetWorld();
    MPI_Comm ColWorld = commGrid->GetColWorld();
    MPI_Comm RowWorld = commGrid->GetRowWorld();
    int nprocs = commGrid->GetSize();
    int pr = commGrid->GetGridRows();
    int pc = commGrid->GetGridCols();
    int rowrank = commGrid->GetRankInProcRow();
    int colrank = commGrid->GetRankInProcCol();
    int diagneigh = commGrid->GetComplementRank();

    IT nrows = A.getnrow();
    IT ncols = A.getncol();
    IT m_perproc = nrows / pr;
    IT n_perproc = ncols / pc;
    DER* spSeq = A.seqptr();
    Dcsc<IT, NT>* dcsc = spSeq->GetDCSC();
    IT lnrow = spSeq->getnrow();
    IT lncol = spSeq->getncol();
    IT localRowStart = colrank * m_perproc;
    IT localColStart = rowrank * n_perproc;

    AWPM_param<IT> param;
    param.nprocs = nprocs;
    param.pr = pr;
    param.pc = pc;
    param.lnrow = lnrow;
    param.lncol = lncol;
    param.m_perproc = m_perproc;
    param.n_perproc = n_perproc;
    param.localRowStart = localRowStart;
    param.localColStart = localColStart;
    param.myrank = myrank;
    param.commGrid = commGrid;

    double tPhase1 = 0, tPhase2 = 0, tPhase3 = 0, tPhase4 = 0, tPhase5 = 0, tUpdate = 0;

    // ---- replicate mateCol2Row along processor columns (RepMateC2R) ----
    // first swap local slices with the transpose (complement) process, then
    // allgather within the processor column
    MPI_Status status;
    int xsize = (int) mateCol2Row.LocArrSize();
    int trxsize = 0;
    MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh, TRX, &trxsize, 1, MPI_INT, diagneigh, TRX, World, &status);
    std::vector<IT> trxnums(trxsize);
    MPI_Sendrecv(mateCol2Row.GetLocArr(), xsize, MPIType<IT>(), diagneigh, TRX, trxnums.data(), trxsize, MPIType<IT>(), diagneigh, TRX, World, &status);

    std::vector<int> colsize(pc);
    colsize[colrank] = trxsize;
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize.data(), 1, MPI_INT, ColWorld);
    std::vector<int> dpls(pc, 0);
    std::partial_sum(colsize.data(), colsize.data()+pc-1, dpls.data()+1);
    int accsize = std::accumulate(colsize.data(), colsize.data()+pc, 0);
    std::vector<IT> RepMateC2R(accsize);
    MPI_Allgatherv(trxnums.data(), trxsize, MPIType<IT>(), RepMateC2R.data(), colsize.data(), dpls.data(), MPIType<IT>(), ColWorld);

    // ---- replicate mateRow2Col along processor rows (RepMateR2C) ----
    xsize = (int) mateRow2Col.LocArrSize();

    std::vector<int> rowsize(pr);
    rowsize[rowrank] = xsize;
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, rowsize.data(), 1, MPI_INT, RowWorld);
    std::vector<int> rdpls(pr, 0);
    std::partial_sum(rowsize.data(), rowsize.data()+pr-1, rdpls.data()+1);
    accsize = std::accumulate(rowsize.data(), rowsize.data()+pr, 0);
    std::vector<IT> RepMateR2C(accsize);
    MPI_Allgatherv(mateRow2Col.GetLocArr(), xsize, MPIType<IT>(), RepMateR2C.data(), rowsize.data(), rdpls.data(), MPIType<IT>(), RowWorld);

    // ---- build a CSC-style column pointer over the local DCSC ----
    std::vector<IT> colptr(lncol+1, -1);
    for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
    {
        IT lj = colit.colid();      // local column index
        colptr[lj] = colit.colptr();
    }
    colptr[lncol] = spSeq->getnnz();
    for(IT k=lncol-1; k>=0; k--)
    {
        if(colptr[k] == -1)         // empty column: point to the next column's start
            colptr[k] = colptr[k+1];
    }

    // ---- replicated weights of the currently matched entries ----
    std::vector<NT> RepMateWR2C(lnrow);
    std::vector<NT> RepMateWC2R(lncol);
    ReplicateMateWeights(param, dcsc, colptr, RepMateC2R, RepMateWR2C, RepMateWC2R);

    int iterations = 0;
    NT minw;
    NT weightCur = MatchingWeight(RepMateWC2R, RowWorld, minw);
    NT weightPrev = weightCur - 999999999999;

    // iterate until the total weight stops improving (at most 10 iterations)
    while(weightCur > weightPrev && iterations++ < 10)
    {
        if(myrank==0)
            std::cout << "Iteration " << iterations << ". matching weight: sum = " << weightCur << " min = " << minw << std::endl;

        // Phase 1: propose alternate pairings for every local nonzero
        double tstart = MPI_Wtime();
        std::vector<std::tuple<IT,IT,NT>> recvTuples = Phase1(param, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R);
        tPhase1 += (MPI_Wtime() - tstart);
        tstart = MPI_Wtime();

        // Phase 2: keep the proposals whose complementary entry exists
        std::vector<std::tuple<IT,IT,IT,NT>> recvTuples1 = Phase2(param, recvTuples, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R);
        std::vector< std::tuple<IT,IT,NT> >().swap(recvTuples);
        tPhase2 += (MPI_Wtime() - tstart);
        tstart = MPI_Wtime();

        // Phase 3: per column j, keep only the heaviest candidate that involves
        // the currently matched edge (mj, j)
        std::vector<std::tuple<IT,IT,IT,NT>> bestTuplesPhase3(lncol);
#pragma omp parallel for
        for(int k=0; k<lncol; ++k)
        {
            bestTuplesPhase3[k] = std::make_tuple(-1,-1,-1,0);
        }

        for(int k=0; k<recvTuples1.size(); ++k)
        {
            IT mj = std::get<0>(recvTuples1[k]);
            IT mi = std::get<1>(recvTuples1[k]);
            IT i = std::get<2>(recvTuples1[k]);
            NT weight = std::get<3>(recvTuples1[k]);
            IT j = RepMateR2C[mj - localRowStart];
            IT lj = j - localColStart;

            if( (std::get<0>(bestTuplesPhase3[lj]) == -1) || (weight > std::get<3>(bestTuplesPhase3[lj])) )
            {
                bestTuplesPhase3[lj] = std::make_tuple(i, mi, mj, weight);
            }
        }

        // send each column's best candidate to the owner of entry (i, mi), which
        // arbitrates the competing requests for the matched edge (i, mi)
        std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> tempTuples1(nprocs);
        for(int k=0; k<lncol; ++k)
        {
            if( std::get<0>(bestTuplesPhase3[k]) != -1)
            {
                IT i = std::get<0>(bestTuplesPhase3[k]);
                IT mi = std::get<1>(bestTuplesPhase3[k]);
                IT mj = std::get<2>(bestTuplesPhase3[k]);
                IT j = RepMateR2C[mj - localRowStart];
                NT weight = std::get<3>(bestTuplesPhase3[k]);
                int owner = OwnerProcs(A, i, mi, nrows, ncols);
                tempTuples1[owner].push_back(std::make_tuple(i, j, mj, weight));
            }
        }
        std::vector< std::tuple<IT,IT,IT,NT> >().swap(recvTuples1);
        recvTuples1 = ExchangeData1(tempTuples1, World);
        tPhase3 += (MPI_Wtime() - tstart);
        tstart = MPI_Wtime();

        // Phase 4: per column mi, keep the heaviest request, skipping columns that
        // already selected a candidate in phase 3
        std::vector<std::tuple<IT,IT,IT,IT,NT>> bestTuplesPhase4(lncol);
#pragma omp parallel for
        for(int k=0; k<lncol; ++k)
        {
            bestTuplesPhase4[k] = std::make_tuple(-1,-1,-1,-1,0);
        }

        for(int k=0; k<recvTuples1.size(); ++k)
        {
            IT i = std::get<0>(recvTuples1[k]);
            IT j = std::get<1>(recvTuples1[k]);
            IT mj = std::get<2>(recvTuples1[k]);
            IT mi = RepMateR2C[i-localRowStart];
            NT weight = std::get<3>(recvTuples1[k]);
            IT lmi = mi - localColStart;

            if( ((std::get<0>(bestTuplesPhase4[lmi]) == -1) || (weight > std::get<4>(bestTuplesPhase4[lmi]))) && std::get<0>(bestTuplesPhase3[lmi])==-1 )
            {
                bestTuplesPhase4[lmi] = std::make_tuple(i,j,mi,mj,weight);
            }
        }

        // winners: the swap replaces matched edges (i,mi) and (mj,j) by (i,j) and (mj,mi);
        // notify the processes whose replicated mate vectors must change
        std::vector<std::vector<std::tuple<IT,IT,IT,IT>>> winnerTuples(nprocs);
        for(int k=0; k<lncol; ++k)
        {
            if( std::get<0>(bestTuplesPhase4[k]) != -1)
            {
                IT i = std::get<0>(bestTuplesPhase4[k]);
                IT j = std::get<1>(bestTuplesPhase4[k]);
                IT mi = std::get<2>(bestTuplesPhase4[k]);
                IT mj = std::get<3>(bestTuplesPhase4[k]);

                int owner = OwnerProcs(A, mj, j, nrows, ncols);     // owner of entry (mj, j)
                winnerTuples[owner].push_back(std::make_tuple(i, j, mi, mj));
                owner = OwnerProcs(A, i, mi, nrows, ncols);         // owner of entry (i, mi)
                winnerTuples[owner].push_back(std::make_tuple(mj, mi, j, i));
            }
        }

        std::vector<std::tuple<IT,IT,IT,IT>> recvWinnerTuples = ExchangeData1(winnerTuples, World);
        tPhase4 += (MPI_Wtime() - tstart);
        tstart = MPI_Wtime();

        // Phase 5: broadcast the new (row, mate) and (column, mate) pairs along
        // processor rows and columns
        std::vector<std::tuple<IT,IT>> rowBcastTuples(recvWinnerTuples.size());
        std::vector<std::tuple<IT,IT>> colBcastTuples(recvWinnerTuples.size());
#pragma omp parallel for
        for(int k=0; k<recvWinnerTuples.size(); ++k)
        {
            IT i = std::get<0>(recvWinnerTuples[k]);
            IT j = std::get<1>(recvWinnerTuples[k]);
            IT mi = std::get<2>(recvWinnerTuples[k]);
            IT mj = std::get<3>(recvWinnerTuples[k]);

            colBcastTuples[k] = std::make_tuple(j,i);
            rowBcastTuples[k] = std::make_tuple(mj,mi);
        }
        std::vector<std::tuple<IT,IT>> updatedR2C = MateBcast(rowBcastTuples, RowWorld);
        std::vector<std::tuple<IT,IT>> updatedC2R = MateBcast(colBcastTuples, ColWorld);

        tPhase5 += (MPI_Wtime() - tstart);
        tstart = MPI_Wtime();

        // apply the updates to the replicated mate vectors
#pragma omp parallel for
        for(int k=0; k<updatedR2C.size(); k++)
        {
            IT row = std::get<0>(updatedR2C[k]);
            IT mate = std::get<1>(updatedR2C[k]);
            RepMateR2C[row-localRowStart] = mate;
        }

#pragma omp parallel for
        for(int k=0; k<updatedC2R.size(); k++)
        {
            IT col = std::get<0>(updatedC2R[k]);
            IT mate = std::get<1>(updatedC2R[k]);
            RepMateC2R[col-localColStart] = mate;
        }

        // recompute the matched weights and the total weight for the next iteration
        ReplicateMateWeights(param, dcsc, colptr, RepMateC2R, RepMateWR2C, RepMateWC2R);
        weightPrev = weightCur;
        weightCur = MatchingWeight(RepMateWC2R, RowWorld, minw);

        tUpdate += (MPI_Wtime() - tstart);
    }

    if(myrank==0)
    {
        std::cout << "------------- overall timing (HWPM) -------------" << std::endl;
        std::cout << "Phase1: " << tPhase1 << "\nPhase2: " << tPhase2 << "\nPhase3: " << tPhase3
                  << "\nPhase4: " << tPhase4 << "\nPhase5: " << tPhase5 << "\nUpdate: " << tUpdate << std::endl;
        std::cout << "-------------------------------------------------" << std::endl;
    }

    // write the final replicated mate vectors back into the distributed vectors
    UpdateMatching(mateRow2Col, mateCol2Row, RepMateR2C, RepMateC2R);
}
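
// TransformWeight: prepare the weights for maximization by taking absolute
// values and scaling rows and columns by their maxima; when applylog is set the
// entries are replaced by their logarithms, so maximizing the sum of matched
// weights maximizes the product of the original entries.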
template <class IT, class NT, class DER>
void TransformWeight(SpParMat < IT, NT, DER > & A, bool applylog)
{
    // work with magnitudes only
    A.Apply([](NT val){return (fabs(val));});

    // scale each row, then each column, by its maximum entry
    FullyDistVec<IT, NT> maxvRow(A.getcommgrid());
    A.Reduce(maxvRow, Row, maximum<NT>(), static_cast<NT>(std::numeric_limits<NT>::lowest()));
    A.DimApply(Row, maxvRow, [](NT val, NT maxval){return val/maxval;});

    FullyDistVec<IT, NT> maxvCol(A.getcommgrid());
    A.Reduce(maxvCol, Column, maximum<NT>(), static_cast<NT>(std::numeric_limits<NT>::lowest()));
    A.DimApply(Column, maxvCol, [](NT val, NT maxval){return val/maxval;});

    // optionally switch to logarithms so that sums of weights stand for products
    if(applylog)
        A.Apply([](NT val){return log(val);});
}
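
// AWPM: driver that computes a heavy-weight perfect matching (HWPM) of A1.
// It scales/transforms the weights, runs a weighted greedy maximal matching,
// completes it to a perfect matching with maximum-cardinality matching when
// necessary, improves the result with TwoThirdApprox, and falls back to the
// natural (diagonal) ordering whenever that is at least as good. WeightedGreedy,
// maximumMatching and CheckMatching are assumed to be declared by the companion
// BipartiteMatchings headers; several bookkeeping steps (the weight evaluation
// of the intermediate matchings and the population of the boolean CSC copy) are
// elided in this listing and marked with "// ..." below.
//
// Typical use (a sketch, assuming a driver that already built the distributed
// matrix A and its process grid):
//
//   FullyDistVec<int64_t, int64_t> mateRow2Col(A.getcommgrid(), A.getnrow(), -1);
//   FullyDistVec<int64_t, int64_t> mateCol2Row(A.getcommgrid(), A.getncol(), -1);
//   AWPM(A, mateRow2Col, mateCol2Row);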
template <class IT, class NT>
void AWPM(SpParMat < IT, NT, SpDCCols<IT, NT> > & A1, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, bool optimizeProd = true, bool weightedCard = true)
{
    SpParMat < IT, NT, SpDCCols<IT, NT> > A(A1);    // work on a copy; A1 is left untouched

    // scale the weights; apply the log transform when the product of weights is optimized
    TransformWeight(A, optimizeProd);

    SpParMat < IT, NT, SpCCols<IT, NT> > Acsc(A);
    SpParMat < IT, NT, SpDCCols<IT, bool> > Abool(A);
    SpParMat < IT, NT, SpCCols<IT, bool> > ABoolCSC(MPI_COMM_WORLD);
    // ... (the boolean CSC copy used by maximumMatching is populated here in the full source)

    // column degrees, needed by the weighted greedy (maximal) matching
    FullyDistVec<IT, IT> degCol(A.getcommgrid());
    Abool.Reduce(degCol, Column, plus<IT>(), static_cast<IT>(0));

    double ts = MPI_Wtime();
    WeightedGreedy(Acsc, mateRow2Col, mateCol2Row, degCol);

    // weight of the natural (diagonal) ordering, and whether it is perfect
    IT diagnnz;
    double origWeight = Trace(A, diagnnz);
    bool isOriginalPerfect = diagnnz == A.getnrow();

    // weight and perfection of the greedy maximal matching
    // (mclWeight / isPerfectMCL are computed from the mate vectors; elided here)
    double mclWeight = 0;
    bool isPerfectMCL = false;
    // ...

    if(isOriginalPerfect && mclWeight <= origWeight)    // the greedy matching did not beat the natural ordering
    {
        SpParHelper::Print("Maximal matching is not better than the natural ordering. Hence, keeping the natural ordering.\n");
        mateRow2Col.iota(A.getnrow(), 0);
        mateCol2Row.iota(A.getncol(), 0);
        mclWeight = origWeight;
        isPerfectMCL = true;
    }

    // if the maximal matching is not perfect, extend it to a perfect matching
    // with maximum-cardinality matching (MCM)
    double mcmWeight = mclWeight;
    bool isPerfectMCM = isPerfectMCL;
    double tmcm = 0;
    if(!isPerfectMCM)
    {
        ts = MPI_Wtime();
        maximumMatching(ABoolCSC, mateRow2Col, mateCol2Row, true, false, false);
        tmcm = MPI_Wtime() - ts;    // MCM timing (reported in the full source)
        isPerfectMCM = CheckMatching(mateRow2Col, mateCol2Row);
        // mcmWeight is recomputed from the mate vectors in the full source
    }
    if(!isPerfectMCM)
        SpParHelper::Print("Warning: The Maximum Cardinality Matching did not return a perfect matching! Need to check the input matrix.\n");

    // improve the weight of the perfect matching with the 2/3-approximation
    ts = MPI_Wtime();
    if(weightedCard)
        TwoThirdApprox(A, mateRow2Col, mateCol2Row);
    double tawpm = MPI_Wtime() - ts;    // HWPM timing (reported in the full source)

    // weight of the improved matching (computed from the mate vectors; elided here)
    double awpmWeight = 0;
    // ...

    bool isPerfectAWPM = CheckMatching(mateRow2Col, mateCol2Row);
    if(!isPerfectAWPM)
        SpParHelper::Print("Warning: The HWPM code did not return a perfect matching! Need to check the input matrix.\n");

    if(isOriginalPerfect && awpmWeight < origWeight)    // fall back to the natural ordering
    {
        SpParHelper::Print("AWPM is not better than the natural ordering. Hence, keeping the natural ordering.\n");
        mateRow2Col.iota(A.getnrow(), 0);
        mateCol2Row.iota(A.getncol(), 0);
        awpmWeight = origWeight;
    }
}

#endif // ApproxWeightPerfectMatching_h