#ifndef ApproxWeightPerfectMatching_h
#define ApproxWeightPerfectMatching_h

#include <parallel/algorithm>
#include <parallel/numeric>
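
// ExchangeData: all-to-all exchange of (row, column, weight) tuples. Each rank packs its
// per-destination buffers (tempTuples[p]) into one contiguous send buffer and ships it with
// MPI_Alltoallv; a contiguous MPI_CHAR datatype lets the std::tuple payload travel as raw
// bytes. The per-destination source buffers are released as soon as they are copied out.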
template <class IT, class NT>
std::vector<std::tuple<IT,IT,NT>> ExchangeData(std::vector<std::vector<std::tuple<IT,IT,NT>>> & tempTuples, MPI_Comm World)
{
    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    int nprocs;
    MPI_Comm_size(World, &nprocs);

    int * sendcnt = new int[nprocs];
    int * recvcnt = new int[nprocs];
    int * sdispls = new int[nprocs]();
    int * rdispls = new int[nprocs]();

    IT totsend = 0;
    for(IT i=0; i<nprocs; ++i)
    {
        sendcnt[i] = tempTuples[i].size();
        totsend += tempTuples[i].size();
    }

    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);

    std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
    std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
    IT totrecv = std::accumulate(recvcnt, recvcnt+nprocs, static_cast<IT>(0));

    std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
    for(int i=0; i<nprocs; ++i)
    {
        std::copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
        std::vector< std::tuple<IT,IT,NT> >().swap(tempTuples[i]);    // free the source buffer
    }

    std::vector< std::tuple<IT,IT,NT> > recvTuples(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
    DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
    MPI_Type_free(&MPI_tuple);
    return recvTuples;
}
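
// ExchangeData1: same all-to-allv exchange pattern as ExchangeData, but for four-field
// tuples (IT, IT, IT, NT); used below to exchange the winner tuples of each refinement
// iteration.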
template <class IT, class NT>
std::vector<std::tuple<IT,IT,IT,NT>> ExchangeData1(std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> & tempTuples, MPI_Comm World)
{
    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    int nprocs;
    MPI_Comm_size(World, &nprocs);

    int * sendcnt = new int[nprocs];
    int * recvcnt = new int[nprocs];
    int * sdispls = new int[nprocs]();
    int * rdispls = new int[nprocs]();

    IT totsend = 0;
    for(IT i=0; i<nprocs; ++i)
    {
        sendcnt[i] = tempTuples[i].size();
        totsend += tempTuples[i].size();
    }

    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);

    std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
    std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
    IT totrecv = std::accumulate(recvcnt, recvcnt+nprocs, static_cast<IT>(0));

    std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
    for(int i=0; i<nprocs; ++i)
    {
        std::copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
        std::vector< std::tuple<IT,IT,IT,NT> >().swap(tempTuples[i]);    // free the source buffer
    }

    std::vector< std::tuple<IT,IT,IT,NT> > recvTuples(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
    DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
    MPI_Type_free(&MPI_tuple);
    return recvTuples;
}
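
// OwnerProcs(A, grow, gcol, nrows, ncols): returns the rank of the process that owns the
// global entry (grow, gcol) under the 2D block distribution; the last process row/column
// absorbs any remainder rows/columns.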
template <class IT, class NT, class DER>
    auto commGrid = A.getcommgrid();
    int procrows = commGrid->GetGridRows();
    int proccols = commGrid->GetGridCols();
    IT m_perproc = nrows / procrows;
    IT n_perproc = ncols / proccols;
    pr = std::min(static_cast<int>(grow / m_perproc), procrows-1);
    pc = std::min(static_cast<int>(gcol / n_perproc), proccols-1);
    return commGrid->GetRank(pr, pc);
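
// MateBcast: allgathers a (usually short) list of (index, mate) pairs over a processor-row
// or processor-column communicator, so every rank in that communicator sees all mate updates.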
template <class IT>
std::vector<std::tuple<IT,IT>> MateBcast(std::vector<std::tuple<IT,IT>> sendTuples, MPI_Comm World)
{
    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    int nprocs;
    MPI_Comm_size(World, &nprocs);

    int * recvcnt = new int[nprocs];
    int * rdispls = new int[nprocs]();
    int sendcnt = sendTuples.size();

    MPI_Allgather(&sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);

    std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
    IT totrecv = std::accumulate(recvcnt, recvcnt+nprocs, static_cast<IT>(0));

    std::vector< std::tuple<IT,IT> > recvTuples(totrecv);
    MPI_Allgatherv(sendTuples.data(), sendcnt, MPI_tuple,
                   recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);

    DeleteAll(recvcnt, rdispls);
    MPI_Type_free(&MPI_tuple);
    return recvTuples;
}
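
// ReplicateMateWeights: recomputes the weights of the currently matched edges. For each local
// column, the entry of the matched row (mj = RepMateC2R[..]) is looked up in the local DCSC
// block; the partial results are then summed over the processor-column and processor-row
// communicators so every rank holds replicated copies of the matched-edge weights
// (RepMateWC2R and RepMateWR2C) it needs in Phase1/Phase2.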
template <class IT, class NT>
    std::fill(RepMateWC2R.begin(), RepMateWC2R.end(), static_cast<NT>(0));
    std::fill(RepMateWR2C.begin(), RepMateWR2C.end(), static_cast<NT>(0));

#pragma omp parallel for
    for(int k=0; k<param.lncol; ++k)
        IT mj = RepMateC2R[lj];
        for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
            IT li = dcsc->ir[cp];
            RepMateWC2R[lj] = dcsc->numx[cp];

    MPI_Comm ColWorld = param.commGrid->GetColWorld();
    MPI_Comm RowWorld = param.commGrid->GetRowWorld();

    MPI_Allreduce(MPI_IN_PLACE, RepMateWC2R.data(), RepMateWC2R.size(), MPIType<NT>(), MPI_SUM, ColWorld);
    MPI_Allreduce(MPI_IN_PLACE, RepMateWR2C.data(), RepMateWR2C.size(), MPIType<NT>(), MPI_SUM, RowWorld);
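
// Trace: walks the local sparse block, accumulating the values on the global diagonal
// (i == j) and counting how many diagonal entries exist; both quantities are summed over all
// processes with MPI_Allreduce. Used below to score the natural (identity) ordering against
// the computed matchings.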
template <class IT, class NT, class DER>
    IT nrows = A.getnrow();
    IT ncols = A.getncol();
    auto commGrid = A.getcommgrid();
    MPI_Comm World = commGrid->GetWorld();
    int myrank = commGrid->GetRank();
    int pr = commGrid->GetGridRows();
    int pc = commGrid->GetGridCols();
    int rowrank = commGrid->GetRankInProcRow();
    int colrank = commGrid->GetRankInProcCol();
    IT m_perproc = nrows / pr;
    IT n_perproc = ncols / pc;
    DER* spSeq = A.seqptr();
    IT localRowStart = colrank * m_perproc;
    IT localColStart = rowrank * n_perproc;

    for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
        IT lj = colit.colid();
        IT j = lj + localColStart;
        for(auto nzit = spSeq->begnz(colit); nzit < spSeq->endnz(colit); ++nzit)
            IT li = nzit.rowid();
            IT i = li + localRowStart;
            trace += nzit.value();

    MPI_Allreduce(MPI_IN_PLACE, &trnnz, 1, MPIType<IT>(), MPI_SUM, World);
    MPI_Allreduce(MPI_IN_PLACE, &trace, 1, MPIType<NT>(), MPI_SUM, World);
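
// MatchingWeight (fragment): sums the replicated matched-edge weights and tracks their
// minimum, then combines both across the processor row with MPI_Allreduce (SUM and MIN).
// The large constant that seeds minw simply acts as "plus infinity" for the running minimum.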
    minw = 99999999999999.0;
    for(int i=0; i<RepMateWC2R.size(); i++)
        minw = std::min(minw, RepMateWC2R[i]);

    MPI_Allreduce(MPI_IN_PLACE, &w, 1, MPIType<NT>(), MPI_SUM, RowWorld);
    MPI_Allreduce(MPI_IN_PLACE, &minw, 1, MPIType<NT>(), MPI_MIN, RowWorld);
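
// UpdateMatching (fragment): writes the replicated mate arrays back into the distributed
// FullyDistVec objects. The diagonal process of each processor row acts as the scatter root,
// and RowLenUntil() prefix lengths supply the per-process counts and displacements for
// MPI_Scatterv.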
    MPI_Comm RowWorld = commGrid->GetRowWorld();
    int rowroot = commGrid->GetDiagOfProcRow();
    int pc = commGrid->GetGridCols();

    for(IT i=0, j = mateRow2Col.RowLenUntil(); i<localLenR2C; i++, j++)

    std::vector<int> sendcnts(pc);
    std::vector<int> dpls(pc);
    for(int i=1; i<pc; i++)
        dpls[i] = mateCol2Row.RowLenUntil(i);
        sendcnts[i-1] = dpls[i] - dpls[i-1];
    sendcnts[pc-1] = RepMateC2R.size() - dpls[pc-1];

    IT* localC2R = (IT*) mateCol2Row.GetLocArr();
    MPI_Scatterv(RepMateC2R.data(), sendcnts.data(), dpls.data(), MPIType<IT>(), localC2R, localLenC2R, MPIType<IT>(), rowroot, RowWorld);
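
// ThreadBuffLenForBinning: picks a per-thread, per-destination staging-buffer length
// (starting from 256 and never exceeding nbins+1) so that THREAD_BUF_LEN * nbins * itemsize
// stays within the L2_CACHE_SIZE budget. Phase1 and Phase2 use buffers of this length to
// batch tuples before flushing them into the shared send buffer with an atomic
// fetch-and-add.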
#ifndef L2_CACHE_SIZE
#define L2_CACHE_SIZE 256000
#endif

    int THREAD_BUF_LEN = 256;
    int bufferMem = THREAD_BUF_LEN * nbins * itemsize;
    THREAD_BUF_LEN = std::min(nbins+1, THREAD_BUF_LEN);
    return THREAD_BUF_LEN;
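
// Phase1: for every locally matched column (with row mate mj) and every nonzero row li in
// that column, a candidate tuple (mi, mj, w) is generated, where mi is the column mate of
// row li and w = numx[cp] - RepMateWR2C[li] - RepMateWC2R[lj] (presumably the weight gain of
// the prospective swap). The traversal is done twice: a counting pass that sizes the
// per-destination send buffers, and a filling pass that batches tuples in thread-local
// buffers of THREAD_BUF_LEN entries per destination and flushes them with
// __sync_fetch_and_add. The tuples are finally exchanged with MPI_Alltoallv, addressed to
// the process owning row mj / column mi.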
template <class IT, class NT>
std::vector< std::tuple<IT,IT,NT> > Phase1(const AWPM_param<IT>& param, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr,
        const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R,
        const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R)

    MPI_Comm World = param.commGrid->GetWorld();

    std::vector<int> sendcnt(param.nprocs, 0);

    // counting pass: how many tuples go to each destination process
    std::vector<int> tsendcnt(param.nprocs, 0);
    for(int k=0; k<param.lncol; ++k)
        IT mj = RepMateC2R[k];
        for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
            IT li = dcsc->ir[cp];
            IT mi = RepMateR2C[li];
            int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
            int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
            int owner = param.commGrid->GetRank(rrank, crank);
    for(int i=0; i<param.nprocs; i++)
        __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);

    IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
    std::vector<int> sdispls(param.nprocs, 0);
    std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);

    // filling pass: stage tuples in thread-local buffers and flush into the shared send buffer
    std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
    std::vector<int> transferCount(param.nprocs, 0);

    std::vector<int> tsendcnt(param.nprocs, 0);
    std::vector<std::tuple<IT,IT,NT>> tsendTuples(param.nprocs*THREAD_BUF_LEN);
    for(int k=0; k<param.lncol; ++k)
        IT mj = RepMateC2R[k];
        for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
            IT li = dcsc->ir[cp];
            IT mi = RepMateR2C[li];
            double w = dcsc->numx[cp] - RepMateWR2C[li] - RepMateWC2R[lj];
            int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
            int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
            int owner = param.commGrid->GetRank(rrank, crank);
            if (tsendcnt[owner] < THREAD_BUF_LEN)
                tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mi, mj, w);
                int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
                std::copy(tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1), sendTuples.data() + sdispls[owner] + tt);
                tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mi, mj, w);
    // flush whatever is left in the thread-local buffers
    for(int owner=0; owner < param.nprocs; owner++)
        if (tsendcnt[owner] > 0)
            int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
            std::copy(tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner] + tt);

    // exchange the candidate tuples
    std::vector<int> recvcnt(param.nprocs);
    std::vector<int> rdispls(param.nprocs, 0);

    MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
    std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
    IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));

    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    std::vector< std::tuple<IT,IT,NT> > recvTuples1(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
    MPI_Type_free(&MPI_tuple);
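
// Phase2: processes the tuples received from Phase1. The tuples are sorted with
// __gnu_parallel::sort and grouped by their first field mi (a column index); each group is
// then scanned against the nonzeros of that column (dcsc->ir[idx2]), and matching rows
// appear to yield four-field records (mj, mi, i, cw) with cw = weight + RepMateWR2C[lrow].
// The work is split into nBins chunks across OpenMP threads, staged in thread-local buffers
// exactly as in Phase1, and exchanged with MPI_Alltoallv, addressed to the owner of
// (row mj, column j).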
template <class IT, class NT>
std::vector< std::tuple<IT,IT,IT,NT> > Phase2(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples,
        Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr,
        const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R,
        const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R)

    MPI_Comm World = param.commGrid->GetWorld();
    double tstart = MPI_Wtime();

    __gnu_parallel::sort(recvTuples.begin(), recvTuples.end());
    std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> tempTuples1(param.nprocs);
    std::vector<int> sendcnt(param.nprocs, 0);

    nBins = omp_get_num_threads() * 4;

    // counting pass over nBins chunks of the sorted tuples
#pragma omp parallel for
    for(int i=0; i<nBins; i++)
        int perBin = recvTuples.size()/nBins;
        int startBinIndex = perBin * i;
        int endBinIndex = perBin * (i+1);
        if(i==nBins-1) endBinIndex = recvTuples.size();

        std::vector<int> tsendcnt(param.nprocs, 0);
        for(int k=startBinIndex; k<endBinIndex;)
            IT mi = std::get<0>(recvTuples[k]);
            IT i = RepMateC2R[lcol];
            IT idx2 = colptr[lcol];
            for(; std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];)
                IT mj = std::get<1>(recvTuples[idx1]);
                IT j = RepMateR2C[lrow];
                IT lrowMat = dcsc->ir[idx2];
                NT weight = std::get<2>(recvTuples[idx1]);
                NT cw = weight + RepMateWR2C[lrow];
                int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
                int owner = param.commGrid->GetRank(rrank, crank);
                else if(lrowMat > lrow)
            for(; std::get<0>(recvTuples[idx1]) == mi; idx1++);    // skip the remaining tuples with the same mi
        for(int i=0; i<param.nprocs; i++)
            __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);

    IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
    std::vector<int> sdispls(param.nprocs, 0);
    std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);

    std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
    std::vector<int> transferCount(param.nprocs, 0);

    // filling pass: same traversal, now staging tuples in thread-local buffers
#pragma omp parallel for
    for(int i=0; i<nBins; i++)
        int perBin = recvTuples.size()/nBins;
        int startBinIndex = perBin * i;
        int endBinIndex = perBin * (i+1);
        if(i==nBins-1) endBinIndex = recvTuples.size();

        std::vector<int> tsendcnt(param.nprocs, 0);
        std::vector<std::tuple<IT,IT,IT,NT>> tsendTuples(param.nprocs*THREAD_BUF_LEN);
        for(int k=startBinIndex; k<endBinIndex;)
            IT mi = std::get<0>(recvTuples[k]);
            IT i = RepMateC2R[lcol];
            IT idx2 = colptr[lcol];
            for(; std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];)
                IT mj = std::get<1>(recvTuples[idx1]);
                IT j = RepMateR2C[lrow];
                IT lrowMat = dcsc->ir[idx2];
                NT weight = std::get<2>(recvTuples[idx1]);
                NT cw = weight + RepMateWR2C[lrow];
                int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
                int owner = param.commGrid->GetRank(rrank, crank);
                if (tsendcnt[owner] < THREAD_BUF_LEN)
                    tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mj, mi, i, cw);
                    int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
                    std::copy(tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1), sendTuples.data() + sdispls[owner] + tt);
                    tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mj, mi, i, cw);
                else if(lrowMat > lrow)
            for(; std::get<0>(recvTuples[idx1]) == mi; idx1++);    // skip the remaining tuples with the same mi
        // flush whatever is left in the thread-local buffers
        for(int owner=0; owner < param.nprocs; owner++)
            if (tsendcnt[owner] > 0)
                int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
                std::copy(tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner] + tt);

    // exchange the four-field tuples
    std::vector<int> recvcnt(param.nprocs);
    std::vector<int> rdispls(param.nprocs, 0);

    MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
    std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
    IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));

    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    std::vector< std::tuple<IT,IT,IT,NT> > recvTuples1(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
    MPI_Type_free(&MPI_tuple);
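
// Phase2_old: earlier sequential variant of Phase2, apparently kept for reference. It walks
// the received tuples one by one and pushes the resulting (mj, mi, i, cw) records into
// per-destination vectors instead of a pre-counted flat send buffer.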
template <class IT, class NT>
std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> Phase2_old(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples,
        Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr,
        const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R,
        const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R)

    std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> tempTuples1(param.nprocs);
    for(int k=0; k<recvTuples.size(); ++k)
        IT mi = std::get<0>(recvTuples[k]);
        IT mj = std::get<1>(recvTuples[k]);
        NT weight = std::get<2>(recvTuples[k]);
        int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
        int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
        int owner = param.commGrid->GetRank(rrank, crank);
        tempTuples1[owner].push_back(std::make_tuple(mj, mi, i, cw));
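
// TwoThirdApprox(A, mateRow2Col, mateCol2Row): the main refinement loop of the approximate
// weighted perfect matching. Each process first replicates the mate vectors along its
// processor row and column (an MPI_Sendrecv with the transpose neighbour followed by
// Allgatherv), builds a column-pointer array for its local DCSC block, and then iterates:
// Phase1/Phase2 generate candidate weight-improving swaps, the inlined Phases 3 and 4 keep
// the heaviest candidate per column, winners are exchanged with ExchangeData1 and broadcast
// along rows/columns with MateBcast, and the replicated mate arrays are updated. The loop
// stops when the matching weight no longer improves or after 10 iterations; UpdateMatching
// then writes the result back into the distributed mate vectors.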
template <class IT, class NT, class DER>
    auto commGrid = A.getcommgrid();
    int myrank = commGrid->GetRank();
    MPI_Comm World = commGrid->GetWorld();
    MPI_Comm ColWorld = commGrid->GetColWorld();
    MPI_Comm RowWorld = commGrid->GetRowWorld();
    int nprocs = commGrid->GetSize();
    int pr = commGrid->GetGridRows();
    int pc = commGrid->GetGridCols();
    int rowrank = commGrid->GetRankInProcRow();
    int colrank = commGrid->GetRankInProcCol();
    int diagneigh = commGrid->GetComplementRank();

    IT nrows = A.getnrow();
    IT ncols = A.getncol();
    IT m_perproc = nrows / pr;
    IT n_perproc = ncols / pc;
    DER* spSeq = A.seqptr();
    IT lnrow = spSeq->getnrow();
    IT lncol = spSeq->getncol();
    IT localRowStart = colrank * m_perproc;
    IT localColStart = rowrank * n_perproc;

    MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh, TRX, &trxsize, 1, MPI_INT, diagneigh, TRX, World, &status);
    std::vector<IT> trxnums(trxsize);
    MPI_Sendrecv(mateCol2Row.GetLocArr(), xsize, MPIType<IT>(), diagneigh, TRX, trxnums.data(), trxsize, MPIType<IT>(), diagneigh, TRX, World, &status);

    std::vector<int> colsize(pc);
    colsize[colrank] = trxsize;
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize.data(), 1, MPI_INT, ColWorld);
    std::vector<int> dpls(pc, 0);
    std::partial_sum(colsize.data(), colsize.data()+pc-1, dpls.data()+1);
    int accsize = std::accumulate(colsize.data(), colsize.data()+pc, 0);
    std::vector<IT> RepMateC2R(accsize);
    MPI_Allgatherv(trxnums.data(), trxsize, MPIType<IT>(), RepMateC2R.data(), colsize.data(), dpls.data(), MPIType<IT>(), ColWorld);

    std::vector<int> rowsize(pr);
    rowsize[rowrank] = xsize;
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, rowsize.data(), 1, MPI_INT, RowWorld);
    std::vector<int> rdpls(pr, 0);
    std::partial_sum(rowsize.data(), rowsize.data()+pr-1, rdpls.data()+1);
    accsize = std::accumulate(rowsize.data(), rowsize.data()+pr, 0);
    std::vector<IT> RepMateR2C(accsize);
    MPI_Allgatherv(mateRow2Col.GetLocArr(), xsize, MPIType<IT>(), RepMateR2C.data(), rowsize.data(), rdpls.data(), MPIType<IT>(), RowWorld);
    std::vector<IT> colptr(lncol+1, -1);
    for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
        IT lj = colit.colid();
        colptr[lj] = colit.colptr();
    colptr[lncol] = spSeq->getnnz();
    for(IT k=lncol-1; k>=0; k--)
        colptr[k] = colptr[k+1];

    std::vector<NT> RepMateWR2C(lnrow);
    std::vector<NT> RepMateWC2R(lncol);
    NT weightPrev = weightCur - 999999999999;
    while(weightCur > weightPrev && iterations++ < 10)
        if(myrank==0) std::cout << "Iteration " << iterations << ". matching weight: sum = " << weightCur << " min = " << minw << std::endl;

        std::vector<std::tuple<IT,IT,NT>> recvTuples = Phase1(param, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R);
        std::vector<std::tuple<IT,IT,IT,NT>> recvTuples1 = Phase2(param, recvTuples, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R);
        std::vector< std::tuple<IT,IT,NT> >().swap(recvTuples);

        tstart = MPI_Wtime();

        // Phase 3: keep the heaviest candidate per local column
        std::vector<std::tuple<IT,IT,IT,NT>> bestTuplesPhase3(lncol);
#pragma omp parallel for
        for(int k=0; k<lncol; ++k)
            bestTuplesPhase3[k] = std::make_tuple(-1, -1, -1, 0);

        for(int k=0; k<recvTuples1.size(); ++k)
            IT mj = std::get<0>(recvTuples1[k]);
            IT mi = std::get<1>(recvTuples1[k]);
            IT i = std::get<2>(recvTuples1[k]);
            NT weight = std::get<3>(recvTuples1[k]);
            IT j = RepMateR2C[mj - localRowStart];
            IT lj = j - localColStart;
            if( (std::get<0>(bestTuplesPhase3[lj]) == -1) || (weight > std::get<3>(bestTuplesPhase3[lj])) )
                bestTuplesPhase3[lj] = std::make_tuple(i, mi, mj, weight);

        std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> tempTuples1(nprocs);
        for(int k=0; k<lncol; ++k)
            if( std::get<0>(bestTuplesPhase3[k]) != -1)
                IT i = std::get<0>(bestTuplesPhase3[k]);
                IT mi = std::get<1>(bestTuplesPhase3[k]);
                IT mj = std::get<2>(bestTuplesPhase3[k]);
                IT j = RepMateR2C[mj - localRowStart];
                NT weight = std::get<3>(bestTuplesPhase3[k]);
                tempTuples1[owner].push_back(std::make_tuple(i, j, mj, weight));

        // Phase 4: among columns not already selected in Phase 3, keep the heaviest remaining candidate
        std::vector<std::tuple<IT,IT,IT,IT,NT>> bestTuplesPhase4(lncol);
#pragma omp parallel for
        for(int k=0; k<lncol; ++k)
            bestTuplesPhase4[k] = std::make_tuple(-1, -1, -1, -1, 0);

        for(int k=0; k<recvTuples1.size(); ++k)
            IT i = std::get<0>(recvTuples1[k]);
            IT j = std::get<1>(recvTuples1[k]);
            IT mj = std::get<2>(recvTuples1[k]);
            IT mi = RepMateR2C[i-localRowStart];
            NT weight = std::get<3>(recvTuples1[k]);
            IT lmi = mi - localColStart;
            if( ((std::get<0>(bestTuplesPhase4[lmi]) == -1) || (weight > std::get<4>(bestTuplesPhase4[lmi]))) && std::get<0>(bestTuplesPhase3[lmi])==-1 )
                bestTuplesPhase4[lmi] = std::make_tuple(i, j, mi, mj, weight);

        // collect the winning swaps for exchange
        std::vector<std::vector<std::tuple<IT,IT,IT,IT>>> winnerTuples(nprocs);
        for(int k=0; k<lncol; ++k)
            if( std::get<0>(bestTuplesPhase4[k]) != -1)
                IT i = std::get<0>(bestTuplesPhase4[k]);
                IT j = std::get<1>(bestTuplesPhase4[k]);
                IT mi = std::get<2>(bestTuplesPhase4[k]);
                IT mj = std::get<3>(bestTuplesPhase4[k]);
                winnerTuples[owner].push_back(std::make_tuple(i, j, mi, mj));
                winnerTuples[owner].push_back(std::make_tuple(mj, mi, j, i));

        std::vector<std::tuple<IT,IT,IT,IT>> recvWinnerTuples = ExchangeData1(winnerTuples, World);

        // broadcast the updated mates along processor rows and columns
        std::vector<std::tuple<IT,IT>> rowBcastTuples(recvWinnerTuples.size());
        std::vector<std::tuple<IT,IT>> colBcastTuples(recvWinnerTuples.size());
#pragma omp parallel for
        for(int k=0; k<recvWinnerTuples.size(); ++k)
            IT i = std::get<0>(recvWinnerTuples[k]);
            IT j = std::get<1>(recvWinnerTuples[k]);
            IT mi = std::get<2>(recvWinnerTuples[k]);
            IT mj = std::get<3>(recvWinnerTuples[k]);
            colBcastTuples[k] = std::make_tuple(j, i);
            rowBcastTuples[k] = std::make_tuple(mj, mi);

        std::vector<std::tuple<IT,IT>> updatedR2C = MateBcast(rowBcastTuples, RowWorld);
        std::vector<std::tuple<IT,IT>> updatedC2R = MateBcast(colBcastTuples, ColWorld);

#pragma omp parallel for
        for(int k=0; k<updatedR2C.size(); k++)
            IT row = std::get<0>(updatedR2C[k]);
            IT mate = std::get<1>(updatedR2C[k]);
            RepMateR2C[row-localRowStart] = mate;

#pragma omp parallel for
        for(int k=0; k<updatedC2R.size(); k++)
            IT col = std::get<0>(updatedC2R[k]);
            IT mate = std::get<1>(updatedC2R[k]);
            RepMateC2R[col-localColStart] = mate;

        weightPrev = weightCur;

    UpdateMatching(mateRow2Col, mateCol2Row, RepMateR2C, RepMateC2R);
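
// TransformWeight(A, applylog): preprocessing of the weights. Takes absolute values, scales
// every row and then every column by its maximum (Reduce with maximum<NT>() followed by
// DimApply), and optionally applies log, so that maximizing the sum of transformed entries
// corresponds to maximizing the product of the original magnitudes.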
template <class IT, class NT, class DER>
    A.Apply([](NT val){return (fabs(val));});

    A.Reduce(maxvRow, Row, maximum<NT>(), static_cast<NT>(numeric_limits<NT>::lowest()));
    A.DimApply(Row, maxvRow, [](NT val, NT maxval){return val/maxval;});

    A.Reduce(maxvCol, Column, maximum<NT>(), static_cast<NT>(numeric_limits<NT>::lowest()));
    A.DimApply(Column, maxvCol, [](NT val, NT maxval){return val/maxval;});

    A.Apply([](NT val){return log(val);});
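
// AWPM: driver routine. Judging by the weights and sanity checks below, it transforms the
// weights, bootstraps a matching with WeightedGreedy, completes it with maximumMatching,
// refines it with TwoThirdApprox, and at each stage falls back to the natural (identity)
// ordering via iota whenever that ordering is already perfect and at least as heavy.
//
// A minimal call sketch (the matrix/vector construction shown here is illustrative and not
// part of this header; fullWorld is an assumed, pre-built CommGrid):
//
//   SpParMat<int64_t, double, SpDCCols<int64_t,double>> A(fullWorld);
//   FullyDistVec<int64_t,int64_t> mateRow2Col(A.getcommgrid(), A.getnrow(), (int64_t)-1);
//   FullyDistVec<int64_t,int64_t> mateCol2Row(A.getcommgrid(), A.getncol(), (int64_t)-1);
//   AWPM(A, mateRow2Col, mateCol2Row, true);   // true: optimize the product of weights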
template <class IT, class NT>
    Abool.Reduce(degCol, Column, plus<IT>(), static_cast<IT>(0));

    double origWeight = Trace(A, diagnnz);
    bool isOriginalPerfect = diagnnz == A.getnrow();

    SpParHelper::Print("After Greedy sanity check\n");

    if(isOriginalPerfect && mclWeight <= origWeight)
        SpParHelper::Print("Maximal is not better than the natural ordering. Hence, keeping the natural ordering.\n");
        mateRow2Col.iota(A.getnrow(), 0);
        mateCol2Row.iota(A.getncol(), 0);
        mclWeight = origWeight;
        isPerfectMCL = true;

    double mcmWeight = mclWeight;
    tmcm = MPI_Wtime() - ts;
    SpParHelper::Print("After MCM sanity check\n");

    double tawpm = MPI_Wtime() - ts;
    SpParHelper::Print("After AWPM sanity check\n");

    if(isOriginalPerfect && awpmWeight < origWeight)
        SpParHelper::Print("AWPM is not better than the natural ordering. Hence, keeping the natural ordering.\n");
        mateRow2Col.iota(A.getnrow(), 0);
        mateCol2Row.iota(A.getncol(), 0);
        awpmWeight = origWeight;