9#ifndef ApproxWeightPerfectMatching_h
10#define ApproxWeightPerfectMatching_h
15#include <parallel/algorithm>
16#include <parallel/numeric>
// Personalized all-to-all exchange of (row, col, weight) triples.
// tempTuples[p] holds the triples destined for MPI rank p; each inner vector
// is freed (swap-with-empty) as soon as it is packed into the contiguous send
// buffer. Returns the triples this rank received.
// NOTE(review): lines are elided in this view (nprocs/totsend declarations,
// braces, return) -- the visible fragments document the intended flow.
42template <
class IT,
class NT>
43std::vector<std::tuple<IT,IT,NT>>
ExchangeData(std::vector<std::vector<std::tuple<IT,IT,NT>>> & tempTuples, MPI_Comm World)
// Ship each tuple as an opaque byte blob; assumes identical std::tuple layout
// on every rank (homogeneous cluster).
47 MPI_Datatype MPI_tuple;
48 MPI_Type_contiguous(
sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
49 MPI_Type_commit(&MPI_tuple);
52 MPI_Comm_size(World, &nprocs);
54 int * sendcnt =
new int[nprocs];
55 int * recvcnt =
new int[nprocs];
// Value-initialized so sdispls[0] == rdispls[0] == 0.
56 int * sdispls =
new int[nprocs]();
57 int * rdispls =
new int[nprocs]();
// Count outgoing triples per destination rank.
61 for(IT i=0; i<nprocs; ++i)
63 sendcnt[i] = tempTuples[i].size();
64 totsend += tempTuples[i].size();
// Learn how much each peer will send us, then build exclusive prefix sums
// (displacements) for the variable-size exchange.
67 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
69 std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
70 std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
71 IT totrecv = accumulate(recvcnt,recvcnt+nprocs,
static_cast<IT
>(0));
// Pack per-destination vectors into one contiguous buffer, releasing the
// source memory as we go.
74 std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
75 for(
int i=0; i<nprocs; ++i)
77 copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
78 std::vector< std::tuple<IT,IT,NT> >().swap(tempTuples[i]);
80 std::vector< std::tuple<IT,IT,NT> > recvTuples(totrecv);
81 MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
82 DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
83 MPI_Type_free(&MPI_tuple);
// Personalized all-to-all exchange of 4-tuples (IT,IT,IT,NT).
// Same protocol as ExchangeData, but for quadruples: count, Alltoall the
// counts, prefix-sum into displacements, pack, Alltoallv, clean up.
// tempTuples[p] holds this rank's quadruples destined for rank p and is
// freed as it is packed. Returns the quadruples received by this rank.
90template <
class IT,
class NT>
91std::vector<std::tuple<IT,IT,IT,NT>>
ExchangeData1(std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> & tempTuples, MPI_Comm World)
// Raw-byte datatype for the quadruple (assumes identical layout on all ranks).
95 MPI_Datatype MPI_tuple;
96 MPI_Type_contiguous(
sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
97 MPI_Type_commit(&MPI_tuple);
100 MPI_Comm_size(World, &nprocs);
102 int * sendcnt =
new int[nprocs];
103 int * recvcnt =
new int[nprocs];
// Value-initialized: first displacement is zero.
104 int * sdispls =
new int[nprocs]();
105 int * rdispls =
new int[nprocs]();
109 for(IT i=0; i<nprocs; ++i)
111 sendcnt[i] = tempTuples[i].size();
112 totsend += tempTuples[i].size();
115 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
117 std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
118 std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
119 IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs,
static_cast<IT
>(0));
// Pack into one contiguous buffer and free the per-destination vectors.
121 std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
122 for(
int i=0; i<nprocs; ++i)
124 copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
125 std::vector< std::tuple<IT,IT,IT,NT> >().swap(tempTuples[i]);
127 std::vector< std::tuple<IT,IT,IT,NT> > recvTuples(totrecv);
128 MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
129 DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
130 MPI_Type_free(&MPI_tuple);
// Map a global (grow, gcol) coordinate to the rank that owns it in A's 2D
// process grid. Rows are block-distributed over procrows grid rows and
// columns over proccols grid columns; the min(...) clamps the last block,
// which absorbs the remainder when nrows/ncols do not divide evenly.
139template <
class IT,
class NT,
class DER>
140int OwnerProcs(SpParMat < IT, NT, DER > &
A, IT grow, IT gcol, IT nrows, IT ncols)
142 auto commGrid =
A.getcommgrid();
143 int procrows = commGrid->GetGridRows();
144 int proccols = commGrid->GetGridCols();
// Nominal block sizes (integer division; remainder goes to the last block).
146 IT m_perproc = nrows / procrows;
147 IT n_perproc = ncols / proccols;
150 pr = std::min(
static_cast<int>(grow / m_perproc), procrows-1);
154 pc = std::min(
static_cast<int>(gcol / n_perproc), proccols-1);
157 return commGrid->GetRank(pr, pc);
// All-gather (index, mate) pairs over the given communicator so every rank in
// World ends up with the concatenation of everyone's sendTuples.
// Used to broadcast matching updates along a process row or column.
// NOTE(review): the template header for this function is outside this view.
184std::vector<std::tuple<IT,IT>>
MateBcast(std::vector<std::tuple<IT,IT>> sendTuples, MPI_Comm World)
// Pairs travel as raw bytes (layout assumed identical across ranks).
188 MPI_Datatype MPI_tuple;
189 MPI_Type_contiguous(
sizeof(std::tuple<IT,IT>) , MPI_CHAR, &MPI_tuple);
190 MPI_Type_commit(&MPI_tuple);
194 MPI_Comm_size(World, &nprocs);
196 int * recvcnt =
new int[nprocs];
// Value-initialized so rdispls[0] == 0.
197 int * rdispls =
new int[nprocs]();
198 int sendcnt = sendTuples.size();
// Everyone learns everyone's contribution size, then prefix-sum for the
// variable-size gather.
201 MPI_Allgather(&sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
203 std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
204 IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs,
static_cast<IT
>(0));
206 std::vector< std::tuple<IT,IT> > recvTuples(totrecv);
209 MPI_Allgatherv(sendTuples.data(), sendcnt, MPI_tuple,
210 recvTuples.data(), recvcnt, rdispls,MPI_tuple,World );
213 MPI_Type_free(&MPI_tuple);
// Fill the replicated matched-edge weight vectors RepMateWR2C / RepMateWC2R
// from the local DCSC submatrix, then sum-reduce along the process column and
// row so every rank holds the full replicated weights.
// Only entries whose matched row mate falls inside this rank's row range
// contribute; elsewhere the vectors stay zero and the Allreduce fills them in.
224template <
class IT,
class NT>
225void ReplicateMateWeights(
const AWPM_param<IT>& param, Dcsc<IT, NT>*dcsc,
const std::vector<IT>& colptr, std::vector<IT>& RepMateC2R, std::vector<NT>& RepMateWR2C, std::vector<NT>& RepMateWC2R)
// Zero both accumulators; ranks without the matched entry contribute 0.
229 fill(RepMateWC2R.begin(), RepMateWC2R.end(),
static_cast<NT
>(0));
230 fill(RepMateWR2C.begin(), RepMateWR2C.end(),
static_cast<NT
>(0));
234#pragma omp parallel for
236 for(
int k=0; k<param.lncol; ++k)
240 IT mj = RepMateC2R[lj];
// Only scan this column if its mate row is stored locally.
242 if(mj >= param.localRowStart && mj < (param.localRowStart+param.lnrow) )
244 for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
246 IT li = dcsc->ir[cp];
247 IT i = li + param.localRowStart;
// Record the weight of the matched edge in both directions.
251 RepMateWC2R[lj] = dcsc->numx[cp];
252 RepMateWR2C[mj-param.localRowStart] = dcsc->numx[cp];
// Combine partial weights: each replicated entry is nonzero on exactly the
// rank(s) that store the matched nonzero, so MPI_SUM reconstructs it.
261 MPI_Comm ColWorld = param.commGrid->GetColWorld();
262 MPI_Comm RowWorld = param.commGrid->GetRowWorld();
264 MPI_Allreduce(MPI_IN_PLACE, RepMateWC2R.data(), RepMateWC2R.size(), MPIType<NT>(), MPI_SUM, ColWorld);
265 MPI_Allreduce(MPI_IN_PLACE, RepMateWR2C.data(), RepMateWR2C.size(), MPIType<NT>(), MPI_SUM, RowWorld);
// Sum the diagonal entries of distributed matrix A (the weight of the
// identity matching) and count how many diagonal nonzeros exist; the count is
// reported through rettrnnz so callers can test for a perfect diagonal.
// Each rank scans its local DCSC, translating local (li, lj) to global (i, j)
// with its block offsets, then both the count and the trace are Allreduced.
271template <
class IT,
class NT,
class DER>
272NT
Trace( SpParMat < IT, NT, DER > &
A, IT& rettrnnz=0)
275 IT nrows =
A.getnrow();
276 IT ncols =
A.getncol();
277 auto commGrid =
A.getcommgrid();
278 MPI_Comm World = commGrid->GetWorld();
279 int myrank=commGrid->GetRank();
280 int pr = commGrid->GetGridRows();
281 int pc = commGrid->GetGridCols();
287 int rowrank = commGrid->GetRankInProcRow();
288 int colrank = commGrid->GetRankInProcCol();
// Nominal block sizes of the 2D distribution.
289 IT m_perproc = nrows / pr;
290 IT n_perproc = ncols / pc;
291 DER* spSeq =
A.seqptr();
// Global index of this rank's first local row / column.
292 IT localRowStart = colrank * m_perproc;
293 IT localColStart = rowrank * n_perproc;
298 for(
auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
300 IT lj = colit.colid();
301 IT j = lj + localColStart;
303 for(
auto nzit = spSeq->begnz(colit); nzit < spSeq->endnz(colit); ++nzit)
306 IT li = nzit.rowid();
307 IT i = li + localRowStart;
// (Diagonal test i == j elided in this view.) Accumulate the local trace.
311 trace += nzit.value();
// Global diagonal-nonzero count and global trace.
317 MPI_Allreduce(MPI_IN_PLACE, &trnnz, 1, MPIType<IT>(), MPI_SUM, World);
318 MPI_Allreduce(MPI_IN_PLACE, &trace, 1, MPIType<NT>(), MPI_SUM, World);
// Total weight of the current matching from the replicated column-to-row
// weights, reduced across the row communicator; also reports the minimum
// single edge weight via minw (out-parameter).
// NOTE(review): the template header for this function is outside this view;
// the large sentinel initializer stands in for "+infinity".
330NT
MatchingWeight( std::vector<NT>& RepMateWC2R, MPI_Comm RowWorld, NT& minw)
333 minw = 99999999999999.0;
334 for(
int i=0; i<RepMateWC2R.size(); i++)
341 minw = std::min(minw, RepMateWC2R[i]);
// Sum of all local weights, min of all local minima -> global weight and min.
344 MPI_Allreduce(MPI_IN_PLACE, &w, 1, MPIType<NT>(), MPI_SUM, RowWorld);
345 MPI_Allreduce(MPI_IN_PLACE, &minw, 1, MPIType<NT>(), MPI_MIN, RowWorld);
// Write the replicated matching vectors back into the distributed
// FullyDistVec mate vectors. R2C entries are copied element-wise into the
// local slice; C2R entries are scattered from the diagonal processor of each
// process row to the owners of the corresponding vector pieces.
// NOTE(review): the template header for this function is outside this view.
355void UpdateMatching(FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, std::vector<IT>& RepMateR2C, std::vector<IT>& RepMateC2R)
358 auto commGrid = mateRow2Col.getcommgrid();
359 MPI_Comm RowWorld = commGrid->GetRowWorld();
360 int rowroot = commGrid->GetDiagOfProcRow();
361 int pc = commGrid->GetGridCols();
// RowLenUntil() gives this rank's offset into the replicated R2C vector.
364 IT localLenR2C = mateRow2Col.LocArrSize();
366 for(IT i=0, j = mateRow2Col.RowLenUntil(); i<localLenR2C; i++, j++)
368 mateRow2Col.SetLocalElement(i, RepMateR2C[j]);
// Build scatter counts/displacements from each rank's piece boundaries.
374 std::vector <int> sendcnts(pc);
375 std::vector <int> dpls(pc);
377 for(
int i=1; i<pc; i++)
379 dpls[i] = mateCol2Row.RowLenUntil(i);
380 sendcnts[i-1] = dpls[i] - dpls[i-1];
// Last piece runs to the end of the replicated vector.
382 sendcnts[pc-1] = RepMateC2R.size() - dpls[pc-1];
384 IT localLenC2R = mateCol2Row.LocArrSize();
// Cast away constness of the local array to scatter directly into it.
385 IT* localC2R = (IT*) mateCol2Row.GetLocArr();
386 MPI_Scatterv(RepMateC2R.data(),sendcnts.data(), dpls.data(), MPIType<IT>(), localC2R, localLenC2R, MPIType<IT>(),rowroot, RowWorld);
// Per-thread staging-buffer length for binned communication.
// Starts from 256 items per bin; the visible fragment computes the buffer
// memory footprint (presumably to keep it within L2 -- the adjustment logic
// between these lines is elided in this view) and never returns a value
// larger than nbins+1.
// NOTE(review): the function signature and intermediate lines are outside
// this view.
395#define L2_CACHE_SIZE 256000
397 int THREAD_BUF_LEN = 256;
400 int bufferMem = THREAD_BUF_LEN * nbins * itemsize ;
404 THREAD_BUF_LEN = std::min(nbins+1,THREAD_BUF_LEN);
406 return THREAD_BUF_LEN;
// Phase 1 of the 2/3-approximation iteration: for every local nonzero (i, j),
// form a candidate tuple (mi, mj, w) -- the mates of i and j plus the gain
// w = a_ij - W(i,mi) - W(mj,j) -- and route it to the rank owning (mj, mi).
// Two passes over the matrix: pass 1 only counts tuples per destination
// (so the exact send buffer can be allocated), pass 2 fills per-thread
// staging buffers of THREAD_BUF_LEN per destination and flushes them into the
// shared send buffer with atomic fetch-and-add offsets. Ends with an
// Alltoallv returning the candidates this rank must evaluate.
411template <
class IT,
class NT>
412std::vector< std::tuple<IT,IT,NT> >
Phase1(
const AWPM_param<IT>& param, Dcsc<IT, NT>* dcsc,
const std::vector<IT>& colptr,
const std::vector<IT>& RepMateR2C,
const std::vector<IT>& RepMateC2R,
const std::vector<NT>& RepMateWR2C,
const std::vector<NT>& RepMateWC2R )
419 MPI_Comm World = param.commGrid->GetWorld();
424 std::vector<int> sendcnt(param.nprocs,0);
// --- Pass 1: count candidates per destination rank (thread-local counts
// merged atomically into the shared sendcnt). ---
431 std::vector<int> tsendcnt(param.nprocs,0);
435 for(
int k=0; k<param.lncol; ++k)
437 IT mj = RepMateC2R[k];
438 IT j = k + param.localColStart;
440 for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
442 IT li = dcsc->ir[cp];
443 IT i = li + param.localRowStart;
444 IT mi = RepMateR2C[li];
// Owner of (mj, mi) in the 2D grid; clamped to the last block, with a
// guard for the degenerate per-proc size of zero.
448 int rrank = param.m_perproc != 0 ? std::min(
static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
449 int crank = param.n_perproc != 0 ? std::min(
static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
450 int owner = param.commGrid->GetRank(rrank , crank);
455 for(
int i=0; i<param.nprocs; i++)
457 __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);
// Allocate the exact-size send buffer and its displacements.
464 IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs,
static_cast<IT
>(0));
465 std::vector<int> sdispls (param.nprocs, 0);
466 std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);
468 std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
// transferCount[p]: how many slots of rank p's region are already claimed.
469 std::vector<int> transferCount(param.nprocs,0);
// --- Pass 2: regenerate the candidates and pack them via per-thread
// staging buffers (one THREAD_BUF_LEN segment per destination). ---
478 std::vector<int> tsendcnt(param.nprocs,0);
479 std::vector<std::tuple<IT,IT, NT>> tsendTuples (param.nprocs*THREAD_BUF_LEN);
483 for(
int k=0; k<param.lncol; ++k)
485 IT mj = RepMateC2R[k];
487 IT j = k + param.localColStart;
489 for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
491 IT li = dcsc->ir[cp];
492 IT i = li + param.localRowStart;
493 IT mi = RepMateR2C[li];
// Gain of swapping edge (i,j) in: entry weight minus the two matched weights.
496 double w = dcsc->numx[cp]- RepMateWR2C[li] - RepMateWC2R[lj];
497 int rrank = param.m_perproc != 0 ? std::min(
static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
498 int crank = param.n_perproc != 0 ? std::min(
static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
499 int owner = param.commGrid->GetRank(rrank , crank);
501 if (tsendcnt[owner] < THREAD_BUF_LEN)
503 tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mi, mj, w);
// Staging segment full: claim a chunk of the shared buffer atomically,
// flush the segment, then restart it with the current tuple.
508 int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
509 copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1) , sendTuples.data() + sdispls[owner]+ tt);
511 tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mi, mj, w);
// Flush any partially filled staging segments.
517 for(
int owner=0; owner < param.nprocs; owner++)
519 if (tsendcnt[owner] >0)
521 int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
522 copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner]+ tt);
// --- Exchange: counts, displacements, then the tuples themselves. ---
529 std::vector<int> recvcnt (param.nprocs);
530 std::vector<int> rdispls (param.nprocs, 0);
532 MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
533 std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
534 IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs,
static_cast<IT
>(0));
// Tuples travel as raw bytes (homogeneous layout assumed).
537 MPI_Datatype MPI_tuple;
538 MPI_Type_contiguous(
sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
539 MPI_Type_commit(&MPI_tuple);
541 std::vector< std::tuple<IT,IT,NT> > recvTuples1(totrecv);
542 MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
543 MPI_Type_free(&MPI_tuple);
// Phase 2: match Phase-1 candidate tuples against the local matrix.
// The received (mi, mj, w) tuples are sorted so tuples sharing the first key
// mi are adjacent; each is then merge-joined against column (mi -
// localColStart) of the local DCSC (both sides sorted by row index).
// For every candidate whose entry (mj, mi) actually exists locally, the
// cumulative gain cw = w + W(mj -> its mate) is formed and the quadruple
// (mj, mi, i, cw) is routed to the owner of (mj, j).
// Two parallel passes over nBins contiguous slices of the sorted tuples:
// pass 1 counts per-destination quadruples, pass 2 packs them through
// per-thread staging buffers (same protocol as Phase1), then an Alltoallv
// delivers them. Returns the quadruples received by this rank.
551template <
class IT,
class NT>
552std::vector< std::tuple<IT,IT,IT,NT> >
Phase2(
const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples, Dcsc<IT, NT>* dcsc,
const std::vector<IT>& colptr,
const std::vector<IT>& RepMateR2C,
const std::vector<IT>& RepMateC2R,
const std::vector<NT>& RepMateWR2C,
const std::vector<NT>& RepMateWC2R )
555 MPI_Comm World = param.commGrid->GetWorld();
557 double tstart = MPI_Wtime();
// Sort (parallel, GNU libstdc++ extension) so equal-mi runs are contiguous.
560 __gnu_parallel::sort(recvTuples.begin(), recvTuples.end());
561 std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (param.nprocs);
563 std::vector<int> sendcnt(param.nprocs,0);
// Oversubscribe bins (4x threads) for load balance across uneven runs.
572 nBins = omp_get_num_threads() * 4;
// --- Pass 1: count quadruples per destination rank. ---
577#pragma omp parallel for
579 for(
int i=0; i<nBins; i++)
581 int perBin = recvTuples.size()/nBins;
582 int startBinIndex = perBin * i;
583 int endBinIndex = perBin * (i+1);
584 if(i==nBins-1) endBinIndex = recvTuples.size();
587 std::vector<int> tsendcnt(param.nprocs,0);
588 for(
int k=startBinIndex; k<endBinIndex;)
591 IT mi = std::get<0>(recvTuples[k]);
592 IT lcol = mi - param.localColStart;
593 IT i = RepMateC2R[lcol];
// idx2 walks the matrix column; idx1 (elided decl) walks the tuple run.
595 IT idx2 = colptr[lcol];
// Merge-join the equal-mi tuple run with column lcol of the matrix.
597 for(; std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];)
600 IT mj = std::get<1>(recvTuples[idx1]) ;
601 IT lrow = mj - param.localRowStart;
602 IT j = RepMateR2C[lrow];
603 IT lrowMat = dcsc->ir[idx2];
606 NT weight = std::get<2>(recvTuples[idx1]);
// Cumulative gain: candidate gain plus the matched weight of row mj.
607 NT cw = weight + RepMateWR2C[lrow];
610 int rrank = (param.m_perproc != 0) ? std::min(
static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
611 int crank = (param.n_perproc != 0) ? std::min(
static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
612 int owner = param.commGrid->GetRank(rrank , crank);
// Advance only the lagging side of the merge (branches elided in view).
618 else if(lrowMat > lrow)
// Skip any tuples of this run left unconsumed when the column is exhausted.
624 for(;std::get<0>(recvTuples[idx1]) == mi ; idx1++);
628 for(
int i=0; i<param.nprocs; i++)
630 __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);
// Exact-size send buffer + displacements, as in Phase1.
637 IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs,
static_cast<IT
>(0));
638 std::vector<int> sdispls (param.nprocs, 0);
639 std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);
641 std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
642 std::vector<int> transferCount(param.nprocs,0);
// --- Pass 2: repeat the join, this time packing quadruples via per-thread
// staging buffers flushed with atomic offsets. ---
648#pragma omp parallel for
650 for(
int i=0; i<nBins; i++)
652 int perBin = recvTuples.size()/nBins;
653 int startBinIndex = perBin * i;
654 int endBinIndex = perBin * (i+1);
655 if(i==nBins-1) endBinIndex = recvTuples.size();
658 std::vector<int> tsendcnt(param.nprocs,0);
659 std::vector<std::tuple<IT,IT, IT, NT>> tsendTuples (param.nprocs*THREAD_BUF_LEN);
660 for(
int k=startBinIndex; k<endBinIndex;)
662 IT mi = std::get<0>(recvTuples[k]);
663 IT lcol = mi - param.localColStart;
664 IT i = RepMateC2R[lcol];
666 IT idx2 = colptr[lcol];
668 for(; std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];)
671 IT mj = std::get<1>(recvTuples[idx1]) ;
672 IT lrow = mj - param.localRowStart;
673 IT j = RepMateR2C[lrow];
674 IT lrowMat = dcsc->ir[idx2];
677 NT weight = std::get<2>(recvTuples[idx1]);
678 NT cw = weight + RepMateWR2C[lrow];
681 int rrank = (param.m_perproc != 0) ? std::min(
static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
682 int crank = (param.n_perproc != 0) ? std::min(
static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
683 int owner = param.commGrid->GetRank(rrank , crank);
685 if (tsendcnt[owner] < THREAD_BUF_LEN)
687 tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mj, mi, i, cw);
// Staging segment full: claim shared-buffer slots atomically and flush.
692 int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
693 std::copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1) , sendTuples.data() + sdispls[owner]+ tt);
695 tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mj, mi, i, cw);
703 else if(lrowMat > lrow)
709 for(;std::get<0>(recvTuples[idx1]) == mi ; idx1++);
// Flush partially filled staging segments.
713 for(
int owner=0; owner < param.nprocs; owner++)
715 if (tsendcnt[owner] >0)
717 int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
718 std::copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner]+ tt);
// --- Exchange the quadruples. ---
725 std::vector<int> recvcnt (param.nprocs);
726 std::vector<int> rdispls (param.nprocs, 0);
728 MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
729 std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
730 IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs,
static_cast<IT
>(0));
733 MPI_Datatype MPI_tuple;
734 MPI_Type_contiguous(
sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
735 MPI_Type_commit(&MPI_tuple);
737 std::vector< std::tuple<IT,IT,IT,NT> > recvTuples1(totrecv);
738 MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
739 MPI_Type_free(&MPI_tuple);
// Sequential reference implementation of Phase2 (superseded by the binned
// parallel version above). For every candidate (mi, mj, w): binary-search row
// (mj - localRowStart) inside column (mi - localColStart) of the local DCSC
// with std::find over the sorted ir range; if the entry exists, build the
// quadruple (mj, mi, i, cw) and bucket it by the owner of (mj, j).
746template <
class IT,
class NT>
747std::vector<std::vector<std::tuple<IT,IT, IT, NT>>>
Phase2_old(
const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples, Dcsc<IT, NT>* dcsc,
const std::vector<IT>& colptr,
const std::vector<IT>& RepMateR2C,
const std::vector<IT>& RepMateC2R,
const std::vector<NT>& RepMateWR2C,
const std::vector<NT>& RepMateWC2R )
750 std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (param.nprocs);
751 for(
int k=0; k<recvTuples.size(); ++k)
753 IT mi = std::get<0>(recvTuples[k]) ;
754 IT mj = std::get<1>(recvTuples[k]) ;
755 IT i = RepMateC2R[mi - param.localColStart];
756 NT weight = std::get<2>(recvTuples[k]);
// Skip empty columns before searching.
758 if(colptr[mi- param.localColStart+1] > colptr[mi- param.localColStart] )
760 IT * ele = find(dcsc->ir+colptr[mi - param.localColStart], dcsc->ir+colptr[mi - param.localColStart+1], mj - param.localRowStart);
// find() returning end-of-column means entry (mj, mi) is not stored locally.
763 if (ele != dcsc->ir+colptr[mi - param.localColStart+1])
// Cumulative gain: candidate gain plus row mj's matched weight.
765 NT cw = weight + RepMateWR2C[mj - param.localRowStart];
768 IT j = RepMateR2C[mj - param.localRowStart];
// Route to the owner of (mj, j) in the 2D grid (last block clamped).
769 int rrank = (param.m_perproc != 0) ? std::min(
static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
770 int crank = (param.n_perproc != 0) ? std::min(
static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
771 int owner = param.commGrid->GetRank(rrank , crank);
772 tempTuples1[owner].push_back(make_tuple(mj, mi, i, cw));
// Iteratively improve a given perfect matching toward a 2/3-approximate
// maximum-weight perfect matching (Pettie-Sanders style weight-augmenting
// cycles of length <= 4, distributed over a 2D process grid).
// Per iteration: Phase1 generates candidate edges, Phase2 validates them
// against the local matrix, Phases 3/4 pick the best non-conflicting gain per
// column, winners are exchanged and broadcast along rows/columns, and the
// replicated mate vectors are updated. Loops while the total weight strictly
// improves, capped at 10 iterations, then writes the result back into the
// distributed mate vectors.
781template <
class IT,
class NT,
class DER>
782void TwoThirdApprox(SpParMat < IT, NT, DER > &
A, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row)
// --- Grid/topology bookkeeping. ---
787 auto commGrid =
A.getcommgrid();
788 int myrank=commGrid->GetRank();
789 MPI_Comm World = commGrid->GetWorld();
790 MPI_Comm ColWorld = commGrid->GetColWorld();
791 MPI_Comm RowWorld = commGrid->GetRowWorld();
792 int nprocs = commGrid->GetSize();
793 int pr = commGrid->GetGridRows();
794 int pc = commGrid->GetGridCols();
795 int rowrank = commGrid->GetRankInProcRow();
796 int colrank = commGrid->GetRankInProcCol();
797 int diagneigh = commGrid->GetComplementRank();
802 IT nrows =
A.getnrow();
803 IT ncols =
A.getncol();
805 IT m_perproc = nrows / pr;
806 IT n_perproc = ncols / pc;
807 DER* spSeq =
A.seqptr();
808 Dcsc<IT, NT>* dcsc = spSeq->GetDCSC();
809 IT lnrow = spSeq->getnrow();
810 IT lncol = spSeq->getncol();
811 IT localRowStart = colrank * m_perproc;
812 IT localColStart = rowrank * n_perproc;
// Bundle the distribution parameters for the phase routines.
814 AWPM_param<IT> param;
815 param.nprocs = nprocs;
820 param.m_perproc = m_perproc;
821 param.n_perproc = n_perproc;
822 param.localRowStart = localRowStart;
823 param.localColStart = localColStart;
824 param.myrank = myrank;
825 param.commGrid = commGrid;
// --- Build the replicated C2R mate vector: swap local pieces with the
// transposed (complement) rank, then allgather along the process column. ---
832 int xsize = (int) mateCol2Row.LocArrSize();
835 MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh,
TRX, &trxsize, 1, MPI_INT, diagneigh,
TRX, World, &status);
836 std::vector<IT> trxnums(trxsize);
837 MPI_Sendrecv(mateCol2Row.GetLocArr(), xsize, MPIType<IT>(), diagneigh,
TRX, trxnums.data(), trxsize, MPIType<IT>(), diagneigh,
TRX, World, &status);
840 std::vector<int> colsize(pc);
841 colsize[colrank] = trxsize;
842 MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize.data(), 1, MPI_INT, ColWorld);
843 std::vector<int> dpls(pc,0);
844 std::partial_sum(colsize.data(), colsize.data()+pc-1, dpls.data()+1);
845 int accsize = std::accumulate(colsize.data(), colsize.data()+pc, 0);
846 std::vector<IT> RepMateC2R(accsize);
847 MPI_Allgatherv(trxnums.data(), trxsize, MPIType<IT>(), RepMateC2R.data(), colsize.data(), dpls.data(), MPIType<IT>(), ColWorld);
// --- Replicated R2C mate vector: allgather along the process row. ---
858 xsize = (int) mateRow2Col.LocArrSize();
860 std::vector<int> rowsize(pr);
861 rowsize[rowrank] = xsize;
862 MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, rowsize.data(), 1, MPI_INT, RowWorld);
863 std::vector<int> rdpls(pr,0);
864 std::partial_sum(rowsize.data(), rowsize.data()+pr-1, rdpls.data()+1);
865 accsize = std::accumulate(rowsize.data(), rowsize.data()+pr, 0);
866 std::vector<IT> RepMateR2C(accsize);
867 MPI_Allgatherv(mateRow2Col.GetLocArr(), xsize, MPIType<IT>(), RepMateR2C.data(), rowsize.data(), rdpls.data(), MPIType<IT>(), RowWorld);
// --- Dense column-pointer array over the DCSC: -1 marks a column missing
// from the compressed representation; the backward sweep propagates the next
// column's start so [colptr[k], colptr[k+1]) is valid for every k. ---
873 std::vector<IT> colptr (lncol+1,-1);
874 for(
auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
876 IT lj = colit.colid();
878 colptr[lj] = colit.colptr();
880 colptr[lncol] = spSeq->getnnz();
881 for(IT k=lncol-1; k>=0; k--)
885 colptr[k] = colptr[k+1];
894 std::vector<NT> RepMateWR2C(lnrow);
895 std::vector<NT> RepMateWC2R(lncol);
// Seed weightPrev far below weightCur so the loop runs at least once.
902 NT weightPrev = weightCur - 999999999999;
903 while(weightCur > weightPrev && iterations++ < 10)
907 if(myrank==0) std::cout <<
"Iteration " << iterations <<
". matching weight: sum = "<< weightCur <<
" min = " << minw << std::endl;
// Phase 1: generate candidates; Phase 2: validate against local entries.
913 std::vector<std::tuple<IT,IT,NT>> recvTuples =
Phase1(param, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R );
914 std::vector<std::tuple<IT,IT,IT,NT>> recvTuples1 =
Phase2(param, recvTuples, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R );
915 std::vector< std::tuple<IT,IT,NT> >().swap(recvTuples);
919 tstart = MPI_Wtime();
// --- Phase 3: keep only the best (max-gain) quadruple per local column. ---
921 std::vector<std::tuple<IT,IT,IT,NT>> bestTuplesPhase3 (lncol);
923#pragma omp parallel for
925 for(
int k=0; k<lncol; ++k)
// (-1, ...) marks "no candidate yet" for this column.
927 bestTuplesPhase3[k] = std::make_tuple(-1,-1,-1,0);
930 for(
int k=0; k<recvTuples1.size(); ++k)
932 IT mj = std::get<0>(recvTuples1[k]) ;
933 IT mi = std::get<1>(recvTuples1[k]) ;
934 IT i = std::get<2>(recvTuples1[k]) ;
935 NT weight = std::get<3>(recvTuples1[k]);
936 IT j = RepMateR2C[mj - localRowStart];
937 IT lj = j - localColStart;
940 if( (std::get<0>(bestTuplesPhase3[lj]) == -1) || (weight > std::get<3>(bestTuplesPhase3[lj])) )
942 bestTuplesPhase3[lj] = std::make_tuple(i,mi,mj,weight);
// Forward each column's winner to the owner rank (owner computation elided
// in this view).
946 std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (nprocs);
947 for(
int k=0; k<lncol; ++k)
949 if( std::get<0>(bestTuplesPhase3[k]) != -1)
953 IT i = std::get<0>(bestTuplesPhase3[k]) ;
954 IT mi = std::get<1>(bestTuplesPhase3[k]) ;
955 IT mj = std::get<2>(bestTuplesPhase3[k]) ;
956 IT j = RepMateR2C[mj - localRowStart];
957 NT weight = std::get<3>(bestTuplesPhase3[k]);
960 tempTuples1[owner].push_back(std::make_tuple(i, j, mj, weight));
// --- Phase 4: among forwarded winners, keep the best per column, but only
// for columns that did not already win in phase 3 (conflict avoidance). ---
967 std::vector<std::tuple<IT,IT,IT,IT, NT>> bestTuplesPhase4 (lncol);
973#pragma omp parallel for
975 for(
int k=0; k<lncol; ++k)
977 bestTuplesPhase4[k] = std::make_tuple(-1,-1,-1,-1,0);
980 for(
int k=0; k<recvTuples1.size(); ++k)
982 IT i = std::get<0>(recvTuples1[k]) ;
983 IT j = std::get<1>(recvTuples1[k]) ;
984 IT mj = std::get<2>(recvTuples1[k]) ;
985 IT mi = RepMateR2C[i-localRowStart];
986 NT weight = std::get<3>(recvTuples1[k]);
987 IT lmi = mi - localColStart;
992 if( ((std::get<0>(bestTuplesPhase4[lmi]) == -1) || (weight > std::get<4>(bestTuplesPhase4[lmi]))) && std::get<0>(bestTuplesPhase3[lmi])==-1 )
994 bestTuplesPhase4[lmi] = std::make_tuple(i,j,mi,mj,weight);
// --- Distribute the final winners: each winning swap (i<->j, mi<->mj) is
// sent to both affected owners so both sides update their matching. ---
1000 std::vector<std::vector<std::tuple<IT,IT,IT, IT>>> winnerTuples (nprocs);
1003 for(
int k=0; k<lncol; ++k)
1005 if( std::get<0>(bestTuplesPhase4[k]) != -1)
1009 IT i = std::get<0>(bestTuplesPhase4[k]) ;
1010 IT j = std::get<1>(bestTuplesPhase4[k]) ;
1011 IT mi = std::get<2>(bestTuplesPhase4[k]) ;
1012 IT mj = std::get<3>(bestTuplesPhase4[k]) ;
1016 winnerTuples[owner].push_back(std::make_tuple(i, j, mi, mj));
// Mirror entry for the counterpart owner (owner recomputed, elided in view).
1021 winnerTuples[owner].push_back(std::make_tuple(mj, mi, j, i));
1028 std::vector<std::tuple<IT,IT,IT,IT>> recvWinnerTuples =
ExchangeData1(winnerTuples, World);
// Split each winner into the row-wise and column-wise (index, mate) updates.
1031 std::vector<std::tuple<IT,IT>> rowBcastTuples(recvWinnerTuples.size());
1032 std::vector<std::tuple<IT,IT>> colBcastTuples(recvWinnerTuples.size());
1034#pragma omp parallel for
1036 for(
int k=0; k<recvWinnerTuples.size(); ++k)
1038 IT i = std::get<0>(recvWinnerTuples[k]) ;
1039 IT j = std::get<1>(recvWinnerTuples[k]) ;
1040 IT mi = std::get<2>(recvWinnerTuples[k]) ;
1041 IT mj = std::get<3>(recvWinnerTuples[k]);
1043 colBcastTuples[k] = std::make_tuple(j,i);
1044 rowBcastTuples[k] = std::make_tuple(mj,mi);
// Broadcast updates along rows/columns and apply to the replicated vectors.
1047 std::vector<std::tuple<IT,IT>> updatedR2C =
MateBcast(rowBcastTuples, RowWorld);
1048 std::vector<std::tuple<IT,IT>> updatedC2R =
MateBcast(colBcastTuples, ColWorld);
1051#pragma omp parallel for
1053 for(
int k=0; k<updatedR2C.size(); k++)
1055 IT row = std::get<0>(updatedR2C[k]);
1056 IT mate = std::get<1>(updatedR2C[k]);
1057 RepMateR2C[row-localRowStart] = mate;
1061#pragma omp parallel for
1063 for(
int k=0; k<updatedC2R.size(); k++)
1065 IT col = std::get<0>(updatedC2R[k]);
1066 IT mate = std::get<1>(updatedC2R[k]);
1067 RepMateC2R[col-localColStart] = mate;
// Iterate while the total weight keeps improving.
1075 weightPrev = weightCur;
// Persist the improved matching into the distributed vectors.
1085 UpdateMatching(mateRow2Col, mateCol2Row, RepMateR2C, RepMateC2R);
// Normalize A's weights for matching: take absolute values, scale each row by
// its row maximum, each column by its column maximum, then (on this visible
// path) apply log so that maximizing the product of weights becomes
// maximizing their sum.
// NOTE(review): the function signature (TransformWeight, per the declaration
// listing at the end of this file) and the applylog branch are outside this
// view.
 template <
class IT,
class NT,
class DER>
1100	A.Apply([](NT val){
return (fabs(val));});
// Row normalization: divide every entry by its row's maximum |value|.
1102	FullyDistVec<IT, NT> maxvRow(
A.getcommgrid());
1103	A.Reduce(maxvRow,
Row, maximum<NT>(),
static_cast<NT
>(numeric_limits<NT>::lowest()));
1104	A.DimApply(
Row, maxvRow, [](NT val, NT maxval){
return val/maxval;});
// Column normalization: same, along columns.
1106	FullyDistVec<IT, NT> maxvCol(
A.getcommgrid());
1107	A.Reduce(maxvCol,
Column, maximum<NT>(),
static_cast<NT
>(numeric_limits<NT>::lowest()));
1108	A.DimApply(
Column, maxvCol, [](NT val, NT maxval){
return val/maxval;});
// Log-transform so products of weights become sums (entries are in (0, 1]
// after normalization, giving non-positive logs).
1111	A.Apply([](NT val){
return log(val);});
// Driver: compute an approximate-weight perfect matching of A1.
// Works on a copy of A1 (A1 itself is left untouched); optionally transforms
// weights so the product of matched weights is optimized (optimizeProd),
// seeds with a maximal matching, compares against the natural (diagonal)
// ordering, then refines. Large portions of the pipeline (greedy/MCM calls,
// weight evaluations) are elided in this view.
 template <
class IT,
class NT>
1116	void AWPM(SpParMat < IT, NT, SpDCCols<IT, NT> > & A1, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row,
bool optimizeProd=
true)
// Work on a copy so the caller's matrix is not mutated.
1118	SpParMat < IT, NT, SpDCCols<IT, NT> >
A(A1);
// CSC copy for the matching kernels and a boolean copy for degree counting.
1124	SpParMat < IT, NT, SpCCols<IT, NT> > Acsc(
A);
1125	SpParMat < IT, NT, SpDCCols<IT, bool> > Abool(
A);
1126	FullyDistVec<IT, IT> degCol(
A.getcommgrid());
1127	Abool.Reduce(degCol,
Column, plus<IT>(),
static_cast<IT
>(0));
// Baseline: weight and nonzero count of the diagonal (natural ordering).
1132	double origWeight =
Trace(
A, diagnnz);
1133	bool isOriginalPerfect = diagnnz==
A.getnrow();
// If the natural ordering is already perfect and at least as good as the
// maximal matching, keep the identity matching.
1142	if(isOriginalPerfect && mclWeight<=origWeight)
1144	SpParHelper::Print(
"Maximal is not better that the natural ordering. Hence, keeping the natural ordering.\n");
1145	mateRow2Col.iota(
A.getnrow(), 0);
1146	mateCol2Row.iota(
A.getncol(), 0);
1147	mclWeight = origWeight;
1148	isPerfectMCL =
true;
1154	double mcmWeight = mclWeight;
// Timings for the MCM and AWPM refinement stages.
1159	tmcm = MPI_Wtime() - ts;
1169	double tawpm = MPI_Wtime() - ts;
// Fall back to the natural ordering if AWPM did not beat it.
1174	if(isOriginalPerfect && awpmWeight<origWeight)
1176	SpParHelper::Print(
"AWPM is not better that the natural ordering. Hence, keeping the natural ordering.\n");
1177	mateRow2Col.iota(
A.getnrow(), 0);
1178	mateCol2Row.iota(
A.getncol(), 0);
1179	awpmWeight = origWeight;
static void Print(const std::string &s)
bool CheckMatching(FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row)
int OwnerProcs(SpParMat< IT, NT, DER > &A, IT grow, IT gcol, IT nrows, IT ncols)
std::vector< std::tuple< IT, IT, NT > > Phase1(const AWPM_param< IT > ¶m, Dcsc< IT, NT > *dcsc, const std::vector< IT > &colptr, const std::vector< IT > &RepMateR2C, const std::vector< IT > &RepMateC2R, const std::vector< NT > &RepMateWR2C, const std::vector< NT > &RepMateWC2R)
void AWPM(SpParMat< IT, NT, SpDCCols< IT, NT > > &A1, FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row, bool optimizeProd=true)
std::vector< std::tuple< IT, IT, IT, NT > > Phase2(const AWPM_param< IT > ¶m, std::vector< std::tuple< IT, IT, NT > > &recvTuples, Dcsc< IT, NT > *dcsc, const std::vector< IT > &colptr, const std::vector< IT > &RepMateR2C, const std::vector< IT > &RepMateC2R, const std::vector< NT > &RepMateWR2C, const std::vector< NT > &RepMateWC2R)
NT MatchingWeight(std::vector< NT > &RepMateWC2R, MPI_Comm RowWorld, NT &minw)
void TransformWeight(SpParMat< IT, NT, DER > &A, bool applylog)
std::vector< std::tuple< IT, IT, NT > > ExchangeData(std::vector< std::vector< std::tuple< IT, IT, NT > > > &tempTuples, MPI_Comm World)
std::vector< std::tuple< IT, IT, IT, NT > > ExchangeData1(std::vector< std::vector< std::tuple< IT, IT, IT, NT > > > &tempTuples, MPI_Comm World)
NT Trace(SpParMat< IT, NT, DER > &A, IT &rettrnnz=0)
void UpdateMatching(FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row, std::vector< IT > &RepMateR2C, std::vector< IT > &RepMateC2R)
void maximumMatching(SpParMat< IT, NT, DER > &A, FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row, bool prune=true, bool randMM=false, bool maximizeWeight=false)
int ThreadBuffLenForBinning(int itemsize, int nbins)
std::vector< std::tuple< IT, IT > > MateBcast(std::vector< std::tuple< IT, IT > > sendTuples, MPI_Comm World)
void ReplicateMateWeights(const AWPM_param< IT > ¶m, Dcsc< IT, NT > *dcsc, const std::vector< IT > &colptr, std::vector< IT > &RepMateC2R, std::vector< NT > &RepMateWR2C, std::vector< NT > &RepMateWC2R)
std::vector< std::vector< std::tuple< IT, IT, IT, NT > > > Phase2_old(const AWPM_param< IT > ¶m, std::vector< std::tuple< IT, IT, NT > > &recvTuples, Dcsc< IT, NT > *dcsc, const std::vector< IT > &colptr, const std::vector< IT > &RepMateR2C, const std::vector< IT > &RepMateC2R, const std::vector< NT > &RepMateWR2C, const std::vector< NT > &RepMateWC2R)
void TwoThirdApprox(SpParMat< IT, NT, DER > &A, FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row)
void WeightedGreedy(Par_MAT_Double &A, FullyDistVec< IT, IT > &mateRow2Col, FullyDistVec< IT, IT > &mateCol2Row, FullyDistVec< IT, IT > °Col)
std::shared_ptr< CommGrid > commGrid