COMBINATORIAL_BLAS 1.6
 
ApproxWeightPerfectMatching.h
1//
2// ApproxWeightPerfectMatching.h
3//
4//
5// Created by Ariful Azad on 8/22/17.
6//
7//
8
9#ifndef ApproxWeightPerfectMatching_h
10#define ApproxWeightPerfectMatching_h
11
12#include "CombBLAS.h"
13#include "BPMaximalMatching.h"
14#include "BPMaximumMatching.h"
15#include <parallel/algorithm>
16#include <parallel/numeric>
17#include <memory>
18#include <limits>
19
20
21using namespace std;
22
23namespace combblas {
24
25
26template <class IT>
27struct AWPM_param
28{
29 int nprocs;
30 int myrank;
31 int pr;
32 int pc;
33 IT lncol;
34 IT lnrow;
35 IT localRowStart;
36 IT localColStart;
37 IT m_perproc;
38 IT n_perproc;
39 std::shared_ptr<CommGrid> commGrid;
40};
41
42template <class IT, class NT>
43std::vector<std::tuple<IT,IT,NT>> ExchangeData(std::vector<std::vector<std::tuple<IT,IT,NT>>> & tempTuples, MPI_Comm World)
44{
45
46 /* Create/allocate variables for vector assignment */
47 MPI_Datatype MPI_tuple;
48 MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
49 MPI_Type_commit(&MPI_tuple);
50
51 int nprocs;
52 MPI_Comm_size(World, &nprocs);
53
54 int * sendcnt = new int[nprocs];
55 int * recvcnt = new int[nprocs];
56 int * sdispls = new int[nprocs]();
57 int * rdispls = new int[nprocs]();
58
59 // Set the newly found vector entries
60 IT totsend = 0;
61 for(IT i=0; i<nprocs; ++i)
62 {
63 sendcnt[i] = tempTuples[i].size();
64 totsend += tempTuples[i].size();
65 }
66
67 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
68
69 std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
70 std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
71 IT totrecv = accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
72
73
74 std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
75 for(int i=0; i<nprocs; ++i)
76 {
77 copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
78 std::vector< std::tuple<IT,IT,NT> >().swap(tempTuples[i]); // clear memory
79 }
80 std::vector< std::tuple<IT,IT,NT> > recvTuples(totrecv);
81 MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
82 DeleteAll(sendcnt, recvcnt, sdispls, rdispls); // free all memory
83 MPI_Type_free(&MPI_tuple);
84 return recvTuples;
85
86}
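/*
 Usage sketch (illustrative only; 'owner', 'grow', 'gcol', 'weight' and 'nprocs'
 are placeholders, not names from this file): callers bucket the tuples they want
 to ship by destination rank and hand the buckets to ExchangeData, which performs
 the personalized all-to-all and returns the locally received tuples.
 TwoThirdApprox below uses the four-tuple variant ExchangeData1 in this way.

   std::vector<std::vector<std::tuple<int64_t,int64_t,double>>> buckets(nprocs);
   buckets[owner].push_back(std::make_tuple(grow, gcol, weight)); // repeat per item
   std::vector<std::tuple<int64_t,int64_t,double>> received =
       ExchangeData(buckets, commGrid->GetWorld());
   // note: ExchangeData empties each bucket to release memory
*/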
87
88
89
90template <class IT, class NT>
91std::vector<std::tuple<IT,IT,IT,NT>> ExchangeData1(std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> & tempTuples, MPI_Comm World)
92{
93
94 /* Create/allocate variables for vector assignment */
95 MPI_Datatype MPI_tuple;
96 MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
97 MPI_Type_commit(&MPI_tuple);
98
99 int nprocs;
100 MPI_Comm_size(World, &nprocs);
101
102 int * sendcnt = new int[nprocs];
103 int * recvcnt = new int[nprocs];
104 int * sdispls = new int[nprocs]();
105 int * rdispls = new int[nprocs]();
106
107 // Set the newly found vector entries
108 IT totsend = 0;
109 for(IT i=0; i<nprocs; ++i)
110 {
111 sendcnt[i] = tempTuples[i].size();
112 totsend += tempTuples[i].size();
113 }
114
115 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
116
117 std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
118 std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
119 IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
120
121 std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
122 for(int i=0; i<nprocs; ++i)
123 {
124 copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
125 std::vector< std::tuple<IT,IT,IT,NT> >().swap(tempTuples[i]); // clear memory
126 }
127 std::vector< std::tuple<IT,IT,IT,NT> > recvTuples(totrecv);
128 MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
129 DeleteAll(sendcnt, recvcnt, sdispls, rdispls); // free all memory
130 MPI_Type_free(&MPI_tuple);
131 return recvTuples;
132}
133
134
135
136
137// remember that getnrow() and getncol() require collectives
138// Hence, we save them once and pass them to this function
139template <class IT, class NT,class DER>
140int OwnerProcs(SpParMat < IT, NT, DER > & A, IT grow, IT gcol, IT nrows, IT ncols)
141{
142 auto commGrid = A.getcommgrid();
143 int procrows = commGrid->GetGridRows();
144 int proccols = commGrid->GetGridCols();
145
146 IT m_perproc = nrows / procrows;
147 IT n_perproc = ncols / proccols;
148 int pr, pc;
149 if(m_perproc != 0)
150 pr = std::min(static_cast<int>(grow / m_perproc), procrows-1);
151 else // all owned by the last processor row
152 pr = procrows -1;
153 if(n_perproc != 0)
154 pc = std::min(static_cast<int>(gcol / n_perproc), proccols-1);
155 else
156 pc = proccols-1;
157 return commGrid->GetRank(pr, pc);
158}
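/*
 Worked example (hypothetical sizes): on a 4 x 4 process grid with
 nrows = ncols = 1000, m_perproc = n_perproc = 1000/4 = 250. The owner of the
 global entry (grow, gcol) = (600, 120) is then
   pr = min(600/250, 4-1) = 2,   pc = min(120/250, 4-1) = 0,
 i.e. the process at grid position (2, 0). Rows/columns at or beyond
 procrows*m_perproc (the remainder) all land on the last process row/column
 because of the min() clamp.
*/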
159
160
161/*
162// Hence, we save them once and pass them to this function
163template <class IT, class NT,class DER>
164int OwnerProcs(SpParMat < IT, NT, DER > & A, IT grow, IT gcol, IT nrows, IT ncols)
165{
166
167
168
169 int pr1, pc1;
170 if(m_perproc != 0)
171 pr1 = std::min(static_cast<int>(grow / m_perproc), pr-1);
172 else // all owned by the last processor row
173 pr1 = pr -1;
174 if(n_perproc != 0)
175 pc1 = std::min(static_cast<int>(gcol / n_perproc), pc-1);
176 else
177 pc1 = pc-1;
178 return commGrid->GetRank(pr1, pc1);
179}
180 */
181
182
183template <class IT>
184std::vector<std::tuple<IT,IT>> MateBcast(std::vector<std::tuple<IT,IT>> sendTuples, MPI_Comm World)
185{
186
187 /* Create/allocate variables for vector assignment */
188 MPI_Datatype MPI_tuple;
189 MPI_Type_contiguous(sizeof(std::tuple<IT,IT>) , MPI_CHAR, &MPI_tuple);
190 MPI_Type_commit(&MPI_tuple);
191
192
193 int nprocs;
194 MPI_Comm_size(World, &nprocs);
195
196 int * recvcnt = new int[nprocs];
197 int * rdispls = new int[nprocs]();
198 int sendcnt = sendTuples.size();
199
200
201 MPI_Allgather(&sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
202
203 std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
204 IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
205
206 std::vector< std::tuple<IT,IT> > recvTuples(totrecv);
207
208
209 MPI_Allgatherv(sendTuples.data(), sendcnt, MPI_tuple,
210 recvTuples.data(), recvcnt, rdispls,MPI_tuple,World );
211
212 DeleteAll(recvcnt, rdispls); // free all memory
213 MPI_Type_free(&MPI_tuple);
214 return recvTuples;
215
216}
217
218
219// -----------------------------------------------------------
220// replicate weights of mates
221 // Can be improved by replacing the AllReduce with an All2All
222// -----------------------------------------------------------
223
224template <class IT, class NT>
225void ReplicateMateWeights( const AWPM_param<IT>& param, Dcsc<IT, NT>*dcsc, const std::vector<IT>& colptr, std::vector<IT>& RepMateC2R, std::vector<NT>& RepMateWR2C, std::vector<NT>& RepMateWC2R)
226{
227
228
229 fill(RepMateWC2R.begin(), RepMateWC2R.end(), static_cast<NT>(0));
230 fill(RepMateWR2C.begin(), RepMateWR2C.end(), static_cast<NT>(0));
231
232
233#ifdef THREADED
234#pragma omp parallel for
235#endif
236 for(int k=0; k<param.lncol; ++k)
237 {
238
239 IT lj = k; // local numbering
240 IT mj = RepMateC2R[lj]; // mate of j
241
242 if(mj >= param.localRowStart && mj < (param.localRowStart+param.lnrow) )
243 {
244 for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
245 {
246 IT li = dcsc->ir[cp];
247 IT i = li + param.localRowStart;
248 // TODO: use binary search to directly go to mj-th entry if more than 32 nonzero in this column
249 if( i == mj)
250 {
251 RepMateWC2R[lj] = dcsc->numx[cp];
252 RepMateWR2C[mj-param.localRowStart] = dcsc->numx[cp];
253 //break;
254 }
255 }
256 }
257 }
258
259
260
261 MPI_Comm ColWorld = param.commGrid->GetColWorld();
262 MPI_Comm RowWorld = param.commGrid->GetRowWorld();
263
264 MPI_Allreduce(MPI_IN_PLACE, RepMateWC2R.data(), RepMateWC2R.size(), MPIType<NT>(), MPI_SUM, ColWorld);
265 MPI_Allreduce(MPI_IN_PLACE, RepMateWR2C.data(), RepMateWR2C.size(), MPIType<NT>(), MPI_SUM, RowWorld);
266}
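/*
 Descriptive note: after the two Allreduce calls, RepMateWC2R[lj] holds the
 weight of the matched entry a(mate(j), j) for every replicated local column j,
 and RepMateWR2C[li] holds a(i, mate(i)) for every replicated local row i.
 Only the one process that actually stores the matched nonzero writes a value;
 the MPI_SUM reduction along the process column (resp. process row) then
 replicates that value to every process sharing the column (row).
*/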
267
268
269
270
271template <class IT, class NT,class DER>
272NT Trace( SpParMat < IT, NT, DER > & A, IT& rettrnnz=0)
273{
274
275 IT nrows = A.getnrow();
276 IT ncols = A.getncol();
277 auto commGrid = A.getcommgrid();
278 MPI_Comm World = commGrid->GetWorld();
279 int myrank=commGrid->GetRank();
280 int pr = commGrid->GetGridRows();
281 int pc = commGrid->GetGridCols();
282
283
284 //Information about the matrix distribution
285 //Assume that A is an nrow x ncol matrix
286 //The local submatrix is an lnrow x lncol matrix
287 int rowrank = commGrid->GetRankInProcRow();
288 int colrank = commGrid->GetRankInProcCol();
289 IT m_perproc = nrows / pr;
290 IT n_perproc = ncols / pc;
291 DER* spSeq = A.seqptr(); // local submatrix
292 IT localRowStart = colrank * m_perproc; // first row in this process
293 IT localColStart = rowrank * n_perproc; // first col in this process
294
295
296 IT trnnz = 0;
297 NT trace = 0.0;
298 for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit) // iterate over columns
299 {
300 IT lj = colit.colid(); // local numbering
301 IT j = lj + localColStart;
302
303 for(auto nzit = spSeq->begnz(colit); nzit < spSeq->endnz(colit); ++nzit)
304 {
305
306 IT li = nzit.rowid();
307 IT i = li + localRowStart;
308 if( i == j)
309 {
310 trnnz ++;
311 trace += nzit.value();
312
313 }
314 }
315
316 }
317 MPI_Allreduce(MPI_IN_PLACE, &trnnz, 1, MPIType<IT>(), MPI_SUM, World);
318 MPI_Allreduce(MPI_IN_PLACE, &trace, 1, MPIType<NT>(), MPI_SUM, World);
319 rettrnnz = trnnz;
320 /*
321 if(myrank==0)
322 cout <<"nrows: " << nrows << " Nnz in the diag: " << trnnz << " sum of diag: " << trace << endl;
323 */
324 return trace;
325
326}
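/*
 Usage sketch (this is how AWPM() below calls it): the return value is the
 global sum of the explicitly stored diagonal entries, and rettrnnz reports how
 many such entries exist, so the "natural" identity matching is perfect exactly
 when rettrnnz == A.getnrow().

   IT diagnnz;
   NT diagsum = Trace(A, diagnnz);
   bool naturalIsPerfect = (diagnnz == A.getnrow());
*/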
327
328
329template <class NT>
330NT MatchingWeight( std::vector<NT>& RepMateWC2R, MPI_Comm RowWorld, NT& minw)
331{
332 NT w = 0;
333 minw = 99999999999999.0;
334 for(int i=0; i<RepMateWC2R.size(); i++)
335 {
336 //w += fabs(RepMateWC2R[i]);
337 //w += exp(RepMateWC2R[i]);
338 //minw = min(minw, exp(RepMateWC2R[i]));
339
340 w += RepMateWC2R[i];
341 minw = std::min(minw, RepMateWC2R[i]);
342 }
343
344 MPI_Allreduce(MPI_IN_PLACE, &w, 1, MPIType<NT>(), MPI_SUM, RowWorld);
345 MPI_Allreduce(MPI_IN_PLACE, &minw, 1, MPIType<NT>(), MPI_MIN, RowWorld);
346 return w;
347}
348
349
350
351
352
353// update the distributed mate vectors from replicated mate vectors
354template <class IT>
355void UpdateMatching(FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, std::vector<IT>& RepMateR2C, std::vector<IT>& RepMateC2R)
356{
357
358 auto commGrid = mateRow2Col.getcommgrid();
359 MPI_Comm RowWorld = commGrid->GetRowWorld();
360 int rowroot = commGrid->GetDiagOfProcRow();
361 int pc = commGrid->GetGridCols();
362
363 // mateRow2Col is easy
364 IT localLenR2C = mateRow2Col.LocArrSize();
365 //IT* localR2C = mateRow2Col.GetLocArr();
366 for(IT i=0, j = mateRow2Col.RowLenUntil(); i<localLenR2C; i++, j++)
367 {
368 mateRow2Col.SetLocalElement(i, RepMateR2C[j]);
369 //localR2C[i] = RepMateR2C[j];
370 }
371
372
373 // mateCol2Row requires communication
374 std::vector <int> sendcnts(pc);
375 std::vector <int> dpls(pc);
376 dpls[0] = 0;
377 for(int i=1; i<pc; i++)
378 {
379 dpls[i] = mateCol2Row.RowLenUntil(i);
380 sendcnts[i-1] = dpls[i] - dpls[i-1];
381 }
382 sendcnts[pc-1] = RepMateC2R.size() - dpls[pc-1];
383
384 IT localLenC2R = mateCol2Row.LocArrSize();
385 IT* localC2R = (IT*) mateCol2Row.GetLocArr();
386 MPI_Scatterv(RepMateC2R.data(),sendcnts.data(), dpls.data(), MPIType<IT>(), localC2R, localLenC2R, MPIType<IT>(),rowroot, RowWorld);
387}
388
389
390
391inline int ThreadBuffLenForBinning(int itemsize, int nbins)
392{
393 // 1MB shared cache (per 2 cores) in KNL
394#ifndef L2_CACHE_SIZE
395#define L2_CACHE_SIZE 256000
396#endif
397 int THREAD_BUF_LEN = 256;
398 while(true)
399 {
400 int bufferMem = THREAD_BUF_LEN * nbins * itemsize ;
401 if(bufferMem>L2_CACHE_SIZE ) THREAD_BUF_LEN/=2;
402 else break;
403 }
404 THREAD_BUF_LEN = std::min(nbins+1,THREAD_BUF_LEN);
405
406 return THREAD_BUF_LEN;
407}
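/*
 Worked example: with itemsize = 24 bytes (a std::tuple<int64_t,int64_t,double>,
 as used in Phase1) and nbins = 1024 destination processes, the loop halves the
 initial length 256 until THREAD_BUF_LEN * 1024 * 24 fits in L2_CACHE_SIZE =
 256000 bytes: 256 -> 128 -> 64 -> 32 -> 16 -> 8 (8 * 1024 * 24 = 196608), so
 each thread buffers at most min(nbins + 1, 8) = 8 tuples per destination.
*/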
408
409
410
411template <class IT, class NT>
412std::vector< std::tuple<IT,IT,NT> > Phase1(const AWPM_param<IT>& param, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
413{
414
415
416
417
418
419 MPI_Comm World = param.commGrid->GetWorld();
420
421
422
423 //Step 1: Count the amount of data to be sent to different processors
424 std::vector<int> sendcnt(param.nprocs,0); // number items to be sent to each processor
425
426
427#ifdef THREADED
428#pragma omp parallel
429#endif
430 {
431 std::vector<int> tsendcnt(param.nprocs,0);
432#ifdef THREADED
433#pragma omp for
434#endif
435 for(int k=0; k<param.lncol; ++k)
436 {
437 IT mj = RepMateC2R[k]; // lj = k
438 IT j = k + param.localColStart;
439
440 for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
441 {
442 IT li = dcsc->ir[cp];
443 IT i = li + param.localRowStart;
444 IT mi = RepMateR2C[li];
445 if( i > mj) // TODO: stop at the first such entry; maybe use < instead
446 {
447
448 int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
449 int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
450 int owner = param.commGrid->GetRank(rrank , crank);
451 tsendcnt[owner]++;
452 }
453 }
454 }
455 for(int i=0; i<param.nprocs; i++)
456 {
457 __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);
458 }
459 }
460
461
462
463
464 IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
465 std::vector<int> sdispls (param.nprocs, 0);
466 std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);
467
468 std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
469 std::vector<int> transferCount(param.nprocs,0);
470 int THREAD_BUF_LEN = ThreadBuffLenForBinning(24, param.nprocs);
471
472
473 //Step 2: Compile data to be sent to different processors
474#ifdef THREADED
475#pragma omp parallel
476#endif
477 {
478 std::vector<int> tsendcnt(param.nprocs,0);
479 std::vector<std::tuple<IT,IT, NT>> tsendTuples (param.nprocs*THREAD_BUF_LEN);
480#ifdef THREADED
481#pragma omp for
482#endif
483 for(int k=0; k<param.lncol; ++k)
484 {
485 IT mj = RepMateC2R[k];
486 IT lj = k;
487 IT j = k + param.localColStart;
488
489 for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
490 {
491 IT li = dcsc->ir[cp];
492 IT i = li + param.localRowStart;
493 IT mi = RepMateR2C[li];
494 if( i > mj) // TODO: stop at the first such entry; maybe use < instead
495 {
496 double w = dcsc->numx[cp]- RepMateWR2C[li] - RepMateWC2R[lj];
497 int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
498 int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
499 int owner = param.commGrid->GetRank(rrank , crank);
500
501 if (tsendcnt[owner] < THREAD_BUF_LEN)
502 {
503 tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mi, mj, w);
504 tsendcnt[owner]++;
505 }
506 else
507 {
508 int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
509 copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1) , sendTuples.data() + sdispls[owner]+ tt);
510
511 tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mi, mj, w);
512 tsendcnt[owner] = 1;
513 }
514 }
515 }
516 }
517 for(int owner=0; owner < param.nprocs; owner++)
518 {
519 if (tsendcnt[owner] >0)
520 {
521 int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
522 copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner]+ tt);
523 }
524 }
525 }
526
527 // Step 3: Communicate data
528
529 std::vector<int> recvcnt (param.nprocs);
530 std::vector<int> rdispls (param.nprocs, 0);
531
532 MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
533 std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
534 IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));
535
536
537 MPI_Datatype MPI_tuple;
538 MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
539 MPI_Type_commit(&MPI_tuple);
540
541 std::vector< std::tuple<IT,IT,NT> > recvTuples1(totrecv);
542 MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
543 MPI_Type_free(&MPI_tuple);
544 return recvTuples1;
545}
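/*
 Descriptive note: for every locally stored nonzero a(i,j) with i > mate(j),
 Phase1 computes the gain w = a(i,j) - w(i, mate(i)) - w(mate(j), j) and routes
 the triple (mate(i), mate(j), w) to the process whose block contains the
 alternate entry (mate(j), mate(i)). The three steps mirror the comments above:
 count items per destination, pack them through per-thread buffers of
 THREAD_BUF_LEN tuples to limit cache traffic, then exchange with MPI_Alltoallv.
*/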
546
547
548
549
550
551template <class IT, class NT>
552std::vector< std::tuple<IT,IT,IT,NT> > Phase2(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
553{
554
555 MPI_Comm World = param.commGrid->GetWorld();
556
557 double tstart = MPI_Wtime();
558
559 // Step 1: Sort for efficient searching of indices
560 __gnu_parallel::sort(recvTuples.begin(), recvTuples.end());
561 std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (param.nprocs);
562
563 std::vector<int> sendcnt(param.nprocs,0); // number items to be sent to each processor
564
565 //Step 2: Count the amount of data to be sent to different processors
566 // Instead of a binary search in each column, we do a linear search
567 // Linear search is faster here because we need to search 40%-50% of the nonzeros
568 int nBins = 1;
569#ifdef THREADED
570#pragma omp parallel
571 {
572 nBins = omp_get_num_threads() * 4;
573 }
574#endif
575
576#ifdef THREADED
577#pragma omp parallel for
578#endif
579 for(int i=0; i<nBins; i++)
580 {
581 int perBin = recvTuples.size()/nBins;
582 int startBinIndex = perBin * i;
583 int endBinIndex = perBin * (i+1);
584 if(i==nBins-1) endBinIndex = recvTuples.size();
585
586
587 std::vector<int> tsendcnt(param.nprocs,0);
588 for(int k=startBinIndex; k<endBinIndex;)
589 {
590
591 IT mi = std::get<0>(recvTuples[k]);
592 IT lcol = mi - param.localColStart;
593 IT i = RepMateC2R[lcol];
594 IT idx1 = k;
595 IT idx2 = colptr[lcol];
596
597 for(; std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];) //**
598 {
599
600 IT mj = std::get<1>(recvTuples[idx1]) ;
601 IT lrow = mj - param.localRowStart;
602 IT j = RepMateR2C[lrow];
603 IT lrowMat = dcsc->ir[idx2];
604 if(lrowMat == lrow)
605 {
606 NT weight = std::get<2>(recvTuples[idx1]);
607 NT cw = weight + RepMateWR2C[lrow]; //w+W[M'[j],M[i]];
608 if (cw > 0)
609 {
610 int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
611 int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
612 int owner = param.commGrid->GetRank(rrank , crank);
613 tsendcnt[owner]++;
614 }
615
616 idx1++; idx2++;
617 }
618 else if(lrowMat > lrow)
619 idx1 ++;
620 else
621 idx2 ++;
622 }
623
624 for(;std::get<0>(recvTuples[idx1]) == mi ; idx1++);
625 k = idx1;
626
627 }
628 for(int i=0; i<param.nprocs; i++)
629 {
630 __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);
631 }
632 }
633
634
635
636
637 IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
638 std::vector<int> sdispls (param.nprocs, 0);
639 std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);
640
641 std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
642 std::vector<int> transferCount(param.nprocs,0);
643 int THREAD_BUF_LEN = ThreadBuffLenForBinning(32, param.nprocs);
644
645
646 //Step 3: Compile data to be sent to different processors
647#ifdef THREADED
648#pragma omp parallel for
649#endif
650 for(int i=0; i<nBins; i++)
651 {
652 int perBin = recvTuples.size()/nBins;
653 int startBinIndex = perBin * i;
654 int endBinIndex = perBin * (i+1);
655 if(i==nBins-1) endBinIndex = recvTuples.size();
656
657
658 std::vector<int> tsendcnt(param.nprocs,0);
659 std::vector<std::tuple<IT,IT, IT, NT>> tsendTuples (param.nprocs*THREAD_BUF_LEN);
660 for(int k=startBinIndex; k<endBinIndex;)
661 {
662 IT mi = std::get<0>(recvTuples[k]);
663 IT lcol = mi - param.localColStart;
664 IT i = RepMateC2R[lcol];
665 IT idx1 = k;
666 IT idx2 = colptr[lcol];
667
668 for(; std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];) //**
669 {
670
671 IT mj = std::get<1>(recvTuples[idx1]) ;
672 IT lrow = mj - param.localRowStart;
673 IT j = RepMateR2C[lrow];
674 IT lrowMat = dcsc->ir[idx2];
675 if(lrowMat == lrow)
676 {
677 NT weight = std::get<2>(recvTuples[idx1]);
678 NT cw = weight + RepMateWR2C[lrow]; //w+W[M'[j],M[i]];
679 if (cw > 0)
680 {
681 int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
682 int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
683 int owner = param.commGrid->GetRank(rrank , crank);
684
685 if (tsendcnt[owner] < THREAD_BUF_LEN)
686 {
687 tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mj, mi, i, cw);
688 tsendcnt[owner]++;
689 }
690 else
691 {
692 int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
693 std::copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1) , sendTuples.data() + sdispls[owner]+ tt);
694
695 tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mj, mi, i, cw);
696 tsendcnt[owner] = 1;
697 }
698
699 }
700
701 idx1++; idx2++;
702 }
703 else if(lrowMat > lrow)
704 idx1 ++;
705 else
706 idx2 ++;
707 }
708
709 for(;std::get<0>(recvTuples[idx1]) == mi ; idx1++);
710 k = idx1;
711 }
712
713 for(int owner=0; owner < param.nprocs; owner++)
714 {
715 if (tsendcnt[owner] >0)
716 {
717 int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
718 std::copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner]+ tt);
719 }
720 }
721 }
722
723 // Step 4: Communicate data
724
725 std::vector<int> recvcnt (param.nprocs);
726 std::vector<int> rdispls (param.nprocs, 0);
727
728 MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
729 std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
730 IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));
731
732
733 MPI_Datatype MPI_tuple;
734 MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
735 MPI_Type_commit(&MPI_tuple);
736
737 std::vector< std::tuple<IT,IT,IT,NT> > recvTuples1(totrecv);
738 MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
739 MPI_Type_free(&MPI_tuple);
740 return recvTuples1;
741}
742
743
744// Old version of Phase 2
745// Not multithreaded (uses binary search)
746template <class IT, class NT>
747std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> Phase2_old(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
748{
749
750 std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (param.nprocs);
751 for(int k=0; k<recvTuples.size(); ++k)
752 {
753 IT mi = std::get<0>(recvTuples[k]) ;
754 IT mj = std::get<1>(recvTuples[k]) ;
755 IT i = RepMateC2R[mi - param.localColStart];
756 NT weight = std::get<2>(recvTuples[k]);
757
758 if(colptr[mi- param.localColStart+1] > colptr[mi- param.localColStart] )
759 {
760 IT * ele = find(dcsc->ir+colptr[mi - param.localColStart], dcsc->ir+colptr[mi - param.localColStart+1], mj - param.localRowStart);
761
762 // TODO: Add a function that returns the edge weight directly
763 if (ele != dcsc->ir+colptr[mi - param.localColStart+1])
764 {
765 NT cw = weight + RepMateWR2C[mj - param.localRowStart]; //w+W[M'[j],M[i]];
766 if (cw > 0)
767 {
768 IT j = RepMateR2C[mj - param.localRowStart];
769 int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
770 int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
771 int owner = param.commGrid->GetRank(rrank , crank);
772 tempTuples1[owner].push_back(make_tuple(mj, mi, i, cw));
773 }
774 }
775 }
776 }
777
778 return tempTuples1;
779}
780
781template <class IT, class NT, class DER>
782void TwoThirdApprox(SpParMat < IT, NT, DER > & A, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row)
783{
784
785 // Information about CommGrid and matrix layout
786 // Assume that processes are laid in (pr x pc) process grid
787 auto commGrid = A.getcommgrid();
788 int myrank=commGrid->GetRank();
789 MPI_Comm World = commGrid->GetWorld();
790 MPI_Comm ColWorld = commGrid->GetColWorld();
791 MPI_Comm RowWorld = commGrid->GetRowWorld();
792 int nprocs = commGrid->GetSize();
793 int pr = commGrid->GetGridRows();
794 int pc = commGrid->GetGridCols();
795 int rowrank = commGrid->GetRankInProcRow();
796 int colrank = commGrid->GetRankInProcCol();
797 int diagneigh = commGrid->GetComplementRank();
798
799 //Information about the matrix distribution
800 //Assume that A is an nrow x ncol matrix
801 //The local submatrix is an lnrow x lncol matrix
802 IT nrows = A.getnrow();
803 IT ncols = A.getncol();
804 IT nnz = A.getnnz();
805 IT m_perproc = nrows / pr;
806 IT n_perproc = ncols / pc;
807 DER* spSeq = A.seqptr(); // local submatrix
808 Dcsc<IT, NT>* dcsc = spSeq->GetDCSC();
809 IT lnrow = spSeq->getnrow();
810 IT lncol = spSeq->getncol();
811 IT localRowStart = colrank * m_perproc; // first row in this process
812 IT localColStart = rowrank * n_perproc; // first col in this process
813
814 AWPM_param<IT> param;
815 param.nprocs = nprocs;
816 param.pr = pr;
817 param.pc = pc;
818 param.lncol = lncol;
819 param.lnrow = lnrow;
820 param.m_perproc = m_perproc;
821 param.n_perproc = n_perproc;
822 param.localRowStart = localRowStart;
823 param.localColStart = localColStart;
824 param.myrank = myrank;
825 param.commGrid = commGrid;
826
827
828 // -----------------------------------------------------------
829 // replicate mate vectors for mateCol2Row
830 // Communication cost: same as the first communication of SpMV
831 // -----------------------------------------------------------
832 int xsize = (int) mateCol2Row.LocArrSize();
833 int trxsize = 0;
834 MPI_Status status;
835 MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh, TRX, &trxsize, 1, MPI_INT, diagneigh, TRX, World, &status);
836 std::vector<IT> trxnums(trxsize);
837 MPI_Sendrecv(mateCol2Row.GetLocArr(), xsize, MPIType<IT>(), diagneigh, TRX, trxnums.data(), trxsize, MPIType<IT>(), diagneigh, TRX, World, &status);
838
839
840 std::vector<int> colsize(pc);
841 colsize[colrank] = trxsize;
842 MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize.data(), 1, MPI_INT, ColWorld);
843 std::vector<int> dpls(pc,0); // displacements (zero initialized pid)
844 std::partial_sum(colsize.data(), colsize.data()+pc-1, dpls.data()+1);
845 int accsize = std::accumulate(colsize.data(), colsize.data()+pc, 0);
846 std::vector<IT> RepMateC2R(accsize);
847 MPI_Allgatherv(trxnums.data(), trxsize, MPIType<IT>(), RepMateC2R.data(), colsize.data(), dpls.data(), MPIType<IT>(), ColWorld);
848 // -----------------------------------------------------------
849
850
851 // -----------------------------------------------------------
852 // replicate mate vectors for mateRow2Col
853 // Communication cost: same as the first communication of SpMV
854 // (minus the cost of transposing the vector)
855 // -----------------------------------------------------------
856
857
858 xsize = (int) mateRow2Col.LocArrSize();
859
860 std::vector<int> rowsize(pr);
861 rowsize[rowrank] = xsize;
862 MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, rowsize.data(), 1, MPI_INT, RowWorld);
863 std::vector<int> rdpls(pr,0); // displacements (zero initialized pid)
864 std::partial_sum(rowsize.data(), rowsize.data()+pr-1, rdpls.data()+1);
865 accsize = std::accumulate(rowsize.data(), rowsize.data()+pr, 0);
866 std::vector<IT> RepMateR2C(accsize);
867 MPI_Allgatherv(mateRow2Col.GetLocArr(), xsize, MPIType<IT>(), RepMateR2C.data(), rowsize.data(), rdpls.data(), MPIType<IT>(), RowWorld);
868 // -----------------------------------------------------------
869
870
871
872 // Getting column pointers for all columns (for CSC-style access)
873 std::vector<IT> colptr (lncol+1,-1);
874 for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit) // iterate over all columns
875 {
876 IT lj = colit.colid(); // local numbering
877
878 colptr[lj] = colit.colptr();
879 }
880 colptr[lncol] = spSeq->getnnz();
881 for(IT k=lncol-1; k>=0; k--)
882 {
883 if(colptr[k] == -1)
884 {
885 colptr[k] = colptr[k+1];
886 }
887 }
888 // TODO: will this fail for an empty local matrix, where every entry of colptr will be zero?
889
890
891 // -----------------------------------------------------------
892 // replicate weights of mates
893 // -----------------------------------------------------------
894 std::vector<NT> RepMateWR2C(lnrow);
895 std::vector<NT> RepMateWC2R(lncol);
896 ReplicateMateWeights(param, dcsc, colptr, RepMateC2R, RepMateWR2C, RepMateWC2R);
897
898
899 int iterations = 0;
900 NT minw;
901 NT weightCur = MatchingWeight(RepMateWC2R, RowWorld, minw);
902 NT weightPrev = weightCur - 999999999999;
903 while(weightCur > weightPrev && iterations++ < 10)
904 {
905
906
907 if(myrank==0) std::cout << "Iteration " << iterations << ". matching weight: sum = "<< weightCur << " min = " << minw << std::endl;
908 // C requests
909 // each row is for a processor where C requests will be sent to
910 double tstart;
911
912
913 std::vector<std::tuple<IT,IT,NT>> recvTuples = Phase1(param, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R );
914 std::vector<std::tuple<IT,IT,IT,NT>> recvTuples1 = Phase2(param, recvTuples, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R );
915 std::vector< std::tuple<IT,IT,NT> >().swap(recvTuples);
916
917
918
919 tstart = MPI_Wtime();
920
921 std::vector<std::tuple<IT,IT,IT,NT>> bestTuplesPhase3 (lncol);
922#ifdef THREADED
923#pragma omp parallel for
924#endif
925 for(int k=0; k<lncol; ++k)
926 {
927 bestTuplesPhase3[k] = std::make_tuple(-1,-1,-1,0); // fix this
928 }
929
930 for(int k=0; k<recvTuples1.size(); ++k)
931 {
932 IT mj = std::get<0>(recvTuples1[k]) ;
933 IT mi = std::get<1>(recvTuples1[k]) ;
934 IT i = std::get<2>(recvTuples1[k]) ;
935 NT weight = std::get<3>(recvTuples1[k]);
936 IT j = RepMateR2C[mj - localRowStart];
937 IT lj = j - localColStart;
938
939 // we can get rid of the first check if edge weights are non-negative
940 if( (std::get<0>(bestTuplesPhase3[lj]) == -1) || (weight > std::get<3>(bestTuplesPhase3[lj])) )
941 {
942 bestTuplesPhase3[lj] = std::make_tuple(i,mi,mj,weight);
943 }
944 }
945
946 std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (nprocs);
947 for(int k=0; k<lncol; ++k)
948 {
949 if( std::get<0>(bestTuplesPhase3[k]) != -1)
950 {
951 //IT j = RepMateR2C[mj - localRowStart]; /// fix me
952
953 IT i = std::get<0>(bestTuplesPhase3[k]) ;
954 IT mi = std::get<1>(bestTuplesPhase3[k]) ;
955 IT mj = std::get<2>(bestTuplesPhase3[k]) ;
956 IT j = RepMateR2C[mj - localRowStart];
957 NT weight = std::get<3>(bestTuplesPhase3[k]);
958 int owner = OwnerProcs(A, i, mi, nrows, ncols);
959
960 tempTuples1[owner].push_back(std::make_tuple(i, j, mj, weight));
961 }
962 }
963
964 //vector< tuple<IT,IT,IT, NT> >().swap(recvTuples1);
965 recvTuples1 = ExchangeData1(tempTuples1, World);
966
967 std::vector<std::tuple<IT,IT,IT,IT, NT>> bestTuplesPhase4 (lncol);
968 // we could have used lnrow in both bestTuplesPhase3 and bestTuplesPhase4
969
970 // Phase 4
971 // at the owner of (i,mi)
972#ifdef THREADED
973#pragma omp parallel for
974#endif
975 for(int k=0; k<lncol; ++k)
976 {
977 bestTuplesPhase4[k] = std::make_tuple(-1,-1,-1,-1,0);
978 }
979
980 for(int k=0; k<recvTuples1.size(); ++k)
981 {
982 IT i = std::get<0>(recvTuples1[k]) ;
983 IT j = std::get<1>(recvTuples1[k]) ;
984 IT mj = std::get<2>(recvTuples1[k]) ;
985 IT mi = RepMateR2C[i-localRowStart];
986 NT weight = std::get<3>(recvTuples1[k]);
987 IT lmi = mi - localColStart;
988 //IT lj = j - localColStart;
989
990 // cout <<"****" << i << " " << mi << " "<< j << " " << mj << " " << get<0>(bestTuplesPhase4[lj]) << endl;
991 // we can get rid of the first check if edge weights are non-negative
992 if( ((std::get<0>(bestTuplesPhase4[lmi]) == -1) || (weight > std::get<4>(bestTuplesPhase4[lmi]))) && std::get<0>(bestTuplesPhase3[lmi])==-1 )
993 {
994 bestTuplesPhase4[lmi] = std::make_tuple(i,j,mi,mj,weight);
995 //cout << "(("<< i << " " << mi << " "<< j << " " << mj << "))"<< endl;
996 }
997 }
998
999
1000 std::vector<std::vector<std::tuple<IT,IT,IT, IT>>> winnerTuples (nprocs);
1001
1002
1003 for(int k=0; k<lncol; ++k)
1004 {
1005 if( std::get<0>(bestTuplesPhase4[k]) != -1)
1006 {
1007 //int owner = OwnerProcs(A, get<0>(bestTuples[k]), get<1>(bestTuples[k]), nrows, ncols); // (i,mi)
1008 //tempTuples[owner].push_back(bestTuples[k]);
1009 IT i = std::get<0>(bestTuplesPhase4[k]) ;
1010 IT j = std::get<1>(bestTuplesPhase4[k]) ;
1011 IT mi = std::get<2>(bestTuplesPhase4[k]) ;
1012 IT mj = std::get<3>(bestTuplesPhase4[k]) ;
1013
1014
1015 int owner = OwnerProcs(A, mj, j, nrows, ncols);
1016 winnerTuples[owner].push_back(std::make_tuple(i, j, mi, mj));
1017
1019 // passing the opposite of the matching to the owner of (i,mi)
1020 owner = OwnerProcs(A, i, mi, nrows, ncols);
1021 winnerTuples[owner].push_back(std::make_tuple(mj, mi, j, i));
1022 }
1023 }
1024
1025
1026 //vector< tuple<IT,IT,IT, NT> >().swap(recvTuples1);
1027
1028 std::vector<std::tuple<IT,IT,IT,IT>> recvWinnerTuples = ExchangeData1(winnerTuples, World);
1029
1030 // at the owner of (mj,j)
1031 std::vector<std::tuple<IT,IT>> rowBcastTuples(recvWinnerTuples.size()); //(mi,mj)
1032 std::vector<std::tuple<IT,IT>> colBcastTuples(recvWinnerTuples.size()); //(j,i)
1033#ifdef THREADED
1034#pragma omp parallel for
1035#endif
1036 for(int k=0; k<recvWinnerTuples.size(); ++k)
1037 {
1038 IT i = std::get<0>(recvWinnerTuples[k]) ;
1039 IT j = std::get<1>(recvWinnerTuples[k]) ;
1040 IT mi = std::get<2>(recvWinnerTuples[k]) ;
1041 IT mj = std::get<3>(recvWinnerTuples[k]);
1042
1043 colBcastTuples[k] = std::make_tuple(j,i);
1044 rowBcastTuples[k] = std::make_tuple(mj,mi);
1045 }
1046
1047 std::vector<std::tuple<IT,IT>> updatedR2C = MateBcast(rowBcastTuples, RowWorld);
1048 std::vector<std::tuple<IT,IT>> updatedC2R = MateBcast(colBcastTuples, ColWorld);
1049
1050#ifdef THREADED
1051#pragma omp parallel for
1052#endif
1053 for(int k=0; k<updatedR2C.size(); k++)
1054 {
1055 IT row = std::get<0>(updatedR2C[k]);
1056 IT mate = std::get<1>(updatedR2C[k]);
1057 RepMateR2C[row-localRowStart] = mate;
1058 }
1059
1060#ifdef THREADED
1061#pragma omp parallel for
1062#endif
1063 for(int k=0; k<updatedC2R.size(); k++)
1064 {
1065 IT col = std::get<0>(updatedC2R[k]);
1066 IT mate = std::get<1>(updatedC2R[k]);
1067 RepMateC2R[col-localColStart] = mate;
1068 }
1069
1070
1071 // update weights of matched edges
1072 // we can do better than this since we are doing sparse updates
1073 ReplicateMateWeights(param, dcsc, colptr, RepMateC2R, RepMateWR2C, RepMateWC2R);
1074
1075 weightPrev = weightCur;
1076 weightCur = MatchingWeight(RepMateWC2R, RowWorld, minw);
1077
1078
1079 //UpdateMatching(mateRow2Col, mateCol2Row, RepMateR2C, RepMateC2R);
1080 //CheckMatching(mateRow2Col,mateCol2Row);
1081
1082 }
1083
1084 // update the distributed mate vectors from replicated mate vectors
1085 UpdateMatching(mateRow2Col, mateCol2Row, RepMateR2C, RepMateC2R);
1086 //weightCur = MatchingWeight(RepMateWC2R, RowWorld);
1087
1088
1089
1090}
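/*
 Descriptive note on the iteration above: each pass runs Phase1 and Phase2 to
 collect candidate weight-increasing alternations, keeps the best candidate per
 local column (Phase 3), confirms it at the owner of (i, mate(i)) (Phase 4),
 broadcasts the winning pairs along process rows and columns, and refreshes the
 replicated mate weights. The loop stops when the total matching weight no
 longer improves or after 10 iterations; only then are the replicated mates
 written back into the distributed vectors by UpdateMatching.
*/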
1091
1092
1093 template <class IT, class NT, class DER>
1094 void TransformWeight(SpParMat < IT, NT, DER > & A, bool applylog)
1095 {
1096 //A.Apply([](NT val){return log(1+abs(val));});
1097 // if the matrix has explicit zero entries, we can still have problem.
1098 // One solution is to remove explicit zero entries before cardinality matching (to be tested)
1099 //A.Apply([](NT val){if(val==0) return log(numeric_limits<NT>::min()); else return log(fabs(val));});
1100 A.Apply([](NT val){return (fabs(val));});
1101
1102 FullyDistVec<IT, NT> maxvRow(A.getcommgrid());
1103 A.Reduce(maxvRow, Row, maximum<NT>(), static_cast<NT>(numeric_limits<NT>::lowest()));
1104 A.DimApply(Row, maxvRow, [](NT val, NT maxval){return val/maxval;});
1105
1106 FullyDistVec<IT, NT> maxvCol(A.getcommgrid());
1107 A.Reduce(maxvCol, Column, maximum<NT>(), static_cast<NT>(numeric_limits<NT>::lowest()));
1108 A.DimApply(Column, maxvCol, [](NT val, NT maxval){return val/maxval;});
1109
1110 if(applylog)
1111 A.Apply([](NT val){return log(val);});
1112
1113 }
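/*
 Descriptive note: entries are replaced by their absolute values and scaled by
 the row and column maxima, so (barring explicit zeros, see the comment above)
 every entry lies in (0, 1]. With applylog the weights become log(a_ij), and
 since maximizing a product of positive matched entries is equivalent to
 maximizing the sum of their logarithms, the additive weight optimized by
 TwoThirdApprox then corresponds to maximizing the product of the matched entries.
*/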
1114
1115 template <class IT, class NT>
1116 void AWPM(SpParMat < IT, NT, SpDCCols<IT, NT> > & A1, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, bool optimizeProd=true)
1117 {
1118 SpParMat < IT, NT, SpDCCols<IT, NT> > A(A1); // creating a copy because it is being transformed
1119
1120 if(optimizeProd)
1121 TransformWeight(A, true);
1122 else
1123 TransformWeight(A, false);
1124 SpParMat < IT, NT, SpCCols<IT, NT> > Acsc(A);
1125 SpParMat < IT, NT, SpDCCols<IT, bool> > Abool(A);
1126 FullyDistVec<IT, IT> degCol(A.getcommgrid());
1127 Abool.Reduce(degCol, Column, plus<IT>(), static_cast<IT>(0));
1128 double ts;
1129
1130 // Compute the initial trace
1131 IT diagnnz;
1132 double origWeight = Trace(A, diagnnz);
1133 bool isOriginalPerfect = diagnnz==A.getnrow();
1134
1135 // compute the maximal matching
1136 WeightedGreedy(Acsc, mateRow2Col, mateCol2Row, degCol);
1137 double mclWeight = MatchingWeight( A, mateRow2Col, mateCol2Row);
1138 SpParHelper::Print("After Greedy sanity check\n");
1139 bool isPerfectMCL = CheckMatching(mateRow2Col,mateCol2Row);
1140
1141 // if the original matrix has a perfect matching and better weight
1142 if(isOriginalPerfect && mclWeight<=origWeight)
1143 {
1144 SpParHelper::Print("Maximal is not better that the natural ordering. Hence, keeping the natural ordering.\n");
1145 mateRow2Col.iota(A.getnrow(), 0);
1146 mateCol2Row.iota(A.getncol(), 0);
1147 mclWeight = origWeight;
1148 isPerfectMCL = true;
1149 }
1150
1151
1152 // MCM
1153 double tmcm = 0;
1154 double mcmWeight = mclWeight;
1155 if(!isPerfectMCL) // run MCM only if we don't have a perfect matching
1156 {
1157 ts = MPI_Wtime();
1158 maximumMatching(Acsc, mateRow2Col, mateCol2Row, true, false, true);
1159 tmcm = MPI_Wtime() - ts;
1160 mcmWeight = MatchingWeight( A, mateRow2Col, mateCol2Row) ;
1161 SpParHelper::Print("After MCM sanity check\n");
1162 CheckMatching(mateRow2Col,mateCol2Row);
1163 }
1164
1165
1166 // AWPM
1167 ts = MPI_Wtime();
1168 TwoThirdApprox(A, mateRow2Col, mateCol2Row);
1169 double tawpm = MPI_Wtime() - ts;
1170
1171 double awpmWeight = MatchingWeight( A, mateRow2Col, mateCol2Row) ;
1172 SpParHelper::Print("After AWPM sanity check\n");
1173 CheckMatching(mateRow2Col,mateCol2Row);
1174 if(isOriginalPerfect && awpmWeight<origWeight) // keep original
1175 {
1176 SpParHelper::Print("AWPM is not better that the natural ordering. Hence, keeping the natural ordering.\n");
1177 mateRow2Col.iota(A.getnrow(), 0);
1178 mateCol2Row.iota(A.getncol(), 0);
1179 awpmWeight = origWeight;
1180 }
1181
1182 }
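/*
 Usage sketch (illustrative; assumes the mate vectors are created on the
 matrix's communication grid and initialized to -1, i.e. unmatched, as in the
 CombBLAS matching drivers):

   // given a distributed matrix A of type SpParMat<int64_t,double,SpDCCols<int64_t,double>>
   FullyDistVec<int64_t, int64_t> mateRow2Col(A.getcommgrid(), A.getnrow(), (int64_t) -1);
   FullyDistVec<int64_t, int64_t> mateCol2Row(A.getcommgrid(), A.getncol(), (int64_t) -1);
   AWPM(A, mateRow2Col, mateCol2Row);   // works on an internal copy of A
   // afterwards mateRow2Col[i] is the column matched to row i, and vice versa
*/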
1183
1184}
1185
1186#endif /* ApproxWeightPerfectMatching_h */