30#ifndef _PAR_FRIENDS_H_
31#define _PAR_FRIENDS_H_
47template <
class IT,
class NT,
class DER>
58template <
typename IT,
typename NT>
66 else if (
vecs.size() < 2)
73 typename std::vector< FullyDistVec<IT,NT> >::iterator
it =
vecs.begin();
93 std::vector< std::vector< NT > > data(nprocs);
94 std::vector< std::vector< IT > > inds(nprocs);
104 data[
owner].push_back(
it->arr[i]);
110 int *
sendcnt =
new int[nprocs];
111 int *
sdispls =
new int[nprocs];
112 for(
int i=0; i<nprocs; ++i)
113 sendcnt[i] = (
int) data[i].size();
115 int *
rdispls =
new int[nprocs];
116 int *
recvcnt =
new int[nprocs];
120 for(
int i=0; i<nprocs-1; ++i)
127 for(
int i=0; i<nprocs; ++i)
130 std::vector<NT>().swap(data[i]);
137 for(
int i=0; i<nprocs; ++i)
140 std::vector<IT>().swap(inds[i]);
146 for(
int i=0; i<nprocs; ++i)
158template <
typename MATRIXA,
typename MATRIXB>
161 if(
A.getncol() !=
B.getnrow())
163 std::ostringstream
outs;
164 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
165 outs <<
A.getncol() <<
" != " <<
B.getnrow() << std::endl;
170 if((
void*) &
A == (
void*) &
B)
172 std::ostringstream
outs;
173 outs <<
"Can not multiply, inputs alias (make a temporary copy of one of them first)"<< std::endl;
183template <
typename IT,
typename NT,
typename DER>
225 std::ostringstream
outs;
226 outs <<
"Number of columns needing recovery: " <<
nrecover << std::endl;
239 true,
static_cast<NT>(-1));
261 std::ostringstream
outs;
262 outs <<
"Number of columns needing selection: " <<
nselect << std::endl;
311 outs1 <<
"Number of columns needing recovery after selection: " <<
nselect << std::endl;
325 A.PruneColumn(
pruneCols, std::less<NT>(),
true);
348template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
354 if(
A.getncol() !=
B.getnrow())
356 std::ostringstream
outs;
357 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
358 outs <<
A.getncol() <<
" != " <<
B.getnrow() << std::endl;
365 SpParHelper::Print(
"MemEfficientSpGEMM: The value of phases is too small or large. Resetting to 1.\n");
417 std::cout <<
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n Warning: input and output memory requirement is greater than per-process avaiable memory. Keeping phase to the value supplied at the command line. The program may go out of memory and crash! \n !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl;
419#ifdef SHOW_MEMORY_USAGE
422 std::cout <<
"phases: " << phases <<
": per process memory: " <<
perProcessMemory <<
" GB asquareMem: " <<
asquareMem/1000000000.00 <<
" GB" <<
" inputMem: " <<
inputMem/1000000000.00 <<
" GB" <<
" outputMem: " <<
outputMem/1000000000.00 <<
" GB" <<
" kselectmem: " <<
kselectmem/1000000000.00 <<
" GB" << std::endl;
424 std::cout <<
"phases: " << phases <<
": per process memory: " <<
perProcessMemory <<
" GB asquareMem: " <<
asquareMem/1000000.00 <<
" MB" <<
" inputMem: " <<
inputMem/1000000.00 <<
" MB" <<
" outputMem: " <<
outputMem/1000000.00 <<
" MB" <<
" kselectmem: " <<
kselectmem/1000000.00 <<
" MB" << std::endl;
430 IU C_m =
A.spSeq->getnrow();
431 IU C_n =
B.spSeq->getncol();
456 for(
int p = 0; p< phases; ++p)
459 std::vector< SpTuples<IU,NUO> *>
tomerge;
460 for(
int i = 0; i <
stages; ++i)
466 ess.resize(UDERA::esscount);
467 for(
int j=0;
j< UDERA::esscount; ++
j)
485 ess.resize(UDERB::esscount);
486 for(
int j=0;
j< UDERB::esscount; ++
j)
517#ifdef SHOW_MEMORY_USAGE
519 for(
size_t i = 0; i <
tomerge.size(); ++i)
529 std::cout << p+1 <<
". unmerged: " <<
summa_memory/1000000000.00 <<
"GB " ;
531 std::cout << p+1 <<
". unmerged: " <<
summa_memory/1000000.00 <<
" MB " ;
543#ifdef SHOW_MEMORY_USAGE
554 std::cout <<
" merged: " <<
merge_memory/1000000000.00 <<
"GB " ;
556 std::cout <<
" merged: " <<
merge_memory/1000000.00 <<
" MB " ;
572#ifdef SHOW_MEMORY_USAGE
585 std::cout <<
"Prune: " <<
prune_memory/1000000000.00 <<
"GB " << std::endl ;
587 std::cout <<
"Prune: " <<
prune_memory/1000000.00 <<
" MB " << std::endl ;
616template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
628 IU C_m =
A.spSeq->getnrow();
629 IU C_n =
B.spSeq->getncol();
636 const_cast< UDERB*
>(
B.spSeq)->Transpose();
649 std::vector< SpTuples<IU,NUO> *>
tomerge;
654 for(
int i = 0; i <
stages; ++i)
663 ess.resize(UDERA::esscount);
664 for(
int j=0;
j< UDERA::esscount; ++
j)
678 ess.resize(UDERB::esscount);
679 for(
int j=0;
j< UDERB::esscount; ++
j)
707 for(
int i = 0; i <
stages; ++i)
716 ess.resize(UDERA::esscount);
717 for(
int j=0;
j< UDERA::esscount; ++
j)
733 ess.resize(UDERB::esscount);
734 for(
int j=0;
j< UDERB::esscount; ++
j)
778 const_cast< UDERB*
>(
B.spSeq)->Transpose();
791template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
802 IU C_m =
A.spSeq->getnrow();
803 IU C_n =
B.spSeq->getncol();
817 std::vector< SpTuples<IU,NUO> *>
tomerge;
822 for(
int i = 0; i <
stages; ++i)
831 ess.resize(UDERA::esscount);
832 for(
int j=0;
j< UDERA::esscount; ++
j)
848 ess.resize(UDERB::esscount);
849 for(
int j=0;
j< UDERB::esscount; ++
j)
876 #ifdef COMBBLAS_DEBUG
877 std::ostringstream
outs;
878 outs << i <<
"th SUMMA iteration"<< std::endl;
915 template <
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
921 if(
A.getncol() !=
B.getnrow())
923 std::ostringstream
outs;
924 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
925 outs <<
A.getncol() <<
" != " <<
B.getnrow() << std::endl;
949 for(
int i = 0; i <
stages; ++i)
958 ess.resize(UDERA::esscount);
959 for(
int j=0;
j< UDERA::esscount; ++
j)
975 ess.resize(UDERB::esscount);
976 for(
int j=0;
j< UDERB::esscount; ++
j)
989 IU nzc =
BRecv->GetDCSC()->nzc;
992#pragma omp parallel for reduction (+:nnzC_stage)
994 for (
IU k=0; k<nzc; k++)
1019template <
typename MATRIX,
typename VECTOR>
1022 if(
A.getncol() !=
x.TotalLength())
1024 std::ostringstream
outs;
1025 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
1026 outs <<
A.getncol() <<
" != " <<
x.TotalLength() << std::endl;
1030 if(! ( *(
A.getcommgrid()) == *(
x.getcommgrid())) )
1032 std::cout <<
"Grids are not comparable for SpMV" << std::endl;
1038template <
typename SR,
typename IU,
typename NUM,
typename UDER>
1039FullyDistSpVec<IU,typename promote_trait<NUM,IU>::T_promote>
SpMV
1040 (
const SpParMat<IU,NUM,UDER> &
A,
const FullyDistSpVec<IU,IU> & x,
bool indexisvalue, OptBuf<
int32_t,
typename promote_trait<NUM,IU>::T_promote > & optbuf);
1042template <
typename SR,
typename IU,
typename NUM,
typename UDER>
1056template<
typename IU,
typename NV>
1063 int diagneigh =
x.
commGrid->GetComplementRank();
1066 MPI_Sendrecv(&
roffst, 1,
MPIType<int32_t>(), diagneigh,
TROST, &
roffset, 1,
MPIType<int32_t>(), diagneigh,
TROST,
World, &
status);
1067 MPI_Sendrecv(&
xlocnz, 1,
MPIType<int32_t>(), diagneigh,
TRNNZ, &
trxlocnz, 1,
MPIType<int32_t>(), diagneigh,
TRNNZ,
World, &
status);
1068 MPI_Sendrecv(&
luntil, 1,
MPIType<IU>(), diagneigh,
TRLUT, &
lenuntil, 1,
MPIType<IU>(), diagneigh,
TRLUT,
World, &
status);
1075#pragma omp parallel for
1077 for(
int i=0; i<
xlocnz; ++i)
1079 MPI_Sendrecv(
temp_xind,
xlocnz,
MPIType<int32_t>(), diagneigh,
TRI,
trxinds,
trxlocnz,
MPIType<int32_t>(), diagneigh,
TRI,
World, &
status);
1084 MPI_Sendrecv(
const_cast<NV*
>(
SpHelper::p2a(
x.num)),
xlocnz,
MPIType<NV>(), diagneigh,
TRX,
trxnums,
trxlocnz,
MPIType<NV>(), diagneigh,
TRX,
World, &
status);
1098template<
typename IU,
typename NV>
1102 int colneighs, colrank;
1105 int *
colnz =
new int[colneighs];
1108 int *
dpls =
new int[colneighs]();
1132 for(
int i=0; i<
accnz; ++i)
1157template<
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1163 if(
A.spSeq->getnsplit() > 0)
1176 if(
A.spSeq->getnsplit() > 0)
1189 std::vector< int32_t >
indy;
1190 std::vector< OVT >
numy;
1225template <
typename SR,
typename IU,
typename OVT>
1238 for(
int i=0; i<
veclen; i++)
1247 int32_t inf = std::numeric_limits<int32_t>::min();
1248 int32_t sup = std::numeric_limits<int32_t>::max();
1251 for(
int i=0; i<
nlists; ++i)
1297template <
typename SR,
typename IU,
typename OVT>
1312#pragma omp parallel for
1314 for(
int i=0; i<
veclen; i++)
1332 for(
int k=0; k<
nlists; k++)
1336#pragma omp parallel for
1350#pragma omp parallel for schedule(dynamic)
1378#pragma omp parallel for schedule(dynamic)
1393template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1399 y.glen =
A.getnrow();
1444 LocalSpMV<SR>(
A,
rowneighs,
optbuf,
indacc,
numacc,
sendindbuf,
sendnumbuf,
sdispls,
sendcnt,
accnz,
indexisvalue,
SPA);
1461#pragma omp parallel for
1463 for(
int i=0; i<
sendcnt[0]; i++)
1472#pragma omp parallel for
1474 for(
int i=0; i<
sendcnt[0]; i++)
1524 std::vector<IU>().swap(
y.ind);
1525 std::vector<OVT>().swap(
y.num);
1531#pragma omp parallel for
1553template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1560template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1568template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1580template <
typename SR,
typename IU,
typename NUM,
typename UDER>
1593template <
typename SR,
typename IU,
typename NUM,
typename NUV,
typename UDER>
1607 int diagneigh =
x.
commGrid->GetComplementRank();
1609 MPI_Sendrecv(&
xsize, 1,
MPI_INT, diagneigh,
TRX, &
trxsize, 1,
MPI_INT, diagneigh,
TRX,
World, &
status);
1612 MPI_Sendrecv(
const_cast<NUV*
>(
SpHelper::p2a(
x.arr)),
xsize,
MPIType<NUV>(), diagneigh,
TRX,
trxnums,
trxsize,
MPIType<NUV>(), diagneigh,
TRX,
World, &
status);
1614 int colneighs, colrank;
1617 int *
colsize =
new int[colneighs];
1620 int *
dpls =
new int[colneighs]();
1629 T_promote
id = SR::id();
1673template <
typename SR,
typename IU,
typename NUM,
typename NUV,
typename UDER>
1689 int diagneigh =
x.
commGrid->GetComplementRank();
1691 MPI_Sendrecv(&
xlocnz, 1,
MPI_INT, diagneigh,
TRX, &
trxlocnz, 1,
MPI_INT, diagneigh,
TRX,
World, &
status);
1692 MPI_Sendrecv(&
roffst, 1,
MPI_INT, diagneigh,
TROST, &
offset, 1,
MPI_INT, diagneigh,
TROST,
World, &
status);
1696 MPI_Sendrecv(
const_cast<IU*
>(
SpHelper::p2a(
x.ind)),
xlocnz,
MPIType<IU>(), diagneigh,
TRX,
trxinds,
trxlocnz,
MPIType<IU>(), diagneigh,
TRX,
World, &
status);
1697 MPI_Sendrecv(
const_cast<NUV*
>(
SpHelper::p2a(
x.num)),
xlocnz,
MPIType<NUV>(), diagneigh,
TRX,
trxnums,
trxlocnz,
MPIType<NUV>(), diagneigh,
TRX,
World, &
status);
1700 int colneighs, colrank;
1703 int *
colnz =
new int[colneighs];
1706 int *
dpls =
new int[colneighs]();
1719 std::vector< int32_t >
indy;
1720 std::vector< T_promote >
numy;
1738 typename std::vector<int32_t>::size_type
outnz =
indy.size();
1739 for(
typename std::vector<IU>::size_type i=0; i<
outnz; ++i)
1772 std::vector<IU>().swap(
sendind[i]);
1777 std::vector<T_promote>().swap(
sendnum[i]);
1788 bool * isthere =
new bool[
ysize];
1790 std::fill_n(isthere,
ysize,
false);
1810 for(
int i=0; i<
nnzy; ++i)
1820template <
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
1834 std::cout <<
"Grids are not comparable elementwise multiplication" << std::endl;
1840template <
typename RETT,
typename RETDER,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB,
typename _BinaryOperation>
1851 std::cout <<
"Grids are not comparable elementwise apply" << std::endl;
1857template <
typename RETT,
typename RETDER,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB,
typename _BinaryOperation,
typename _BinaryPredicate>
1859 (
const SpParMat<IU,NU1,UDERA> &
A,
const SpParMat<IU,NU2,UDERB> &
B,
_BinaryOperation __binary_op,
_BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1&
ANullVal,
const NU2&
BNullVal,
const bool allowIntersect,
const bool useExtendedBinOp)
1868 std::cout <<
"Grids are not comparable elementwise apply" << std::endl;
1875template <
typename RETT,
typename RETDER,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB,
typename _BinaryOperation,
typename _BinaryPredicate>
1877EWiseApply (
const SpParMat<IU,NU1,UDERA> &
A,
const SpParMat<IU,NU2,UDERB> &
B,
_BinaryOperation __binary_op,
_BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1&
ANullVal,
const NU2&
BNullVal,
const bool allowIntersect =
true)
1890template <
typename IU,
typename NU1,
typename NU2>
1899 if(
V.glen !=
W.glen)
1901 std::cerr <<
"Vector dimensions don't match for EWiseMult\n";
1910 #if defined(_OPENMP) && defined(CBLAS_EXPERIMENTAL)
1916 #pragma omp parallel for
1923 if(
W.arr[
V.ind[i]] ==
zero)
1925 tlinds[t].push_back(
V.ind[i]);
1926 tlnums[t].push_back(
V.num[i]);
1936 #pragma omp parallel for
1945 if(
W.arr[
V.ind[i]] ==
zero)
1957 if(
W.arr[
V.ind[i]] !=
zero)
1960 Product.num.push_back(
V.num[i] *
W.arr[
V.ind[i]]);
1969 std::cout <<
"Grids are not comparable elementwise multiplication" << std::endl;
1979template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
1983 typedef RET T_promote;
1987 if(
V.TotalLength() !=
W.TotalLength())
1989 std::ostringstream
outs;
1990 outs <<
"Vector dimensions don't match (" <<
V.TotalLength() <<
" vs " <<
W.TotalLength() <<
") for EWiseApply (short version)\n";
2033 auto it = std::lower_bound (
V.ind.begin(),
V.ind.end(),
tStartIdx);
2100 std::cout <<
"Grids are not comparable for EWiseApply" << std::endl;
2125template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2134 typedef RET T_promote;
2139 if(
V.TotalLength() !=
W.TotalLength())
2141 std::ostringstream
outs;
2142 outs <<
"Vector dimensions don't match (" <<
V.TotalLength() <<
" vs " <<
W.TotalLength() <<
") for EWiseApply (short version)\n";
2194 std::cout <<
"Grids are not comparable for EWiseApply" << std::endl;
2226template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2228 (
const FullyDistSpVec<IU,NU1> &
V,
const FullyDistSpVec<IU,NU2> &
W ,
_BinaryOperation _binary_op,
_BinaryPredicate _doOp,
bool allowVNulls,
bool allowWNulls,
NU1 Vzero,
NU2 Wzero,
const bool allowIntersect,
const bool useExtendedBinOp)
2231 typedef RET T_promote;
2235 if(
V.glen !=
W.glen)
2237 std::ostringstream
outs;
2238 outs <<
"Vector dimensions don't match (" <<
V.glen <<
" vs " <<
W.glen <<
") for EWiseApply (full version)\n";
2245 typename std::vector< IU >::const_iterator
indV =
V.ind.begin();
2246 typename std::vector< NU1 >::const_iterator
numV =
V.num.begin();
2247 typename std::vector< IU >::const_iterator
indW =
W.ind.begin();
2248 typename std::vector< NU2 >::const_iterator
numW =
W.num.begin();
2250 while (
indV <
V.ind.end() &&
indW <
W.ind.end())
2317 std::cout <<
"Grids are not comparable for EWiseApply" << std::endl;
2324template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2338template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2340 (
const FullyDistSpVec<IU,NU1> &
V,
const FullyDistSpVec<IU,NU2> &
W ,
_BinaryOperation _binary_op,
_BinaryPredicate _doOp,
bool allowVNulls,
bool allowWNulls,
NU1 Vzero,
NU2 Wzero,
const bool allowIntersect =
true)
double cblas_transvectime
double cblas_localspmvtime
double cblas_allgathertime
double cblas_alltoalltime
double cblas_mergeconttime
double mcl_localspgemmtime
double mcl_multiwaymergetime
double mcl_prunecolumntime
std::shared_ptr< CommGrid > commGrid
static void deallocate2D(T **array, I m)
static const T * p2a(const std::vector< T > &v)
static void GetSetSizes(const SpMat< IT, NT, DER > &Matrix, IT **&sizes, MPI_Comm &comm1d)
static void BCastMatrix(MPI_Comm &comm1d, SpMat< IT, NT, DER > &Matrix, const std::vector< IT > &essentials, int root)
static void Print(const std::string &s)
SpParMat< IU, NUO, UDERO > MemEfficientSpGEMM(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, int phases, NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct, int kselectVersion, int64_t perProcessMemory)
void MCLPruneRecoverySelect(SpParMat< IT, NT, DER > &A, NT hardThreshold, IT selectNum, IT recoverNum, NT recoverPct, int kselectVersion)
int64_t EstPerProcessNnzSUMMA(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B)
void MergeContributions(FullyDistSpVec< IU, VT > &y, int *&recvcnt, int *&rdispls, int32_t *&recvindbuf, VT *&recvnumbuf, int rowneighs)
FullyDistSpVec< IU, RET > EWiseApply_threaded(const FullyDistSpVec< IU, NU1 > &V, const FullyDistVec< IU, NU2 > &W, _BinaryOperation _binary_op, _BinaryPredicate _doOp, bool allowVNulls, NU1 Vzero, const bool useExtendedBinOp)
void CheckSpMVCompliance(const MATRIX &A, const VECTOR &x)
FullyDistVec< IT, NT > Concatenate(std::vector< FullyDistVec< IT, NT > > &vecs)
SpParMat< IU, NUO, UDERO > Mult_AnXBn_Synch(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
void AllGatherVector(MPI_Comm &ColWorld, int trxlocnz, IU lenuntil, int32_t *&trxinds, NV *&trxnums, int32_t *&indacc, NV *&numacc, int &accnz, bool indexisvalue)
SpParMat< IU, NUO, UDERO > Mult_AnXBn_DoubleBuff(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
Dcsc< IU, N_promote > EWiseApply(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, _BinaryOperation __binary_op, bool notB, const NU2 &defaultBVal)
void TransposeVector(MPI_Comm &World, const FullyDistSpVec< IU, NV > &x, int32_t &trxlocnz, IU &lenuntil, int32_t *&trxinds, NV *&trxnums, bool indexisvalue)
void LocalSpMV(const SpParMat< IT, bool, UDER > &A, int rowneighs, OptBuf< int32_t, VT > &optbuf, int32_t *&indacc, VT *&numacc, int *sendcnt, int accnz)
IT * estimateNNZ(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B)
void MergeContributions_threaded(int *&listSizes, std::vector< int32_t * > &indsvec, std::vector< OVT * > &numsvec, std::vector< IU > &mergedind, std::vector< OVT > &mergednum, IU maxindex)
bool CheckSpGEMMCompliance(const MATRIXA &A, const MATRIXB &B)
Dcsc< IU, typename promote_trait< NU1, NU2 >::T_promote > EWiseMult(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, bool exclude)
shared_ptr< CommGrid > ProductGrid(CommGrid *gridA, CommGrid *gridB, int &innerdim, int &Aoffset, int &Boffset)