30#ifndef _PAR_FRIENDS_H_
31#define _PAR_FRIENDS_H_
49template <
class IT,
class NT,
class DER>
60template <
typename IT,
typename NT>
68 else if (
vecs.size() < 2)
75 typename std::vector< FullyDistVec<IT,NT> >::iterator
it =
vecs.begin();
95 std::vector< std::vector< NT > > data(
nprocs);
96 std::vector< std::vector< IT > > inds(
nprocs);
106 data[
owner].push_back(
it->arr[i]);
113 int * sdispls =
new int[
nprocs];
114 for(
int i=0; i<
nprocs; ++i)
115 sendcnt[i] = (
int) data[i].size();
117 int * rdispls =
new int[
nprocs];
122 for(
int i=0; i<
nprocs-1; ++i)
124 sdispls[i+1] = sdispls[i] +
sendcnt[i];
125 rdispls[i+1] = rdispls[i] +
recvcnt[i];
129 for(
int i=0; i<
nprocs; ++i)
131 std::copy(data[i].begin(), data[i].end(),
senddatabuf+sdispls[i]);
132 std::vector<NT>().swap(data[i]);
139 for(
int i=0; i<
nprocs; ++i)
141 std::copy(inds[i].begin(), inds[i].end(),
sendindsbuf+sdispls[i]);
142 std::vector<IT>().swap(inds[i]);
148 for(
int i=0; i<
nprocs; ++i)
150 for(
int j = rdispls[i];
j < rdispls[i] +
recvcnt[i]; ++
j)
160template <
typename MATRIXA,
typename MATRIXB>
163 if(
A.getncol() !=
B.getnrow())
165 std::ostringstream
outs;
166 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
167 outs <<
A.getncol() <<
" != " <<
B.getnrow() << std::endl;
172 if((
void*) &
A == (
void*) &
B)
174 std::ostringstream
outs;
175 outs <<
"Can not multiply, inputs alias (make a temporary copy of one of them first)"<< std::endl;
185template <
typename IT,
typename NT,
typename DER>
240 std::ostringstream
outs;
241 outs <<
"Number of columns needing recovery: " <<
nrecover << std::endl;
253 true,
static_cast<NT>(-1));
275 std::ostringstream
outs;
276 outs <<
"Number of columns needing selection: " <<
nselect << std::endl;
324 std::ostringstream
outs1;
325 outs1 <<
"Number of columns needing recovery after selection: " <<
nselect << std::endl;
338 A.PruneColumn(
pruneCols, std::less<NT>(),
true);
355template <
typename SR,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
365 IU C_n =
B.spSeq->getncol();
381 int Bself = (
B.commGrid)->GetRankInProcCol();
383 for(
int i = 0; i <
stages; ++i)
392 ess.resize(UDERA::esscount);
393 for(
int j=0;
j< UDERA::esscount; ++
j)
408 ess.resize(UDERB::esscount);
409 for(
int j=0; j< UDERB::esscount; ++j)
411 ess[j] = BRecvSizes[j][i];
417 local_flops += EstimateLocalFLOP<SR>
423 if(clearA &&
A.spSeq != NULL) {
427 if(clearB &&
B.spSeq != NULL) {
439 MPI_Allreduce(&local_flops, &global_flops, 1, MPI_LONG_LONG_INT, MPI_SUM,
A.getcommgrid()->GetWorld());
449template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
453 typedef typename UDERA::LocalIT
LIA;
454 typedef typename UDERB::LocalIT
LIB;
455 typedef typename UDERO::LocalIT
LIC;
459 if(
A.getncol() !=
B.getnrow())
461 std::ostringstream
outs;
462 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
463 outs <<
A.getncol() <<
" != " <<
B.getnrow() << std::endl;
470 SpParHelper::Print(
"MemEfficientSpGEMM: The value of phases is too small or large. Resetting to 1.\n");
524 std::cout <<
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n Warning: input and output memory requirement is greater than per-process avaiable memory. Keeping phase to the value supplied at the command line. The program may go out of memory and crash! \n !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl;
526#ifdef SHOW_MEMORY_USAGE
529 std::cout <<
"phases: " << phases <<
": per process memory: " <<
perProcessMemory <<
" GB asquareMem: " <<
asquareMem/1000000000.00 <<
" GB" <<
" inputMem: " <<
inputMem/1000000000.00 <<
" GB" <<
" outputMem: " <<
outputMem/1000000000.00 <<
" GB" <<
" kselectmem: " <<
kselectmem/1000000000.00 <<
" GB" << std::endl;
531 std::cout <<
"phases: " << phases <<
": per process memory: " <<
perProcessMemory <<
" GB asquareMem: " <<
asquareMem/1000000.00 <<
" MB" <<
" inputMem: " <<
inputMem/1000000.00 <<
" MB" <<
" outputMem: " <<
outputMem/1000000.00 <<
" MB" <<
" kselectmem: " <<
kselectmem/1000000.00 <<
" MB" << std::endl;
559 static_assert(std::is_same<LIA, LIB>::value,
"local index types for both input matrices should be the same");
560 static_assert(std::is_same<LIA, LIC>::value,
"local index types for input and output matrices should be the same");
572 int Bself = (
B.commGrid)->GetRankInProcCol();
574 for(
int p = 0; p< phases; ++p)
577 std::vector< SpTuples<LIC,NUO> *>
tomerge;
578 for(
int i = 0; i <
stages; ++i)
580 std::vector<LIA>
ess;
584 ess.resize(UDERA::esscount);
585 for(
int j=0;
j< UDERA::esscount; ++
j)
605 ess.resize(UDERB::esscount);
606 for(
int j=0;
j< UDERB::esscount; ++
j)
643#ifdef SHOW_MEMORY_USAGE
645 for(
size_t i = 0; i <
tomerge.size(); ++i)
655 std::cout << p+1 <<
". unmerged: " <<
summa_memory/1000000000.00 <<
"GB " ;
657 std::cout << p+1 <<
". unmerged: " <<
summa_memory/1000000.00 <<
" MB " ;
671#ifdef SHOW_MEMORY_USAGE
682 std::cout <<
" merged: " <<
merge_memory/1000000000.00 <<
"GB " ;
684 std::cout <<
" merged: " <<
merge_memory/1000000.00 <<
" MB " ;
700#ifdef SHOW_MEMORY_USAGE
713 std::cout <<
"Prune: " <<
prune_memory/1000000000.00 <<
"GB " << std::endl ;
715 std::cout <<
"Prune: " <<
prune_memory/1000000.00 <<
" MB " << std::endl ;
732template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
738 typedef typename UDERA::LocalIT
LIA;
739 typedef typename UDERB::LocalIT
LIB;
740 typedef typename UDERO::LocalIT
LIC;
798template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
807 typedef typename UDERA::LocalIT
LIA;
808 typedef typename UDERB::LocalIT
LIB;
809 typedef typename UDERO::LocalIT
LIC;
811 static_assert(std::is_same<LIA, LIB>::value,
"local index types for both input matrices should be the same");
812 static_assert(std::is_same<LIA, LIC>::value,
"local index types for input and output matrices should be the same");
824 const_cast< UDERB*
>(
B.spSeq)->Transpose();
840 std::vector< SpTuples<LIC,NUO> *>
tomerge;
843 int Bself = (
B.commGrid)->GetRankInProcCol();
845 for(
int i = 0; i <
stages; ++i)
847 std::vector<LIA>
ess;
854 ess.resize(UDERA::esscount);
855 for(
int j=0;
j< UDERA::esscount; ++
j)
869 ess.resize(UDERB::esscount);
870 for(
int j=0;
j< UDERB::esscount; ++
j)
909 for(
int i = 0; i <
stages; ++i)
911 std::vector<LIA>
ess;
918 ess.resize(UDERA::esscount);
919 for(
int j=0;
j< UDERA::esscount; ++
j)
935 ess.resize(UDERB::esscount);
936 for(
int j=0;
j< UDERB::esscount; ++
j)
992 const_cast< UDERB*
>(
B.spSeq)->Transpose();
1004template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
1017 IU C_m =
A.spSeq->getnrow();
1018 IU C_n =
B.spSeq->getncol();
1031 std::vector< SpTuples<IU,NUO> *>
tomerge;
1034 int Bself = (
B.commGrid)->GetRankInProcCol();
1036 for(
int i = 0; i <
stages; ++i)
1038 std::vector<IU>
ess;
1045 ess.resize(UDERA::esscount);
1046 for(
int j=0;
j< UDERA::esscount; ++
j)
1061 ess.resize(UDERB::esscount);
1062 for(
int j=0;
j< UDERB::esscount; ++
j)
1078#ifdef COMBBLAS_DEBUG
1079 std::ostringstream
outs;
1080 outs << i <<
"th SUMMA iteration"<< std::endl;
1110template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
1122 IU C_m =
A.spSeq->getnrow();
1123 IU C_n =
B.spSeq->getncol();
1143 for(
int i = 0; i <
stages; i++){
1151 int Bself = (
B.commGrid)->GetRankInProcCol();
1153 std::vector< SpTuples<IU,NUO> *>
tomerge;
1155 for(
int i = 0; i <
stages; ++i){
1156 std::vector<IU>
ess;
1159 ess.resize(UDERA::esscount);
1169 ess.resize(UDERB::esscount);
1191 #ifdef COMBBLAS_DEBUG
1192 std::ostringstream
outs;
1193 outs << i <<
"th SUMMA iteration"<< std::endl;
1242template <
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
1245 typedef typename UDERA::LocalIT
LIA;
1246 typedef typename UDERB::LocalIT
LIB;
1247 static_assert(std::is_same<LIA, LIB>::value,
"local index types for both input matrices should be the same");
1253 if(
A.getncol() !=
B.getnrow())
1255 std::ostringstream
outs;
1256 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
1257 outs <<
A.getncol() <<
" != " <<
B.getnrow() << std::endl;
1278 int Bself = (
B.commGrid)->GetRankInProcCol();
1281 for(
int i = 0; i <
stages; ++i)
1283 std::vector<LIA>
ess;
1290 ess.resize(UDERA::esscount);
1291 for(
int j=0;
j< UDERA::esscount; ++
j)
1307 ess.resize(UDERB::esscount);
1308 for(
int j=0;
j< UDERB::esscount; ++
j)
1350template <
typename MATRIX,
typename VECTOR>
1353 if(
A.getncol() !=
x.TotalLength())
1355 std::ostringstream
outs;
1356 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
1357 outs <<
A.getncol() <<
" != " <<
x.TotalLength() << std::endl;
1361 if(! ( *(
A.getcommgrid()) == *(
x.getcommgrid())) )
1363 std::cout <<
"Grids are not comparable for SpMV" << std::endl;
1369template <
typename SR,
typename IU,
typename NUM,
typename UDER>
1370FullyDistSpVec<IU,typename promote_trait<NUM,IU>::T_promote>
SpMV
1373template <
typename SR,
typename IU,
typename NUM,
typename UDER>
1387template<
typename IU,
typename NV>
1394 int diagneigh =
x.
commGrid->GetComplementRank();
1397 MPI_Sendrecv(&
roffst, 1,
MPIType<int32_t>(), diagneigh,
TROST, &
roffset, 1,
MPIType<int32_t>(), diagneigh,
TROST,
World, &
status);
1398 MPI_Sendrecv(&
xlocnz, 1,
MPIType<int32_t>(), diagneigh,
TRNNZ, &
trxlocnz, 1,
MPIType<int32_t>(), diagneigh,
TRNNZ,
World, &
status);
1399 MPI_Sendrecv(&
luntil, 1,
MPIType<IU>(), diagneigh,
TRLUT, &
lenuntil, 1,
MPIType<IU>(), diagneigh,
TRLUT,
World, &
status);
1406#pragma omp parallel for
1408 for(
int i=0; i<
xlocnz; ++i)
1410 MPI_Sendrecv(
temp_xind,
xlocnz,
MPIType<int32_t>(), diagneigh,
TRI,
trxinds,
trxlocnz,
MPIType<int32_t>(), diagneigh,
TRI,
World, &
status);
1415 MPI_Sendrecv(
const_cast<NV*
>(
SpHelper::p2a(
x.num)),
xlocnz,
MPIType<NV>(), diagneigh,
TRX,
trxnums,
trxlocnz,
MPIType<NV>(), diagneigh,
TRX,
World, &
status);
1429template<
typename IU,
typename NV>
1433 int colneighs, colrank;
1436 int *
colnz =
new int[colneighs];
1439 int *
dpls =
new int[colneighs]();
1463 for(
int i=0; i<
accnz; ++i)
1488template<
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1494 if(
A.spSeq->getnsplit() > 0)
1507 if(
A.spSeq->getnsplit() > 0)
1514 sendcnt[i] = sdispls[i+1] - sdispls[i];
1520 std::vector< int32_t >
indy;
1521 std::vector< OVT >
numy;
1556template <
typename SR,
typename IU,
typename OVT>
1569 for(
int i=0; i<
veclen; i++)
1578 int32_t inf = std::numeric_limits<int32_t>::min();
1579 int32_t sup = std::numeric_limits<int32_t>::max();
1582 for(
int i=0; i<
nlists; ++i)
1628template <
typename SR,
typename IU,
typename OVT>
1643#pragma omp parallel for
1645 for(
int i=0; i<
veclen; i++)
1663 for(
int k=0; k<
nlists; k++)
1667#pragma omp parallel for
1681#pragma omp parallel for schedule(dynamic)
1709#pragma omp parallel for schedule(dynamic)
1724template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1730 y.glen =
A.getnrow();
1775 LocalSpMV<SR>(
A,
rowneighs,
optbuf,
indacc,
numacc,
sendindbuf,
sendnumbuf, sdispls,
sendcnt,
accnz,
indexisvalue,
SPA);
1792#pragma omp parallel for
1794 for(
int i=0; i<
sendcnt[0]; i++)
1803#pragma omp parallel for
1805 for(
int i=0; i<
sendcnt[0]; i++)
1823 rdispls[i+1] = rdispls[i] +
recvcnt[i];
1855 std::vector<IU>().swap(
y.ind);
1856 std::vector<OVT>().swap(
y.num);
1862#pragma omp parallel for
1884template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1891template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1899template <
typename SR,
typename IVT,
typename OVT,
typename IU,
typename NUM,
typename UDER>
1911template <
typename SR,
typename IU,
typename NUM,
typename UDER>
1924template <
typename SR,
typename IU,
typename NUM,
typename NUV,
typename UDER>
1938 int diagneigh =
x.
commGrid->GetComplementRank();
1940 MPI_Sendrecv(&
xsize, 1,
MPI_INT, diagneigh,
TRX, &
trxsize, 1,
MPI_INT, diagneigh,
TRX,
World, &
status);
1943 MPI_Sendrecv(
const_cast<NUV*
>(
SpHelper::p2a(
x.arr)),
xsize,
MPIType<NUV>(), diagneigh,
TRX,
trxnums,
trxsize,
MPIType<NUV>(), diagneigh,
TRX,
World, &
status);
1945 int colneighs, colrank;
1948 int *
colsize =
new int[colneighs];
1951 int *
dpls =
new int[colneighs]();
1960 T_promote
id = SR::id();
2004template <
typename SR,
typename IU,
typename NUM,
typename NUV,
typename UDER>
2020 int diagneigh =
x.
commGrid->GetComplementRank();
2022 MPI_Sendrecv(&
xlocnz, 1,
MPI_INT, diagneigh,
TRX, &
trxlocnz, 1,
MPI_INT, diagneigh,
TRX,
World, &
status);
2023 MPI_Sendrecv(&
roffst, 1,
MPI_INT, diagneigh,
TROST, &
offset, 1,
MPI_INT, diagneigh,
TROST,
World, &
status);
2027 MPI_Sendrecv(
const_cast<IU*
>(
SpHelper::p2a(
x.ind)),
xlocnz,
MPIType<IU>(), diagneigh,
TRX,
trxinds,
trxlocnz,
MPIType<IU>(), diagneigh,
TRX,
World, &
status);
2028 MPI_Sendrecv(
const_cast<NUV*
>(
SpHelper::p2a(
x.num)),
xlocnz,
MPIType<NUV>(), diagneigh,
TRX,
trxnums,
trxlocnz,
MPIType<NUV>(), diagneigh,
TRX,
World, &
status);
2031 int colneighs, colrank;
2034 int *
colnz =
new int[colneighs];
2037 int *
dpls =
new int[colneighs]();
2050 std::vector< int32_t >
indy;
2051 std::vector< T_promote >
numy;
2069 typename std::vector<int32_t>::size_type
outnz =
indy.size();
2070 for(
typename std::vector<IU>::size_type i=0; i<
outnz; ++i)
2093 sdispls[i+1] = sdispls[i] +
sendcnt[i];
2094 rdispls[i+1] = rdispls[i] +
recvcnt[i];
2103 std::vector<IU>().swap(
sendind[i]);
2108 std::vector<T_promote>().swap(
sendnum[i]);
2119 bool * isthere =
new bool[
ysize];
2121 std::fill_n(isthere,
ysize,
false);
2141 for(
int i=0; i<
nnzy; ++i)
2156template <
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
2166 std::cout <<
"Grids are not comparable for set difference" << std::endl;
2173template <
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
2187 std::cout <<
"Grids are not comparable elementwise multiplication" << std::endl;
2193template <
typename RETT,
typename RETDER,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB,
typename _BinaryOperation>
2204 std::cout <<
"Grids are not comparable elementwise apply" << std::endl;
2210template <
typename RETT,
typename RETDER,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB,
typename _BinaryOperation,
typename _BinaryPredicate>
2212 (
const SpParMat<IU,NU1,UDERA> &
A,
const SpParMat<IU,NU2,UDERB> &
B,
_BinaryOperation __binary_op,
_BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1&
ANullVal,
const NU2&
BNullVal,
const bool allowIntersect,
const bool useExtendedBinOp)
2221 std::cout <<
"Grids are not comparable elementwise apply" << std::endl;
2228template <
typename RETT,
typename RETDER,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB,
typename _BinaryOperation,
typename _BinaryPredicate>
2230EWiseApply (
const SpParMat<IU,NU1,UDERA> &
A,
const SpParMat<IU,NU2,UDERB> &
B,
_BinaryOperation __binary_op,
_BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1&
ANullVal,
const NU2&
BNullVal,
const bool allowIntersect =
true)
2243template <
typename IU,
typename NU1,
typename NU2>
2252 if(
V.glen !=
W.glen)
2254 std::cerr <<
"Vector dimensions don't match for EWiseMult\n";
2263 #if defined(_OPENMP) && defined(CBLAS_EXPERIMENTAL)
2269 #pragma omp parallel for
2276 if(
W.arr[
V.ind[i]] ==
zero)
2278 tlinds[t].push_back(
V.ind[i]);
2279 tlnums[t].push_back(
V.num[i]);
2289 #pragma omp parallel for
2298 if(
W.arr[
V.ind[i]] ==
zero)
2310 if(
W.arr[
V.ind[i]] !=
zero)
2313 Product.num.push_back(
V.num[i] *
W.arr[
V.ind[i]]);
2322 std::cout <<
"Grids are not comparable elementwise multiplication" << std::endl;
2332template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2336 typedef RET T_promote;
2340 if(
V.TotalLength() !=
W.TotalLength())
2342 std::ostringstream
outs;
2343 outs <<
"Vector dimensions don't match (" <<
V.TotalLength() <<
" vs " <<
W.TotalLength() <<
") for EWiseApply (short version)\n";
2386 auto it = std::lower_bound (
V.ind.begin(),
V.ind.end(),
tStartIdx);
2453 std::cout <<
"Grids are not comparable for EWiseApply" << std::endl;
2478template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2487 typedef RET T_promote;
2492 if(
V.TotalLength() !=
W.TotalLength())
2494 std::ostringstream
outs;
2495 outs <<
"Vector dimensions don't match (" <<
V.TotalLength() <<
" vs " <<
W.TotalLength() <<
") for EWiseApply (short version)\n";
2547 std::cout <<
"Grids are not comparable for EWiseApply" << std::endl;
2579template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2581 (
const FullyDistSpVec<IU,NU1> &
V,
const FullyDistSpVec<IU,NU2> &
W ,
_BinaryOperation _binary_op,
_BinaryPredicate _doOp,
bool allowVNulls,
bool allowWNulls,
NU1 Vzero,
NU2 Wzero,
const bool allowIntersect,
const bool useExtendedBinOp)
2584 typedef RET T_promote;
2588 if(
V.glen !=
W.glen)
2590 std::ostringstream
outs;
2591 outs <<
"Vector dimensions don't match (" <<
V.glen <<
" vs " <<
W.glen <<
") for EWiseApply (full version)\n";
2598 typename std::vector< IU >::const_iterator
indV =
V.ind.begin();
2599 typename std::vector< NU1 >::const_iterator
numV =
V.num.begin();
2600 typename std::vector< IU >::const_iterator
indW =
W.ind.begin();
2601 typename std::vector< NU2 >::const_iterator
numW =
W.num.begin();
2603 while (
indV <
V.ind.end() &&
indW <
W.ind.end())
2670 std::cout <<
"Grids are not comparable for EWiseApply" << std::endl;
2677template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2691template <
typename RET,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
2693 (
const FullyDistSpVec<IU,NU1> &
V,
const FullyDistSpVec<IU,NU2> &
W ,
_BinaryOperation _binary_op,
_BinaryPredicate _doOp,
bool allowVNulls,
bool allowWNulls,
NU1 Vzero,
NU2 Wzero,
const bool allowIntersect =
true)
2716template <
typename NZT>
2727 template<
typename c,
typename t,
typename V>
2739template<
typename NZT>
2745 for (
auto it = arr.begin();
it != arr.end(); ++
it)
2746 *
it = std::numeric_limits<float>::max();
2761 for (
int i = 0; i <
NROUNDS; ++i)
2783 static bool exists =
false;
2800 for (
int i = 0; i < *
len; ++i)
2807template <
typename IU,
typename NU1,
typename NU2,
2808 typename UDERA,
typename UDERB>
2820 #pragma omp parallel
2823 nthds = omp_get_num_threads();
2827 std::cout <<
"taking transposes." << std::endl;
2833 std::cout <<
"setting initial samples." << std::endl;
2839 #pragma omp parallel
2842 std::default_random_engine
gen;
2846 #pragma omp parallel for
2861 std::cout <<
"computing mid samples." << std::endl;
2870 std::cout <<
"computing final samples." << std::endl;
2879 std::cout <<
"computing nnz estimation." << std::endl;
2883 std::cout << myrank <<
"samples_final loc size: "
2889 #pragma omp parallel for reduction (+:nnzest)
2900 std::cout <<
"taking transposes again." << std::endl;
2905 (
B.commGrid)->GetWorld());
2908 std::cout <<
"sampling-based spmv est tot: " <<
nnzC_tot << std::endl;
2918template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDER1,
typename UDER2>
2922 typedef typename UDERO::LocalIT
LIC;
2923 typedef typename UDER1::LocalIT
LIA;
2924 typedef typename UDER2::LocalIT
LIB;
2933 if(
A.getncol() !=
B.getnrow()){
2934 std::ostringstream
outs;
2935 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
2936 outs <<
A.getncol() <<
" != " <<
B.getnrow() << std::endl;
2957 std::shared_ptr<CommGrid>
GridC =
ProductGrid((
A.GetLayerMat()->getcommgrid()).get(),
2958 (
B.GetLayerMat()->getcommgrid()).get(),
2960 IU C_m =
A.GetLayerMat()->seqptr()->getnrow();
2961 IU C_n =
B.GetLayerMat()->seqptr()->getncol();
2972 std::vector< SpTuples<IU,NUO> *>
tomerge;
2974 int Aself = (
A.GetLayerMat()->getcommgrid())->GetRankInProcRow();
2975 int Bself = (
B.GetLayerMat()->getcommgrid())->GetRankInProcCol();
2981 for(
int i = 0; i <
stages; ++i) {
2982 std::vector<IU>
ess;
2985 ARecv =
A.GetLayerMat()->seqptr();
2988 ess.resize(UDER1::esscount);
2989 for(
int j=0;
j<UDER1::esscount; ++
j) {
3016 BRecv =
B.GetLayerMat()->seqptr();
3019 ess.resize(UDER2::esscount);
3020 for(
int j=0;
j<UDER2::esscount; ++
j) {
3026 MPI_Barrier(
A.GetLayerMat()->getcommgrid()->GetWorld());
3106 int *
sendcnt =
new int[
A.getcommgrid3D()->GetGridLayers()];
3107 int *
sendprfl =
new int[
A.getcommgrid3D()->GetGridLayers()*3];
3108 int * sdispls =
new int[
A.getcommgrid3D()->GetGridLayers()]();
3109 int *
recvcnt =
new int[
A.getcommgrid3D()->GetGridLayers()];
3110 int *
recvprfl =
new int[
A.getcommgrid3D()->GetGridLayers()*3];
3111 int * rdispls =
new int[
A.getcommgrid3D()->GetGridLayers()]();
3119#pragma omp parallel for
3120 for(
int i=0; i <
A.getcommgrid3D()->GetGridLayers(); ++i){
3130 sendprfl[i*3+1] = (
int)(
A.GetLayerMat()->seqptr()->getnrow());
3133 std::partial_sum(
sendcnt,
sendcnt+
A.getcommgrid3D()->GetGridLayers()-1, sdispls+1);
3136 for(
int i=0; i <
A.getcommgrid3D()->GetGridLayers(); ++i){
3137#pragma omp parallel for schedule(static)
3145 for(
int i = 0; i <
A.getcommgrid3D()->GetGridLayers(); i++)
recvcnt[i] =
recvprfl[i*3];
3146 std::partial_sum(
recvcnt,
recvcnt+
A.getcommgrid3D()->GetGridLayers()-1, rdispls+1);
3148 std::tuple<LIC,LIC,NUO>*
recvTuples =
static_cast<std::tuple<LIC,LIC,NUO>*
> (::operator
new (
sizeof(std::tuple<LIC,LIC,NUO>[
totrecv])));
3160#pragma omp parallel for
3161 for (
int i = 0; i <
A.getcommgrid3D()->GetGridLayers(); i++){
3204 std::shared_ptr<CommGrid3D>
grid3d;
3205 grid3d.reset(
new CommGrid3D(
A.getcommgrid3D()->GetWorld(),
A.getcommgrid3D()->GetGridLayers(),
A.getcommgrid3D()->GetGridRows(),
A.getcommgrid3D()->GetGridCols(),
A.isSpecial()));
3214template <
typename SR,
typename NUO,
typename UDERO,
typename IU,
typename NU1,
typename NU2,
typename UDERA,
typename UDERB>
3219 typedef typename UDERA::LocalIT
LIA;
3220 typedef typename UDERB::LocalIT
LIB;
3221 typedef typename UDERO::LocalIT
LIC;
3226 if(
A.getncol() !=
B.getnrow()){
3227 std::ostringstream
outs;
3228 outs <<
"Can not multiply, dimensions does not match"<< std::endl;
3229 outs <<
A.getncol() <<
" != " <<
B.getnrow() << std::endl;
3238 SpParHelper::Print(
"[MemEfficientSpGEMM3D]\tThe value of phases is too small or large. Resetting to 1.\n");
3322 for(
int j = 0;
j <
temp.size();
j++){
3332 for(
int p = 0; p < phases; p++){
3340 for(
int i = 0; i <
PiecesOfB.size(); i++){
3341 if(i % phases == p){
3368 std::shared_ptr<CommGrid>
GridC =
ProductGrid((
A.GetLayerMat()->getcommgrid()).get(),
3371 LIA C_m =
A.GetLayerMat()->seqptr()->getnrow();
3383 std::vector< SpTuples<LIC,NUO> *>
tomerge;
3385 int Aself = (
A.GetLayerMat()->getcommgrid())->GetRankInProcRow();
3392 for(
int i = 0; i <
stages; ++i) {
3393 std::vector<LIA>
ess;
3396 ARecv =
A.GetLayerMat()->seqptr();
3399 ess.resize(UDERA::esscount);
3400 for(
int j=0;
j<UDERA::esscount; ++
j) {
3431 ess.resize(UDERB::esscount);
3432 for(
int j=0;
j<UDERB::esscount; ++
j) {
3438 MPI_Barrier(
A.GetLayerMat()->getcommgrid()->GetWorld());
3507 fprintf(
stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tSUMMA Merge time: %lf\n", p, (
t3-
t2));
3516 if(myrank == 0)
fprintf(
stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tSUMMA time: %lf\n", p, (
t1-
t0));
3535 int *
sendcnt =
new int[
A.getcommgrid3D()->GetGridLayers()];
3536 int *
sendprfl =
new int[
A.getcommgrid3D()->GetGridLayers()*3];
3537 int * sdispls =
new int[
A.getcommgrid3D()->GetGridLayers()]();
3538 int *
recvcnt =
new int[
A.getcommgrid3D()->GetGridLayers()];
3539 int *
recvprfl =
new int[
A.getcommgrid3D()->GetGridLayers()*3];
3540 int * rdispls =
new int[
A.getcommgrid3D()->GetGridLayers()]();
3549 if(myrank == 0)
fprintf(
stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tAllocation of alltoall information: %lf\n", p, (
t3-
t2));
3555#pragma omp parallel for
3556 for(
int i=0; i <
A.getcommgrid3D()->GetGridLayers(); ++i){
3566 sendprfl[i*3+1] = (
int)(
A.GetLayerMat()->seqptr()->getnrow());
3569 std::partial_sum(
sendcnt,
sendcnt+
A.getcommgrid3D()->GetGridLayers()-1, sdispls+1);
3572 if(myrank == 0)
fprintf(
stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tGetting Alltoall data ready: %lf\n", p, (
t3-
t2));
3579 for(
int i=0; i <
A.getcommgrid3D()->GetGridLayers(); ++i){
3580#pragma omp parallel for schedule(static)
3587 if(myrank == 0)
fprintf(
stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tGetting Alltoallv data ready: %lf\n", p, (
t3-
t2));
3596 if(myrank == 0)
fprintf(
stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tAlltoall: %lf\n", p, (
t3-
t2));
3601 for(
int i = 0; i <
A.getcommgrid3D()->GetGridLayers(); i++)
recvcnt[i] =
recvprfl[i*3];
3602 std::partial_sum(
recvcnt,
recvcnt+
A.getcommgrid3D()->GetGridLayers()-1, rdispls+1);
3604 std::tuple<LIC,LIC,NUO>*
recvTuples =
static_cast<std::tuple<LIC,LIC,NUO>*
> (::operator
new (
sizeof(std::tuple<LIC,LIC,NUO>[
totrecv])));
3607 if(myrank == 0)
fprintf(
stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tAllocation of receive data: %lf\n", p, (
t3-
t2));
3617 if(myrank == 0)
fprintf(
stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tAlltoallv: %lf\n", p, (
t3-
t2));
3623#pragma omp parallel for
3624 for (
int i = 0; i <
A.getcommgrid3D()->GetGridLayers(); i++){
3629 if(myrank == 0)
fprintf(
stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\trecvChunks creation: %lf\n", p, (
t3-
t2));
3641 if(myrank == 0)
fprintf(
stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tMemory freeing: %lf\n", p, (
t3-
t2));
3650 if(myrank == 0)
fprintf(
stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tReduction time: %lf\n", p, (
t1-
t0));
3665 if(myrank == 0)
fprintf(
stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\t3D Merge time: %lf\n", p, (
t1-
t0));
3690 if(myrank == 0)
fprintf(
stderr,
"[MemEfficientSpGEMM3D]\tPhase: %d\tMCLPruneRecoverySelect time: %lf\n",p, (
t1-
t0));
3699 std::shared_ptr<CommGrid3D>
grid3d;
3700 grid3d.reset(
new CommGrid3D(
A.getcommgrid3D()->GetWorld(),
A.getcommgrid3D()->GetGridLayers(),
A.getcommgrid3D()->GetGridRows(),
A.getcommgrid3D()->GetGridCols(),
A.isSpecial()));
double cblas_transvectime
double cblas_localspmvtime
double cblas_allgathertime
double cblas_alltoalltime
double cblas_mergeconttime
double mcl_localspgemmtime
double mcl_multiwaymergetime
double mcl_prunecolumntime
double mcl3d_symbolictime
double mcl3d_SUMMAmergetime
double mcl3d_reductiontime
double mcl3d_localspgemmtime
std::shared_ptr< CommGrid > commGrid
void save(std::basic_ostream< c, t > &os, std::array< V, NROUNDS > &sample_vec, int64_t index)
static void deallocate2D(T **array, I m)
static const T * p2a(const std::vector< T > &v)
static void GetSetSizes(const SpMat< IT, NT, DER > &Matrix, IT **&sizes, MPI_Comm &comm1d)
static void BCastMatrix(MPI_Comm &comm1d, SpMat< IT, NT, DER > &Matrix, const std::vector< IT > &essentials, int root)
static void Print(const std::string &s)
static void IBCastMatrix(MPI_Comm &comm1d, SpMat< IT, NT, DER > &Matrix, const std::vector< IT > &essentials, int root, std::vector< MPI_Request > &indarrayReq, std::vector< MPI_Request > &numarrayReq)
SpParMat3D< IU, NUO, UDERO > Mult_AnXBn_SUMMA3D(SpParMat3D< IU, NU1, UDER1 > &A, SpParMat3D< IU, NU2, UDER2 > &B)
IT * estimateFLOP(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, IT *aux=nullptr)
int64_t EstPerProcessNnzSUMMA(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool hashEstimate)
void MCLPruneRecoverySelect(SpParMat< IT, NT, DER > &A, NT hardThreshold, IT selectNum, IT recoverNum, NT recoverPct, int kselectVersion)
SpParMat3D< IU, NUO, UDERO > MemEfficientSpGEMM3D(SpParMat3D< IU, NU1, UDERA > &A, SpParMat3D< IU, NU2, UDERB > &B, int phases, NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct, int kselectVersion, int computationKernel, int64_t perProcessMemory)
void MergeContributions(FullyDistSpVec< IU, VT > &y, int *&recvcnt, int *&rdispls, int32_t *&recvindbuf, VT *&recvnumbuf, int rowneighs)
FullyDistSpVec< IU, RET > EWiseApply_threaded(const FullyDistSpVec< IU, NU1 > &V, const FullyDistVec< IU, NU2 > &W, _BinaryOperation _binary_op, _BinaryPredicate _doOp, bool allowVNulls, NU1 Vzero, const bool useExtendedBinOp)
void CheckSpMVCompliance(const MATRIX &A, const VECTOR &x)
Dcsc< IU, typename promote_trait< NU1, NU2 >::T_promote > SetDifference(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B)
int64_t EstPerProcessNnzSpMV(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B)
int CalculateNumberOfPhases(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct, int kselectVersion, int64_t perProcessMemory)
SpParMat< IU, NUO, UDERO > Mult_AnXBn_Overlap(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
SpParMat< IU, NUO, UDERO > MemEfficientSpGEMM(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, int phases, NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct, int kselectVersion, int computationKernel, int64_t perProcessMemory)
FullyDistVec< IT, NT > Concatenate(std::vector< FullyDistVec< IT, NT > > &vecs)
SpParMat< IU, NUO, UDERO > Mult_AnXBn_Synch(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
void AllGatherVector(MPI_Comm &ColWorld, int trxlocnz, IU lenuntil, int32_t *&trxinds, NV *&trxnums, int32_t *&indacc, NV *&numacc, int &accnz, bool indexisvalue)
IU EstimateFLOP(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
SpParMat< IU, NUO, UDERO > Mult_AnXBn_DoubleBuff(SpParMat< IU, NU1, UDERA > &A, SpParMat< IU, NU2, UDERB > &B, bool clearA=false, bool clearB=false)
Dcsc< IU, N_promote > EWiseApply(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, _BinaryOperation __binary_op, bool notB, const NU2 &defaultBVal)
void TransposeVector(MPI_Comm &World, const FullyDistSpVec< IU, NV > &x, int32_t &trxlocnz, IU &lenuntil, int32_t *&trxinds, NV *&trxnums, bool indexisvalue)
void LocalSpMV(const SpParMat< IT, bool, UDER > &A, int rowneighs, OptBuf< int32_t, VT > &optbuf, int32_t *&indacc, VT *&numacc, int *sendcnt, int accnz)
std::array< float, NROUNDS > samparr_t
void MergeContributions_threaded(int *&listSizes, std::vector< int32_t * > &indsvec, std::vector< OVT * > &numsvec, std::vector< IU > &mergedind, std::vector< OVT > &mergednum, IU maxindex)
bool CheckSpGEMMCompliance(const MATRIXA &A, const MATRIXB &B)
Dcsc< IU, typename promote_trait< NU1, NU2 >::T_promote > EWiseMult(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, bool exclude)
shared_ptr< CommGrid > ProductGrid(CommGrid *gridA, CommGrid *gridB, int &innerdim, int &Aoffset, int &Boffset)
IT * estimateNNZ_Hash(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, IT *flopC, IT *aux=nullptr)
static void MPI_func(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype)
static void axpy(const NZT a, const samparr_t &x, samparr_t &y)
static samparr_t multiply(const NZT arg1, const samparr_t &arg2)
static samparr_t add(const samparr_t &arg1, const samparr_t &arg2)
static bool returnedSAID()