42#pragma omp for schedule(static)
44 for (
int i=0; i<
size; i++)
55 for(
int i=0; i<(
ithread+1); i++)
61#pragma omp for schedule(static)
63 for (
int i=0; i<
size; i++)
74template <
typename SR,
typename NTO,
typename IT,
typename NT1,
typename NT2>
83 if(
A.isZero() ||
B.isZero())
92 float cf =
static_cast<float>(
nA+1) /
static_cast<float>(
Adcsc->nzc);
110 std::tuple<IT,IT,NTO> *
tuplesC =
static_cast<std::tuple<IT,IT,NTO> *
> (::operator
new (
sizeof(std::tuple<IT,IT,NTO>[
nnzc])));
124#pragma omp parallel for
126 for(
size_t i=0; i <
Bnzc; ++i)
164 if (!SR::returnedSAID())
206template <
typename IT,
typename NT>
209 return left.first <
right.first;
213template <
typename SR,
typename NTO,
typename IT,
typename NT1,
typename NT2>
224 if(
A.isZero() ||
B.isZero())
233 float cf =
static_cast<float>(
nA+1) /
static_cast<float>(
Adcsc->nzc);
272 std::tuple<IT,IT,NTO> *
tuplesC =
static_cast<std::tuple<IT,IT,NTO> *
> (::operator
new (
sizeof(std::tuple<IT,IT,NTO>[
nnzc])));
290#pragma omp parallel for
292 for(
size_t i=0; i <
Bdcsc->nzc; ++i)
337 if (!SR::returnedSAID())
436 for (
size_t j=0;
j < index; ++
j)
463 template <
typename SR,
typename NTO,
typename IT,
typename NT1,
typename NT2>
475 if(
A.isZero() ||
B.isZero())
484 float cf =
static_cast<float>(
nA+1) /
static_cast<float>(
Adcsc->nzc);
516 std::tuple<IT,IT,NTO> *
tuplesC =
new std::tuple<IT,IT,NTO>[
nnzc];
529#pragma omp parallel for
531 for(
size_t i=0; i <
Bdcsc->nzc; ++i)
619 for (
size_t j=0;
j < index; ++
j)
662template <
typename SR,
typename IT,
typename NT1,
typename NT2>
692template <
typename IT,
typename NT1,
typename NT2>
696 if(
A.isZero() ||
B.isZero())
704 float cf =
static_cast<float>(
A.getncol()+1) /
static_cast<float>(
Adcsc->nzc);
724#pragma omp parallel for
726 for(
IT i=0; i<
Bdcsc->nzc; ++i)
743#pragma omp parallel for
745 for(
int i=0; i <
Bdcsc->nzc; ++i)
806template <
typename IT,
typename NT1,
typename NT2>
810 if(
A.isZero() ||
B.isZero())
818 float cf =
static_cast<float>(
A.getncol()+1) /
static_cast<float>(
Adcsc->nzc);
859#pragma omp parallel for
861 for(
int i=0; i <
Bdcsc->nzc; ++i)
936template <
typename IT,
typename NT1,
typename NT2>
945 if (
A.isZero() ||
B.isZero())
966 nthds = omp_get_num_threads();
973 std::default_random_engine
gen;
977 #pragma omp parallel for
984 #pragma omp parallel for
987 samples_mid[i] = std::numeric_limits<float>::max();
990 #pragma omp parallel for
992 for (
IT i = 0; i <
Adcsc->nzc; ++i)
1000 for (
int k = 0; k <
nrounds; ++k)
1011 sizeof(*samples_final));
1017 #pragma omp parallel for reduction (+:nnzest)
1019 for (
IT i = 0; i <
Bdcsc->nzc; ++i)
1023 tid = omp_get_thread_num();
1034 for (
int k = 0; k <
nrounds; ++k)
1057template <
typename IT,
typename NT1,
typename NT2>
1061 if(
A.isZero() ||
B.isZero())
1069 float cf =
static_cast<float>(
A.getncol()+1) /
static_cast<float>(
Adcsc->nzc);
1092#pragma omp parallel for
1094 for(
IT i=0; i<
Bdcsc->nzc; ++i)
1110#pragma omp parallel for
1112 for(
int i=0; i <
Bdcsc->nzc; ++i)
1142template <
typename SR,
1160 if(
A.isZero() ||
B.isZero())
1168 #pragma omp parallel
1185 std::tuple<IT, IT, NTO> *
tuplesC =
static_cast<std::tuple<IT, IT, NTO> *
>
1186 (::operator
new (
sizeof(std::tuple<IT, IT, NTO>[
nnzc])));
1189 #pragma omp parallel for
1191 for (
size_t i = 0; i <
Bcsc->n; ++i)
1194 double cr =
static_cast<double>
1219 if (!SR::returnedSAID())
1253 std::vector< std::pair<IT, NTO>>
T(
ht_size);
1256 T[
j].first = std::numeric_limits<IT>::max();
1270 if (
T[
hv].first == key)
1272 else if (
T[
hv].first == std::numeric_limits<IT>::max())
1288 if (
T[
j].first != std::numeric_limits<IT>::max())
1295 for (
size_t j = 0;
j < index; ++
j)
1318template <
typename IT,
1327 if (
A.isZero() ||
B.isZero())
1335 #pragma omp parallel
1344 #pragma omp parallel for
1346 for (
IT i = 0; i <
Bcsc->n; ++i)
1350 #pragma omp parallel for
1352 for (
IT i = 0; i <
Bcsc->n; ++i)
1363template <
typename IT,
1373 if (
A.isZero() ||
B.isZero())
1381 #pragma omp parallel
1390 #pragma omp parallel for
1392 for (
IT i = 0; i <
Bcsc->n; ++i)
1396 #pragma omp parallel for
1398 for (
IT i = 0; i <
Bcsc->n; ++i)
1410 T[
j] = std::numeric_limits<IT>::max();
1423 else if (
T[
hv] == std::numeric_limits<IT>::max())
SelectMaxSRing< bool, int64_t > SR
SpTuples< IT, NTO > * LocalSpGEMMHash(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, bool clearA, bool clearB, bool sort=true)
IT * estimateFLOP(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, IT *aux=nullptr)
SpTuples< IT, NTO > * LocalSpGEMM(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, bool clearA, bool clearB)
IT EstimateLocalFLOP(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, bool clearA, bool clearB)
SpTuples< IT, NTO > * LocalHybridSpGEMM(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, bool clearA, bool clearB, IT *aux=nullptr)
bool sort_less(const std::pair< IT, NT > &left, const std::pair< IT, NT > &right)
T * prefixsum(T *in, int size, int nthreads)
int64_t estimateNNZ_sampling(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, int nrounds=5)
IT * estimateNNZ(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, IT *aux=nullptr, bool freeaux=true)
IT * estimateNNZ_Hash(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, IT *flopC, IT *aux=nullptr)