// Standard headers used by this excerpt; SpDCCols, SpCCols, Dcsc, Csc,
// SpTuples and HeapEntry come from CombBLAS' own headers, which belong to
// the (not shown) preamble of the original file.
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <limits>
#include <random>
#include <tuple>
#include <utility>
#include <vector>
#include <mpi.h>
#ifdef THREADED
#include <omp.h>
#endif

// Thread-parallel prefix sum: returns a newly allocated array of length
// size+1 with out[0] = 0 and out[i] = in[0] + ... + in[i-1].
template <typename T>
T* prefixsum(T* in, int size, int nthreads)
{
    std::vector<T> tsum(nthreads+1);
    tsum[0] = 0;
    T* out = new T[size+1];
    out[0] = 0;
    T* psum = &out[1];
#ifdef THREADED
#pragma omp parallel
#endif
    {
        int ithread = 0;
#ifdef THREADED
        ithread = omp_get_thread_num();
#endif
        T sum = 0;
        // First pass: each thread scans its static chunk and records the
        // chunk total in tsum.
#ifdef THREADED
#pragma omp for schedule(static)
#endif
        for (int i=0; i<size; i++)
        {
            sum += in[i];
            psum[i] = sum;
        }
        tsum[ithread+1] = sum;
#ifdef THREADED
#pragma omp barrier
#endif
        // Offset of this thread's chunk = sum of all lower-numbered chunks.
        T offset = 0;
        for(int i=0; i<(ithread+1); i++)
        {
            offset += tsum[i];
        }
        // Second pass: shift the chunk into its global position.
#ifdef THREADED
#pragma omp for schedule(static)
#endif
        for (int i=0; i<size; i++)
        {
            psum[i] += offset;
        }
    }
    return out;
}
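// ------------------------------------------------------------------
// LocalSpGEMM: heap-based column-by-column SpGEMM over semiring SR.
// For each nonzero column of B, the matching columns of A are merged
// with a binary heap (multiway merge), and entries sharing a row index
// are accumulated via SR::add. Output positions are precomputed, so
// every thread writes to a disjoint slice of tuplesC.
// ------------------------------------------------------------------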
template <typename SR, typename NTO, typename IT, typename NT1, typename NT2>
SpTuples<IT, NTO> * LocalSpGEMM
(const SpDCCols<IT, NT1> & A,
 const SpDCCols<IT, NT2> & B,
 bool clearA, bool clearB)
{
    IT mdim = A.getnrow();
    IT ndim = B.getncol();
    IT nnzA = A.getnnz();
    if(A.isZero() || B.isZero())
    {
        return new SpTuples<IT, NTO>(0, mdim, ndim);
    }

    Dcsc<IT,NT1>* Adcsc = A.GetDCSC();
    Dcsc<IT,NT2>* Bdcsc = B.GetDCSC();
    IT nA = A.getncol();
    float cf = static_cast<float>(nA+1) / static_cast<float>(Adcsc->nzc);
    IT csize = static_cast<IT>(ceil(cf));   // chunk size of the aux index
    IT * aux;
    Adcsc->ConstructAux(nA, aux);

    int numThreads = 1;
#ifdef THREADED
#pragma omp parallel
    {
        numThreads = omp_get_num_threads();
    }
#endif

    // Symbolic phase: exact nnz in every nonzero column of C, turned into
    // column start offsets so that threads write disjoint output ranges.
    IT* colnnzC = estimateNNZ(A, B, aux, false);   // aux is reused below
    IT* colptrC = prefixsum<IT>(colnnzC, Bdcsc->nzc, numThreads);
    delete [] colnnzC;
    IT nnzc = colptrC[Bdcsc->nzc];
    std::tuple<IT,IT,NTO> * tuplesC = static_cast<std::tuple<IT,IT,NTO> *>
        (::operator new (sizeof(std::tuple<IT,IT,NTO>[nnzc])));

    // Thread-private scratch: column ranges of A and the merge heap.
    std::vector<std::vector< std::pair<IT,IT>>> colindsVec(numThreads);
    std::vector<std::vector<HeapEntry<IT,NT1>>> globalheapVec(numThreads);

    for(int i=0; i<numThreads; i++)   // initial allocation, grown on demand
    {
        colindsVec[i].resize(nnzA/numThreads);
        globalheapVec[i].resize(nnzA/numThreads);
    }

    size_t Bnzc = (size_t) Bdcsc->nzc;
#ifdef THREADED
#pragma omp parallel for
#endif
    for(size_t i=0; i < Bnzc; ++i)
    {
        size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i];   // nnz in B's current column
        int myThread = 0;
#ifdef THREADED
        myThread = omp_get_thread_num();
#endif
        if(colindsVec[myThread].size() < nnzcolB)
        {
            colindsVec[myThread].resize(nnzcolB);
            globalheapVec[myThread].resize(nnzcolB);
        }

        // colinds[j] = [current, end) range within the column of A that
        // matches the j-th nonzero of B's current column.
        Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
        std::pair<IT,IT> * colinds = colindsVec[myThread].data();
        HeapEntry<IT,NT1> * wset = globalheapVec[myThread].data();
        IT hsize = 0;

        for(size_t j = 0; j < nnzcolB; ++j)   // build the initial heap
        {
            if(colinds[j].first != colinds[j].second)   // current != end
            {
                wset[hsize++] = HeapEntry< IT,NT1 > (Adcsc->ir[colinds[j].first], j, Adcsc->numx[colinds[j].first]);
            }
        }
        std::make_heap(wset, wset+hsize);

        IT curptr = colptrC[i];
        while(hsize > 0)
        {
            std::pop_heap(wset, wset + hsize);   // popped entry lands in wset[hsize-1]
            IT locb = wset[hsize-1].runr;        // index of the B nonzero that fed this entry

            NTO mrhs = SR::multiply(wset[hsize-1].num, Bdcsc->numx[Bdcsc->cp[i]+locb]);
            if (!SR::returnedSAID())
            {
                if( (curptr > colptrC[i]) && std::get<0>(tuplesC[curptr-1]) == wset[hsize-1].key)
                {
                    // same row index as the last emitted tuple: accumulate
                    std::get<2>(tuplesC[curptr-1]) = SR::add(std::get<2>(tuplesC[curptr-1]), mrhs);
                }
                else
                {
                    tuplesC[curptr++] = std::make_tuple(wset[hsize-1].key, Bdcsc->jc[i], mrhs);
                }
            }

            if( (++(colinds[locb].first)) != colinds[locb].second)   // refill from the same A column
            {
                wset[hsize-1].key = Adcsc->ir[colinds[locb].first];
                wset[hsize-1].num = Adcsc->numx[colinds[locb].first];
                std::push_heap(wset, wset+hsize);
            }
            else
            {
                --hsize;   // that A column is exhausted
            }
        }
    }
    delete [] colptrC;
    delete [] aux;

    if(clearA)
        delete const_cast<SpDCCols<IT, NT1> *>(&A);
    if(clearB)
        delete const_cast<SpDCCols<IT, NT2> *>(&B);

    SpTuples<IT, NTO>* spTuplesC = new SpTuples<IT, NTO> (nnzc, mdim, ndim, tuplesC, true, true);
    return spTuplesC;
}
// Comparator for (row, value) pairs: orders by row index only.
template <typename IT, typename NT>
bool sort_less(const std::pair<IT, NT> &left, const std::pair<IT, NT> &right)
{
    return left.first < right.first;
}
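// ------------------------------------------------------------------
// LocalHybridSpGEMM: per-column choice between the heap-based merge and
// a hash-table accumulator. The choice is driven by the column's
// compression ratio cr = flops / output-nonzeros; columns with cr below
// a small threshold (2.0 here) use the heap, since each output nonzero
// needs few flops; the rest use the hash table. estimateFLOP and
// estimateNNZ_Hash supply the per-column counts used to size and index
// the output.
// ------------------------------------------------------------------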
template <typename SR, typename NTO, typename IT, typename NT1, typename NT2>
SpTuples<IT, NTO> * LocalHybridSpGEMM
(const SpDCCols<IT, NT1> & A,
 const SpDCCols<IT, NT2> & B,
 bool clearA, bool clearB, IT * aux = nullptr)
{
    IT mdim = A.getnrow();
    IT ndim = B.getncol();
    IT nnzA = A.getnnz();
    if(A.isZero() || B.isZero())
    {
        return new SpTuples<IT, NTO>(0, mdim, ndim);
    }

    Dcsc<IT,NT1>* Adcsc = A.GetDCSC();
    Dcsc<IT,NT2>* Bdcsc = B.GetDCSC();
    IT nA = A.getncol();
    float cf = static_cast<float>(nA+1) / static_cast<float>(Adcsc->nzc);
    IT csize = static_cast<IT>(ceil(cf));   // chunk size of the aux index

    bool deleteAux = false;
    if(aux == nullptr)
    {
        deleteAux = true;
        Adcsc->ConstructAux(nA, aux);
    }

    int numThreads = 1;
#ifdef THREADED
#pragma omp parallel
    {
        numThreads = omp_get_num_threads();
    }
#endif

    // Symbolic phase: per-column flops and exact per-column nnz of C.
    IT* flopC = estimateFLOP(A, B, aux);
    IT* flopptr = prefixsum<IT>(flopC, Bdcsc->nzc, numThreads);
    IT flop = flopptr[Bdcsc->nzc];
    IT* colnnzC = estimateNNZ_Hash(A, B, flopC, aux);
    IT* colptrC = prefixsum<IT>(colnnzC, Bdcsc->nzc, numThreads);
    delete [] colnnzC;
    delete [] flopC;

    IT nnzc = colptrC[Bdcsc->nzc];
    double compression_ratio = (double)flop / nnzc;   // overall flops per output nonzero

    std::tuple<IT,IT,NTO> * tuplesC = static_cast<std::tuple<IT,IT,NTO> *>
        (::operator new (sizeof(std::tuple<IT,IT,NTO>[nnzc])));

    // Thread-private scratch for both algorithms.
    std::vector<std::vector< std::pair<IT,IT>>> colindsVec(numThreads);
    std::vector<std::vector< std::pair<IT,NTO>>> globalHashVecAll(numThreads);
    std::vector<std::vector< HeapEntry<IT,NT1>>> globalHeapVecAll(numThreads);

#ifdef THREADED
#pragma omp parallel for
#endif
    for(size_t i=0; i < Bdcsc->nzc; ++i)
    {
        size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i];   // nnz in B's current column
        int myThread = 0;
#ifdef THREADED
        myThread = omp_get_thread_num();
#endif
        if(colindsVec[myThread].size() < nnzcolB)
        {
            colindsVec[myThread].resize(nnzcolB);
        }

        Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
        std::pair<IT,IT> * colinds = colindsVec[myThread].data();

        // Per-column compression ratio decides between heap and hash.
        double cr = static_cast<double>(flopptr[i+1] - flopptr[i]) / (colptrC[i+1] - colptrC[i]);
        if(cr < 2.0)   // low compression: heap-based multiway merge
        {
            if(globalHeapVecAll[myThread].size() < nnzcolB)
                globalHeapVecAll[myThread].resize(nnzcolB);
            HeapEntry<IT, NT1> * wset = globalHeapVecAll[myThread].data();
            IT hsize = 0;

            for(size_t j = 0; j < nnzcolB; ++j)   // build the initial heap
            {
                if(colinds[j].first != colinds[j].second)   // current != end
                {
                    wset[hsize++] = HeapEntry< IT,NT1 > (Adcsc->ir[colinds[j].first], j, Adcsc->numx[colinds[j].first]);
                }
            }
            std::make_heap(wset, wset+hsize);

            IT curptr = colptrC[i];
            while(hsize > 0)
            {
                std::pop_heap(wset, wset + hsize);
                IT locb = wset[hsize-1].runr;

                NTO mrhs = SR::multiply(wset[hsize-1].num, Bdcsc->numx[Bdcsc->cp[i]+locb]);
                if (!SR::returnedSAID())
                {
                    if( (curptr > colptrC[i]) && std::get<0>(tuplesC[curptr-1]) == wset[hsize-1].key)
                    {
                        std::get<2>(tuplesC[curptr-1]) = SR::add(std::get<2>(tuplesC[curptr-1]), mrhs);
                    }
                    else
                    {
                        tuplesC[curptr++] = std::make_tuple(wset[hsize-1].key, Bdcsc->jc[i], mrhs);
                    }
                }

                if( (++(colinds[locb].first)) != colinds[locb].second)
                {
                    wset[hsize-1].key = Adcsc->ir[colinds[locb].first];
                    wset[hsize-1].num = Adcsc->numx[colinds[locb].first];
                    std::push_heap(wset, wset+hsize);
                }
                else
                {
                    --hsize;
                }
            }
        }
        else   // high compression: hash accumulator
        {
            const IT minHashTableSize = 16;
            const IT hashScale = 107;
            size_t nnzcolC = colptrC[i+1] - colptrC[i];

            size_t ht_size = minHashTableSize;
            while(ht_size < nnzcolC)   // next power of two >= nnzcolC
            {
                ht_size <<= 1;
            }
            if(globalHashVecAll[myThread].size() < ht_size)
            {
                globalHashVecAll[myThread].resize(ht_size);
            }
            std::pair<IT,NTO>* globalHashVec = globalHashVecAll[myThread].data();

            for(size_t j=0; j < ht_size; ++j)   // -1 marks an empty slot
            {
                globalHashVec[j].first = -1;
            }

            for (size_t j=0; j < nnzcolB; ++j)
            {
                IT t_bcol = Bdcsc->ir[Bdcsc->cp[i] + j];
                NT2 t_bval = Bdcsc->numx[Bdcsc->cp[i] + j];
                for (IT k = colinds[j].first; k < colinds[j].second; ++k)
                {
                    NTO mrhs = SR::multiply(Adcsc->numx[k], t_bval);
                    IT key = Adcsc->ir[k];
                    IT hash = (key*hashScale) & (ht_size-1);
                    while (1)   // open addressing with linear probing
                    {
                        if (globalHashVec[hash].first == key)   // key exists: accumulate
                        {
                            globalHashVec[hash].second = SR::add(mrhs, globalHashVec[hash].second);
                            break;
                        }
                        else if (globalHashVec[hash].first == -1)   // empty slot: insert
                        {
                            globalHashVec[hash].first = key;
                            globalHashVec[hash].second = mrhs;
                            break;
                        }
                        else   // collision: probe the next slot
                        {
                            hash = (hash+1) & (ht_size-1);
                        }
                    }
                }
            }

            // Compact the occupied slots, sort by row index, and emit.
            size_t index = 0;
            for (size_t j=0; j < ht_size; ++j)
            {
                if (globalHashVec[j].first != -1)
                {
                    globalHashVec[index++] = globalHashVec[j];
                }
            }
            std::sort(globalHashVecAll[myThread].begin(), globalHashVecAll[myThread].begin() + index, sort_less<IT, NTO>);

            IT curptr = colptrC[i];
            for (size_t j=0; j < index; ++j)
            {
                tuplesC[curptr++] = std::make_tuple(globalHashVec[j].first, Bdcsc->jc[i], globalHashVec[j].second);
            }
        }
    }

    delete [] colptrC;
    delete [] flopptr;
    if(deleteAux)
        delete [] aux;

    if(clearA)
        delete const_cast<SpDCCols<IT, NT1> *>(&A);
    if(clearB)
        delete const_cast<SpDCCols<IT, NT2> *>(&B);

    SpTuples<IT, NTO>* spTuplesC = new SpTuples<IT, NTO> (nnzc, mdim, ndim, tuplesC, true, true);
    return spTuplesC;
}
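// ------------------------------------------------------------------
// LocalSpGEMMHash: pure hash SpGEMM. Each column of C is accumulated in
// a per-column open-addressing hash table (linear probing, table size
// the next power of two >= the column's nnz), then emitted either
// sorted by row index (sort == true) or in table order.
// ------------------------------------------------------------------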
template <typename SR, typename NTO, typename IT, typename NT1, typename NT2>
SpTuples<IT, NTO> * LocalSpGEMMHash
(const SpDCCols<IT, NT1> & A,
 const SpDCCols<IT, NT2> & B,
 bool clearA, bool clearB, bool sort = true)
{
    double t0 = MPI_Wtime();

    IT mdim = A.getnrow();
    IT ndim = B.getncol();
    IT nnzA = A.getnnz();
    if(A.isZero() || B.isZero())
    {
        return new SpTuples<IT, NTO>(0, mdim, ndim);
    }

    Dcsc<IT,NT1>* Adcsc = A.GetDCSC();
    Dcsc<IT,NT2>* Bdcsc = B.GetDCSC();
    IT nA = A.getncol();
    float cf = static_cast<float>(nA+1) / static_cast<float>(Adcsc->nzc);
    IT csize = static_cast<IT>(ceil(cf));
    IT * aux;
    Adcsc->ConstructAux(nA, aux);

    int numThreads = 1;
#ifdef THREADED
#pragma omp parallel
    {
        numThreads = omp_get_num_threads();
    }
#endif

    IT* flopC = estimateFLOP(A, B, aux);
    IT* flopptr = prefixsum<IT>(flopC, Bdcsc->nzc, numThreads);
    IT flop = flopptr[Bdcsc->nzc];
    IT* colnnzC = estimateNNZ_Hash(A, B, flopC, aux);
    IT* colptrC = prefixsum<IT>(colnnzC, Bdcsc->nzc, numThreads);
    delete [] colnnzC;
    delete [] flopC;

    IT nnzc = colptrC[Bdcsc->nzc];
    double compression_ratio = (double)flop / nnzc;

    std::tuple<IT,IT,NTO> * tuplesC = new std::tuple<IT,IT,NTO>[nnzc];

    std::vector<std::vector< std::pair<IT,IT>>> colindsVec(numThreads);
    for(int i=0; i<numThreads; i++)
    {
        colindsVec[i].resize(nnzA/numThreads);
    }

#ifdef THREADED
#pragma omp parallel for
#endif
    for(size_t i=0; i < Bdcsc->nzc; ++i)
    {
        size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i];
        int myThread = 0;
#ifdef THREADED
        myThread = omp_get_thread_num();
#endif
        if(colindsVec[myThread].size() < nnzcolB)
        {
            colindsVec[myThread].resize(nnzcolB);
        }

        Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
        std::pair<IT,IT> * colinds = colindsVec[myThread].data();

        const IT minHashTableSize = 16;
        const IT hashScale = 107;
        size_t nnzcolC = colptrC[i+1] - colptrC[i];

        size_t ht_size = minHashTableSize;
        while(ht_size < nnzcolC)   // next power of two >= nnzcolC
        {
            ht_size <<= 1;
        }
        std::vector< std::pair<IT,NTO>> globalHashVec(ht_size);

        for(size_t j=0; j < ht_size; ++j)   // -1 marks an empty slot
        {
            globalHashVec[j].first = -1;
        }

        for (size_t j=0; j < nnzcolB; ++j)
        {
            IT t_bcol = Bdcsc->ir[Bdcsc->cp[i] + j];
            NT2 t_bval = Bdcsc->numx[Bdcsc->cp[i] + j];
            for (IT k = colinds[j].first; k < colinds[j].second; ++k)
            {
                NTO mrhs = SR::multiply(Adcsc->numx[k], t_bval);
                IT key = Adcsc->ir[k];
                IT hash = (key*hashScale) & (ht_size-1);
                while (1)   // open addressing with linear probing
                {
                    if (globalHashVec[hash].first == key)
                    {
                        globalHashVec[hash].second = SR::add(mrhs, globalHashVec[hash].second);
                        break;
                    }
                    else if (globalHashVec[hash].first == -1)
                    {
                        globalHashVec[hash].first = key;
                        globalHashVec[hash].second = mrhs;
                        break;
                    }
                    else
                    {
                        hash = (hash+1) & (ht_size-1);
                    }
                }
            }
        }

        if(sort)
        {
            // Compact, sort by row index, then emit in order.
            size_t index = 0;
            for (size_t j=0; j < ht_size; ++j)
            {
                if (globalHashVec[j].first != -1)
                {
                    globalHashVec[index++] = globalHashVec[j];
                }
            }
            std::sort(globalHashVec.begin(), globalHashVec.begin() + index, sort_less<IT, NTO>);

            IT curptr = colptrC[i];
            for (size_t j=0; j < index; ++j)
            {
                tuplesC[curptr++] = std::make_tuple(globalHashVec[j].first, Bdcsc->jc[i], globalHashVec[j].second);
            }
        }
        else
        {
            // Emit in hash-table order, leaving the column unsorted.
            IT curptr = colptrC[i];
            for (size_t j=0; j < ht_size; ++j)
            {
                if (globalHashVec[j].first != -1)
                {
                    tuplesC[curptr++] = std::make_tuple(globalHashVec[j].first, Bdcsc->jc[i], globalHashVec[j].second);
                }
            }
        }
    }

    delete [] colptrC;
    delete [] flopptr;
    delete [] aux;

    if(clearA)
        delete const_cast<SpDCCols<IT, NT1> *>(&A);
    if(clearB)
        delete const_cast<SpDCCols<IT, NT2> *>(&B);

    SpTuples<IT, NTO>* spTuplesC = new SpTuples<IT, NTO> (nnzc, mdim, ndim, tuplesC, true, false);

    double t1 = MPI_Wtime();   // t1 - t0 is the kernel time; its consumer is not part of this excerpt
    return spTuplesC;
}
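// ------------------------------------------------------------------
// EstimateLocalFLOP: total multiply-add count of A*B, obtained by
// prefix-summing the per-column flop counts from estimateFLOP.
// ------------------------------------------------------------------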
template <typename SR, typename IT, typename NT1, typename NT2>
IT EstimateLocalFLOP
(const SpDCCols<IT, NT1> & A,
 const SpDCCols<IT, NT2> & B,
 bool clearA, bool clearB)
{
    Dcsc<IT,NT1>* Adcsc = A.GetDCSC();
    Dcsc<IT,NT2>* Bdcsc = B.GetDCSC();

    int numThreads = 1;
#ifdef THREADED
#pragma omp parallel
    {
        numThreads = omp_get_num_threads();
    }
#endif

    IT* flopC = estimateFLOP(A, B);
    IT* flopptr = prefixsum<IT>(flopC, Bdcsc->nzc, numThreads);
    IT flop = flopptr[Bdcsc->nzc];
    delete [] flopC;
    delete [] flopptr;

    if(clearA)
        delete const_cast<SpDCCols<IT, NT1> *>(&A);
    if(clearB)
        delete const_cast<SpDCCols<IT, NT2> *>(&B);

    return flop;
}
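// ------------------------------------------------------------------
// estimateNNZ: exact per-column nnz of C via a symbolic heap merge; a
// row index is counted once per column by comparing against the
// previously popped row. Returns a newly allocated array indexed by
// B's nonzero columns.
// ------------------------------------------------------------------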
template <typename IT, typename NT1, typename NT2>
IT* estimateNNZ
(const SpDCCols<IT, NT1> & A,
 const SpDCCols<IT, NT2> & B,
 IT * aux = nullptr, bool freeaux = true)
{
    IT nnzA = A.getnnz();
    if(A.isZero() || B.isZero())
    {
        return nullptr;
    }

    Dcsc<IT,NT1>* Adcsc = A.GetDCSC();
    Dcsc<IT,NT2>* Bdcsc = B.GetDCSC();

    float cf = static_cast<float>(A.getncol()+1) / static_cast<float>(Adcsc->nzc);
    IT csize = static_cast<IT>(ceil(cf));   // chunk size of the aux index
    if(aux == nullptr)
    {
        Adcsc->ConstructAux(A.getncol(), aux);
    }

    int numThreads = 1;
#ifdef THREADED
#pragma omp parallel
    {
        numThreads = omp_get_num_threads();
    }
#endif

    IT* colnnzC = new IT[Bdcsc->nzc];   // nnz in every nonzero column of C
#ifdef THREADED
#pragma omp parallel for
#endif
    for(IT i=0; i< Bdcsc->nzc; ++i)
    {
        colnnzC[i] = 0;
    }

    std::vector<std::vector< std::pair<IT,IT>>> colindsVec(numThreads);
    std::vector<std::vector<std::pair<IT,IT>>> globalheapVec(numThreads);

    for(int i=0; i<numThreads; i++)   // initial allocation, grown on demand
    {
        colindsVec[i].resize(nnzA/numThreads);
        globalheapVec[i].resize(nnzA/numThreads);
    }

#ifdef THREADED
#pragma omp parallel for
#endif
    for(int i=0; i < Bdcsc->nzc; ++i)
    {
        size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i];
        int myThread = 0;
#ifdef THREADED
        myThread = omp_get_thread_num();
#endif
        if(colindsVec[myThread].size() < nnzcolB)
        {
            colindsVec[myThread].resize(nnzcolB);
            globalheapVec[myThread].resize(nnzcolB);
        }

        Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
        std::pair<IT,IT> * colinds = colindsVec[myThread].data();
        std::pair<IT,IT> * curheap = globalheapVec[myThread].data();
        IT hsize = 0;

        for(IT j = 0; (unsigned)j < nnzcolB; ++j)   // build the initial min-heap
        {
            if(colinds[j].first != colinds[j].second)   // current != end
            {
                curheap[hsize++] = std::make_pair(Adcsc->ir[colinds[j].first], j);
            }
        }
        std::make_heap(curheap, curheap+hsize, std::greater<std::pair<IT,IT>>());

        IT prevRow = -1;   // previously counted row index
        while(hsize > 0)
        {
            std::pop_heap(curheap, curheap + hsize, std::greater<std::pair<IT,IT>>());
            IT locb = curheap[hsize-1].second;

            if( curheap[hsize-1].first != prevRow)   // a new row index for this column
            {
                prevRow = curheap[hsize-1].first;
                colnnzC[i]++;
            }

            if( (++(colinds[locb].first)) != colinds[locb].second)
            {
                curheap[hsize-1].first = Adcsc->ir[colinds[locb].first];
                std::push_heap(curheap, curheap+hsize, std::greater<std::pair<IT,IT>>());
            }
            else
            {
                --hsize;
            }
        }
    }

    if (freeaux)
        delete [] aux;
    return colnnzC;
}
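// ------------------------------------------------------------------
// estimateNNZ_Hash: symbolic hash variant of the above. The table is
// sized by the column's flop count, an upper bound on its distinct row
// indices, so insertion never overflows.
// ------------------------------------------------------------------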
template <typename IT, typename NT1, typename NT2>
IT* estimateNNZ_Hash
(const SpDCCols<IT, NT1> & A,
 const SpDCCols<IT, NT2> & B,
 IT * flopC, IT * aux = nullptr)
{
    IT nnzA = A.getnnz();
    if(A.isZero() || B.isZero())
    {
        return nullptr;
    }

    Dcsc<IT,NT1>* Adcsc = A.GetDCSC();
    Dcsc<IT,NT2>* Bdcsc = B.GetDCSC();

    float cf = static_cast<float>(A.getncol()+1) / static_cast<float>(Adcsc->nzc);
    IT csize = static_cast<IT>(ceil(cf));   // chunk size of the aux index
    bool deleteAux = false;
    if(aux == nullptr)
    {
        deleteAux = true;
        Adcsc->ConstructAux(A.getncol(), aux);
    }

    int numThreads = 1;
#ifdef THREADED
#pragma omp parallel
    {
        numThreads = omp_get_num_threads();
    }
#endif

    IT* colnnzC = new IT[Bdcsc->nzc];   // nnz in every nonzero column of C
#ifdef THREADED
#pragma omp parallel for
#endif
    for(IT i=0; i< Bdcsc->nzc; ++i)
    {
        colnnzC[i] = 0;
    }

    std::vector<std::vector< std::pair<IT,IT>>> colindsVec(numThreads);
    std::vector<std::vector< IT>> globalHashVecAll(numThreads);

#ifdef THREADED
#pragma omp parallel for
#endif
    for(int i=0; i < Bdcsc->nzc; ++i)
    {
        size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i];
        int myThread = 0;
#ifdef THREADED
        myThread = omp_get_thread_num();
#endif
        if(colindsVec[myThread].size() < nnzcolB)
        {
            colindsVec[myThread].resize(nnzcolB);
        }

        Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
        std::pair<IT,IT> * colinds = colindsVec[myThread].data();

        const IT minHashTableSize = 16;
        const IT hashScale = 107;

        IT ht_size = minHashTableSize;
        while(ht_size < flopC[i])   // next power of two >= flopC[i]
        {
            ht_size <<= 1;
        }
        if(globalHashVecAll[myThread].size() < ht_size)
        {
            globalHashVecAll[myThread].resize(ht_size);
        }
        IT* globalHashVec = globalHashVecAll[myThread].data();

        for(IT j=0; (unsigned)j < ht_size; ++j)   // -1 marks an empty slot
        {
            globalHashVec[j] = -1;
        }

        for (IT j=0; (unsigned)j < nnzcolB; ++j)
        {
            IT t_bcol = Bdcsc->ir[Bdcsc->cp[i] + j];
            for (IT k = colinds[j].first; (unsigned)k < colinds[j].second; ++k)
            {
                IT key = Adcsc->ir[k];
                IT hash = (key*hashScale) & (ht_size-1);
                while (1)   // open addressing with linear probing
                {
                    if (globalHashVec[hash] == key)   // row already counted
                    {
                        break;
                    }
                    else if (globalHashVec[hash] == -1)   // new row index
                    {
                        globalHashVec[hash] = key;
                        colnnzC[i]++;
                        break;
                    }
                    else   // collision: probe the next slot
                    {
                        hash = (hash+1) & (ht_size-1);
                    }
                }
            }
        }
    }

    if(deleteAux)
        delete [] aux;
    return colnnzC;
}
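// ------------------------------------------------------------------
// estimateNNZ_sampling: probabilistic nnz(C) estimate via minimum
// exponential sketches (a Cohen-style estimator): each row of A draws
// nrounds Exp(1) samples, minima are propagated through A and then B,
// and each column's distinct-row count is estimated as
// (nrounds - 1) / sum-of-minima.
// ------------------------------------------------------------------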
template <typename IT, typename NT1, typename NT2>
int64_t estimateNNZ_sampling
(const SpDCCols<IT, NT1> & A,
 const SpDCCols<IT, NT2> & B,
 int nrounds = 5)
{
    IT nnzA = A.getnnz();
    if (A.isZero() || B.isZero())
    {
        return 0;
    }

    Dcsc<IT,NT1> *Adcsc = A.GetDCSC();
    Dcsc<IT,NT2> *Bdcsc = B.GetDCSC();

    float usedmem = 0.0f;

    float *samples_init, *samples_mid, *samples_final;
    IT m = A.getnrow();
    IT p = A.getncol();   // == B.getnrow()
    samples_init = (float *) malloc(m * nrounds * sizeof(*samples_init));
    samples_mid = (float *) malloc(p * nrounds * sizeof(*samples_mid));

    int nthds = 1;
#ifdef THREADED
    #pragma omp parallel
    {
        nthds = omp_get_num_threads();
    }
#endif

    float lambda = 1.0f;   // rate of the exponential draws
    std::default_random_engine gen;
    std::exponential_distribution<float> exp_dist(lambda);

#ifdef THREADED
    #pragma omp parallel for
#endif
    for (IT i = 0; i < m * nrounds; ++i)
        samples_init[i] = exp_dist(gen);

#ifdef THREADED
    #pragma omp parallel for
#endif
    for (IT i = 0; i < p * nrounds; ++i)
        samples_mid[i] = std::numeric_limits<float>::max();

    // Propagate each round's minimum sample from the rows of A to its columns.
#ifdef THREADED
    #pragma omp parallel for
#endif
    for (IT i = 0; i < Adcsc->nzc; ++i)
    {
        IT col = Adcsc->jc[i];
        IT beg_mid = col * nrounds;
        for (IT j = Adcsc->cp[i]; j < Adcsc->cp[i + 1]; ++j)
        {
            IT row = Adcsc->ir[j];
            IT beg_init = row * nrounds;
            for (int k = 0; k < nrounds; ++k)
            {
                if (samples_init[beg_init + k] < samples_mid[beg_mid + k])
                    samples_mid[beg_mid + k] = samples_init[beg_init + k];
            }
        }
    }

    samples_final = (float *) malloc(B.getnzc() * nrounds *
                                     sizeof(*samples_final));
    float *colest = (float *) malloc(B.getnzc() * sizeof(*colest));
    float nnzest = 0.0f;

    // Propagate minima through B and turn each column's sum of minima
    // into an estimate of its distinct-row count.
#ifdef THREADED
    #pragma omp parallel for reduction (+:nnzest)
#endif
    for (IT i = 0; i < Bdcsc->nzc; ++i)
    {
        int tid = 0;
#ifdef THREADED
        tid = omp_get_thread_num();
#endif
        IT beg_final = i * nrounds;
        for (IT k = beg_final; k < beg_final + nrounds; ++k)
            samples_final[k] = std::numeric_limits<float>::max();

        for (IT j = Bdcsc->cp[i]; j < Bdcsc->cp[i + 1]; ++j)
        {
            IT row = Bdcsc->ir[j];
            IT beg_mid = row * nrounds;
            for (int k = 0; k < nrounds; ++k)
            {
                if (samples_mid[beg_mid + k] < samples_final[beg_final + k])
                    samples_final[beg_final + k] = samples_mid[beg_mid + k];
            }
        }

        colest[i] = 0.0f;
        for (IT k = beg_final; k < beg_final + nrounds; ++k)
            colest[i] += samples_final[k];
        colest[i] = static_cast<float>(nrounds - 1) / colest[i];

        nnzest += colest[i];
    }

    free(samples_init);
    free(samples_mid);
    free(samples_final);
    free(colest);

    return static_cast<int64_t>(nnzest);
}
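// ------------------------------------------------------------------
// estimateFLOP: exact per-column flop count of A*B; for each nonzero
// B(k,j) it adds nnz(A(:,k)), read off the column ranges filled in by
// FillColInds.
// ------------------------------------------------------------------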
template <typename IT, typename NT1, typename NT2>
IT* estimateFLOP
(const SpDCCols<IT, NT1> & A,
 const SpDCCols<IT, NT2> & B,
 IT * aux = nullptr)
{
    IT nnzA = A.getnnz();
    if(A.isZero() || B.isZero())
    {
        return nullptr;
    }

    Dcsc<IT,NT1>* Adcsc = A.GetDCSC();
    Dcsc<IT,NT2>* Bdcsc = B.GetDCSC();

    float cf = static_cast<float>(A.getncol()+1) / static_cast<float>(Adcsc->nzc);
    IT csize = static_cast<IT>(ceil(cf));   // chunk size of the aux index

    bool deleteAux = false;
    if(aux == nullptr)
    {
        deleteAux = true;
        Adcsc->ConstructAux(A.getncol(), aux);
    }

    int numThreads = 1;
#ifdef THREADED
#pragma omp parallel
    {
        numThreads = omp_get_num_threads();
    }
#endif

    IT* colflopC = new IT[Bdcsc->nzc];   // flops for every nonzero column of C
#ifdef THREADED
#pragma omp parallel for
#endif
    for(IT i=0; i< Bdcsc->nzc; ++i)
    {
        colflopC[i] = 0;
    }

    std::vector<std::vector< std::pair<IT,IT>>> colindsVec(numThreads);

#ifdef THREADED
#pragma omp parallel for
#endif
    for(int i=0; i < Bdcsc->nzc; ++i)
    {
        size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i];
        int myThread = 0;
#ifdef THREADED
        myThread = omp_get_thread_num();
#endif
        if(colindsVec[myThread].size() < nnzcolB)
        {
            colindsVec[myThread].resize(nnzcolB);
        }

        Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
        for (IT j = 0; (unsigned)j < nnzcolB; ++j)
        {
            colflopC[i] += colindsVec[myThread][j].second - colindsVec[myThread][j].first;
        }
    }

    if(deleteAux)
        delete [] aux;
    return colflopC;
}
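// ------------------------------------------------------------------
// SpCCols (plain CSC) overload of LocalHybridSpGEMM. Same per-column
// heap/hash choice as the Dcsc version, but columns are addressed
// directly through Csc::jc, and the hash table uses
// std::numeric_limits<IT>::max() as the empty marker instead of -1.
// ------------------------------------------------------------------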
template <typename SR, typename NTO, typename IT, typename NT1, typename NT2>
SpTuples<IT, NTO> * LocalHybridSpGEMM
(const SpCCols<IT, NT1> & A,
 const SpCCols<IT, NT2> & B,
 bool clearA, bool clearB)
{
    double t0 = MPI_Wtime();

    IT mdim = A.getnrow();
    IT ndim = B.getncol();
    IT nnzA = A.getnnz();
    if(A.isZero() || B.isZero())
        return new SpTuples<IT, NTO>(0, mdim, ndim);

    Csc<IT, NT1> *Acsc = A.GetCSC();
    Csc<IT, NT2> *Bcsc = B.GetCSC();

    int numThreads = 1;
#ifdef THREADED
    #pragma omp parallel
    {
        numThreads = omp_get_num_threads();
    }
#endif

    // Symbolic phase: per-column flops and exact per-column nnz of C.
    IT *flopC = estimateFLOP(A, B);
    IT *flopptr = prefixsum<IT>(flopC, Bcsc->n, numThreads);
    IT flop = flopptr[Bcsc->n];
    IT *colnnzC = estimateNNZ_Hash(A, B, flopC);
    IT *colptrC = prefixsum<IT>(colnnzC, Bcsc->n, numThreads);
    delete [] colnnzC;
    delete [] flopC;

    IT nnzc = colptrC[Bcsc->n];
    double compression_ratio = (double)flop / nnzc;

    std::tuple<IT, IT, NTO> *tuplesC = static_cast<std::tuple<IT, IT, NTO> *>
        (::operator new (sizeof(std::tuple<IT, IT, NTO>[nnzc])));

#ifdef THREADED
    #pragma omp parallel for
#endif
    for (size_t i = 0; i < Bcsc->n; ++i)
    {
        size_t nnzcolB = Bcsc->jc[i + 1] - Bcsc->jc[i];
        double cr = static_cast<double>
            (flopptr[i+1] - flopptr[i]) / (colptrC[i+1] - colptrC[i]);

        if (cr < 2.0)   // low compression: heap-based multiway merge
        {
            std::vector<IT> cnt(nnzcolB);   // current position within each A column
            std::vector<HeapEntry<IT, NT1>> globalheapVec(nnzcolB);
            HeapEntry<IT, NT1> *wset = globalheapVec.data();
            IT hsize = 0;

            for (IT j = Bcsc->jc[i]; j < Bcsc->jc[i + 1]; ++j)
            {
                IT ca = Bcsc->ir[j];   // column of A selected by this B nonzero
                cnt[j - Bcsc->jc[i]] = Acsc->jc[ca];
                if (Acsc->jc[ca] != Acsc->jc[ca + 1])
                    wset[hsize++] = HeapEntry<IT, NT1>
                        (Acsc->ir[Acsc->jc[ca]], j, Acsc->num[Acsc->jc[ca]]);
            }
            std::make_heap(wset, wset + hsize);

            IT curptr = colptrC[i];
            while (hsize > 0)
            {
                std::pop_heap(wset, wset + hsize);
                IT locb = wset[hsize - 1].runr;   // absolute index of the B nonzero
                NTO mrhs = SR::multiply(wset[hsize - 1].num, Bcsc->num[locb]);
                if (!SR::returnedSAID())
                {
                    if ((curptr > colptrC[i]) &&
                        std::get<0>(tuplesC[curptr - 1]) == wset[hsize - 1].key)
                        std::get<2>(tuplesC[curptr - 1]) =
                            SR::add(std::get<2>(tuplesC[curptr - 1]), mrhs);
                    else
                        tuplesC[curptr++] = std::make_tuple(wset[hsize - 1].key, i, mrhs);
                }

                IT locb_offset = locb - Bcsc->jc[i];
                IT ca = Bcsc->ir[locb];
                ++(cnt[locb_offset]);
                if (cnt[locb_offset] != Acsc->jc[ca + 1])
                {
                    wset[hsize - 1].key = Acsc->ir[cnt[locb_offset]];
                    wset[hsize - 1].num = Acsc->num[cnt[locb_offset]];
                    std::push_heap(wset, wset + hsize);
                }
                else
                {
                    --hsize;
                }
            }
        }
        else   // high compression: hash accumulator
        {
            const IT minHashTableSize = 16;
            const IT hashScale = 107;
            size_t nnzcolC = colptrC[i+1] - colptrC[i];
            size_t ht_size = minHashTableSize;
            while (ht_size < nnzcolC)   // next power of two >= nnzcolC
            {
                ht_size <<= 1;
            }
            std::vector< std::pair<IT, NTO>> T(ht_size);
            for (size_t j = 0; j < ht_size; ++j)   // max() marks an empty slot
                T[j].first = std::numeric_limits<IT>::max();

            for (IT j = Bcsc->jc[i]; j < Bcsc->jc[i + 1]; ++j)
            {
                IT t_bcol = Bcsc->ir[j];
                NT2 t_bval = Bcsc->num[j];
                for (IT k = Acsc->jc[t_bcol]; k < Acsc->jc[t_bcol + 1]; ++k)
                {
                    NTO mrhs = SR::multiply(Acsc->num[k], t_bval);
                    IT key = Acsc->ir[k];
                    IT hv = (key * hashScale) & (ht_size - 1);
                    while (1)   // open addressing with linear probing
                    {
                        if (T[hv].first == key)
                        {
                            T[hv].second = SR::add(mrhs, T[hv].second);
                            break;
                        }
                        else if (T[hv].first == std::numeric_limits<IT>::max())
                        {
                            T[hv].first = key;
                            T[hv].second = mrhs;
                            break;
                        }
                        else
                        {
                            hv = (hv + 1) & (ht_size - 1);
                        }
                    }
                }
            }

            // Compact, sort by row index, and emit.
            size_t index = 0;
            for (size_t j = 0; j < ht_size; ++j)
            {
                if (T[j].first != std::numeric_limits<IT>::max())
                    T[index++] = T[j];
            }
            std::sort(T.begin(), T.begin() + index, sort_less<IT, NTO>);

            IT curptr = colptrC[i];
            for (size_t j = 0; j < index; ++j)
                tuplesC[curptr++] = std::make_tuple(T[j].first, i, T[j].second);
        }
    }

    delete [] colptrC;
    delete [] flopptr;

    if (clearA)
        delete const_cast<SpCCols<IT, NT1> *>(&A);
    if (clearB)
        delete const_cast<SpCCols<IT, NT2> *>(&B);

    SpTuples<IT, NTO> *spTuplesC =
        new SpTuples<IT, NTO> (nnzc, mdim, ndim, tuplesC, true, true);

    double t1 = MPI_Wtime();   // t1 - t0 is the kernel time; its consumer is not part of this excerpt
    return spTuplesC;
}
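// ------------------------------------------------------------------
// SpCCols overload of estimateFLOP: per-column flop counts read
// directly off the CSC column pointers.
// ------------------------------------------------------------------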
template <typename IT, typename NT1, typename NT2>
IT* estimateFLOP
(const SpCCols<IT, NT1> & A,
 const SpCCols<IT, NT2> & B)
{
    IT nnzA = A.getnnz();
    if (A.isZero() || B.isZero())
    {
        return nullptr;
    }

    Csc<IT, NT1> *Acsc = A.GetCSC();
    Csc<IT, NT2> *Bcsc = B.GetCSC();

    int numThreads = 1;
#ifdef THREADED
    #pragma omp parallel
    {
        numThreads = omp_get_num_threads();
    }
#endif

    IT *colflopC = new IT[Bcsc->n];   // flops for every column of C

#ifdef THREADED
    #pragma omp parallel for
#endif
    for (IT i = 0; i < Bcsc->n; ++i)
    {
        colflopC[i] = 0;
    }

#ifdef THREADED
    #pragma omp parallel for
#endif
    for (IT i = 0; i < Bcsc->n; ++i)
    {
        for (IT j = Bcsc->jc[i]; j < Bcsc->jc[i+1]; ++j)
        {
            colflopC[i] += Acsc->jc[Bcsc->ir[j]+1] - Acsc->jc[Bcsc->ir[j]];
        }
    }
    return colflopC;
}
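// ------------------------------------------------------------------
// SpCCols overload of estimateNNZ_Hash: symbolic hash pass with tables
// sized by the per-column flop counts.
// ------------------------------------------------------------------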
template <typename IT, typename NT1, typename NT2>
IT* estimateNNZ_Hash
(const SpCCols<IT, NT1> & A,
 const SpCCols<IT, NT2> & B,
 IT * flopC)
{
    IT nnzA = A.getnnz();
    if (A.isZero() || B.isZero())
    {
        return nullptr;
    }

    Csc<IT, NT1> *Acsc = A.GetCSC();
    Csc<IT, NT2> *Bcsc = B.GetCSC();

    int numThreads = 1;
#ifdef THREADED
    #pragma omp parallel
    {
        numThreads = omp_get_num_threads();
    }
#endif

    IT *colnnzC = new IT[Bcsc->n];   // nnz in every column of C

#ifdef THREADED
    #pragma omp parallel for
#endif
    for (IT i = 0; i < Bcsc->n; ++i)
    {
        colnnzC[i] = 0;
    }

#ifdef THREADED
    #pragma omp parallel for
#endif
    for (IT i = 0; i < Bcsc->n; ++i)
    {
        const IT minHashTableSize = 16;
        const IT hashScale = 107;
        IT ht_size = minHashTableSize;
        while (ht_size < flopC[i])   // next power of two >= flopC[i]
        {
            ht_size <<= 1;
        }
        std::vector<IT> T(ht_size);
        for (IT j = 0; (unsigned)j < ht_size; ++j)   // max() marks an empty slot
            T[j] = std::numeric_limits<IT>::max();

        for (IT j = Bcsc->jc[i]; j < Bcsc->jc[i + 1]; ++j)
        {
            for (IT k = Acsc->jc[Bcsc->ir[j]]; k < Acsc->jc[Bcsc->ir[j]+1]; ++k)
            {
                IT key = Acsc->ir[k];
                IT hv = (key * hashScale) & (ht_size - 1);
                while (1)   // open addressing with linear probing
                {
                    if (T[hv] == key)   // row already counted
                    {
                        break;
                    }
                    else if (T[hv] == std::numeric_limits<IT>::max())   // new row index
                    {
                        T[hv] = key;
                        colnnzC[i]++;
                        break;
                    }
                    else   // collision: probe the next slot
                    {
                        hv = (hv + 1) & (ht_size - 1);
                    }
                }
            }
        }
    }
    return colnnzC;
}
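// ------------------------------------------------------------------
// Typical use (a hypothetical sketch, not part of the original file;
// PlusTimesSRing is assumed to be CombBLAS' standard (+,*) semiring,
// and A and B are assumed to be SpDCCols<int64_t,double> objects built
// elsewhere). The multiply kernels return a triple list that the
// caller owns:
//
//   SpTuples<int64_t, double> *C =
//       LocalHybridSpGEMM<PlusTimesSRing<double, double>, double>
//           (A, B, false, false);   // false, false: keep A and B alive
//   // ... consume C ...
//   delete C;
// ------------------------------------------------------------------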