COMBINATORIAL_BLAS 1.6
 
Loading...
Searching...
No Matches
MultiwayMerge.h
Go to the documentation of this file.
1#ifndef _MULTIWAY_MERGE_H_
2#define _MULTIWAY_MERGE_H_
3
4#include "CombBLAS.h"
5
6namespace combblas {
7
8/*
9 Multithreaded prefix sum
10 Inputs:
11 in: an input array
12 size: the length of the input array "in"
13 nthreads: number of threads used to compute the prefix sum
14
15 Output:
16 return an array of size "size+1"
 17 the memory of the output array is allocated internally
18
19 Example:
20
21 in = [2, 1, 3, 5]
22 out = [0, 2, 3, 6, 11]
23 */
template <typename T>
T* prefixSum(T* in, int size, int nthreads)
{
    // tsum[i+1] will hold the local sum of thread i; tsum[0] stays 0.
    // std::vector value-initializes, so slots of threads that get no
    // iterations remain 0 and do not corrupt the offsets below.
    std::vector<T> tsum(nthreads+1);
    tsum[0] = 0;
    // Caller owns the returned array and must release it with delete[].
    T* out = new T[size+1];
    out[0] = 0;             // exclusive scan: out[0] is always 0
    T* psum = &out[1];      // psum[i] corresponds to out[i+1]
#ifdef THREADED
#pragma omp parallel
#endif
    {
        int ithread = 0;
#ifdef THREADED
        ithread = omp_get_thread_num();
#endif

        // Phase 1: each thread computes a running (inclusive) sum over its
        // own statically-scheduled chunk of the input.
        T sum = 0;
#ifdef THREADED
#pragma omp for schedule(static)
#endif
        for (int i=0; i<size; i++)
        {
            sum += in[i];
            psum[i] = sum;
        }

        // Publish this thread's chunk total, then wait for all totals.
        tsum[ithread+1] = sum;
#ifdef THREADED
#pragma omp barrier
#endif
        // offset = sum of the chunk totals of all lower-numbered threads.
        T offset = 0;
        for(int i=0; i<(ithread+1); i++)
        {
            offset += tsum[i];
        }

        // Phase 2: add the offset. The identical loop bounds with
        // schedule(static) guarantee each thread revisits exactly the same
        // chunk it scanned in phase 1 (OpenMP static-schedule guarantee).
#ifdef THREADED
#pragma omp for schedule(static)
#endif
        for (int i=0; i<size; i++)
        {
            psum[i] += offset;
        }

    }
    return out;
}
72
73
74/***************************************************************************
75 * Find indices of column splitters in a list of tuple in parallel.
76 * Inputs:
77 * tuples: an array of SpTuples each tuple is (rowid, colid, val)
78 * nsplits: number of splits requested
79 * Output:
80 * splitters: An array of size (nsplits+1) storing the starts and ends of split tuples.
81 * different type used for output since we might need int or IT
82 ***************************************************************************/
83
84template <typename RT, typename IT, typename NT>
85std::vector<RT> findColSplitters(SpTuples<IT,NT> * & spTuples, int nsplits)
86{
87 std::vector<RT> splitters(nsplits+1);
88 splitters[0] = static_cast<RT>(0);
89 ColLexiCompare<IT,NT> comp;
90#ifdef THREADED
91#pragma omp parallel for
92#endif
93 for(int i=1; i< nsplits; i++)
94 {
95 IT cur_col = i * (spTuples->getncol()/nsplits);
96 std::tuple<IT,IT,NT> search_tuple(0, cur_col, NT());
97 std::tuple<IT,IT,NT>* it = std::lower_bound (spTuples->tuples, spTuples->tuples + spTuples->getnnz(), search_tuple, comp);
98 splitters[i] = (RT) (it - spTuples->tuples);
99 }
100 splitters[nsplits] = spTuples->getnnz();
101
102 return splitters;
103}
104
105 // Find ColSplitters using finger search
106 // Run by one threrad
107 template <typename RT, typename IT, typename NT>
108 std::vector<RT> findColSplittersFinger(SpTuples<IT,NT> * & spTuples, int nsplits)
109 {
110 std::vector<RT> splitters(nsplits+1);
111 splitters[0] = static_cast<RT>(0);
112 ColLexiCompare<IT,NT> comp;
113
114 std::tuple<IT,IT,NT>* start = spTuples->tuples;
115 std::tuple<IT,IT,NT>* end = spTuples->tuples + spTuples->getnnz();
116 for(int i=1; i< nsplits; i++)
117 {
118 IT cur_col = i * (spTuples->getncol()/nsplits);
119 std::tuple<IT,IT,NT> search_tuple(0, cur_col, NT());
120 std::tuple<IT,IT,NT>* it = std::lower_bound (start, end, search_tuple, comp);
121 splitters[i] = (RT) (it - spTuples->tuples);
122 //start = it;
123 }
124 splitters[nsplits] = spTuples->getnnz();
125
126 return splitters;
127 }
128
129// Symbolic serial merge : only estimates nnz
130template<class IT, class NT>
131IT SerialMergeNNZ( const std::vector<SpTuples<IT,NT> *> & ArrSpTups)
132{
133 int nlists = ArrSpTups.size();
134 ColLexiCompare<IT,int> heapcomp;
135 std::vector<std::tuple<IT, IT, int>> heap(nlists);
136 std::vector<IT> curptr(nlists, static_cast<IT>(0));
137 IT hsize = 0;
138 for(int i=0; i< nlists; ++i)
139 {
140 if(ArrSpTups[i]->getnnz()>0)
141 {
142 heap[hsize++] = std::make_tuple(std::get<0>(ArrSpTups[i]->tuples[0]), std::get<1>(ArrSpTups[i]->tuples[0]), i);
143 }
144
145 }
146 std::make_heap(heap.data(), heap.data()+hsize, std::not2(heapcomp));
147
148 std::tuple<IT, IT, NT> curTuple;
149 IT estnnz = 0;
150 while(hsize > 0)
151 {
152 std::pop_heap(heap.data(), heap.data() + hsize, std::not2(heapcomp)); // result is stored in heap[hsize-1]
153 int source = std::get<2>(heap[hsize-1]);
154 if( (estnnz ==0) || (std::get<0>(curTuple) != std::get<0>(heap[hsize-1])) || (std::get<1>(curTuple) != std::get<1>(heap[hsize-1])))
155 {
156 curTuple = ArrSpTups[source]->tuples[curptr[source]];
157 estnnz++;
158 }
159 curptr[source]++;
160 if(curptr[source] != ArrSpTups[source]->getnnz()) // That array has not been depleted
161 {
162 heap[hsize-1] = std::make_tuple(std::get<0>(ArrSpTups[source]->tuples[curptr[source]]),
163 std::get<1>(ArrSpTups[source]->tuples[curptr[source]]), source);
164 std::push_heap(heap.data(), heap.data()+hsize, std::not2(heapcomp));
165 }
166 else
167 {
168 --hsize;
169 }
170 }
171 return estnnz;
172}
173
174
175/*
176 "Internal function" called by MultiwayMerge inside threaded region.
177 The merged list is stored in a preallocated buffer ntuples
178 Never called from outside.
179 Assumption1: the input lists are already column sorted
180 Assumption2: at least two lists are passed to this function
181 Assumption3: the input and output lists are to be deleted by caller
182 */
183
184template<class SR, class IT, class NT>
185void SerialMerge( const std::vector<SpTuples<IT,NT> *> & ArrSpTups, std::tuple<IT, IT, NT> * ntuples)
186{
187 int nlists = ArrSpTups.size();
188 ColLexiCompare<IT,int> heapcomp;
189 std::vector<std::tuple<IT, IT, int>> heap(nlists); // if performance issue, create this outside of threaded region
190 std::vector<IT> curptr(nlists, static_cast<IT>(0));
191 IT estnnz = 0;
192 IT hsize = 0;
193 for(int i=0; i< nlists; ++i)
194 {
195 if(ArrSpTups[i]->getnnz()>0)
196 {
197 estnnz += ArrSpTups[i]->getnnz();
198 heap[hsize++] = std::make_tuple(std::get<0>(ArrSpTups[i]->tuples[0]), std::get<1>(ArrSpTups[i]->tuples[0]), i);
199 }
200
201 }
202 std::make_heap(heap.data(), heap.data()+hsize, std::not2(heapcomp));
203 IT cnz = 0;
204
205 while(hsize > 0)
206 {
207 std::pop_heap(heap.data(), heap.data() + hsize, std::not2(heapcomp)); // result is stored in heap[hsize-1]
208 int source = std::get<2>(heap[hsize-1]);
209
210 if( (cnz != 0) &&
211 ((std::get<0>(ntuples[cnz-1]) == std::get<0>(heap[hsize-1])) && (std::get<1>(ntuples[cnz-1]) == std::get<1>(heap[hsize-1]))) )
212 {
213 std::get<2>(ntuples[cnz-1]) = SR::add(std::get<2>(ntuples[cnz-1]), ArrSpTups[source]->numvalue(curptr[source]++));
214 }
215 else
216 {
217 ntuples[cnz++] = ArrSpTups[source]->tuples[curptr[source]++];
218 }
219
220 if(curptr[source] != ArrSpTups[source]->getnnz()) // That array has not been depleted
221 {
222 heap[hsize-1] = std::make_tuple(std::get<0>(ArrSpTups[source]->tuples[curptr[source]]),
223 std::get<1>(ArrSpTups[source]->tuples[curptr[source]]), source);
224 std::push_heap(heap.data(), heap.data()+hsize, std::not2(heapcomp));
225 }
226 else
227 {
228 --hsize;
229 }
230 }
231}
232
233
234
235
236// Symbolic serial merge : only estimates nnz
237 template<class IT, class NT>
238 IT* SerialMergeNNZHash( const std::vector<SpTuples<IT,NT> *> & ArrSpTups, IT& totnnz, IT& maxnnzPerCol, IT startCol, IT endCol)
239 {
240
241 int nlists = ArrSpTups.size();
242 IT ncols = endCol - startCol; // in this split
243 std::vector<IT> curptr(nlists, static_cast<IT>(0));
244 const IT minHashTableSize = 16;
245 const IT hashScale = 107;
246 std::vector<NT> globalHashVec(minHashTableSize);
247
248
249
250 IT* colnnzC = new IT[ncols](); // nnz in every column of C
251 maxnnzPerCol = 0;
252 totnnz = 0;
253
254 for(IT col = 0; col<ncols; col++)
255 {
256 IT globalCol = col + startCol;
257
258 // symbolic flop
259 size_t nnzcol = 0;
260 for(int i=0; i<nlists; i++)
261 {
262 IT curidx = curptr[i];
263 while((ArrSpTups[i]->getnnz()>curidx) && (ArrSpTups[i]->colindex(curidx++) == globalCol))
264 {
265 nnzcol++;
266 }
267 }
268 size_t ht_size = minHashTableSize;
269 while(ht_size < nnzcol) //ht_size is set as 2^n
270 {
271 ht_size <<= 1;
272 }
273 if(globalHashVec.size() < ht_size)
274 globalHashVec.resize(ht_size);
275
276 for(size_t j=0; j < ht_size; ++j)
277 {
278 globalHashVec[j] = -1;
279 }
280 for(int i=0; i<nlists; i++)
281 {
282 //IT curcol = std::get<1>(ArrSpTups[i]->tuples[curptr[i]]);
283 while((ArrSpTups[i]->getnnz()>curptr[i]) && (ArrSpTups[i]->colindex(curptr[i]) == globalCol))
284 {
285 IT key = ArrSpTups[i]->rowindex(curptr[i]);
286 IT hash = (key*hashScale) & (ht_size-1);
287
288 while (1) //hash probing
289 {
290 if (globalHashVec[hash] == key) //key is found in hash table
291 {
292 break;
293 }
294 else if (globalHashVec[hash] == -1) //key is not registered yet
295 {
296 globalHashVec[hash] = key;
297 colnnzC[col] ++;
298 break;
299 }
300 else //key is not found
301 {
302 hash = (hash+1) & (ht_size-1);
303 }
304 }
305 curptr[i]++;
306 }
307 }
308 totnnz += colnnzC[col];
309 if(colnnzC[col] > maxnnzPerCol) maxnnzPerCol = colnnzC[col];
310 }
311 return colnnzC;
312 }
313
314
315
316
// Serially merge a split along the column using a per-column hash table.
// startCol and endCol denote the start and end of the current split.
// colnnz[c]: merged nnz of column (startCol+c), from the symbolic phase.
// maxcolnnz: maximum nnz in a merged column (from symbolic) — sizes the table.
// ntuples: preallocated output buffer; values at equal (row,col) are combined
// with SR::add. If sorted is true, each output column is sorted by row id.
template<class SR, class IT, class NT>
void SerialMergeHash( const std::vector<SpTuples<IT,NT> *> & ArrSpTups, std::tuple<IT, IT, NT> * ntuples, IT* colnnz, IT maxcolnnz, IT startCol, IT endCol, bool sorted)
{
    int nlists = ArrSpTups.size();
    IT ncols = endCol - startCol; // in this split
    IT outptr = 0;
    std::vector<IT> curptr(nlists, static_cast<IT>(0));

    const IT minHashTableSize = 16;
    const IT hashScale = 107;
    // NOTE(review): keys are stored as uint32_t although row indices have type
    // IT. If IT is 64-bit and a row index exceeds 2^32-1 (or equals
    // 0xFFFFFFFF, which collides with the -1 empty-slot sentinel below), the
    // merge could be wrong — confirm row ids always fit in 32 bits here, or
    // restore the IT-keyed variant (see commented line). integerSort<NT> may
    // depend on this exact pair type.
    //std::vector< std::pair<IT,NT>> globalHashVec(std::max(minHashTableSize, maxcolnnz*2));
    std::vector< std::pair<uint32_t,NT>> globalHashVec(std::max(minHashTableSize, maxcolnnz*2));

    for(IT col = 0; col<ncols; col++)
    {
        IT globalCol = col + startCol;
        // Size the table to the next power of two >= the column's merged nnz.
        size_t ht_size = minHashTableSize;
        while(ht_size < colnnz[col]) //ht_size is set as 2^n
        {
            ht_size <<= 1;
        }
        for(size_t j=0; j < ht_size; ++j)
        {
            globalHashVec[j].first = -1; // wraps to 0xFFFFFFFF: empty-slot marker
        }
        // Consume every tuple of this column from every list, accumulating
        // duplicates with the semiring add.
        for(int i=0; i<nlists; i++)
        {
            while((ArrSpTups[i]->getnnz()>curptr[i]) && (ArrSpTups[i]->colindex(curptr[i]) == globalCol))
            {
                IT key = ArrSpTups[i]->rowindex(curptr[i]);
                IT hash = (key*hashScale) & (ht_size-1);

                while (1) //hash probing
                {
                    NT curval = ArrSpTups[i]->numvalue(curptr[i]);
                    if (globalHashVec[hash].first == key) //key is found in hash table
                    {
                        globalHashVec[hash].second = SR::add(curval, globalHashVec[hash].second);
                        break;
                    }
                    else if (globalHashVec[hash].first == -1) //key is not registered yet
                    {
                        globalHashVec[hash].first = key;
                        globalHashVec[hash].second = curval;
                        break;
                    }
                    else //key is not found
                    {
                        hash = (hash+1) & (ht_size-1);
                    }
                }
                curptr[i]++;
            }
        }

        if(sorted)
        {
            // Compact occupied slots to the front, then sort them by row id
            // before emitting the column.
            size_t index = 0;
            for (size_t j=0; j < ht_size; ++j)
            {
                if (globalHashVec[j].first != -1)
                {
                    globalHashVec[index++] = globalHashVec[j];
                }
            }
            integerSort<NT>(globalHashVec.data(), index);
            //std::sort(globalHashVec.begin(), globalHashVec.begin() + index, sort_less<IT, NT>);


            for (size_t j=0; j < index; ++j)
            {
                ntuples[outptr++]= std::make_tuple(globalHashVec[j].first, globalCol, globalHashVec[j].second);
            }
        }
        else
        {
            // Unsorted output: emit in hash-slot order.
            for (size_t j=0; j < ht_size; ++j)
            {
                if (globalHashVec[j].first != -1)
                {
                    ntuples[outptr++]= std::make_tuple(globalHashVec[j].first, globalCol, globalHashVec[j].second);
                }
            }
        }
    }
}
406
407
408
409// Performs a balanced merge of the array of SpTuples
410// Assumes the input parameters are already column sorted
411template<class SR, class IT, class NT>
412SpTuples<IT, NT>* MultiwayMerge( std::vector<SpTuples<IT,NT> *> & ArrSpTups, IT mdim = 0, IT ndim = 0, bool delarrs = false )
413{
414
415 int nlists = ArrSpTups.size();
416 if(nlists == 0)
417 {
418 return new SpTuples<IT,NT>(0, mdim, ndim); //empty mxn SpTuples
419 }
420 if(nlists == 1)
421 {
422 if(delarrs) // steal data from input, and don't delete input
423 {
424 return ArrSpTups[0];
425 }
426 else // copy input to output
427 {
428 // std::tuple<IT, IT, NT>* mergeTups = static_cast<std::tuple<IT, IT, NT>*>
429 // (::operator new (sizeof(std::tuple<IT, IT, NT>[ArrSpTups[0]->getnnz()])));
430
431 std::tuple<IT, IT, NT>* mergeTups = new std::tuple<IT, IT, NT>[ArrSpTups[0]->getnnz()];
432#ifdef THREADED
433#pragma omp parallel for
434#endif
435 for(int i=0; i<ArrSpTups[0]->getnnz(); i++)
436 mergeTups[i] = ArrSpTups[0]->tuples[i];
437
438 return new SpTuples<IT,NT> (ArrSpTups[0]->getnnz(), mdim, ndim, mergeTups, false);
439 }
440 }
441
442 // ---- check correctness of input dimensions ------
443 for(int i=0; i< nlists; ++i)
444 {
445 if((mdim != ArrSpTups[i]->getnrow()) || ndim != ArrSpTups[i]->getncol())
446 {
447 std::cerr << "Dimensions of SpTuples do not match on multiwayMerge()" << std::endl;
448 return new SpTuples<IT,NT>(0,0,0);
449 }
450 }
451
452 int nthreads = 1;
453#ifdef THREADED
454#pragma omp parallel
455 {
456 nthreads = omp_get_num_threads();
457 }
458#endif
459 int nsplits = 4*nthreads; // oversplit for load balance
460 nsplits = std::min(nsplits, (int)ndim); // we cannot split a column
461 std::vector< std::vector<IT> > colPtrs;
462 for(int i=0; i< nlists; i++)
463 {
464 colPtrs.push_back(findColSplitters<IT>(ArrSpTups[i], nsplits)); // in parallel
465 }
466
467 std::vector<IT> mergedNnzPerSplit(nsplits);
468 std::vector<IT> inputNnzPerSplit(nsplits);
469// ------ estimate memory requirement after merge in each split ------
470#ifdef THREADED
471#pragma omp parallel for schedule(dynamic)
472#endif
473 for(int i=0; i< nsplits; i++) // for each part
474 {
475 std::vector<SpTuples<IT,NT> *> listSplitTups(nlists);
476 IT t = static_cast<IT>(0);
477 for(int j=0; j< nlists; ++j)
478 {
479 IT curnnz= colPtrs[j][i+1] - colPtrs[j][i];
480 listSplitTups[j] = new SpTuples<IT, NT> (curnnz, mdim, ndim, ArrSpTups[j]->tuples + colPtrs[j][i], true);
481 t += colPtrs[j][i+1] - colPtrs[j][i];
482 }
483 mergedNnzPerSplit[i] = SerialMergeNNZ(listSplitTups);
484 inputNnzPerSplit[i] = t;
485 }
486
487 std::vector<IT> mdisp(nsplits+1,0);
488 for(int i=0; i<nsplits; ++i)
489 mdisp[i+1] = mdisp[i] + mergedNnzPerSplit[i];
490 IT mergedNnzAll = mdisp[nsplits];
491
492
493#ifdef COMBBLAS_DEBUG
494 IT inputNnzAll = std::accumulate(inputNnzPerSplit.begin(), inputNnzPerSplit.end(), static_cast<IT>(0));
495 double ratio = inputNnzAll / (double) mergedNnzAll;
496 std::ostringstream outs;
497 outs << "Multiwaymerge: inputNnz/mergedNnz = " << ratio << std::endl;
498 SpParHelper::Print(outs.str());
499#endif
500
501
502 // ------ allocate memory outside of the parallel region ------
503 //std::tuple<IT, IT, NT> * mergeBuf = static_cast<std::tuple<IT, IT, NT>*> (::operator new (sizeof(std::tuple<IT, IT, NT>[mergedNnzAll])));
504 std::tuple<IT, IT, NT> * mergeBuf = new std::tuple<IT, IT, NT>[mergedNnzAll];
505 // ------ perform merge in parallel ------
506#ifdef THREADED
507#pragma omp parallel for schedule(dynamic)
508#endif
509 for(int i=0; i< nsplits; i++) // serially merge part by part
510 {
511 std::vector<SpTuples<IT,NT> *> listSplitTups(nlists);
512 for(int j=0; j< nlists; ++j)
513 {
514 IT curnnz= colPtrs[j][i+1] - colPtrs[j][i];
515 listSplitTups[j] = new SpTuples<IT, NT> (curnnz, mdim, ndim, ArrSpTups[j]->tuples + colPtrs[j][i], true);
516 }
517 SerialMerge<SR>(listSplitTups, mergeBuf + mdisp[i]);
518 }
519
520 for(int i=0; i< nlists; i++)
521 {
522 if(delarrs)
523 delete ArrSpTups[i]; // May be expensive for large local matrices
524 }
525 return new SpTuples<IT, NT> (mergedNnzAll, mdim, ndim, mergeBuf, true, false);
526}
527
528
529
530 // --------------------------------------------------------
531 // Hash-based multiway merge
532 // Columns of the input matrices may or may not be sorted
533 // the hash merging algorithm does not need sorted inputs
534 // If sorted=true, columns of the output matrix are sorted
535 // --------------------------------------------------------
536 template<class SR, class IT, class NT>
537 SpTuples<IT, NT>* MultiwayMergeHash( std::vector<SpTuples<IT,NT> *> & ArrSpTups, IT mdim = 0, IT ndim = 0, bool delarrs = false, bool sorted=true )
538 {
539 int nprocs, myrank;
540 MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
541 MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
542
543 int nlists = ArrSpTups.size();
544 if(nlists == 0)
545 {
546 return new SpTuples<IT,NT>(0, mdim, ndim); //empty mxn SpTuples
547 }
548 if(nlists == 1)
549 {
550 if(delarrs) // steal data from input, and don't delete input
551 {
552 return ArrSpTups[0];
553 }
554 else // copy input to output
555 {
556 std::tuple<IT, IT, NT>* mergeTups = static_cast<std::tuple<IT, IT, NT>*>
557 (::operator new (sizeof(std::tuple<IT, IT, NT>[ArrSpTups[0]->getnnz()])));
558#ifdef THREADED
559#pragma omp parallel for
560#endif
561 for(int i=0; i<ArrSpTups[0]->getnnz(); i++)
562 mergeTups[i] = ArrSpTups[0]->tuples[i];
563
564 // Caution: ArrSpTups[0] can be either sorted or unsorted
565 // By setting sorted=true, we prevented sorting in the SpTuples constructor
566 // TODO: we better keep a isSorted flag in SpTuples (also in DCSC/CSC)
567 return new SpTuples<IT,NT> (ArrSpTups[0]->getnnz(), mdim, ndim, mergeTups, true, true);
568 }
569 }
570
571 // ---- check correctness of input dimensions ------
572 for(int i=0; i< nlists; ++i)
573 {
574 if((mdim != ArrSpTups[i]->getnrow()) || ndim != ArrSpTups[i]->getncol())
575 {
576 std::cerr << "Dimensions of SpTuples do not match on multiwayMerge()" << std::endl;
577 return new SpTuples<IT,NT>(0,0,0);
578 }
579 }
580
581 int nthreads = 1;
582#ifdef THREADED
583#pragma omp parallel
584 {
585 nthreads = omp_get_num_threads();
586 }
587#endif
588 int nsplits = 4*nthreads; // oversplit for load balance
589 nsplits = std::min(nsplits, (int)ndim); // we cannot split a column
590 std::vector< std::vector<IT> > colPtrs(nlists);
591
592
593#ifdef THREADED
594#pragma omp parallel for
595#endif
596 for(int j=0; j< nlists; j++)
597 {
598 colPtrs[j]=findColSplittersFinger<IT>(ArrSpTups[j], nsplits);
599 }
600
601
602 // listSplitTups is just a temporary vector to facilitate serial merging
603 // It does not allocate or move any input tuples
604 // Hence, sorted and opnew options do not matter when creating SpTuples
605 // Ideally we can directly work with std::tuples
606 std::vector<std::vector<SpTuples<IT,NT> *>> listSplitTups(nsplits);
607
608 for(int i=0; i< nsplits; ++i) // for each part
609 {
610 listSplitTups[i].resize(nlists);
611
612 for(int j=0; j< nlists; ++j)
613 {
614 IT curnnz= colPtrs[j][i+1] - colPtrs[j][i];
615 listSplitTups[i][j] = new SpTuples<IT, NT> (curnnz, mdim, ndim, ArrSpTups[j]->tuples + colPtrs[j][i], true);
616 }
617
618 }
619
620 std::vector<IT> mergedNnzPerSplit(nsplits);
621 std::vector<IT> mergedNnzPerSplit1(nsplits);
622 std::vector<IT> maxNnzPerColumnSplit(nsplits);
623 std::vector<IT*> nnzPerColSplit(nsplits);
624
625 // ------ estimate memory requirement after merge in each split ------
626#ifdef THREADED
627#pragma omp parallel for schedule(dynamic)
628#endif
629 for(int i=0; i< nsplits; i++) // for each part
630 {
631 IT startCol = i* (ndim/nsplits);
632 IT endCol = (i+1)* (ndim/nsplits);
633 if(i == (nsplits-1)) endCol = ndim;
634
635 nnzPerColSplit[i] = SerialMergeNNZHash(listSplitTups[i], mergedNnzPerSplit[i], maxNnzPerColumnSplit[i], startCol, endCol);
636 }
637
638 std::vector<IT> mdisp(nsplits+1,0);
639 for(int i=0; i<nsplits; ++i)
640 mdisp[i+1] = mdisp[i] + mergedNnzPerSplit[i];
641 IT mergedNnzAll = mdisp[nsplits];
642
643 // ------ allocate memory outside of the parallel region ------
644 std::tuple<IT, IT, NT> * mergeBuf = static_cast<std::tuple<IT, IT, NT>*> (::operator new (sizeof(std::tuple<IT, IT, NT>[mergedNnzAll])));
645 //std::tuple<IT, IT, NT> * mergeBuf = new std::tuple<IT, IT, NT>[mergedNnzAll];
646
647
648
649 // ------ perform merge in parallel ------
650#ifdef THREADED
651#pragma omp parallel for schedule(dynamic)
652#endif
653 for(int i=0; i< nsplits; i++) // serially merge part by part
654 {
655 //SerialMerge<SR>(listSplitTups, mergeBuf + mdisp[i]);
656 IT startCol = i* (ndim/nsplits);
657 IT endCol = (i+1)* (ndim/nsplits);
658 if(i == (nsplits-1)) endCol = ndim;
659 SerialMergeHash<SR>(listSplitTups[i], mergeBuf + mdisp[i], nnzPerColSplit[i], maxNnzPerColumnSplit[i], startCol, endCol, sorted);
660 // last parameter is for sorted
661 }
662
663 // Delete and free a lot of dynamic allocations
664 for(int i=0; i< nsplits; ++i) // for each part
665 {
666 delete nnzPerColSplit[i];
667 for(int j=0; j< nlists; ++j)
668 {
669 listSplitTups[i][j]->tuples_deleted = true;
670 delete listSplitTups[i][j];
671 }
672
673 }
674 for(int i=0; i< nlists; i++)
675 {
676 if(delarrs)
677 delete ArrSpTups[i]; // May be expensive for large local matrices
678 }
679
680 // Caution: We allow both sorted and unsorted tuples in SpTuples
681 // By setting sorted=true, we prevented sorting in the SpTuples constructor
682 // TODO: we better keep a isSorted flag in SpTuples (also in DCSC/CSC)
683 return new SpTuples<IT, NT> (mergedNnzAll, mdim, ndim, mergeBuf, true, true);
684 }
685
686 // --------------------------------------------------------
687 // Hash-based multiway merge
688 // Columns of the input matrices may or may not be sorted
689 // the hash merging algorithm does not need sorted inputs
690 // If sorted=true, columns of the output matrix are sorted
691 // --------------------------------------------------------
692 template<class SR, class IT, class NT>
693 SpTuples<IT, NT>* MultiwayMergeHashSliding( std::vector<SpTuples<IT,NT> *> & ArrSpTups, IT mdim = 0, IT ndim = 0, bool delarrs = false, bool sorted=true, IT maxHashTableSize = 16384)
694 {
695 int nprocs, myrank;
696 MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
697 MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
698
699 int nlists = ArrSpTups.size();
700 if(nlists == 0)
701 {
702 return new SpTuples<IT,NT>(0, mdim, ndim); //empty mxn SpTuples
703 }
704 if(nlists == 1)
705 {
706 if(delarrs) // steal data from input, and don't delete input
707 {
708 return ArrSpTups[0];
709 }
710 else // copy input to output
711 {
712 std::tuple<IT, IT, NT>* mergeTups = static_cast<std::tuple<IT, IT, NT>*>
713 (::operator new (sizeof(std::tuple<IT, IT, NT>[ArrSpTups[0]->getnnz()])));
714#ifdef THREADED
715#pragma omp parallel for
716#endif
717 for(int i=0; i<ArrSpTups[0]->getnnz(); i++)
718 mergeTups[i] = ArrSpTups[0]->tuples[i];
719
720 // Caution: ArrSpTups[0] can be either sorted or unsorted
721 // By setting sorted=true, we prevented sorting in the SpTuples constructor
722 // TODO: we better keep a isSorted flag in SpTuples (also in DCSC/CSC)
723 return new SpTuples<IT,NT> (ArrSpTups[0]->getnnz(), mdim, ndim, mergeTups, true, true);
724 }
725 }
726
727 // ---- check correctness of input dimensions ------
728 for(int i=0; i< nlists; ++i)
729 {
730 if((mdim != ArrSpTups[i]->getnrow()) || ndim != ArrSpTups[i]->getncol())
731 {
732 std::cerr << "Dimensions of SpTuples do not match on MultiwayMergeHashSliding()" << std::endl;
733 //std::cerr << mdim << " vs " << ArrSpTups[i]->getnrow() << " | " << ndim << " vs " << ArrSpTups[i]->getncol() << std::endl;
734 return new SpTuples<IT,NT>(0,0,0);
735 }
736 }
737
738 int nthreads = 1;
739#ifdef THREADED
740#pragma omp parallel
741 {
742 nthreads = omp_get_num_threads();
743 }
744#endif
745 int nsplits = 4*nthreads; // oversplit for load balance
746 nsplits = std::min(nsplits, (int)ndim); // we cannot split a column
747
748 const IT minHashTableSize = 16;
749 //const IT maxHashTableSize = 8 * 1024; // Moved to parameter
750 const IT hashScale = 107;
751
752 /*
753 * To store column pointers of CSC like data structures
754 * */
755 IT** colPtrs = static_cast<IT**> (::operator new (sizeof(IT*[nlists])));
756 for(int l = 0; l < nlists; l++){
757 colPtrs[l] = static_cast<IT*> (::operator new (sizeof(IT[ndim+1])));
758 }
759 ColLexiCompare<IT,NT> colCmp;
760 RowLexiCompare<IT,NT> rowCmp;
761
762#ifdef THREADED
763#pragma omp parallel for
764#endif
765 for(int s = 0; s < nsplits; s++){
766 IT startColSplit = s * (ndim/nsplits);
767 IT endColSplit = (s == (nsplits-1) ) ? ndim : (s+1) * (ndim/nsplits);
768 for(int l = 0; l < nlists; l++){
769 std::tuple<IT, IT, NT> firstTuple(0, startColSplit, NT());
770 std::tuple<IT, IT, NT>* first = std::lower_bound(ArrSpTups[l]->tuples, ArrSpTups[l]->tuples+ArrSpTups[l]->getnnz(), firstTuple, colCmp);
771 std::tuple<IT, IT, NT> lastTuple(0, endColSplit, NT());
772 std::tuple<IT, IT, NT>* last = std::lower_bound(ArrSpTups[l]->tuples, ArrSpTups[l]->tuples+ArrSpTups[l]->getnnz(), lastTuple, colCmp);
773 for(IT c = startColSplit; c < endColSplit; c++){
774 if(c == 0) colPtrs[l][c] = 0;
775 else{
776 std::tuple<IT, IT, NT> searchTuple(0, c, NT());
777 std::tuple<IT, IT, NT>* pos = std::lower_bound(first, last, searchTuple, colCmp);
778 colPtrs[l][c] = pos - ArrSpTups[l]->tuples;
779 }
780 }
781 if(s == nsplits-1) colPtrs[l][ndim] = ArrSpTups[l]->getnnz();
782 }
783 }
784
785 size_t* flopsPerCol = static_cast<size_t*> (::operator new (sizeof(size_t[ndim])));
786 IT* nWindowPerColSymbolic = static_cast<IT*> (::operator new (sizeof(IT[ndim])));
787#ifdef THREADED
788#pragma omp parallel for
789#endif
790 for(IT c = 0; c < ndim; c++){
791 flopsPerCol[c] = 0;
792 for(int l = 0; l < nlists; l++){
793 flopsPerCol[c] += colPtrs[l][c+1] - colPtrs[l][c];
794 }
795 nWindowPerColSymbolic[c] = flopsPerCol[c] / maxHashTableSize + 1;
796 }
797
798 size_t* prefixSumFlopsPerCol = prefixSum<size_t>(flopsPerCol, ndim, nthreads);
799 size_t totalFlops = prefixSumFlopsPerCol[ndim];
800 size_t flopsPerSplit = totalFlops / nsplits;
801 IT* colSplitters = static_cast<size_t*> (::operator new (sizeof(size_t[nsplits+1])));
802
803 /*
804 * For symbolic, split column between threads in such a way so that total flops is
805 * balanced between threads
806 * */
807#ifdef THREADED
808#pragma omp parallel for
809#endif
810 for(int s = 0; s < nsplits; s++){
811 size_t searchItem = s * flopsPerSplit;
812 size_t* searchResult = std::lower_bound(prefixSumFlopsPerCol, prefixSumFlopsPerCol + ndim + 1, searchItem);
813 colSplitters[s] = searchResult - prefixSumFlopsPerCol;
814 }
815 colSplitters[nsplits] = ndim;
816
817 /*
818 * Calculate prefix sum of number of windows needed per column.
819 * This information will be used to determine the index in the windowsSymbolic array
820 * */
821 IT* prefixSumWindowSymbolic = prefixSum<IT>(nWindowPerColSymbolic, ndim, nthreads);
822
823 std::pair<IT, IT>* windowsSymbolic = static_cast<std::pair<IT, IT>*> (::operator new (sizeof(std::pair<IT, IT>[prefixSumWindowSymbolic[ndim]])));
824 IT* nnzPerCol = static_cast<IT*> (::operator new (sizeof(IT[ndim])));
825 IT* nWindowPerCol = static_cast<IT*> (::operator new (sizeof(IT[ndim])));
826
827 /*
828 * To keep track of rows being processed in each matrix over sliding window
829 * */
830 std::pair<IT, IT>** rowIdsRange = static_cast<std::pair<IT, IT>**> (::operator new (sizeof(std::pair<IT, IT>*[nsplits])));
831 for(int s = 0; s < nsplits; s++){
832 rowIdsRange[s] = static_cast<std::pair<IT, IT>*> (::operator new (sizeof(std::pair<IT, IT>[nlists])));
833 }
834
835#ifdef THREADED
836#pragma omp parallel
837#endif
838 {
839 std::vector<NT> globalHashVec(minHashTableSize);
840 size_t tid = omp_get_thread_num();
841#ifdef THREADED
842#pragma omp for schedule(dynamic)
843#endif
844 for(int s = 0; s < nsplits; s++){
845 IT startCol = colSplitters[s];
846 IT endCol = colSplitters[s+1];
847
848 for(IT c = startCol; c < endCol; c++){
849 nnzPerCol[c] = 0;
850 nWindowPerCol[c] = 1;
851 if(nWindowPerColSymbolic[c] == 1){
852 IT startRow = 0;
853 IT endRow = mdim;
854 size_t wsIdx = prefixSumWindowSymbolic[c];
855
856 windowsSymbolic[wsIdx].first = 0; // Stores start row id of the window
857 windowsSymbolic[wsIdx].second = 0; // Stores number of merged nonzero in the window
858
859 size_t htSize = minHashTableSize;
860 while(htSize < flopsPerCol[c]) {
861 //ht_size is set as 2^n
862 htSize <<= 1;
863 }
864 if(globalHashVec.size() < htSize) globalHashVec.resize(htSize);
865 for(size_t j=0; j < htSize; ++j) {
866 globalHashVec[j] = -1;
867 }
868
869 for(int l = 0; l < nlists; l++){
870 for(IT i = colPtrs[l][c]; i < colPtrs[l][c+1]; i++){
871 IT key = ArrSpTups[l]->rowindex(i);
872 IT hash = (key*hashScale) & (htSize-1);
873
874 while (1) {
875 //hash probing
876 if (globalHashVec[hash] == key) {
877 //key is found in hash table
878 break;
879 }
880 else if (globalHashVec[hash] == -1) {
881 //key is not registered yet
882 globalHashVec[hash] = key; // Register the key
883 nnzPerCol[c]++; // New nz in the column after merge
884 windowsSymbolic[wsIdx].second++; // New nz in the window after merge
885 break;
886 }
887 else {
888 //key is not found
889 hash = (hash+1) & (htSize-1);
890 }
891 }
892 }
893 }
894 }
895 else{
896 IT nrowsPerWindow = mdim / nWindowPerColSymbolic[c];
897 IT runningSum = 0;
898 for(size_t w = 0; w < nWindowPerColSymbolic[c]; w++){
899 IT rowStart = w * nrowsPerWindow;
900 IT rowEnd = (w == nWindowPerColSymbolic[c]-1) ? mdim : (w+1) * nrowsPerWindow;
901 size_t wsIdx = prefixSumWindowSymbolic[c] + w;
902
903 windowsSymbolic[wsIdx].first = rowStart;
904 windowsSymbolic[wsIdx].second = 0;
905
906 size_t flopsWindow = 0;
907 for(int l = 0; l < nlists; l++){
908 std::tuple<IT, IT, NT>* first = ArrSpTups[l]->tuples + colPtrs[l][c];
909 std::tuple<IT, IT, NT>* last = ArrSpTups[l]->tuples + colPtrs[l][c+1];
910
911 if(rowStart > 0){
912 std::tuple<IT, IT, NT> searchTuple(rowStart, 0, NT());
913 first = std::lower_bound(first, last, searchTuple, rowCmp);
914 }
915
916 if(rowEnd < mdim){
917 std::tuple<IT, IT, NT> searchTuple(rowEnd, 0, NT());
918 last = std::lower_bound(first, last, searchTuple, rowCmp);
919 }
920
921 rowIdsRange[s][l].first = first - (ArrSpTups[l]->tuples);
922 rowIdsRange[s][l].second = last - (ArrSpTups[l]->tuples);
923
924 flopsWindow += last - first;
925 }
926 size_t htSize = minHashTableSize;
927 while(htSize < flopsWindow) {
928 //ht_size is set as 2^n
929 htSize <<= 1;
930 }
931 if(globalHashVec.size() < htSize) globalHashVec.resize(htSize);
932 for(size_t j=0; j < htSize; ++j) {
933 globalHashVec[j] = -1;
934 }
935 for(int l = 0; l < nlists; l++){
936 for(IT i = rowIdsRange[s][l].first; i < rowIdsRange[s][l].second; i++){
937 IT key = ArrSpTups[l]->rowindex(i);
938 IT hash = (key * hashScale) & (htSize-1);
939 while (1) {
940 //hash probing
941 if (globalHashVec[hash] == key) {
942 //key is found in hash table
943 break;
944 }
945 else if (globalHashVec[hash] == -1) {
946 //key is not registered yet
947 globalHashVec[hash] = key;
948 nnzPerCol[c]++;
949 windowsSymbolic[wsIdx].second++;
950 break;
951 }
952 else {
953 //key is not found
954 hash = (hash+1) & (htSize-1);
955 }
956 }
957 }
958 }
959
960 if(w == 0){
961 runningSum = windowsSymbolic[wsIdx].second;
962 }
963 else{
964 if(runningSum + windowsSymbolic[wsIdx].second > maxHashTableSize) {
965 nWindowPerCol[c]++;
966 runningSum = windowsSymbolic[wsIdx].second;
967 }
968 else{
969 runningSum += windowsSymbolic[wsIdx].second;
970 }
971 }
972 }
973 }
974 }
975 }
976 }
977
978 /*
979 * Now collapse symbolic windows to get windows of actual computation
980 * */
981 IT* prefixSumWindow = prefixSum<IT>(nWindowPerCol, ndim, nthreads);
982 std::pair<IT, IT>* windows = static_cast<std::pair<IT, IT>*> (::operator new (sizeof(std::pair<IT, IT>[prefixSumWindow[ndim]])));
983
984#ifdef THREADED
985#pragma omp parallel for schedule(dynamic)
986#endif
987 for(int s = 0; s < nsplits; s++){
988 IT colStart = colSplitters[s];
989 IT colEnd = colSplitters[s+1];
990 for(IT c = colStart; c < colEnd; c++){
991 IT nWindowSymbolic = nWindowPerColSymbolic[c];
992 IT wsIdx = prefixSumWindowSymbolic[c];
993 IT wcIdx = prefixSumWindow[c];
994 windows[wcIdx].first = windowsSymbolic[wsIdx].first;
995 windows[wcIdx].second = windowsSymbolic[wsIdx].second;
996 // w = 0 is already taken care of. So start from w = 1
997 for(IT w = 1; w < nWindowSymbolic; w++){
998 wsIdx = prefixSumWindowSymbolic[c] + w;
999 if(windows[wcIdx].second + windowsSymbolic[wsIdx].second > maxHashTableSize){
1000 wcIdx++;
1001 windows[wcIdx].first = windowsSymbolic[wsIdx].first;
1002 windows[wcIdx].second = windowsSymbolic[wsIdx].second;
1003 }
1004 else{
1005 windows[wcIdx].second = windows[wcIdx].second + windowsSymbolic[wsIdx].second;
1006 }
1007 }
1008 }
1009 }
1010
1011 IT* prefixSumNnzPerCol = prefixSum<IT>(nnzPerCol, ndim, nthreads);
1012 IT totalNnz = prefixSumNnzPerCol[ndim];
1013 std::tuple<IT, IT, NT> * mergeBuf = static_cast<std::tuple<IT, IT, NT>*> (::operator new (sizeof(std::tuple<IT, IT, NT>[totalNnz])));
1014
1015#ifdef THREADED
1016#pragma omp parallel
1017#endif
1018 {
1019 std::vector< std::pair<uint32_t,NT> > globalHashVec(minHashTableSize);
1020 size_t tid = omp_get_thread_num();
1021#ifdef THREADED
1022#pragma omp for schedule(dynamic)
1023#endif
1024 for(int s = 0; s < nsplits; s++){
1025 IT startCol = colSplitters[s];
1026 IT endCol = colSplitters[s+1];
1027 for(IT c = startCol; c < endCol; c++){
1028 IT nWindow = nWindowPerCol[c];
1029 IT outptr = prefixSumNnzPerCol[c];
1030 if(nWindow == 1){
1031 IT wcIdx = prefixSumWindow[c];
1032 IT nnzWindow = windows[wcIdx].second;
1033
1034 size_t htSize = minHashTableSize;
1035 while(htSize < nnzWindow)
1036 {
1037 //htSize is set as 2^n
1038 htSize <<= 1;
1039 }
1040 if(globalHashVec.size() < htSize) globalHashVec.resize(htSize);
1041 for(size_t j=0; j < htSize; ++j)
1042 {
1043 globalHashVec[j].first = -1;
1044 }
1045
1046 for(int l = 0; l < nlists; l++){
1047 for(IT i = colPtrs[l][c]; i < colPtrs[l][c+1]; i++){
1048 IT key = ArrSpTups[l]->rowindex(i);
1049 IT hash = (key * hashScale) & (htSize-1);
1050 while (1) {
1051 //hash probing
1052 if (globalHashVec[hash].first == key) {
1053 //key is found in hash table
1054 // Add to the previous value stored in the position
1055 globalHashVec[hash].second += ArrSpTups[l]->numvalue(i);
1056 break;
1057 }
1058 else if (globalHashVec[hash].first == -1) {
1059 //key is not registered yet
1060 // Register the key and store the value
1061 globalHashVec[hash].first = key;
1062 globalHashVec[hash].second = ArrSpTups[l]->numvalue(i);
1063 break;
1064 }
1065 else {
1066 //key is not found
1067 hash = (hash+1) & (htSize-1);
1068 }
1069 }
1070 }
1071 }
1072 if(sorted){
1073 size_t index = 0;
1074 for (size_t j=0; j < htSize; j++){
1075 if (globalHashVec[j].first != -1){
1076 globalHashVec[index] = globalHashVec[j];
1077 index++;
1078 }
1079 }
1080 integerSort<NT>(globalHashVec.data(), index);
1081 //std::sort(globalHashVec.begin(), globalHashVec.begin() + index, sort_less<IT, NT>);
1082 for(size_t j = 0; j < index; j++){
1083 mergeBuf[outptr] = std::tuple<IT, IT, NT>(globalHashVec[j].first, c, globalHashVec[j].second);
1084 outptr++;
1085 }
1086 }
1087 else{
1088 for (size_t j=0; j < htSize; j++){
1089 if (globalHashVec[j].first != -1){
1090 mergeBuf[outptr] = std::tuple<IT, IT, NT>(globalHashVec[j].first, c, globalHashVec[j].second);
1091 outptr++;
1092 }
1093 }
1094 }
1095 }
1096 else{
1097 for (int l = 0; l < nlists; l++){
1098 rowIdsRange[s][l].first = colPtrs[l][c];
1099 rowIdsRange[s][l].second = colPtrs[l][c+1];
1100 }
1101
1102 for (size_t w = 0; w < nWindow; w++){
1103 IT wcIdx = prefixSumWindow[c] + w;
1104 IT startRow = windows[wcIdx].first;
1105 IT endRow = (w == nWindow-1) ? mdim : windows[wcIdx+1].first;
1106 IT nnzWindow = windows[wcIdx].second;
1107
1108 size_t htSize = minHashTableSize;
1109 while(htSize < nnzWindow) htSize <<= 1;
1110 if(globalHashVec.size() < htSize) globalHashVec.resize(htSize);
1111 for(size_t j = 0; j < htSize; j++) globalHashVec[j].first = -1;
1112
1113 for(int l = 0; l < nlists; l++){
1114 while( rowIdsRange[s][l].first < rowIdsRange[s][l].second ){
1115 IT i = rowIdsRange[s][l].first;
1116 IT key = ArrSpTups[l]->rowindex(i);
1117 if(key >= endRow) break;
1118 IT hash = (key * hashScale) & (htSize-1);
1119 while (1) {
1120 if (globalHashVec[hash].first == key) {
1121 //key is found in hash table
1122 // Add to the previous value stored in the position
1123 globalHashVec[hash].second += ArrSpTups[l]->numvalue(i);
1124 break;
1125 }
1126 else if (globalHashVec[hash].first == -1) {
1127 //key is not registered yet
1128 // Register the key and store the value
1129 globalHashVec[hash].first = key;
1130 globalHashVec[hash].second = ArrSpTups[l]->numvalue(i);
1131 break;
1132 }
1133 else {
1134 //key is not found
1135 hash = (hash+1) & (htSize-1);
1136 }
1137 }
1138 rowIdsRange[s][l].first++;
1139 }
1140 }
1141 if(sorted){
1142 size_t index = 0;
1143 for (size_t j=0; j < htSize; j++){
1144 if (globalHashVec[j].first != -1){
1145 globalHashVec[index++] = globalHashVec[j];
1146 }
1147 }
1148 integerSort<NT>(globalHashVec.data(), index);
1149 //std::sort(globalHashVec.begin(), globalHashVec.begin() + index, sort_less<IT, NT>);
1150 for(size_t j = 0; j < index; j++){
1151 mergeBuf[outptr] = std::tuple<IT, IT, NT>(globalHashVec[j].first, c, globalHashVec[j].second);
1152 outptr++;
1153 }
1154 }
1155 else{
1156 for (size_t j=0; j < htSize; j++){
1157 if (globalHashVec[j].first != -1){
1158 mergeBuf[outptr] = std::tuple<IT, IT, NT>(globalHashVec[j].first, c, globalHashVec[j].second);
1159 outptr++;
1160 }
1161 }
1162 }
1163 }
1164 }
1165 }
1166 }
1167 }
1168
1169 // Delete all arrays allocated by the prefixSum function
1170 delete [] prefixSumFlopsPerCol;
1171 delete [] prefixSumNnzPerCol;
1172 delete [] prefixSumWindowSymbolic;
1173 delete [] prefixSumWindow;
1174
1175 // Delete the rest with operator delete, as all this memory was allocated with operator new
1176 ::operator delete(colSplitters);
1177 for(int s = 0; s < nsplits; s++) ::operator delete(rowIdsRange[s]);
1178 ::operator delete(rowIdsRange);
1179
1180 ::operator delete(nWindowPerColSymbolic);
1181 ::operator delete(windowsSymbolic);
1182 ::operator delete(nWindowPerCol);
1183 ::operator delete(windows);
1184
1185 ::operator delete(flopsPerCol);
1186 ::operator delete(nnzPerCol);
1187
1188 for(int l = 0; l < nlists; l++) ::operator delete(colPtrs[l]);
1189 ::operator delete(colPtrs);
1190
1191 for(int i=0; i< nlists; i++)
1192 {
1193 if(delarrs)
1194 delete ArrSpTups[i]; // May be expensive for large local matrices
1195 }
1196
1197 // Caution: We allow both sorted and unsorted tuples in SpTuples
1198 // By setting sorted=true, we prevented sorting in the SpTuples constructor
1199 // TODO: we better keep a isSorted flag in SpTuples (also in DCSC/CSC)
1200 return new SpTuples<IT, NT> (totalNnz, mdim, ndim, mergeBuf, true, true);
1201 }
1202
1203}
1204
1205#endif
int64_t IT
double NT
static void Print(const std::string &s)
int size
Definition common.h:20
int nprocs
Definition comms.cpp:55
void SerialMerge(const std::vector< SpTuples< IT, NT > * > &ArrSpTups, std::tuple< IT, IT, NT > *ntuples)
IT SerialMergeNNZ(const std::vector< SpTuples< IT, NT > * > &ArrSpTups)
void SerialMergeHash(const std::vector< SpTuples< IT, NT > * > &ArrSpTups, std::tuple< IT, IT, NT > *ntuples, IT *colnnz, IT maxcolnnz, IT startCol, IT endCol, bool sorted)
T * prefixSum(T *in, int size, int nthreads)
SpTuples< IT, NT > * MultiwayMerge(std::vector< SpTuples< IT, NT > * > &ArrSpTups, IT mdim=0, IT ndim=0, bool delarrs=false)
SpTuples< IT, NT > * MultiwayMergeHash(std::vector< SpTuples< IT, NT > * > &ArrSpTups, IT mdim=0, IT ndim=0, bool delarrs=false, bool sorted=true)
IT * SerialMergeNNZHash(const std::vector< SpTuples< IT, NT > * > &ArrSpTups, IT &totnnz, IT &maxnnzPerCol, IT startCol, IT endCol)
std::vector< RT > findColSplitters(SpTuples< IT, NT > *&spTuples, int nsplits)
SpTuples< IT, NT > * MultiwayMergeHashSliding(std::vector< SpTuples< IT, NT > * > &ArrSpTups, IT mdim=0, IT ndim=0, bool delarrs=false, bool sorted=true, IT maxHashTableSize=16384)
std::vector< RT > findColSplittersFinger(SpTuples< IT, NT > *&spTuples, int nsplits)
unsigned int hash(unsigned int a)
Definition utils.h:79