VTK-m  2.2
ParallelRadixSort.h
Go to the documentation of this file.
1 //============================================================================
2 // Copyright (c) Kitware, Inc.
3 // All rights reserved.
4 // See LICENSE.txt for details.
5 //
6 // This software is distributed WITHOUT ANY WARRANTY; without even
7 // the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8 // PURPOSE. See the above copyright notice for more information.
9 //============================================================================
10 
11 // Copyright 2010, Takuya Akiba
12 // All rights reserved.
13 //
14 // Redistribution and use in source and binary forms, with or without
15 // modification, are permitted provided that the following conditions are
16 // met:
17 //
18 // * Redistributions of source code must retain the above copyright
19 // notice, this list of conditions and the following disclaimer.
20 // * Redistributions in binary form must reproduce the above
21 // copyright notice, this list of conditions and the following disclaimer
22 // in the documentation and/or other materials provided with the
23 // distribution.
24 // * Neither the name of Takuya Akiba nor the names of its
25 // contributors may be used to endorse or promote products derived from
26 // this software without specific prior written permission.
27 //
28 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
29 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
31 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
32 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
33 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
34 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
35 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
36 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
37 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
38 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39 
40 // Modifications of Takuya Akiba's original GitHub source code for inclusion
41 // in VTK-m:
42 //
43 // - Made parallelization library details generic (see "Threading Interface" below).
44 // - Added minimum threshold for parallel, will instead invoke serial radix sort (kxsort)
45 // - Added std::greater<T> and std::less<T> to interface for descending order sorts
46 // - Added linear scaling of threads used by the algorithm for more stable performance
47 // on machines with lots of available threads (KNL and Haswell)
48 //
49 // This file contains an implementation of Satish parallel radix sort
50 // as documented in the following citation:
51 //
52 // Fast sort on CPUs and GPUs: a case for bandwidth oblivious SIMD sort.
53 // N. Satish, C. Kim, J. Chhugani, A. D. Nguyen, V. W. Lee, D. Kim, and P. Dubey.
54 // In Proc. SIGMOD, pages 351–362, 2010
55 //
56 // Threading Interface:
57 //
58 // To use this implementation, an object containing the following members should
59 // be passed as the 'threader' argument at the entry points:
60 //
61 // struct ThreaderExample
62 // {
63 // // Return the number of cores available:
64 // size_t GetAvailableCores();
65 //
66 // // Run the supplied functor in a new thread. This task is likely to
67 // // generate children (through RunChildTasks), and must block until all
68 // // children complete.
69 // template <typename TaskType>
70 // void RunParentTask(TaskType task);
71 //
72 // // Run an child task in a new thread. The function may be blocking or
73 // // non-blocking, and 'data' is an abitrary object passed to the parent
74 // // task's operator() (See the TBB implementation for details).
75 // template <typename TaskType, typename ParentTaskThreadData>
76 // void RunChildTasks(ParentTaskThreadData data, TaskType left, TaskType right);
77 // };
78 //
79 // See the sample implementations and the RunTask struct below for examples.
80 
81 #ifndef vtk_m_cont_internal_ParallelRadixSort_h
82 #define vtk_m_cont_internal_ParallelRadixSort_h
83 
85 
86 #include <algorithm>
87 #include <cassert>
88 #include <climits>
89 #include <cmath>
90 #include <cstring>
91 #include <functional>
92 #include <stdint.h>
93 #include <utility>
94 
95 #include <vtkm/Types.h>
96 #include <vtkm/cont/Logging.h>
97 
99 
101 
102 namespace vtkm
103 {
104 namespace cont
105 {
106 namespace internal
107 {
108 namespace radix
109 {
110 
111 namespace utility
112 {
113 // Return the number of threads that would be executed in parallel regions
114 inline size_t GetMaxThreads(size_t num_bytes, size_t available_cores)
115 {
116  const double CORES_PER_BYTE =
117  double(available_cores - 1) / double(BYTES_FOR_MAX_PARALLELISM - MIN_BYTES_FOR_PARALLEL);
118  const double Y_INTERCEPT = 1.0 - CORES_PER_BYTE * MIN_BYTES_FOR_PARALLEL;
119 
120  const size_t num_cores = (size_t)(CORES_PER_BYTE * double(num_bytes) + Y_INTERCEPT);
121  if (num_cores < 1)
122  {
123  return 1;
124  }
125  if (num_cores > available_cores)
126  {
127  return available_cores;
128  }
129  return num_cores;
130 }
131 } // namespace utility
132 
133 namespace internal
134 {
135 // Size of the software managed buffer
136 const size_t kOutBufferSize = 32;
137 
138 // Ascending order radix sort is a no-op
139 template <typename PlainType,
140  typename UnsignedType,
141  typename CompareType,
142  typename ValueManager,
143  unsigned int Base>
144 struct ParallelRadixCompareInternal
145 {
146  inline static void reverse(UnsignedType& t) { (void)t; }
147 };
148 
149 // Handle descending order radix sort
150 template <typename PlainType, typename UnsignedType, typename ValueManager, unsigned int Base>
151 struct ParallelRadixCompareInternal<PlainType,
152  UnsignedType,
153  std::greater<PlainType>,
154  ValueManager,
155  Base>
156 {
157  inline static void reverse(UnsignedType& t) { t = ((1 << Base) - 1) - t; }
158 };
159 
160 // The algorithm is implemented in this internal class
161 template <typename PlainType,
162  typename CompareType,
163  typename UnsignedType,
164  typename Encoder,
165  typename ValueManager,
166  typename ThreaderType,
167  unsigned int Base>
168 class ParallelRadixSortInternal
169 {
170 public:
171  using CompareInternal =
172  ParallelRadixCompareInternal<PlainType, UnsignedType, CompareType, ValueManager, Base>;
173 
174  ParallelRadixSortInternal();
175  ~ParallelRadixSortInternal();
176 
177  void Init(PlainType* data, size_t num_elems, const ThreaderType& threader);
178 
179  PlainType* Sort(PlainType* data, ValueManager* value_manager);
180 
181  static void InitAndSort(PlainType* data,
182  size_t num_elems,
183  const ThreaderType& threader,
184  ValueManager* value_manager);
185 
186 private:
187  CompareInternal compare_internal_;
188  size_t num_elems_;
189  size_t num_threads_;
190 
191  UnsignedType* tmp_;
192  size_t** histo_;
193  UnsignedType*** out_buf_;
194  size_t** out_buf_n_;
195 
196  size_t *pos_bgn_, *pos_end_;
197  ValueManager* value_manager_;
198  ThreaderType threader_;
199 
200  void DeleteAll();
201 
202  UnsignedType* SortInternal(UnsignedType* data, ValueManager* value_manager);
203 
204  // Compute |pos_bgn_| and |pos_end_| (associated ranges for each threads)
205  void ComputeRanges();
206 
207  // First step of each iteration of sorting
208  // Compute the histogram of |src| using bits in [b, b + Base)
209  void ComputeHistogram(unsigned int b, UnsignedType* src);
210 
211  // Second step of each iteration of sorting
212  // Scatter elements of |src| to |dst| using the histogram
213  void Scatter(unsigned int b, UnsignedType* src, UnsignedType* dst);
214 };
215 
216 template <typename PlainType,
217  typename CompareType,
218  typename UnsignedType,
219  typename Encoder,
220  typename ValueManager,
221  typename ThreaderType,
222  unsigned int Base>
223 ParallelRadixSortInternal<PlainType,
224  CompareType,
225  UnsignedType,
226  Encoder,
227  ValueManager,
228  ThreaderType,
229  Base>::ParallelRadixSortInternal()
230  : num_elems_(0)
231  , num_threads_(0)
232  , tmp_(NULL)
233  , histo_(NULL)
234  , out_buf_(NULL)
235  , out_buf_n_(NULL)
236  , pos_bgn_(NULL)
237  , pos_end_(NULL)
238 {
239  assert(sizeof(PlainType) == sizeof(UnsignedType));
240 }
241 
242 template <typename PlainType,
243  typename CompareType,
244  typename UnsignedType,
245  typename Encoder,
246  typename ValueManager,
247  typename ThreaderType,
248  unsigned int Base>
249 ParallelRadixSortInternal<PlainType,
250  CompareType,
251  UnsignedType,
252  Encoder,
253  ValueManager,
254  ThreaderType,
255  Base>::~ParallelRadixSortInternal()
256 {
257  DeleteAll();
258 }
259 
260 template <typename PlainType,
261  typename CompareType,
262  typename UnsignedType,
263  typename Encoder,
264  typename ValueManager,
265  typename ThreaderType,
266  unsigned int Base>
267 void ParallelRadixSortInternal<PlainType,
268  CompareType,
269  UnsignedType,
270  Encoder,
271  ValueManager,
272  ThreaderType,
273  Base>::DeleteAll()
274 {
275  delete[] tmp_;
276  tmp_ = NULL;
277 
278  for (size_t i = 0; i < num_threads_; ++i)
279  delete[] histo_[i];
280  delete[] histo_;
281  histo_ = NULL;
282 
283  for (size_t i = 0; i < num_threads_; ++i)
284  {
285  for (size_t j = 0; j < 1 << Base; ++j)
286  {
287  delete[] out_buf_[i][j];
288  }
289  delete[] out_buf_n_[i];
290  delete[] out_buf_[i];
291  }
292  delete[] out_buf_;
293  delete[] out_buf_n_;
294  out_buf_ = NULL;
295  out_buf_n_ = NULL;
296 
297  delete[] pos_bgn_;
298  delete[] pos_end_;
299  pos_bgn_ = pos_end_ = NULL;
300 
301  num_elems_ = 0;
302  num_threads_ = 0;
303 }
304 
305 template <typename PlainType,
306  typename CompareType,
307  typename UnsignedType,
308  typename Encoder,
309  typename ValueManager,
310  typename ThreaderType,
311  unsigned int Base>
312 void ParallelRadixSortInternal<PlainType,
313  CompareType,
314  UnsignedType,
315  Encoder,
316  ValueManager,
317  ThreaderType,
318  Base>::Init(PlainType* data,
319  size_t num_elems,
320  const ThreaderType& threader)
321 {
322  (void)data;
323  DeleteAll();
324 
325  threader_ = threader;
326 
327  num_elems_ = num_elems;
328 
329  num_threads_ =
330  utility::GetMaxThreads(num_elems_ * sizeof(PlainType), threader_.GetAvailableCores());
331 
332  tmp_ = new UnsignedType[num_elems_];
333  histo_ = new size_t*[num_threads_];
334  for (size_t i = 0; i < num_threads_; ++i)
335  {
336  histo_[i] = new size_t[1 << Base];
337  }
338 
339  out_buf_ = new UnsignedType**[num_threads_];
340  out_buf_n_ = new size_t*[num_threads_];
341  for (size_t i = 0; i < num_threads_; ++i)
342  {
343  out_buf_[i] = new UnsignedType*[1 << Base];
344  out_buf_n_[i] = new size_t[1 << Base];
345  for (size_t j = 0; j < 1 << Base; ++j)
346  {
347  out_buf_[i][j] = new UnsignedType[kOutBufferSize];
348  }
349  }
350 
351  pos_bgn_ = new size_t[num_threads_];
352  pos_end_ = new size_t[num_threads_];
353 }
354 
355 template <typename PlainType,
356  typename CompareType,
357  typename UnsignedType,
358  typename Encoder,
359  typename ValueManager,
360  typename ThreaderType,
361  unsigned int Base>
362 PlainType* ParallelRadixSortInternal<PlainType,
363  CompareType,
364  UnsignedType,
365  Encoder,
366  ValueManager,
367  ThreaderType,
368  Base>::Sort(PlainType* data, ValueManager* value_manager)
369 {
370  UnsignedType* src = reinterpret_cast<UnsignedType*>(data);
371  UnsignedType* res = SortInternal(src, value_manager);
372  return reinterpret_cast<PlainType*>(res);
373 }
374 
375 template <typename PlainType,
376  typename CompareType,
377  typename UnsignedType,
378  typename Encoder,
379  typename ValueManager,
380  typename ThreaderType,
381  unsigned int Base>
382 void ParallelRadixSortInternal<PlainType,
383  CompareType,
384  UnsignedType,
385  Encoder,
386  ValueManager,
387  ThreaderType,
388  Base>::InitAndSort(PlainType* data,
389  size_t num_elems,
390  const ThreaderType& threader,
391  ValueManager* value_manager)
392 {
393  ParallelRadixSortInternal prs;
394  prs.Init(data, num_elems, threader);
395  const PlainType* res = prs.Sort(data, value_manager);
396  if (res != data)
397  {
398  for (size_t i = 0; i < num_elems; ++i)
399  data[i] = res[i];
400  }
401 }
402 
403 template <typename PlainType,
404  typename CompareType,
405  typename UnsignedType,
406  typename Encoder,
407  typename ValueManager,
408  typename ThreaderType,
409  unsigned int Base>
410 UnsignedType* ParallelRadixSortInternal<PlainType,
411  CompareType,
412  UnsignedType,
413  Encoder,
414  ValueManager,
415  ThreaderType,
416  Base>::SortInternal(UnsignedType* data,
417  ValueManager* value_manager)
418 {
419 
420  value_manager_ = value_manager;
421 
422  // Compute |pos_bgn_| and |pos_end_|
423  ComputeRanges();
424 
425  // Iterate from lower bits to higher bits
426  const size_t bits = CHAR_BIT * sizeof(UnsignedType);
427  UnsignedType *src = data, *dst = tmp_;
428  for (unsigned int b = 0; b < bits; b += Base)
429  {
430  ComputeHistogram(b, src);
431  Scatter(b, src, dst);
432 
433  std::swap(src, dst);
434  value_manager->Next();
435  }
436 
437  return src;
438 }
439 
440 template <typename PlainType,
441  typename CompareType,
442  typename UnsignedType,
443  typename Encoder,
444  typename ValueManager,
445  typename ThreaderType,
446  unsigned int Base>
447 void ParallelRadixSortInternal<PlainType,
448  CompareType,
449  UnsignedType,
450  Encoder,
451  ValueManager,
452  ThreaderType,
453  Base>::ComputeRanges()
454 {
455  pos_bgn_[0] = 0;
456  for (size_t i = 0; i < num_threads_ - 1; ++i)
457  {
458  const size_t t = (num_elems_ - pos_bgn_[i]) / (num_threads_ - i);
459  pos_bgn_[i + 1] = pos_end_[i] = pos_bgn_[i] + t;
460  }
461  pos_end_[num_threads_ - 1] = num_elems_;
462 }
463 
464 template <typename PlainType,
465  typename UnsignedType,
466  typename Encoder,
467  unsigned int Base,
468  typename Function,
469  typename ThreaderType>
470 struct RunTask
471 {
472  RunTask(size_t binary_tree_height,
473  size_t binary_tree_position,
474  Function f,
475  size_t num_elems,
476  size_t num_threads,
477  const ThreaderType& threader)
478  : binary_tree_height_(binary_tree_height)
479  , binary_tree_position_(binary_tree_position)
480  , f_(f)
481  , num_elems_(num_elems)
482  , num_threads_(num_threads)
483  , threader_(threader)
484  {
485  }
486 
487  template <typename ThreaderData = void*>
488  void operator()(ThreaderData tData = nullptr) const
489  {
490  size_t num_nodes_at_current_height = (size_t)pow(2, (double)binary_tree_height_);
491  if (num_threads_ <= num_nodes_at_current_height)
492  {
493  const size_t my_id = binary_tree_position_ - num_nodes_at_current_height;
494  if (my_id < num_threads_)
495  {
496  f_(my_id);
497  }
498  }
499  else
500  {
501  RunTask left(binary_tree_height_ + 1,
502  2 * binary_tree_position_,
503  f_,
504  num_elems_,
505  num_threads_,
506  threader_);
507  RunTask right(binary_tree_height_ + 1,
508  2 * binary_tree_position_ + 1,
509  f_,
510  num_elems_,
511  num_threads_,
512  threader_);
513  threader_.RunChildTasks(tData, left, right);
514  }
515  }
516 
517  size_t binary_tree_height_;
518  size_t binary_tree_position_;
519  Function f_;
520  size_t num_elems_;
521  size_t num_threads_;
522  ThreaderType threader_;
523 };
524 
525 template <typename PlainType,
526  typename CompareType,
527  typename UnsignedType,
528  typename Encoder,
529  typename ValueManager,
530  typename ThreaderType,
531  unsigned int Base>
532 void ParallelRadixSortInternal<PlainType,
533  CompareType,
534  UnsignedType,
535  Encoder,
536  ValueManager,
537  ThreaderType,
538  Base>::ComputeHistogram(unsigned int b, UnsignedType* src)
539 {
540  // Compute local histogram
541 
542  auto lambda = [=](const size_t my_id) {
543  const size_t my_bgn = pos_bgn_[my_id];
544  const size_t my_end = pos_end_[my_id];
545  size_t* my_histo = histo_[my_id];
546 
547  memset(my_histo, 0, sizeof(size_t) * (1 << Base));
548  for (size_t i = my_bgn; i < my_end; ++i)
549  {
550  const UnsignedType s = Encoder::encode(src[i]);
551  UnsignedType t = (s >> b) & ((1 << Base) - 1);
552  compare_internal_.reverse(t);
553  ++my_histo[t];
554  }
555  };
556 
557  using RunTaskType =
558  RunTask<PlainType, UnsignedType, Encoder, Base, std::function<void(size_t)>, ThreaderType>;
559 
560  RunTaskType root(0, 1, lambda, num_elems_, num_threads_, threader_);
561  this->threader_.RunParentTask(root);
562 
563  // Compute global histogram
564  size_t s = 0;
565  for (size_t i = 0; i < 1 << Base; ++i)
566  {
567  for (size_t j = 0; j < num_threads_; ++j)
568  {
569  const size_t t = s + histo_[j][i];
570  histo_[j][i] = s;
571  s = t;
572  }
573  }
574 }
575 
576 template <typename PlainType,
577  typename CompareType,
578  typename UnsignedType,
579  typename Encoder,
580  typename ValueManager,
581  typename ThreaderType,
582  unsigned int Base>
583 void ParallelRadixSortInternal<PlainType,
584  CompareType,
585  UnsignedType,
586  Encoder,
587  ValueManager,
588  ThreaderType,
589  Base>::Scatter(unsigned int b, UnsignedType* src, UnsignedType* dst)
590 {
591 
592  auto lambda = [=](const size_t my_id) {
593  const size_t my_bgn = pos_bgn_[my_id];
594  const size_t my_end = pos_end_[my_id];
595  size_t* my_histo = histo_[my_id];
596  UnsignedType** my_buf = out_buf_[my_id];
597  size_t* my_buf_n = out_buf_n_[my_id];
598 
599  memset(my_buf_n, 0, sizeof(size_t) * (1 << Base));
600  for (size_t i = my_bgn; i < my_end; ++i)
601  {
602  const UnsignedType s = Encoder::encode(src[i]);
603  UnsignedType t = (s >> b) & ((1 << Base) - 1);
604  compare_internal_.reverse(t);
605  my_buf[t][my_buf_n[t]] = src[i];
606  value_manager_->Push(my_id, t, my_buf_n[t], i);
607  ++my_buf_n[t];
608 
609  if (my_buf_n[t] == kOutBufferSize)
610  {
611  size_t p = my_histo[t];
612  for (size_t j = 0; j < kOutBufferSize; ++j)
613  {
614  size_t tp = p++;
615  dst[tp] = my_buf[t][j];
616  }
617  value_manager_->Flush(my_id, t, kOutBufferSize, my_histo[t]);
618 
619  my_histo[t] += kOutBufferSize;
620  my_buf_n[t] = 0;
621  }
622  }
623 
624  // Flush everything
625  for (size_t i = 0; i < 1 << Base; ++i)
626  {
627  size_t p = my_histo[i];
628  for (size_t j = 0; j < my_buf_n[i]; ++j)
629  {
630  size_t tp = p++;
631  dst[tp] = my_buf[i][j];
632  }
633  value_manager_->Flush(my_id, i, my_buf_n[i], my_histo[i]);
634  }
635  };
636 
637  using RunTaskType =
638  RunTask<PlainType, UnsignedType, Encoder, Base, std::function<void(size_t)>, ThreaderType>;
639  RunTaskType root(0, 1, lambda, num_elems_, num_threads_, threader_);
640  this->threader_.RunParentTask(root);
641 }
642 } // namespace internal
643 
644 // Encoders encode signed/unsigned integers and floating point numbers
645 // to correctly ordered unsigned integers
646 namespace encoder
647 {
648 class EncoderDummy
649 {
650 };
651 
652 class EncoderUnsigned
653 {
654 public:
655  template <typename UnsignedType>
656  inline static UnsignedType encode(UnsignedType x)
657  {
658  return x;
659  }
660 };
661 
662 class EncoderSigned
663 {
664 public:
665  template <typename UnsignedType>
666  inline static UnsignedType encode(UnsignedType x)
667  {
668  return x ^ (UnsignedType(1) << (CHAR_BIT * sizeof(UnsignedType) - 1));
669  }
670 };
671 
672 class EncoderDecimal
673 {
674 public:
675  template <typename UnsignedType>
676  inline static UnsignedType encode(UnsignedType x)
677  {
678  static const size_t bits = CHAR_BIT * sizeof(UnsignedType);
679  const UnsignedType a = x >> (bits - 1);
680  const UnsignedType b = (-static_cast<int>(a)) | (UnsignedType(1) << (bits - 1));
681  return x ^ b;
682  }
683 };
684 } // namespace encoder
685 
686 // Value managers are used to generalize the sorting algorithm
687 // to sorting of keys and sorting of pairs
688 namespace value_manager
689 {
690 class DummyValueManager
691 {
692 public:
693  inline void Push(int thread, size_t bucket, size_t num, size_t from_pos)
694  {
695  (void)thread;
696  (void)bucket;
697  (void)num;
698  (void)from_pos;
699  }
700 
701  inline void Flush(int thread, size_t bucket, size_t num, size_t to_pos)
702  {
703  (void)thread;
704  (void)bucket;
705  (void)num;
706  (void)to_pos;
707  }
708 
709  void Next() {}
710 };
711 
712 template <typename PlainType, typename ValueType, int Base>
713 class PairValueManager
714 {
715 public:
716  PairValueManager()
717  : max_elems_(0)
718  , max_threads_(0)
719  , original_(NULL)
720  , tmp_(NULL)
721  , src_(NULL)
722  , dst_(NULL)
723  , out_buf_(NULL)
724  , tmp_size(0)
725  {
726  }
727 
728  ~PairValueManager() { DeleteAll(); }
729 
730  void Init(size_t max_elems, size_t available_threads);
731 
732  void Start(ValueType* original, size_t num_elems)
733  {
734  assert(num_elems <= max_elems_);
735  src_ = original_ = original;
736  dst_ = tmp_;
737  }
738 
739  inline void Push(int thread, size_t bucket, size_t num, size_t from_pos)
740  {
741  out_buf_[thread][bucket][num] = src_[from_pos];
742  }
743 
744  inline void Flush(int thread, size_t bucket, size_t num, size_t to_pos)
745  {
746  for (size_t i = 0; i < num; ++i)
747  {
748  dst_[to_pos++] = out_buf_[thread][bucket][i];
749  }
750  }
751 
752  void Next() { std::swap(src_, dst_); }
753 
754  ValueType* GetResult() { return src_; }
755 
756 private:
757  size_t max_elems_;
758  int max_threads_;
759 
760  static constexpr size_t kOutBufferSize = internal::kOutBufferSize;
761  ValueType *original_, *tmp_;
762  ValueType *src_, *dst_;
763  ValueType*** out_buf_;
764  vtkm::UInt64 tmp_size;
765 
766  void DeleteAll();
767 };
768 
769 template <typename PlainType, typename ValueType, int Base>
770 void PairValueManager<PlainType, ValueType, Base>::Init(size_t max_elems, size_t available_cores)
771 {
772  DeleteAll();
773 
774  max_elems_ = max_elems;
775  max_threads_ = utility::GetMaxThreads(max_elems_ * sizeof(PlainType), available_cores);
776 
777  { // This allocation can be quite large, so log it:
778  tmp_size = max_elems * sizeof(ValueType);
780  "Allocating working memory for radix sort-by-key: %s.",
781  vtkm::cont::GetSizeString(tmp_size).c_str());
782  tmp_ = new ValueType[max_elems];
783  }
784 
785  out_buf_ = new ValueType**[max_threads_];
786  for (int i = 0; i < max_threads_; ++i)
787  {
788  out_buf_[i] = new ValueType*[1 << Base];
789  for (size_t j = 0; j < 1 << Base; ++j)
790  {
791  out_buf_[i][j] = new ValueType[kOutBufferSize];
792  }
793  }
794 }
795 
796 template <typename PlainType, typename ValueType, int Base>
797 void PairValueManager<PlainType, ValueType, Base>::DeleteAll()
798 {
799  { // This allocation can be quite large, so log it:
801  "Freeing working memory for radix sort-by-key: %s.",
802  vtkm::cont::GetSizeString(tmp_size).c_str());
803  delete[] tmp_;
804  tmp_ = NULL;
805  tmp_size = 0;
806  }
807 
808  for (int i = 0; i < max_threads_; ++i)
809  {
810  for (size_t j = 0; j < 1 << Base; ++j)
811  {
812  delete[] out_buf_[i][j];
813  }
814  delete[] out_buf_[i];
815  }
816  delete[] out_buf_;
817  out_buf_ = NULL;
818 
819  max_elems_ = 0;
820  max_threads_ = 0;
821 }
822 } // namespace value_manager
823 
824 // Frontend class for sorting keys
825 template <typename ThreaderType,
826  typename PlainType,
827  typename CompareType,
828  typename UnsignedType = PlainType,
829  typename Encoder = encoder::EncoderDummy,
830  unsigned int Base = 8>
831 class KeySort
832 {
833  using DummyValueManager = value_manager::DummyValueManager;
834  using Internal = internal::ParallelRadixSortInternal<PlainType,
835  CompareType,
836  UnsignedType,
837  Encoder,
838  DummyValueManager,
839  ThreaderType,
840  Base>;
841 
842 public:
843  void InitAndSort(PlainType* data,
844  size_t num_elems,
845  const ThreaderType& threader,
846  const CompareType& comp)
847  {
848  (void)comp;
849  DummyValueManager dvm;
850  Internal::InitAndSort(data, num_elems, threader, &dvm);
851  }
852 };
853 
854 // Frontend class for sorting pairs
855 template <typename ThreaderType,
856  typename PlainType,
857  typename ValueType,
858  typename CompareType,
859  typename UnsignedType = PlainType,
860  typename Encoder = encoder::EncoderDummy,
861  int Base = 8>
862 class PairSort
863 {
864  using ValueManager = value_manager::PairValueManager<PlainType, ValueType, Base>;
865  using Internal = internal::ParallelRadixSortInternal<PlainType,
866  CompareType,
867  UnsignedType,
868  Encoder,
869  ValueManager,
870  ThreaderType,
871  Base>;
872 
873 public:
874  void InitAndSort(PlainType* keys,
875  ValueType* vals,
876  size_t num_elems,
877  const ThreaderType& threader,
878  const CompareType& comp)
879  {
880  (void)comp;
881  ValueManager vm;
882  vm.Init(num_elems, threader.GetAvailableCores());
883  vm.Start(vals, num_elems);
884  Internal::InitAndSort(keys, num_elems, threader, &vm);
885  ValueType* res_vals = vm.GetResult();
886  if (res_vals != vals)
887  {
888  for (size_t i = 0; i < num_elems; ++i)
889  {
890  vals[i] = res_vals[i];
891  }
892  }
893  }
894 
895 private:
896 };
897 
898 #define KEY_SORT_CASE(plain_type, compare_type, unsigned_type, encoder_type) \
899  template <typename ThreaderType> \
900  class KeySort<ThreaderType, plain_type, compare_type> \
901  : public KeySort<ThreaderType, \
902  plain_type, \
903  compare_type, \
904  unsigned_type, \
905  encoder::Encoder##encoder_type> \
906  { \
907  }; \
908  template <typename V, typename ThreaderType> \
909  class PairSort<ThreaderType, plain_type, V, compare_type> \
910  : public PairSort<ThreaderType, \
911  plain_type, \
912  V, \
913  compare_type, \
914  unsigned_type, \
915  encoder::Encoder##encoder_type> \
916  { \
917  };
918 
919 // Unsigned integers
920 KEY_SORT_CASE(unsigned int, std::less<unsigned int>, unsigned int, Unsigned);
921 KEY_SORT_CASE(unsigned int, std::greater<unsigned int>, unsigned int, Unsigned);
922 KEY_SORT_CASE(unsigned short int, std::less<unsigned short int>, unsigned short int, Unsigned);
923 KEY_SORT_CASE(unsigned short int, std::greater<unsigned short int>, unsigned short int, Unsigned);
924 KEY_SORT_CASE(unsigned long int, std::less<unsigned long int>, unsigned long int, Unsigned);
925 KEY_SORT_CASE(unsigned long int, std::greater<unsigned long int>, unsigned long int, Unsigned);
926 KEY_SORT_CASE(unsigned long long int,
927  std::less<unsigned long long int>,
928  unsigned long long int,
929  Unsigned);
930 KEY_SORT_CASE(unsigned long long int,
931  std::greater<unsigned long long int>,
932  unsigned long long int,
933  Unsigned);
934 
935 // Unsigned char
936 KEY_SORT_CASE(unsigned char, std::less<unsigned char>, unsigned char, Unsigned);
937 KEY_SORT_CASE(unsigned char, std::greater<unsigned char>, unsigned char, Unsigned);
938 KEY_SORT_CASE(char16_t, std::less<char16_t>, uint16_t, Unsigned);
939 KEY_SORT_CASE(char16_t, std::greater<char16_t>, uint16_t, Unsigned);
940 KEY_SORT_CASE(char32_t, std::less<char32_t>, uint32_t, Unsigned);
941 KEY_SORT_CASE(char32_t, std::greater<char32_t>, uint32_t, Unsigned);
942 KEY_SORT_CASE(wchar_t, std::less<wchar_t>, uint32_t, Unsigned);
943 KEY_SORT_CASE(wchar_t, std::greater<wchar_t>, uint32_t, Unsigned);
944 
945 // Signed integers
946 KEY_SORT_CASE(char, std::less<char>, unsigned char, Signed);
947 KEY_SORT_CASE(char, std::greater<char>, unsigned char, Signed);
948 KEY_SORT_CASE(short, std::less<short>, unsigned short, Signed);
949 KEY_SORT_CASE(short, std::greater<short>, unsigned short, Signed);
950 KEY_SORT_CASE(int, std::less<int>, unsigned int, Signed);
951 KEY_SORT_CASE(int, std::greater<int>, unsigned int, Signed);
952 KEY_SORT_CASE(long, std::less<long>, unsigned long, Signed);
953 KEY_SORT_CASE(long, std::greater<long>, unsigned long, Signed);
954 KEY_SORT_CASE(long long, std::less<long long>, unsigned long long, Signed);
955 KEY_SORT_CASE(long long, std::greater<long long>, unsigned long long, Signed);
956 
957 // |signed char| and |char| are treated as different types
958 KEY_SORT_CASE(signed char, std::less<signed char>, unsigned char, Signed);
959 KEY_SORT_CASE(signed char, std::greater<signed char>, unsigned char, Signed);
960 
961 // Floating point numbers
962 KEY_SORT_CASE(float, std::less<float>, uint32_t, Decimal);
963 KEY_SORT_CASE(float, std::greater<float>, uint32_t, Decimal);
964 KEY_SORT_CASE(double, std::less<double>, uint64_t, Decimal);
965 KEY_SORT_CASE(double, std::greater<double>, uint64_t, Decimal);
966 
967 #undef KEY_SORT_CASE
968 
969 template <typename T, typename CompareType>
970 struct run_kx_radix_sort_keys
971 {
972  static void run(T* data, size_t num_elems, const CompareType& comp)
973  {
974  std::sort(data, data + num_elems, comp);
975  }
976 };
977 
978 #define KX_SORT_KEYS(key_type) \
979  template <> \
980  struct run_kx_radix_sort_keys<key_type, std::less<key_type>> \
981  { \
982  static void run(key_type* data, size_t num_elems, const std::less<key_type>& comp) \
983  { \
984  (void)comp; \
985  kx::radix_sort(data, data + num_elems); \
986  } \
987  };
988 
989 KX_SORT_KEYS(unsigned short int);
990 KX_SORT_KEYS(int);
991 KX_SORT_KEYS(unsigned int);
992 KX_SORT_KEYS(long int);
993 KX_SORT_KEYS(unsigned long int);
994 KX_SORT_KEYS(long long int);
995 KX_SORT_KEYS(unsigned long long int);
996 KX_SORT_KEYS(unsigned char);
997 
998 #undef KX_SORT_KEYS
999 
1000 template <typename T, typename CompareType>
1001 bool use_serial_sort_keys(T* data, size_t num_elems, const CompareType& comp)
1002 {
1003  size_t total_bytes = (num_elems) * sizeof(T);
1004  if (total_bytes < MIN_BYTES_FOR_PARALLEL)
1005  {
1006  run_kx_radix_sort_keys<T, CompareType>::run(data, num_elems, comp);
1007  return true;
1008  }
1009  return false;
1010 }
1011 
1012 // Generate radix sort interfaces for key and key value sorts.
1013 #define VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(threader_type, key_type) \
1014  VTKM_CONT_EXPORT void parallel_radix_sort_key_values( \
1015  key_type* keys, vtkm::Id* vals, size_t num_elems, const std::greater<key_type>& comp) \
1016  { \
1017  using namespace vtkm::cont::internal::radix; \
1018  PairSort<threader_type, key_type, vtkm::Id, std::greater<key_type>> ps; \
1019  ps.InitAndSort(keys, vals, num_elems, threader_type(), comp); \
1020  } \
1021  VTKM_CONT_EXPORT void parallel_radix_sort_key_values( \
1022  key_type* keys, vtkm::Id* vals, size_t num_elems, const std::less<key_type>& comp) \
1023  { \
1024  using namespace vtkm::cont::internal::radix; \
1025  PairSort<threader_type, key_type, vtkm::Id, std::less<key_type>> ps; \
1026  ps.InitAndSort(keys, vals, num_elems, threader_type(), comp); \
1027  } \
1028  VTKM_CONT_EXPORT void parallel_radix_sort( \
1029  key_type* data, size_t num_elems, const std::greater<key_type>& comp) \
1030  { \
1031  using namespace vtkm::cont::internal::radix; \
1032  if (!use_serial_sort_keys(data, num_elems, comp)) \
1033  { \
1034  KeySort<threader_type, key_type, std::greater<key_type>> ks; \
1035  ks.InitAndSort(data, num_elems, threader_type(), comp); \
1036  } \
1037  } \
1038  VTKM_CONT_EXPORT void parallel_radix_sort( \
1039  key_type* data, size_t num_elems, const std::less<key_type>& comp) \
1040  { \
1041  using namespace vtkm::cont::internal::radix; \
1042  if (!use_serial_sort_keys(data, num_elems, comp)) \
1043  { \
1044  KeySort<threader_type, key_type, std::less<key_type>> ks; \
1045  ks.InitAndSort(data, num_elems, threader_type(), comp); \
1046  } \
1047  }
1048 
1049 #define VTKM_INSTANTIATE_RADIX_SORT_FOR_THREADER(ThreaderType) \
1050  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, short int) \
1051  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned short int) \
1052  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, int) \
1053  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned int) \
1054  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, long int) \
1055  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned long int) \
1056  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, long long int) \
1057  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned long long int) \
1058  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, unsigned char) \
1059  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, signed char) \
1060  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, char) \
1061  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, char16_t) \
1062  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, char32_t) \
1063  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, wchar_t) \
1064  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, float) \
1065  VTKM_INTERNAL_RADIX_SORT_INSTANTIATE(ThreaderType, double)
1066 
1068 }
1069 }
1070 }
1071 } // end namespace vtkm::cont::internal::radix
1072 
1073 #endif // vtk_m_cont_internal_ParallelRadixSort_h
VTKM_THIRDPARTY_POST_INCLUDE
#define VTKM_THIRDPARTY_POST_INCLUDE
Definition: Configure.h:192
vtkm
Groups connected points that have the same field value.
Definition: Atomic.h:19
Types.h
KX_SORT_KEYS
#define KX_SORT_KEYS(key_type)
Definition: ParallelRadixSort.h:978
vtkm::cont::GetSizeString
std::string GetSizeString(vtkm::UInt64 bytes, int prec=2)
Returns "%1 (%2 bytes)" where %1 is the result from GetHumanReadableSize and %2 is the exact number o...
KXSort.h
VTKM_LOG_F
#define VTKM_LOG_F(level,...)
Writes a message using printf syntax to the indicated log level.
Definition: Logging.h:209
vtkm::cont::LogLevel::MemCont
@ MemCont
Host-side resource allocations/frees (e.g. ArrayHandle control buffers).
VTKM_THIRDPARTY_PRE_INCLUDE
#define VTKM_THIRDPARTY_PRE_INCLUDE
Definition: Configure.h:191
vtkm::UInt64
unsigned long long UInt64
Base type to use for 64-bit signed integer numbers.
Definition: Types.h:207
Logging.h
Logging utilities.
KEY_SORT_CASE
#define KEY_SORT_CASE(plain_type, compare_type, unsigned_type, encoder_type)
Definition: ParallelRadixSort.h:898
ParallelRadixSortInterface.h