VTK-m  2.0
DeviceAdapterAlgorithmCuda.h
Go to the documentation of this file.
1 //============================================================================
2 // Copyright (c) Kitware, Inc.
3 // All rights reserved.
4 // See LICENSE.txt for details.
5 //
6 // This software is distributed WITHOUT ANY WARRANTY; without even
7 // the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8 // PURPOSE. See the above copyright notice for more information.
9 //============================================================================
10 #ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
11 #define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
12 
13 #include <vtkm/Math.h>
14 #include <vtkm/TypeTraits.h>
15 #include <vtkm/Types.h>
16 #include <vtkm/UnaryPredicates.h>
17 
18 #include <vtkm/cont/ArrayHandle.h>
20 #include <vtkm/cont/BitField.h>
23 #include <vtkm/cont/Logging.h>
24 #include <vtkm/cont/Token.h>
25 #include <vtkm/cont/vtkm_cont_export.h>
26 
28 
36 
39 
40 // Disable warnings we check vtkm for but Thrust does not.
42 VTKM_THIRDPARTY_PRE_INCLUDE
43 //needs to be first
44 #include <vtkm/exec/cuda/internal/ExecutionPolicy.h>
45 
46 #include <cooperative_groups.h>
47 #include <cuda.h>
48 #include <thrust/advance.h>
49 #include <thrust/binary_search.h>
50 #include <thrust/copy.h>
51 #include <thrust/count.h>
52 #include <thrust/iterator/counting_iterator.h>
53 #include <thrust/scan.h>
54 #include <thrust/sort.h>
55 #include <thrust/system/cpp/memory.h>
56 #include <thrust/system/cuda/vector.h>
57 #include <thrust/unique.h>
58 
59 VTKM_THIRDPARTY_POST_INCLUDE
60 
61 #include <limits>
62 #include <memory>
63 
64 namespace vtkm
65 {
66 namespace cont
67 {
68 namespace cuda
69 {
70 
81 struct VTKM_CONT_EXPORT ScheduleParameters
82 {
85 
88 
91 };
92 
127 VTKM_CONT_EXPORT void InitScheduleParameters(
128  vtkm::cont::cuda::ScheduleParameters (*)(char const* name,
129  int major,
130  int minor,
131  int multiProcessorCount,
132  int maxThreadsPerMultiProcessor,
133  int maxThreadsPerBlock));
134 
135 namespace internal
136 {
137 
138 #if (defined(VTKM_GCC) || defined(VTKM_CLANG))
139 #pragma GCC diagnostic push
140 #pragma GCC diagnostic ignored "-Wunused-parameter"
141 #endif
142 
143 template <typename TaskType>
144 __global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id size)
145 {
146  //see https://devblogs.nvidia.com/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
147  //for why our inc is grid-stride
148  const vtkm::Id start = blockIdx.x * blockDim.x + threadIdx.x;
149  const vtkm::Id inc = blockDim.x * gridDim.x;
150  task(start, size, inc);
151 }
152 
153 template <typename TaskType>
154 __global__ void TaskStrided3DLaunch(TaskType task, vtkm::Id3 size)
155 {
156  //This is the 3D version of executing in a grid-stride manner
157  const dim3 start(blockIdx.x * blockDim.x + threadIdx.x,
158  blockIdx.y * blockDim.y + threadIdx.y,
159  blockIdx.z * blockDim.z + threadIdx.z);
160  const dim3 inc(blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z);
161 
162  for (vtkm::Id k = start.z; k < size[2]; k += inc.z)
163  {
164  for (vtkm::Id j = start.y; j < size[1]; j += inc.y)
165  {
166  task(size, start.x, size[0], inc.x, j, k);
167  }
168  }
169 }
170 
171 template <typename T, typename BinaryOperationType>
172 __global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
173 {
174  result = binary_op(a, b);
175 }
176 
177 #if (defined(VTKM_GCC) || defined(VTKM_CLANG))
178 #pragma GCC diagnostic pop
179 #endif
180 
181 template <typename FunctorType, typename ArgType>
182 struct FunctorSupportsUnaryImpl
183 {
184  template <typename F, typename A, typename = decltype(std::declval<F>()(std::declval<A>()))>
185  static std::true_type has(int);
186  template <typename F, typename A>
187  static std::false_type has(...);
188  using type = decltype(has<FunctorType, ArgType>(0));
189 };
190 template <typename FunctorType, typename ArgType>
191 using FunctorSupportsUnary = typename FunctorSupportsUnaryImpl<FunctorType, ArgType>::type;
192 
193 template <typename PortalType,
194  typename BinaryAndUnaryFunctor,
195  typename = FunctorSupportsUnary<BinaryAndUnaryFunctor, typename PortalType::ValueType>>
196 struct CastPortal;
197 
198 template <typename PortalType, typename BinaryAndUnaryFunctor>
199 struct CastPortal<PortalType, BinaryAndUnaryFunctor, std::true_type>
200 {
201  using InputType = typename PortalType::ValueType;
202  using ValueType = decltype(std::declval<BinaryAndUnaryFunctor>()(std::declval<InputType>()));
203 
204  PortalType Portal;
205  BinaryAndUnaryFunctor Functor;
206 
207  VTKM_CONT
208  CastPortal(const PortalType& portal, const BinaryAndUnaryFunctor& functor)
209  : Portal(portal)
210  , Functor(functor)
211  {
212  }
213 
214  VTKM_EXEC
215  vtkm::Id GetNumberOfValues() const { return this->Portal.GetNumberOfValues(); }
216 
217  VTKM_EXEC
218  ValueType Get(vtkm::Id index) const { return this->Functor(this->Portal.Get(index)); }
219 };
220 
221 template <typename PortalType, typename BinaryFunctor>
222 struct CastPortal<PortalType, BinaryFunctor, std::false_type>
223 {
224  using InputType = typename PortalType::ValueType;
225  using ValueType =
226  decltype(std::declval<BinaryFunctor>()(std::declval<InputType>(), std::declval<InputType>()));
227 
228  PortalType Portal;
229 
230  VTKM_CONT
231  CastPortal(const PortalType& portal, const BinaryFunctor&)
232  : Portal(portal)
233  {
234  }
235 
236  VTKM_EXEC
237  vtkm::Id GetNumberOfValues() const { return this->Portal.GetNumberOfValues(); }
238 
239  VTKM_EXEC
240  ValueType Get(vtkm::Id index) const { return static_cast<ValueType>(this->Portal.Get(index)); }
241 };
242 
243 struct CudaFreeFunctor
244 {
245  void operator()(void* ptr) const { VTKM_CUDA_CALL(cudaFree(ptr)); }
246 };
247 
248 template <typename T>
249 using CudaUniquePtr = std::unique_ptr<T, CudaFreeFunctor>;
250 
251 template <typename T>
252 CudaUniquePtr<T> make_CudaUniquePtr(std::size_t numElements)
253 {
254  T* ptr;
255  VTKM_CUDA_CALL(cudaMalloc(&ptr, sizeof(T) * numElements));
256  return CudaUniquePtr<T>(ptr);
257 }
258 }
259 } // end namespace cuda::internal
260 
261 template <>
262 struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>
263  : vtkm::cont::internal::DeviceAdapterAlgorithmGeneral<
264  vtkm::cont::DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>,
265  vtkm::cont::DeviceAdapterTagCuda>
266 {
267 // Because of some funny code conversions in nvcc, kernels for devices have to
268 // be public.
269 #ifndef VTKM_CUDA
270 private:
271 #endif
272 
273  using Superclass = vtkm::cont::internal::DeviceAdapterAlgorithmGeneral<
275  vtkm::cont::DeviceAdapterTagCuda>;
276 
277  template <typename BitsPortal, typename IndicesPortal, typename GlobalPopCountType>
278  struct BitFieldToUnorderedSetFunctor : public vtkm::exec::FunctorBase
279  {
280  VTKM_STATIC_ASSERT_MSG(VTKM_PASS_COMMAS(std::is_same<GlobalPopCountType, vtkm::Int32>::value ||
281  std::is_same<GlobalPopCountType, vtkm::UInt32>::value ||
282  std::is_same<GlobalPopCountType, vtkm::UInt64>::value),
283  "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
284 
285  //Using typename BitsPortal::WordTypePreferred causes dependent type errors using GCC 4.8.5
286  //which is the GCC required compiler for CUDA 9.2 on summit/power9
288 
290  VTKM_PASS_COMMAS(std::is_same<typename IndicesPortal::ValueType, vtkm::Id>::value));
291 
292  VTKM_CONT
293  BitFieldToUnorderedSetFunctor(const BitsPortal& input,
294  const IndicesPortal& output,
295  GlobalPopCountType* globalPopCount)
296  : Input{ input }
297  , Output{ output }
298  , GlobalPopCount{ globalPopCount }
299  , FinalWordIndex{ input.GetNumberOfWords() - 1 }
300  , FinalWordMask(input.GetFinalWordMask())
301  {
302  }
303 
305 
307  {
308  assert(this->GlobalPopCount != nullptr);
309  VTKM_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0, sizeof(GlobalPopCountType)));
310  }
311 
313  __device__ void operator()(vtkm::Id wordIdx) const
314  {
315  Word word = this->Input.GetWord(wordIdx);
316 
317  // The last word may be partial -- mask out trailing bits if needed.
318  const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~Word{ 0 };
319 
320  word &= mask;
321 
322  if (word != 0)
323  {
324  this->LocalPopCount = vtkm::CountSetBits(word);
325  this->ReduceAllocate();
326 
327  vtkm::Id firstBitIdx = wordIdx * sizeof(Word) * CHAR_BIT;
328  do
329  {
330  // Find next bit. FindFirstSetBit's result is indexed starting at 1.
331  vtkm::Int32 bit = vtkm::FindFirstSetBit(word) - 1;
332  vtkm::Id outIdx = this->GetNextOutputIndex();
333  // Write index of bit
334  this->Output.Set(outIdx, firstBitIdx + bit);
335  word ^= (1 << bit); // clear bit
336  } while (word != 0); // have bits
337  }
338  }
339 
341  {
342  assert(this->GlobalPopCount != nullptr);
343  GlobalPopCountType result;
344  VTKM_CUDA_CALL(cudaMemcpy(
345  &result, this->GlobalPopCount, sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
346  return static_cast<vtkm::Id>(result);
347  }
348 
349  private:
350  // Every thread with a non-zero local popcount calls this function, which
351  // computes the total popcount for the coalesced threads and allocates
352  // a contiguous block in the output by atomically increasing the global
353  // popcount.
355  __device__ void ReduceAllocate() const
356  {
357  const auto activeLanes = cooperative_groups::coalesced_threads();
358  const int activeRank = activeLanes.thread_rank();
359  const int activeSize = activeLanes.size();
360 
361  // Reduction value:
362  vtkm::Int32 rVal = this->LocalPopCount;
363  for (int delta = 1; delta < activeSize; delta *= 2)
364  {
365  const vtkm::Int32 shflVal = activeLanes.shfl_down(rVal, delta);
366  if (activeRank + delta < activeSize)
367  {
368  rVal += shflVal;
369  }
370  }
371 
372  if (activeRank == 0)
373  {
374  this->AllocationHead =
375  atomicAdd(this->GlobalPopCount, static_cast<GlobalPopCountType>(rVal));
376  }
377 
378  this->AllocationHead = activeLanes.shfl(this->AllocationHead, 0);
379  }
380 
381  // The global output allocation is written to by striding the writes across
382  // the warp lanes, allowing the writes to global memory to be coalesced.
384  __device__ vtkm::Id GetNextOutputIndex() const
385  {
386  // Only lanes with unwritten output indices left will call this method,
387  // so just check the coalesced threads:
388  const auto activeLanes = cooperative_groups::coalesced_threads();
389  const int activeRank = activeLanes.thread_rank();
390  const int activeSize = activeLanes.size();
391 
392  vtkm::Id nextIdx = static_cast<vtkm::Id>(this->AllocationHead + activeRank);
393  this->AllocationHead += activeSize;
394 
395  return nextIdx;
396  }
397 
398  const BitsPortal Input;
399  const IndicesPortal Output;
400  GlobalPopCountType* GlobalPopCount;
401  mutable vtkm::UInt64 AllocationHead{ 0 };
402  mutable vtkm::Int32 LocalPopCount{ 0 };
403  // Used to mask trailing bits the in last word.
404  vtkm::Id FinalWordIndex{ 0 };
405  Word FinalWordMask{ 0 };
406  };
407 
408  template <class InputPortal, class OutputPortal>
409  VTKM_CONT static void CopyPortal(const InputPortal& input, const OutputPortal& output)
410  {
411  try
412  {
413  ::thrust::copy(ThrustCudaPolicyPerThread,
414  cuda::internal::IteratorBegin(input),
415  cuda::internal::IteratorEnd(input),
416  cuda::internal::IteratorBegin(output));
417  }
418  catch (...)
419  {
420  cuda::internal::throwAsVTKmException();
421  }
422  }
423 
424  template <class ValueIterator, class StencilPortal, class OutputPortal, class UnaryPredicate>
425  VTKM_CONT static vtkm::Id CopyIfPortal(ValueIterator valuesBegin,
426  ValueIterator valuesEnd,
427  StencilPortal stencil,
428  OutputPortal output,
429  UnaryPredicate unary_predicate)
430  {
431  auto outputBegin = cuda::internal::IteratorBegin(output);
432 
433  using ValueType = typename StencilPortal::ValueType;
434 
435  vtkm::exec::cuda::internal::WrappedUnaryPredicate<ValueType, UnaryPredicate> up(
436  unary_predicate);
437 
438  try
439  {
440  auto newLast = ::thrust::copy_if(ThrustCudaPolicyPerThread,
441  valuesBegin,
442  valuesEnd,
443  cuda::internal::IteratorBegin(stencil),
444  outputBegin,
445  up);
446  return static_cast<vtkm::Id>(::thrust::distance(outputBegin, newLast));
447  }
448  catch (...)
449  {
450  cuda::internal::throwAsVTKmException();
451  return vtkm::Id(0);
452  }
453  }
454 
455  template <class ValuePortal, class StencilPortal, class OutputPortal, class UnaryPredicate>
456  VTKM_CONT static vtkm::Id CopyIfPortal(ValuePortal values,
457  StencilPortal stencil,
458  OutputPortal output,
459  UnaryPredicate unary_predicate)
460  {
461  return CopyIfPortal(cuda::internal::IteratorBegin(values),
462  cuda::internal::IteratorEnd(values),
463  stencil,
464  output,
465  unary_predicate);
466  }
467 
468  template <class InputPortal, class OutputPortal>
469  VTKM_CONT static void CopySubRangePortal(const InputPortal& input,
470  vtkm::Id inputOffset,
471  vtkm::Id size,
472  const OutputPortal& output,
473  vtkm::Id outputOffset)
474  {
475  try
476  {
477  ::thrust::copy_n(ThrustCudaPolicyPerThread,
478  cuda::internal::IteratorBegin(input) + inputOffset,
479  static_cast<std::size_t>(size),
480  cuda::internal::IteratorBegin(output) + outputOffset);
481  }
482  catch (...)
483  {
484  cuda::internal::throwAsVTKmException();
485  }
486  }
487 
488 
489  template <typename BitsPortal, typename GlobalPopCountType>
490  struct CountSetBitsFunctor : public vtkm::exec::FunctorBase
491  {
492  VTKM_STATIC_ASSERT_MSG(VTKM_PASS_COMMAS(std::is_same<GlobalPopCountType, vtkm::Int32>::value ||
493  std::is_same<GlobalPopCountType, vtkm::UInt32>::value ||
494  std::is_same<GlobalPopCountType, vtkm::UInt64>::value),
495  "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
496 
497  //Using typename BitsPortal::WordTypePreferred causes dependent type errors using GCC 4.8.5
498  //which is the GCC required compiler for CUDA 9.2 on summit/power9
500 
501  VTKM_CONT
502  CountSetBitsFunctor(const BitsPortal& portal, GlobalPopCountType* globalPopCount)
503  : Portal{ portal }
504  , GlobalPopCount{ globalPopCount }
505  , FinalWordIndex{ portal.GetNumberOfWords() - 1 }
506  , FinalWordMask{ portal.GetFinalWordMask() }
507  {
508  }
509 
511 
513  {
514  assert(this->GlobalPopCount != nullptr);
515  VTKM_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0, sizeof(GlobalPopCountType)));
516  }
517 
519  __device__ void operator()(vtkm::Id wordIdx) const
520  {
521  Word word = this->Portal.GetWord(wordIdx);
522 
523  // The last word may be partial -- mask out trailing bits if needed.
524  const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~Word{ 0 };
525 
526  word &= mask;
527 
528  if (word != 0)
529  {
530  this->LocalPopCount = vtkm::CountSetBits(word);
531  this->Reduce();
532  }
533  }
534 
536  {
537  assert(this->GlobalPopCount != nullptr);
538  GlobalPopCountType result;
539  VTKM_CUDA_CALL(cudaMemcpy(
540  &result, this->GlobalPopCount, sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
541  return static_cast<vtkm::Id>(result);
542  }
543 
544  private:
545  // Every thread with a non-zero local popcount calls this function, which
546  // computes the total popcount for the coalesced threads and atomically
547  // increasing the global popcount.
549  __device__ void Reduce() const
550  {
551  const auto activeLanes = cooperative_groups::coalesced_threads();
552  const int activeRank = activeLanes.thread_rank();
553  const int activeSize = activeLanes.size();
554 
555  // Reduction value:
556  vtkm::Int32 rVal = this->LocalPopCount;
557  for (int delta = 1; delta < activeSize; delta *= 2)
558  {
559  const vtkm::Int32 shflVal = activeLanes.shfl_down(rVal, delta);
560  if (activeRank + delta < activeSize)
561  {
562  rVal += shflVal;
563  }
564  }
565 
566  if (activeRank == 0)
567  {
568  atomicAdd(this->GlobalPopCount, static_cast<GlobalPopCountType>(rVal));
569  }
570  }
571 
572  const BitsPortal Portal;
573  GlobalPopCountType* GlobalPopCount;
574  mutable vtkm::Int32 LocalPopCount{ 0 };
575  // Used to mask trailing bits the in last word.
576  vtkm::Id FinalWordIndex{ 0 };
577  Word FinalWordMask{ 0 };
578  };
579 
580  template <class InputPortal, class ValuesPortal, class OutputPortal>
581  VTKM_CONT static void LowerBoundsPortal(const InputPortal& input,
582  const ValuesPortal& values,
583  const OutputPortal& output)
584  {
585  using ValueType = typename ValuesPortal::ValueType;
586  LowerBoundsPortal(input, values, output, ::thrust::less<ValueType>());
587  }
588 
589  template <class InputPortal, class OutputPortal>
590  VTKM_CONT static void LowerBoundsPortal(const InputPortal& input,
591  const OutputPortal& values_output)
592  {
593  using ValueType = typename InputPortal::ValueType;
594  LowerBoundsPortal(input, values_output, values_output, ::thrust::less<ValueType>());
595  }
596 
597  template <class InputPortal, class ValuesPortal, class OutputPortal, class BinaryCompare>
598  VTKM_CONT static void LowerBoundsPortal(const InputPortal& input,
599  const ValuesPortal& values,
600  const OutputPortal& output,
601  BinaryCompare binary_compare)
602  {
603  using ValueType = typename InputPortal::ValueType;
604  vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
605  binary_compare);
606 
607  try
608  {
609  ::thrust::lower_bound(ThrustCudaPolicyPerThread,
610  cuda::internal::IteratorBegin(input),
611  cuda::internal::IteratorEnd(input),
612  cuda::internal::IteratorBegin(values),
613  cuda::internal::IteratorEnd(values),
614  cuda::internal::IteratorBegin(output),
615  bop);
616  }
617  catch (...)
618  {
619  cuda::internal::throwAsVTKmException();
620  }
621  }
622 
623  template <class InputPortal, typename T>
624  VTKM_CONT static T ReducePortal(const InputPortal& input, T initialValue)
625  {
626  return ReducePortal(input, initialValue, ::thrust::plus<T>());
627  }
628 
629  template <class InputPortal, typename T, class BinaryFunctor>
630  VTKM_CONT static T ReducePortal(const InputPortal& input,
631  T initialValue,
632  BinaryFunctor binary_functor)
633  {
634  using fast_path = std::is_same<typename InputPortal::ValueType, T>;
635  return ReducePortalImpl(input, initialValue, binary_functor, fast_path());
636  }
637 
638  template <class InputPortal, typename T, class BinaryFunctor>
639  VTKM_CONT static T ReducePortalImpl(const InputPortal& input,
640  T initialValue,
641  BinaryFunctor binary_functor,
642  std::true_type)
643  {
644  //The portal type and the initial value are the same so we can use
645  //the thrust reduction algorithm
646  vtkm::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);
647 
648  try
649  {
650  return ::thrust::reduce(ThrustCudaPolicyPerThread,
651  cuda::internal::IteratorBegin(input),
652  cuda::internal::IteratorEnd(input),
653  initialValue,
654  bop);
655  }
656  catch (...)
657  {
658  cuda::internal::throwAsVTKmException();
659  }
660 
661  return initialValue;
662  }
663 
664  template <class InputPortal, typename T, class BinaryFunctor>
665  VTKM_CONT static T ReducePortalImpl(const InputPortal& input,
666  T initialValue,
667  BinaryFunctor binary_functor,
668  std::false_type)
669  {
670  //The portal type and the initial value AREN'T the same type so we have
671  //to a slower approach, where we wrap the input portal inside a cast
672  //portal
673  vtkm::cont::cuda::internal::CastPortal<InputPortal, BinaryFunctor> castPortal(input,
674  binary_functor);
675 
676  vtkm::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);
677 
678  try
679  {
680  return ::thrust::reduce(ThrustCudaPolicyPerThread,
681  cuda::internal::IteratorBegin(castPortal),
682  cuda::internal::IteratorEnd(castPortal),
683  initialValue,
684  bop);
685  }
686  catch (...)
687  {
688  cuda::internal::throwAsVTKmException();
689  }
690 
691  return initialValue;
692  }
693 
694  template <class KeysPortal,
695  class ValuesPortal,
696  class KeysOutputPortal,
697  class ValueOutputPortal,
698  class BinaryFunctor>
699  VTKM_CONT static vtkm::Id ReduceByKeyPortal(const KeysPortal& keys,
700  const ValuesPortal& values,
701  const KeysOutputPortal& keys_output,
702  const ValueOutputPortal& values_output,
703  BinaryFunctor binary_functor)
704  {
705  auto keys_out_begin = cuda::internal::IteratorBegin(keys_output);
706  auto values_out_begin = cuda::internal::IteratorBegin(values_output);
707 
708  ::thrust::pair<decltype(keys_out_begin), decltype(values_out_begin)> result_iterators;
709 
710  ::thrust::equal_to<typename KeysPortal::ValueType> binaryPredicate;
711 
712  using ValueType = typename ValuesPortal::ValueType;
713  vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binary_functor);
714 
715  try
716  {
717  result_iterators = ::thrust::reduce_by_key(vtkm_cuda_policy(),
718  cuda::internal::IteratorBegin(keys),
719  cuda::internal::IteratorEnd(keys),
720  cuda::internal::IteratorBegin(values),
721  keys_out_begin,
722  values_out_begin,
723  binaryPredicate,
724  bop);
725  }
726  catch (...)
727  {
728  cuda::internal::throwAsVTKmException();
729  }
730 
731  return static_cast<vtkm::Id>(::thrust::distance(keys_out_begin, result_iterators.first));
732  }
733 
734  template <class InputPortal, class OutputPortal>
735  VTKM_CONT static typename InputPortal::ValueType ScanExclusivePortal(const InputPortal& input,
736  const OutputPortal& output)
737  {
738  using ValueType = typename OutputPortal::ValueType;
739 
740  return ScanExclusivePortal(input,
741  output,
742  (::thrust::plus<ValueType>()),
744  }
745 
746  template <class InputPortal, class OutputPortal, class BinaryFunctor>
747  VTKM_CONT static typename InputPortal::ValueType ScanExclusivePortal(
748  const InputPortal& input,
749  const OutputPortal& output,
750  BinaryFunctor binaryOp,
751  typename InputPortal::ValueType initialValue)
752  {
753  // Use iterator to get value so that thrust device_ptr has chance to handle
754  // data on device.
755  using ValueType = typename OutputPortal::ValueType;
756 
757  //we have size three so that we can store the origin end value, the
758  //new end value, and the sum of those two
759  ::thrust::system::cuda::vector<ValueType> sum(3);
760  try
761  {
762 
763  //store the current value of the last position array in a separate cuda
764  //memory location since the exclusive_scan will overwrite that value
765  //once run
766  ::thrust::copy_n(
767  ThrustCudaPolicyPerThread, cuda::internal::IteratorEnd(input) - 1, 1, sum.begin());
768 
769  vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binaryOp);
770 
771  auto end = ::thrust::exclusive_scan(ThrustCudaPolicyPerThread,
772  cuda::internal::IteratorBegin(input),
773  cuda::internal::IteratorEnd(input),
774  cuda::internal::IteratorBegin(output),
775  initialValue,
776  bop);
777 
778  //Store the new value for the end of the array. This is done because
779  //with items such as the transpose array it is unsafe to pass the
780  //portal to the SumExclusiveScan
781  ::thrust::copy_n(ThrustCudaPolicyPerThread, (end - 1), 1, sum.begin() + 1);
782 
783  //execute the binaryOp one last time on the device.
784  cuda::internal::SumExclusiveScan<<<1, 1, 0, cudaStreamPerThread>>>(
785  sum[0], sum[1], sum[2], bop);
786  }
787  catch (...)
788  {
789  cuda::internal::throwAsVTKmException();
790  }
791  return sum[2];
792  }
793 
794  template <class InputPortal, class OutputPortal>
795  VTKM_CONT static typename InputPortal::ValueType ScanInclusivePortal(const InputPortal& input,
796  const OutputPortal& output)
797  {
798  using ValueType = typename OutputPortal::ValueType;
799  return ScanInclusivePortal(input, output, ::thrust::plus<ValueType>());
800  }
801 
802  template <class InputPortal, class OutputPortal, class BinaryFunctor>
803  VTKM_CONT static typename InputPortal::ValueType ScanInclusivePortal(const InputPortal& input,
804  const OutputPortal& output,
805  BinaryFunctor binary_functor)
806  {
807  using ValueType = typename OutputPortal::ValueType;
808  vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binary_functor);
809 
810  try
811  {
812  ::thrust::system::cuda::vector<ValueType> result(1);
813  auto end = ::thrust::inclusive_scan(ThrustCudaPolicyPerThread,
814  cuda::internal::IteratorBegin(input),
815  cuda::internal::IteratorEnd(input),
816  cuda::internal::IteratorBegin(output),
817  bop);
818 
819  ::thrust::copy_n(ThrustCudaPolicyPerThread, end - 1, 1, result.begin());
820  return result[0];
821  }
822  catch (...)
823  {
824  cuda::internal::throwAsVTKmException();
825  return typename InputPortal::ValueType();
826  }
827 
828  //return the value at the last index in the array, as that is the sum
829  }
830 
831  template <typename KeysPortal, typename ValuesPortal, typename OutputPortal>
832  VTKM_CONT static void ScanInclusiveByKeyPortal(const KeysPortal& keys,
833  const ValuesPortal& values,
834  const OutputPortal& output)
835  {
836  using KeyType = typename KeysPortal::ValueType;
837  using ValueType = typename OutputPortal::ValueType;
838  ScanInclusiveByKeyPortal(
839  keys, values, output, ::thrust::equal_to<KeyType>(), ::thrust::plus<ValueType>());
840  }
841 
842  template <typename KeysPortal,
843  typename ValuesPortal,
844  typename OutputPortal,
845  typename BinaryPredicate,
846  typename AssociativeOperator>
847  VTKM_CONT static void ScanInclusiveByKeyPortal(const KeysPortal& keys,
848  const ValuesPortal& values,
849  const OutputPortal& output,
850  BinaryPredicate binary_predicate,
851  AssociativeOperator binary_operator)
852  {
853  using KeyType = typename KeysPortal::ValueType;
854  vtkm::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
855  binary_predicate);
856  using ValueType = typename OutputPortal::ValueType;
857  vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
858  binary_operator);
859 
860  try
861  {
862  ::thrust::inclusive_scan_by_key(ThrustCudaPolicyPerThread,
863  cuda::internal::IteratorBegin(keys),
864  cuda::internal::IteratorEnd(keys),
865  cuda::internal::IteratorBegin(values),
866  cuda::internal::IteratorBegin(output),
867  bpred,
868  bop);
869  }
870  catch (...)
871  {
872  cuda::internal::throwAsVTKmException();
873  }
874  }
875 
876  template <typename KeysPortal, typename ValuesPortal, typename OutputPortal>
877  VTKM_CONT static void ScanExclusiveByKeyPortal(const KeysPortal& keys,
878  const ValuesPortal& values,
879  const OutputPortal& output)
880  {
881  using KeyType = typename KeysPortal::ValueType;
882  using ValueType = typename OutputPortal::ValueType;
883  ScanExclusiveByKeyPortal(keys,
884  values,
885  output,
887  ::thrust::equal_to<KeyType>(),
888  ::thrust::plus<ValueType>());
889  }
890 
891  template <typename KeysPortal,
892  typename ValuesPortal,
893  typename OutputPortal,
894  typename T,
895  typename BinaryPredicate,
896  typename AssociativeOperator>
897  VTKM_CONT static void ScanExclusiveByKeyPortal(const KeysPortal& keys,
898  const ValuesPortal& values,
899  const OutputPortal& output,
900  T initValue,
901  BinaryPredicate binary_predicate,
902  AssociativeOperator binary_operator)
903  {
904  using KeyType = typename KeysPortal::ValueType;
905  vtkm::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
906  binary_predicate);
907  using ValueType = typename OutputPortal::ValueType;
908  vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
909  binary_operator);
910  try
911  {
912  ::thrust::exclusive_scan_by_key(ThrustCudaPolicyPerThread,
913  cuda::internal::IteratorBegin(keys),
914  cuda::internal::IteratorEnd(keys),
915  cuda::internal::IteratorBegin(values),
916  cuda::internal::IteratorBegin(output),
917  initValue,
918  bpred,
919  bop);
920  }
921  catch (...)
922  {
923  cuda::internal::throwAsVTKmException();
924  }
925  }
926 
927  template <class ValuesPortal>
928  VTKM_CONT static void SortPortal(const ValuesPortal& values)
929  {
930  using ValueType = typename ValuesPortal::ValueType;
931  SortPortal(values, ::thrust::less<ValueType>());
932  }
933 
934  template <class ValuesPortal, class BinaryCompare>
935  VTKM_CONT static void SortPortal(const ValuesPortal& values, BinaryCompare binary_compare)
936  {
937  using ValueType = typename ValuesPortal::ValueType;
938  vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
939  binary_compare);
940  try
941  {
942  ::thrust::sort(vtkm_cuda_policy(),
943  cuda::internal::IteratorBegin(values),
944  cuda::internal::IteratorEnd(values),
945  bop);
946  }
947  catch (...)
948  {
949  cuda::internal::throwAsVTKmException();
950  }
951  }
952 
953  template <class KeysPortal, class ValuesPortal>
954  VTKM_CONT static void SortByKeyPortal(const KeysPortal& keys, const ValuesPortal& values)
955  {
956  using ValueType = typename KeysPortal::ValueType;
957  SortByKeyPortal(keys, values, ::thrust::less<ValueType>());
958  }
959 
960  template <class KeysPortal, class ValuesPortal, class BinaryCompare>
961  VTKM_CONT static void SortByKeyPortal(const KeysPortal& keys,
962  const ValuesPortal& values,
963  BinaryCompare binary_compare)
964  {
965  using ValueType = typename KeysPortal::ValueType;
966  vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
967  binary_compare);
968  try
969  {
970  ::thrust::sort_by_key(vtkm_cuda_policy(),
971  cuda::internal::IteratorBegin(keys),
972  cuda::internal::IteratorEnd(keys),
973  cuda::internal::IteratorBegin(values),
974  bop);
975  }
976  catch (...)
977  {
978  cuda::internal::throwAsVTKmException();
979  }
980  }
981 
982  template <class ValuesPortal>
983  VTKM_CONT static vtkm::Id UniquePortal(const ValuesPortal values)
984  {
985  try
986  {
987  auto begin = cuda::internal::IteratorBegin(values);
988  auto newLast =
989  ::thrust::unique(ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values));
990  return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
991  }
992  catch (...)
993  {
994  cuda::internal::throwAsVTKmException();
995  return vtkm::Id(0);
996  }
997  }
998 
999  template <class ValuesPortal, class BinaryCompare>
1000  VTKM_CONT static vtkm::Id UniquePortal(const ValuesPortal values, BinaryCompare binary_compare)
1001  {
1002  using ValueType = typename ValuesPortal::ValueType;
1003  vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
1004  binary_compare);
1005  try
1006  {
1007  auto begin = cuda::internal::IteratorBegin(values);
1008  auto newLast = ::thrust::unique(
1009  ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values), bop);
1010  return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
1011  }
1012  catch (...)
1013  {
1014  cuda::internal::throwAsVTKmException();
1015  return vtkm::Id(0);
1016  }
1017  }
1018 
1019  template <class InputPortal, class ValuesPortal, class OutputPortal>
1020  VTKM_CONT static void UpperBoundsPortal(const InputPortal& input,
1021  const ValuesPortal& values,
1022  const OutputPortal& output)
1023  {
1024  try
1025  {
1026  ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1027  cuda::internal::IteratorBegin(input),
1028  cuda::internal::IteratorEnd(input),
1029  cuda::internal::IteratorBegin(values),
1030  cuda::internal::IteratorEnd(values),
1031  cuda::internal::IteratorBegin(output));
1032  }
1033  catch (...)
1034  {
1035  cuda::internal::throwAsVTKmException();
1036  }
1037  }
1038 
1039  template <class InputPortal, class ValuesPortal, class OutputPortal, class BinaryCompare>
1040  VTKM_CONT static void UpperBoundsPortal(const InputPortal& input,
1041  const ValuesPortal& values,
1042  const OutputPortal& output,
1043  BinaryCompare binary_compare)
1044  {
1045  using ValueType = typename OutputPortal::ValueType;
1046 
1047  vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
1048  binary_compare);
1049  try
1050  {
1051  ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1052  cuda::internal::IteratorBegin(input),
1053  cuda::internal::IteratorEnd(input),
1054  cuda::internal::IteratorBegin(values),
1055  cuda::internal::IteratorEnd(values),
1056  cuda::internal::IteratorBegin(output),
1057  bop);
1058  }
1059  catch (...)
1060  {
1061  cuda::internal::throwAsVTKmException();
1062  }
1063  }
1064 
1065  template <class InputPortal, class OutputPortal>
1066  VTKM_CONT static void UpperBoundsPortal(const InputPortal& input,
1067  const OutputPortal& values_output)
1068  {
1069  try
1070  {
1071  ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1072  cuda::internal::IteratorBegin(input),
1073  cuda::internal::IteratorEnd(input),
1074  cuda::internal::IteratorBegin(values_output),
1075  cuda::internal::IteratorEnd(values_output),
1076  cuda::internal::IteratorBegin(values_output));
1077  }
1078  catch (...)
1079  {
1080  cuda::internal::throwAsVTKmException();
1081  }
1082  }
1083 
1084  template <typename GlobalPopCountType, typename BitsPortal, typename IndicesPortal>
1085  VTKM_CONT static vtkm::Id BitFieldToUnorderedSetPortal(const BitsPortal& bits,
1086  const IndicesPortal& indices)
1087  {
1088  using Functor = BitFieldToUnorderedSetFunctor<BitsPortal, IndicesPortal, GlobalPopCountType>;
1089 
1090  // RAII for the global atomic counter.
1091  auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
1092  Functor functor{ bits, indices, globalCount.get() };
1093 
1094  functor.Initialize();
1095  Schedule(functor, bits.GetNumberOfWords());
1096  Synchronize(); // Ensure kernel is done before checking final atomic count
1097  return functor.Finalize();
1098  }
1099 
1100  template <typename GlobalPopCountType, typename BitsPortal>
1101  VTKM_CONT static vtkm::Id CountSetBitsPortal(const BitsPortal& bits)
1102  {
1103  using Functor = CountSetBitsFunctor<BitsPortal, GlobalPopCountType>;
1104 
1105  // RAII for the global atomic counter.
1106  auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
1107  Functor functor{ bits, globalCount.get() };
1108 
1109  functor.Initialize();
1110  Schedule(functor, bits.GetNumberOfWords());
1111  Synchronize(); // Ensure kernel is done before checking final atomic count
1112  return functor.Finalize();
1113  }
1114 
1115  //-----------------------------------------------------------------------------
1116 
1117 public:
1118  template <typename IndicesStorage>
1120  const vtkm::cont::BitField& bits,
1122  {
1124 
1125  vtkm::Id numBits = bits.GetNumberOfBits();
1126 
1127  {
1128  vtkm::cont::Token token;
1129  auto bitsPortal = bits.PrepareForInput(DeviceAdapterTagCuda{}, token);
1130  auto indicesPortal = indices.PrepareForOutput(numBits, DeviceAdapterTagCuda{}, token);
1131 
1132  // Use a uint64 for accumulator, as atomicAdd does not support signed int64.
1133  numBits = BitFieldToUnorderedSetPortal<vtkm::UInt64>(bitsPortal, indicesPortal);
1134  }
1135 
1136  indices.Allocate(numBits, vtkm::CopyFlag::On);
1137  return numBits;
1138  }
1139 
1140  template <typename T, typename U, class SIn, class SOut>
1143  {
1145 
1146  const vtkm::Id inSize = input.GetNumberOfValues();
1147  if (inSize <= 0)
1148  {
1149  output.Allocate(inSize, vtkm::CopyFlag::On);
1150  return;
1151  }
1152  vtkm::cont::Token token;
1153  CopyPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1154  output.PrepareForOutput(inSize, DeviceAdapterTagCuda(), token));
1155  }
1156 
1157  template <typename T, typename U, class SIn, class SStencil, class SOut>
1159  const vtkm::cont::ArrayHandle<T, SStencil>& stencil,
1161  {
1163 
1164  vtkm::Id size = stencil.GetNumberOfValues();
1165  if (size <= 0)
1166  {
1167  output.Allocate(size, vtkm::CopyFlag::On);
1168  return;
1169  }
1170 
1171  vtkm::Id newSize;
1172 
1173  {
1174  vtkm::cont::Token token;
1175 
1176  newSize = CopyIfPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1177  stencil.PrepareForInput(DeviceAdapterTagCuda(), token),
1178  output.PrepareForOutput(size, DeviceAdapterTagCuda(), token),
1179  ::vtkm::NotZeroInitialized()); //yes on the stencil
1180  }
1181 
1182  output.Allocate(newSize, vtkm::CopyFlag::On);
1183  }
1184 
1185  template <typename T, typename U, class SIn, class SStencil, class SOut, class UnaryPredicate>
1187  const vtkm::cont::ArrayHandle<T, SStencil>& stencil,
1189  UnaryPredicate unary_predicate)
1190  {
1192 
1193  vtkm::Id size = stencil.GetNumberOfValues();
1194  if (size <= 0)
1195  {
1196  output.Allocate(size, vtkm::CopyFlag::On);
1197  return;
1198  }
1199 
1200  vtkm::Id newSize;
1201 
1202  {
1203  vtkm::cont::Token token;
1204  newSize = CopyIfPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1205  stencil.PrepareForInput(DeviceAdapterTagCuda(), token),
1206  output.PrepareForOutput(size, DeviceAdapterTagCuda(), token),
1207  unary_predicate);
1208  }
1209 
1210  output.Allocate(newSize, vtkm::CopyFlag::On);
1211  }
1212 
1213  template <typename T, typename U, class SIn, class SOut>
1215  vtkm::Id inputStartIndex,
1216  vtkm::Id numberOfElementsToCopy,
1218  vtkm::Id outputIndex = 0)
1219  {
1221 
1222  const vtkm::Id inSize = input.GetNumberOfValues();
1223 
1224  // Check if the ranges overlap and fail if they do.
1225  if (input == output &&
1226  ((outputIndex >= inputStartIndex &&
1227  outputIndex < inputStartIndex + numberOfElementsToCopy) ||
1228  (inputStartIndex >= outputIndex &&
1229  inputStartIndex < outputIndex + numberOfElementsToCopy)))
1230  {
1231  return false;
1232  }
1233 
1234  if (inputStartIndex < 0 || numberOfElementsToCopy < 0 || outputIndex < 0 ||
1235  inputStartIndex >= inSize)
1236  { //invalid parameters
1237  return false;
1238  }
1239 
1240  //determine if the numberOfElementsToCopy needs to be reduced
1241  if (inSize < (inputStartIndex + numberOfElementsToCopy))
1242  { //adjust the size
1243  numberOfElementsToCopy = (inSize - inputStartIndex);
1244  }
1245 
1246  const vtkm::Id outSize = output.GetNumberOfValues();
1247  const vtkm::Id copyOutEnd = outputIndex + numberOfElementsToCopy;
1248  if (outSize < copyOutEnd)
1249  { //output is not large enough
1250  if (outSize == 0)
1251  { //since output has nothing, just need to allocate to correct length
1252  output.Allocate(copyOutEnd);
1253  }
1254  else
1255  { //we currently have data in this array, so preserve it in the new
1256  //resized array
1258  temp.Allocate(copyOutEnd);
1259  CopySubRange(output, 0, outSize, temp);
1260  output = temp;
1261  }
1262  }
1263  vtkm::cont::Token token;
1264  CopySubRangePortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1265  inputStartIndex,
1266  numberOfElementsToCopy,
1267  output.PrepareForInPlace(DeviceAdapterTagCuda(), token),
1268  outputIndex);
1269  return true;
1270  }
1271 
1273  {
1275  vtkm::cont::Token token;
1276  auto bitsPortal = bits.PrepareForInput(DeviceAdapterTagCuda{}, token);
1277  // Use a uint64 for accumulator, as atomicAdd does not support signed int64.
1278  return CountSetBitsPortal<vtkm::UInt64>(bitsPortal);
1279  }
1280 
1281  template <typename T, class SIn, class SVal, class SOut>
1283  const vtkm::cont::ArrayHandle<T, SVal>& values,
1285  {
1287 
1288  vtkm::Id numberOfValues = values.GetNumberOfValues();
1289  vtkm::cont::Token token;
1290  LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1291  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1292  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1293  }
1294 
1295  template <typename T, class SIn, class SVal, class SOut, class BinaryCompare>
1297  const vtkm::cont::ArrayHandle<T, SVal>& values,
1299  BinaryCompare binary_compare)
1300  {
1302 
1303  vtkm::Id numberOfValues = values.GetNumberOfValues();
1304  vtkm::cont::Token token;
1305  LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1306  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1307  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1308  binary_compare);
1309  }
1310 
1311  template <class SIn, class SOut>
1314  {
1316 
1317  vtkm::cont::Token token;
1318  LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1319  values_output.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1320  }
1321 
1322  template <typename T, typename U, class SIn>
1323  VTKM_CONT static U Reduce(const vtkm::cont::ArrayHandle<T, SIn>& input, U initialValue)
1324  {
1326 
1327  const vtkm::Id numberOfValues = input.GetNumberOfValues();
1328  if (numberOfValues <= 0)
1329  {
1330  return initialValue;
1331  }
1332  vtkm::cont::Token token;
1333  return ReducePortal(input.PrepareForInput(DeviceAdapterTagCuda(), token), initialValue);
1334  }
1335 
1336  template <typename T, typename U, class SIn, class BinaryFunctor>
1338  U initialValue,
1339  BinaryFunctor binary_functor)
1340  {
1342 
1343  const vtkm::Id numberOfValues = input.GetNumberOfValues();
1344  if (numberOfValues <= 0)
1345  {
1346  return initialValue;
1347  }
1348  vtkm::cont::Token token;
1349  return ReducePortal(
1350  input.PrepareForInput(DeviceAdapterTagCuda(), token), initialValue, binary_functor);
1351  }
1352 
1353  // At least some versions of Thrust/nvcc result in compile errors when calling Thrust's
1354  // reduce with sufficiently complex iterators, which can happen with some versions of
1355  // ArrayHandleMultiplexer. Thus, don't use the Thrust version for ArrayHandleMultiplexer.
1356  template <typename T, typename U, typename... SIns>
1359  U initialValue)
1360  {
1361  return Superclass::Reduce(input, initialValue);
1362  }
1363  template <typename T, typename U, typename BinaryFunctor, typename... SIns>
1366  U initialValue,
1367  BinaryFunctor binary_functor)
1368  {
1369  return Superclass::Reduce(input, initialValue, binary_functor);
1370  }
1371 
1372  template <typename T,
1373  typename U,
1374  class KIn,
1375  class VIn,
1376  class KOut,
1377  class VOut,
1378  class BinaryFunctor>
1380  const vtkm::cont::ArrayHandle<U, VIn>& values,
1381  vtkm::cont::ArrayHandle<T, KOut>& keys_output,
1382  vtkm::cont::ArrayHandle<U, VOut>& values_output,
1383  BinaryFunctor binary_functor)
1384  {
1386 
1387  //there is a concern that by default we will allocate too much
1388  //space for the keys/values output. 1 option is to
1389  const vtkm::Id numberOfValues = keys.GetNumberOfValues();
1390  if (numberOfValues <= 0)
1391  {
1392  return;
1393  }
1394 
1395  vtkm::Id reduced_size;
1396  {
1397  vtkm::cont::Token token;
1398  reduced_size = ReduceByKeyPortal(
1399  keys.PrepareForInput(DeviceAdapterTagCuda(), token),
1400  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1401  keys_output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1402  values_output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1403  binary_functor);
1404  }
1405 
1406  keys_output.Allocate(reduced_size, vtkm::CopyFlag::On);
1407  values_output.Allocate(reduced_size, vtkm::CopyFlag::On);
1408  }
1409 
1410  template <typename T, class SIn, class SOut>
1413  {
1415 
1416  const vtkm::Id numberOfValues = input.GetNumberOfValues();
1417  if (numberOfValues <= 0)
1418  {
1419  output.Allocate(0);
1421  }
1422 
1423  //We need call PrepareForInput on the input argument before invoking a
1424  //function. The order of execution of parameters of a function is undefined
1425  //so we need to make sure input is called before output, or else in-place
1426  //use case breaks.
1427  vtkm::cont::Token token;
1428  auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
1429  return ScanExclusivePortal(
1430  inputPortal, output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1431  }
1432 
1433  template <typename T, class SIn, class SOut, class BinaryFunctor>
1436  BinaryFunctor binary_functor,
1437  const T& initialValue)
1438  {
1440 
1441  const vtkm::Id numberOfValues = input.GetNumberOfValues();
1442  if (numberOfValues <= 0)
1443  {
1444  output.Allocate(0);
1446  }
1447 
1448  //We need call PrepareForInput on the input argument before invoking a
1449  //function. The order of execution of parameters of a function is undefined
1450  //so we need to make sure input is called before output, or else in-place
1451  //use case breaks.
1452  vtkm::cont::Token token;
1453  auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
1454  return ScanExclusivePortal(
1455  inputPortal,
1456  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1457  binary_functor,
1458  initialValue);
1459  }
1460 
1461  template <typename T, class SIn, class SOut>
1464  {
1466 
1467  const vtkm::Id numberOfValues = input.GetNumberOfValues();
1468  if (numberOfValues <= 0)
1469  {
1470  output.Allocate(0);
1472  }
1473 
1474  //We need call PrepareForInput on the input argument before invoking a
1475  //function. The order of execution of parameters of a function is undefined
1476  //so we need to make sure input is called before output, or else in-place
1477  //use case breaks.
1478  vtkm::cont::Token token;
1479  auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
1480  return ScanInclusivePortal(
1481  inputPortal, output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1482  }
1483 
1484  template <typename T, class SIn, class SOut, class BinaryFunctor>
1487  BinaryFunctor binary_functor)
1488  {
1490 
1491  const vtkm::Id numberOfValues = input.GetNumberOfValues();
1492  if (numberOfValues <= 0)
1493  {
1494  output.Allocate(0);
1496  }
1497 
1498  //We need call PrepareForInput on the input argument before invoking a
1499  //function. The order of execution of parameters of a function is undefined
1500  //so we need to make sure input is called before output, or else in-place
1501  //use case breaks.
1502  vtkm::cont::Token token;
1503  auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
1504  return ScanInclusivePortal(
1505  inputPortal,
1506  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1507  binary_functor);
1508  }
1509 
1510  template <typename T, typename U, typename KIn, typename VIn, typename VOut>
1512  const vtkm::cont::ArrayHandle<U, VIn>& values,
1514  {
1516 
1517  const vtkm::Id numberOfValues = keys.GetNumberOfValues();
1518  if (numberOfValues <= 0)
1519  {
1520  output.Allocate(0);
1521  return;
1522  }
1523 
1524  //We need call PrepareForInput on the input argument before invoking a
1525  //function. The order of execution of parameters of a function is undefined
1526  //so we need to make sure input is called before output, or else in-place
1527  //use case breaks.
1528  vtkm::cont::Token token;
1529  auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
1530  auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
1531  ScanInclusiveByKeyPortal(
1532  keysPortal,
1533  valuesPortal,
1534  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1535  }
1536 
1537  template <typename T,
1538  typename U,
1539  typename KIn,
1540  typename VIn,
1541  typename VOut,
1542  typename BinaryFunctor>
1544  const vtkm::cont::ArrayHandle<U, VIn>& values,
1546  BinaryFunctor binary_functor)
1547  {
1549 
1550  const vtkm::Id numberOfValues = keys.GetNumberOfValues();
1551  if (numberOfValues <= 0)
1552  {
1553  output.Allocate(0);
1554  return;
1555  }
1556 
1557  //We need call PrepareForInput on the input argument before invoking a
1558  //function. The order of execution of parameters of a function is undefined
1559  //so we need to make sure input is called before output, or else in-place
1560  //use case breaks.
1561  vtkm::cont::Token token;
1562  auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
1563  auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
1564  ScanInclusiveByKeyPortal(keysPortal,
1565  valuesPortal,
1566  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1567  ::thrust::equal_to<T>(),
1568  binary_functor);
1569  }
1570 
1571  template <typename T, typename U, typename KIn, typename VIn, typename VOut>
1573  const vtkm::cont::ArrayHandle<U, VIn>& values,
1575  {
1577 
1578  const vtkm::Id numberOfValues = keys.GetNumberOfValues();
1579  if (numberOfValues <= 0)
1580  {
1581  output.Allocate(0);
1582  return;
1583  }
1584 
1585  //We need call PrepareForInput on the input argument before invoking a
1586  //function. The order of execution of parameters of a function is undefined
1587  //so we need to make sure input is called before output, or else in-place
1588  //use case breaks.
1589  vtkm::cont::Token token;
1590  auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
1591  auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
1592  ScanExclusiveByKeyPortal(keysPortal,
1593  valuesPortal,
1594  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1596  ::thrust::equal_to<T>(),
1597  vtkm::Add());
1598  }
1599 
1600  template <typename T,
1601  typename U,
1602  typename KIn,
1603  typename VIn,
1604  typename VOut,
1605  typename BinaryFunctor>
1607  const vtkm::cont::ArrayHandle<U, VIn>& values,
1609  const U& initialValue,
1610  BinaryFunctor binary_functor)
1611  {
1613 
1614  const vtkm::Id numberOfValues = keys.GetNumberOfValues();
1615  if (numberOfValues <= 0)
1616  {
1617  output.Allocate(0);
1618  return;
1619  }
1620 
1621  //We need call PrepareForInput on the input argument before invoking a
1622  //function. The order of execution of parameters of a function is undefined
1623  //so we need to make sure input is called before output, or else in-place
1624  //use case breaks.
1625  vtkm::cont::Token token;
1626  auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
1627  auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
1628  ScanExclusiveByKeyPortal(keysPortal,
1629  valuesPortal,
1630  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1631  initialValue,
1632  ::thrust::equal_to<T>(),
1633  binary_functor);
1634  }
1635 
1636  // we use cuda pinned memory to reduce the amount of synchronization
1637  // and mem copies between the host and device.
1638  struct VTKM_CONT_EXPORT PinnedErrorArray
1639  {
1640  char* HostPtr = nullptr;
1641  char* DevicePtr = nullptr;
1642  vtkm::Id Size = 0;
1643  };
1644 
1645  VTKM_CONT_EXPORT
1646  static const PinnedErrorArray& GetPinnedErrorArray();
1647 
1648  VTKM_CONT_EXPORT
1649  static void CheckForErrors(); // throws vtkm::cont::ErrorExecution
1650 
1651  VTKM_CONT_EXPORT
1652  static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
1653 
1654  VTKM_CONT_EXPORT
1655  static void GetBlocksAndThreads(vtkm::UInt32& blocks,
1656  vtkm::UInt32& threadsPerBlock,
1657  vtkm::Id size);
1658 
1659  VTKM_CONT_EXPORT
1660  static void GetBlocksAndThreads(vtkm::UInt32& blocks, dim3& threadsPerBlock, const dim3& size);
1661 
1662  VTKM_CONT_EXPORT
1663  static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
1664  const std::type_info& worklet_info,
1665  vtkm::UInt32 blocks,
1666  vtkm::UInt32 threadsPerBlock,
1667  vtkm::Id size);
1668 
1669  VTKM_CONT_EXPORT
1670  static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
1671  const std::type_info& worklet_info,
1672  vtkm::UInt32 blocks,
1673  dim3 threadsPerBlock,
1674  const dim3& size);
1675 
1676 public:
1677  template <typename WType, typename IType>
1678  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>& functor,
1679  vtkm::Id numInstances)
1680  {
1682 
1683  VTKM_ASSERT(numInstances >= 0);
1684  if (numInstances < 1)
1685  {
1686  // No instances means nothing to run. Just return.
1687  return;
1688  }
1689 
1690  CheckForErrors();
1691  SetupErrorBuffer(functor);
1692 
1693  vtkm::UInt32 blocks, threadsPerBlock;
1694  GetBlocksAndThreads(blocks, threadsPerBlock, numInstances);
1695 
1696 #ifdef VTKM_ENABLE_LOGGING
1698  {
1699  using FunctorType = vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>;
1700  cudaFuncAttributes empty_kernel_attrs;
1701  VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
1702  cuda::internal::TaskStrided1DLaunch<FunctorType>));
1703  LogKernelLaunch(empty_kernel_attrs, typeid(WType), blocks, threadsPerBlock, numInstances);
1704  }
1705 #endif
1706 
1707  cuda::internal::TaskStrided1DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
1708  functor, numInstances);
1709  }
1710 
1711  template <typename WType, typename IType>
1712  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>& functor,
1713  vtkm::Id3 rangeMax)
1714  {
1716 
1717  VTKM_ASSERT((rangeMax[0] >= 0) && (rangeMax[1] >= 0) && (rangeMax[2] >= 0));
1718  if ((rangeMax[0] < 1) || (rangeMax[1] < 1) || (rangeMax[2] < 1))
1719  {
1720  // No instances means nothing to run. Just return.
1721  return;
1722  }
1723 
1724  CheckForErrors();
1725  SetupErrorBuffer(functor);
1726 
1727  const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
1728  static_cast<vtkm::UInt32>(rangeMax[1]),
1729  static_cast<vtkm::UInt32>(rangeMax[2]));
1730 
1731  vtkm::UInt32 blocks;
1732  dim3 threadsPerBlock;
1733  GetBlocksAndThreads(blocks, threadsPerBlock, ranges);
1734 
1735 #ifdef VTKM_ENABLE_LOGGING
1737  {
1738  using FunctorType = vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>;
1739  cudaFuncAttributes empty_kernel_attrs;
1740  VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
1741  cuda::internal::TaskStrided3DLaunch<FunctorType>));
1742  LogKernelLaunch(empty_kernel_attrs, typeid(WType), blocks, threadsPerBlock, ranges);
1743  }
1744 #endif
1745 
1746  cuda::internal::TaskStrided3DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
1747  functor, rangeMax);
1748  }
1749 
1750  template <class Functor>
1751  VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
1752  {
1754 
1755  vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType> kernel(functor);
1756 
1757  ScheduleTask(kernel, numInstances);
1758  }
1759 
1760  template <class Functor>
1761  VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax)
1762  {
1764 
1765  vtkm::exec::cuda::internal::TaskStrided3D<Functor, vtkm::internal::NullType> kernel(functor);
1766  ScheduleTask(kernel, rangeMax);
1767  }
1768 
1769  template <typename T, class Storage>
1771  {
1773 
1774  vtkm::cont::Token token;
1775  SortPortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1776  }
1777 
1778  template <typename T, class Storage, class BinaryCompare>
1780  BinaryCompare binary_compare)
1781  {
1783 
1784  vtkm::cont::Token token;
1785  SortPortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token), binary_compare);
1786  }
1787 
1788  template <typename T, typename U, class StorageT, class StorageU>
1791  {
1793 
1794  vtkm::cont::Token token;
1795  SortByKeyPortal(keys.PrepareForInPlace(DeviceAdapterTagCuda(), token),
1796  values.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1797  }
1798 
1799  template <typename T, typename U, class StorageT, class StorageU, class BinaryCompare>
1802  BinaryCompare binary_compare)
1803  {
1805 
1806  vtkm::cont::Token token;
1807  SortByKeyPortal(keys.PrepareForInPlace(DeviceAdapterTagCuda(), token),
1808  values.PrepareForInPlace(DeviceAdapterTagCuda(), token),
1809  binary_compare);
1810  }
1811 
1812  template <typename T, class Storage>
1814  {
1816 
1817  vtkm::Id newSize;
1818 
1819  {
1820  vtkm::cont::Token token;
1821  newSize = UniquePortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1822  }
1823 
1824  values.Allocate(newSize, vtkm::CopyFlag::On);
1825  }
1826 
1827  template <typename T, class Storage, class BinaryCompare>
1829  BinaryCompare binary_compare)
1830  {
1832 
1833  vtkm::Id newSize;
1834  {
1835  vtkm::cont::Token token;
1836  newSize =
1837  UniquePortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token), binary_compare);
1838  }
1839 
1840  values.Allocate(newSize, vtkm::CopyFlag::On);
1841  }
1842 
1843  template <typename T, class SIn, class SVal, class SOut>
1845  const vtkm::cont::ArrayHandle<T, SVal>& values,
1847  {
1849 
1850  vtkm::Id numberOfValues = values.GetNumberOfValues();
1851  vtkm::cont::Token token;
1852  UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1853  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1854  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1855  }
1856 
1857  template <typename T, class SIn, class SVal, class SOut, class BinaryCompare>
1859  const vtkm::cont::ArrayHandle<T, SVal>& values,
1861  BinaryCompare binary_compare)
1862  {
1864 
1865  vtkm::Id numberOfValues = values.GetNumberOfValues();
1866  vtkm::cont::Token token;
1867  UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1868  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1869  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1870  binary_compare);
1871  }
1872 
1873  template <class SIn, class SOut>
1876  {
1878 
1879  vtkm::cont::Token token;
1880  UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1881  values_output.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1882  }
1883 
1884  VTKM_CONT static void Synchronize()
1885  {
1887 
1888  VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
1889  CheckForErrors();
1890  }
1891 };
1892 
1893 template <>
1894 class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagCuda>
1895 {
1896 public:
1897  template <typename WorkletType, typename InvocationType>
1898  static vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>
1899  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id)
1900  {
1901  using Task = vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>;
1902  return Task(worklet, invocation);
1903  }
1904 
1905  template <typename WorkletType, typename InvocationType>
1906  static vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>
1907  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3)
1908  {
1909  using Task = vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>;
1910  return Task(worklet, invocation);
1911  }
1912 };
1913 }
1914 } // namespace vtkm::cont
1915 
1916 #endif //vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
vtkm::cont::ArrayHandle::GetNumberOfValues
VTKM_CONT vtkm::Id GetNumberOfValues() const
Returns the number of entries in the array.
Definition: ArrayHandle.h:448
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::SortByKeyPortal
static VTKM_CONT void SortByKeyPortal(const KeysPortal &keys, const ValuesPortal &values)
Definition: DeviceAdapterAlgorithmCuda.h:954
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UniquePortal
static VTKM_CONT vtkm::Id UniquePortal(const ValuesPortal values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1000
vtkm::cont::ArrayHandle
Manages an array-worth of data.
Definition: ArrayHandle.h:283
vtkm::cont::cuda::ScheduleParameters
Represents how to schedule 1D, 2D, and 3D Cuda kernels.
Definition: DeviceAdapterAlgorithmCuda.h:81
ArrayHandle.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::SortByKeyPortal
static VTKM_CONT void SortByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:961
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UpperBoundsPortal
static VTKM_CONT void UpperBoundsPortal(const InputPortal &input, const ValuesPortal &values, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:1020
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Finalize
VTKM_CONT vtkm::Id Finalize() const
Definition: DeviceAdapterAlgorithmCuda.h:340
VTKM_EXEC
#define VTKM_EXEC
Definition: ExportMacros.h:51
vtkm
Groups connected points that have the same field value.
Definition: Atomic.h:19
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::LowerBoundsPortal
static VTKM_CONT void LowerBoundsPortal(const InputPortal &input, const ValuesPortal &values, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:581
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusiveByKey
static VTKM_CONT void ScanInclusiveByKey(const vtkm::cont::ArrayHandle< T, KIn > &keys, const vtkm::cont::ArrayHandle< U, VIn > &values, vtkm::cont::ArrayHandle< U, VOut > &output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1543
vtkm::TypeTraits
The TypeTraits class provides helpful compile-time information about the basic types used in VTKm (an...
Definition: TypeTraits.h:61
vtkm::cont::BitField::PrepareForInput
VTKM_CONT ReadPortalType PrepareForInput(vtkm::cont::DeviceAdapterId device, vtkm::cont::Token &token) const
Prepares this BitField to be used as an input to an operation in the execution environment.
vtkm::cont::cuda::InitScheduleParameters
VTKM_CONT_EXPORT void InitScheduleParameters(vtkm::cont::cuda::ScheduleParameters(*)(char const *name, int major, int minor, int multiProcessorCount, int maxThreadsPerMultiProcessor, int maxThreadsPerBlock))
Specify the custom scheduling to use for VTK-m CUDA kernel launches.
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Copy
static VTKM_CONT void Copy(const vtkm::cont::ArrayHandle< T, SIn > &input, vtkm::cont::ArrayHandle< U, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1141
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusive
static VTKM_CONT T ScanExclusive(const vtkm::cont::ArrayHandle< T, SIn > &input, vtkm::cont::ArrayHandle< T, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1411
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScheduleTask
static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D< WType, IType > &functor, vtkm::Id3 rangeMax)
Definition: DeviceAdapterAlgorithmCuda.h:1712
Types.h
vtkm::cont::cuda::ScheduleParameters::three_d_blocks
int three_d_blocks
Definition: DeviceAdapterAlgorithmCuda.h:89
vtkm::cont::DeviceAdapterAlgorithm::VIn
static VTKM_CONT T VIn
Definition: DeviceAdapterAlgorithm.h:349
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::ReduceAllocate
VTKM_SUPPRESS_EXEC_WARNINGS __device__ void ReduceAllocate() const
Definition: DeviceAdapterAlgorithmCuda.h:355
VTKM_ASSERT
#define VTKM_ASSERT(condition)
Definition: Assert.h:43
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::LowerBounds
static VTKM_CONT void LowerBounds(const vtkm::cont::ArrayHandle< T, SIn > &input, const vtkm::cont::ArrayHandle< T, SVal > &values, vtkm::cont::ArrayHandle< vtkm::Id, SOut > &output, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1296
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::SortPortal
static VTKM_CONT void SortPortal(const ValuesPortal &values)
Definition: DeviceAdapterAlgorithmCuda.h:928
vtkm::cont::ArrayHandle::Allocate
VTKM_CONT void Allocate(vtkm::Id numberOfValues, vtkm::CopyFlag preserve, vtkm::cont::Token &token) const
Allocates an array large enough to hold the given number of values.
Definition: ArrayHandle.h:465
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Sort
static VTKM_CONT void Sort(vtkm::cont::ArrayHandle< T, Storage > &values)
Definition: DeviceAdapterAlgorithmCuda.h:1770
vtkm::cont::ArrayHandle::PrepareForInput
VTKM_CONT ReadPortalType PrepareForInput(vtkm::cont::DeviceAdapterId device, vtkm::cont::Token &token) const
Prepares this array to be used as an input to an operation in the execution environment.
Definition: ArrayHandle.h:574
UnaryPredicates.h
vtkm::Get
VTKM_SUPPRESS_EXEC_WARNINGS VTKM_EXEC_CONT auto Get(const vtkm::Tuple< Ts... > &tuple) -> decltype(tuple.template Get< Index >())
Retrieve the object from a vtkm::Tuple at the given index.
Definition: Tuple.h:83
vtkm::cont::DeviceAdapterAlgorithm::Reduce
static VTKM_CONT U Reduce(const vtkm::cont::ArrayHandle< T, CIn > &input, U initialValue)
Compute a accumulated sum operation on the input ArrayHandle.
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetPortal
static VTKM_CONT vtkm::Id BitFieldToUnorderedSetPortal(const BitsPortal &bits, const IndicesPortal &indices)
Definition: DeviceAdapterAlgorithmCuda.h:1085
BitField.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::GetNextOutputIndex
VTKM_SUPPRESS_EXEC_WARNINGS __device__ vtkm::Id GetNextOutputIndex() const
Definition: DeviceAdapterAlgorithmCuda.h:384
DeviceAdapterAlgorithmGeneral.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScheduleTask
static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D< WType, IType > &functor, vtkm::Id numInstances)
Definition: DeviceAdapterAlgorithmCuda.h:1678
vtkm::cont::cuda::ScheduleParameters::two_d_threads_per_block
dim3 two_d_threads_per_block
Definition: DeviceAdapterAlgorithmCuda.h:87
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::operator()
VTKM_SUPPRESS_EXEC_WARNINGS __device__ void operator()(vtkm::Id wordIdx) const
Definition: DeviceAdapterAlgorithmCuda.h:519
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Schedule
static VTKM_CONT void Schedule(Functor functor, const vtkm::Id3 &rangeMax)
Definition: DeviceAdapterAlgorithmCuda.h:1761
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Sort
static VTKM_CONT void Sort(vtkm::cont::ArrayHandle< T, Storage > &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1779
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Output
const IndicesPortal Output
Definition: DeviceAdapterAlgorithmCuda.h:399
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::BitFieldToUnorderedSetFunctor
VTKM_CONT BitFieldToUnorderedSetFunctor(const BitsPortal &input, const IndicesPortal &output, GlobalPopCountType *globalPopCount)
Definition: DeviceAdapterAlgorithmCuda.h:293
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::SortByKey
static VTKM_CONT void SortByKey(vtkm::cont::ArrayHandle< T, StorageT > &keys, vtkm::cont::ArrayHandle< U, StorageU > &values)
Definition: DeviceAdapterAlgorithmCuda.h:1789
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusivePortal
static VTKM_CONT InputPortal::ValueType ScanInclusivePortal(const InputPortal &input, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:795
vtkm::cont::cuda::ScheduleParameters::one_d_threads_per_block
int one_d_threads_per_block
Definition: DeviceAdapterAlgorithmCuda.h:84
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusive
static VTKM_CONT T ScanInclusive(const vtkm::cont::ArrayHandle< T, SIn > &input, vtkm::cont::ArrayHandle< T, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1462
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Finalize
VTKM_CONT vtkm::Id Finalize() const
Definition: DeviceAdapterAlgorithmCuda.h:535
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSet
static VTKM_CONT vtkm::Id BitFieldToUnorderedSet(const vtkm::cont::BitField &bits, vtkm::cont::ArrayHandle< Id, IndicesStorage > &indices)
Definition: DeviceAdapterAlgorithmCuda.h:1119
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UpperBounds
static VTKM_CONT void UpperBounds(const vtkm::cont::ArrayHandle< T, SIn > &input, const vtkm::cont::ArrayHandle< T, SVal > &values, vtkm::cont::ArrayHandle< vtkm::Id, SOut > &output, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1858
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsPortal
static VTKM_CONT vtkm::Id CountSetBitsPortal(const BitsPortal &bits)
Definition: DeviceAdapterAlgorithmCuda.h:1101
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusiveByKeyPortal
static VTKM_CONT void ScanExclusiveByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const OutputPortal &output, T initValue, BinaryPredicate binary_predicate, AssociativeOperator binary_operator)
Definition: DeviceAdapterAlgorithmCuda.h:897
vtkm::cont::ArrayHandle::PrepareForInPlace
VTKM_CONT WritePortalType PrepareForInPlace(vtkm::cont::DeviceAdapterId device, vtkm::cont::Token &token) const
Prepares this array to be used in an in-place operation (both as input and output) in the execution e...
Definition: ArrayHandle.h:593
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Reduce
static VTKM_CONT U Reduce(const vtkm::cont::ArrayHandle< T, vtkm::cont::StorageTagMultiplexer< SIns... >> &input, U initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:1357
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >
Definition: DeviceAdapterAlgorithmCuda.h:262
vtkm::cont::DeviceAdapterAlgorithm::KIn
static VTKM_CONT T KIn
Definition: DeviceAdapterAlgorithm.h:348
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::LowerBoundsPortal
static VTKM_CONT void LowerBoundsPortal(const InputPortal &input, const ValuesPortal &values, const OutputPortal &output, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:598
vtkm::cont::GetStderrLogLevel
VTKM_CONT_EXPORT VTKM_CONT vtkm::cont::LogLevel GetStderrLogLevel()
Get the active highest log level that will be printed to stderr.
DeviceAdapterAlgorithm.h
DeviceAdapterTimerImplementationCuda.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::operator()
VTKM_SUPPRESS_EXEC_WARNINGS __device__ void operator()(vtkm::Id wordIdx) const
Definition: DeviceAdapterAlgorithmCuda.h:313
vtkm::Id
vtkm::Int32 Id
Represents an ID (index into arrays).
Definition: Types.h:191
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusiveByKeyPortal
static VTKM_CONT void ScanInclusiveByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const OutputPortal &output, BinaryPredicate binary_predicate, AssociativeOperator binary_operator)
Definition: DeviceAdapterAlgorithmCuda.h:847
DeviceAdapterRuntimeDetectorCuda.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopyPortal
static VTKM_CONT void CopyPortal(const InputPortal &input, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:409
ThrustPatches.h
DeviceAdapterTagCuda.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Unique
static VTKM_CONT void Unique(vtkm::cont::ArrayHandle< T, Storage > &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1828
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Word
vtkm::AtomicTypePreferred Word
Definition: DeviceAdapterAlgorithmCuda.h:287
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::LowerBounds
static VTKM_CONT void LowerBounds(const vtkm::cont::ArrayHandle< vtkm::Id, SIn > &input, vtkm::cont::ArrayHandle< vtkm::Id, SOut > &values_output)
Definition: DeviceAdapterAlgorithmCuda.h:1312
vtkm::Add
Definition: Types.h:222
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Initialize
VTKM_CONT void Initialize()
Definition: DeviceAdapterAlgorithmCuda.h:512
vtkm::cont::Token
A token to hold the scope of an ArrayHandle or other object.
Definition: Token.h:35
TypeTraits.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::GlobalPopCount
GlobalPopCountType * GlobalPopCount
Definition: DeviceAdapterAlgorithmCuda.h:573
ThrustExceptionHandler.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::LowerBoundsPortal
static VTKM_CONT void LowerBoundsPortal(const InputPortal &input, const OutputPortal &values_output)
Definition: DeviceAdapterAlgorithmCuda.h:590
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopyIf
static VTKM_CONT void CopyIf(const vtkm::cont::ArrayHandle< U, SIn > &input, const vtkm::cont::ArrayHandle< T, SStencil > &stencil, vtkm::cont::ArrayHandle< U, SOut > &output, UnaryPredicate unary_predicate)
Definition: DeviceAdapterAlgorithmCuda.h:1186
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Portal
const BitsPortal Portal
Definition: DeviceAdapterAlgorithmCuda.h:572
vtkm::cont::DeviceTaskTypes< vtkm::cont::DeviceAdapterTagCuda >::MakeTask
static vtkm::exec::cuda::internal::TaskStrided3D< WorkletType, InvocationType > MakeTask(WorkletType &worklet, InvocationType &invocation, vtkm::Id3)
Definition: DeviceAdapterAlgorithmCuda.h:1907
Math.h
VTKM_STATIC_ASSERT
#define VTKM_STATIC_ASSERT(condition)
Definition: StaticAssert.h:16
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopyIfPortal
static VTKM_CONT vtkm::Id CopyIfPortal(ValuePortal values, StencilPortal stencil, OutputPortal output, UnaryPredicate unary_predicate)
Definition: DeviceAdapterAlgorithmCuda.h:456
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBits
static VTKM_CONT vtkm::Id CountSetBits(const vtkm::cont::BitField &bits)
Definition: DeviceAdapterAlgorithmCuda.h:1272
VTKM_CUDA_CALL
#define VTKM_CUDA_CALL(command)
A macro that can be wrapped around a CUDA command and will throw an ErrorCuda exception if the CUDA c...
Definition: ErrorCuda.h:38
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ReduceByKey
static VTKM_CONT void ReduceByKey(const vtkm::cont::ArrayHandle< T, KIn > &keys, const vtkm::cont::ArrayHandle< U, VIn > &values, vtkm::cont::ArrayHandle< T, KOut > &keys_output, vtkm::cont::ArrayHandle< U, VOut > &values_output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1379
ErrorMessageBuffer.h
vtkm::cont::LogLevel::KernelLaunches
@ KernelLaunches
Details on Device-side Kernel Launches.
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusivePortal
static VTKM_CONT InputPortal::ValueType ScanInclusivePortal(const InputPortal &input, const OutputPortal &output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:803
VTKM_STATIC_ASSERT_MSG
#define VTKM_STATIC_ASSERT_MSG(condition, message)
Definition: StaticAssert.h:18
vtkm::worklet::zfp::Word
vtkm::UInt64 Word
Definition: ZFPBlockReader.h:22
MakeThrustIterator.h
vtkm::FindFirstSetBit
VTKM_EXEC_CONT vtkm::Int32 FindFirstSetBit(vtkm::UInt32 word)
Bitwise operations.
Definition: Math.h:2791
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusiveByKey
static VTKM_CONT void ScanExclusiveByKey(const vtkm::cont::ArrayHandle< T, KIn > &keys, const vtkm::cont::ArrayHandle< U, VIn > &values, vtkm::cont::ArrayHandle< U, VOut > &output, const U &initialValue, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1606
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UpperBoundsPortal
static VTKM_CONT void UpperBoundsPortal(const InputPortal &input, const OutputPortal &values_output)
Definition: DeviceAdapterAlgorithmCuda.h:1066
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::~CountSetBitsFunctor
~CountSetBitsFunctor()
Definition: DeviceAdapterAlgorithmCuda.h:510
vtkm::cont::DeviceTaskTypes
Class providing a device-specific support for selecting the optimal Task type for a given worklet.
Definition: DeviceAdapterAlgorithm.h:744
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusiveByKey
static VTKM_CONT void ScanExclusiveByKey(const vtkm::cont::ArrayHandle< T, KIn > &keys, const vtkm::cont::ArrayHandle< U, VIn > &values, vtkm::cont::ArrayHandle< U, VOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1572
VTKM_CONT
#define VTKM_CONT
Definition: ExportMacros.h:57
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusivePortal
static VTKM_CONT InputPortal::ValueType ScanExclusivePortal(const InputPortal &input, const OutputPortal &output, BinaryFunctor binaryOp, typename InputPortal::ValueType initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:747
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusiveByKeyPortal
static VTKM_CONT void ScanInclusiveByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:832
VTKM_LOG_SCOPE_FUNCTION
#define VTKM_LOG_SCOPE_FUNCTION(level)
Definition: Logging.h:266
vtkm::cont::DeviceAdapterAlgorithm
Struct containing device adapter algorithms.
Definition: DeviceAdapterAlgorithm.h:41
vtkm::CopyFlag::On
@ On
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ReducePortalImpl
static VTKM_CONT T ReducePortalImpl(const InputPortal &input, T initialValue, BinaryFunctor binary_functor, std::false_type)
Definition: DeviceAdapterAlgorithmCuda.h:665
vtkm::cont::DeviceAdapterAlgorithm::VOut
static VTKM_CONT T VOut
Definition: DeviceAdapterAlgorithm.h:350
vtkm::AtomicTypePreferred
vtkm::UInt32 AtomicTypePreferred
The preferred type to use for atomic operations.
Definition: Atomic.h:763
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusivePortal
static VTKM_CONT InputPortal::ValueType ScanExclusivePortal(const InputPortal &input, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:735
vtkm::cont::cuda::ScheduleParameters::one_d_blocks
int one_d_blocks
Definition: DeviceAdapterAlgorithmCuda.h:83
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopySubRange
static VTKM_CONT bool CopySubRange(const vtkm::cont::ArrayHandle< T, SIn > &input, vtkm::Id inputStartIndex, vtkm::Id numberOfElementsToCopy, vtkm::cont::ArrayHandle< U, SOut > &output, vtkm::Id outputIndex=0)
Definition: DeviceAdapterAlgorithmCuda.h:1214
ErrorExecution.h
vtkm::cont::DeviceAdapterAlgorithm::U
static VTKM_CONT T U
Definition: DeviceAdapterAlgorithm.h:347
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Word
vtkm::AtomicTypePreferred Word
Definition: DeviceAdapterAlgorithmCuda.h:499
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Initialize
VTKM_CONT void Initialize()
Definition: DeviceAdapterAlgorithmCuda.h:306
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UpperBounds
static VTKM_CONT void UpperBounds(const vtkm::cont::ArrayHandle< vtkm::Id, SIn > &input, vtkm::cont::ArrayHandle< vtkm::Id, SOut > &values_output)
Definition: DeviceAdapterAlgorithmCuda.h:1874
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopyIf
static VTKM_CONT void CopyIf(const vtkm::cont::ArrayHandle< U, SIn > &input, const vtkm::cont::ArrayHandle< T, SStencil > &stencil, vtkm::cont::ArrayHandle< U, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1158
vtkm::exec::FunctorBase
Base class for all user worklets invoked in the execution environment from a call to vtkm::cont::Devi...
Definition: FunctorBase.h:30
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Reduce
VTKM_SUPPRESS_EXEC_WARNINGS __device__ void Reduce() const
Definition: DeviceAdapterAlgorithmCuda.h:549
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ReducePortal
static VTKM_CONT T ReducePortal(const InputPortal &input, T initialValue, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:630
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Unique
static VTKM_CONT void Unique(vtkm::cont::ArrayHandle< T, Storage > &values)
Definition: DeviceAdapterAlgorithmCuda.h:1813
vtkm::Vec< vtkm::Id, 3 >
vtkm::cont::DeviceAdapterAlgorithm::CopySubRange
static VTKM_CONT bool CopySubRange(const vtkm::cont::ArrayHandle< T, CIn > &input, vtkm::Id inputStartIndex, vtkm::Id numberOfElementsToCopy, vtkm::cont::ArrayHandle< U, COut > &output, vtkm::Id outputIndex=0)
Copy the contents of a section of one ArrayHandle to another.
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopySubRangePortal
static VTKM_CONT void CopySubRangePortal(const InputPortal &input, vtkm::Id inputOffset, vtkm::Id size, const OutputPortal &output, vtkm::Id outputOffset)
Definition: DeviceAdapterAlgorithmCuda.h:469
vtkm::UInt32
uint32_t UInt32
Definition: Types.h:161
vtkm::cont::BitField
Definition: BitField.h:497
ArrayHandleMultiplexer.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::CountSetBitsFunctor
VTKM_CONT CountSetBitsFunctor(const BitsPortal &portal, GlobalPopCountType *globalPopCount)
Definition: DeviceAdapterAlgorithmCuda.h:502
vtkm::NotZeroInitialized
Predicate that takes a single argument x, and returns True if it isn't the identity of the Type T.
Definition: UnaryPredicates.h:32
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusiveByKey
static VTKM_CONT void ScanInclusiveByKey(const vtkm::cont::ArrayHandle< T, KIn > &keys, const vtkm::cont::ArrayHandle< U, VIn > &values, vtkm::cont::ArrayHandle< U, VOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1511
vtkm::cont::ArrayHandle::PrepareForOutput
VTKM_CONT WritePortalType PrepareForOutput(vtkm::Id numberOfValues, vtkm::cont::DeviceAdapterId device, vtkm::cont::Token &token) const
Prepares (allocates) this array to be used as an output from an operation in the execution environmen...
Definition: ArrayHandle.h:613
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::SortByKey
static VTKM_CONT void SortByKey(vtkm::cont::ArrayHandle< T, StorageT > &keys, vtkm::cont::ArrayHandle< U, StorageU > &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1800
vtkm::cont::BitField::GetNumberOfBits
VTKM_CONT vtkm::Id GetNumberOfBits() const
Return the number of bits stored by this BitField.
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Schedule
static VTKM_CONT void Schedule(Functor functor, vtkm::Id numInstances)
Definition: DeviceAdapterAlgorithmCuda.h:1751
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ReduceByKeyPortal
static VTKM_CONT vtkm::Id ReduceByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const KeysOutputPortal &keys_output, const ValueOutputPortal &values_output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:699
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UniquePortal
static VTKM_CONT vtkm::Id UniquePortal(const ValuesPortal values)
Definition: DeviceAdapterAlgorithmCuda.h:983
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UpperBounds
static VTKM_CONT void UpperBounds(const vtkm::cont::ArrayHandle< T, SIn > &input, const vtkm::cont::ArrayHandle< T, SVal > &values, vtkm::cont::ArrayHandle< vtkm::Id, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1844
vtkm::cont::DeviceAdapterAlgorithm::Schedule
static VTKM_CONT void Schedule(Functor functor, vtkm::Id numInstances)
Schedule many instances of a function to run on concurrent threads.
vtkm::Int32
int32_t Int32
Definition: Types.h:160
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Synchronize
static VTKM_CONT void Synchronize()
Definition: DeviceAdapterAlgorithmCuda.h:1884
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::SortPortal
static VTKM_CONT void SortPortal(const ValuesPortal &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:935
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Reduce
static VTKM_CONT U Reduce(const vtkm::cont::ArrayHandle< T, vtkm::cont::StorageTagMultiplexer< SIns... >> &input, U initialValue, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1364
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Superclass
vtkm::cont::internal::DeviceAdapterAlgorithmGeneral< vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >, vtkm::cont::DeviceAdapterTagCuda > Superclass
Definition: DeviceAdapterAlgorithmCuda.h:275
vtkm::cont::cuda::ScheduleParameters::three_d_threads_per_block
dim3 three_d_threads_per_block
Definition: DeviceAdapterAlgorithmCuda.h:90
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::GlobalPopCount
GlobalPopCountType * GlobalPopCount
Definition: DeviceAdapterAlgorithmCuda.h:400
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusiveByKeyPortal
static VTKM_CONT void ScanExclusiveByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:877
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopyIfPortal
static VTKM_CONT vtkm::Id CopyIfPortal(ValueIterator valuesBegin, ValueIterator valuesEnd, StencilPortal stencil, OutputPortal output, UnaryPredicate unary_predicate)
Definition: DeviceAdapterAlgorithmCuda.h:425
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::LowerBounds
static VTKM_CONT void LowerBounds(const vtkm::cont::ArrayHandle< T, SIn > &input, const vtkm::cont::ArrayHandle< T, SVal > &values, vtkm::cont::ArrayHandle< vtkm::Id, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1282
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusive
static VTKM_CONT T ScanExclusive(const vtkm::cont::ArrayHandle< T, SIn > &input, vtkm::cont::ArrayHandle< T, SOut > &output, BinaryFunctor binary_functor, const T &initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:1434
Logging.h
Logging utilities.
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Reduce
static VTKM_CONT U Reduce(const vtkm::cont::ArrayHandle< T, SIn > &input, U initialValue, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1337
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Input
const BitsPortal Input
Definition: DeviceAdapterAlgorithmCuda.h:398
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Reduce
static VTKM_CONT U Reduce(const vtkm::cont::ArrayHandle< T, SIn > &input, U initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:1323
vtkm::CountSetBits
VTKM_EXEC_CONT vtkm::Int32 CountSetBits(vtkm::UInt32 word)
Count the total number of bits set in word.
Definition: Math.h:2887
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusive
static VTKM_CONT T ScanInclusive(const vtkm::cont::ArrayHandle< T, SIn > &input, vtkm::cont::ArrayHandle< T, SOut > &output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1485
vtkm::cont::cuda::ScheduleParameters::two_d_blocks
int two_d_blocks
Definition: DeviceAdapterAlgorithmCuda.h:86
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ReducePortalImpl
static VTKM_CONT T ReducePortalImpl(const InputPortal &input, T initialValue, BinaryFunctor binary_functor, std::true_type)
Definition: DeviceAdapterAlgorithmCuda.h:639
vtkm::cont::DeviceTaskTypes< vtkm::cont::DeviceAdapterTagCuda >::MakeTask
static vtkm::exec::cuda::internal::TaskStrided1D< WorkletType, InvocationType > MakeTask(WorkletType &worklet, InvocationType &invocation, vtkm::Id)
Definition: DeviceAdapterAlgorithmCuda.h:1899
ErrorCuda.h
Token.h
VTKM_SUPPRESS_EXEC_WARNINGS
#define VTKM_SUPPRESS_EXEC_WARNINGS
Definition: ExportMacros.h:53
vtkm::TypeTraits::ZeroInitialization
static VTKM_EXEC_CONT T ZeroInitialization()
Definition: TypeTraits.h:75
TaskStrided.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::~BitFieldToUnorderedSetFunctor
~BitFieldToUnorderedSetFunctor()
Definition: DeviceAdapterAlgorithmCuda.h:304
vtkm::cont::LogLevel::Perf
@ Perf
General timing data and algorithm flow information, such as filter execution, worklet dispatches,...
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ReducePortal
static VTKM_CONT T ReducePortal(const InputPortal &input, T initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:624
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UpperBoundsPortal
static VTKM_CONT void UpperBoundsPortal(const InputPortal &input, const ValuesPortal &values, const OutputPortal &output, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1040
WrappedOperators.h
vtkm::cont::StorageTagMultiplexer
Definition: ArrayHandleMultiplexer.h:141