VTK-m  2.2
DeviceAdapterAlgorithmCuda.h
Go to the documentation of this file.
1 //============================================================================
2 // Copyright (c) Kitware, Inc.
3 // All rights reserved.
4 // See LICENSE.txt for details.
5 //
6 // This software is distributed WITHOUT ANY WARRANTY; without even
7 // the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8 // PURPOSE. See the above copyright notice for more information.
9 //============================================================================
10 #ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
11 #define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
12 
13 #include <vtkm/Math.h>
14 #include <vtkm/TypeTraits.h>
15 #include <vtkm/Types.h>
16 #include <vtkm/UnaryPredicates.h>
17 
18 #include <vtkm/cont/ArrayHandle.h>
20 #include <vtkm/cont/BitField.h>
23 #include <vtkm/cont/Logging.h>
24 #include <vtkm/cont/Token.h>
26 
28 
36 
39 
40 // Disable warnings we check vtkm for but Thrust does not.
43 //needs to be first
44 #include <vtkm/exec/cuda/internal/ExecutionPolicy.h>
45 
46 #include <cooperative_groups.h>
47 #include <cuda.h>
48 #include <thrust/advance.h>
49 #include <thrust/binary_search.h>
50 #include <thrust/copy.h>
51 #include <thrust/count.h>
52 #include <thrust/iterator/counting_iterator.h>
53 #include <thrust/scan.h>
54 #include <thrust/sort.h>
55 #include <thrust/system/cpp/memory.h>
56 #include <thrust/system/cuda/vector.h>
57 #include <thrust/unique.h>
58 
60 
61 #include <limits>
62 #include <memory>
63 
64 namespace vtkm
65 {
66 namespace cont
67 {
68 namespace cuda
69 {
70 
82 {
85 
88 
91 };
92 
128  vtkm::cont::cuda::ScheduleParameters (*)(char const* name,
129  int major,
130  int minor,
131  int multiProcessorCount,
132  int maxThreadsPerMultiProcessor,
133  int maxThreadsPerBlock));
134 
135 namespace internal
136 {
137 
138 #if (defined(VTKM_GCC) || defined(VTKM_CLANG))
139 #pragma GCC diagnostic push
140 #pragma GCC diagnostic ignored "-Wunused-parameter"
141 #endif
142 
143 template <typename TaskType>
144 __global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id size)
145 {
146  //see https://devblogs.nvidia.com/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
147  //for why our inc is grid-stride
148  const vtkm::Id start = blockIdx.x * blockDim.x + threadIdx.x;
149  const vtkm::Id inc = blockDim.x * gridDim.x;
150  task(start, size, inc);
151 }
152 
153 template <typename TaskType>
154 __global__ void TaskStrided3DLaunch(TaskType task, vtkm::Id3 size)
155 {
156  //This is the 3D version of executing in a grid-stride manner
157  const dim3 start(blockIdx.x * blockDim.x + threadIdx.x,
158  blockIdx.y * blockDim.y + threadIdx.y,
159  blockIdx.z * blockDim.z + threadIdx.z);
160  const dim3 inc(blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z);
161 
162  for (vtkm::Id k = start.z; k < size[2]; k += inc.z)
163  {
164  for (vtkm::Id j = start.y; j < size[1]; j += inc.y)
165  {
166  task(size, start.x, size[0], inc.x, j, k);
167  }
168  }
169 }
170 
171 template <typename T, typename BinaryOperationType>
172 __global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
173 {
174  result = binary_op(a, b);
175 }
176 
177 #if (defined(VTKM_GCC) || defined(VTKM_CLANG))
178 #pragma GCC diagnostic pop
179 #endif
180 
181 template <typename FunctorType, typename ArgType>
182 struct FunctorSupportsUnaryImpl
183 {
184  template <typename F, typename A, typename = decltype(std::declval<F>()(std::declval<A>()))>
185  static std::true_type has(int);
186  template <typename F, typename A>
187  static std::false_type has(...);
188  using type = decltype(has<FunctorType, ArgType>(0));
189 };
190 template <typename FunctorType, typename ArgType>
191 using FunctorSupportsUnary = typename FunctorSupportsUnaryImpl<FunctorType, ArgType>::type;
192 
193 template <typename PortalType,
194  typename BinaryAndUnaryFunctor,
195  typename = FunctorSupportsUnary<BinaryAndUnaryFunctor, typename PortalType::ValueType>>
196 struct CastPortal;
197 
198 template <typename PortalType, typename BinaryAndUnaryFunctor>
199 struct CastPortal<PortalType, BinaryAndUnaryFunctor, std::true_type>
200 {
201  using InputType = typename PortalType::ValueType;
202  using ValueType = decltype(std::declval<BinaryAndUnaryFunctor>()(std::declval<InputType>()));
203 
204  PortalType Portal;
205  BinaryAndUnaryFunctor Functor;
206 
207  VTKM_CONT
208  CastPortal(const PortalType& portal, const BinaryAndUnaryFunctor& functor)
209  : Portal(portal)
210  , Functor(functor)
211  {
212  }
213 
214  VTKM_EXEC
215  vtkm::Id GetNumberOfValues() const { return this->Portal.GetNumberOfValues(); }
216 
217  VTKM_EXEC
218  ValueType Get(vtkm::Id index) const { return this->Functor(this->Portal.Get(index)); }
219 };
220 
221 template <typename PortalType, typename BinaryFunctor>
222 struct CastPortal<PortalType, BinaryFunctor, std::false_type>
223 {
224  using InputType = typename PortalType::ValueType;
225  using ValueType =
226  decltype(std::declval<BinaryFunctor>()(std::declval<InputType>(), std::declval<InputType>()));
227 
228  PortalType Portal;
229 
230  VTKM_CONT
231  CastPortal(const PortalType& portal, const BinaryFunctor&)
232  : Portal(portal)
233  {
234  }
235 
236  VTKM_EXEC
237  vtkm::Id GetNumberOfValues() const { return this->Portal.GetNumberOfValues(); }
238 
239  VTKM_EXEC
240  ValueType Get(vtkm::Id index) const { return static_cast<ValueType>(this->Portal.Get(index)); }
241 };
242 
243 struct CudaFreeFunctor
244 {
245  void operator()(void* ptr) const { VTKM_CUDA_CALL(cudaFree(ptr)); }
246 };
247 
248 template <typename T>
249 using CudaUniquePtr = std::unique_ptr<T, CudaFreeFunctor>;
250 
251 template <typename T>
252 CudaUniquePtr<T> make_CudaUniquePtr(std::size_t numElements)
253 {
254  T* ptr;
255  VTKM_CUDA_CALL(cudaMalloc(&ptr, sizeof(T) * numElements));
256  return CudaUniquePtr<T>(ptr);
257 }
258 }
259 } // end namespace cuda::internal
260 
261 template <>
263  : vtkm::cont::internal::DeviceAdapterAlgorithmGeneral<
264  vtkm::cont::DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>,
265  vtkm::cont::DeviceAdapterTagCuda>
266 {
267 // Because of some funny code conversions in nvcc, kernels for devices have to
268 // be public.
269 #ifndef VTKM_CUDA
270 private:
271 #endif
272 
273  using Superclass = vtkm::cont::internal::DeviceAdapterAlgorithmGeneral<
276 
277  template <typename BitsPortal, typename IndicesPortal, typename GlobalPopCountType>
278  struct BitFieldToUnorderedSetFunctor : public vtkm::exec::FunctorBase
279  {
280  VTKM_STATIC_ASSERT_MSG(VTKM_PASS_COMMAS(std::is_same<GlobalPopCountType, vtkm::Int32>::value ||
281  std::is_same<GlobalPopCountType, vtkm::UInt32>::value ||
282  std::is_same<GlobalPopCountType, vtkm::UInt64>::value),
283  "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
284 
285  //Using typename BitsPortal::WordTypePreferred causes dependent type errors using GCC 4.8.5
286  //which is the GCC required compiler for CUDA 9.2 on summit/power9
288 
290  VTKM_PASS_COMMAS(std::is_same<typename IndicesPortal::ValueType, vtkm::Id>::value));
291 
292  VTKM_CONT
293  BitFieldToUnorderedSetFunctor(const BitsPortal& input,
294  const IndicesPortal& output,
295  GlobalPopCountType* globalPopCount)
296  : Input{ input }
297  , Output{ output }
298  , GlobalPopCount{ globalPopCount }
299  , FinalWordIndex{ input.GetNumberOfWords() - 1 }
300  , FinalWordMask(input.GetFinalWordMask())
301  {
302  }
303 
305 
307  {
308  assert(this->GlobalPopCount != nullptr);
309  VTKM_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0, sizeof(GlobalPopCountType)));
310  }
311 
313  __device__ void operator()(vtkm::Id wordIdx) const
314  {
315  Word word = this->Input.GetWord(wordIdx);
316 
317  // The last word may be partial -- mask out trailing bits if needed.
318  const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~Word{ 0 };
319 
320  word &= mask;
321 
322  if (word != 0)
323  {
324  this->LocalPopCount = vtkm::CountSetBits(word);
325  this->ReduceAllocate();
326 
327  vtkm::Id firstBitIdx = wordIdx * sizeof(Word) * CHAR_BIT;
328  do
329  {
330  // Find next bit. FindFirstSetBit's result is indexed starting at 1.
331  vtkm::Int32 bit = vtkm::FindFirstSetBit(word) - 1;
332  vtkm::Id outIdx = this->GetNextOutputIndex();
333  // Write index of bit
334  this->Output.Set(outIdx, firstBitIdx + bit);
335  word ^= (1 << bit); // clear bit
336  } while (word != 0); // have bits
337  }
338  }
339 
341  {
342  assert(this->GlobalPopCount != nullptr);
343  GlobalPopCountType result;
344  VTKM_CUDA_CALL(cudaMemcpy(
345  &result, this->GlobalPopCount, sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
346  return static_cast<vtkm::Id>(result);
347  }
348 
349  private:
350  // Every thread with a non-zero local popcount calls this function, which
351  // computes the total popcount for the coalesced threads and allocates
352  // a contiguous block in the output by atomically increasing the global
353  // popcount.
355  __device__ void ReduceAllocate() const
356  {
357  const auto activeLanes = cooperative_groups::coalesced_threads();
358  const int activeRank = activeLanes.thread_rank();
359  const int activeSize = activeLanes.size();
360 
361  // Reduction value:
362  vtkm::Int32 rVal = this->LocalPopCount;
363  for (int delta = 1; delta < activeSize; delta *= 2)
364  {
365  const vtkm::Int32 shflVal = activeLanes.shfl_down(rVal, delta);
366  if (activeRank + delta < activeSize)
367  {
368  rVal += shflVal;
369  }
370  }
371 
372  if (activeRank == 0)
373  {
374  this->AllocationHead =
375  atomicAdd(this->GlobalPopCount, static_cast<GlobalPopCountType>(rVal));
376  }
377 
378  this->AllocationHead = activeLanes.shfl(this->AllocationHead, 0);
379  }
380 
381  // The global output allocation is written to by striding the writes across
382  // the warp lanes, allowing the writes to global memory to be coalesced.
384  __device__ vtkm::Id GetNextOutputIndex() const
385  {
386  // Only lanes with unwritten output indices left will call this method,
387  // so just check the coalesced threads:
388  const auto activeLanes = cooperative_groups::coalesced_threads();
389  const int activeRank = activeLanes.thread_rank();
390  const int activeSize = activeLanes.size();
391 
392  vtkm::Id nextIdx = static_cast<vtkm::Id>(this->AllocationHead + activeRank);
393  this->AllocationHead += activeSize;
394 
395  return nextIdx;
396  }
397 
398  const BitsPortal Input;
399  const IndicesPortal Output;
400  GlobalPopCountType* GlobalPopCount;
401  mutable vtkm::UInt64 AllocationHead{ 0 };
402  mutable vtkm::Int32 LocalPopCount{ 0 };
403  // Used to mask trailing bits the in last word.
404  vtkm::Id FinalWordIndex{ 0 };
405  Word FinalWordMask{ 0 };
406  };
407 
408  template <class InputPortal, class OutputPortal>
409  VTKM_CONT static void CopyPortal(const InputPortal& input, const OutputPortal& output)
410  {
411  try
412  {
413  ::thrust::copy(ThrustCudaPolicyPerThread,
414  cuda::internal::IteratorBegin(input),
415  cuda::internal::IteratorEnd(input),
416  cuda::internal::IteratorBegin(output));
417  }
418  catch (...)
419  {
420  cuda::internal::throwAsVTKmException();
421  }
422  }
423 
424  template <class ValueIterator, class StencilPortal, class OutputPortal, class UnaryPredicate>
425  VTKM_CONT static vtkm::Id CopyIfPortal(ValueIterator valuesBegin,
426  ValueIterator valuesEnd,
427  StencilPortal stencil,
428  OutputPortal output,
429  UnaryPredicate unary_predicate)
430  {
431  auto outputBegin = cuda::internal::IteratorBegin(output);
432 
433  using ValueType = typename StencilPortal::ValueType;
434 
435  vtkm::exec::cuda::internal::WrappedUnaryPredicate<ValueType, UnaryPredicate> up(
436  unary_predicate);
437 
438  try
439  {
440  auto newLast = ::thrust::copy_if(ThrustCudaPolicyPerThread,
441  valuesBegin,
442  valuesEnd,
443  cuda::internal::IteratorBegin(stencil),
444  outputBegin,
445  up);
446  return static_cast<vtkm::Id>(::thrust::distance(outputBegin, newLast));
447  }
448  catch (...)
449  {
450  cuda::internal::throwAsVTKmException();
451  return vtkm::Id(0);
452  }
453  }
454 
455  template <class ValuePortal, class StencilPortal, class OutputPortal, class UnaryPredicate>
456  VTKM_CONT static vtkm::Id CopyIfPortal(ValuePortal values,
457  StencilPortal stencil,
458  OutputPortal output,
459  UnaryPredicate unary_predicate)
460  {
461  return CopyIfPortal(cuda::internal::IteratorBegin(values),
462  cuda::internal::IteratorEnd(values),
463  stencil,
464  output,
465  unary_predicate);
466  }
467 
468  template <class InputPortal, class OutputPortal>
469  VTKM_CONT static void CopySubRangePortal(const InputPortal& input,
470  vtkm::Id inputOffset,
471  vtkm::Id size,
472  const OutputPortal& output,
473  vtkm::Id outputOffset)
474  {
475  try
476  {
477  ::thrust::copy_n(ThrustCudaPolicyPerThread,
478  cuda::internal::IteratorBegin(input) + inputOffset,
479  static_cast<std::size_t>(size),
480  cuda::internal::IteratorBegin(output) + outputOffset);
481  }
482  catch (...)
483  {
484  cuda::internal::throwAsVTKmException();
485  }
486  }
487 
488 
489  template <typename BitsPortal, typename GlobalPopCountType>
490  struct CountSetBitsFunctor : public vtkm::exec::FunctorBase
491  {
492  VTKM_STATIC_ASSERT_MSG(VTKM_PASS_COMMAS(std::is_same<GlobalPopCountType, vtkm::Int32>::value ||
493  std::is_same<GlobalPopCountType, vtkm::UInt32>::value ||
494  std::is_same<GlobalPopCountType, vtkm::UInt64>::value),
495  "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
496 
497  //Using typename BitsPortal::WordTypePreferred causes dependent type errors using GCC 4.8.5
498  //which is the GCC required compiler for CUDA 9.2 on summit/power9
500 
501  VTKM_CONT
502  CountSetBitsFunctor(const BitsPortal& portal, GlobalPopCountType* globalPopCount)
503  : Portal{ portal }
504  , GlobalPopCount{ globalPopCount }
505  , FinalWordIndex{ portal.GetNumberOfWords() - 1 }
506  , FinalWordMask{ portal.GetFinalWordMask() }
507  {
508  }
509 
511 
513  {
514  assert(this->GlobalPopCount != nullptr);
515  VTKM_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0, sizeof(GlobalPopCountType)));
516  }
517 
519  __device__ void operator()(vtkm::Id wordIdx) const
520  {
521  Word word = this->Portal.GetWord(wordIdx);
522 
523  // The last word may be partial -- mask out trailing bits if needed.
524  const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~Word{ 0 };
525 
526  word &= mask;
527 
528  if (word != 0)
529  {
530  this->LocalPopCount = vtkm::CountSetBits(word);
531  this->Reduce();
532  }
533  }
534 
536  {
537  assert(this->GlobalPopCount != nullptr);
538  GlobalPopCountType result;
539  VTKM_CUDA_CALL(cudaMemcpy(
540  &result, this->GlobalPopCount, sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
541  return static_cast<vtkm::Id>(result);
542  }
543 
544  private:
545  // Every thread with a non-zero local popcount calls this function, which
546  // computes the total popcount for the coalesced threads and atomically
547  // increasing the global popcount.
549  __device__ void Reduce() const
550  {
551  const auto activeLanes = cooperative_groups::coalesced_threads();
552  const int activeRank = activeLanes.thread_rank();
553  const int activeSize = activeLanes.size();
554 
555  // Reduction value:
556  vtkm::Int32 rVal = this->LocalPopCount;
557  for (int delta = 1; delta < activeSize; delta *= 2)
558  {
559  const vtkm::Int32 shflVal = activeLanes.shfl_down(rVal, delta);
560  if (activeRank + delta < activeSize)
561  {
562  rVal += shflVal;
563  }
564  }
565 
566  if (activeRank == 0)
567  {
568  atomicAdd(this->GlobalPopCount, static_cast<GlobalPopCountType>(rVal));
569  }
570  }
571 
572  const BitsPortal Portal;
573  GlobalPopCountType* GlobalPopCount;
574  mutable vtkm::Int32 LocalPopCount{ 0 };
575  // Used to mask trailing bits the in last word.
576  vtkm::Id FinalWordIndex{ 0 };
577  Word FinalWordMask{ 0 };
578  };
579 
580  template <class InputPortal, class ValuesPortal, class OutputPortal>
581  VTKM_CONT static void LowerBoundsPortal(const InputPortal& input,
582  const ValuesPortal& values,
583  const OutputPortal& output)
584  {
585  using ValueType = typename ValuesPortal::ValueType;
586  LowerBoundsPortal(input, values, output, ::thrust::less<ValueType>());
587  }
588 
589  template <class InputPortal, class OutputPortal>
590  VTKM_CONT static void LowerBoundsPortal(const InputPortal& input,
591  const OutputPortal& values_output)
592  {
593  using ValueType = typename InputPortal::ValueType;
594  LowerBoundsPortal(input, values_output, values_output, ::thrust::less<ValueType>());
595  }
596 
597  template <class InputPortal, class ValuesPortal, class OutputPortal, class BinaryCompare>
598  VTKM_CONT static void LowerBoundsPortal(const InputPortal& input,
599  const ValuesPortal& values,
600  const OutputPortal& output,
601  BinaryCompare binary_compare)
602  {
603  using ValueType = typename InputPortal::ValueType;
604  vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
605  binary_compare);
606 
607  try
608  {
609  ::thrust::lower_bound(ThrustCudaPolicyPerThread,
610  cuda::internal::IteratorBegin(input),
611  cuda::internal::IteratorEnd(input),
612  cuda::internal::IteratorBegin(values),
613  cuda::internal::IteratorEnd(values),
614  cuda::internal::IteratorBegin(output),
615  bop);
616  }
617  catch (...)
618  {
619  cuda::internal::throwAsVTKmException();
620  }
621  }
622 
623  template <class InputPortal, typename T>
624  VTKM_CONT static T ReducePortal(const InputPortal& input, T initialValue)
625  {
626  return ReducePortal(input, initialValue, ::thrust::plus<T>());
627  }
628 
629  template <class InputPortal, typename T, class BinaryFunctor>
630  VTKM_CONT static T ReducePortal(const InputPortal& input,
631  T initialValue,
632  BinaryFunctor binary_functor)
633  {
634  using fast_path = std::is_same<typename InputPortal::ValueType, T>;
635  return ReducePortalImpl(input, initialValue, binary_functor, fast_path());
636  }
637 
638  template <class InputPortal, typename T, class BinaryFunctor>
639  VTKM_CONT static T ReducePortalImpl(const InputPortal& input,
640  T initialValue,
641  BinaryFunctor binary_functor,
642  std::true_type)
643  {
644  //The portal type and the initial value are the same so we can use
645  //the thrust reduction algorithm
646  vtkm::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);
647 
648  try
649  {
650  return ::thrust::reduce(ThrustCudaPolicyPerThread,
651  cuda::internal::IteratorBegin(input),
652  cuda::internal::IteratorEnd(input),
653  initialValue,
654  bop);
655  }
656  catch (...)
657  {
658  cuda::internal::throwAsVTKmException();
659  }
660 
661  return initialValue;
662  }
663 
664  template <class InputPortal, typename T, class BinaryFunctor>
665  VTKM_CONT static T ReducePortalImpl(const InputPortal& input,
666  T initialValue,
667  BinaryFunctor binary_functor,
668  std::false_type)
669  {
670  //The portal type and the initial value AREN'T the same type so we have
671  //to a slower approach, where we wrap the input portal inside a cast
672  //portal
673  vtkm::cont::cuda::internal::CastPortal<InputPortal, BinaryFunctor> castPortal(input,
674  binary_functor);
675 
676  vtkm::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);
677 
678  try
679  {
680  return ::thrust::reduce(ThrustCudaPolicyPerThread,
681  cuda::internal::IteratorBegin(castPortal),
682  cuda::internal::IteratorEnd(castPortal),
683  initialValue,
684  bop);
685  }
686  catch (...)
687  {
688  cuda::internal::throwAsVTKmException();
689  }
690 
691  return initialValue;
692  }
693 
694  template <class KeysPortal,
695  class ValuesPortal,
696  class KeysOutputPortal,
697  class ValueOutputPortal,
698  class BinaryFunctor>
699  VTKM_CONT static vtkm::Id ReduceByKeyPortal(const KeysPortal& keys,
700  const ValuesPortal& values,
701  const KeysOutputPortal& keys_output,
702  const ValueOutputPortal& values_output,
703  BinaryFunctor binary_functor)
704  {
705  auto keys_out_begin = cuda::internal::IteratorBegin(keys_output);
706  auto values_out_begin = cuda::internal::IteratorBegin(values_output);
707 
708  ::thrust::pair<decltype(keys_out_begin), decltype(values_out_begin)> result_iterators;
709 
710  ::thrust::equal_to<typename KeysPortal::ValueType> binaryPredicate;
711 
712  using ValueType = typename ValuesPortal::ValueType;
713  vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binary_functor);
714 
715  try
716  {
717  result_iterators = ::thrust::reduce_by_key(vtkm_cuda_policy(),
718  cuda::internal::IteratorBegin(keys),
719  cuda::internal::IteratorEnd(keys),
720  cuda::internal::IteratorBegin(values),
721  keys_out_begin,
722  values_out_begin,
723  binaryPredicate,
724  bop);
725  }
726  catch (...)
727  {
728  cuda::internal::throwAsVTKmException();
729  }
730 
731  return static_cast<vtkm::Id>(::thrust::distance(keys_out_begin, result_iterators.first));
732  }
733 
734  template <class InputPortal, class OutputPortal>
735  VTKM_CONT static typename InputPortal::ValueType ScanExclusivePortal(const InputPortal& input,
736  const OutputPortal& output)
737  {
738  using ValueType = typename OutputPortal::ValueType;
739 
740  return ScanExclusivePortal(input,
741  output,
742  (::thrust::plus<ValueType>()),
744  }
745 
746  template <class InputPortal, class OutputPortal, class BinaryFunctor>
747  VTKM_CONT static typename InputPortal::ValueType ScanExclusivePortal(
748  const InputPortal& input,
749  const OutputPortal& output,
750  BinaryFunctor binaryOp,
751  typename InputPortal::ValueType initialValue)
752  {
753  // Use iterator to get value so that thrust device_ptr has chance to handle
754  // data on device.
755  using ValueType = typename OutputPortal::ValueType;
756 
757  //we have size three so that we can store the origin end value, the
758  //new end value, and the sum of those two
759  ::thrust::system::cuda::vector<ValueType> sum(3);
760  try
761  {
762 
763  //store the current value of the last position array in a separate cuda
764  //memory location since the exclusive_scan will overwrite that value
765  //once run
766  ::thrust::copy_n(
767  ThrustCudaPolicyPerThread, cuda::internal::IteratorEnd(input) - 1, 1, sum.begin());
768 
769  vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binaryOp);
770 
771  auto end = ::thrust::exclusive_scan(ThrustCudaPolicyPerThread,
772  cuda::internal::IteratorBegin(input),
773  cuda::internal::IteratorEnd(input),
774  cuda::internal::IteratorBegin(output),
775  initialValue,
776  bop);
777 
778  //Store the new value for the end of the array. This is done because
779  //with items such as the transpose array it is unsafe to pass the
780  //portal to the SumExclusiveScan
781  ::thrust::copy_n(ThrustCudaPolicyPerThread, (end - 1), 1, sum.begin() + 1);
782 
783  //execute the binaryOp one last time on the device.
784  cuda::internal::SumExclusiveScan<<<1, 1, 0, cudaStreamPerThread>>>(
785  sum[0], sum[1], sum[2], bop);
786  }
787  catch (...)
788  {
789  cuda::internal::throwAsVTKmException();
790  }
791  return sum[2];
792  }
793 
794  template <class InputPortal, class OutputPortal>
795  VTKM_CONT static typename InputPortal::ValueType ScanInclusivePortal(const InputPortal& input,
796  const OutputPortal& output)
797  {
798  using ValueType = typename OutputPortal::ValueType;
799  return ScanInclusivePortal(input, output, ::thrust::plus<ValueType>());
800  }
801 
802  template <class InputPortal, class OutputPortal, class BinaryFunctor>
803  VTKM_CONT static typename InputPortal::ValueType ScanInclusivePortal(const InputPortal& input,
804  const OutputPortal& output,
805  BinaryFunctor binary_functor)
806  {
807  using ValueType = typename OutputPortal::ValueType;
808  vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binary_functor);
809 
810  try
811  {
812  ::thrust::system::cuda::vector<ValueType> result(1);
813  auto end = ::thrust::inclusive_scan(ThrustCudaPolicyPerThread,
814  cuda::internal::IteratorBegin(input),
815  cuda::internal::IteratorEnd(input),
816  cuda::internal::IteratorBegin(output),
817  bop);
818 
819  ::thrust::copy_n(ThrustCudaPolicyPerThread, end - 1, 1, result.begin());
820  return result[0];
821  }
822  catch (...)
823  {
824  cuda::internal::throwAsVTKmException();
825  return typename InputPortal::ValueType();
826  }
827 
828  //return the value at the last index in the array, as that is the sum
829  }
830 
831  template <typename KeysPortal, typename ValuesPortal, typename OutputPortal>
832  VTKM_CONT static void ScanInclusiveByKeyPortal(const KeysPortal& keys,
833  const ValuesPortal& values,
834  const OutputPortal& output)
835  {
836  using KeyType = typename KeysPortal::ValueType;
837  using ValueType = typename OutputPortal::ValueType;
838  ScanInclusiveByKeyPortal(
839  keys, values, output, ::thrust::equal_to<KeyType>(), ::thrust::plus<ValueType>());
840  }
841 
842  template <typename KeysPortal,
843  typename ValuesPortal,
844  typename OutputPortal,
845  typename BinaryPredicate,
846  typename AssociativeOperator>
847  VTKM_CONT static void ScanInclusiveByKeyPortal(const KeysPortal& keys,
848  const ValuesPortal& values,
849  const OutputPortal& output,
850  BinaryPredicate binary_predicate,
851  AssociativeOperator binary_operator)
852  {
853  using KeyType = typename KeysPortal::ValueType;
854  vtkm::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
855  binary_predicate);
856  using ValueType = typename OutputPortal::ValueType;
857  vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
858  binary_operator);
859 
860  try
861  {
862  ::thrust::inclusive_scan_by_key(ThrustCudaPolicyPerThread,
863  cuda::internal::IteratorBegin(keys),
864  cuda::internal::IteratorEnd(keys),
865  cuda::internal::IteratorBegin(values),
866  cuda::internal::IteratorBegin(output),
867  bpred,
868  bop);
869  }
870  catch (...)
871  {
872  cuda::internal::throwAsVTKmException();
873  }
874  }
875 
876  template <typename KeysPortal, typename ValuesPortal, typename OutputPortal>
877  VTKM_CONT static void ScanExclusiveByKeyPortal(const KeysPortal& keys,
878  const ValuesPortal& values,
879  const OutputPortal& output)
880  {
881  using KeyType = typename KeysPortal::ValueType;
882  using ValueType = typename OutputPortal::ValueType;
883  ScanExclusiveByKeyPortal(keys,
884  values,
885  output,
887  ::thrust::equal_to<KeyType>(),
888  ::thrust::plus<ValueType>());
889  }
890 
891  template <typename KeysPortal,
892  typename ValuesPortal,
893  typename OutputPortal,
894  typename T,
895  typename BinaryPredicate,
896  typename AssociativeOperator>
897  VTKM_CONT static void ScanExclusiveByKeyPortal(const KeysPortal& keys,
898  const ValuesPortal& values,
899  const OutputPortal& output,
900  T initValue,
901  BinaryPredicate binary_predicate,
902  AssociativeOperator binary_operator)
903  {
904  using KeyType = typename KeysPortal::ValueType;
905  vtkm::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
906  binary_predicate);
907  using ValueType = typename OutputPortal::ValueType;
908  vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
909  binary_operator);
910  try
911  {
912  ::thrust::exclusive_scan_by_key(ThrustCudaPolicyPerThread,
913  cuda::internal::IteratorBegin(keys),
914  cuda::internal::IteratorEnd(keys),
915  cuda::internal::IteratorBegin(values),
916  cuda::internal::IteratorBegin(output),
917  initValue,
918  bpred,
919  bop);
920  }
921  catch (...)
922  {
923  cuda::internal::throwAsVTKmException();
924  }
925  }
926 
927  template <class ValuesPortal>
928  VTKM_CONT static void SortPortal(const ValuesPortal& values)
929  {
930  using ValueType = typename ValuesPortal::ValueType;
931  SortPortal(values, ::thrust::less<ValueType>());
932  }
933 
934  template <class ValuesPortal, class BinaryCompare>
935  VTKM_CONT static void SortPortal(const ValuesPortal& values, BinaryCompare binary_compare)
936  {
937  using ValueType = typename ValuesPortal::ValueType;
938  vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
939  binary_compare);
940  try
941  {
942  ::thrust::sort(vtkm_cuda_policy(),
943  cuda::internal::IteratorBegin(values),
944  cuda::internal::IteratorEnd(values),
945  bop);
946  }
947  catch (...)
948  {
949  cuda::internal::throwAsVTKmException();
950  }
951  }
952 
953  template <class KeysPortal, class ValuesPortal>
954  VTKM_CONT static void SortByKeyPortal(const KeysPortal& keys, const ValuesPortal& values)
955  {
956  using ValueType = typename KeysPortal::ValueType;
957  SortByKeyPortal(keys, values, ::thrust::less<ValueType>());
958  }
959 
960  template <class KeysPortal, class ValuesPortal, class BinaryCompare>
961  VTKM_CONT static void SortByKeyPortal(const KeysPortal& keys,
962  const ValuesPortal& values,
963  BinaryCompare binary_compare)
964  {
965  using ValueType = typename KeysPortal::ValueType;
966  vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
967  binary_compare);
968  try
969  {
970  ::thrust::sort_by_key(vtkm_cuda_policy(),
971  cuda::internal::IteratorBegin(keys),
972  cuda::internal::IteratorEnd(keys),
973  cuda::internal::IteratorBegin(values),
974  bop);
975  }
976  catch (...)
977  {
978  cuda::internal::throwAsVTKmException();
979  }
980  }
981 
982  template <class ValuesPortal>
983  VTKM_CONT static vtkm::Id UniquePortal(const ValuesPortal values)
984  {
985  try
986  {
987  auto begin = cuda::internal::IteratorBegin(values);
988  auto newLast =
989  ::thrust::unique(ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values));
990  return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
991  }
992  catch (...)
993  {
994  cuda::internal::throwAsVTKmException();
995  return vtkm::Id(0);
996  }
997  }
998 
999  template <class ValuesPortal, class BinaryCompare>
1000  VTKM_CONT static vtkm::Id UniquePortal(const ValuesPortal values, BinaryCompare binary_compare)
1001  {
1002  using ValueType = typename ValuesPortal::ValueType;
1003  vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
1004  binary_compare);
1005  try
1006  {
1007  auto begin = cuda::internal::IteratorBegin(values);
1008  auto newLast = ::thrust::unique(
1009  ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values), bop);
1010  return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
1011  }
1012  catch (...)
1013  {
1014  cuda::internal::throwAsVTKmException();
1015  return vtkm::Id(0);
1016  }
1017  }
1018 
1019  template <class InputPortal, class ValuesPortal, class OutputPortal>
1020  VTKM_CONT static void UpperBoundsPortal(const InputPortal& input,
1021  const ValuesPortal& values,
1022  const OutputPortal& output)
1023  {
1024  try
1025  {
1026  ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1027  cuda::internal::IteratorBegin(input),
1028  cuda::internal::IteratorEnd(input),
1029  cuda::internal::IteratorBegin(values),
1030  cuda::internal::IteratorEnd(values),
1031  cuda::internal::IteratorBegin(output));
1032  }
1033  catch (...)
1034  {
1035  cuda::internal::throwAsVTKmException();
1036  }
1037  }
1038 
1039  template <class InputPortal, class ValuesPortal, class OutputPortal, class BinaryCompare>
1040  VTKM_CONT static void UpperBoundsPortal(const InputPortal& input,
1041  const ValuesPortal& values,
1042  const OutputPortal& output,
1043  BinaryCompare binary_compare)
1044  {
1045  using ValueType = typename OutputPortal::ValueType;
1046 
1047  vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
1048  binary_compare);
1049  try
1050  {
1051  ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1052  cuda::internal::IteratorBegin(input),
1053  cuda::internal::IteratorEnd(input),
1054  cuda::internal::IteratorBegin(values),
1055  cuda::internal::IteratorEnd(values),
1056  cuda::internal::IteratorBegin(output),
1057  bop);
1058  }
1059  catch (...)
1060  {
1061  cuda::internal::throwAsVTKmException();
1062  }
1063  }
1064 
1065  template <class InputPortal, class OutputPortal>
1066  VTKM_CONT static void UpperBoundsPortal(const InputPortal& input,
1067  const OutputPortal& values_output)
1068  {
1069  try
1070  {
1071  ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1072  cuda::internal::IteratorBegin(input),
1073  cuda::internal::IteratorEnd(input),
1074  cuda::internal::IteratorBegin(values_output),
1075  cuda::internal::IteratorEnd(values_output),
1076  cuda::internal::IteratorBegin(values_output));
1077  }
1078  catch (...)
1079  {
1080  cuda::internal::throwAsVTKmException();
1081  }
1082  }
1083 
1084  template <typename GlobalPopCountType, typename BitsPortal, typename IndicesPortal>
1085  VTKM_CONT static vtkm::Id BitFieldToUnorderedSetPortal(const BitsPortal& bits,
1086  const IndicesPortal& indices)
1087  {
1088  using Functor = BitFieldToUnorderedSetFunctor<BitsPortal, IndicesPortal, GlobalPopCountType>;
1089 
1090  // RAII for the global atomic counter.
1091  auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
1092  Functor functor{ bits, indices, globalCount.get() };
1093 
1094  functor.Initialize();
1095  Schedule(functor, bits.GetNumberOfWords());
1096  Synchronize(); // Ensure kernel is done before checking final atomic count
1097  return functor.Finalize();
1098  }
1099 
1100  template <typename GlobalPopCountType, typename BitsPortal>
1101  VTKM_CONT static vtkm::Id CountSetBitsPortal(const BitsPortal& bits)
1102  {
1103  using Functor = CountSetBitsFunctor<BitsPortal, GlobalPopCountType>;
1104 
1105  // RAII for the global atomic counter.
1106  auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
1107  Functor functor{ bits, globalCount.get() };
1108 
1109  functor.Initialize();
1110  Schedule(functor, bits.GetNumberOfWords());
1111  Synchronize(); // Ensure kernel is done before checking final atomic count
1112  return functor.Finalize();
1113  }
1114 
1115  //-----------------------------------------------------------------------------
1116 
1117 public:
1118  template <typename IndicesStorage>
1120  const vtkm::cont::BitField& bits,
1122  {
1124 
1125  vtkm::Id numBits = bits.GetNumberOfBits();
1126 
1127  {
1128  vtkm::cont::Token token;
1129  auto bitsPortal = bits.PrepareForInput(DeviceAdapterTagCuda{}, token);
1130  auto indicesPortal = indices.PrepareForOutput(numBits, DeviceAdapterTagCuda{}, token);
1131 
1132  // Use a uint64 for accumulator, as atomicAdd does not support signed int64.
1133  numBits = BitFieldToUnorderedSetPortal<vtkm::UInt64>(bitsPortal, indicesPortal);
1134  }
1135 
1136  indices.Allocate(numBits, vtkm::CopyFlag::On);
1137  return numBits;
1138  }
1139 
1140  template <typename T, typename U, class SIn, class SOut>
1143  {
1145 
1146  const vtkm::Id inSize = input.GetNumberOfValues();
1147  if (inSize <= 0)
1148  {
1149  output.Allocate(inSize, vtkm::CopyFlag::On);
1150  return;
1151  }
1152  vtkm::cont::Token token;
1153  CopyPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1154  output.PrepareForOutput(inSize, DeviceAdapterTagCuda(), token));
1155  }
1156 
1157  template <typename T, typename U, class SIn, class SStencil, class SOut>
1159  const vtkm::cont::ArrayHandle<T, SStencil>& stencil,
1161  {
1163 
1164  vtkm::Id size = stencil.GetNumberOfValues();
1165  if (size <= 0)
1166  {
1167  output.Allocate(size, vtkm::CopyFlag::On);
1168  return;
1169  }
1170 
1171  vtkm::Id newSize;
1172 
1173  {
1174  vtkm::cont::Token token;
1175 
1176  newSize = CopyIfPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1177  stencil.PrepareForInput(DeviceAdapterTagCuda(), token),
1178  output.PrepareForOutput(size, DeviceAdapterTagCuda(), token),
1179  ::vtkm::NotZeroInitialized()); //yes on the stencil
1180  }
1181 
1182  output.Allocate(newSize, vtkm::CopyFlag::On);
1183  }
1184 
1185  template <typename T, typename U, class SIn, class SStencil, class SOut, class UnaryPredicate>
1187  const vtkm::cont::ArrayHandle<T, SStencil>& stencil,
1189  UnaryPredicate unary_predicate)
1190  {
1192 
1193  vtkm::Id size = stencil.GetNumberOfValues();
1194  if (size <= 0)
1195  {
1196  output.Allocate(size, vtkm::CopyFlag::On);
1197  return;
1198  }
1199 
1200  vtkm::Id newSize;
1201 
1202  {
1203  vtkm::cont::Token token;
1204  newSize = CopyIfPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1205  stencil.PrepareForInput(DeviceAdapterTagCuda(), token),
1206  output.PrepareForOutput(size, DeviceAdapterTagCuda(), token),
1207  unary_predicate);
1208  }
1209 
1210  output.Allocate(newSize, vtkm::CopyFlag::On);
1211  }
1212 
1213  template <typename T, typename U, class SIn, class SOut>
1215  vtkm::Id inputStartIndex,
1216  vtkm::Id numberOfElementsToCopy,
1218  vtkm::Id outputIndex = 0)
1219  {
1221 
1222  const vtkm::Id inSize = input.GetNumberOfValues();
1223 
1224  // Check if the ranges overlap and fail if they do.
1225  if (input == output &&
1226  ((outputIndex >= inputStartIndex &&
1227  outputIndex < inputStartIndex + numberOfElementsToCopy) ||
1228  (inputStartIndex >= outputIndex &&
1229  inputStartIndex < outputIndex + numberOfElementsToCopy)))
1230  {
1231  return false;
1232  }
1233 
1234  if (inputStartIndex < 0 || numberOfElementsToCopy < 0 || outputIndex < 0 ||
1235  inputStartIndex >= inSize)
1236  { //invalid parameters
1237  return false;
1238  }
1239 
1240  //determine if the numberOfElementsToCopy needs to be reduced
1241  if (inSize < (inputStartIndex + numberOfElementsToCopy))
1242  { //adjust the size
1243  numberOfElementsToCopy = (inSize - inputStartIndex);
1244  }
1245 
1246  const vtkm::Id outSize = output.GetNumberOfValues();
1247  const vtkm::Id copyOutEnd = outputIndex + numberOfElementsToCopy;
1248  if (outSize < copyOutEnd)
1249  { //output is not large enough
1250  if (outSize == 0)
1251  { //since output has nothing, just need to allocate to correct length
1252  output.Allocate(copyOutEnd);
1253  }
1254  else
1255  { //we currently have data in this array, so preserve it in the new
1256  //resized array
1258  temp.Allocate(copyOutEnd);
1259  CopySubRange(output, 0, outSize, temp);
1260  output = temp;
1261  }
1262  }
1263  vtkm::cont::Token token;
1264  CopySubRangePortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1265  inputStartIndex,
1266  numberOfElementsToCopy,
1267  output.PrepareForInPlace(DeviceAdapterTagCuda(), token),
1268  outputIndex);
1269  return true;
1270  }
1271 
1273  {
1275  vtkm::cont::Token token;
1276  auto bitsPortal = bits.PrepareForInput(DeviceAdapterTagCuda{}, token);
1277  // Use a uint64 for accumulator, as atomicAdd does not support signed int64.
1278  return CountSetBitsPortal<vtkm::UInt64>(bitsPortal);
1279  }
1280 
1281  template <typename T, class SIn, class SVal, class SOut>
1283  const vtkm::cont::ArrayHandle<T, SVal>& values,
1285  {
1287 
1288  vtkm::Id numberOfValues = values.GetNumberOfValues();
1289  vtkm::cont::Token token;
1290  LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1291  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1292  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1293  }
1294 
1295  template <typename T, class SIn, class SVal, class SOut, class BinaryCompare>
1297  const vtkm::cont::ArrayHandle<T, SVal>& values,
1299  BinaryCompare binary_compare)
1300  {
1302 
1303  vtkm::Id numberOfValues = values.GetNumberOfValues();
1304  vtkm::cont::Token token;
1305  LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1306  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1307  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1308  binary_compare);
1309  }
1310 
1311  template <class SIn, class SOut>
1314  {
1316 
1317  vtkm::cont::Token token;
1318  LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1319  values_output.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1320  }
1321 
1322  template <typename T, typename U, class SIn>
1323  VTKM_CONT static U Reduce(const vtkm::cont::ArrayHandle<T, SIn>& input, U initialValue)
1324  {
1326 
1327  const vtkm::Id numberOfValues = input.GetNumberOfValues();
1328  if (numberOfValues <= 0)
1329  {
1330  return initialValue;
1331  }
1332  vtkm::cont::Token token;
1333  return ReducePortal(input.PrepareForInput(DeviceAdapterTagCuda(), token), initialValue);
1334  }
1335 
1336  template <typename T, typename U, class SIn, class BinaryFunctor>
1338  U initialValue,
1339  BinaryFunctor binary_functor)
1340  {
1342 
1343  const vtkm::Id numberOfValues = input.GetNumberOfValues();
1344  if (numberOfValues <= 0)
1345  {
1346  return initialValue;
1347  }
1348  vtkm::cont::Token token;
1349  return ReducePortal(
1350  input.PrepareForInput(DeviceAdapterTagCuda(), token), initialValue, binary_functor);
1351  }
1352 
1353  // At least some versions of Thrust/nvcc result in compile errors when calling Thrust's
1354  // reduce with sufficiently complex iterators, which can happen with some versions of
1355  // ArrayHandleMultiplexer. Thus, don't use the Thrust version for ArrayHandleMultiplexer.
1356  template <typename T, typename U, typename... SIns>
1359  U initialValue)
1360  {
1361  return Superclass::Reduce(input, initialValue);
1362  }
1363  template <typename T, typename U, typename BinaryFunctor, typename... SIns>
1366  U initialValue,
1367  BinaryFunctor binary_functor)
1368  {
1369  return Superclass::Reduce(input, initialValue, binary_functor);
1370  }
1371 
1372  template <typename T,
1373  typename U,
1374  class KIn,
1375  class VIn,
1376  class KOut,
1377  class VOut,
1378  class BinaryFunctor>
1380  const vtkm::cont::ArrayHandle<U, VIn>& values,
1381  vtkm::cont::ArrayHandle<T, KOut>& keys_output,
1382  vtkm::cont::ArrayHandle<U, VOut>& values_output,
1383  BinaryFunctor binary_functor)
1384  {
1386 
1387  //there is a concern that by default we will allocate too much
1388  //space for the keys/values output. 1 option is to
1389  const vtkm::Id numberOfValues = keys.GetNumberOfValues();
1390  if (numberOfValues <= 0)
1391  {
1392  return;
1393  }
1394 
1395  vtkm::Id reduced_size;
1396  {
1397  vtkm::cont::Token token;
1398  reduced_size = ReduceByKeyPortal(
1399  keys.PrepareForInput(DeviceAdapterTagCuda(), token),
1400  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1401  keys_output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1402  values_output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1403  binary_functor);
1404  }
1405 
1406  keys_output.Allocate(reduced_size, vtkm::CopyFlag::On);
1407  values_output.Allocate(reduced_size, vtkm::CopyFlag::On);
1408  }
1409 
1410  template <typename T, class SIn, class SOut>
1413  {
1415 
1416  const vtkm::Id numberOfValues = input.GetNumberOfValues();
1417  if (numberOfValues <= 0)
1418  {
1419  output.Allocate(0);
1421  }
1422 
1423  //We need call PrepareForInput on the input argument before invoking a
1424  //function. The order of execution of parameters of a function is undefined
1425  //so we need to make sure input is called before output, or else in-place
1426  //use case breaks.
1427  vtkm::cont::Token token;
1428  auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
1429  return ScanExclusivePortal(
1430  inputPortal, output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1431  }
1432 
1433  template <typename T, class SIn, class SOut, class BinaryFunctor>
1436  BinaryFunctor binary_functor,
1437  const T& initialValue)
1438  {
1440 
1441  const vtkm::Id numberOfValues = input.GetNumberOfValues();
1442  if (numberOfValues <= 0)
1443  {
1444  output.Allocate(0);
1446  }
1447 
1448  //We need call PrepareForInput on the input argument before invoking a
1449  //function. The order of execution of parameters of a function is undefined
1450  //so we need to make sure input is called before output, or else in-place
1451  //use case breaks.
1452  vtkm::cont::Token token;
1453  auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
1454  return ScanExclusivePortal(
1455  inputPortal,
1456  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1457  binary_functor,
1458  initialValue);
1459  }
1460 
1461  template <typename T, class SIn, class SOut>
1464  {
1466 
1467  const vtkm::Id numberOfValues = input.GetNumberOfValues();
1468  if (numberOfValues <= 0)
1469  {
1470  output.Allocate(0);
1472  }
1473 
1474  //We need call PrepareForInput on the input argument before invoking a
1475  //function. The order of execution of parameters of a function is undefined
1476  //so we need to make sure input is called before output, or else in-place
1477  //use case breaks.
1478  vtkm::cont::Token token;
1479  auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
1480  return ScanInclusivePortal(
1481  inputPortal, output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1482  }
1483 
1484  template <typename T, class SIn, class SOut, class BinaryFunctor>
1487  BinaryFunctor binary_functor)
1488  {
1490 
1491  const vtkm::Id numberOfValues = input.GetNumberOfValues();
1492  if (numberOfValues <= 0)
1493  {
1494  output.Allocate(0);
1496  }
1497 
1498  //We need call PrepareForInput on the input argument before invoking a
1499  //function. The order of execution of parameters of a function is undefined
1500  //so we need to make sure input is called before output, or else in-place
1501  //use case breaks.
1502  vtkm::cont::Token token;
1503  auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
1504  return ScanInclusivePortal(
1505  inputPortal,
1506  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1507  binary_functor);
1508  }
1509 
1510  template <typename T, typename U, typename KIn, typename VIn, typename VOut>
1512  const vtkm::cont::ArrayHandle<U, VIn>& values,
1514  {
1516 
1517  const vtkm::Id numberOfValues = keys.GetNumberOfValues();
1518  if (numberOfValues <= 0)
1519  {
1520  output.Allocate(0);
1521  return;
1522  }
1523 
1524  //We need call PrepareForInput on the input argument before invoking a
1525  //function. The order of execution of parameters of a function is undefined
1526  //so we need to make sure input is called before output, or else in-place
1527  //use case breaks.
1528  vtkm::cont::Token token;
1529  auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
1530  auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
1531  ScanInclusiveByKeyPortal(
1532  keysPortal,
1533  valuesPortal,
1534  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1535  }
1536 
1537  template <typename T,
1538  typename U,
1539  typename KIn,
1540  typename VIn,
1541  typename VOut,
1542  typename BinaryFunctor>
1544  const vtkm::cont::ArrayHandle<U, VIn>& values,
1546  BinaryFunctor binary_functor)
1547  {
1549 
1550  const vtkm::Id numberOfValues = keys.GetNumberOfValues();
1551  if (numberOfValues <= 0)
1552  {
1553  output.Allocate(0);
1554  return;
1555  }
1556 
1557  //We need call PrepareForInput on the input argument before invoking a
1558  //function. The order of execution of parameters of a function is undefined
1559  //so we need to make sure input is called before output, or else in-place
1560  //use case breaks.
1561  vtkm::cont::Token token;
1562  auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
1563  auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
1564  ScanInclusiveByKeyPortal(keysPortal,
1565  valuesPortal,
1566  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1567  ::thrust::equal_to<T>(),
1568  binary_functor);
1569  }
1570 
1571  template <typename T, typename U, typename KIn, typename VIn, typename VOut>
1573  const vtkm::cont::ArrayHandle<U, VIn>& values,
1575  {
1577 
1578  const vtkm::Id numberOfValues = keys.GetNumberOfValues();
1579  if (numberOfValues <= 0)
1580  {
1581  output.Allocate(0);
1582  return;
1583  }
1584 
1585  //We need call PrepareForInput on the input argument before invoking a
1586  //function. The order of execution of parameters of a function is undefined
1587  //so we need to make sure input is called before output, or else in-place
1588  //use case breaks.
1589  vtkm::cont::Token token;
1590  auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
1591  auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
1592  ScanExclusiveByKeyPortal(keysPortal,
1593  valuesPortal,
1594  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1596  ::thrust::equal_to<T>(),
1597  vtkm::Add());
1598  }
1599 
1600  template <typename T,
1601  typename U,
1602  typename KIn,
1603  typename VIn,
1604  typename VOut,
1605  typename BinaryFunctor>
1607  const vtkm::cont::ArrayHandle<U, VIn>& values,
1609  const U& initialValue,
1610  BinaryFunctor binary_functor)
1611  {
1613 
1614  const vtkm::Id numberOfValues = keys.GetNumberOfValues();
1615  if (numberOfValues <= 0)
1616  {
1617  output.Allocate(0);
1618  return;
1619  }
1620 
1621  //We need call PrepareForInput on the input argument before invoking a
1622  //function. The order of execution of parameters of a function is undefined
1623  //so we need to make sure input is called before output, or else in-place
1624  //use case breaks.
1625  vtkm::cont::Token token;
1626  auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
1627  auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
1628  ScanExclusiveByKeyPortal(keysPortal,
1629  valuesPortal,
1630  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1631  initialValue,
1632  ::thrust::equal_to<T>(),
1633  binary_functor);
1634  }
1635 
1636  // we use cuda pinned memory to reduce the amount of synchronization
1637  // and mem copies between the host and device.
1638  struct VTKM_CONT_EXPORT PinnedErrorArray
1639  {
1640  char* HostPtr = nullptr;
1641  char* DevicePtr = nullptr;
1642  vtkm::Id Size = 0;
1643  };
1644 
1646  static const PinnedErrorArray& GetPinnedErrorArray();
1647 
1649  static void CheckForErrors(); // throws vtkm::cont::ErrorExecution
1650 
1652  static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
1653 
1655  static void GetBlocksAndThreads(vtkm::UInt32& blocks,
1656  vtkm::UInt32& threadsPerBlock,
1657  vtkm::Id size,
1658  vtkm::IdComponent maxThreadsPerBlock);
1659 
1661  static void GetBlocksAndThreads(vtkm::UInt32& blocks,
1662  dim3& threadsPerBlock,
1663  const dim3& size,
1664  vtkm::IdComponent maxThreadsPerBlock);
1665 
1666  template <typename... Hints, typename... Args>
1667  static void GetBlocksAndThreads(vtkm::cont::internal::HintList<Hints...>, Args&&... args)
1668  {
1669  using ThreadsPerBlock =
1670  vtkm::cont::internal::HintFind<vtkm::cont::internal::HintList<Hints...>,
1671  vtkm::cont::internal::HintThreadsPerBlock<0>,
1673  GetBlocksAndThreads(std::forward<Args>(args)..., ThreadsPerBlock::MaxThreads);
1674  }
1675 
1677  static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
1678  const std::type_info& worklet_info,
1679  vtkm::UInt32 blocks,
1680  vtkm::UInt32 threadsPerBlock,
1681  vtkm::Id size);
1682 
1684  static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
1685  const std::type_info& worklet_info,
1686  vtkm::UInt32 blocks,
1687  dim3 threadsPerBlock,
1688  const dim3& size);
1689 
1690 public:
1691  template <typename WType, typename IType, typename Hints>
1692  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType, Hints>& functor,
1693  vtkm::Id numInstances)
1694  {
1696 
1697  VTKM_ASSERT(numInstances >= 0);
1698  if (numInstances < 1)
1699  {
1700  // No instances means nothing to run. Just return.
1701  return;
1702  }
1703 
1704  CheckForErrors();
1705  SetupErrorBuffer(functor);
1706 
1707  vtkm::UInt32 blocks, threadsPerBlock;
1708  GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, numInstances);
1709 
1710 #ifdef VTKM_ENABLE_LOGGING
1712  {
1713  using FunctorType = std::decay_t<decltype(functor)>;
1714  cudaFuncAttributes empty_kernel_attrs;
1715  VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
1716  cuda::internal::TaskStrided1DLaunch<FunctorType>));
1717  LogKernelLaunch(empty_kernel_attrs, typeid(WType), blocks, threadsPerBlock, numInstances);
1718  }
1719 #endif
1720 
1721  cuda::internal::TaskStrided1DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
1722  functor, numInstances);
1723  }
1724 
1725  template <typename WType, typename IType, typename Hints>
1726  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D<WType, IType, Hints>& functor,
1727  vtkm::Id3 rangeMax)
1728  {
1730 
1731  VTKM_ASSERT((rangeMax[0] >= 0) && (rangeMax[1] >= 0) && (rangeMax[2] >= 0));
1732  if ((rangeMax[0] < 1) || (rangeMax[1] < 1) || (rangeMax[2] < 1))
1733  {
1734  // No instances means nothing to run. Just return.
1735  return;
1736  }
1737 
1738  CheckForErrors();
1739  SetupErrorBuffer(functor);
1740 
1741  const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
1742  static_cast<vtkm::UInt32>(rangeMax[1]),
1743  static_cast<vtkm::UInt32>(rangeMax[2]));
1744 
1745  vtkm::UInt32 blocks;
1746  dim3 threadsPerBlock;
1747  GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, ranges);
1748 
1749 #ifdef VTKM_ENABLE_LOGGING
1751  {
1752  using FunctorType = std::decay_t<decltype(functor)>;
1753  cudaFuncAttributes empty_kernel_attrs;
1754  VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
1755  cuda::internal::TaskStrided3DLaunch<FunctorType>));
1756  LogKernelLaunch(empty_kernel_attrs, typeid(WType), blocks, threadsPerBlock, ranges);
1757  }
1758 #endif
1759 
1760  cuda::internal::TaskStrided3DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
1761  functor, rangeMax);
1762  }
1763 
1764  template <typename Hints, typename Functor>
1765  VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id numInstances)
1766  {
1768 
1769  vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType, Hints> kernel(
1770  functor);
1771 
1772  ScheduleTask(kernel, numInstances);
1773  }
1774 
1775  template <typename FunctorType>
1776  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances)
1777  {
1778  Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances);
1779  }
1780 
1781  template <typename Hints, typename Functor>
1782  VTKM_CONT static void Schedule(Hints, Functor functor, const vtkm::Id3& rangeMax)
1783  {
1785 
1786  vtkm::exec::cuda::internal::TaskStrided3D<Functor, vtkm::internal::NullType, Hints> kernel(
1787  functor);
1788  ScheduleTask(kernel, rangeMax);
1789  }
1790 
1791  template <typename FunctorType>
1792  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax)
1793  {
1794  Schedule(vtkm::cont::internal::HintList<>{}, functor, rangeMax);
1795  }
1796 
1797  template <typename T, class Storage>
1799  {
1801 
1802  vtkm::cont::Token token;
1803  SortPortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1804  }
1805 
1806  template <typename T, class Storage, class BinaryCompare>
1808  BinaryCompare binary_compare)
1809  {
1811 
1812  vtkm::cont::Token token;
1813  SortPortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token), binary_compare);
1814  }
1815 
1816  template <typename T, typename U, class StorageT, class StorageU>
1819  {
1821 
1822  vtkm::cont::Token token;
1823  SortByKeyPortal(keys.PrepareForInPlace(DeviceAdapterTagCuda(), token),
1824  values.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1825  }
1826 
1827  template <typename T, typename U, class StorageT, class StorageU, class BinaryCompare>
1830  BinaryCompare binary_compare)
1831  {
1833 
1834  vtkm::cont::Token token;
1835  SortByKeyPortal(keys.PrepareForInPlace(DeviceAdapterTagCuda(), token),
1836  values.PrepareForInPlace(DeviceAdapterTagCuda(), token),
1837  binary_compare);
1838  }
1839 
1840  template <typename T, class Storage>
1842  {
1844 
1845  vtkm::Id newSize;
1846 
1847  {
1848  vtkm::cont::Token token;
1849  newSize = UniquePortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1850  }
1851 
1852  values.Allocate(newSize, vtkm::CopyFlag::On);
1853  }
1854 
1855  template <typename T, class Storage, class BinaryCompare>
1857  BinaryCompare binary_compare)
1858  {
1860 
1861  vtkm::Id newSize;
1862  {
1863  vtkm::cont::Token token;
1864  newSize =
1865  UniquePortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token), binary_compare);
1866  }
1867 
1868  values.Allocate(newSize, vtkm::CopyFlag::On);
1869  }
1870 
1871  template <typename T, class SIn, class SVal, class SOut>
1873  const vtkm::cont::ArrayHandle<T, SVal>& values,
1875  {
1877 
1878  vtkm::Id numberOfValues = values.GetNumberOfValues();
1879  vtkm::cont::Token token;
1880  UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1881  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1882  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1883  }
1884 
1885  template <typename T, class SIn, class SVal, class SOut, class BinaryCompare>
1887  const vtkm::cont::ArrayHandle<T, SVal>& values,
1889  BinaryCompare binary_compare)
1890  {
1892 
1893  vtkm::Id numberOfValues = values.GetNumberOfValues();
1894  vtkm::cont::Token token;
1895  UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1896  values.PrepareForInput(DeviceAdapterTagCuda(), token),
1897  output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1898  binary_compare);
1899  }
1900 
1901  template <class SIn, class SOut>
1904  {
1906 
1907  vtkm::cont::Token token;
1908  UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
1909  values_output.PrepareForInPlace(DeviceAdapterTagCuda(), token));
1910  }
1911 
1912  VTKM_CONT static void Synchronize()
1913  {
1915 
1916  VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
1917  CheckForErrors();
1918  }
1919 };
1920 
1921 template <>
1923 {
1924 public:
1925  template <typename Hints, typename WorkletType, typename InvocationType>
1926  static vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType, Hints>
1927  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id, Hints = Hints{})
1928  {
1929  return { worklet, invocation };
1930  }
1931 
1932  template <typename Hints, typename WorkletType, typename InvocationType>
1933  static vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType, Hints>
1934  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3, Hints = Hints{})
1935  {
1936  return { worklet, invocation };
1937  }
1938 
1939  template <typename WorkletType, typename InvocationType, typename RangeType>
1940  VTKM_CONT static auto MakeTask(WorkletType& worklet,
1941  InvocationType& invocation,
1942  const RangeType& range)
1943  {
1944  return MakeTask<vtkm::cont::internal::HintList<>>(worklet, invocation, range);
1945  }
1946 };
1947 }
1948 } // namespace vtkm::cont
1949 
1950 #endif //vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusiveByKey
static void ScanExclusiveByKey(const vtkm::cont::ArrayHandle< T, KIn > &keys, const vtkm::cont::ArrayHandle< U, VIn > &values, vtkm::cont::ArrayHandle< U, VOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1572
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBits
static vtkm::Id CountSetBits(const vtkm::cont::BitField &bits)
Definition: DeviceAdapterAlgorithmCuda.h:1272
vtkm::cont::ArrayHandle
Manages an array-worth of data.
Definition: ArrayHandle.h:300
vtkm::cont::cuda::ScheduleParameters
Represents how to schedule 1D, 2D, and 3D Cuda kernels.
Definition: DeviceAdapterAlgorithmCuda.h:81
ArrayHandle.h
VTKM_THIRDPARTY_POST_INCLUDE
#define VTKM_THIRDPARTY_POST_INCLUDE
Definition: Configure.h:192
vtkm::cont::DeviceTaskTypes< vtkm::cont::DeviceAdapterTagCuda >::MakeTask
static vtkm::exec::cuda::internal::TaskStrided3D< WorkletType, InvocationType, Hints > MakeTask(WorkletType &worklet, InvocationType &invocation, vtkm::Id3, Hints=Hints{})
Definition: DeviceAdapterAlgorithmCuda.h:1934
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Schedule
static void Schedule(Hints, Functor functor, const vtkm::Id3 &rangeMax)
Definition: DeviceAdapterAlgorithmCuda.h:1782
VTKM_EXEC
#define VTKM_EXEC
Definition: ExportMacros.h:51
vtkm
Groups connected points that have the same field value.
Definition: Atomic.h:19
vtkm::TypeTraits
The TypeTraits class provides helpful compile-time information about the basic types used in VTKm (an...
Definition: TypeTraits.h:61
vtkm::Get
auto Get(const vtkm::Tuple< Ts... > &tuple)
Retrieve the object from a vtkm::Tuple at the given index.
Definition: Tuple.h:81
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Schedule
static void Schedule(FunctorType &&functor, vtkm::Id3 rangeMax)
Definition: DeviceAdapterAlgorithmCuda.h:1792
Types.h
vtkm::cont::cuda::ScheduleParameters::three_d_blocks
int three_d_blocks
Definition: DeviceAdapterAlgorithmCuda.h:89
VTKM_ASSERT
#define VTKM_ASSERT(condition)
Definition: Assert.h:43
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::SortPortal
static void SortPortal(const ValuesPortal &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:935
vtkm::cont::DeviceAdapterTagCuda
Tag for a device adapter that uses a CUDA capable GPU device.
Definition: DeviceAdapterTagCuda.h:33
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::GetNextOutputIndex
__device__ vtkm::Id GetNextOutputIndex() const
Definition: DeviceAdapterAlgorithmCuda.h:384
UnaryPredicates.h
vtkm::cont::DeviceAdapterAlgorithm::Schedule
static void Schedule(Functor functor, vtkm::Id numInstances)
Schedule many instances of a function to run on concurrent threads.
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopySubRange
static bool CopySubRange(const vtkm::cont::ArrayHandle< T, SIn > &input, vtkm::Id inputStartIndex, vtkm::Id numberOfElementsToCopy, vtkm::cont::ArrayHandle< U, SOut > &output, vtkm::Id outputIndex=0)
Definition: DeviceAdapterAlgorithmCuda.h:1214
vtkm::IdComponent
vtkm::Int32 IdComponent
Base type to use to index small lists.
Definition: Types.h:194
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Initialize
void Initialize()
Definition: DeviceAdapterAlgorithmCuda.h:512
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UpperBounds
static void UpperBounds(const vtkm::cont::ArrayHandle< T, SIn > &input, const vtkm::cont::ArrayHandle< T, SVal > &values, vtkm::cont::ArrayHandle< vtkm::Id, SOut > &output, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1886
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::BitFieldToUnorderedSetFunctor
BitFieldToUnorderedSetFunctor(const BitsPortal &input, const IndicesPortal &output, GlobalPopCountType *globalPopCount)
Definition: DeviceAdapterAlgorithmCuda.h:293
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusivePortal
static InputPortal::ValueType ScanExclusivePortal(const InputPortal &input, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:735
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::LowerBounds
static void LowerBounds(const vtkm::cont::ArrayHandle< vtkm::Id, SIn > &input, vtkm::cont::ArrayHandle< vtkm::Id, SOut > &values_output)
Definition: DeviceAdapterAlgorithmCuda.h:1312
BitField.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusiveByKey
static void ScanExclusiveByKey(const vtkm::cont::ArrayHandle< T, KIn > &keys, const vtkm::cont::ArrayHandle< U, VIn > &values, vtkm::cont::ArrayHandle< U, VOut > &output, const U &initialValue, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1606
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Schedule
static void Schedule(FunctorType &&functor, vtkm::Id numInstances)
Definition: DeviceAdapterAlgorithmCuda.h:1776
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ReducePortalImpl
static T ReducePortalImpl(const InputPortal &input, T initialValue, BinaryFunctor binary_functor, std::true_type)
Definition: DeviceAdapterAlgorithmCuda.h:639
DeviceAdapterAlgorithmGeneral.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::GetBlocksAndThreads
static void GetBlocksAndThreads(vtkm::cont::internal::HintList< Hints... >, Args &&... args)
Definition: DeviceAdapterAlgorithmCuda.h:1667
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Reduce
__device__ void Reduce() const
Definition: DeviceAdapterAlgorithmCuda.h:549
vtkm::cont::cuda::ScheduleParameters::two_d_threads_per_block
dim3 two_d_threads_per_block
Definition: DeviceAdapterAlgorithmCuda.h:87
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusiveByKeyPortal
static void ScanInclusiveByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const OutputPortal &output, BinaryPredicate binary_predicate, AssociativeOperator binary_operator)
Definition: DeviceAdapterAlgorithmCuda.h:847
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::operator()
__device__ void operator()(vtkm::Id wordIdx) const
Definition: DeviceAdapterAlgorithmCuda.h:519
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusivePortal
static InputPortal::ValueType ScanInclusivePortal(const InputPortal &input, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:795
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UpperBoundsPortal
static void UpperBoundsPortal(const InputPortal &input, const ValuesPortal &values, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:1020
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UniquePortal
static vtkm::Id UniquePortal(const ValuesPortal values)
Definition: DeviceAdapterAlgorithmCuda.h:983
vtkm::cont::ArrayHandle::GetNumberOfValues
vtkm::Id GetNumberOfValues() const
Returns the number of entries in the array.
Definition: ArrayHandle.h:468
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Output
const IndicesPortal Output
Definition: DeviceAdapterAlgorithmCuda.h:399
vtkm::cont::cuda::ScheduleParameters::one_d_threads_per_block
int one_d_threads_per_block
Definition: DeviceAdapterAlgorithmCuda.h:84
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScheduleTask
static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D< WType, IType, Hints > &functor, vtkm::Id numInstances)
Definition: DeviceAdapterAlgorithmCuda.h:1692
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::ReduceAllocate
__device__ void ReduceAllocate() const
Definition: DeviceAdapterAlgorithmCuda.h:355
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Reduce
static U Reduce(const vtkm::cont::ArrayHandle< T, vtkm::cont::StorageTagMultiplexer< SIns... >> &input, U initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:1357
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusive
static T ScanExclusive(const vtkm::cont::ArrayHandle< T, SIn > &input, vtkm::cont::ArrayHandle< T, SOut > &output, BinaryFunctor binary_functor, const T &initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:1434
vtkm::cont::BitField::GetNumberOfBits
vtkm::Id GetNumberOfBits() const
Return the number of bits stored by this BitField.
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopyIf
static void CopyIf(const vtkm::cont::ArrayHandle< U, SIn > &input, const vtkm::cont::ArrayHandle< T, SStencil > &stencil, vtkm::cont::ArrayHandle< U, SOut > &output, UnaryPredicate unary_predicate)
Definition: DeviceAdapterAlgorithmCuda.h:1186
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UpperBoundsPortal
static void UpperBoundsPortal(const InputPortal &input, const OutputPortal &values_output)
Definition: DeviceAdapterAlgorithmCuda.h:1066
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Initialize
void Initialize()
Definition: DeviceAdapterAlgorithmCuda.h:306
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Unique
static void Unique(vtkm::cont::ArrayHandle< T, Storage > &values)
Definition: DeviceAdapterAlgorithmCuda.h:1841
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ReducePortalImpl
static T ReducePortalImpl(const InputPortal &input, T initialValue, BinaryFunctor binary_functor, std::false_type)
Definition: DeviceAdapterAlgorithmCuda.h:665
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >
Definition: DeviceAdapterAlgorithmCuda.h:262
DeviceAdapterAlgorithm.h
DeviceAdapterTimerImplementationCuda.h
DeviceAdapterRuntimeDetectorCuda.h
VTKM_PASS_COMMAS
#define VTKM_PASS_COMMAS(...)
Definition: Configure.h:364
ThrustPatches.h
DeviceAdapterTagCuda.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusiveByKeyPortal
static void ScanExclusiveByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:877
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::SortPortal
static void SortPortal(const ValuesPortal &values)
Definition: DeviceAdapterAlgorithmCuda.h:928
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ReduceByKeyPortal
static vtkm::Id ReduceByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const KeysOutputPortal &keys_output, const ValueOutputPortal &values_output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:699
vtkm::cont::DeviceAdapterAlgorithm::VIn
static T VIn
Definition: DeviceAdapterAlgorithm.h:349
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Word
vtkm::AtomicTypePreferred Word
Definition: DeviceAdapterAlgorithmCuda.h:287
vtkm::Add
Definition: Types.h:260
vtkm::cont::Token
A token to hold the scope of an ArrayHandle or other object.
Definition: Token.h:35
TypeTraits.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::GlobalPopCount
GlobalPopCountType * GlobalPopCount
Definition: DeviceAdapterAlgorithmCuda.h:573
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::LowerBoundsPortal
static void LowerBoundsPortal(const InputPortal &input, const ValuesPortal &values, const OutputPortal &output, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:598
ThrustExceptionHandler.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusiveByKeyPortal
static void ScanInclusiveByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:832
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::CountSetBitsFunctor
CountSetBitsFunctor(const BitsPortal &portal, GlobalPopCountType *globalPopCount)
Definition: DeviceAdapterAlgorithmCuda.h:502
vtkm::cont::ArrayHandle::PrepareForInPlace
WritePortalType PrepareForInPlace(vtkm::cont::DeviceAdapterId device, vtkm::cont::Token &token) const
Prepares this array to be used in an in-place operation (both as input and output) in the execution e...
Definition: ArrayHandle.h:618
vtkm::FindFirstSetBit
vtkm::Int32 FindFirstSetBit(vtkm::UInt32 word)
Bitwise operations.
Definition: Math.h:2844
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Portal
const BitsPortal Portal
Definition: DeviceAdapterAlgorithmCuda.h:572
vtkm::cont::DeviceAdapterAlgorithm::KIn
static T KIn
Definition: DeviceAdapterAlgorithm.h:348
Math.h
VTKM_STATIC_ASSERT
#define VTKM_STATIC_ASSERT(condition)
Definition: StaticAssert.h:16
VTKM_CUDA_CALL
#define VTKM_CUDA_CALL(command)
A macro that can be wrapped around a CUDA command and will throw an ErrorCuda exception if the CUDA c...
Definition: ErrorCuda.h:38
ErrorMessageBuffer.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UpperBounds
static void UpperBounds(const vtkm::cont::ArrayHandle< vtkm::Id, SIn > &input, vtkm::cont::ArrayHandle< vtkm::Id, SOut > &values_output)
Definition: DeviceAdapterAlgorithmCuda.h:1902
vtkm::cont::LogLevel::KernelLaunches
@ KernelLaunches
Details on device-side kernel launches.
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::SortByKeyPortal
static void SortByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:961
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Reduce
static U Reduce(const vtkm::cont::ArrayHandle< T, SIn > &input, U initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:1323
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ReduceByKey
static void ReduceByKey(const vtkm::cont::ArrayHandle< T, KIn > &keys, const vtkm::cont::ArrayHandle< U, VIn > &values, vtkm::cont::ArrayHandle< T, KOut > &keys_output, vtkm::cont::ArrayHandle< U, VOut > &values_output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1379
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Sort
static void Sort(vtkm::cont::ArrayHandle< T, Storage > &values)
Definition: DeviceAdapterAlgorithmCuda.h:1798
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Unique
static void Unique(vtkm::cont::ArrayHandle< T, Storage > &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1856
VTKM_STATIC_ASSERT_MSG
#define VTKM_STATIC_ASSERT_MSG(condition, message)
Definition: StaticAssert.h:18
VTKM_CONT_EXPORT
#define VTKM_CONT_EXPORT
Definition: vtkm_cont_export.h:44
vtkm::cont::DeviceAdapterAlgorithm::U
static T U
Definition: DeviceAdapterAlgorithm.h:347
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Finalize
vtkm::Id Finalize() const
Definition: DeviceAdapterAlgorithmCuda.h:340
MakeThrustIterator.h
vtkm::cont::ArrayHandle::PrepareForOutput
WritePortalType PrepareForOutput(vtkm::Id numberOfValues, vtkm::cont::DeviceAdapterId device, vtkm::cont::Token &token) const
Prepares (allocates) this array to be used as an output from an operation in the execution environmen...
Definition: ArrayHandle.h:638
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::~CountSetBitsFunctor
~CountSetBitsFunctor()
Definition: DeviceAdapterAlgorithmCuda.h:510
vtkm::cont::DeviceTaskTypes
Class providing a device-specific support for selecting the optimal Task type for a given worklet.
Definition: DeviceAdapterAlgorithm.h:744
vtkm_cont_export.h
VTKM_CONT
#define VTKM_CONT
Definition: ExportMacros.h:57
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::operator()
__device__ void operator()(vtkm::Id wordIdx) const
Definition: DeviceAdapterAlgorithmCuda.h:313
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Schedule
static void Schedule(Hints, Functor functor, vtkm::Id numInstances)
Definition: DeviceAdapterAlgorithmCuda.h:1765
vtkm::Id
vtkm::Int64 Id
Base type to use to index arrays.
Definition: Types.h:227
VTKM_LOG_SCOPE_FUNCTION
#define VTKM_LOG_SCOPE_FUNCTION(level)
Definition: Logging.h:214
vtkm::cont::DeviceAdapterAlgorithm
Struct containing device adapter algorithms.
Definition: DeviceAdapterAlgorithm.h:41
vtkm::CopyFlag::On
@ On
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Reduce
static U Reduce(const vtkm::cont::ArrayHandle< T, SIn > &input, U initialValue, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1337
vtkm::AtomicTypePreferred
vtkm::UInt32 AtomicTypePreferred
The preferred type to use for atomic operations.
Definition: Atomic.h:768
vtkm::cont::cuda::ScheduleParameters::one_d_blocks
int one_d_blocks
Definition: DeviceAdapterAlgorithmCuda.h:83
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetPortal
static vtkm::Id BitFieldToUnorderedSetPortal(const BitsPortal &bits, const IndicesPortal &indices)
Definition: DeviceAdapterAlgorithmCuda.h:1085
ErrorExecution.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Word
vtkm::AtomicTypePreferred Word
Definition: DeviceAdapterAlgorithmCuda.h:499
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusiveByKey
static void ScanInclusiveByKey(const vtkm::cont::ArrayHandle< T, KIn > &keys, const vtkm::cont::ArrayHandle< U, VIn > &values, vtkm::cont::ArrayHandle< U, VOut > &output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1543
vtkm::cont::GetStderrLogLevel
vtkm::cont::LogLevel GetStderrLogLevel()
Get the active highest log level that will be printed to stderr.
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopySubRangePortal
static void CopySubRangePortal(const InputPortal &input, vtkm::Id inputOffset, vtkm::Id size, const OutputPortal &output, vtkm::Id outputOffset)
Definition: DeviceAdapterAlgorithmCuda.h:469
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UpperBounds
static void UpperBounds(const vtkm::cont::ArrayHandle< T, SIn > &input, const vtkm::cont::ArrayHandle< T, SVal > &values, vtkm::cont::ArrayHandle< vtkm::Id, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1872
vtkm::cont::DeviceAdapterAlgorithm::CopySubRange
static bool CopySubRange(const vtkm::cont::ArrayHandle< T, CIn > &input, vtkm::Id inputStartIndex, vtkm::Id numberOfElementsToCopy, vtkm::cont::ArrayHandle< U, COut > &output, vtkm::Id outputIndex=0)
Copy the contents of a section of one ArrayHandle to another.
vtkm::cont::DeviceTaskTypes< vtkm::cont::DeviceAdapterTagCuda >::MakeTask
static vtkm::exec::cuda::internal::TaskStrided1D< WorkletType, InvocationType, Hints > MakeTask(WorkletType &worklet, InvocationType &invocation, vtkm::Id, Hints=Hints{})
Definition: DeviceAdapterAlgorithmCuda.h:1927
vtkm::exec::FunctorBase
Base class for all user worklets invoked in the execution environment from a call to vtkm::cont::Devi...
Definition: FunctorBase.h:30
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusivePortal
static InputPortal::ValueType ScanExclusivePortal(const InputPortal &input, const OutputPortal &output, BinaryFunctor binaryOp, typename InputPortal::ValueType initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:747
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UpperBoundsPortal
static void UpperBoundsPortal(const InputPortal &input, const ValuesPortal &values, const OutputPortal &output, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1040
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Reduce
static U Reduce(const vtkm::cont::ArrayHandle< T, vtkm::cont::StorageTagMultiplexer< SIns... >> &input, U initialValue, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1364
vtkm::Vec< vtkm::Id, 3 >
VTKM_THIRDPARTY_PRE_INCLUDE
#define VTKM_THIRDPARTY_PRE_INCLUDE
Definition: Configure.h:191
vtkm::cont::DeviceTaskTypes< vtkm::cont::DeviceAdapterTagCuda >::MakeTask
static auto MakeTask(WorkletType &worklet, InvocationType &invocation, const RangeType &range)
Definition: DeviceAdapterAlgorithmCuda.h:1940
vtkm::UInt32
uint32_t UInt32
Base type to use for 32-bit unsigned integer numbers.
Definition: Types.h:185
vtkm::cont::BitField
Definition: BitField.h:497
ArrayHandleMultiplexer.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSet
static vtkm::Id BitFieldToUnorderedSet(const vtkm::cont::BitField &bits, vtkm::cont::ArrayHandle< Id, IndicesStorage > &indices)
Definition: DeviceAdapterAlgorithmCuda.h:1119
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusive
static T ScanInclusive(const vtkm::cont::ArrayHandle< T, SIn > &input, vtkm::cont::ArrayHandle< T, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1462
vtkm::NotZeroInitialized
Predicate that takes a single argument x, and returns True if it isn't the identity of the Type T.
Definition: UnaryPredicates.h:32
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ReducePortal
static T ReducePortal(const InputPortal &input, T initialValue)
Definition: DeviceAdapterAlgorithmCuda.h:624
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Copy
static void Copy(const vtkm::cont::ArrayHandle< T, SIn > &input, vtkm::cont::ArrayHandle< U, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1141
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusive
static T ScanExclusive(const vtkm::cont::ArrayHandle< T, SIn > &input, vtkm::cont::ArrayHandle< T, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1411
vtkm::cont::BitField::PrepareForInput
ReadPortalType PrepareForInput(vtkm::cont::DeviceAdapterId device, vtkm::cont::Token &token) const
Prepares this BitField to be used as an input to an operation in the execution environment.
vtkm::UInt64
unsigned long long UInt64
Base type to use for 64-bit signed integer numbers.
Definition: Types.h:207
vtkm::Int32
int32_t Int32
Base type to use for 32-bit signed integer numbers.
Definition: Types.h:181
vtkm::cont::ArrayHandle::PrepareForInput
ReadPortalType PrepareForInput(vtkm::cont::DeviceAdapterId device, vtkm::cont::Token &token) const
Prepares this array to be used as an input to an operation in the execution environment.
Definition: ArrayHandle.h:599
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Superclass
vtkm::cont::internal::DeviceAdapterAlgorithmGeneral< vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >, vtkm::cont::DeviceAdapterTagCuda > Superclass
Definition: DeviceAdapterAlgorithmCuda.h:275
vtkm::cont::cuda::ScheduleParameters::three_d_threads_per_block
dim3 three_d_threads_per_block
Definition: DeviceAdapterAlgorithmCuda.h:90
vtkm::TypeTraits::ZeroInitialization
static T ZeroInitialization()
A static function that returns 0 (or the closest equivalent to it) for the given type.
Definition: TypeTraits.h:77
vtkm::cont::DeviceAdapterAlgorithm::Reduce
static U Reduce(const vtkm::cont::ArrayHandle< T, CIn > &input, U initialValue)
Compute a accumulated sum operation on the input ArrayHandle.
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::GlobalPopCount
GlobalPopCountType * GlobalPopCount
Definition: DeviceAdapterAlgorithmCuda.h:400
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::LowerBoundsPortal
static void LowerBoundsPortal(const InputPortal &input, const ValuesPortal &values, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:581
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::LowerBoundsPortal
static void LowerBoundsPortal(const InputPortal &input, const OutputPortal &values_output)
Definition: DeviceAdapterAlgorithmCuda.h:590
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Synchronize
static void Synchronize()
Definition: DeviceAdapterAlgorithmCuda.h:1912
vtkm::cont::ArrayHandle::Allocate
void Allocate(vtkm::Id numberOfValues, vtkm::CopyFlag preserve, vtkm::cont::Token &token) const
Allocates an array large enough to hold the given number of values.
Definition: ArrayHandle.h:490
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::LowerBounds
static void LowerBounds(const vtkm::cont::ArrayHandle< T, SIn > &input, const vtkm::cont::ArrayHandle< T, SVal > &values, vtkm::cont::ArrayHandle< vtkm::Id, SOut > &output, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1296
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusivePortal
static InputPortal::ValueType ScanInclusivePortal(const InputPortal &input, const OutputPortal &output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:803
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopyIf
static void CopyIf(const vtkm::cont::ArrayHandle< U, SIn > &input, const vtkm::cont::ArrayHandle< T, SStencil > &stencil, vtkm::cont::ArrayHandle< U, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1158
Logging.h
Logging utilities.
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::Input
const BitsPortal Input
Definition: DeviceAdapterAlgorithmCuda.h:398
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanExclusiveByKeyPortal
static void ScanExclusiveByKeyPortal(const KeysPortal &keys, const ValuesPortal &values, const OutputPortal &output, T initValue, BinaryPredicate binary_predicate, AssociativeOperator binary_operator)
Definition: DeviceAdapterAlgorithmCuda.h:897
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::LowerBounds
static void LowerBounds(const vtkm::cont::ArrayHandle< T, SIn > &input, const vtkm::cont::ArrayHandle< T, SVal > &values, vtkm::cont::ArrayHandle< vtkm::Id, SOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1282
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScheduleTask
static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D< WType, IType, Hints > &functor, vtkm::Id3 rangeMax)
Definition: DeviceAdapterAlgorithmCuda.h:1726
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopyPortal
static void CopyPortal(const InputPortal &input, const OutputPortal &output)
Definition: DeviceAdapterAlgorithmCuda.h:409
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopyIfPortal
static vtkm::Id CopyIfPortal(ValueIterator valuesBegin, ValueIterator valuesEnd, StencilPortal stencil, OutputPortal output, UnaryPredicate unary_predicate)
Definition: DeviceAdapterAlgorithmCuda.h:425
vtkm::cont::cuda::ScheduleParameters::two_d_blocks
int two_d_blocks
Definition: DeviceAdapterAlgorithmCuda.h:86
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsFunctor::Finalize
vtkm::Id Finalize() const
Definition: DeviceAdapterAlgorithmCuda.h:535
ErrorCuda.h
Token.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::SortByKey
static void SortByKey(vtkm::cont::ArrayHandle< T, StorageT > &keys, vtkm::cont::ArrayHandle< U, StorageU > &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1828
VTKM_SUPPRESS_EXEC_WARNINGS
#define VTKM_SUPPRESS_EXEC_WARNINGS
Definition: ExportMacros.h:53
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::SortByKeyPortal
static void SortByKeyPortal(const KeysPortal &keys, const ValuesPortal &values)
Definition: DeviceAdapterAlgorithmCuda.h:954
vtkm::cont::cuda::InitScheduleParameters
void InitScheduleParameters(vtkm::cont::cuda::ScheduleParameters(*)(char const *name, int major, int minor, int multiProcessorCount, int maxThreadsPerMultiProcessor, int maxThreadsPerBlock))
Specify the custom scheduling to use for VTK-m CUDA kernel launches.
vtkm::cont::DeviceAdapterAlgorithm::VOut
static T VOut
Definition: DeviceAdapterAlgorithm.h:350
TaskStrided.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::BitFieldToUnorderedSetFunctor::~BitFieldToUnorderedSetFunctor
~BitFieldToUnorderedSetFunctor()
Definition: DeviceAdapterAlgorithmCuda.h:304
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusiveByKey
static void ScanInclusiveByKey(const vtkm::cont::ArrayHandle< T, KIn > &keys, const vtkm::cont::ArrayHandle< U, VIn > &values, vtkm::cont::ArrayHandle< U, VOut > &output)
Definition: DeviceAdapterAlgorithmCuda.h:1511
vtkm::cont::LogLevel::Perf
@ Perf
General timing data and algorithm flow information, such as filter execution, worklet dispatches,...
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CountSetBitsPortal
static vtkm::Id CountSetBitsPortal(const BitsPortal &bits)
Definition: DeviceAdapterAlgorithmCuda.h:1101
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::SortByKey
static void SortByKey(vtkm::cont::ArrayHandle< T, StorageT > &keys, vtkm::cont::ArrayHandle< U, StorageU > &values)
Definition: DeviceAdapterAlgorithmCuda.h:1817
vtkm::CountSetBits
vtkm::Int32 CountSetBits(vtkm::UInt32 word)
Count the total number of bits set in word.
Definition: Math.h:2940
WrappedOperators.h
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::CopyIfPortal
static vtkm::Id CopyIfPortal(ValuePortal values, StencilPortal stencil, OutputPortal output, UnaryPredicate unary_predicate)
Definition: DeviceAdapterAlgorithmCuda.h:456
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ScanInclusive
static T ScanInclusive(const vtkm::cont::ArrayHandle< T, SIn > &input, vtkm::cont::ArrayHandle< T, SOut > &output, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:1485
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::ReducePortal
static T ReducePortal(const InputPortal &input, T initialValue, BinaryFunctor binary_functor)
Definition: DeviceAdapterAlgorithmCuda.h:630
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::UniquePortal
static vtkm::Id UniquePortal(const ValuesPortal values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1000
vtkm::cont::DeviceAdapterAlgorithm< vtkm::cont::DeviceAdapterTagCuda >::Sort
static void Sort(vtkm::cont::ArrayHandle< T, Storage > &values, BinaryCompare binary_compare)
Definition: DeviceAdapterAlgorithmCuda.h:1807
vtkm::cont::StorageTagMultiplexer
Definition: ArrayHandleMultiplexer.h:141