10 #ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h 
   11 #define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h 
   25 #include <vtkm/cont/vtkm_cont_export.h> 
   42 VTKM_THIRDPARTY_PRE_INCLUDE
 
   44 #include <vtkm/exec/cuda/internal/ExecutionPolicy.h> 
   46 #include <cooperative_groups.h> 
   48 #include <thrust/advance.h> 
   49 #include <thrust/binary_search.h> 
   50 #include <thrust/copy.h> 
   51 #include <thrust/count.h> 
   52 #include <thrust/iterator/counting_iterator.h> 
   53 #include <thrust/scan.h> 
   54 #include <thrust/sort.h> 
   55 #include <thrust/system/cpp/memory.h> 
   56 #include <thrust/system/cuda/vector.h> 
   57 #include <thrust/unique.h> 
   59 VTKM_THIRDPARTY_POST_INCLUDE
 
  131                                            int multiProcessorCount,
 
  132                                            int maxThreadsPerMultiProcessor,
 
  133                                            int maxThreadsPerBlock));
 
  138 #if (defined(VTKM_GCC) || defined(VTKM_CLANG)) 
  139 #pragma GCC diagnostic push 
  140 #pragma GCC diagnostic ignored "-Wunused-parameter" 
  143 template <
typename TaskType>
 
  144 __global__ 
void TaskStrided1DLaunch(TaskType task, 
vtkm::Id size)
 
  148   const vtkm::Id start = blockIdx.x * blockDim.x + threadIdx.x;
 
  149   const vtkm::Id inc = blockDim.x * gridDim.x;
 
  150   task(start, size, inc);
 
  153 template <
typename TaskType>
 
  154 __global__ 
void TaskStrided3DLaunch(TaskType task, 
vtkm::Id3 size)
 
  157   const dim3 start(blockIdx.x * blockDim.x + threadIdx.x,
 
  158                    blockIdx.y * blockDim.y + threadIdx.y,
 
  159                    blockIdx.z * blockDim.z + threadIdx.z);
 
  160   const dim3 inc(blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z);
 
  162   for (
vtkm::Id k = start.z; k < size[2]; k += inc.z)
 
  164     for (
vtkm::Id j = start.y; j < size[1]; j += inc.y)
 
  166       task(size, start.x, size[0], inc.x, j, k);
 
  171 template <
typename T, 
typename BinaryOperationType>
 
  172 __global__ 
void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
 
  174   result = binary_op(a, b);
 
  177 #if (defined(VTKM_GCC) || defined(VTKM_CLANG)) 
  178 #pragma GCC diagnostic pop 
  181 template <
typename FunctorType, 
typename ArgType>
 
  182 struct FunctorSupportsUnaryImpl
 
  184   template <typename F, typename A, typename = decltype(std::declval<F>()(std::declval<A>()))>
 
  185   static std::true_type has(
int);
 
  186   template <
typename F, 
typename A>
 
  187   static std::false_type has(...);
 
  188   using type = decltype(has<FunctorType, ArgType>(0));
 
  190 template <
typename FunctorType, 
typename ArgType>
 
  191 using FunctorSupportsUnary = 
typename FunctorSupportsUnaryImpl<FunctorType, ArgType>::type;
 
  193 template <
typename PortalType,
 
  194           typename BinaryAndUnaryFunctor,
 
  195           typename = FunctorSupportsUnary<BinaryAndUnaryFunctor, typename PortalType::ValueType>>
 
  198 template <
typename PortalType, 
typename BinaryAndUnaryFunctor>
 
  199 struct CastPortal<PortalType, BinaryAndUnaryFunctor, std::true_type>
 
  201   using InputType = 
typename PortalType::ValueType;
 
  202   using ValueType = decltype(std::declval<BinaryAndUnaryFunctor>()(std::declval<InputType>()));
 
  205   BinaryAndUnaryFunctor Functor;
 
  208   CastPortal(
const PortalType& portal, 
const BinaryAndUnaryFunctor& functor)
 
  215   vtkm::Id GetNumberOfValues()
 const { 
return this->Portal.GetNumberOfValues(); }
 
  218   ValueType 
Get(
vtkm::Id index)
 const { 
return this->Functor(this->Portal.Get(index)); }
 
  221 template <
typename PortalType, 
typename BinaryFunctor>
 
  222 struct CastPortal<PortalType, BinaryFunctor, std::false_type>
 
  224   using InputType = 
typename PortalType::ValueType;
 
  226     decltype(std::declval<BinaryFunctor>()(std::declval<InputType>(), std::declval<InputType>()));
 
  231   CastPortal(
const PortalType& portal, 
const BinaryFunctor&)
 
  237   vtkm::Id GetNumberOfValues()
 const { 
return this->Portal.GetNumberOfValues(); }
 
  240   ValueType 
Get(
vtkm::Id index)
 const { 
return static_cast<ValueType
>(this->Portal.Get(index)); }
 
  243 struct CudaFreeFunctor
 
  245   void operator()(
void* ptr)
 const { 
VTKM_CUDA_CALL(cudaFree(ptr)); }
 
  248 template <
typename T>
 
  249 using CudaUniquePtr = std::unique_ptr<T, CudaFreeFunctor>;
 
  251 template <
typename T>
 
  252 CudaUniquePtr<T> make_CudaUniquePtr(std::size_t numElements)
 
  256   return CudaUniquePtr<T>(ptr);
 
  263   : vtkm::cont::internal::DeviceAdapterAlgorithmGeneral<
 
  264       vtkm::cont::DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>,
 
  265       vtkm::cont::DeviceAdapterTagCuda>
 
  273   using Superclass = vtkm::cont::internal::DeviceAdapterAlgorithmGeneral<
 
  275     vtkm::cont::DeviceAdapterTagCuda>;
 
  277   template <
typename BitsPortal, 
typename IndicesPortal, 
typename GlobalPopCountType>
 
  281                                             std::is_same<GlobalPopCountType, vtkm::UInt32>::value ||
 
  282                                             std::is_same<GlobalPopCountType, vtkm::UInt64>::value),
 
  283                            "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
 
  290       VTKM_PASS_COMMAS(std::is_same<typename IndicesPortal::ValueType, vtkm::Id>::value));
 
  294                                   const IndicesPortal& output,
 
  295                                   GlobalPopCountType* globalPopCount)
 
  298       , GlobalPopCount{ globalPopCount }
 
  299       , FinalWordIndex{ input.GetNumberOfWords() - 1 }
 
  300       , FinalWordMask(input.GetFinalWordMask())
 
  308       assert(this->GlobalPopCount != 
nullptr);
 
  309       VTKM_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0, 
sizeof(GlobalPopCountType)));
 
  315       Word word = this->Input.GetWord(wordIdx);
 
  318       const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~
Word{ 0 };
 
  325         this->ReduceAllocate();
 
  327         vtkm::Id firstBitIdx = wordIdx * 
sizeof(
Word) * CHAR_BIT;
 
  332           vtkm::Id outIdx = this->GetNextOutputIndex();
 
  334           this->Output.Set(outIdx, firstBitIdx + bit);
 
  342       assert(this->GlobalPopCount != 
nullptr);
 
  343       GlobalPopCountType result;
 
  345         &result, this->GlobalPopCount, 
sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
 
  346       return static_cast<vtkm::Id>(result);
 
  357       const auto activeLanes = cooperative_groups::coalesced_threads();
 
  358       const int activeRank = activeLanes.thread_rank();
 
  359       const int activeSize = activeLanes.size();
 
  363       for (
int delta = 1; delta < activeSize; delta *= 2)
 
  365         const vtkm::Int32 shflVal = activeLanes.shfl_down(rVal, delta);
 
  366         if (activeRank + delta < activeSize)
 
  374         this->AllocationHead =
 
  375           atomicAdd(this->GlobalPopCount, 
static_cast<GlobalPopCountType
>(rVal));
 
  378       this->AllocationHead = activeLanes.shfl(this->AllocationHead, 0);
 
  388       const auto activeLanes = cooperative_groups::coalesced_threads();
 
  389       const int activeRank = activeLanes.thread_rank();
 
  390       const int activeSize = activeLanes.size();
 
  392       vtkm::Id nextIdx = 
static_cast<vtkm::Id>(this->AllocationHead + activeRank);
 
  393       this->AllocationHead += activeSize;
 
  401     mutable vtkm::UInt64 AllocationHead{ 0 };
 
  408   template <
class InputPortal, 
class OutputPortal>
 
  413       ::thrust::copy(ThrustCudaPolicyPerThread,
 
  414                      cuda::internal::IteratorBegin(input),
 
  415                      cuda::internal::IteratorEnd(input),
 
  416                      cuda::internal::IteratorBegin(output));
 
  420       cuda::internal::throwAsVTKmException();
 
  424   template <
class ValueIterator, 
class StencilPortal, 
class OutputPortal, 
class UnaryPredicate>
 
  426                                          ValueIterator valuesEnd,
 
  427                                          StencilPortal stencil,
 
  429                                          UnaryPredicate unary_predicate)
 
  431     auto outputBegin = cuda::internal::IteratorBegin(output);
 
  433     using ValueType = 
typename StencilPortal::ValueType;
 
  435     vtkm::exec::cuda::internal::WrappedUnaryPredicate<ValueType, UnaryPredicate> up(
 
  440       auto newLast = ::thrust::copy_if(ThrustCudaPolicyPerThread,
 
  443                                        cuda::internal::IteratorBegin(stencil),
 
  446       return static_cast<vtkm::Id>(::thrust::distance(outputBegin, newLast));
 
  450       cuda::internal::throwAsVTKmException();
 
  455   template <
class ValuePortal, 
class StencilPortal, 
class OutputPortal, 
class UnaryPredicate>
 
  457                                          StencilPortal stencil,
 
  459                                          UnaryPredicate unary_predicate)
 
  461     return CopyIfPortal(cuda::internal::IteratorBegin(values),
 
  462                         cuda::internal::IteratorEnd(values),
 
  468   template <
class InputPortal, 
class OutputPortal>
 
  472                                            const OutputPortal& output,
 
  477       ::thrust::copy_n(ThrustCudaPolicyPerThread,
 
  478                        cuda::internal::IteratorBegin(input) + inputOffset,
 
  479                        static_cast<std::size_t
>(size),
 
  480                        cuda::internal::IteratorBegin(output) + outputOffset);
 
  484       cuda::internal::throwAsVTKmException();
 
  489   template <
typename BitsPortal, 
typename GlobalPopCountType>
 
  493                                             std::is_same<GlobalPopCountType, vtkm::UInt32>::value ||
 
  494                                             std::is_same<GlobalPopCountType, vtkm::UInt64>::value),
 
  495                            "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
 
  504       , GlobalPopCount{ globalPopCount }
 
  505       , FinalWordIndex{ portal.GetNumberOfWords() - 1 }
 
  506       , FinalWordMask{ portal.GetFinalWordMask() }
 
  514       assert(this->GlobalPopCount != 
nullptr);
 
  515       VTKM_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0, 
sizeof(GlobalPopCountType)));
 
  521       Word word = this->Portal.GetWord(wordIdx);
 
  524       const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~
Word{ 0 };
 
  537       assert(this->GlobalPopCount != 
nullptr);
 
  538       GlobalPopCountType result;
 
  540         &result, this->GlobalPopCount, 
sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
 
  541       return static_cast<vtkm::Id>(result);
 
  551       const auto activeLanes = cooperative_groups::coalesced_threads();
 
  552       const int activeRank = activeLanes.thread_rank();
 
  553       const int activeSize = activeLanes.size();
 
  557       for (
int delta = 1; delta < activeSize; delta *= 2)
 
  559         const vtkm::Int32 shflVal = activeLanes.shfl_down(rVal, delta);
 
  560         if (activeRank + delta < activeSize)
 
  568         atomicAdd(this->GlobalPopCount, 
static_cast<GlobalPopCountType
>(rVal));
 
  580   template <
class InputPortal, 
class ValuesPortal, 
class OutputPortal>
 
  582                                           const ValuesPortal& values,
 
  583                                           const OutputPortal& output)
 
  585     using ValueType = 
typename ValuesPortal::ValueType;
 
  586     LowerBoundsPortal(input, values, output, ::thrust::less<ValueType>());
 
  589   template <
class InputPortal, 
class OutputPortal>
 
  591                                           const OutputPortal& values_output)
 
  593     using ValueType = 
typename InputPortal::ValueType;
 
  594     LowerBoundsPortal(input, values_output, values_output, ::thrust::less<ValueType>());
 
  597   template <
class InputPortal, 
class ValuesPortal, 
class OutputPortal, 
class BinaryCompare>
 
  599                                           const ValuesPortal& values,
 
  600                                           const OutputPortal& output,
 
  601                                           BinaryCompare binary_compare)
 
  603     using ValueType = 
typename InputPortal::ValueType;
 
  604     vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
 
  609       ::thrust::lower_bound(ThrustCudaPolicyPerThread,
 
  610                             cuda::internal::IteratorBegin(input),
 
  611                             cuda::internal::IteratorEnd(input),
 
  612                             cuda::internal::IteratorBegin(values),
 
  613                             cuda::internal::IteratorEnd(values),
 
  614                             cuda::internal::IteratorBegin(output),
 
  619       cuda::internal::throwAsVTKmException();
 
  623   template <
class InputPortal, 
typename T>
 
  626     return ReducePortal(input, initialValue, ::thrust::plus<T>());
 
  629   template <
class InputPortal, 
typename T, 
class BinaryFunctor>
 
  632                                   BinaryFunctor binary_functor)
 
  634     using fast_path = std::is_same<typename InputPortal::ValueType, T>;
 
  635     return ReducePortalImpl(input, initialValue, binary_functor, fast_path());
 
  638   template <
class InputPortal, 
typename T, 
class BinaryFunctor>
 
  641                                       BinaryFunctor binary_functor,
 
  646     vtkm::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);
 
  650       return ::thrust::reduce(ThrustCudaPolicyPerThread,
 
  651                               cuda::internal::IteratorBegin(input),
 
  652                               cuda::internal::IteratorEnd(input),
 
  658       cuda::internal::throwAsVTKmException();
 
  664   template <
class InputPortal, 
typename T, 
class BinaryFunctor>
 
  667                                       BinaryFunctor binary_functor,
 
  673     vtkm::cont::cuda::internal::CastPortal<InputPortal, BinaryFunctor> castPortal(input,
 
  676     vtkm::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);
 
  680       return ::thrust::reduce(ThrustCudaPolicyPerThread,
 
  681                               cuda::internal::IteratorBegin(castPortal),
 
  682                               cuda::internal::IteratorEnd(castPortal),
 
  688       cuda::internal::throwAsVTKmException();
 
  694   template <
class KeysPortal,
 
  696             class KeysOutputPortal,
 
  697             class ValueOutputPortal,
 
  700                                               const ValuesPortal& values,
 
  701                                               const KeysOutputPortal& keys_output,
 
  702                                               const ValueOutputPortal& values_output,
 
  703                                               BinaryFunctor binary_functor)
 
  705     auto keys_out_begin = cuda::internal::IteratorBegin(keys_output);
 
  706     auto values_out_begin = cuda::internal::IteratorBegin(values_output);
 
  708     ::thrust::pair<decltype(keys_out_begin), decltype(values_out_begin)> result_iterators;
 
  710     ::thrust::equal_to<typename KeysPortal::ValueType> binaryPredicate;
 
  712     using ValueType = 
typename ValuesPortal::ValueType;
 
  713     vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binary_functor);
 
  717       result_iterators = ::thrust::reduce_by_key(vtkm_cuda_policy(),
 
  718                                                  cuda::internal::IteratorBegin(keys),
 
  719                                                  cuda::internal::IteratorEnd(keys),
 
  720                                                  cuda::internal::IteratorBegin(values),
 
  728       cuda::internal::throwAsVTKmException();
 
  731     return static_cast<vtkm::Id>(::thrust::distance(keys_out_begin, result_iterators.first));
 
  734   template <
class InputPortal, 
class OutputPortal>
 
  736                                                                        const OutputPortal& output)
 
  738     using ValueType = 
typename OutputPortal::ValueType;
 
  740     return ScanExclusivePortal(input,
 
  742                                (::thrust::plus<ValueType>()),
 
  746   template <
class InputPortal, 
class OutputPortal, 
class BinaryFunctor>
 
  748     const InputPortal& input,
 
  749     const OutputPortal& output,
 
  750     BinaryFunctor binaryOp,
 
  751     typename InputPortal::ValueType initialValue)
 
  755     using ValueType = 
typename OutputPortal::ValueType;
 
  759     ::thrust::system::cuda::vector<ValueType> sum(3);
 
  767         ThrustCudaPolicyPerThread, cuda::internal::IteratorEnd(input) - 1, 1, sum.begin());
 
  769       vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binaryOp);
 
  771       auto end = ::thrust::exclusive_scan(ThrustCudaPolicyPerThread,
 
  772                                           cuda::internal::IteratorBegin(input),
 
  773                                           cuda::internal::IteratorEnd(input),
 
  774                                           cuda::internal::IteratorBegin(output),
 
  781       ::thrust::copy_n(ThrustCudaPolicyPerThread, (end - 1), 1, sum.begin() + 1);
 
  784       cuda::internal::SumExclusiveScan<<<1, 1, 0, cudaStreamPerThread>>>(
 
  785         sum[0], sum[1], sum[2], bop);
 
  789       cuda::internal::throwAsVTKmException();
 
  794   template <
class InputPortal, 
class OutputPortal>
 
  796                                                                        const OutputPortal& output)
 
  798     using ValueType = 
typename OutputPortal::ValueType;
 
  799     return ScanInclusivePortal(input, output, ::thrust::plus<ValueType>());
 
  802   template <
class InputPortal, 
class OutputPortal, 
class BinaryFunctor>
 
  804                                                                        const OutputPortal& output,
 
  805                                                                        BinaryFunctor binary_functor)
 
  807     using ValueType = 
typename OutputPortal::ValueType;
 
  808     vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binary_functor);
 
  812       ::thrust::system::cuda::vector<ValueType> result(1);
 
  813       auto end = ::thrust::inclusive_scan(ThrustCudaPolicyPerThread,
 
  814                                           cuda::internal::IteratorBegin(input),
 
  815                                           cuda::internal::IteratorEnd(input),
 
  816                                           cuda::internal::IteratorBegin(output),
 
  819       ::thrust::copy_n(ThrustCudaPolicyPerThread, end - 1, 1, result.begin());
 
  824       cuda::internal::throwAsVTKmException();
 
  825       return typename InputPortal::ValueType();
 
  831   template <
typename KeysPortal, 
typename ValuesPortal, 
typename OutputPortal>
 
  833                                                  const ValuesPortal& values,
 
  834                                                  const OutputPortal& output)
 
  836     using KeyType = 
typename KeysPortal::ValueType;
 
  837     using ValueType = 
typename OutputPortal::ValueType;
 
  838     ScanInclusiveByKeyPortal(
 
  839       keys, values, output, ::thrust::equal_to<KeyType>(), ::thrust::plus<ValueType>());
 
  842   template <
typename KeysPortal,
 
  843             typename ValuesPortal,
 
  844             typename OutputPortal,
 
  845             typename BinaryPredicate,
 
  846             typename AssociativeOperator>
 
  848                                                  const ValuesPortal& values,
 
  849                                                  const OutputPortal& output,
 
  850                                                  BinaryPredicate binary_predicate,
 
  851                                                  AssociativeOperator binary_operator)
 
  853     using KeyType = 
typename KeysPortal::ValueType;
 
  854     vtkm::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
 
  856     using ValueType = 
typename OutputPortal::ValueType;
 
  857     vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
 
  862       ::thrust::inclusive_scan_by_key(ThrustCudaPolicyPerThread,
 
  863                                       cuda::internal::IteratorBegin(keys),
 
  864                                       cuda::internal::IteratorEnd(keys),
 
  865                                       cuda::internal::IteratorBegin(values),
 
  866                                       cuda::internal::IteratorBegin(output),
 
  872       cuda::internal::throwAsVTKmException();
 
  876   template <
typename KeysPortal, 
typename ValuesPortal, 
typename OutputPortal>
 
  878                                                  const ValuesPortal& values,
 
  879                                                  const OutputPortal& output)
 
  881     using KeyType = 
typename KeysPortal::ValueType;
 
  882     using ValueType = 
typename OutputPortal::ValueType;
 
  883     ScanExclusiveByKeyPortal(keys,
 
  887                              ::thrust::equal_to<KeyType>(),
 
  888                              ::thrust::plus<ValueType>());
 
  891   template <
typename KeysPortal,
 
  892             typename ValuesPortal,
 
  893             typename OutputPortal,
 
  895             typename BinaryPredicate,
 
  896             typename AssociativeOperator>
 
  898                                                  const ValuesPortal& values,
 
  899                                                  const OutputPortal& output,
 
  901                                                  BinaryPredicate binary_predicate,
 
  902                                                  AssociativeOperator binary_operator)
 
  904     using KeyType = 
typename KeysPortal::ValueType;
 
  905     vtkm::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
 
  907     using ValueType = 
typename OutputPortal::ValueType;
 
  908     vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
 
  912       ::thrust::exclusive_scan_by_key(ThrustCudaPolicyPerThread,
 
  913                                       cuda::internal::IteratorBegin(keys),
 
  914                                       cuda::internal::IteratorEnd(keys),
 
  915                                       cuda::internal::IteratorBegin(values),
 
  916                                       cuda::internal::IteratorBegin(output),
 
  923       cuda::internal::throwAsVTKmException();
 
  927   template <
class ValuesPortal>
 
  930     using ValueType = 
typename ValuesPortal::ValueType;
 
  931     SortPortal(values, ::thrust::less<ValueType>());
 
  934   template <
class ValuesPortal, 
class BinaryCompare>
 
  937     using ValueType = 
typename ValuesPortal::ValueType;
 
  938     vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
 
  942       ::thrust::sort(vtkm_cuda_policy(),
 
  943                      cuda::internal::IteratorBegin(values),
 
  944                      cuda::internal::IteratorEnd(values),
 
  949       cuda::internal::throwAsVTKmException();
 
  953   template <
class KeysPortal, 
class ValuesPortal>
 
  956     using ValueType = 
typename KeysPortal::ValueType;
 
  957     SortByKeyPortal(keys, values, ::thrust::less<ValueType>());
 
  960   template <
class KeysPortal, 
class ValuesPortal, 
class BinaryCompare>
 
  962                                         const ValuesPortal& values,
 
  963                                         BinaryCompare binary_compare)
 
  965     using ValueType = 
typename KeysPortal::ValueType;
 
  966     vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
 
  970       ::thrust::sort_by_key(vtkm_cuda_policy(),
 
  971                             cuda::internal::IteratorBegin(keys),
 
  972                             cuda::internal::IteratorEnd(keys),
 
  973                             cuda::internal::IteratorBegin(values),
 
  978       cuda::internal::throwAsVTKmException();
 
  982   template <
class ValuesPortal>
 
  987       auto begin = cuda::internal::IteratorBegin(values);
 
  989         ::thrust::unique(ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values));
 
  990       return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
 
  994       cuda::internal::throwAsVTKmException();
 
  999   template <
class ValuesPortal, 
class BinaryCompare>
 
 1002     using ValueType = 
typename ValuesPortal::ValueType;
 
 1003     vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
 
 1007       auto begin = cuda::internal::IteratorBegin(values);
 
 1008       auto newLast = ::thrust::unique(
 
 1009         ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values), bop);
 
 1010       return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
 
 1014       cuda::internal::throwAsVTKmException();
 
 1019   template <
class InputPortal, 
class ValuesPortal, 
class OutputPortal>
 
 1021                                           const ValuesPortal& values,
 
 1022                                           const OutputPortal& output)
 
 1026       ::thrust::upper_bound(ThrustCudaPolicyPerThread,
 
 1027                             cuda::internal::IteratorBegin(input),
 
 1028                             cuda::internal::IteratorEnd(input),
 
 1029                             cuda::internal::IteratorBegin(values),
 
 1030                             cuda::internal::IteratorEnd(values),
 
 1031                             cuda::internal::IteratorBegin(output));
 
 1035       cuda::internal::throwAsVTKmException();
 
 1039   template <
class InputPortal, 
class ValuesPortal, 
class OutputPortal, 
class BinaryCompare>
 
 1041                                           const ValuesPortal& values,
 
 1042                                           const OutputPortal& output,
 
 1043                                           BinaryCompare binary_compare)
 
 1045     using ValueType = 
typename OutputPortal::ValueType;
 
 1047     vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
 
 1051       ::thrust::upper_bound(ThrustCudaPolicyPerThread,
 
 1052                             cuda::internal::IteratorBegin(input),
 
 1053                             cuda::internal::IteratorEnd(input),
 
 1054                             cuda::internal::IteratorBegin(values),
 
 1055                             cuda::internal::IteratorEnd(values),
 
 1056                             cuda::internal::IteratorBegin(output),
 
 1061       cuda::internal::throwAsVTKmException();
 
 1065   template <
class InputPortal, 
class OutputPortal>
 
 1067                                           const OutputPortal& values_output)
 
 1071       ::thrust::upper_bound(ThrustCudaPolicyPerThread,
 
 1072                             cuda::internal::IteratorBegin(input),
 
 1073                             cuda::internal::IteratorEnd(input),
 
 1074                             cuda::internal::IteratorBegin(values_output),
 
 1075                             cuda::internal::IteratorEnd(values_output),
 
 1076                             cuda::internal::IteratorBegin(values_output));
 
 1080       cuda::internal::throwAsVTKmException();
 
 1084   template <
typename GlobalPopCountType, 
typename BitsPortal, 
typename IndicesPortal>
 
 1086                                                          const IndicesPortal& indices)
 
 1088     using Functor = BitFieldToUnorderedSetFunctor<BitsPortal, IndicesPortal, GlobalPopCountType>;
 
 1091     auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
 
 1092     Functor functor{ bits, indices, globalCount.get() };
 
 1094     functor.Initialize();
 
 1095     Schedule(functor, bits.GetNumberOfWords());
 
 1097     return functor.Finalize();
 
 1100   template <
typename GlobalPopCountType, 
typename BitsPortal>
 
 1103     using Functor = CountSetBitsFunctor<BitsPortal, GlobalPopCountType>;
 
 1106     auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
 
 1107     Functor functor{ bits, globalCount.get() };
 
 1109     functor.Initialize();
 
 1110     Schedule(functor, bits.GetNumberOfWords());
 
 1112     return functor.Finalize();
 
 1118   template <
typename IndicesStorage>
 
 1129       auto bitsPortal = bits.
PrepareForInput(DeviceAdapterTagCuda{}, token);
 
 1130       auto indicesPortal = indices.
PrepareForOutput(numBits, DeviceAdapterTagCuda{}, token);
 
 1133       numBits = BitFieldToUnorderedSetPortal<vtkm::UInt64>(bitsPortal, indicesPortal);
 
 1140   template <
typename T, 
typename U, 
class SIn, 
class SOut>
 
 1157   template <
typename T, 
typename U, 
class SIn, 
class SStencil, 
class SOut>
 
 1176       newSize = CopyIfPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
 
 1185   template <
typename T, 
typename U, 
class SIn, 
class SStencil, 
class SOut, 
class UnaryPredicate>
 
 1189                                UnaryPredicate unary_predicate)
 
 1204       newSize = CopyIfPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
 
 1213   template <
typename T, 
typename U, 
class SIn, 
class SOut>
 
 1225     if (input == output &&
 
 1226         ((outputIndex >= inputStartIndex &&
 
 1227           outputIndex < inputStartIndex + numberOfElementsToCopy) ||
 
 1228          (inputStartIndex >= outputIndex &&
 
 1229           inputStartIndex < outputIndex + numberOfElementsToCopy)))
 
 1234     if (inputStartIndex < 0 || numberOfElementsToCopy < 0 || outputIndex < 0 ||
 
 1235         inputStartIndex >= inSize)
 
 1241     if (inSize < (inputStartIndex + numberOfElementsToCopy))
 
 1243       numberOfElementsToCopy = (inSize - inputStartIndex);
 
 1247     const vtkm::Id copyOutEnd = outputIndex + numberOfElementsToCopy;
 
 1248     if (outSize < copyOutEnd)
 
 1264     CopySubRangePortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
 
 1266                        numberOfElementsToCopy,
 
 1276     auto bitsPortal = bits.
PrepareForInput(DeviceAdapterTagCuda{}, token);
 
 1278     return CountSetBitsPortal<vtkm::UInt64>(bitsPortal);
 
 1281   template <
typename T, 
class SIn, 
class SVal, 
class SOut>
 
 1290     LowerBoundsPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
 
 1295   template <
typename T, 
class SIn, 
class SVal, 
class SOut, 
class BinaryCompare>
 
 1299                                     BinaryCompare binary_compare)
 
 1305     LowerBoundsPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
 
 1311   template <
class SIn, 
class SOut>
 
 1318     LowerBoundsPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
 
 1322   template <
typename T, 
typename U, 
class SIn>
 
 1328     if (numberOfValues <= 0)
 
 1330       return initialValue;
 
 1333     return ReducePortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token), initialValue);
 
 1336   template <
typename T, 
typename U, 
class SIn, 
class BinaryFunctor>
 
 1339                             BinaryFunctor binary_functor)
 
 1344     if (numberOfValues <= 0)
 
 1346       return initialValue;
 
 1349     return ReducePortal(
 
 1350       input.
PrepareForInput(DeviceAdapterTagCuda(), token), initialValue, binary_functor);
 
 1356   template <
typename T, 
typename U, 
typename... SIns>
 
 1361     return Superclass::Reduce(input, initialValue);
 
 1363   template <
typename T, 
typename U, 
typename BinaryFunctor, 
typename... SIns>
 
 1367     BinaryFunctor binary_functor)
 
 1369     return Superclass::Reduce(input, initialValue, binary_functor);
 
 1372   template <
typename T,
 
 1378             class BinaryFunctor>
 
 1383                                     BinaryFunctor binary_functor)
 
 1390     if (numberOfValues <= 0)
 
 1398       reduced_size = ReduceByKeyPortal(
 
 1401         keys_output.
PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
 
 1402         values_output.
PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
 
 1410   template <
typename T, 
class SIn, 
class SOut>
 
 1417     if (numberOfValues <= 0)
 
 1428     auto inputPortal = input.
PrepareForInput(DeviceAdapterTagCuda(), token);
 
 1429     return ScanExclusivePortal(
 
 1430       inputPortal, output.
PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
 
 1433   template <
typename T, 
class SIn, 
class SOut, 
class BinaryFunctor>
 
 1436                                    BinaryFunctor binary_functor,
 
 1437                                    const T& initialValue)
 
 1442     if (numberOfValues <= 0)
 
 1453     auto inputPortal = input.
PrepareForInput(DeviceAdapterTagCuda(), token);
 
 1454     return ScanExclusivePortal(
 
 1461   template <
typename T, 
class SIn, 
class SOut>
 
 1468     if (numberOfValues <= 0)
 
 1479     auto inputPortal = input.
PrepareForInput(DeviceAdapterTagCuda(), token);
 
 1480     return ScanInclusivePortal(
 
 1481       inputPortal, output.
PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
 
 1484   template <
typename T, 
class SIn, 
class SOut, 
class BinaryFunctor>
 
 1487                                    BinaryFunctor binary_functor)
 
 1492     if (numberOfValues <= 0)
 
 1503     auto inputPortal = input.
PrepareForInput(DeviceAdapterTagCuda(), token);
 
 1504     return ScanInclusivePortal(
 
 1510   template <
typename T, 
typename U, 
typename KIn, 
typename VIn, 
typename VOut>
 
 1518     if (numberOfValues <= 0)
 
 1529     auto keysPortal = keys.
PrepareForInput(DeviceAdapterTagCuda(), token);
 
 1530     auto valuesPortal = values.
PrepareForInput(DeviceAdapterTagCuda(), token);
 
 1531     ScanInclusiveByKeyPortal(
 
 1537   template <
typename T,
 
 1542             typename BinaryFunctor>
 
 1546                                            BinaryFunctor binary_functor)
 
 1551     if (numberOfValues <= 0)
 
 1562     auto keysPortal = keys.
PrepareForInput(DeviceAdapterTagCuda(), token);
 
 1563     auto valuesPortal = values.
PrepareForInput(DeviceAdapterTagCuda(), token);
 
 1564     ScanInclusiveByKeyPortal(keysPortal,
 
 1567                              ::thrust::equal_to<T>(),
 
 1571   template <
typename T, 
typename U, 
typename KIn, 
typename VIn, 
typename VOut>
 
 1579     if (numberOfValues <= 0)
 
 1590     auto keysPortal = keys.
PrepareForInput(DeviceAdapterTagCuda(), token);
 
 1591     auto valuesPortal = values.
PrepareForInput(DeviceAdapterTagCuda(), token);
 
 1592     ScanExclusiveByKeyPortal(keysPortal,
 
 1596                              ::thrust::equal_to<T>(),
 
 1600   template <
typename T,
 
 1605             typename BinaryFunctor>
 
 1609                                            const U& initialValue,
 
 1610                                            BinaryFunctor binary_functor)
 
 1615     if (numberOfValues <= 0)
 
 1626     auto keysPortal = keys.
PrepareForInput(DeviceAdapterTagCuda(), token);
 
 1627     auto valuesPortal = values.
PrepareForInput(DeviceAdapterTagCuda(), token);
 
 1628     ScanExclusiveByKeyPortal(keysPortal,
 
 1632                              ::thrust::equal_to<T>(),
 
 1638   struct VTKM_CONT_EXPORT PinnedErrorArray
 
 1640     char* HostPtr = 
nullptr;
 
 1641     char* DevicePtr = 
nullptr;
 
 1646   static const PinnedErrorArray& GetPinnedErrorArray();
 
 1649   static void CheckForErrors(); 
 
 1652   static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
 
 1660   static void GetBlocksAndThreads(
vtkm::UInt32& blocks, dim3& threadsPerBlock, 
const dim3& size);
 
 1663   static void LogKernelLaunch(
const cudaFuncAttributes& func_attrs,
 
 1664                               const std::type_info& worklet_info,
 
 1670   static void LogKernelLaunch(
const cudaFuncAttributes& func_attrs,
 
 1671                               const std::type_info& worklet_info,
 
 1673                               dim3 threadsPerBlock,
 
 1677   template <
typename WType, 
typename IType>
 
 1678   static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>& functor,
 
 1684     if (numInstances < 1)
 
 1691     SetupErrorBuffer(functor);
 
 1694     GetBlocksAndThreads(blocks, threadsPerBlock, numInstances);
 
 1696 #ifdef VTKM_ENABLE_LOGGING 
 1699       using FunctorType = vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>;
 
 1700       cudaFuncAttributes empty_kernel_attrs;
 
 1702                                            cuda::internal::TaskStrided1DLaunch<FunctorType>));
 
 1703       LogKernelLaunch(empty_kernel_attrs, 
typeid(WType), blocks, threadsPerBlock, numInstances);
 
 1707     cuda::internal::TaskStrided1DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
 
 1708       functor, numInstances);
 
 1711   template <
typename WType, 
typename IType>
 
 1712   static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>& functor,
 
 1717     VTKM_ASSERT((rangeMax[0] >= 0) && (rangeMax[1] >= 0) && (rangeMax[2] >= 0));
 
 1718     if ((rangeMax[0] < 1) || (rangeMax[1] < 1) || (rangeMax[2] < 1))
 
 1725     SetupErrorBuffer(functor);
 
 1727     const dim3 ranges(
static_cast<vtkm::UInt32>(rangeMax[0]),
 
 1732     dim3 threadsPerBlock;
 
 1733     GetBlocksAndThreads(blocks, threadsPerBlock, ranges);
 
 1735 #ifdef VTKM_ENABLE_LOGGING 
 1738       using FunctorType = vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>;
 
 1739       cudaFuncAttributes empty_kernel_attrs;
 
 1741                                            cuda::internal::TaskStrided3DLaunch<FunctorType>));
 
 1742       LogKernelLaunch(empty_kernel_attrs, 
typeid(WType), blocks, threadsPerBlock, ranges);
 
 1746     cuda::internal::TaskStrided3DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
 
 1750   template <
class Functor>
 
 1755     vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType> kernel(functor);
 
 1757     ScheduleTask(kernel, numInstances);
 
 1760   template <
class Functor>
 
 1765     vtkm::exec::cuda::internal::TaskStrided3D<Functor, vtkm::internal::NullType> kernel(functor);
 
 1766     ScheduleTask(kernel, rangeMax);
 
 1769   template <
typename T, 
class Storage>
 
 1778   template <
typename T, 
class Storage, 
class BinaryCompare>
 
 1780                              BinaryCompare binary_compare)
 
 1785     SortPortal(values.
PrepareForInPlace(DeviceAdapterTagCuda(), token), binary_compare);
 
 1788   template <
typename T, 
typename U, 
class StorageT, 
class StorageU>
 
 1799   template <
typename T, 
typename U, 
class StorageT, 
class StorageU, 
class BinaryCompare>
 
 1802                                   BinaryCompare binary_compare)
 
 1812   template <
typename T, 
class Storage>
 
 1821       newSize = UniquePortal(values.
PrepareForInPlace(DeviceAdapterTagCuda(), token));
 
 1827   template <
typename T, 
class Storage, 
class BinaryCompare>
 
 1829                                BinaryCompare binary_compare)
 
 1837         UniquePortal(values.
PrepareForInPlace(DeviceAdapterTagCuda(), token), binary_compare);
 
 1843   template <
typename T, 
class SIn, 
class SVal, 
class SOut>
 
 1852     UpperBoundsPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
 
 1857   template <
typename T, 
class SIn, 
class SVal, 
class SOut, 
class BinaryCompare>
 
 1861                                     BinaryCompare binary_compare)
 
 1867     UpperBoundsPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
 
 1873   template <
class SIn, 
class SOut>
 
 1880     UpperBoundsPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
 
 1897   template <
typename WorkletType, 
typename InvocationType>
 
 1898   static vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>
 
 1901     using Task = vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>;
 
 1902     return Task(worklet, invocation);
 
 1905   template <
typename WorkletType, 
typename InvocationType>
 
 1906   static vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>
 
 1909     using Task = vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>;
 
 1910     return Task(worklet, invocation);
 
 1916 #endif //vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h