10 #ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
11 #define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
25 #include <vtkm/cont/vtkm_cont_export.h>
42 VTKM_THIRDPARTY_PRE_INCLUDE
44 #include <vtkm/exec/cuda/internal/ExecutionPolicy.h>
46 #include <cooperative_groups.h>
48 #include <thrust/advance.h>
49 #include <thrust/binary_search.h>
50 #include <thrust/copy.h>
51 #include <thrust/count.h>
52 #include <thrust/iterator/counting_iterator.h>
53 #include <thrust/scan.h>
54 #include <thrust/sort.h>
55 #include <thrust/system/cpp/memory.h>
56 #include <thrust/system/cuda/vector.h>
57 #include <thrust/unique.h>
59 VTKM_THIRDPARTY_POST_INCLUDE
131 int multiProcessorCount,
132 int maxThreadsPerMultiProcessor,
133 int maxThreadsPerBlock));
138 #if (defined(VTKM_GCC) || defined(VTKM_CLANG))
139 #pragma GCC diagnostic push
140 #pragma GCC diagnostic ignored "-Wunused-parameter"
143 template <
typename TaskType>
144 __global__
void TaskStrided1DLaunch(TaskType task,
vtkm::Id size)
148 const vtkm::Id start = blockIdx.x * blockDim.x + threadIdx.x;
149 const vtkm::Id inc = blockDim.x * gridDim.x;
150 task(start, size, inc);
153 template <
typename TaskType>
154 __global__
void TaskStrided3DLaunch(TaskType task,
vtkm::Id3 size)
157 const dim3 start(blockIdx.x * blockDim.x + threadIdx.x,
158 blockIdx.y * blockDim.y + threadIdx.y,
159 blockIdx.z * blockDim.z + threadIdx.z);
160 const dim3 inc(blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z);
162 for (
vtkm::Id k = start.z; k < size[2]; k += inc.z)
164 for (
vtkm::Id j = start.y; j < size[1]; j += inc.y)
166 task(size, start.x, size[0], inc.x, j, k);
171 template <
typename T,
typename BinaryOperationType>
172 __global__
void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
174 result = binary_op(a, b);
177 #if (defined(VTKM_GCC) || defined(VTKM_CLANG))
178 #pragma GCC diagnostic pop
181 template <
typename FunctorType,
typename ArgType>
182 struct FunctorSupportsUnaryImpl
184 template <typename F, typename A, typename = decltype(std::declval<F>()(std::declval<A>()))>
185 static std::true_type has(
int);
186 template <
typename F,
typename A>
187 static std::false_type has(...);
188 using type = decltype(has<FunctorType, ArgType>(0));
190 template <
typename FunctorType,
typename ArgType>
191 using FunctorSupportsUnary =
typename FunctorSupportsUnaryImpl<FunctorType, ArgType>::type;
193 template <
typename PortalType,
194 typename BinaryAndUnaryFunctor,
195 typename = FunctorSupportsUnary<BinaryAndUnaryFunctor, typename PortalType::ValueType>>
198 template <
typename PortalType,
typename BinaryAndUnaryFunctor>
199 struct CastPortal<PortalType, BinaryAndUnaryFunctor, std::true_type>
201 using InputType =
typename PortalType::ValueType;
202 using ValueType = decltype(std::declval<BinaryAndUnaryFunctor>()(std::declval<InputType>()));
205 BinaryAndUnaryFunctor Functor;
208 CastPortal(
const PortalType& portal,
const BinaryAndUnaryFunctor& functor)
215 vtkm::Id GetNumberOfValues()
const {
return this->Portal.GetNumberOfValues(); }
218 ValueType
Get(
vtkm::Id index)
const {
return this->Functor(this->Portal.Get(index)); }
221 template <
typename PortalType,
typename BinaryFunctor>
222 struct CastPortal<PortalType, BinaryFunctor, std::false_type>
224 using InputType =
typename PortalType::ValueType;
226 decltype(std::declval<BinaryFunctor>()(std::declval<InputType>(), std::declval<InputType>()));
231 CastPortal(
const PortalType& portal,
const BinaryFunctor&)
237 vtkm::Id GetNumberOfValues()
const {
return this->Portal.GetNumberOfValues(); }
240 ValueType
Get(
vtkm::Id index)
const {
return static_cast<ValueType
>(this->Portal.Get(index)); }
243 struct CudaFreeFunctor
245 void operator()(
void* ptr)
const {
VTKM_CUDA_CALL(cudaFree(ptr)); }
248 template <
typename T>
249 using CudaUniquePtr = std::unique_ptr<T, CudaFreeFunctor>;
251 template <
typename T>
252 CudaUniquePtr<T> make_CudaUniquePtr(std::size_t numElements)
256 return CudaUniquePtr<T>(ptr);
263 : vtkm::cont::internal::DeviceAdapterAlgorithmGeneral<
264 vtkm::cont::DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>,
265 vtkm::cont::DeviceAdapterTagCuda>
273 using Superclass = vtkm::cont::internal::DeviceAdapterAlgorithmGeneral<
275 vtkm::cont::DeviceAdapterTagCuda>;
277 template <
typename BitsPortal,
typename IndicesPortal,
typename GlobalPopCountType>
281 std::is_same<GlobalPopCountType, vtkm::UInt32>::value ||
282 std::is_same<GlobalPopCountType, vtkm::UInt64>::value),
283 "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
290 VTKM_PASS_COMMAS(std::is_same<typename IndicesPortal::ValueType, vtkm::Id>::value));
294 const IndicesPortal& output,
295 GlobalPopCountType* globalPopCount)
298 , GlobalPopCount{ globalPopCount }
299 , FinalWordIndex{ input.GetNumberOfWords() - 1 }
300 , FinalWordMask(input.GetFinalWordMask())
308 assert(this->GlobalPopCount !=
nullptr);
309 VTKM_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0,
sizeof(GlobalPopCountType)));
315 Word word = this->Input.GetWord(wordIdx);
318 const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~
Word{ 0 };
325 this->ReduceAllocate();
327 vtkm::Id firstBitIdx = wordIdx *
sizeof(
Word) * CHAR_BIT;
332 vtkm::Id outIdx = this->GetNextOutputIndex();
334 this->Output.Set(outIdx, firstBitIdx + bit);
342 assert(this->GlobalPopCount !=
nullptr);
343 GlobalPopCountType result;
345 &result, this->GlobalPopCount,
sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
346 return static_cast<vtkm::Id>(result);
357 const auto activeLanes = cooperative_groups::coalesced_threads();
358 const int activeRank = activeLanes.thread_rank();
359 const int activeSize = activeLanes.size();
363 for (
int delta = 1; delta < activeSize; delta *= 2)
365 const vtkm::Int32 shflVal = activeLanes.shfl_down(rVal, delta);
366 if (activeRank + delta < activeSize)
374 this->AllocationHead =
375 atomicAdd(this->GlobalPopCount,
static_cast<GlobalPopCountType
>(rVal));
378 this->AllocationHead = activeLanes.shfl(this->AllocationHead, 0);
388 const auto activeLanes = cooperative_groups::coalesced_threads();
389 const int activeRank = activeLanes.thread_rank();
390 const int activeSize = activeLanes.size();
392 vtkm::Id nextIdx =
static_cast<vtkm::Id>(this->AllocationHead + activeRank);
393 this->AllocationHead += activeSize;
401 mutable vtkm::UInt64 AllocationHead{ 0 };
408 template <
class InputPortal,
class OutputPortal>
413 ::thrust::copy(ThrustCudaPolicyPerThread,
414 cuda::internal::IteratorBegin(input),
415 cuda::internal::IteratorEnd(input),
416 cuda::internal::IteratorBegin(output));
420 cuda::internal::throwAsVTKmException();
424 template <
class ValueIterator,
class StencilPortal,
class OutputPortal,
class UnaryPredicate>
426 ValueIterator valuesEnd,
427 StencilPortal stencil,
429 UnaryPredicate unary_predicate)
431 auto outputBegin = cuda::internal::IteratorBegin(output);
433 using ValueType =
typename StencilPortal::ValueType;
435 vtkm::exec::cuda::internal::WrappedUnaryPredicate<ValueType, UnaryPredicate> up(
440 auto newLast = ::thrust::copy_if(ThrustCudaPolicyPerThread,
443 cuda::internal::IteratorBegin(stencil),
446 return static_cast<vtkm::Id>(::thrust::distance(outputBegin, newLast));
450 cuda::internal::throwAsVTKmException();
455 template <
class ValuePortal,
class StencilPortal,
class OutputPortal,
class UnaryPredicate>
457 StencilPortal stencil,
459 UnaryPredicate unary_predicate)
461 return CopyIfPortal(cuda::internal::IteratorBegin(values),
462 cuda::internal::IteratorEnd(values),
468 template <
class InputPortal,
class OutputPortal>
472 const OutputPortal& output,
477 ::thrust::copy_n(ThrustCudaPolicyPerThread,
478 cuda::internal::IteratorBegin(input) + inputOffset,
479 static_cast<std::size_t
>(size),
480 cuda::internal::IteratorBegin(output) + outputOffset);
484 cuda::internal::throwAsVTKmException();
489 template <
typename BitsPortal,
typename GlobalPopCountType>
493 std::is_same<GlobalPopCountType, vtkm::UInt32>::value ||
494 std::is_same<GlobalPopCountType, vtkm::UInt64>::value),
495 "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
504 , GlobalPopCount{ globalPopCount }
505 , FinalWordIndex{ portal.GetNumberOfWords() - 1 }
506 , FinalWordMask{ portal.GetFinalWordMask() }
514 assert(this->GlobalPopCount !=
nullptr);
515 VTKM_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0,
sizeof(GlobalPopCountType)));
521 Word word = this->Portal.GetWord(wordIdx);
524 const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~
Word{ 0 };
537 assert(this->GlobalPopCount !=
nullptr);
538 GlobalPopCountType result;
540 &result, this->GlobalPopCount,
sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
541 return static_cast<vtkm::Id>(result);
551 const auto activeLanes = cooperative_groups::coalesced_threads();
552 const int activeRank = activeLanes.thread_rank();
553 const int activeSize = activeLanes.size();
557 for (
int delta = 1; delta < activeSize; delta *= 2)
559 const vtkm::Int32 shflVal = activeLanes.shfl_down(rVal, delta);
560 if (activeRank + delta < activeSize)
568 atomicAdd(this->GlobalPopCount,
static_cast<GlobalPopCountType
>(rVal));
580 template <
class InputPortal,
class ValuesPortal,
class OutputPortal>
582 const ValuesPortal& values,
583 const OutputPortal& output)
585 using ValueType =
typename ValuesPortal::ValueType;
586 LowerBoundsPortal(input, values, output, ::thrust::less<ValueType>());
589 template <
class InputPortal,
class OutputPortal>
591 const OutputPortal& values_output)
593 using ValueType =
typename InputPortal::ValueType;
594 LowerBoundsPortal(input, values_output, values_output, ::thrust::less<ValueType>());
597 template <
class InputPortal,
class ValuesPortal,
class OutputPortal,
class BinaryCompare>
599 const ValuesPortal& values,
600 const OutputPortal& output,
601 BinaryCompare binary_compare)
603 using ValueType =
typename InputPortal::ValueType;
604 vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
609 ::thrust::lower_bound(ThrustCudaPolicyPerThread,
610 cuda::internal::IteratorBegin(input),
611 cuda::internal::IteratorEnd(input),
612 cuda::internal::IteratorBegin(values),
613 cuda::internal::IteratorEnd(values),
614 cuda::internal::IteratorBegin(output),
619 cuda::internal::throwAsVTKmException();
623 template <
class InputPortal,
typename T>
626 return ReducePortal(input, initialValue, ::thrust::plus<T>());
629 template <
class InputPortal,
typename T,
class BinaryFunctor>
632 BinaryFunctor binary_functor)
634 using fast_path = std::is_same<typename InputPortal::ValueType, T>;
635 return ReducePortalImpl(input, initialValue, binary_functor, fast_path());
638 template <
class InputPortal,
typename T,
class BinaryFunctor>
641 BinaryFunctor binary_functor,
646 vtkm::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);
650 return ::thrust::reduce(ThrustCudaPolicyPerThread,
651 cuda::internal::IteratorBegin(input),
652 cuda::internal::IteratorEnd(input),
658 cuda::internal::throwAsVTKmException();
664 template <
class InputPortal,
typename T,
class BinaryFunctor>
667 BinaryFunctor binary_functor,
673 vtkm::cont::cuda::internal::CastPortal<InputPortal, BinaryFunctor> castPortal(input,
676 vtkm::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);
680 return ::thrust::reduce(ThrustCudaPolicyPerThread,
681 cuda::internal::IteratorBegin(castPortal),
682 cuda::internal::IteratorEnd(castPortal),
688 cuda::internal::throwAsVTKmException();
694 template <
class KeysPortal,
696 class KeysOutputPortal,
697 class ValueOutputPortal,
700 const ValuesPortal& values,
701 const KeysOutputPortal& keys_output,
702 const ValueOutputPortal& values_output,
703 BinaryFunctor binary_functor)
705 auto keys_out_begin = cuda::internal::IteratorBegin(keys_output);
706 auto values_out_begin = cuda::internal::IteratorBegin(values_output);
708 ::thrust::pair<decltype(keys_out_begin), decltype(values_out_begin)> result_iterators;
710 ::thrust::equal_to<typename KeysPortal::ValueType> binaryPredicate;
712 using ValueType =
typename ValuesPortal::ValueType;
713 vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binary_functor);
717 result_iterators = ::thrust::reduce_by_key(vtkm_cuda_policy(),
718 cuda::internal::IteratorBegin(keys),
719 cuda::internal::IteratorEnd(keys),
720 cuda::internal::IteratorBegin(values),
728 cuda::internal::throwAsVTKmException();
731 return static_cast<vtkm::Id>(::thrust::distance(keys_out_begin, result_iterators.first));
734 template <
class InputPortal,
class OutputPortal>
736 const OutputPortal& output)
738 using ValueType =
typename OutputPortal::ValueType;
740 return ScanExclusivePortal(input,
742 (::thrust::plus<ValueType>()),
746 template <
class InputPortal,
class OutputPortal,
class BinaryFunctor>
748 const InputPortal& input,
749 const OutputPortal& output,
750 BinaryFunctor binaryOp,
751 typename InputPortal::ValueType initialValue)
755 using ValueType =
typename OutputPortal::ValueType;
759 ::thrust::system::cuda::vector<ValueType> sum(3);
767 ThrustCudaPolicyPerThread, cuda::internal::IteratorEnd(input) - 1, 1, sum.begin());
769 vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binaryOp);
771 auto end = ::thrust::exclusive_scan(ThrustCudaPolicyPerThread,
772 cuda::internal::IteratorBegin(input),
773 cuda::internal::IteratorEnd(input),
774 cuda::internal::IteratorBegin(output),
781 ::thrust::copy_n(ThrustCudaPolicyPerThread, (end - 1), 1, sum.begin() + 1);
784 cuda::internal::SumExclusiveScan<<<1, 1, 0, cudaStreamPerThread>>>(
785 sum[0], sum[1], sum[2], bop);
789 cuda::internal::throwAsVTKmException();
794 template <
class InputPortal,
class OutputPortal>
796 const OutputPortal& output)
798 using ValueType =
typename OutputPortal::ValueType;
799 return ScanInclusivePortal(input, output, ::thrust::plus<ValueType>());
802 template <
class InputPortal,
class OutputPortal,
class BinaryFunctor>
804 const OutputPortal& output,
805 BinaryFunctor binary_functor)
807 using ValueType =
typename OutputPortal::ValueType;
808 vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binary_functor);
812 ::thrust::system::cuda::vector<ValueType> result(1);
813 auto end = ::thrust::inclusive_scan(ThrustCudaPolicyPerThread,
814 cuda::internal::IteratorBegin(input),
815 cuda::internal::IteratorEnd(input),
816 cuda::internal::IteratorBegin(output),
819 ::thrust::copy_n(ThrustCudaPolicyPerThread, end - 1, 1, result.begin());
824 cuda::internal::throwAsVTKmException();
825 return typename InputPortal::ValueType();
831 template <
typename KeysPortal,
typename ValuesPortal,
typename OutputPortal>
833 const ValuesPortal& values,
834 const OutputPortal& output)
836 using KeyType =
typename KeysPortal::ValueType;
837 using ValueType =
typename OutputPortal::ValueType;
838 ScanInclusiveByKeyPortal(
839 keys, values, output, ::thrust::equal_to<KeyType>(), ::thrust::plus<ValueType>());
842 template <
typename KeysPortal,
843 typename ValuesPortal,
844 typename OutputPortal,
845 typename BinaryPredicate,
846 typename AssociativeOperator>
848 const ValuesPortal& values,
849 const OutputPortal& output,
850 BinaryPredicate binary_predicate,
851 AssociativeOperator binary_operator)
853 using KeyType =
typename KeysPortal::ValueType;
854 vtkm::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
856 using ValueType =
typename OutputPortal::ValueType;
857 vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
862 ::thrust::inclusive_scan_by_key(ThrustCudaPolicyPerThread,
863 cuda::internal::IteratorBegin(keys),
864 cuda::internal::IteratorEnd(keys),
865 cuda::internal::IteratorBegin(values),
866 cuda::internal::IteratorBegin(output),
872 cuda::internal::throwAsVTKmException();
876 template <
typename KeysPortal,
typename ValuesPortal,
typename OutputPortal>
878 const ValuesPortal& values,
879 const OutputPortal& output)
881 using KeyType =
typename KeysPortal::ValueType;
882 using ValueType =
typename OutputPortal::ValueType;
883 ScanExclusiveByKeyPortal(keys,
887 ::thrust::equal_to<KeyType>(),
888 ::thrust::plus<ValueType>());
891 template <
typename KeysPortal,
892 typename ValuesPortal,
893 typename OutputPortal,
895 typename BinaryPredicate,
896 typename AssociativeOperator>
898 const ValuesPortal& values,
899 const OutputPortal& output,
901 BinaryPredicate binary_predicate,
902 AssociativeOperator binary_operator)
904 using KeyType =
typename KeysPortal::ValueType;
905 vtkm::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
907 using ValueType =
typename OutputPortal::ValueType;
908 vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
912 ::thrust::exclusive_scan_by_key(ThrustCudaPolicyPerThread,
913 cuda::internal::IteratorBegin(keys),
914 cuda::internal::IteratorEnd(keys),
915 cuda::internal::IteratorBegin(values),
916 cuda::internal::IteratorBegin(output),
923 cuda::internal::throwAsVTKmException();
927 template <
class ValuesPortal>
930 using ValueType =
typename ValuesPortal::ValueType;
931 SortPortal(values, ::thrust::less<ValueType>());
934 template <
class ValuesPortal,
class BinaryCompare>
937 using ValueType =
typename ValuesPortal::ValueType;
938 vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
942 ::thrust::sort(vtkm_cuda_policy(),
943 cuda::internal::IteratorBegin(values),
944 cuda::internal::IteratorEnd(values),
949 cuda::internal::throwAsVTKmException();
953 template <
class KeysPortal,
class ValuesPortal>
956 using ValueType =
typename KeysPortal::ValueType;
957 SortByKeyPortal(keys, values, ::thrust::less<ValueType>());
960 template <
class KeysPortal,
class ValuesPortal,
class BinaryCompare>
962 const ValuesPortal& values,
963 BinaryCompare binary_compare)
965 using ValueType =
typename KeysPortal::ValueType;
966 vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
970 ::thrust::sort_by_key(vtkm_cuda_policy(),
971 cuda::internal::IteratorBegin(keys),
972 cuda::internal::IteratorEnd(keys),
973 cuda::internal::IteratorBegin(values),
978 cuda::internal::throwAsVTKmException();
982 template <
class ValuesPortal>
987 auto begin = cuda::internal::IteratorBegin(values);
989 ::thrust::unique(ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values));
990 return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
994 cuda::internal::throwAsVTKmException();
999 template <
class ValuesPortal,
class BinaryCompare>
1002 using ValueType =
typename ValuesPortal::ValueType;
1003 vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
1007 auto begin = cuda::internal::IteratorBegin(values);
1008 auto newLast = ::thrust::unique(
1009 ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values), bop);
1010 return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
1014 cuda::internal::throwAsVTKmException();
1019 template <
class InputPortal,
class ValuesPortal,
class OutputPortal>
1021 const ValuesPortal& values,
1022 const OutputPortal& output)
1026 ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1027 cuda::internal::IteratorBegin(input),
1028 cuda::internal::IteratorEnd(input),
1029 cuda::internal::IteratorBegin(values),
1030 cuda::internal::IteratorEnd(values),
1031 cuda::internal::IteratorBegin(output));
1035 cuda::internal::throwAsVTKmException();
1039 template <
class InputPortal,
class ValuesPortal,
class OutputPortal,
class BinaryCompare>
1041 const ValuesPortal& values,
1042 const OutputPortal& output,
1043 BinaryCompare binary_compare)
1045 using ValueType =
typename OutputPortal::ValueType;
1047 vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
1051 ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1052 cuda::internal::IteratorBegin(input),
1053 cuda::internal::IteratorEnd(input),
1054 cuda::internal::IteratorBegin(values),
1055 cuda::internal::IteratorEnd(values),
1056 cuda::internal::IteratorBegin(output),
1061 cuda::internal::throwAsVTKmException();
1065 template <
class InputPortal,
class OutputPortal>
1067 const OutputPortal& values_output)
1071 ::thrust::upper_bound(ThrustCudaPolicyPerThread,
1072 cuda::internal::IteratorBegin(input),
1073 cuda::internal::IteratorEnd(input),
1074 cuda::internal::IteratorBegin(values_output),
1075 cuda::internal::IteratorEnd(values_output),
1076 cuda::internal::IteratorBegin(values_output));
1080 cuda::internal::throwAsVTKmException();
1084 template <
typename GlobalPopCountType,
typename BitsPortal,
typename IndicesPortal>
1086 const IndicesPortal& indices)
1088 using Functor = BitFieldToUnorderedSetFunctor<BitsPortal, IndicesPortal, GlobalPopCountType>;
1091 auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
1092 Functor functor{ bits, indices, globalCount.get() };
1094 functor.Initialize();
1095 Schedule(functor, bits.GetNumberOfWords());
1097 return functor.Finalize();
1100 template <
typename GlobalPopCountType,
typename BitsPortal>
1103 using Functor = CountSetBitsFunctor<BitsPortal, GlobalPopCountType>;
1106 auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
1107 Functor functor{ bits, globalCount.get() };
1109 functor.Initialize();
1110 Schedule(functor, bits.GetNumberOfWords());
1112 return functor.Finalize();
1118 template <
typename IndicesStorage>
1129 auto bitsPortal = bits.
PrepareForInput(DeviceAdapterTagCuda{}, token);
1130 auto indicesPortal = indices.
PrepareForOutput(numBits, DeviceAdapterTagCuda{}, token);
1133 numBits = BitFieldToUnorderedSetPortal<vtkm::UInt64>(bitsPortal, indicesPortal);
1140 template <
typename T,
typename U,
class SIn,
class SOut>
1157 template <
typename T,
typename U,
class SIn,
class SStencil,
class SOut>
1176 newSize = CopyIfPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
1185 template <
typename T,
typename U,
class SIn,
class SStencil,
class SOut,
class UnaryPredicate>
1189 UnaryPredicate unary_predicate)
1204 newSize = CopyIfPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
1213 template <
typename T,
typename U,
class SIn,
class SOut>
1225 if (input == output &&
1226 ((outputIndex >= inputStartIndex &&
1227 outputIndex < inputStartIndex + numberOfElementsToCopy) ||
1228 (inputStartIndex >= outputIndex &&
1229 inputStartIndex < outputIndex + numberOfElementsToCopy)))
1234 if (inputStartIndex < 0 || numberOfElementsToCopy < 0 || outputIndex < 0 ||
1235 inputStartIndex >= inSize)
1241 if (inSize < (inputStartIndex + numberOfElementsToCopy))
1243 numberOfElementsToCopy = (inSize - inputStartIndex);
1247 const vtkm::Id copyOutEnd = outputIndex + numberOfElementsToCopy;
1248 if (outSize < copyOutEnd)
1264 CopySubRangePortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
1266 numberOfElementsToCopy,
1276 auto bitsPortal = bits.
PrepareForInput(DeviceAdapterTagCuda{}, token);
1278 return CountSetBitsPortal<vtkm::UInt64>(bitsPortal);
1281 template <
typename T,
class SIn,
class SVal,
class SOut>
1290 LowerBoundsPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
1295 template <
typename T,
class SIn,
class SVal,
class SOut,
class BinaryCompare>
1299 BinaryCompare binary_compare)
1305 LowerBoundsPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
1311 template <
class SIn,
class SOut>
1318 LowerBoundsPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
1322 template <
typename T,
typename U,
class SIn>
1328 if (numberOfValues <= 0)
1330 return initialValue;
1333 return ReducePortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token), initialValue);
1336 template <
typename T,
typename U,
class SIn,
class BinaryFunctor>
1339 BinaryFunctor binary_functor)
1344 if (numberOfValues <= 0)
1346 return initialValue;
1349 return ReducePortal(
1350 input.
PrepareForInput(DeviceAdapterTagCuda(), token), initialValue, binary_functor);
1356 template <
typename T,
typename U,
typename... SIns>
1361 return Superclass::Reduce(input, initialValue);
1363 template <
typename T,
typename U,
typename BinaryFunctor,
typename... SIns>
1367 BinaryFunctor binary_functor)
1369 return Superclass::Reduce(input, initialValue, binary_functor);
1372 template <
typename T,
1378 class BinaryFunctor>
1383 BinaryFunctor binary_functor)
1390 if (numberOfValues <= 0)
1398 reduced_size = ReduceByKeyPortal(
1401 keys_output.
PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1402 values_output.
PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
1410 template <
typename T,
class SIn,
class SOut>
1417 if (numberOfValues <= 0)
1428 auto inputPortal = input.
PrepareForInput(DeviceAdapterTagCuda(), token);
1429 return ScanExclusivePortal(
1430 inputPortal, output.
PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1433 template <
typename T,
class SIn,
class SOut,
class BinaryFunctor>
1436 BinaryFunctor binary_functor,
1437 const T& initialValue)
1442 if (numberOfValues <= 0)
1453 auto inputPortal = input.
PrepareForInput(DeviceAdapterTagCuda(), token);
1454 return ScanExclusivePortal(
1461 template <
typename T,
class SIn,
class SOut>
1468 if (numberOfValues <= 0)
1479 auto inputPortal = input.
PrepareForInput(DeviceAdapterTagCuda(), token);
1480 return ScanInclusivePortal(
1481 inputPortal, output.
PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
1484 template <
typename T,
class SIn,
class SOut,
class BinaryFunctor>
1487 BinaryFunctor binary_functor)
1492 if (numberOfValues <= 0)
1503 auto inputPortal = input.
PrepareForInput(DeviceAdapterTagCuda(), token);
1504 return ScanInclusivePortal(
1510 template <
typename T,
typename U,
typename KIn,
typename VIn,
typename VOut>
1518 if (numberOfValues <= 0)
1529 auto keysPortal = keys.
PrepareForInput(DeviceAdapterTagCuda(), token);
1530 auto valuesPortal = values.
PrepareForInput(DeviceAdapterTagCuda(), token);
1531 ScanInclusiveByKeyPortal(
1537 template <
typename T,
1542 typename BinaryFunctor>
1546 BinaryFunctor binary_functor)
1551 if (numberOfValues <= 0)
1562 auto keysPortal = keys.
PrepareForInput(DeviceAdapterTagCuda(), token);
1563 auto valuesPortal = values.
PrepareForInput(DeviceAdapterTagCuda(), token);
1564 ScanInclusiveByKeyPortal(keysPortal,
1567 ::thrust::equal_to<T>(),
1571 template <
typename T,
typename U,
typename KIn,
typename VIn,
typename VOut>
1579 if (numberOfValues <= 0)
1590 auto keysPortal = keys.
PrepareForInput(DeviceAdapterTagCuda(), token);
1591 auto valuesPortal = values.
PrepareForInput(DeviceAdapterTagCuda(), token);
1592 ScanExclusiveByKeyPortal(keysPortal,
1596 ::thrust::equal_to<T>(),
1600 template <
typename T,
1605 typename BinaryFunctor>
1609 const U& initialValue,
1610 BinaryFunctor binary_functor)
1615 if (numberOfValues <= 0)
1626 auto keysPortal = keys.
PrepareForInput(DeviceAdapterTagCuda(), token);
1627 auto valuesPortal = values.
PrepareForInput(DeviceAdapterTagCuda(), token);
1628 ScanExclusiveByKeyPortal(keysPortal,
1632 ::thrust::equal_to<T>(),
1638 struct VTKM_CONT_EXPORT PinnedErrorArray
1640 char* HostPtr =
nullptr;
1641 char* DevicePtr =
nullptr;
1646 static const PinnedErrorArray& GetPinnedErrorArray();
1649 static void CheckForErrors();
1652 static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
1660 static void GetBlocksAndThreads(
vtkm::UInt32& blocks, dim3& threadsPerBlock,
const dim3& size);
1663 static void LogKernelLaunch(
const cudaFuncAttributes& func_attrs,
1664 const std::type_info& worklet_info,
1670 static void LogKernelLaunch(
const cudaFuncAttributes& func_attrs,
1671 const std::type_info& worklet_info,
1673 dim3 threadsPerBlock,
1677 template <
typename WType,
typename IType>
1678 static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>& functor,
1684 if (numInstances < 1)
1691 SetupErrorBuffer(functor);
1694 GetBlocksAndThreads(blocks, threadsPerBlock, numInstances);
1696 #ifdef VTKM_ENABLE_LOGGING
1699 using FunctorType = vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>;
1700 cudaFuncAttributes empty_kernel_attrs;
1702 cuda::internal::TaskStrided1DLaunch<FunctorType>));
1703 LogKernelLaunch(empty_kernel_attrs,
typeid(WType), blocks, threadsPerBlock, numInstances);
1707 cuda::internal::TaskStrided1DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
1708 functor, numInstances);
1711 template <
typename WType,
typename IType>
1712 static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>& functor,
1717 VTKM_ASSERT((rangeMax[0] >= 0) && (rangeMax[1] >= 0) && (rangeMax[2] >= 0));
1718 if ((rangeMax[0] < 1) || (rangeMax[1] < 1) || (rangeMax[2] < 1))
1725 SetupErrorBuffer(functor);
1727 const dim3 ranges(
static_cast<vtkm::UInt32>(rangeMax[0]),
1732 dim3 threadsPerBlock;
1733 GetBlocksAndThreads(blocks, threadsPerBlock, ranges);
1735 #ifdef VTKM_ENABLE_LOGGING
1738 using FunctorType = vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>;
1739 cudaFuncAttributes empty_kernel_attrs;
1741 cuda::internal::TaskStrided3DLaunch<FunctorType>));
1742 LogKernelLaunch(empty_kernel_attrs,
typeid(WType), blocks, threadsPerBlock, ranges);
1746 cuda::internal::TaskStrided3DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
1750 template <
class Functor>
1755 vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType> kernel(functor);
1757 ScheduleTask(kernel, numInstances);
1760 template <
class Functor>
1765 vtkm::exec::cuda::internal::TaskStrided3D<Functor, vtkm::internal::NullType> kernel(functor);
1766 ScheduleTask(kernel, rangeMax);
1769 template <
typename T,
class Storage>
1778 template <
typename T,
class Storage,
class BinaryCompare>
1780 BinaryCompare binary_compare)
1785 SortPortal(values.
PrepareForInPlace(DeviceAdapterTagCuda(), token), binary_compare);
1788 template <
typename T,
typename U,
class StorageT,
class StorageU>
1799 template <
typename T,
typename U,
class StorageT,
class StorageU,
class BinaryCompare>
1802 BinaryCompare binary_compare)
1812 template <
typename T,
class Storage>
1821 newSize = UniquePortal(values.
PrepareForInPlace(DeviceAdapterTagCuda(), token));
1827 template <
typename T,
class Storage,
class BinaryCompare>
1829 BinaryCompare binary_compare)
1837 UniquePortal(values.
PrepareForInPlace(DeviceAdapterTagCuda(), token), binary_compare);
1843 template <
typename T,
class SIn,
class SVal,
class SOut>
1852 UpperBoundsPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
1857 template <
typename T,
class SIn,
class SVal,
class SOut,
class BinaryCompare>
1861 BinaryCompare binary_compare)
1867 UpperBoundsPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
1873 template <
class SIn,
class SOut>
1880 UpperBoundsPortal(input.
PrepareForInput(DeviceAdapterTagCuda(), token),
1897 template <
typename WorkletType,
typename InvocationType>
1898 static vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>
1901 using Task = vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>;
1902 return Task(worklet, invocation);
1905 template <
typename WorkletType,
typename InvocationType>
1906 static vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>
1909 using Task = vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>;
1910 return Task(worklet, invocation);
1916 #endif //vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h