10 #ifndef vtk_m_cont_openmp_internal_FunctorsOpenMP_h
11 #define vtk_m_cont_openmp_internal_FunctorsOpenMP_h
28 #include <type_traits>
// Emits an OpenMP pragma from macro context: VTKM_OPENMP_DIRECTIVE(dir) forwards
// the tokens `omp dir` to VTKM_OPENMP_DIRECTIVE_IMPL, which stringizes them and
// hands the result to _Pragma.
34 #define VTKM_OPENMP_DIRECTIVE_IMPL(fullDir) _Pragma(#fullDir)
35 #define VTKM_OPENMP_DIRECTIVE(dir) VTKM_OPENMP_DIRECTIVE_IMPL(omp dir)
// Alternate definition that expands to nothing. The conditional that selects
// between the two definitions is not visible in this excerpt.
37 #define VTKM_OPENMP_DIRECTIVE(directive)
// VTKM_OPENMP_SHARED_CONST(...) has two definitions: the first expands to nothing,
// the second to an OpenMP `shared(...)` clause listing the arguments.
// The visible guard tests VTKM_GCC together with __GNUC__ < 9; the matching
// else/endif lines are not visible here, so the first definition presumably
// belongs to the guarded branch -- TODO confirm against the full file.
46 #if defined(VTKM_GCC) && (__GNUC__ < 9)
47 #define VTKM_OPENMP_SHARED_CONST(...)
49 #define VTKM_OPENMP_SHARED_CONST(...) shared(__VA_ARGS__)
// Compile-time size constants of type vtkm::Id.
// VTKM_PAGE_SIZE is used by ComputeChunkSize as the number of bytes per page.
// No use of VTKM_CACHE_LINE_SIZE is visible in this excerpt (presumably also a
// byte count -- confirm).
66 constexpr
static vtkm::Id VTKM_CACHE_LINE_SIZE = 64;
67 constexpr
static vtkm::Id VTKM_PAGE_SIZE = 4096;
// Returns (numerator + denominator - 1) / denominator. With integer T and
// positive operands this is the quotient rounded up to the next whole number.
// NOTE(review): the template header that declares T and the function braces are
// on lines not visible in this excerpt.
71 static constexpr T CeilDivide(
const T& numerator,
const T& denominator)
73 return (numerator + denominator - 1) / denominator;
// Derives a chunk count (numChunks) and a per-chunk element count (valuesPerChunk)
// for numVals values, working in units of VTKM_PAGE_SIZE-byte pages.
// Only the first parameter is visible in this excerpt. Judging by the names used
// in the body and by the visible call sites, the remaining parameters are
// presumably (numThreads, chunksPerThread, bytesPerValue, numChunks,
// valuesPerChunk), the last two being outputs -- TODO confirm.
// Every division rounds up, so numChunks * valuesPerChunk can exceed numVals;
// the visible callers clamp their ranges with std::min.
78 static void ComputeChunkSize(
const vtkm::Id numVals,
// Total input size in bytes, and the number of pages it spans (rounded up).
86 const vtkm::Id bytesIn = numVals * bytesPerValue;
87 const vtkm::Id pagesIn = CeilDivide(bytesIn, VTKM_PAGE_SIZE);
// Use numThreads * chunksPerThread chunks only when there are more pages than
// that; otherwise fall back to one chunk per thread.
89 numChunks = (pagesIn > numThreads * chunksPerThread) ? numThreads * chunksPerThread : numThreads;
// Pages per chunk (rounded up), then converted back to a value count (rounded up).
90 const vtkm::Id pagesPerChunk = CeilDivide(pagesIn, numChunks);
91 valuesPerChunk = CeilDivide(pagesPerChunk * VTKM_PAGE_SIZE, bytesPerValue);
// Fragment of a template parameterized on PortalType that exposes a member alias
// `type` equal to PortalType::ValueType. The name of the enclosing declaration is
// on a line not visible in this excerpt (presumably part of the CleanArrayRef
// machinery used by DoCopy -- TODO confirm).
100 template <
typename PortalType>
103 using type =
typename PortalType::ValueType;
// Template header for a declaration whose remaining lines are not visible here.
106 template <
typename T>
// DoCopy overload taken when the dispatch tag is std::true_type.
// Copies the numVals elements starting at src to dst by delegating to std::copy.
109 template <
typename T,
typename U>
110 static void DoCopy(T src, U dst,
vtkm::Id numVals, std::true_type)
114 std::copy(src, src + numVals, dst);
// DoCopy overload taken when the dispatch tag is std::false_type.
// Copies numVals elements one at a time with an explicit conversion per element.
119 template <
typename InIterT,
typename OutIterT>
120 static void DoCopy(InIterT inIter, OutIterT outIter,
vtkm::Id numVals, std::false_type)
// Value types of the two iterators, passed through CleanArrayRef.
122 using InValueType = CleanArrayRef<typename std::iterator_traits<InIterT>::value_type>;
123 using OutValueType = CleanArrayRef<typename std::iterator_traits<OutIterT>::value_type>;
125 for (
vtkm::Id i = 0; i < numVals; ++i)
// Read the element as InValueType first, then cast it to OutValueType before
// storing; both iterators are post-incremented.
131 *(outIter++) =
static_cast<OutValueType
>(
static_cast<InValueType
>(*(inIter++)));
// Three-argument DoCopy entry point. Chooses between the two tagged overloads by
// passing std::is_same<InValueType, OutValueType>() as the tag, so the tag is
// true only when the (CleanArrayRef-adjusted) value types are identical.
135 template <
typename InIterT,
typename OutIterT>
136 static void DoCopy(InIterT inIter, OutIterT outIter,
vtkm::Id numVals)
138 using InValueType = CleanArrayRef<typename std::iterator_traits<InIterT>::value_type>;
139 using OutValueType = CleanArrayRef<typename std::iterator_traits<OutIterT>::value_type>;
141 DoCopy(inIter, outIter, numVals, std::is_same<InValueType, OutValueType>());
// Copies values from inPortal to outPortal chunk by chunk via DoCopy.
// NOTE(review): several lines of this function are not visible in this excerpt
// (remaining parameters, the declarations of numVals, inIter, outIter, numThreads,
// numChunks and valuesPerChunk, and any OpenMP directives around the loop).
145 template <
typename InPortalT,
typename OutPortalT>
146 static void CopyHelper(InPortalT inPortal,
147 OutPortalT outPortal,
152 using InValueT =
typename InPortalT::ValueType;
153 using OutValueT =
typename OutPortalT::ValueType;
// Compile-time tag handed to DoCopy to pick the same-type or converting overload.
154 constexpr
auto isSame = std::is_same<InValueT, OutValueT>();
// Tail of a call that fills numThreads; the object it is called on is not visible.
172 .GetThreads(numThreads);
// Chunk sizing: 8 is passed in the third position (presumably chunksPerThread)
// and the input element size as the per-value byte count.
173 ComputeChunkSize(numVals, numThreads, 8,
sizeof(InValueT), numChunks, valuesPerChunk);
// Walk the input in steps of valuesPerChunk; the final chunk is clamped so it
// never runs past numVals.
177 for (
vtkm::
Id i = 0; i < numVals; i += valuesPerChunk)
179 vtkm::Id chunkSize = std::min(numVals - i, valuesPerChunk);
180 DoCopy(inIter + i, outIter + i, chunkSize, isSame);
// Fragments of a class whose declaration and method signatures are on lines not
// visible in this excerpt. The visible statements form three steps that share
// this->EndIds: (1) sizing, (2) a per-chunk predicate-filtered copy, and (3) a
// final pass that packs the per-chunk outputs together.
//
// Step 1: record the sizes, query the thread count, compute the chunking with
// ComputeChunkSize (8 in the third position) and give EndIds one slot per chunk.
199 this->NumValues = numValues;
202 .GetThreads(this->NumThreads);
203 this->ValueSize = valueSize;
208 this->NumValues, this->NumThreads, 8, valueSize, this->NumChunks, this->ChunkSize);
210 this->EndIds.resize(
static_cast<std::size_t
>(this->NumChunks));
// Step 2: process one chunk (index `chunk`; the signature is not visible).
213 template <
typename InIterT,
typename StencilIterT,
typename OutIterT,
typename PredicateT>
215 StencilIterT stencilIter,
// Half-open input range of this chunk, clamped to NumValues.
220 vtkm::Id startPos = std::min(chunk * this->ChunkSize, this->NumValues);
221 vtkm::Id endPos = std::min((chunk + 1) * this->ChunkSize, this->NumValues);
// Keep only the elements whose stencil entry satisfies pred. outPos is declared
// on a line not visible here (presumably initialized to startPos -- confirm).
224 for (
vtkm::Id inPos = startPos; inPos < endPos; ++inPos)
226 if (pred(stencilIter[inPos]))
228 outIter[outPos++] = inIter[inPos];
// Remember where this chunk's output stopped.
232 this->EndIds[
static_cast<std::size_t
>(chunk)] = outPos;
// Step 3: pack the per-chunk outputs into one contiguous run inside `data`.
235 template <
typename OutIterT>
// The first chunk's output is already in place, so packing starts at its end.
238 vtkm::Id endPos = this->EndIds.front();
// For chunk i (the loop header is not visible): its output occupies
// [chunkStart, chunkEnd).
241 vtkm::Id chunkStart = std::min(i * this->ChunkSize, this->NumValues);
242 vtkm::Id chunkEnd = this->EndIds[
static_cast<std::size_t
>(i)];
243 vtkm::Id numValuesToCopy = chunkEnd - chunkStart;
// Move the chunk's output down to endPos unless it is empty or already there.
244 if (numValuesToCopy > 0 && chunkStart != endPos)
246 std::copy(data + chunkStart, data + chunkEnd, data + endPos);
248 endPos += numValuesToCopy;
254 #ifdef VTKM_OPENMP_USE_NATIVE_REDUCTION
257 template <
typename T>
302 template <
typename T>
304 #endif // VTKM_OPENMP_USE_NATIVE_REDUCTION
// Three template declarations; only the base class of the last one is visible.
// The names and bodies are on lines not visible in this excerpt.
309 template <
typename T>
314 template <
typename T, vtkm::IdComponent Size>
// Two-type trait: derives from std::integral_constant<bool, ...> whose value is
// true only when both T and U are integral types.
319 template <
typename T,
typename U>
321 :
public std::integral_constant<bool, std::is_integral<T>::value && std::is_integral<U>::value>
// Reduces the values of `portal` with a user functor, starting from `init`.
// This is the overload selected by a std::false_type tag.
// NOTE(review): many lines are not visible in this excerpt (the declarations of
// `data` and `numThreads`, the OpenMP directives, braces, the return statement).
326 template <
typename PortalT,
typename ReturnType,
typename Functor>
327 static ReturnType
Execute(PortalT portal, ReturnType init, Functor functorIn, std::false_type)
// All combining below goes through this wrapper around the user functor.
329 internal::WrappedBinaryOperator<ReturnType, Functor> f(functorIn);
331 const vtkm::Id numVals = portal.GetNumberOfValues();
// Parallel execution is opt-in; the assignment that enables it is not visible.
334 bool doParallel =
false;
339 .GetThreads(numThreads);
// Storage for one partial result per thread, allocated only on the parallel path.
341 std::unique_ptr<ReturnType[]> threadData;
346 int tid = omp_get_thread_num();
// Threshold for the parallel path: at least two values per thread.
350 if (numVals >= numThreads * 2)
353 threadData.reset(
new ReturnType[
static_cast<std::size_t
>(numThreads)]);
// Each thread computes a partial result and stores it in its own slot.
360 const ReturnType localResult = ReduceHelper::DoParallelReduction<ReturnType>(
363 threadData[
static_cast<std::size_t
>(tid)] = localResult;
// Fold the per-thread partial results into init, in thread-index order.
370 for (
size_t i = 0; i < static_cast<size_t>(numThreads); ++i)
372 init = f(init, threadData[i]);
// Serial fold of every value into init. The condition guarding this loop is not
// visible (presumably taken when doParallel is false -- confirm).
378 for (
vtkm::Id i = 0; i < numVals; ++i)
380 init = f(init, data[i]);
// Per-thread reduction body with a main loop unrolled four values at a time.
// NOTE(review): the function name and most parameters are on lines not visible
// in this excerpt (presumably one of the ReduceHelper::DoParallelReduction
// overloads called from Execute -- confirm).
389 template <
typename ReturnType,
typename IterType,
typename FunctorType>
393 const int& numThreads,
// Seed this thread's accumulator from its own pair of leading elements, so the
// first 2 * numThreads values are consumed as seeds.
398 ReturnType accum = f(data[2 * tid], data[2 * tid + 1]);
// offset is the first index after the seeds. unrollEnd is chosen so that
// (unrollEnd - offset) is a multiple of 4.
400 const vtkm::Id offset = numThreads * 2;
401 const vtkm::Id end = std::max(((numVals / 4) * 4) - 4, offset);
402 const vtkm::Id unrollEnd = end - ((end - offset) % 4);
// -Wsign-conversion is silenced around the loop header only. Any OpenMP
// directive attached to the loop is on a line not visible here.
407 #pragma GCC diagnostic push
408 #pragma GCC diagnostic ignored "-Wsign-conversion"
410 for (i = offset; i < unrollEnd; i += 4)
411 #pragma GCC diagnostic pop
// Combine four values as two pairs, then fold both pair results into accum.
413 const auto t1 = f(data[i], data[i + 1]);
414 const auto t2 = f(data[i + 2], data[i + 3]);
415 accum = f(accum, t1);
416 accum = f(accum, t2);
// Only the last thread folds in the leftover tail [unrollEnd, numVals).
421 if (tid == numThreads - 1)
423 for (i = unrollEnd; i < numVals; ++i)
425 accum = f(accum, data[i]);
// Per-thread reduction body without unrolling.
// NOTE(review): the function name and most parameters are on lines not visible
// in this excerpt (presumably another ReduceHelper::DoParallelReduction overload
// -- confirm).
434 template <
typename ReturnType,
typename IterType,
typename FunctorType>
438 const int& numThreads,
// Seed this thread's accumulator from its own pair of leading elements.
443 ReturnType accum = f(data[2 * tid], data[2 * tid + 1]);
// -Wsign-conversion is silenced around the loop header only. Any OpenMP
// directive attached to the loop is on a line not visible here.
446 #pragma GCC diagnostic push
447 #pragma GCC diagnostic ignored "-Wsign-conversion"
// Fold in the values that follow the 2 * numThreads seed elements.
449 for (
vtkm::Id i = numThreads * 2; i < numVals; i++)
450 #pragma GCC diagnostic pop
452 accum = f(accum, data[i]);
// Native-reduction specializations, compiled only when
// VTKM_OPENMP_USE_NATIVE_REDUCTION is defined.
// VTKM_OPENMP_SPECIALIZE_REDUCE1(FunctorType, PragmaString) generates an Execute
// overload tagged std::true_type for the given functor type; its loop over the
// portal is preceded by _Pragma(#PragmaString) and folds every portal value into
// `value` through a WrappedBinaryOperator.
// VTKM_OPENMP_SPECIALIZE_REDUCE(FunctorType, Operator) supplies the pragma text:
// an `omp parallel for` with a reduction clause on `value` using Operator.
// NOTE(review): some lines of the macro body are not visible in this excerpt.
458 #ifdef VTKM_OPENMP_USE_NATIVE_REDUCTION
461 #define VTKM_OPENMP_SPECIALIZE_REDUCE1(FunctorType, PragmaString) \
462 template <typename PortalT, typename ReturnType> \
463 static ReturnType Execute( \
464 PortalT portal, ReturnType value, FunctorType functorIn, std::true_type) \
466 const vtkm::Id numValues = portal.GetNumberOfValues(); \
467 internal::WrappedBinaryOperator<ReturnType, FunctorType> f(functorIn); \
468 _Pragma(#PragmaString) for (vtkm::Id i = 0; i < numValues; ++i) \
470 value = f(value, portal.Get(i)); \
477 #define VTKM_OPENMP_SPECIALIZE_REDUCE(FunctorType, Operator) \
478 VTKM_OPENMP_SPECIALIZE_REDUCE1(FunctorType, "omp parallel for reduction(" #Operator ":value)")
// Specializations for vtkm::Add and vtkm::Sum, both mapped to the + reduction.
// Further instantiations may exist on lines not visible here.
481 VTKM_OPENMP_SPECIALIZE_REDUCE(
vtkm::Add, +)
482 VTKM_OPENMP_SPECIALIZE_REDUCE(
vtkm::
Sum, +)
// The helper macros are undefined again once the specializations are generated.
503 #undef VTKM_OPENMP_SPECIALIZE_REDUCE
504 #undef VTKM_OPENMP_SPECIALIZE_REDUCE1
506 #endif // VTKM_OPENMP_USE_NATIVE_REDUCTION
// Reduce-by-key: collapses each run of equal adjacent keys into a single output
// key whose value is the run's values combined with `functor`.
// NOTE(review): the line carrying the function name and first parameter, the
// declarations of keysIn/valuesIn/keysOut/valuesOut/outIdx/rangeKey, the OpenMP
// directives and the final resize/return are not visible in this excerpt.
509 template <
typename KeysInArray,
510 typename ValuesInArray,
511 typename KeysOutArray,
512 typename ValuesOutArray,
513 typename BinaryFunctor>
515 ValuesInArray valuesInArray,
516 KeysOutArray keysOutArray,
517 ValuesOutArray valuesOutArray,
518 BinaryFunctor functor)
520 using KeyType =
typename KeysInArray::ValueType;
521 using ValueType =
typename ValuesInArray::ValueType;
// Input portals for the OpenMP device.
525 const vtkm::Id numValues = keysInArray.GetNumberOfValues();
526 auto keysInPortal = keysInArray.PrepareForInput(DeviceAdapterTagOpenMP(), token);
527 auto valuesInPortal = valuesInArray.PrepareForInput(DeviceAdapterTagOpenMP(), token);
// Outputs are requested with numValues entries, the size needed if no two
// adjacent keys match.
531 auto keysOutPortal = keysOutArray.PrepareForOutput(numValues, DeviceAdapterTagOpenMP(), token);
532 auto valuesOutPortal =
533 valuesOutArray.PrepareForOutput(numValues, DeviceAdapterTagOpenMP(), token);
// All value combining goes through this wrapper around the user functor.
537 internal::WrappedBinaryOperator<ValueType, BinaryFunctor> f(functor);
543 .GetThreads(numThreads);
548 int tid = omp_get_thread_num();
// Each thread scans one contiguous slice [scanIdx, scanEnd) of the input; the
// slice length is numValues / numThreads rounded up, clamped to numValues.
551 vtkm::Id chunkSize = (numValues + numThreads - 1) / numThreads;
552 vtkm::Id scanIdx = std::min(tid * chunkSize, numValues);
553 vtkm::Id scanEnd = std::min(scanIdx + chunkSize, numValues);
// The thread writes its reduced pairs into the output starting at the same
// offset as its input slice; threadKey/threadValue are the write cursors.
555 auto threadKeysBegin = keysOut + scanIdx;
556 auto threadValuesBegin = valuesOut + scanIdx;
557 auto threadKey = threadKeysBegin;
558 auto threadValue = threadValuesBegin;
562 ValueType rangeValue;
// Open a run with the first key/value of the slice.
565 if (scanIdx < scanEnd)
567 rangeKey = keysIn[scanIdx];
568 rangeValue = valuesIn[scanIdx];
// Extend the run while the key repeats, folding the values together. The
// statements that advance scanIdx are not visible here.
572 while (scanIdx < scanEnd &&
static_cast<KeyType
>(keysIn[scanIdx]) == rangeKey)
574 rangeValue = f(rangeValue, valuesIn[scanIdx]);
// Emit the finished run at the write cursors.
578 *threadKey = rangeKey;
579 *threadValue = rangeValue;
// outIdx starts as the number of pairs one thread produced (presumably the
// first thread; the guarding condition is not visible -- confirm).
591 outIdx =
static_cast<vtkm::Id>(threadKey - threadKeysBegin);
// Append the remaining threads' outputs one thread index at a time. How the
// iterations are matched to threads is on lines not visible here.
596 for (
int i = 1; i < numThreads; ++i)
// If this thread's first key equals the last key already in the packed output,
// merge its value into that entry instead of emitting a duplicate key.
608 if (outIdx > 0 && threadKeysBegin < threadKey && keysOut[outIdx - 1] == *threadKeysBegin)
610 valuesOut[outIdx - 1] = f(valuesOut[outIdx - 1], *threadValuesBegin);
// Move whatever this thread still has to outIdx, unless it is empty or already
// sits there, then advance outIdx past it.
616 if (threadKeysBegin < threadKey && threadKeysBegin != keysOut + outIdx)
618 std::copy(threadKeysBegin, threadKey, keysOut + outIdx);
619 std::copy(threadValuesBegin, threadValue, valuesOut + outIdx);
622 outIdx +=
static_cast<vtkm::Id>(threadKey - threadKeysBegin);
// Fragments of a class template parameterized on an iterator type and a raw
// predicate type. The class name, its members and most method signatures are on
// lines not visible in this excerpt.
634 template <
typename IterT,
typename RawPredicateT>
// Element type of the iterator, and the predicate wrapped so that it is used
// through WrappedBinaryOperator<bool, ...>.
637 using ValueType =
typename std::iterator_traits<IterT>::value_type;
638 using PredicateT = internal::WrappedBinaryOperator<bool, RawPredicateT>;
696 .GetThreads(numThreads);
// Tail of a ComputeChunkSize call: the per-chunk value count is stored in
// LeafSize and the chunk count in numChunks.
701 this->NumValues, numThreads, chunksPerThread,
sizeof(
ValueType), numChunks, this->LeafSize);
// Node count: numChunks plus each successive halving (rounded up) until a
// single node remains -- presumably the size of a binary tree built over the
// chunks; confirm against the full class.
704 std::size_t numNodes =
static_cast<std::size_t
>(numChunks);
705 while (numChunks > 1)
707 numChunks = (numChunks + 1) / 2;
708 numNodes +=
static_cast<std::size_t
>(numChunks);
710 this->Nodes.resize(numNodes);
// -Wunused-value is silenced around statements that are not visible here.
719 #pragma GCC diagnostic push
720 #pragma GCC diagnostic ignored "-Wunused-value"
728 #pragma GCC diagnostic pop
// Hands out the address of the node at nodeIdx; nodeIdx is computed on lines
// not visible here.
732 return &this->Nodes[nodeIdx];
// Split point of a range: half the range length, rounded up to a multiple of
// np, offset by the start of the range. np is defined on a line not visible.
741 const vtkm::Id n = range[1] - range[0];
744 return CeilDivide(n / 2, np) * np + range[0];
// Local copy of the this pointer; the code that uses it is not visible here.
761 auto explicitThis =
this;
// Remove adjacent duplicates (as judged by Predicate) within the input range
// of a node. `end` is declared on a line not visible here.
786 auto start = this->Data + node->
InputRange[0];
788 end = std::unique(start, end, this->Predicate);
798 #endif // vtk_m_cont_openmp_internal_FunctorsOpenMP_h