10 #ifndef vtk_m_cont_openmp_internal_FunctorsOpenMP_h 
   11 #define vtk_m_cont_openmp_internal_FunctorsOpenMP_h 
   28 #include <type_traits> 
   34 #define VTKM_OPENMP_DIRECTIVE_IMPL(fullDir) _Pragma(#fullDir) 
   35 #define VTKM_OPENMP_DIRECTIVE(dir) VTKM_OPENMP_DIRECTIVE_IMPL(omp dir) 
   37 #define VTKM_OPENMP_DIRECTIVE(directive) 
   46 #if defined(VTKM_GCC) && (__GNUC__ < 9) 
   47 #define VTKM_OPENMP_SHARED_CONST(...) 
   49 #define VTKM_OPENMP_SHARED_CONST(...) shared(__VA_ARGS__) 
   66 constexpr 
static vtkm::Id VTKM_CACHE_LINE_SIZE = 64;
 
   67 constexpr 
static vtkm::Id VTKM_PAGE_SIZE = 4096;
 
   71 static constexpr T CeilDivide(
const T& numerator, 
const T& denominator)
 
   73   return (numerator + denominator - 1) / denominator;
 
   78 static void ComputeChunkSize(
const vtkm::Id numVals,
 
   86   const vtkm::Id bytesIn = numVals * bytesPerValue;
 
   87   const vtkm::Id pagesIn = CeilDivide(bytesIn, VTKM_PAGE_SIZE);
 
   89   numChunks = (pagesIn > numThreads * chunksPerThread) ? numThreads * chunksPerThread : numThreads;
 
   90   const vtkm::Id pagesPerChunk = CeilDivide(pagesIn, numChunks);
 
   91   valuesPerChunk = CeilDivide(pagesPerChunk * VTKM_PAGE_SIZE, bytesPerValue);
 
  100 template <
typename PortalType>
 
  103   using type = 
typename PortalType::ValueType;
 
  106 template <
typename T>
 
  109 template <
typename T, 
typename U>
 
  110 static void DoCopy(T src, U dst, 
vtkm::Id numVals, std::true_type)
 
  114     std::copy(src, src + numVals, dst);
 
  119 template <
typename InIterT, 
typename OutIterT>
 
  120 static void DoCopy(InIterT inIter, OutIterT outIter, 
vtkm::Id numVals, std::false_type)
 
  122   using InValueType = CleanArrayRef<typename std::iterator_traits<InIterT>::value_type>;
 
  123   using OutValueType = CleanArrayRef<typename std::iterator_traits<OutIterT>::value_type>;
 
  125   for (
vtkm::Id i = 0; i < numVals; ++i)
 
  131     *(outIter++) = 
static_cast<OutValueType
>(
static_cast<InValueType
>(*(inIter++)));
 
  135 template <
typename InIterT, 
typename OutIterT>
 
  136 static void DoCopy(InIterT inIter, OutIterT outIter, 
vtkm::Id numVals)
 
  138   using InValueType = CleanArrayRef<typename std::iterator_traits<InIterT>::value_type>;
 
  139   using OutValueType = CleanArrayRef<typename std::iterator_traits<OutIterT>::value_type>;
 
  141   DoCopy(inIter, outIter, numVals, std::is_same<InValueType, OutValueType>());
 
  145 template <
typename InPortalT, 
typename OutPortalT>
 
  146 static void CopyHelper(InPortalT inPortal,
 
  147                        OutPortalT outPortal,
 
  152   using InValueT = 
typename InPortalT::ValueType;
 
  153   using OutValueT = 
typename OutPortalT::ValueType;
 
  154   constexpr 
auto isSame = std::is_same<InValueT, OutValueT>();
 
  172         .GetThreads(numThreads);
 
  173       ComputeChunkSize(numVals, numThreads, 8, 
sizeof(InValueT), numChunks, valuesPerChunk);
 
  177     for (
vtkm::
Id i = 0; i < numVals; i += valuesPerChunk)
 
  179       vtkm::Id chunkSize = std::min(numVals - i, valuesPerChunk);
 
  180       DoCopy(inIter + i, outIter + i, chunkSize, isSame);
 
  199     this->NumValues = numValues;
 
  202       .GetThreads(this->NumThreads);
 
  203     this->ValueSize = valueSize;
 
  208       this->NumValues, this->NumThreads, 8, valueSize, this->NumChunks, this->ChunkSize);
 
  210     this->EndIds.resize(
static_cast<std::size_t
>(this->NumChunks));
 
  213   template <
typename InIterT, 
typename StencilIterT, 
typename OutIterT, 
typename PredicateT>
 
  215               StencilIterT stencilIter,
 
  220     vtkm::Id startPos = std::min(chunk * this->ChunkSize, this->NumValues);
 
  221     vtkm::Id endPos = std::min((chunk + 1) * this->ChunkSize, this->NumValues);
 
  224     for (
vtkm::Id inPos = startPos; inPos < endPos; ++inPos)
 
  226       if (pred(stencilIter[inPos]))
 
  228         outIter[outPos++] = inIter[inPos];
 
  232     this->EndIds[
static_cast<std::size_t
>(chunk)] = outPos;
 
  235   template <
typename OutIterT>
 
  238     vtkm::Id endPos = this->EndIds.front();
 
  241       vtkm::Id chunkStart = std::min(i * this->ChunkSize, this->NumValues);
 
  242       vtkm::Id chunkEnd = this->EndIds[
static_cast<std::size_t
>(i)];
 
  243       vtkm::Id numValuesToCopy = chunkEnd - chunkStart;
 
  244       if (numValuesToCopy > 0 && chunkStart != endPos)
 
  246         std::copy(data + chunkStart, data + chunkEnd, data + endPos);
 
  248       endPos += numValuesToCopy;
 
  254 #ifdef VTKM_OPENMP_USE_NATIVE_REDUCTION 
  257 template <
typename T>
 
  302 template <
typename T>
 
  304 #endif // VTKM_OPENMP_USE_NATIVE_REDUCTION 
  309   template <
typename T>
 
  314   template <
typename T, vtkm::IdComponent Size>
 
  319   template <
typename T, 
typename U>
 
  321     : 
public std::integral_constant<bool, std::is_integral<T>::value && std::is_integral<U>::value>
 
  326   template <
typename PortalT, 
typename ReturnType, 
typename Functor>
 
  327   static ReturnType 
Execute(PortalT portal, ReturnType init, Functor functorIn, std::false_type)
 
  329     internal::WrappedBinaryOperator<ReturnType, Functor> f(functorIn);
 
  331     const vtkm::Id numVals = portal.GetNumberOfValues();
 
  334     bool doParallel = 
false;
 
  339       .GetThreads(numThreads);
 
  341     std::unique_ptr<ReturnType[]> threadData;
 
  346       int tid = omp_get_thread_num();
 
  350         if (numVals >= numThreads * 2)
 
  353           threadData.reset(
new ReturnType[
static_cast<std::size_t
>(numThreads)]);
 
  360         const ReturnType localResult = ReduceHelper::DoParallelReduction<ReturnType>(
 
  363         threadData[
static_cast<std::size_t
>(tid)] = localResult;
 
  370       for (
size_t i = 0; i < static_cast<size_t>(numThreads); ++i)
 
  372         init = f(init, threadData[i]);
 
  378       for (
vtkm::Id i = 0; i < numVals; ++i)
 
  380         init = f(init, data[i]);
 
  389   template <
typename ReturnType, 
typename IterType, 
typename FunctorType>
 
  393                                         const int& numThreads,
 
  398     ReturnType accum = f(data[2 * tid], data[2 * tid + 1]);
 
  400     const vtkm::Id offset = numThreads * 2;
 
  401     const vtkm::Id end = std::max(((numVals / 4) * 4) - 4, offset);
 
  402     const vtkm::Id unrollEnd = end - ((end - offset) % 4);
 
  407 #pragma GCC diagnostic push 
  408 #pragma GCC diagnostic ignored "-Wsign-conversion" 
  410     for (i = offset; i < unrollEnd; i += 4)
 
  411 #pragma GCC diagnostic pop
 
  413       const auto t1 = f(data[i], data[i + 1]);
 
  414       const auto t2 = f(data[i + 2], data[i + 3]);
 
  415       accum = f(accum, t1);
 
  416       accum = f(accum, t2);
 
  421     if (tid == numThreads - 1)
 
  423       for (i = unrollEnd; i < numVals; ++i)
 
  425         accum = f(accum, data[i]);
 
  434   template <
typename ReturnType, 
typename IterType, 
typename FunctorType>
 
  438                                         const int& numThreads,
 
  443     ReturnType accum = f(data[2 * tid], data[2 * tid + 1]);
 
  446 #pragma GCC diagnostic push 
  447 #pragma GCC diagnostic ignored "-Wsign-conversion" 
  449     for (
vtkm::Id i = numThreads * 2; i < numVals; i++)
 
  450 #pragma GCC diagnostic pop
 
  452       accum = f(accum, data[i]);
 
  458 #ifdef VTKM_OPENMP_USE_NATIVE_REDUCTION 
  461 #define VTKM_OPENMP_SPECIALIZE_REDUCE1(FunctorType, PragmaString)            \ 
  462   template <typename PortalT, typename ReturnType>                           \ 
  463   static ReturnType Execute(                                                 \ 
  464     PortalT portal, ReturnType value, FunctorType functorIn, std::true_type) \ 
  466     const vtkm::Id numValues = portal.GetNumberOfValues();                   \ 
  467     internal::WrappedBinaryOperator<ReturnType, FunctorType> f(functorIn);   \ 
  468     _Pragma(#PragmaString) for (vtkm::Id i = 0; i < numValues; ++i)          \ 
  470       value = f(value, portal.Get(i));                                       \ 
  477 #define VTKM_OPENMP_SPECIALIZE_REDUCE(FunctorType, Operator) \ 
  478   VTKM_OPENMP_SPECIALIZE_REDUCE1(FunctorType, "omp parallel for reduction(" #Operator ":value)") 
  481   VTKM_OPENMP_SPECIALIZE_REDUCE(
vtkm::Add, +)
 
  482   VTKM_OPENMP_SPECIALIZE_REDUCE(
vtkm::
Sum, +)
 
  503 #undef VTKM_OPENMP_SPECIALIZE_REDUCE 
  504 #undef VTKM_OPENMP_SPECIALIZE_REDUCE1 
  506 #endif // VTKM_OPENMP_USE_NATIVE_REDUCTION 
  509 template <
typename KeysInArray,
 
  510           typename ValuesInArray,
 
  511           typename KeysOutArray,
 
  512           typename ValuesOutArray,
 
  513           typename BinaryFunctor>
 
  515                        ValuesInArray valuesInArray,
 
  516                        KeysOutArray keysOutArray,
 
  517                        ValuesOutArray valuesOutArray,
 
  518                        BinaryFunctor functor)
 
  520   using KeyType = 
typename KeysInArray::ValueType;
 
  521   using ValueType = 
typename ValuesInArray::ValueType;
 
  525   const vtkm::Id numValues = keysInArray.GetNumberOfValues();
 
  526   auto keysInPortal = keysInArray.PrepareForInput(DeviceAdapterTagOpenMP(), token);
 
  527   auto valuesInPortal = valuesInArray.PrepareForInput(DeviceAdapterTagOpenMP(), token);
 
  531   auto keysOutPortal = keysOutArray.PrepareForOutput(numValues, DeviceAdapterTagOpenMP(), token);
 
  532   auto valuesOutPortal =
 
  533     valuesOutArray.PrepareForOutput(numValues, DeviceAdapterTagOpenMP(), token);
 
  537   internal::WrappedBinaryOperator<ValueType, BinaryFunctor> f(functor);
 
  543     .GetThreads(numThreads);
 
  548     int tid = omp_get_thread_num();
 
  551     vtkm::Id chunkSize = (numValues + numThreads - 1) / numThreads;
 
  552     vtkm::Id scanIdx = std::min(tid * chunkSize, numValues);
 
  553     vtkm::Id scanEnd = std::min(scanIdx + chunkSize, numValues);
 
  555     auto threadKeysBegin = keysOut + scanIdx;
 
  556     auto threadValuesBegin = valuesOut + scanIdx;
 
  557     auto threadKey = threadKeysBegin;
 
  558     auto threadValue = threadValuesBegin;
 
  562     ValueType rangeValue;
 
  565       if (scanIdx < scanEnd)
 
  567         rangeKey = keysIn[scanIdx];
 
  568         rangeValue = valuesIn[scanIdx];
 
  572         while (scanIdx < scanEnd && 
static_cast<KeyType
>(keysIn[scanIdx]) == rangeKey)
 
  574           rangeValue = f(rangeValue, valuesIn[scanIdx]);
 
  578         *threadKey = rangeKey;
 
  579         *threadValue = rangeValue;
 
  591       outIdx = 
static_cast<vtkm::Id>(threadKey - threadKeysBegin);
 
  596     for (
int i = 1; i < numThreads; ++i)
 
  608         if (outIdx > 0 && threadKeysBegin < threadKey && keysOut[outIdx - 1] == *threadKeysBegin)
 
  610           valuesOut[outIdx - 1] = f(valuesOut[outIdx - 1], *threadValuesBegin);
 
  616         if (threadKeysBegin < threadKey && threadKeysBegin != keysOut + outIdx)
 
  618           std::copy(threadKeysBegin, threadKey, keysOut + outIdx);
 
  619           std::copy(threadValuesBegin, threadValue, valuesOut + outIdx);
 
  622         outIdx += 
static_cast<vtkm::Id>(threadKey - threadKeysBegin);
 
  634 template <
typename IterT, 
typename RawPredicateT>
 
  637   using ValueType = 
typename std::iterator_traits<IterT>::value_type;
 
  638   using PredicateT = internal::WrappedBinaryOperator<bool, RawPredicateT>;
 
  696       .GetThreads(numThreads);
 
  701       this->NumValues, numThreads, chunksPerThread, 
sizeof(
ValueType), numChunks, this->LeafSize);
 
  704     std::size_t numNodes = 
static_cast<std::size_t
>(numChunks);
 
  705     while (numChunks > 1)
 
  707       numChunks = (numChunks + 1) / 2;
 
  708       numNodes += 
static_cast<std::size_t
>(numChunks);
 
  710     this->Nodes.resize(numNodes);
 
  719 #pragma GCC diagnostic push 
  720 #pragma GCC diagnostic ignored "-Wunused-value" 
  728 #pragma GCC diagnostic pop 
  732     return &this->Nodes[nodeIdx];
 
  741     const vtkm::Id n = range[1] - range[0];
 
  744     return CeilDivide(n / 2, np) * np + range[0];
 
  761       auto explicitThis = 
this;
 
  786       auto start = this->Data + node->
InputRange[0];
 
  788       end = std::unique(start, end, this->Predicate);
 
  798 #endif // vtk_m_cont_openmp_internal_FunctorsOpenMP_h