VTK-m  2.2
ThrustPatches.h
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
//
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//============================================================================
#ifndef vtk_m_exec_cuda_internal_ThrustPatches_h
#define vtk_m_exec_cuda_internal_ThrustPatches_h

#include <vtkm/Types.h>

#ifdef VTKM_ENABLE_CUDA

// Needed so we can conditionally include components
#include <thrust/version.h>

#if THRUST_VERSION >= 100900 && THRUST_VERSION < 100906
// For Thrust 1.9.0+ (CUDA 9.X+) the aligned_reinterpret_cast has a bug
// where it is not marked as __host__ __device__. To fix this we add new
// overloads for void* (which is what everyone calls) with the correct markup.
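// Illustrative sketch (hypothetical caller, not part of the patch itself): with
// the annotated overloads below in place, a cast like the following compiles in
// both host and device code paths:
//
//   __host__ __device__ inline vtkm::Float32* CastExample(void* raw)
//   {
//     return thrust::detail::aligned_reinterpret_cast<vtkm::Float32*>(raw);
//   }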
namespace thrust
{
namespace detail
{
// Just in case somebody also has this fix for the primitive types.
template <typename T, typename U>
T aligned_reinterpret_cast(U u);

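// Each macro below stamps out explicit specializations of
// aligned_reinterpret_cast for one value type, annotated with
// __host__ __device__. The trailing 'struct SwallowSemicolon' declaration has
// no purpose other than letting each macro invocation further down end with a
// semicolon.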
#define ALIGN_RE_T(RT) \
  template <> \
  inline __host__ __device__ RT* aligned_reinterpret_cast(void* u) \
  { \
    return reinterpret_cast<RT*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ RT* aligned_reinterpret_cast(vtkm::UInt8* u) \
  { \
    return reinterpret_cast<RT*>(reinterpret_cast<void*>(u)); \
  } \
  struct SwallowSemicolon

#define ALIGN_RE_VEC(RT) \
  template <> \
  inline __host__ __device__ vtkm::Vec<RT, 2>* aligned_reinterpret_cast(void* u) \
  { \
    return reinterpret_cast<vtkm::Vec<RT, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ vtkm::Vec<RT, 3>* aligned_reinterpret_cast(void* u) \
  { \
    return reinterpret_cast<vtkm::Vec<RT, 3>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ vtkm::Vec<RT, 4>* aligned_reinterpret_cast(void* u) \
  { \
    return reinterpret_cast<vtkm::Vec<RT, 4>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ vtkm::Vec<vtkm::Vec<RT, 3>, 2>* aligned_reinterpret_cast(void* u) \
  { \
    return reinterpret_cast<vtkm::Vec<vtkm::Vec<RT, 3>, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ vtkm::Vec<vtkm::Vec<RT, 9>, 2>* aligned_reinterpret_cast(void* u) \
  { \
    return reinterpret_cast<vtkm::Vec<vtkm::Vec<RT, 9>, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ vtkm::Vec<RT, 2>* aligned_reinterpret_cast(vtkm::UInt8* u) \
  { \
    return reinterpret_cast<vtkm::Vec<RT, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ vtkm::Vec<RT, 3>* aligned_reinterpret_cast(vtkm::UInt8* u) \
  { \
    return reinterpret_cast<vtkm::Vec<RT, 3>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ vtkm::Vec<RT, 4>* aligned_reinterpret_cast(vtkm::UInt8* u) \
  { \
    return reinterpret_cast<vtkm::Vec<RT, 4>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ vtkm::Vec<vtkm::Vec<RT, 2>, 2>* aligned_reinterpret_cast( \
    vtkm::UInt8* u) \
  { \
    return reinterpret_cast<vtkm::Vec<vtkm::Vec<RT, 2>, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ vtkm::Vec<vtkm::Vec<RT, 3>, 2>* aligned_reinterpret_cast( \
    vtkm::UInt8* u) \
  { \
    return reinterpret_cast<vtkm::Vec<vtkm::Vec<RT, 3>, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ vtkm::Vec<vtkm::Vec<RT, 4>, 2>* aligned_reinterpret_cast( \
    vtkm::UInt8* u) \
  { \
    return reinterpret_cast<vtkm::Vec<vtkm::Vec<RT, 4>, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ vtkm::Vec<vtkm::Vec<RT, 9>, 2>* aligned_reinterpret_cast( \
    vtkm::UInt8* u) \
  { \
    return reinterpret_cast<vtkm::Vec<vtkm::Vec<RT, 9>, 2>*>(reinterpret_cast<void*>(u)); \
  } \
  struct SwallowSemicolon

#define ALIGN_RE_PAIR(T, U) \
  template <> \
  inline __host__ __device__ vtkm::Pair<T, U>* aligned_reinterpret_cast(void* u) \
  { \
    return reinterpret_cast<vtkm::Pair<T, U>*>(reinterpret_cast<void*>(u)); \
  } \
  template <> \
  inline __host__ __device__ vtkm::Pair<T, U>* aligned_reinterpret_cast(vtkm::UInt8* u) \
  { \
    return reinterpret_cast<vtkm::Pair<T, U>*>(reinterpret_cast<void*>(u)); \
  } \
  struct SwallowSemicolon

#ifndef VTKM_DONT_FIX_THRUST
ALIGN_RE_T(bool);
ALIGN_RE_T(char);
ALIGN_RE_T(vtkm::Int8);
ALIGN_RE_T(vtkm::UInt8);
ALIGN_RE_T(vtkm::Int16);
ALIGN_RE_T(vtkm::UInt16);
ALIGN_RE_T(vtkm::Int32);
ALIGN_RE_T(vtkm::UInt32);
// Need these for VTK. Don't need long long, since those are used for [U]Int64.
ALIGN_RE_T(long);
ALIGN_RE_T(unsigned long);
ALIGN_RE_T(vtkm::Int64);
ALIGN_RE_T(vtkm::UInt64);
ALIGN_RE_T(vtkm::Float32);
ALIGN_RE_T(vtkm::Float64);
#endif

ALIGN_RE_VEC(char);
ALIGN_RE_VEC(vtkm::Int8);
ALIGN_RE_VEC(vtkm::UInt8);
ALIGN_RE_VEC(vtkm::Int16);
ALIGN_RE_VEC(vtkm::UInt16);
ALIGN_RE_VEC(vtkm::Int32);
ALIGN_RE_VEC(vtkm::UInt32);
// Need these for VTK. Don't need long long, since those are used for [U]Int64.
ALIGN_RE_VEC(long);
ALIGN_RE_VEC(unsigned long);
ALIGN_RE_VEC(vtkm::Int64);
ALIGN_RE_VEC(vtkm::UInt64);
ALIGN_RE_VEC(vtkm::Float32);
ALIGN_RE_VEC(vtkm::Float64);

ALIGN_RE_PAIR(vtkm::Int32, vtkm::Int32);
ALIGN_RE_PAIR(vtkm::Int32, vtkm::Int64);
ALIGN_RE_PAIR(vtkm::Int32, vtkm::Float32);
ALIGN_RE_PAIR(vtkm::Int32, vtkm::Float64);

ALIGN_RE_PAIR(vtkm::Int64, vtkm::Int32);
ALIGN_RE_PAIR(vtkm::Int64, vtkm::Int64);
ALIGN_RE_PAIR(vtkm::Int64, vtkm::Float32);
ALIGN_RE_PAIR(vtkm::Int64, vtkm::Float64);

#undef ALIGN_RE_T
#undef ALIGN_RE_VEC
#undef ALIGN_RE_PAIR
} // namespace detail
} // namespace thrust
#endif //THRUST_VERSION >= 100900 && THRUST_VERSION < 100906

#if (THRUST_VERSION >= 100904) && (THRUST_VERSION < 100909)
// For Thrust 1.9.4+ (CUDA 10.1+) the stateless_resource_allocator has a bug
// where it is not marked as __host__ __device__ and __thrust_exec_check_disable__.
// To fix this we add a new partial specialization on cuda::memory_resource
// with the correct markup (which is what everyone uses).
// See: https://github.com/thrust/thrust/issues/972
VTKM_THIRDPARTY_PRE_INCLUDE
#include <thrust/mr/allocator.h>
#include <thrust/system/cuda/memory_resource.h>
VTKM_THIRDPARTY_POST_INCLUDE

namespace thrust
{
namespace mr
{

template <typename T>
class stateless_resource_allocator<T, ::thrust::system::cuda::memory_resource>
  : public thrust::mr::allocator<T, ::thrust::system::cuda::memory_resource>
{
  typedef ::thrust::system::cuda::memory_resource Upstream;
  typedef thrust::mr::allocator<T, Upstream> base;

public:
  template <typename U>
  struct rebind
  {
    typedef stateless_resource_allocator<U, Upstream> other;
  };

  __thrust_exec_check_disable__ // modification, required to suppress warnings
  __host__ __device__           // modification, required to suppress warnings
  stateless_resource_allocator()
    : base(get_global_resource<Upstream>())
  {
  }

  __host__ __device__ stateless_resource_allocator(const stateless_resource_allocator& other)
    : base(other)
  {
  }

  template <typename U>
  __host__ __device__
  stateless_resource_allocator(const stateless_resource_allocator<U, Upstream>& other)
    : base(other)
  {
  }

  __host__ __device__ ~stateless_resource_allocator() {}
};
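
// Illustrative sketch (hypothetical usage, not part of the patch itself): the
// added markup lets the allocator be constructed from functions that nvcc
// compiles for both host and device without triggering "calling a __host__
// function from a __host__ __device__ function" diagnostics, e.g.
//
//   using ExampleAlloc =
//     thrust::mr::stateless_resource_allocator<vtkm::Float32,
//                                              thrust::system::cuda::memory_resource>;
//
//   __host__ __device__ inline void ConstructExample()
//   {
//     ExampleAlloc alloc;
//     (void)alloc;
//   }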
} // namespace mr
} // namespace thrust
#endif //(THRUST_VERSION >= 100904) && (THRUST_VERSION < 100909)


#if THRUST_VERSION < 100900
// For Thrust 1.8.0 - 1.8.2 the inclusive_scan has a bug when accumulating
// values when the binary operator states it is not commutative.
// For more complex value types, we patch thrust/bulk with the fix found in
// issue: https://github.com/thrust/thrust/issues/692
//
// This specialization needs to be included before ANY thrust includes;
// otherwise other device code inside thrust that calls it will not see it.
namespace vtkm
{
namespace exec
{
namespace cuda
{
namespace internal
{
// Forward declaration of WrappedBinaryOperator
template <typename T, typename F>
class WrappedBinaryOperator;
}
}
}
} // namespace vtkm::exec::cuda::internal

namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace bulk_
{
namespace detail
{
namespace accumulate_detail
{
template <typename ConcurrentGroup,
          typename RandomAccessIterator,
          typename Size,
          typename T,
          typename F>
__device__ T
destructive_accumulate_n(ConcurrentGroup& g,
                         RandomAccessIterator first,
                         Size n,
                         T init,
                         vtkm::exec::cuda::internal::WrappedBinaryOperator<T, F> binary_op)
{
  using size_type = typename ConcurrentGroup::size_type;

  size_type tid = g.this_exec.index();

  T x = init;
  if (tid < n)
  {
    x = first[tid];
  }

  g.wait();

  for (size_type offset = 1; offset < g.size(); offset += offset)
  {
    if (tid >= offset && tid - offset < n)
    {
      x = binary_op(first[tid - offset], x);
    }

    g.wait();

    if (tid < n)
    {
      first[tid] = x;
    }

    g.wait();
  }

  T result = binary_op(init, first[n - 1]);

  g.wait();

  return result;
}
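
// Illustrative sketch (hypothetical host-side analogue, not part of the patch):
// the loop above performs a Hillis-Steele style inclusive scan in which the
// stride doubles every pass and g.wait() separates the reads from the writes.
// A sequential equivalent over plain integers would look like:
//
//   std::vector<int> v = { 1, 2, 3, 4 };
//   for (std::size_t offset = 1; offset < v.size(); offset *= 2)
//   {
//     // Walking from high to low indices stands in for the barrier: every
//     // read of v[i - offset] sees the value from the previous pass.
//     for (std::size_t i = v.size() - 1; i >= offset; --i)
//     {
//       v[i] = v[i - offset] + v[i];
//     }
//   }
//   // v is now { 1, 3, 6, 10 }; destructive_accumulate_n would additionally
//   // return binary_op(init, v.back()).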
}
}
} // namespace bulk_::detail::accumulate_detail
}
}
}
} // namespace thrust::system::cuda::detail
#endif //THRUST_VERSION < 100900

#endif //CUDA enabled

#endif //vtk_m_exec_cuda_internal_ThrustPatches_h