Taskflow 2.4-master-branch
cuda_flow.hpp
#pragma once

#include "cuda_task.hpp"

namespace tf {

// class: cudaFlow
// methods for building a CUDA task dependency graph
class cudaFlow {

  friend class Executor;

  public:

    // constructs a cudaFlow builder object
    template <typename P>
    cudaFlow(cudaGraph& graph, P&& p);

    // queries the emptiness of the graph
    bool empty() const;

    // creates a no-operation task
    cudaTask noop();

    // CUDA is quite restrictive about invoking host callbacks inside a
    // cudaGraph. We disable this function and wait for future stability.
    //
    // @brief creates a host execution task
    //
    // @tparam C callable type
    //
    // @param c a callable object constructible from std::function<void()>.
    //
    // A host task can only execute CPU-specific functions and cannot make any
    // CUDA calls (e.g., cudaMalloc).
    //
    // template <typename C>
    // cudaTask host(C&& c);

    // creates a kernel task
    template <typename F, typename... ArgsT>
    cudaTask kernel(dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args);

    // creates a kernel task on a given device
    template <typename F, typename... ArgsT>
    cudaTask kernel_on(int d, dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args);

    // creates a memset task
    cudaTask memset(void* dst, int v, size_t count);

    // creates a memcpy task
    cudaTask memcpy(void* tgt, const void* src, size_t bytes);

    // creates a zero task that zeroes a typed memory block
    template <typename T>
    std::enable_if_t<
      is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4),
      cudaTask
    >
    zero(T* dst, size_t count);

    // creates a fill task that fills a typed memory block with a value
    template <typename T>
    std::enable_if_t<
      is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4),
      cudaTask
    >
    fill(T* dst, T value, size_t count);

    // creates a copy task of typed data
    template <
      typename T,
      std::enable_if_t<!std::is_same<T, void>::value, void>* = nullptr
    >
    cudaTask copy(T* tgt, const T* src, size_t num);

    // assigns a device on which to launch the cudaFlow
    void device(int device);

    // queries the device associated with the cudaFlow
    int device() const;

    // assigns a stream to launch the cudaFlow
    void stream(cudaStream_t stream);

    // assigns a predicate to loop the cudaFlow until the predicate is satisfied
    template <typename P>
    void predicate(P&& p);

    // repeats the execution of the cudaFlow by n times
    void repeat(size_t n);

  private:

    cudaGraph& _graph;

    int _device {0};

    nstd::optional<cudaStream_t> _stream;

    std::function<bool()> _predicate;
};

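// ----------------------------------------------------------------------------
// Usage sketch (illustrative, not part of this header): a cudaFlow is
// typically obtained by emplacing a callable that accepts a tf::cudaFlow&
// into a tf::Taskflow and running the taskflow through an Executor. The
// kernel saxpy, the host vectors hx/hy, the device pointers dx/dy, and the
// int count N below are hypothetical.
//
//   __global__ void saxpy(int n, float a, float* x, float* y) {
//     int i = blockIdx.x*blockDim.x + threadIdx.x;
//     if(i < n) y[i] = a*x[i] + y[i];
//   }
//
//   tf::Executor executor;
//   tf::Taskflow taskflow;
//
//   taskflow.emplace([&](tf::cudaFlow& cf){
//     auto h2d_x = cf.copy(dx, hx.data(), N);  // host-to-device copy of x
//     auto h2d_y = cf.copy(dy, hy.data(), N);  // host-to-device copy of y
//     auto exec  = cf.kernel((N+255)/256, 256, 0, saxpy, N, 2.0f, dx, dy);
//     auto d2h_y = cf.copy(hy.data(), dy, N);  // device-to-host copy of y
//     h2d_x.precede(exec);
//     h2d_y.precede(exec);
//     exec.precede(d2h_y);
//   });
//
//   executor.run(taskflow).wait();
// ----------------------------------------------------------------------------
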
// Constructor
template <typename P>
cudaFlow::cudaFlow(cudaGraph& g, P&& p) :
  _graph     {g},
  _predicate {std::forward<P>(p)} {
}

// Procedure: predicate
template <typename P>
void cudaFlow::predicate(P&& pred) {
  _predicate = std::forward<P>(pred);
}

// Procedure: repeat
inline void cudaFlow::repeat(size_t n) {
  _predicate = [n] () mutable { return n-- == 0; };
}

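// Note on iteration (sketch): the stored predicate is what the Executor
// consults to decide when to stop re-launching the cudaFlow; the flow keeps
// being launched until the predicate returns true. For example (names
// illustrative):
//
//   cf.repeat(100);                                  // launch the graph 100 times
//   cf.predicate([&](){ return residual < 1e-6; });  // or loop until converged
//
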
// Function: empty
inline bool cudaFlow::empty() const {
  return _graph._nodes.empty();
}

// Procedure: device
inline void cudaFlow::device(int d) {
  _device = d;
}

// Function: device
inline int cudaFlow::device() const {
  return _device;
}

// Procedure: stream
inline void cudaFlow::stream(cudaStream_t s) {
  _stream = s;
}

// Function: noop
inline cudaTask cudaFlow::noop() {
  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Noop>{},
    [](cudaGraph_t& graph, cudaGraphNode_t& node){
      TF_CHECK_CUDA(
        ::cudaGraphAddEmptyNode(&node, graph, nullptr, 0),
        "failed to create a no-operation (empty) node"
      );
    }
  );
  return cudaTask(node);
}

// Function: host (disabled; see the note in the class declaration)
//template <typename C>
//cudaTask cudaFlow::host(C&& c) {
//  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Host>{},
//    [c=std::forward<C>(c)](cudaGraph_t& graph, cudaGraphNode_t& node) mutable {
//      cudaHostNodeParams p;
//      p.fn = [] (void* data) { (*static_cast<C*>(data))(); };
//      p.userData = &c;
//      TF_CHECK_CUDA(
//        ::cudaGraphAddHostNode(&node, graph, nullptr, 0, &p),
//        "failed to create a host node"
//      );
//    }
//  );
//  return cudaTask(node);
//}

// Function: kernel
template <typename F, typename... ArgsT>
cudaTask cudaFlow::kernel(
  dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args
) {

  using traits = function_traits<F>;

  static_assert(traits::arity == sizeof...(ArgsT), "arity mismatches");

  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Kernel>{},
    [g, b, s, f=(void*)f, args...] (cudaGraph_t& graph, cudaGraphNode_t& node) {

      cudaKernelNodeParams p;
      void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
      p.func = f;
      p.gridDim = g;
      p.blockDim = b;
      p.sharedMemBytes = s;
      p.kernelParams = arguments;
      p.extra = nullptr;

      TF_CHECK_CUDA(
        ::cudaGraphAddKernelNode(&node, graph, nullptr, 0, &p),
        "failed to create a cudaGraph node in kernel task"
      );
    }
  );

  return cudaTask(node);
}

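// Example (sketch): creating a kernel task for a hypothetical kernel `scale`.
// The trailing arguments must match the kernel's parameter list in number
// (checked by the static_assert above) and should match it in type, since
// each argument is captured by value and handed to the CUDA graph by address.
//
//   __global__ void scale(float* data, float factor, size_t n);
//
//   // ceil(n/256) blocks of 256 threads, no dynamic shared memory
//   cf.kernel((n + 255) / 256, 256, 0, scale, d_data, 2.0f, n);
//
//   // the same launch, with the graph node created while device 1 is current
//   cf.kernel_on(1, (n + 255) / 256, 256, 0, scale, d_data, 2.0f, n);
//
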
// Function: kernel_on
template <typename F, typename... ArgsT>
cudaTask cudaFlow::kernel_on(
  int d, dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args
) {

  using traits = function_traits<F>;

  static_assert(traits::arity == sizeof...(ArgsT), "arity mismatches");

  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Kernel>{},
    [d, g, b, s, f=(void*)f, args...] (cudaGraph_t& graph, cudaGraphNode_t& node) {

      cudaKernelNodeParams p;
      void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
      p.func = f;
      p.gridDim = g;
      p.blockDim = b;
      p.sharedMemBytes = s;
      p.kernelParams = arguments;
      p.extra = nullptr;

      cudaScopedDevice ctx(d);
      TF_CHECK_CUDA(
        ::cudaGraphAddKernelNode(&node, graph, nullptr, 0, &p),
        "failed to create a cudaGraph node in kernel_on task"
      );
    }
  );

  return cudaTask(node);
}

// Function: zero
template <typename T>
std::enable_if_t<
  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4),
  cudaTask
>
cudaFlow::zero(T* dst, size_t count) {
  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Memset>{},
    [dst, count] (cudaGraph_t& graph, cudaGraphNode_t& node) {
      cudaMemsetParams p;
      p.dst = dst;
      p.value = 0;
      p.pitch = 0;
      p.elementSize = sizeof(T);  // either 1, 2, or 4
      p.width = count;
      p.height = 1;
      TF_CHECK_CUDA(
        cudaGraphAddMemsetNode(&node, graph, nullptr, 0, &p),
        "failed to create a cudaGraph node in zero task"
      );
    }
  );
  return cudaTask(node);
}

// Function: fill
template <typename T>
std::enable_if_t<
  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4),
  cudaTask
>
cudaFlow::fill(T* dst, T value, size_t count) {
  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Memset>{},
    [dst, value, count] (cudaGraph_t& graph, cudaGraphNode_t& node) {
      cudaMemsetParams p;
      p.dst = dst;

      // perform a bit-wise copy of the fill value
      p.value = 0;  // crucial
      static_assert(sizeof(T) <= sizeof(p.value), "internal error");
      std::memcpy(&p.value, &value, sizeof(T));

      p.pitch = 0;
      p.elementSize = sizeof(T);  // either 1, 2, or 4
      p.width = count;
      p.height = 1;
      TF_CHECK_CUDA(
        cudaGraphAddMemsetNode(&node, graph, nullptr, 0, &p),
        "failed to create a cudaGraph node in fill task"
      );
    }
  );
  return cudaTask(node);
}

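// Example (sketch): zeroing and filling typed device memory. The pointer
// d_vec and the count N are hypothetical; T must be a POD type of 1, 2, or
// 4 bytes, as required by the enable_if constraints above.
//
//   float* d_vec;               // N floats allocated with cudaMalloc
//   cf.zero(d_vec, N);          // set all N elements to 0.0f
//   cf.fill(d_vec, 1.0f, N);    // set all N elements to 1.0f
//
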
// Function: copy
template <
  typename T,
  std::enable_if_t<!std::is_same<T, void>::value, void>*
>
cudaTask cudaFlow::copy(T* tgt, const T* src, size_t num) {

  using U = std::decay_t<T>;

  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Copy>{},
    [tgt, src, num] (cudaGraph_t& graph, cudaGraphNode_t& node) {

      cudaMemcpy3DParms p;
      p.srcArray = nullptr;
      p.srcPos = ::make_cudaPos(0, 0, 0);
      p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*sizeof(U), num, 1);
      p.dstArray = nullptr;
      p.dstPos = ::make_cudaPos(0, 0, 0);
      p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1);
      p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1);
      p.kind = cudaMemcpyDefault;

      TF_CHECK_CUDA(
        cudaGraphAddMemcpyNode(&node, graph, nullptr, 0, &p),
        "failed to create a cudaGraph node in copy task"
      );
    }
  );

  return cudaTask(node);
}

// Function: memset
inline cudaTask cudaFlow::memset(void* dst, int ch, size_t count) {

  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Memset>{},
    [dst, ch, count] (cudaGraph_t& graph, cudaGraphNode_t& node) {
      cudaMemsetParams p;
      p.dst = dst;
      p.value = ch;
      p.pitch = 0;
      //p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1;
      //p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count;
      p.elementSize = 1;  // either 1, 2, or 4
      p.width = count;
      p.height = 1;
      TF_CHECK_CUDA(
        cudaGraphAddMemsetNode(&node, graph, nullptr, 0, &p),
        "failed to create a cudaGraph node in memset task"
      );
    }
  );

  return cudaTask(node);
}

// Function: memcpy
inline cudaTask cudaFlow::memcpy(void* tgt, const void* src, size_t bytes) {
  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Copy>{},
    [tgt, src, bytes] (cudaGraph_t& graph, cudaGraphNode_t& node) {
      // Parameters in cudaPitchedPtr
      // d   - Pointer to allocated memory
      // p   - Pitch of allocated memory in bytes
      // xsz - Logical width of allocation in elements
      // ysz - Logical height of allocation in elements
      cudaMemcpy3DParms p;
      p.srcArray = nullptr;
      p.srcPos = ::make_cudaPos(0, 0, 0);
      p.srcPtr = ::make_cudaPitchedPtr(const_cast<void*>(src), bytes, bytes, 1);
      p.dstArray = nullptr;
      p.dstPos = ::make_cudaPos(0, 0, 0);
      p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1);
      p.extent = ::make_cudaExtent(bytes, 1, 1);
      p.kind = cudaMemcpyDefault;
      TF_CHECK_CUDA(
        cudaGraphAddMemcpyNode(&node, graph, nullptr, 0, &p),
        "failed to create a cudaGraph node in memcpy task"
      );
    }
  );
  return cudaTask(node);
}

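// Example (sketch): untyped, byte-oriented counterparts of zero/fill/copy.
// Both tasks operate on raw bytes; d_buf and h_buf are hypothetical buffers
// holding N ints.
//
//   cf.memset(d_buf, 0, N * sizeof(int));      // clear N ints (byte value 0)
//   cf.memcpy(d_buf, h_buf, N * sizeof(int));  // copy N ints, size in bytes
//
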
}  // end of namespace tf ----------------------------------------------------