Taskflow  2.7.0
cuda_flow.hpp
1 #pragma once
2 
3 #include "cuda_task.hpp"
4 #include "cuda_ops.hpp"
5 
6 namespace tf {
7 
// Returns the default number of threads per block for launching a kernel
// over N elements: 256 for inputs of at least 256 elements, 128 otherwise.
constexpr size_t cuda_default_threads_per_block(size_t N) {
  if(N >= 256) {
    return 256;
  }
  return 128;
}
14 
26 class cudaFlow {
27 
28  friend class Executor;
29 
30  public:
31 
35  bool empty() const;
36 
47  cudaTask noop();
48 
49  // CUDA seems pretty restrictive about calling host in a cudaGraph.
50  // We disable this function and wait for future stability.
51  //
52  //@brief creates a host execution task
53  //
54  //@tparam C callable type
55  //
56  //@param c a callable object constructible from std::function<void()>.
57 
58  //A host can only execute CPU-specific functions and cannot do any CUDA calls
59  //(e.g., cudaMalloc).
60  //
61  //template <typename C>
62  //cudaTask host(C&& c);
63 
78  template <typename F, typename... ArgsT>
79  cudaTask kernel(dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args);
80 
96  template <typename F, typename... ArgsT>
97  cudaTask kernel_on(int d, dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args);
98 
109  cudaTask memset(void* dst, int v, size_t count);
110 
123  cudaTask memcpy(void* tgt, const void* src, size_t bytes);
124 
135  template <typename T>
136  std::enable_if_t<
137  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4),
138  cudaTask
139  >
140  zero(T* dst, size_t count);
141 
154  template <typename T>
155  std::enable_if_t<
156  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4),
157  cudaTask
158  >
159  fill(T* dst, T value, size_t count);
160 
175  template <
176  typename T,
177  std::enable_if_t<!std::is_same<T, void>::value, void>* = nullptr
178  >
179  cudaTask copy(T* tgt, const T* src, size_t num);
180 
188  void device(int device);
189 
193  int device() const;
194 
208  template <typename P>
209  void join_until(P&& predicate);
210 
216  void join_n(size_t N);
217 
221  void join();
222 
223  // ------------------------------------------------------------------------
224  // generic operations
225  // ------------------------------------------------------------------------
226 
245  template <typename I, typename C>
246  cudaTask for_each(I first, I last, C&& callable);
247 
273  template <typename I, typename C>
274  cudaTask for_each_index(I first, I last, I step, C&& callable);
275 
296  template <typename T, typename C, typename... S>
297  cudaTask transform(T* tgt, size_t N, C&& callable, S*... srcs);
298 
299  // TODO:
300  //template <typename T, typename B>
301  //cudaTask reduce(T* tgt, size_t N, T& init, B&& op);
302 
303  template <typename T>
304  cudaTask transpose(const T* d_in, T* d_out, size_t rows, size_t cols);
305 
306  //template <typename T>
307  //cudaTask inplace_transpose(T* data, size_t rows, size_t cols);
308 
309  //template <typename T>
310  //cudaTask matmul(const T* A, const T* B, T* C, size_t M, size_t K, size_t N);
311 
312  //[] (tf::cudaFlow& cf) {
313  //
314  // auto task1 = cf.for_each(...);
315  // auto task2 = cf.matmul(...);
316  //
317  // task1.get<KernelParameter>().block(100);
318 
319  // cf.offload();
320  // cf.memset();
321  //}
322 
323  private:
324 
325  cudaFlow(Executor& executor, cudaGraph& graph);
326 
327  Executor& _executor;
328  cudaGraph& _graph;
329 
330  int _device {-1};
331 
332  bool _joinable {true};
333 
334  // ---- working items
335 
347  template <typename P>
348  void offload_until(P&& predicate);
349 
355  void offload_n(size_t N);
356 
360  void offload();
361 };
362 
363 // Constructor
364 inline cudaFlow::cudaFlow(Executor& e, cudaGraph& g) :
365  _executor {e},
366  _graph {g} {
367 }
368 
369 // Function: empty
370 inline bool cudaFlow::empty() const {
371  return _graph._nodes.empty();
372 }
373 
374 // Procedure: device
375 inline void cudaFlow::device(int d) {
376  if(_device != -1) {
377  TF_THROW("cudaFlow has been assigned to device ", _device);
378  }
379  _device = d;
380 }
381 
382 // Function: device
383 inline int cudaFlow::device() const {
384  return _device;
385 }
386 
387 // Function: noop
389  auto node = _graph.emplace_back(
390  [](cudaGraph_t& graph, cudaGraphNode_t& node){
391  TF_CHECK_CUDA(
392  ::cudaGraphAddEmptyNode(&node, graph, nullptr, 0),
393  "failed to create a no-operation (empty) node"
394  );
395  },
396  nstd::in_place_type_t<cudaNode::Noop>{}
397  );
398  return cudaTask(node);
399 }
400 
402 //template <typename C>
403 //cudaTask cudaFlow::host(C&& c) {
404 // auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Host>{},
405 // [c=std::forward<C>(c)](cudaGraph_t& graph, cudaGraphNode_t& node) mutable {
406 // cudaHostNodeParams p;
407 // p.fn = [] (void* data) { (*static_cast<C*>(data))(); };
408 // p.userData = &c;
409 // TF_CHECK_CUDA(
410 // ::cudaGraphAddHostNode(&node, graph, nullptr, 0, &p),
411 // "failed to create a host node"
412 // );
413 // }
414 // );
415 // return cudaTask(node);
416 //}
417 
418 // Function: kernel
419 template <typename F, typename... ArgsT>
421  dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args
422 ) {
423 
424  using traits = function_traits<F>;
425 
426  static_assert(traits::arity == sizeof...(ArgsT), "arity mismatches");
427 
428  auto node = _graph.emplace_back(
429  [g, b, s, f=(void*)f, args...]
430  (cudaGraph_t& graph, cudaGraphNode_t& node) mutable {
431 
432  cudaKernelNodeParams p;
433  void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
434  p.func = f;
435  p.gridDim = g;
436  p.blockDim = b;
437  p.sharedMemBytes = s;
438  p.kernelParams = arguments;
439  p.extra = nullptr;
440 
441  TF_CHECK_CUDA(
442  ::cudaGraphAddKernelNode(&node, graph, nullptr, 0, &p),
443  "failed to create a cudaGraph node of kernel task"
444  );
445  },
446  nstd::in_place_type_t<cudaNode::Kernel>{}
447  );
448 
449  return cudaTask(node);
450 }
451 
452 // Function: kernel
453 template <typename F, typename... ArgsT>
454 cudaTask cudaFlow::kernel_on(
455  int d, dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args
456 ) {
457 
458  using traits = function_traits<F>;
459 
460  static_assert(traits::arity == sizeof...(ArgsT), "arity mismatches");
461 
462  auto node = _graph.emplace_back(
463  [d, g, b, s, f=(void*)f, args...]
464  (cudaGraph_t& graph, cudaGraphNode_t& node) mutable {
465 
466  cudaKernelNodeParams p;
467  void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
468  p.func = f;
469  p.gridDim = g;
470  p.blockDim = b;
471  p.sharedMemBytes = s;
472  p.kernelParams = arguments;
473  p.extra = nullptr;
474 
475  cudaScopedDevice ctx(d);
476  TF_CHECK_CUDA(
477  ::cudaGraphAddKernelNode(&node, graph, nullptr, 0, &p),
478  "failed to create a cudaGraph node of kernel_on task"
479  );
480  },
481  nstd::in_place_type_t<cudaNode::Kernel>{}
482  );
483 
484  return cudaTask(node);
485 }
486 
487 // Function: zero
488 template <typename T>
489 std::enable_if_t<
490  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4),
491  cudaTask
492 >
493 cudaFlow::zero(T* dst, size_t count) {
494  auto node = _graph.emplace_back(
495  [dst, count] (cudaGraph_t& graph, cudaGraphNode_t& node) mutable {
496  cudaMemsetParams p;
497  p.dst = dst;
498  p.value = 0;
499  p.pitch = 0;
500  p.elementSize = sizeof(T); // either 1, 2, or 4
501  p.width = count;
502  p.height = 1;
503  TF_CHECK_CUDA(
504  cudaGraphAddMemsetNode(&node, graph, nullptr, 0, &p),
505  "failed to create a cudaGraph node of zero task"
506  );
507  },
508  nstd::in_place_type_t<cudaNode::Memset>{}
509  );
510  return cudaTask(node);
511 }
512 
513 // Function: fill
514 template <typename T>
515 std::enable_if_t<
516  is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4),
517  cudaTask
518 >
519 cudaFlow::fill(T* dst, T value, size_t count) {
520  auto node = _graph.emplace_back(
521  [dst, value, count] (cudaGraph_t& graph, cudaGraphNode_t& node) mutable {
522  cudaMemsetParams p;
523  p.dst = dst;
524 
525  // perform bit-wise copy
526  p.value = 0; // crucial
527  static_assert(sizeof(T) <= sizeof(p.value), "internal error");
528  std::memcpy(&p.value, &value, sizeof(T));
529 
530  p.pitch = 0;
531  p.elementSize = sizeof(T); // either 1, 2, or 4
532  p.width = count;
533  p.height = 1;
534  TF_CHECK_CUDA(
535  cudaGraphAddMemsetNode(&node, graph, nullptr, 0, &p),
536  "failed to create a cudaGraph node of fill task"
537  );
538  },
539  nstd::in_place_type_t<cudaNode::Memset>{}
540  );
541  return cudaTask(node);
542 }
543 
544 // Function: copy
545 template <
546  typename T,
547  std::enable_if_t<!std::is_same<T, void>::value, void>*
548 >
549 cudaTask cudaFlow::copy(T* tgt, const T* src, size_t num) {
550 
551  using U = std::decay_t<T>;
552 
553  auto node = _graph.emplace_back(
554  [tgt, src, num] (cudaGraph_t& graph, cudaGraphNode_t& node) mutable {
555 
556  cudaMemcpy3DParms p;
557  p.srcArray = nullptr;
558  p.srcPos = ::make_cudaPos(0, 0, 0);
559  p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*sizeof(U), num, 1);
560  p.dstArray = nullptr;
561  p.dstPos = ::make_cudaPos(0, 0, 0);
562  p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1);
563  p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1);
564  p.kind = cudaMemcpyDefault;
565 
566  TF_CHECK_CUDA(
567  cudaGraphAddMemcpyNode(&node, graph, nullptr, 0, &p),
568  "failed to create a cudaGraph node of copy task"
569  );
570  },
571  nstd::in_place_type_t<cudaNode::Copy>{}
572  );
573 
574  return cudaTask(node);
575 }
576 
577 // Function: memset
578 inline cudaTask cudaFlow::memset(void* dst, int ch, size_t count) {
579 
580  auto node = _graph.emplace_back(
581  [dst, ch, count] (cudaGraph_t& graph, cudaGraphNode_t& node) mutable {
582  cudaMemsetParams p;
583  p.dst = dst;
584  p.value = ch;
585  p.pitch = 0;
586  //p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1;
587  //p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count;
588  p.elementSize = 1; // either 1, 2, or 4
589  p.width = count;
590  p.height = 1;
591  TF_CHECK_CUDA(
592  cudaGraphAddMemsetNode(&node, graph, nullptr, 0, &p),
593  "failed to create a cudaGraph node of memset task"
594  );
595  },
596  nstd::in_place_type_t<cudaNode::Memset>{}
597  );
598 
599  return cudaTask(node);
600 }
601 
602 // Function: memcpy
603 inline cudaTask cudaFlow::memcpy(void* tgt, const void* src, size_t bytes) {
604  auto node = _graph.emplace_back(
605  [tgt, src, bytes] (cudaGraph_t& graph, cudaGraphNode_t& node) mutable {
606  // Parameters in cudaPitchedPtr
607  // d - Pointer to allocated memory
608  // p - Pitch of allocated memory in bytes
609  // xsz - Logical width of allocation in elements
610  // ysz - Logical height of allocation in elements
611  cudaMemcpy3DParms p;
612  p.srcArray = nullptr;
613  p.srcPos = ::make_cudaPos(0, 0, 0);
614  p.srcPtr = ::make_cudaPitchedPtr(const_cast<void*>(src), bytes, bytes, 1);
615  p.dstArray = nullptr;
616  p.dstPos = ::make_cudaPos(0, 0, 0);
617  p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1);
618  p.extent = ::make_cudaExtent(bytes, 1, 1);
619  p.kind = cudaMemcpyDefault;
620  TF_CHECK_CUDA(
621  cudaGraphAddMemcpyNode(&node, graph, nullptr, 0, &p),
622  "failed to create a cudaGraph node of memcpy task"
623  );
624  },
625  nstd::in_place_type_t<cudaNode::Copy>{}
626  );
627  return cudaTask(node);
628 }
629 
630 // Function: for_each
631 template <typename I, typename C>
632 cudaTask cudaFlow::for_each(I first, I last, C&& c) {
633 
634  size_t N = std::distance(first, last);
635  size_t B = cuda_default_threads_per_block(N);
636 
637  return kernel(
638  (N+B-1) / B, B, 0, cuda_for_each<I, C>, first, N, std::forward<C>(c)
639  );
640 }
641 
642 // Function: for_each_index
643 template <typename I, typename C>
644 cudaTask cudaFlow::for_each_index(I beg, I end, I inc, C&& c) {
645 
646  if(is_range_invalid(beg, end, inc)) {
647  TF_THROW("invalid range [", beg, ", ", end, ") with inc size ", inc);
648  }
649 
650  size_t N = distance(beg, end, inc);
651 
652  if(N == 0) {
653  return noop();
654  }
655 
656  size_t B = cuda_default_threads_per_block(N);
657 
658  return kernel(
659  (N+B-1) / B, B, 0, cuda_for_each_index<I, C>, beg, inc, N, std::forward<C>(c)
660  );
661 }
662 
663 // Function: transform
664 template <typename T, typename C, typename... S>
665 cudaTask cudaFlow::transform(T* tgt, size_t N, C&& c, S*... srcs) {
666 
667  if(N == 0) {
668  return noop();
669  }
670 
671  size_t B = cuda_default_threads_per_block(N);
672 
673  return kernel(
674  (N+B-1) / B, B, 0, cuda_transform<T, C, S...>,
675  tgt, N, std::forward<C>(c), srcs...
676  );
677 }
678 
679 // Function: row-wise matrix transpose
680 template <typename T>
681 cudaTask cudaFlow::transpose(const T* d_in, T* d_out, size_t rows, size_t cols) {
682 
683  if(rows == 0 || cols == 0) {
684  return noop();
685  }
686 
687  size_t grid_dimx = (cols + 31) / 32;
688  size_t grid_dimy = (rows + 31) / 32;
689 
690  return kernel(
691  dim3(grid_dimx, grid_dimy, 1),
692  dim3(32, 8, 1),
693  0,
694  cuda_transpose<T>,
695  d_in,
696  d_out,
697  rows,
698  cols
699  );
700 
701 }
702 
703 //template <typename T, typename B>
704 //cudaTask cudaFlow::reduce(T* tgt, size_t N, T& init, B&& op) {
705  //if(N == 0) {
706  //return noop();
707  //}
708  //size_t B = cuda_default_threads_per_block(N);
709 //}
710 
711 
712 } // end of namespace tf -----------------------------------------------------
713 
714 
cudaTask for_each(I first, I last, C &&callable)
applies a callable to each dereferenced element of the data array
Definition: cuda_flow.hpp:632
cudaTask memcpy(void *tgt, const void *src, size_t bytes)
creates a memcpy task
Definition: cuda_flow.hpp:603
cudaTask copy(T *tgt, const T *src, size_t num)
creates a copy task
Definition: cuda_flow.hpp:549
std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), cudaTask > fill(T *dst, T value, size_t count)
creates a fill task that fills a typed memory block with a value
Definition: cuda_flow.hpp:519
cudaTask for_each_index(I first, I last, I step, C &&callable)
applies a callable to each index in the range with the step size
Definition: cuda_flow.hpp:644
Definition: error.hpp:9
cudaTask memset(void *dst, int v, size_t count)
creates a memset task
Definition: cuda_flow.hpp:578
bool empty() const
queries the emptiness of the graph
Definition: cuda_flow.hpp:370
methods for building a CUDA task dependency graph.
Definition: cuda_flow.hpp:26
cudaTask noop()
creates a no-operation task
Definition: cuda_flow.hpp:388
cudaTask kernel(dim3 g, dim3 b, size_t s, F &&f, ArgsT &&... args)
creates a kernel task
Definition: cuda_flow.hpp:420
void join_until(P &&predicate)
offloads the cudaFlow with the given stop predicate and then joins the execution
Definition: executor.hpp:1432
cudaTask kernel_on(int d, dim3 g, dim3 b, size_t s, F &&f, ArgsT &&... args)
creates a kernel task on a device
Definition: cuda_flow.hpp:454
void join()
offloads the cudaFlow once and then joins the execution
Definition: executor.hpp:1448
int device() const
queries the device associated with the cudaFlow
Definition: cuda_flow.hpp:383
handle to a node in a cudaGraph
Definition: cuda_task.hpp:12
cudaTask transform(T *tgt, size_t N, C &&callable, S *... srcs)
applies a callable to a source range and stores the result in a target range
Definition: cuda_flow.hpp:665
execution interface for running a taskflow graph
Definition: executor.hpp:24
std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), cudaTask > zero(T *dst, size_t count)
creates a zero task that zeroes a typed memory block
Definition: cuda_flow.hpp:493
void join_n(size_t N)
offloads the cudaFlow by the given times and then joins the execution
Definition: executor.hpp:1443