3 #include "cuda_task.hpp" 4 #include "cuda_ops.hpp" 11 constexpr
size_t cuda_default_threads_per_block(
size_t N) {
12 return N >= 256 ? 256 : 128;
78 template <
typename F,
typename... ArgsT>
96 template <
typename F,
typename... ArgsT>
135 template <
typename T>
137 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
140 zero(T* dst,
size_t count);
154 template <
typename T>
156 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
159 fill(T* dst, T value,
size_t count);
177 std::enable_if_t<!std::is_same<T, void>::value,
void>* =
nullptr 208 template <
typename P>
245 template <
typename I,
typename C>
273 template <
typename I,
typename C>
296 template <
typename T,
typename C,
typename... S>
303 template <
typename T>
304 cudaTask transpose(
const T* d_in, T* d_out,
size_t rows,
size_t cols);
332 bool _joinable {
true};
347 template <
typename P>
348 void offload_until(P&& predicate);
355 void offload_n(
size_t N);
364 inline cudaFlow::cudaFlow(
Executor& e, cudaGraph& g) :
371 return _graph._nodes.empty();
377 TF_THROW(
"cudaFlow has been assigned to device ", _device);
389 auto node = _graph.emplace_back(
390 [](cudaGraph_t& graph, cudaGraphNode_t& node){
392 ::cudaGraphAddEmptyNode(&node, graph,
nullptr, 0),
393 "failed to create a no-operation (empty) node" 396 nstd::in_place_type_t<cudaNode::Noop>{}
419 template <
typename F,
typename... ArgsT>
421 dim3 g, dim3 b,
size_t s, F&& f, ArgsT&&... args
424 using traits = function_traits<F>;
426 static_assert(traits::arity ==
sizeof...(ArgsT),
"arity mismatches");
428 auto node = _graph.emplace_back(
429 [g, b, s, f=(
void*)f, args...]
430 (cudaGraph_t& graph, cudaGraphNode_t& node)
mutable {
432 cudaKernelNodeParams p;
433 void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
437 p.sharedMemBytes = s;
438 p.kernelParams = arguments;
442 ::cudaGraphAddKernelNode(&node, graph,
nullptr, 0, &p),
443 "failed to create a cudaGraph node of kernel task" 446 nstd::in_place_type_t<cudaNode::Kernel>{}
449 return cudaTask(node);
453 template <
typename F,
typename... ArgsT>
455 int d, dim3 g, dim3 b,
size_t s, F&& f, ArgsT&&... args
458 using traits = function_traits<F>;
460 static_assert(traits::arity ==
sizeof...(ArgsT),
"arity mismatches");
462 auto node = _graph.emplace_back(
463 [d, g, b, s, f=(
void*)f, args...]
464 (cudaGraph_t& graph, cudaGraphNode_t& node)
mutable {
466 cudaKernelNodeParams p;
467 void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
471 p.sharedMemBytes = s;
472 p.kernelParams = arguments;
475 cudaScopedDevice ctx(d);
477 ::cudaGraphAddKernelNode(&node, graph,
nullptr, 0, &p),
478 "failed to create a cudaGraph node of kernel_on task" 481 nstd::in_place_type_t<cudaNode::Kernel>{}
484 return cudaTask(node);
488 template <
typename T>
490 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
493 cudaFlow::zero(T* dst,
size_t count) {
494 auto node = _graph.emplace_back(
495 [dst, count] (cudaGraph_t& graph, cudaGraphNode_t& node)
mutable {
500 p.elementSize =
sizeof(T);
504 cudaGraphAddMemsetNode(&node, graph,
nullptr, 0, &p),
505 "failed to create a cudaGraph node of zero task" 508 nstd::in_place_type_t<cudaNode::Memset>{}
514 template <
typename T>
516 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
519 cudaFlow::fill(T* dst, T value,
size_t count) {
520 auto node = _graph.emplace_back(
521 [dst, value, count] (cudaGraph_t& graph, cudaGraphNode_t& node)
mutable {
527 static_assert(
sizeof(T) <=
sizeof(p.value),
"internal error");
528 std::memcpy(&p.value, &value,
sizeof(T));
531 p.elementSize =
sizeof(T);
535 cudaGraphAddMemsetNode(&node, graph,
nullptr, 0, &p),
536 "failed to create a cudaGraph node of fill task" 539 nstd::in_place_type_t<cudaNode::Memset>{}
547 std::enable_if_t<!std::is_same<T, void>::value,
void>*
549 cudaTask cudaFlow::copy(T* tgt,
const T* src,
size_t num) {
551 using U = std::decay_t<T>;
553 auto node = _graph.emplace_back(
554 [tgt, src, num] (cudaGraph_t& graph, cudaGraphNode_t& node)
mutable {
557 p.srcArray =
nullptr;
558 p.srcPos = ::make_cudaPos(0, 0, 0);
559 p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*
sizeof(U), num, 1);
560 p.dstArray =
nullptr;
561 p.dstPos = ::make_cudaPos(0, 0, 0);
562 p.dstPtr = ::make_cudaPitchedPtr(tgt, num*
sizeof(U), num, 1);
563 p.extent = ::make_cudaExtent(num*
sizeof(U), 1, 1);
564 p.kind = cudaMemcpyDefault;
567 cudaGraphAddMemcpyNode(&node, graph,
nullptr, 0, &p),
568 "failed to create a cudaGraph node of copy task" 571 nstd::in_place_type_t<cudaNode::Copy>{}
578 inline cudaTask cudaFlow::memset(
void* dst,
int ch,
size_t count) {
580 auto node = _graph.emplace_back(
581 [dst, ch, count] (cudaGraph_t& graph, cudaGraphNode_t& node)
mutable {
592 cudaGraphAddMemsetNode(&node, graph,
nullptr, 0, &p),
593 "failed to create a cudaGraph node of memset task" 596 nstd::in_place_type_t<cudaNode::Memset>{}
603 inline cudaTask cudaFlow::memcpy(
void* tgt,
const void* src,
size_t bytes) {
604 auto node = _graph.emplace_back(
605 [tgt, src, bytes] (cudaGraph_t& graph, cudaGraphNode_t& node)
mutable {
612 p.srcArray =
nullptr;
613 p.srcPos = ::make_cudaPos(0, 0, 0);
614 p.srcPtr = ::make_cudaPitchedPtr(const_cast<void*>(src), bytes, bytes, 1);
615 p.dstArray =
nullptr;
616 p.dstPos = ::make_cudaPos(0, 0, 0);
617 p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1);
618 p.extent = ::make_cudaExtent(bytes, 1, 1);
619 p.kind = cudaMemcpyDefault;
621 cudaGraphAddMemcpyNode(&node, graph,
nullptr, 0, &p),
622 "failed to create a cudaGraph node of memcpy task" 625 nstd::in_place_type_t<cudaNode::Copy>{}
631 template <
typename I,
typename C>
632 cudaTask cudaFlow::for_each(I first, I last, C&& c) {
634 size_t N = std::distance(first, last);
635 size_t B = cuda_default_threads_per_block(N);
638 (N+B-1) / B, B, 0, cuda_for_each<I, C>, first, N, std::forward<C>(c)
643 template <
typename I,
typename C>
644 cudaTask cudaFlow::for_each_index(I beg, I end, I inc, C&& c) {
646 if(is_range_invalid(beg, end, inc)) {
647 TF_THROW(
"invalid range [", beg,
", ", end,
") with inc size ", inc);
650 size_t N = distance(beg, end, inc);
656 size_t B = cuda_default_threads_per_block(N);
659 (N+B-1) / B, B, 0, cuda_for_each_index<I, C>, beg, inc, N, std::forward<C>(c)
664 template <
typename T,
typename C,
typename... S>
665 cudaTask cudaFlow::transform(T* tgt,
size_t N, C&& c, S*... srcs) {
671 size_t B = cuda_default_threads_per_block(N);
674 (N+B-1) / B, B, 0, cuda_transform<T, C, S...>,
675 tgt, N, std::forward<C>(c), srcs...
680 template <
typename T>
681 cudaTask cudaFlow::transpose(
const T* d_in, T* d_out,
size_t rows,
size_t cols) {
683 if(rows == 0 || cols == 0) {
687 size_t grid_dimx = (cols + 31) / 32;
688 size_t grid_dimy = (rows + 31) / 32;
691 dim3(grid_dimx, grid_dimy, 1),
cudaTask for_each(I first, I last, C &&callable)
applies a callable to each dereferenced element of the data array
Definition: cuda_flow.hpp:632
cudaTask memcpy(void *tgt, const void *src, size_t bytes)
creates a memcpy task
Definition: cuda_flow.hpp:603
cudaTask copy(T *tgt, const T *src, size_t num)
creates a copy task
Definition: cuda_flow.hpp:549
std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), cudaTask > fill(T *dst, T value, size_t count)
creates a fill task that fills a typed memory block with a value
Definition: cuda_flow.hpp:519
cudaTask for_each_index(I first, I last, I step, C &&callable)
applies a callable to each index in the range with the step size
Definition: cuda_flow.hpp:644
cudaTask memset(void *dst, int v, size_t count)
creates a memset task
Definition: cuda_flow.hpp:578
bool empty() const
queries the emptiness of the graph
Definition: cuda_flow.hpp:370
methods for building a CUDA task dependency graph.
Definition: cuda_flow.hpp:26
cudaTask noop()
creates a no-operation task
Definition: cuda_flow.hpp:388
cudaTask kernel(dim3 g, dim3 b, size_t s, F &&f, ArgsT &&... args)
creates a kernel task
Definition: cuda_flow.hpp:420
void join_until(P &&predicate)
offloads the cudaFlow with the given stop predicate and then joins the execution
Definition: executor.hpp:1432
cudaTask kernel_on(int d, dim3 g, dim3 b, size_t s, F &&f, ArgsT &&... args)
creates a kernel task on a device
Definition: cuda_flow.hpp:454
void join()
offloads the cudaFlow once and then joins the execution
Definition: executor.hpp:1448
int device() const
queries the device associated with the cudaFlow
Definition: cuda_flow.hpp:383
handle to a node in a cudaGraph
Definition: cuda_task.hpp:12
cudaTask transform(T *tgt, size_t N, C &&callable, S *... srcs)
applies a callable to a source range and stores the result in a target range
Definition: cuda_flow.hpp:665
execution interface for running a taskflow graph
Definition: executor.hpp:24
std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), cudaTask > zero(T *dst, size_t count)
creates a zero task that zeroes a typed memory block
Definition: cuda_flow.hpp:493
void join_n(size_t N)
offloads the cudaFlow the given number of times and then joins the execution
Definition: executor.hpp:1443