3 #include "cuda_task.hpp" 81 template <
typename F,
typename... ArgsT>
99 template <
typename F,
typename... ArgsT>
138 template <
typename T>
140 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
143 zero(T* dst,
size_t count);
157 template <
typename T>
159 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
162 fill(T* dst, T value,
size_t count);
180 std::enable_if_t<!std::is_same<T, void>::value,
void>* =
nullptr 211 template <
typename P>
225 nstd::optional<cudaStream_t> _stream;
231 template <
typename P>
234 _predicate {std::forward<P>(p)} {
238 template <
typename P>
240 _predicate = std::forward<P>(pred);
245 _predicate = [n] ()
mutable {
return n-- == 0; };
250 return _graph._nodes.empty();
270 auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Noop>{},
271 [](cudaGraph_t& graph, cudaGraphNode_t& node){
273 ::cudaGraphAddEmptyNode(&node, graph,
nullptr, 0),
274 "failed to create a no-operation (empty) node" 299 template <
typename F,
typename... ArgsT>
301 dim3 g, dim3 b,
size_t s, F&& f, ArgsT&&... args
304 using traits = function_traits<F>;
306 static_assert(traits::arity ==
sizeof...(ArgsT),
"arity mismatches");
308 auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Kernel>{},
309 [g, b, s, f=(
void*)f, args...] (cudaGraph_t& graph, cudaGraphNode_t& node) {
311 cudaKernelNodeParams p;
312 void* arguments[
sizeof...(ArgsT)] = { (
void*)(&args)... };
316 p.sharedMemBytes = s;
317 p.kernelParams = arguments;
321 ::cudaGraphAddKernelNode(&node, graph,
nullptr, 0, &p),
322 "failed to create a cudaGraph node in kernel task" 331 template <
typename F,
typename... ArgsT>
333 int d, dim3 g, dim3 b,
size_t s, F&& f, ArgsT&&... args
336 using traits = function_traits<F>;
338 static_assert(traits::arity ==
sizeof...(ArgsT),
"arity mismatches");
340 auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Kernel>{},
341 [d, g, b, s, f=(
void*)f, args...] (cudaGraph_t& graph, cudaGraphNode_t& node) {
343 cudaKernelNodeParams p;
344 void* arguments[
sizeof...(ArgsT)] = { (
void*)(&args)... };
348 p.sharedMemBytes = s;
349 p.kernelParams = arguments;
352 cudaScopedDevice ctx(d);
354 ::cudaGraphAddKernelNode(&node, graph,
nullptr, 0, &p),
355 "failed to create a cudaGraph node in kernel_on task" 364 template <
typename T>
366 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
370 auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Memset>{},
371 [dst, count] (cudaGraph_t& graph, cudaGraphNode_t& node) {
376 p.elementSize =
sizeof(T);
380 cudaGraphAddMemsetNode(&node, graph,
nullptr, 0, &p),
381 "failed to create a cudaGraph node in zero task" 389 template <
typename T>
391 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
395 auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Memset>{},
396 [dst, value, count] (cudaGraph_t& graph, cudaGraphNode_t& node) {
402 static_assert(
sizeof(T) <=
sizeof(p.value),
"internal error");
403 std::memcpy(&p.value, &value,
sizeof(T));
406 p.elementSize =
sizeof(T);
410 cudaGraphAddMemsetNode(&node, graph,
nullptr, 0, &p),
411 "failed to create a cudaGraph node in fill task" 421 std::enable_if_t<!std::is_same<T, void>::value,
void>*
425 using U = std::decay_t<T>;
427 auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Copy>{},
428 [tgt, src, num] (cudaGraph_t& graph, cudaGraphNode_t& node) {
431 p.srcArray =
nullptr;
432 p.srcPos = ::make_cudaPos(0, 0, 0);
433 p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*
sizeof(U), num, 1);
434 p.dstArray =
nullptr;
435 p.dstPos = ::make_cudaPos(0, 0, 0);
436 p.dstPtr = ::make_cudaPitchedPtr(tgt, num*
sizeof(U), num, 1);
437 p.extent = ::make_cudaExtent(num*
sizeof(U), 1, 1);
438 p.kind = cudaMemcpyDefault;
441 cudaGraphAddMemcpyNode(&node, graph,
nullptr, 0, &p),
442 "failed to create a cudaGraph node in copy task" 453 auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Memset>{},
454 [dst, ch, count] (cudaGraph_t& graph, cudaGraphNode_t& node) {
465 cudaGraphAddMemsetNode(&node, graph,
nullptr, 0, &p),
466 "failed to create a cudaGraph node in memset task" 476 auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Copy>{},
477 [tgt, src, bytes] (cudaGraph_t& graph, cudaGraphNode_t& node) {
484 p.srcArray =
nullptr;
485 p.srcPos = ::make_cudaPos(0, 0, 0);
486 p.srcPtr = ::make_cudaPitchedPtr(const_cast<void*>(src), bytes, bytes, 1);
487 p.dstArray =
nullptr;
488 p.dstPos = ::make_cudaPos(0, 0, 0);
489 p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1);
490 p.extent = ::make_cudaExtent(bytes, 1, 1);
491 p.kind = cudaMemcpyDefault;
493 cudaGraphAddMemcpyNode(&node, graph,
nullptr, 0, &p),
494 "failed to create a cudaGraph node in memcpy task" cudaTask memcpy(void *tgt, const void *src, size_t bytes)
creates a memcpy task
Definition: cuda_flow.hpp:475
void repeat(size_t n)
repeats the execution of the cudaFlow n times
Definition: cuda_flow.hpp:244
cudaTask copy(T *tgt, const T *src, size_t num)
creates a copy task
Definition: cuda_flow.hpp:423
std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), cudaTask > fill(T *dst, T value, size_t count)
creates a fill task that fills a typed memory block with a value
Definition: cuda_flow.hpp:394
cudaTask memset(void *dst, int v, size_t count)
creates a memset task
Definition: cuda_flow.hpp:451
bool empty() const
queries whether the graph is empty (contains no nodes)
Definition: cuda_flow.hpp:249
methods for building a CUDA task dependency graph.
Definition: cuda_flow.hpp:18
cudaTask noop()
creates a no-operation task
Definition: cuda_flow.hpp:269
cudaTask kernel(dim3 g, dim3 b, size_t s, F &&f, ArgsT &&... args)
creates a kernel task
Definition: cuda_flow.hpp:300
cudaTask kernel_on(int d, dim3 g, dim3 b, size_t s, F &&f, ArgsT &&... args)
creates a kernel task on a device
Definition: cuda_flow.hpp:332
cudaFlow(cudaGraph &graph, P &&p)
constructs a cudaFlow builder object
Definition: cuda_flow.hpp:232
int device() const
queries the device associated with the cudaFlow
Definition: cuda_flow.hpp:259
handle to a node in a cudaGraph
Definition: cuda_task.hpp:12
void predicate(P &&p)
assigns a predicate to loop the cudaFlow until the predicate is satisfied
Definition: cuda_flow.hpp:239
void stream(cudaStream_t stream)
assigns a stream to launch the cudaFlow
Definition: cuda_flow.hpp:264
execution interface for running a taskflow graph
Definition: executor.hpp:43
std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), cudaTask > zero(T *dst, size_t count)
creates a zero task that zeroes a typed memory block
Definition: cuda_flow.hpp:369