Taskflow
2.4-master-branch
|
methods for building a CUDA task dependency graph. More...
#include <cuda_flow.hpp>
Public Member Functions | |
template<typename P > | |
cudaFlow (cudaGraph &graph, P &&p) | |
constructs a cudaFlow builder object More... | |
bool | empty () const |
queries the emptiness of the graph | |
cudaTask | noop () |
creates a no-operation task More... | |
template<typename F , typename... ArgsT> | |
cudaTask | kernel (dim3 g, dim3 b, size_t s, F &&f, ArgsT &&... args) |
creates a kernel task More... | |
template<typename F , typename... ArgsT> | |
cudaTask | kernel_on (int d, dim3 g, dim3 b, size_t s, F &&f, ArgsT &&... args) |
creates a kernel task on a device More... | |
cudaTask | memset (void *dst, int v, size_t count) |
creates a memset task More... | |
cudaTask | memcpy (void *tgt, const void *src, size_t bytes) |
creates a memcpy task More... | |
template<typename T > | |
std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), cudaTask > | zero (T *dst, size_t count) |
creates a zero task that zeroes a typed memory block More... | |
template<typename T > | |
std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), cudaTask > | fill (T *dst, T value, size_t count) |
creates a fill task that fills a typed memory block with a value More... | |
template<typename T , std::enable_if_t<!std::is_same< T, void >::value, void > * = nullptr> | |
cudaTask | copy (T *tgt, const T *src, size_t num) |
creates a copy task More... | |
void | device (int device) |
assigns a device to launch the cudaFlow More... | |
int | device () const |
queries the device associated with the cudaFlow | |
void | stream (cudaStream_t stream) |
assigns a stream to launch the cudaFlow More... | |
template<typename P > | |
void | predicate (P &&p) |
assigns a predicate to loop the cudaFlow until the predicate is satisfied More... | |
void | repeat (size_t n) |
repeats the execution of the cudaFlow n times | |
Friends | |
class | Executor |
methods for building a CUDA task dependency graph.
A cudaFlow is a high-level interface to manipulate GPU tasks using the task dependency graph model. The class provides a set of methods for creating and launching different tasks on one or multiple CUDA devices, for instance, kernel tasks, data transfer tasks, and memory operation tasks.
tf::cudaFlow::cudaFlow | ( | cudaGraph & | graph, |
P && | p | ||
) |
constructs a cudaFlow builder object
P | predicate type |
graph | a cudaGraph to manipulate |
p | predicate which returns true if the launching should be continued |
cudaTask tf::cudaFlow::copy | ( | T * | tgt, |
const T * | src, | ||
size_t | num | ||
) |
creates a copy task
T | element type (non-void) |
tgt | pointer to the target memory block |
src | pointer to the source memory block |
num | number of elements to copy |
A copy task transfers num*sizeof(T)
bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
|
inline |
assigns a device to launch the cudaFlow
device | target device identifier |
std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), cudaTask > tf::cudaFlow::fill | ( | T * | dst, |
T | value, | ||
size_t | count | ||
) |
creates a fill task that fills a typed memory block with a value
T | element type (size of T must be either 1, 2, or 4) |
dst | pointer to the destination device memory area |
value | value to fill for each element of type T |
count | number of elements |
A fill task fills the first count
elements of type T
with value
in a device memory area pointed by dst
. The value to fill is interpreted as a value of type T
rather than as a byte.
cudaTask tf::cudaFlow::kernel | ( | dim3 | g, |
dim3 | b, | ||
size_t | s, | ||
F && | f, | ||
ArgsT &&... | args | ||
) |
creates a kernel task
F | kernel function type |
ArgsT | kernel function parameters type |
g | configured grid |
b | configured block |
s | configured shared memory |
f | kernel function |
args | arguments to forward to the kernel function by copy |
cudaTask tf::cudaFlow::kernel_on | ( | int | d, |
dim3 | g, | ||
dim3 | b, | ||
size_t | s, | ||
F && | f, | ||
ArgsT &&... | args | ||
) |
creates a kernel task on a device
F | kernel function type |
ArgsT | kernel function parameters type |
d | device identifier to launch the kernel |
g | configured grid |
b | configured block |
s | configured shared memory |
f | kernel function |
args | arguments to forward to the kernel function by copy |
|
inline |
creates a memcpy task
tgt | pointer to the target memory block |
src | pointer to the source memory block |
bytes | bytes to copy |
A memcpy task transfers bytes
of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
|
inline |
creates a memset task
dst | pointer to the destination device memory area |
v | value to set for each byte of specified memory |
count | size in bytes to set |
A memset task fills the first count
bytes of device memory area pointed by dst
with the byte value v
.
|
inline |
creates a no-operation task
An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.
void tf::cudaFlow::predicate | ( | P && | p | ) |
|
inline |
assigns a stream to launch the cudaFlow
stream | target stream identifier |
std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), cudaTask > tf::cudaFlow::zero | ( | T * | dst, |
size_t | count | ||
) |
creates a zero task that zeroes a typed memory block
T | element type (size of T must be either 1, 2, or 4) |
dst | pointer to the destination device memory area |
count | number of elements |
A zero task zeroes the first count
elements of type T
in a device memory area pointed by dst
.