tf namespace

taskflow namespace

Classes

class ChromeObserver
observer interface based on Chrome tracing format
class CriticalSection
class to create a critical region of limited workers to run tasks
class cublasFlowCapturer
class to construct a cuBLAS task graph
class cudaFlow
class for building a CUDA task dependency graph
class cudaFlowCapturer
class for building a CUDA task dependency graph through stream capture
class cudaFlowCapturerBase
base class to construct a CUDA task graph through stream capture
class cudaRoundRobinCapturing
class to capture the described graph into a native cudaGraph using a greedy round-robin algorithm on a fixed number of streams
class cudaScopedDevice
RAII-styled device context switch.
class cudaScopedPerThreadEvent
class that provides RAII-styled guard of event acquisition
class cudaScopedPerThreadStream
class that provides RAII-styled guard of stream acquisition
class cudaSequentialCapturing
class to capture the described graph into a native cudaGraph using a single stream
class cudaTask
handle to a node of the internal CUDA graph
class Executor
execution interface for running a taskflow graph
class FlowBuilder
building methods of a task dependency graph
template<typename T>
class Future
class to access the result of task execution
class ObserverInterface
The interface class for creating an executor observer.
class Semaphore
class to create a semaphore object for building a concurrency constraint
class Subflow
class to construct a subflow graph from the execution of a dynamic task
class Task
handle to a node in a task dependency graph
class Taskflow
main entry to create a task dependency graph
class TaskView
class to access task information from the observer interface
class TFProfObserver
observer interface based on the built-in taskflow profiler format
class WorkerView
class to create an immutable view of a worker in an executor

Enums

enum class TaskType: int { PLACEHOLDER = 0, CUDAFLOW, STATIC, DYNAMIC, CONDITION, MODULE, ASYNC, UNDEFINED }
enumeration of all task types
enum class ObserverType: int { TFPROF = 0, CHROME, UNDEFINED }
enumeration of all observer types
enum class cudaTaskType: int { EMPTY = 0, HOST, MEMSET, MEMCPY, KERNEL, SUBFLOW, CAPTURE, UNDEFINED }
enumeration of all cudaTask types

Typedefs

using observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>
default time point type of observers
using cudaPerThreadStreamPool = cudaPerThreadDeviceObjectPool<cudaStream_t, cudaStreamCreator, cudaStreamDeleter>
alias of per-thread stream pool type
using cudaPerThreadEventPool = cudaPerThreadDeviceObjectPool<cudaEvent_t, cudaEventCreator, cudaEventDeleter>
alias of per-thread event pool type

Functions

auto to_string(TaskType type) -> const char*
convert a task type to a human-readable string
auto operator<<(std::ostream& os, const Task& task) -> std::ostream&
overload of ostream inserter operator for Task
auto to_string(ObserverType type) -> const char*
convert an observer type to a human-readable string
auto cuda_get_num_devices() -> size_t
queries the number of available devices
auto cuda_get_device() -> int
gets the current device associated with the caller thread
void cuda_set_device(int id)
switches to a given device context
void cuda_get_device_property(int i, cudaDeviceProp& p)
obtains the device property
auto cuda_get_device_property(int i) -> cudaDeviceProp
obtains the device property
void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p)
dumps the device property
auto cuda_get_device_max_threads_per_block(int d) -> size_t
queries the maximum threads per block on a device
auto cuda_get_device_max_x_dim_per_block(int d) -> size_t
queries the maximum x-dimension per block on a device
auto cuda_get_device_max_y_dim_per_block(int d) -> size_t
queries the maximum y-dimension per block on a device
auto cuda_get_device_max_z_dim_per_block(int d) -> size_t
queries the maximum z-dimension per block on a device
auto cuda_get_device_max_x_dim_per_grid(int d) -> size_t
queries the maximum x-dimension per grid on a device
auto cuda_get_device_max_y_dim_per_grid(int d) -> size_t
queries the maximum y-dimension per grid on a device
auto cuda_get_device_max_z_dim_per_grid(int d) -> size_t
queries the maximum z-dimension per grid on a device
auto cuda_get_device_max_shm_per_block(int d) -> size_t
queries the maximum shared memory size in bytes per block on a device
auto cuda_get_device_warp_size(int d) -> size_t
queries the warp size on a device
auto cuda_get_device_compute_capability_major(int d) -> int
queries the major number of compute capability of a device
auto cuda_get_device_compute_capability_minor(int d) -> int
queries the minor number of compute capability of a device
auto cuda_get_device_unified_addressing(int d) -> bool
queries if the device supports unified addressing
auto cuda_get_driver_version() -> int
queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
auto cuda_get_runtime_version() -> int
queries the CUDA Runtime version (1000 * major + 10 * minor)
auto cuda_get_free_mem(int d) -> size_t
queries the free memory (expensive call)
auto cuda_get_total_mem(int d) -> size_t
queries the total available memory (expensive call)
template<typename T>
auto cuda_malloc_device(size_t N, int d) -> T*
allocates memory on the given device for holding N elements of type T
template<typename T>
auto cuda_malloc_device(size_t N) -> T*
allocates memory on the current device associated with the caller
template<typename T>
auto cuda_malloc_shared(size_t N) -> T*
allocates shared memory for holding N elements of type T
template<typename T>
void cuda_free(T* ptr, int d)
frees memory on the GPU device
template<typename T>
void cuda_free(T* ptr)
frees memory on the GPU device
void cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)
copies data between host and device asynchronously through a stream
void cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)
initializes or sets GPU memory to the given value byte by byte
auto cuda_per_thread_stream_pool() -> cudaPerThreadStreamPool&
acquires the per-thread cuda stream pool
auto cuda_per_thread_event_pool() -> cudaPerThreadEventPool&
acquires the per-thread cuda event pool
auto to_string(cudaTaskType type) -> const char* constexpr
convert a cudaTask type to a human-readable string
auto operator<<(std::ostream& os, const cudaTask& ct) -> std::ostream&
overload of ostream inserter operator for cudaTask
auto cuda_default_max_threads_per_block() -> size_t constexpr
queries the maximum threads allowed per block
auto cuda_default_threads_per_block(size_t N) -> size_t constexpr
queries the default number of threads per block in a 1D vector of N elements
auto version() -> const char* constexpr
queries the version information in a string format major.minor.patch

Variables

std::array<TaskType, 7> TASK_TYPES constexpr
array of all task types (used for iterating task types)
template<typename C>
bool is_static_task_v constexpr
determines if a callable is a static task
template<typename C>
bool is_dynamic_task_v constexpr
determines if a callable is a dynamic task
template<typename C>
bool is_condition_task_v constexpr
determines if a callable is a condition task
template<typename C>
bool is_cudaflow_task_v constexpr
determines if a callable is a cudaflow task

Function documentation

template<typename T>
T* tf::cuda_malloc_device(size_t N, int d)

allocates memory on the given device for holding N elements of type T

The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the given device d and returns a pointer to the starting address of the device memory.

template<typename T>
T* tf::cuda_malloc_device(size_t N)

allocates memory on the current device associated with the caller

The function calls cuda_malloc_device from the current device associated with the caller.

template<typename T>
T* tf::cuda_malloc_shared(size_t N)

allocates shared memory for holding N elements of type T

The function calls cudaMallocManaged to allocate N*sizeof(T) bytes of memory and returns a pointer to the starting address of the shared memory.

template<typename T>
void tf::cuda_free(T* ptr, int d)

frees memory on the GPU device

Template parameters
T pointer type
Parameters
ptr device pointer to memory to free
d device context identifier

This method calls cudaFree to free the memory space pointed to by ptr using the given device context.

template<typename T>
void tf::cuda_free(T* ptr)

frees memory on the GPU device

Template parameters
T pointer type
Parameters
ptr device pointer to memory to free

This method calls cudaFree to free the memory space pointed to by ptr using the current device context of the caller.

void tf::cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)

copies data between host and device asynchronously through a stream

Parameters
stream stream identifier
dst destination memory address
src source memory address
count size in bytes to copy

The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas may not overlap.

void tf::cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)

initializes or sets GPU memory to the given value byte by byte

Parameters
stream stream identifier
devPtr pointer to GPU memory
value value to set for each byte of the specified memory
count size in bytes to set

The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value.

Variable documentation

template<typename C>
bool tf::is_static_task_v constexpr

determines if a callable is a static task

A static task is a callable object constructible from std::function<void()>.

template<typename C>
bool tf::is_dynamic_task_v constexpr

determines if a callable is a dynamic task

A dynamic task is a callable object constructible from std::function<void(Subflow&)>.

template<typename C>
bool tf::is_condition_task_v constexpr

determines if a callable is a condition task

A condition task is a callable object constructible from std::function<int()>.

template<typename C>
bool tf::is_cudaflow_task_v constexpr

determines if a callable is a cudaflow task

A cudaFlow task is a callable object constructible from std::function<void(tf::cudaFlow&)> or std::function<void(tf::cudaFlowCapturer&)>.