3 #include "cuda_task.hpp" 56 template <
typename F,
typename... ArgsT>
74 template <
typename F,
typename... ArgsT>
107 std::enable_if_t<!std::is_same<T, void>::value,
void>* =
nullptr 133 cudaStream_t
stream()
const;
141 cudaStream_t _stream {
nullptr};
150 return _graph._nodes.empty();
175 auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Noop>{},
176 [](cudaGraph_t& graph, cudaGraphNode_t& node){
178 ::cudaGraphAddEmptyNode(&node, graph,
nullptr, 0),
179 "failed to create a no-operation (empty) node" 187 template <
typename F,
typename... ArgsT>
189 dim3 grid, dim3 block,
size_t shm, F&& func, ArgsT&&... args
192 using traits = function_traits<F>;
194 static_assert(traits::arity ==
sizeof...(ArgsT),
"arity mismatches");
196 auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Kernel>{},
197 [=] (cudaGraph_t& graph, cudaGraphNode_t& node) {
199 cudaKernelNodeParams p;
200 void* arguments[
sizeof...(ArgsT)] = { (
void*)(&args)... };
201 p.func = (
void*)func;
204 p.sharedMemBytes = shm;
205 p.kernelParams = arguments;
209 ::cudaGraphAddKernelNode(&node, graph,
nullptr, 0, &p),
210 "failed to create a cudaKernel node" 219 template <
typename F,
typename... ArgsT>
221 int dev, dim3 grid, dim3 block,
size_t shm, F&& func, ArgsT&&... args
224 using traits = function_traits<F>;
226 static_assert(traits::arity ==
sizeof...(ArgsT),
"arity mismatches");
228 auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Kernel>{},
229 [=] (cudaGraph_t& graph, cudaGraphNode_t& node) {
231 cudaKernelNodeParams p;
232 void* arguments[
sizeof...(ArgsT)] = { (
void*)(&args)... };
233 p.func = (
void*)func;
236 p.sharedMemBytes = shm;
237 p.kernelParams = arguments;
240 cudaScopedDevice ctx(dev);
242 ::cudaGraphAddKernelNode(&node, graph,
nullptr, 0, &p),
243 "failed to create a cudaKernel node" 254 auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Memset>{},
255 [=] (cudaGraph_t& graph, cudaGraphNode_t& node) {
267 cudaGraphAddMemsetNode(&node, graph,
nullptr, 0, &p),
268 "failed to create a cudaMemset node" 280 std::enable_if_t<!std::is_same<T, void>::value,
void>*
284 using U = std::decay_t<T>;
286 auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Copy>{},
287 [=] (cudaGraph_t& graph, cudaGraphNode_t& node) {
290 p.srcArray =
nullptr;
291 p.srcPos = ::make_cudaPos(0, 0, 0);
292 p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*
sizeof(U), num, 1);
293 p.dstArray =
nullptr;
294 p.dstPos = ::make_cudaPos(0, 0, 0);
295 p.dstPtr = ::make_cudaPitchedPtr(tgt, num*
sizeof(U), num, 1);
296 p.extent = ::make_cudaExtent(num*
sizeof(U), 1, 1);
297 p.kind = cudaMemcpyDefault;
300 cudaGraphAddMemcpyNode(&node, graph,
nullptr, 0, &p),
301 "failed to create a cudaCopy node" cudaFlow(cudaGraph &graph)
constructs a cudaFlow builder object
Definition: cuda_flow_builder.hpp:145
cudaTask copy(T *tgt, const T *src, size_t num)
creates a 1D copy task
Definition: cuda_flow_builder.hpp:282
cudaTask memset(void *dst, int v, size_t count)
creates a memset node
Definition: cuda_flow_builder.hpp:252
bool empty() const
queries the emptiness of the graph
Definition: cuda_flow_builder.hpp:149
Building methods for a cuda task dependency graph.
Definition: cuda_flow_builder.hpp:12
cudaTask noop()
creates a no-operation task
Definition: cuda_flow_builder.hpp:174
cudaTask kernel(dim3 g, dim3 b, size_t s, F &&f, ArgsT &&... args)
creates a kernel task
Definition: cuda_flow_builder.hpp:188
cudaTask kernel_on(int d, dim3 g, dim3 b, size_t s, F &&f, ArgsT &&... args)
creates a kernel task on a device
Definition: cuda_flow_builder.hpp:220
cudaStream_t stream() const
queries the stream associated with the cudaFlow
Definition: cuda_flow_builder.hpp:169
int device() const
queries the device associated with the cudaFlow
Definition: cuda_flow_builder.hpp:159
handle to a node in a cudaGraph
Definition: cuda_task.hpp:12
execution interface for running a taskflow graph
Definition: executor.hpp:32