Cpp-Taskflow  2.4-master-branch
cuda_flow_builder.hpp
1 #pragma once
2 
3 #include "cuda_task.hpp"
4 
5 namespace tf {
6 
12 class cudaFlow {
13 
14  friend class Executor;
15 
16  public:
17 
23  cudaFlow(cudaGraph& graph);
24 
28  bool empty() const;
29 
40  cudaTask noop();
41 
56  template <typename F, typename... ArgsT>
57  cudaTask kernel(dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args);
58 
74  template <typename F, typename... ArgsT>
75  cudaTask kernel_on(int d, dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args);
76 
88  cudaTask memset(void* dst, int v, size_t count);
89 
105  template <
106  typename T,
107  std::enable_if_t<!std::is_same<T, void>::value, void>* = nullptr
108  >
109  cudaTask copy(T* tgt, const T* src, size_t num);
110 
116  void device(int device);
117 
121  int device() const;
122 
128  void stream(cudaStream_t stream);
129 
133  cudaStream_t stream() const;
134 
135  private:
136 
137  cudaGraph& _graph;
138 
139  int _device {0};
140 
141  cudaStream_t _stream {nullptr};
142 };
143 
144 // Constructor
145 inline cudaFlow::cudaFlow(cudaGraph& g) : _graph {g} {
146 }
147 
148 // Function: empty
149 inline bool cudaFlow::empty() const {
150  return _graph._nodes.empty();
151 }
152 
153 // Procedure: device
154 inline void cudaFlow::device(int d) {
155  _device = d;
156 }
157 
158 // Function: device
159 inline int cudaFlow::device() const {
160  return _device;
161 }
162 
163 // Procedure: stream
164 inline void cudaFlow::stream(cudaStream_t s) {
165  _stream = s;
166 }
167 
168 // Function: stream
169 inline cudaStream_t cudaFlow::stream() const {
170  return _stream;
171 }
172 
173 // Function: noop
175  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Noop>{},
176  [](cudaGraph_t& graph, cudaGraphNode_t& node){
177  TF_CHECK_CUDA(
178  ::cudaGraphAddEmptyNode(&node, graph, nullptr, 0),
179  "failed to create a no-operation (empty) node"
180  );
181  }
182  );
183  return cudaTask(node);
184 }
185 
186 // Function: kernel
187 template <typename F, typename... ArgsT>
189  dim3 grid, dim3 block, size_t shm, F&& func, ArgsT&&... args
190 ) {
191 
192  using traits = function_traits<F>;
193 
194  static_assert(traits::arity == sizeof...(ArgsT), "arity mismatches");
195 
196  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Kernel>{},
197  [=] (cudaGraph_t& graph, cudaGraphNode_t& node) {
198 
199  cudaKernelNodeParams p;
200  void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
201  p.func = (void*)func;
202  p.gridDim = grid;
203  p.blockDim = block;
204  p.sharedMemBytes = shm;
205  p.kernelParams = arguments;
206  p.extra = nullptr;
207 
208  TF_CHECK_CUDA(
209  ::cudaGraphAddKernelNode(&node, graph, nullptr, 0, &p),
210  "failed to create a cudaKernel node"
211  );
212  }
213  );
214 
215  return cudaTask(node);
216 }
217 
218 // Function: kernel
219 template <typename F, typename... ArgsT>
221  int dev, dim3 grid, dim3 block, size_t shm, F&& func, ArgsT&&... args
222 ) {
223 
224  using traits = function_traits<F>;
225 
226  static_assert(traits::arity == sizeof...(ArgsT), "arity mismatches");
227 
228  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Kernel>{},
229  [=] (cudaGraph_t& graph, cudaGraphNode_t& node) {
230 
231  cudaKernelNodeParams p;
232  void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
233  p.func = (void*)func;
234  p.gridDim = grid;
235  p.blockDim = block;
236  p.sharedMemBytes = shm;
237  p.kernelParams = arguments;
238  p.extra = nullptr;
239 
240  cudaScopedDevice ctx(dev);
241  TF_CHECK_CUDA(
242  ::cudaGraphAddKernelNode(&node, graph, nullptr, 0, &p),
243  "failed to create a cudaKernel node"
244  );
245  }
246  );
247 
248  return cudaTask(node);
249 }
250 
251 // Function: memset
252 inline cudaTask cudaFlow::memset(void* dst, int ch, size_t count) {
253 
254  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Memset>{},
255  [=] (cudaGraph_t& graph, cudaGraphNode_t& node) {
256  cudaMemsetParams p;
257  p.dst = dst;
258  p.value = ch;
259  p.pitch = 0;
260  //p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1;
261  //p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count;
262  p.elementSize = 1; // either 1, 2, or 4
263  p.width = count;
264 
265  p.height = 1;
266  TF_CHECK_CUDA(
267  cudaGraphAddMemsetNode(&node, graph, nullptr, 0, &p),
268  "failed to create a cudaMemset node"
269  );
270  }
271  );
272 
273  return cudaTask(node);
274 }
275 
276 
277 // Function: copy
278 template <
279  typename T,
280  std::enable_if_t<!std::is_same<T, void>::value, void>*
281 >
282 cudaTask cudaFlow::copy(T* tgt, const T* src, size_t num) {
283 
284  using U = std::decay_t<T>;
285 
286  auto node = _graph.emplace_back(nstd::in_place_type_t<cudaNode::Copy>{},
287  [=] (cudaGraph_t& graph, cudaGraphNode_t& node) {
288 
289  cudaMemcpy3DParms p;
290  p.srcArray = nullptr;
291  p.srcPos = ::make_cudaPos(0, 0, 0);
292  p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*sizeof(U), num, 1);
293  p.dstArray = nullptr;
294  p.dstPos = ::make_cudaPos(0, 0, 0);
295  p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1);
296  p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1);
297  p.kind = cudaMemcpyDefault;
298 
299  TF_CHECK_CUDA(
300  cudaGraphAddMemcpyNode(&node, graph, nullptr, 0, &p),
301  "failed to create a cudaCopy node"
302  );
303  }
304  );
305 
306  return cudaTask(node);
307 }
308 
309 } // end of namespace tf -----------------------------------------------------
cudaFlow(cudaGraph &graph)
constructs a cudaFlow builder object
Definition: cuda_flow_builder.hpp:145
cudaTask copy(T *tgt, const T *src, size_t num)
creates a 1D copy task
Definition: cuda_flow_builder.hpp:282
Definition: error.hpp:9
cudaTask memset(void *dst, int v, size_t count)
creates a memset node
Definition: cuda_flow_builder.hpp:252
bool empty() const
queries the emptiness of the graph
Definition: cuda_flow_builder.hpp:149
Building methods for a cuda task dependency graph.
Definition: cuda_flow_builder.hpp:12
cudaTask noop()
creates a no-operation task
Definition: cuda_flow_builder.hpp:174
cudaTask kernel(dim3 g, dim3 b, size_t s, F &&f, ArgsT &&... args)
creates a kernel task
Definition: cuda_flow_builder.hpp:188
cudaTask kernel_on(int d, dim3 g, dim3 b, size_t s, F &&f, ArgsT &&... args)
creates a kernel task on a device
Definition: cuda_flow_builder.hpp:220
cudaStream_t stream() const
queries the stream associated with the cudaFlow
Definition: cuda_flow_builder.hpp:169
int device() const
queries the device associated with the cudaFlow
Definition: cuda_flow_builder.hpp:159
handle to a node in a cudaGraph
Definition: cuda_task.hpp:12
execution interface for running a taskflow graph
Definition: executor.hpp:32