5 #ifndef FML_GPU_GPUHELPERS_H
6 #define FML_GPU_GPUHELPERS_H
12 #include "../_internals/arraytools/src/arraytools.hpp"
14 #include "../cpu/cpumat.hh"
15 #include "../cpu/cpuvec.hh"
17 #include "internals/kernelfuns.hh"
// Creates a `card` object and returns it wrapped in a std::shared_ptr.
//
// @param id  Value forwarded unchanged to the `card` constructor; defaults to 0.
//            NOTE(review): presumably the GPU device ordinal -- confirm against `card`.
// @return    The std::shared_ptr<card> produced by std::make_shared.
36 inline std::shared_ptr<card>
new_card(
int id=0)
38 return std::make_shared<card>(
id);
// Upper bound, counted in elements (not bytes), on the staging-buffer length
// used by the mixed-type host/device copies in this file: they compute
// std::min(m*n, CPLEN) and move the data in pieces of at most that many elements.
45 static const size_t CPLEN = 1024;
// Copies m*n elements from `in` to `out`, either as one raw byte transfer
// through the card or by launching the kernel_copy kernel.
//
// @tparam REAL_IN   Element type read from `in`.
// @tparam REAL_OUT  Element type written to `out`.
// @param m, n       Dimensions; m*n is the number of elements handled.
// @param c          Card whose mem_gpu2gpu() performs the byte transfer.
// @param griddim    Grid dimensions used for the kernel_copy launch.
// @param blockdim   Block dimensions used for the kernel_copy launch.
// @param in         Source data.
// @param out        Destination data.
//                   NOTE(review): presumably `in` and `out` are both device
//                   pointers on card `c` -- confirm with callers.
47 template <
typename REAL_IN,
typename REAL_OUT>
48 void copy_gpu2gpu(
const len_t m,
const len_t n, std::shared_ptr<card> c, dim3 griddim, dim3 blockdim,
const REAL_IN *in, REAL_OUT *out)
// std::is_same<>::value is a compile-time constant, but it is tested with a
// plain `if`, so both code paths are instantiated for every type pair.
50 if (std::is_same<REAL_IN, REAL_OUT>::value)
// Identical element types: no conversion is needed, so the transfer size is
// expressed in bytes. The (size_t) cast binds to `m` first, so the whole
// product is evaluated in size_t.
52 const size_t len = (size_t) m*n*
sizeof(REAL_IN);
// Arguments are passed as (out, in, byte count).
// NOTE(review): presumably (destination, source, bytes) -- confirm against `card`.
53 c->mem_gpu2gpu((
void*)out, (
void*)in, len);
// NOTE(review): the braces/`else` separating this launch from the byte-copy
// path are not visible in this excerpt; presumably this is the fallback for
// REAL_IN != REAL_OUT, with kernel_copy converting element by element -- confirm.
// No error check follows the launch in the visible lines.
56 fml::kernelfuns::kernel_copy<<<griddim, blockdim>>>(m, n, in, out);
// Copies m*n elements from `in` to `out` using the card's mem_gpu2cpu().
// With identical element types the data goes in a single byte transfer;
// otherwise it is moved in pieces through the staging pointer `tmp_d` and
// handed to arraytools::copy() to fill `out`.
//
// @tparam REAL_IN   Element type read from `in`.
// @tparam REAL_OUT  Element type written to `out`.
// @param m, n       Dimensions; m*n is the number of elements handled.
// @param c          Card whose mem_gpu2cpu() performs each transfer.
// @param in         Source data.
//                   NOTE(review): presumably a device pointer on `c` -- confirm.
// @param out        Destination data.
//                   NOTE(review): presumably a host pointer -- confirm.
59 template <
typename REAL_IN,
typename REAL_OUT>
60 void copy_gpu2cpu(
const len_t m,
const len_t n, std::shared_ptr<card> c,
const REAL_IN *in, REAL_OUT *out)
// Plain runtime `if` on a compile-time constant: both paths are instantiated
// for every type pair.
62 if (std::is_same<REAL_IN, REAL_OUT>::value)
// Identical element types: transfer size in bytes. The (size_t) cast binds to
// `m` first, so the whole product is evaluated in size_t.
64 const size_t len = (size_t) m*n*
sizeof(REAL_IN);
65 c->mem_gpu2cpu((
void*)out, (
void*)in, len);
// Mixed element types (NOTE(review): the `else` introducing this path is not
// visible in this excerpt -- confirm). `top` is the total element count and
// `tmplen` the per-piece element count, capped at CPLEN.
69 size_t top = (size_t) m*n;
70 size_t tmplen = std::min(top, CPLEN);
// NOTE(review): the declaration of `tmp` is not visible in this excerpt;
// presumably a host-side buffer holding tmplen REAL_IN elements -- confirm.
72 REAL_IN *tmp_d = tmp.data_ptr();
// Walk the data in steps of `tmplen` elements. When top == 0, tmplen is also 0,
// but the condition i<top is false on entry, so the body never runs.
74 for (
size_t i=0; i<top; i+=tmplen)
// The final piece may be shorter than tmplen; never transfer past `top`.
76 const size_t rem = top - i;
77 const size_t copylen = std::min(tmplen, rem);
// Fetch `copylen` REAL_IN elements starting at in + i into the staging pointer.
78 c->mem_gpu2cpu((
void*)tmp_d, (
void*)(in + i), copylen*
sizeof(*in));
// Hand the staged elements to arraytools::copy with out + i as the target.
// NOTE(review): presumably this performs the REAL_IN -> REAL_OUT conversion -- confirm.
79 arraytools::copy(copylen, tmp_d, out + i);
// Copies m*n elements from `in` to `out` using the card's mem_cpu2gpu().
// With identical element types the data goes in a single byte transfer;
// otherwise each piece is first passed through arraytools::copy() into the
// staging pointer `tmp_d` and then transferred from there.
//
// @tparam REAL_IN   Element type read from `in`.
// @tparam REAL_OUT  Element type written to `out`.
// @param m, n       Dimensions; m*n is the number of elements handled.
// @param c          Card whose mem_cpu2gpu() performs each transfer.
// @param in         Source data.
//                   NOTE(review): presumably a host pointer -- confirm.
// @param out        Destination data.
//                   NOTE(review): presumably a device pointer on `c` -- confirm.
84 template <
typename REAL_IN,
typename REAL_OUT>
85 void copy_cpu2gpu(
const len_t m,
const len_t n, std::shared_ptr<card> c,
const REAL_IN *in, REAL_OUT *out)
// Plain runtime `if` on a compile-time constant: both paths are instantiated
// for every type pair.
87 if (std::is_same<REAL_IN, REAL_OUT>::value)
// Identical element types: transfer size in bytes. The (size_t) cast binds to
// `m` first, so the whole product is evaluated in size_t.
89 const size_t len = (size_t) m*n*
sizeof(REAL_IN);
90 c->mem_cpu2gpu((
void*)out, (
void*)in, len);
// Mixed element types (NOTE(review): the `else` introducing this path is not
// visible in this excerpt -- confirm). `top` is the total element count and
// `tmplen` the per-piece element count, capped at CPLEN.
94 size_t top = (size_t) m*n;
95 size_t tmplen = std::min(top, CPLEN);
// NOTE(review): the declaration of `tmp` is not visible in this excerpt;
// presumably a host-side buffer holding tmplen REAL_OUT elements -- confirm.
97 REAL_OUT *tmp_d = tmp.data_ptr();
// Walk the data in steps of `tmplen` elements. When top == 0, tmplen is also 0,
// but the condition i<top is false on entry, so the body never runs.
99 for (
size_t i=0; i<top; i+=tmplen)
// The final piece may be shorter than tmplen; never read past `top`.
101 const size_t rem = top - i;
102 const size_t copylen = std::min(tmplen, rem);
// Stage `copylen` elements starting at in + i through arraytools::copy.
// NOTE(review): presumably this performs the REAL_IN -> REAL_OUT conversion -- confirm.
103 arraytools::copy(copylen, in + i, tmp_d);
// Transfer the staged REAL_OUT elements to out + i; size is given in bytes.
104 c->mem_cpu2gpu((
void*)(out + i), (
void*)tmp_d, copylen*
sizeof(*out));
128 template <
typename REAL_IN,
typename REAL_OUT>
136 template <
typename REAL>
146 template <
typename REAL_IN,
typename REAL_OUT>
154 template <
typename REAL>
181 template <
typename REAL_IN,
typename REAL_OUT>
189 template <
typename REAL_IN,
typename REAL_OUT>
// NOTE(review): the signature line of this function template is not visible
// in this excerpt; the notes below describe only the statements shown, which
// operate on two objects named `gpu_in` and `gpu_out`.
217 template <
typename REAL_IN,
typename REAL_OUT>
// Refuse to copy unless the cards of both objects report the same id.
220 auto c = gpu_in.get_card();
221 if (c->get_id() != gpu_out.get_card()->get_id())
222 throw std::logic_error(
"input/output data must be on the same gpu");
// The data is passed as a size() x 1 array, using gpu_in's grid/block
// dimensions for the copy.
// NOTE(review): no resize of gpu_out is visible in this excerpt; verify that
// gpu_out holds at least gpu_in.size() elements before this call.
225 internals::copy_gpu2gpu(gpu_in.
size(), (len_t)1, c, gpu_in.get_griddim(), gpu_in.get_blockdim(), gpu_in.
data_ptr(), gpu_out.
data_ptr());
229 template <
typename REAL>
// NOTE(review): the signature line of this function template is not visible
// in this excerpt; the notes below describe only the statements shown, which
// operate on two objects named `gpu_in` and `gpu_out`.
239 template <
typename REAL_IN,
typename REAL_OUT>
// Refuse to copy unless the cards of both objects report the same id.
242 auto c = gpu_in.get_card();
243 if (c->get_id() != gpu_out.get_card()->get_id())
244 throw std::logic_error(
"input/output data must be on the same gpu");
// The data is passed as an nrows() x ncols() array, using gpu_in's grid/block
// dimensions for the copy.
// NOTE(review): no resize of gpu_out is visible in this excerpt; verify that
// gpu_out holds at least nrows()*ncols() elements before this call.
247 internals::copy_gpu2gpu(gpu_in.
nrows(), gpu_in.
ncols(), c, gpu_in.get_griddim(), gpu_in.get_blockdim(), gpu_in.
data_ptr(), gpu_out.
data_ptr());
251 template <
typename REAL>