12 #include "../_internals/arraytools/src/arraytools.hpp"
14 #include "../cpu/cpumat.hh"
15 #include "../cpu/cpuvec.hh"
17 #include "internals/kernelfuns.hh"
// Cap on the number of elements held in the temporary cpuvec buffer per
// transfer; the type-converting copy paths below size their buffer as
// min(total element count, CPLEN) and move the data in pieces of that size.
30 static const size_t CPLEN = 1024;
/*
 * Copy an m-by-n array from `in` to `out` using card `c`.
 *
 * When REAL_IN and REAL_OUT are the same type, the m*n*sizeof(REAL_IN) bytes
 * are moved with one c->mem_gpu2gpu() call. For differing types,
 * fml::kernelfuns::kernel_copy is launched with the supplied grid/block
 * dimensions on (m, n, in, out) instead.
 */
template <typename REAL_IN, typename REAL_OUT>
void copy_gpu2gpu(const len_t m, const len_t n, std::shared_ptr<card> c,
  dim3 griddim, dim3 blockdim, const REAL_IN *in, REAL_OUT *out)
{
  // Differing element types: a raw byte copy would be wrong, so hand the
  // work to the copy kernel and finish.
  if (!std::is_same<REAL_IN, REAL_OUT>::value)
  {
    fml::kernelfuns::kernel_copy<<<griddim, blockdim>>>(m, n, in, out);
    return;
  }
  
  // Identical element types: one contiguous transfer of the whole array.
  const size_t nbytes = static_cast<size_t>(m) * n * sizeof(REAL_IN);
  void *dst = static_cast<void*>(out);
  void *src = static_cast<void*>(const_cast<REAL_IN*>(in));
  c->mem_gpu2gpu(dst, src, nbytes);
}
/*
 * Copy an m-by-n array from `in` to `out` via c->mem_gpu2cpu().
 *
 * When REAL_IN and REAL_OUT are the same type, all m*n*sizeof(REAL_IN) bytes
 * are transferred in a single call. Otherwise the data is pulled in pieces of
 * at most CPLEN elements into a temporary cpuvec<REAL_IN>, and each piece is
 * written to `out` with arraytools::copy().
 */
template <typename REAL_IN, typename REAL_OUT>
void copy_gpu2cpu(const len_t m, const len_t n, std::shared_ptr<card> c,
  const REAL_IN *in, REAL_OUT *out)
{
  REAL_IN *src = const_cast<REAL_IN*>(in);
  
  if (std::is_same<REAL_IN, REAL_OUT>::value)
  {
    const size_t nbytes = static_cast<size_t>(m) * n * sizeof(REAL_IN);
    c->mem_gpu2cpu(static_cast<void*>(out), static_cast<void*>(src), nbytes);
    return;
  }
  
  // Types differ: stage each piece in a REAL_IN buffer, then copy it
  // element-wise into the REAL_OUT destination.
  const size_t total = static_cast<size_t>(m) * n;
  const size_t chunk = std::min(total, CPLEN);
  cpuvec<REAL_IN> staging(chunk);
  REAL_IN *staging_d = staging.data_ptr();
  
  size_t offset = 0;
  while (offset < total)
  {
    // The final piece may be shorter than a full chunk.
    const size_t count = std::min(chunk, total - offset);
    
    c->mem_gpu2cpu(static_cast<void*>(staging_d),
      static_cast<void*>(src + offset), count * sizeof(*in));
    arraytools::copy(count, staging_d, out + offset);
    
    offset += chunk;
  }
}
/*
 * Copy an m-by-n array from `in` to `out` via c->mem_cpu2gpu().
 *
 * When REAL_IN and REAL_OUT are the same type, all m*n*sizeof(REAL_IN) bytes
 * are transferred in a single call. Otherwise the input is processed in
 * pieces of at most CPLEN elements: each piece is first written into a
 * temporary cpuvec<REAL_OUT> with arraytools::copy(), and that buffer is then
 * transferred to the matching offset of `out`.
 */
template <typename REAL_IN, typename REAL_OUT>
void copy_cpu2gpu(const len_t m, const len_t n, std::shared_ptr<card> c,
  const REAL_IN *in, REAL_OUT *out)
{
  if (std::is_same<REAL_IN, REAL_OUT>::value)
  {
    const size_t nbytes = static_cast<size_t>(m) * n * sizeof(REAL_IN);
    c->mem_cpu2gpu(static_cast<void*>(out),
      static_cast<void*>(const_cast<REAL_IN*>(in)), nbytes);
    return;
  }
  
  // Types differ: fill a REAL_OUT staging buffer from the input first, so the
  // bytes that get transferred already have the destination element type.
  const size_t total = static_cast<size_t>(m) * n;
  const size_t chunk = std::min(total, CPLEN);
  cpuvec<REAL_OUT> staging(chunk);
  REAL_OUT *staging_d = staging.data_ptr();
  
  size_t offset = 0;
  while (offset < total)
  {
    // The final piece may be shorter than a full chunk.
    const size_t count = std::min(chunk, total - offset);
    
    arraytools::copy(count, in + offset, staging_d);
    c->mem_cpu2gpu(static_cast<void*>(out + offset),
      static_cast<void*>(staging_d), count * sizeof(*out));
    
    offset += chunk;
  }
}
113 template <
typename REAL_IN,
typename REAL_OUT>
121 template <
typename REAL>
131 template <
typename REAL_IN,
typename REAL_OUT>
139 template <
typename REAL>
166 template <
typename REAL_IN,
typename REAL_OUT>
174 template <
typename REAL_IN,
typename REAL_OUT>
202 template <
typename REAL_IN,
typename REAL_OUT>
205 auto c = gpu_in.get_card();
// Refuse the copy unless both objects report the same card id; the copy
// below is issued through the input's card only.
206 if (c->get_id() != gpu_out.get_card()->get_id())
207 throw std::logic_error(
"input/output data must be on the same gpu");
// The data is forwarded as a size()-by-1 array, using the launch
// dimensions stored on gpu_in.
210 internals::copy_gpu2gpu(gpu_in.
size(), (len_t)1, c, gpu_in.get_griddim(), gpu_in.get_blockdim(), gpu_in.
data_ptr(), gpu_out.
data_ptr());
214 template <
typename REAL>
224 template <
typename REAL_IN,
typename REAL_OUT>
227 auto c = gpu_in.get_card();
// Refuse the copy unless both objects report the same card id; the copy
// below is issued through the input's card only.
228 if (c->get_id() != gpu_out.get_card()->get_id())
229 throw std::logic_error(
"input/output data must be on the same gpu");
// The data is forwarded as an nrows()-by-ncols() array, using the launch
// dimensions stored on gpu_in.
232 internals::copy_gpu2gpu(gpu_in.
nrows(), gpu_in.
ncols(), c, gpu_in.get_griddim(), gpu_in.get_blockdim(), gpu_in.
data_ptr(), gpu_out.
data_ptr());
236 template <
typename REAL>