5 #ifndef FML_GPU_DIMOPS_H
6 #define FML_GPU_DIMOPS_H
10 #include "../_internals/dimops.hh"
12 #include "internals/kernelfuns.hh"
26 template <
typename REAL>
27 __global__
void kernel_sweep_add(
const len_t m,
const len_t n,
28 REAL *x,
const REAL *s,
const REAL sgn)
30 int i = blockDim.x*blockIdx.x + threadIdx.x;
31 int j = blockDim.y*blockIdx.y + threadIdx.y;
34 x[i + m*j] += sgn * s[j];
37 template <
typename REAL>
38 static inline void sweep_add(gpumat<REAL> &x,
const gpuvec<REAL> &s)
40 kernel_sweep_add<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
41 x.ncols(), x.data_ptr(), s.data_ptr(), (REAL) 1.0);
44 template <
typename REAL>
45 static inline void sweep_sub(gpumat<REAL> &x,
const gpuvec<REAL> &s)
47 kernel_sweep_add<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
48 x.ncols(), x.data_ptr(), s.data_ptr(), (REAL) -1.0);
51 template <
typename REAL>
52 __global__
void kernel_sweep_mul(
const len_t m,
const len_t n,
53 REAL *x,
const REAL *s)
55 int i = blockDim.x*blockIdx.x + threadIdx.x;
56 int j = blockDim.y*blockIdx.y + threadIdx.y;
62 template <
typename REAL>
63 static inline void sweep_mul(gpumat<REAL> &x,
const gpuvec<REAL> &s)
65 kernel_sweep_mul<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
66 x.ncols(), x.data_ptr(), s.data_ptr());
69 template <
typename REAL>
70 __global__
void kernel_sweep_div(
const len_t m,
const len_t n,
71 REAL *x,
const REAL *s)
73 int i = blockDim.x*blockIdx.x + threadIdx.x;
74 int j = blockDim.y*blockIdx.y + threadIdx.y;
80 template <
typename REAL>
81 static inline void sweep_div(gpumat<REAL> &x,
const gpuvec<REAL> &s)
83 kernel_sweep_div<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
84 x.ncols(), x.data_ptr(), s.data_ptr());
89 template <
typename REAL>
90 __global__
void kernel_colsdevs(
const len_t m,
const len_t n,
91 const REAL *x,
const REAL *means, REAL *sdevs)
93 int i = blockDim.x*blockIdx.x + threadIdx.x;
94 int j = blockDim.y*blockIdx.y + threadIdx.y;
98 REAL tmp = (x[i + m*j] - means[j]) * (x[i + m*j] - means[j]) / (m-1);
99 atomicAdd(sdevs+j, tmp);
103 template <
typename REAL>
104 static inline void colsdevs(
const gpumat<REAL> &x,
105 const gpuvec<REAL> &means, gpuvec<REAL> &sdevs)
108 kernel_colsdevs<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
109 x.ncols(), x.data_ptr(), means.data_ptr(), sdevs.data_ptr());
110 fml::kernelfuns::kernel_root_abs<<<sdevs.get_griddim(),
111 sdevs.get_blockdim()>>>(sdevs.size(), sdevs.data_ptr());
133 template <
typename REAL>
136 linalg::err::check_card(x, s);
137 const len_t m = x.
nrows();
138 const len_t n = x.
ncols();
166 template <
typename REAL>
191 template <
typename REAL>
194 linalg::err::check_card(x, s);
195 const len_t m = x.
nrows();
196 const len_t n = x.
ncols();
224 template <
typename REAL>
235 template <
typename REAL>
236 __global__
void kernel_rowsweep(
const len_t m,
const len_t n,
237 REAL *x,
const REAL *s,
const sweep_op op)
239 int i = blockDim.x*blockIdx.x + threadIdx.x;
240 int j = blockDim.y*blockIdx.y + threadIdx.y;
246 else if (op == dimops::SWEEP_SUB)
248 else if (op == dimops::SWEEP_MUL)
250 else if (op == dimops::SWEEP_DIV)
267 template <
typename REAL>
268 static inline void rowsweep(gpumat<REAL> &x,
const gpuvec<REAL> &s,
271 if (s.size() != x.nrows())
272 throw std::runtime_error(
"non-conformal arguments");
274 internals::kernel_rowsweep<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
275 x.ncols(), x.data_ptr(), s.data_ptr(), op);
282 template <
typename REAL>
283 __global__
void kernel_colsweep(
const len_t m,
const len_t n,
284 REAL *x,
const REAL *s,
const sweep_op op)
286 int i = blockDim.x*blockIdx.x + threadIdx.x;
287 int j = blockDim.y*blockIdx.y + threadIdx.y;
293 else if (op == dimops::SWEEP_SUB)
295 else if (op == dimops::SWEEP_MUL)
297 else if (op == dimops::SWEEP_DIV)
314 template <
typename REAL>
315 static inline void colsweep(gpumat<REAL> &x,
const gpuvec<REAL> &s,
318 if (s.size() != x.ncols())
319 throw std::runtime_error(
"non-conformal arguments");
321 internals::kernel_colsweep<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
322 x.ncols(), x.data_ptr(), s.data_ptr(), op);
341 template <
typename REAL>
344 const len_t n = x.
ncols();
346 if (rm_mean && rm_sd)
351 internals::colsdevs(x, means, sdevs);
353 internals::sweep_sub(x, means);
354 internals::sweep_div(x, sdevs);
361 internals::sweep_sub(x, means);
368 internals::colsdevs(x, means, sdevs);
370 internals::sweep_div(x, sdevs);