fml  0.1-0
Fused Matrix Library
dimops.hh
1 // This file is part of fml which is released under the Boost Software
2 // License, Version 1.0. See accompanying file LICENSE or copy at
3 // https://www.boost.org/LICENSE_1_0.txt
4 
5 #ifndef FML_GPU_DIMOPS_H
6 #define FML_GPU_DIMOPS_H
7 #pragma once
8 
9 
10 #include "../_internals/dimops.hh"
11 
12 #include "internals/kernelfuns.hh"
13 
14 #include "gpumat.hh"
15 #include "gpuvec.hh"
16 #include "linalg.hh"
17 
18 
19 namespace fml
20 {
22 namespace dimops
23 {
24  namespace internals
25  {
26  template <typename REAL>
27  __global__ void kernel_sweep_add(const len_t m, const len_t n,
28  REAL *x, const REAL *s, const REAL sgn)
29  {
30  int i = blockDim.x*blockIdx.x + threadIdx.x;
31  int j = blockDim.y*blockIdx.y + threadIdx.y;
32 
33  if (i < m && j < n)
34  x[i + m*j] += sgn * s[j];
35  }
36 
37  template <typename REAL>
38  static inline void sweep_add(gpumat<REAL> &x, const gpuvec<REAL> &s)
39  {
40  kernel_sweep_add<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
41  x.ncols(), x.data_ptr(), s.data_ptr(), (REAL) 1.0);
42  }
43 
44  template <typename REAL>
45  static inline void sweep_sub(gpumat<REAL> &x, const gpuvec<REAL> &s)
46  {
47  kernel_sweep_add<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
48  x.ncols(), x.data_ptr(), s.data_ptr(), (REAL) -1.0);
49  }
50 
51  template <typename REAL>
52  __global__ void kernel_sweep_mul(const len_t m, const len_t n,
53  REAL *x, const REAL *s)
54  {
55  int i = blockDim.x*blockIdx.x + threadIdx.x;
56  int j = blockDim.y*blockIdx.y + threadIdx.y;
57 
58  if (i < m && j < n)
59  x[i + m*j] *= s[j];
60  }
61 
62  template <typename REAL>
63  static inline void sweep_mul(gpumat<REAL> &x, const gpuvec<REAL> &s)
64  {
65  kernel_sweep_mul<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
66  x.ncols(), x.data_ptr(), s.data_ptr());
67  }
68 
69  template <typename REAL>
70  __global__ void kernel_sweep_div(const len_t m, const len_t n,
71  REAL *x, const REAL *s)
72  {
73  int i = blockDim.x*blockIdx.x + threadIdx.x;
74  int j = blockDim.y*blockIdx.y + threadIdx.y;
75 
76  if (i < m && j < n)
77  x[i + m*j] /= s[j];
78  }
79 
80  template <typename REAL>
81  static inline void sweep_div(gpumat<REAL> &x, const gpuvec<REAL> &s)
82  {
83  kernel_sweep_div<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
84  x.ncols(), x.data_ptr(), s.data_ptr());
85  }
86 
87 
88 
89  template <typename REAL>
90  __global__ void kernel_colsdevs(const len_t m, const len_t n,
91  const REAL *x, const REAL *means, REAL *sdevs)
92  {
93  int i = blockDim.x*blockIdx.x + threadIdx.x;
94  int j = blockDim.y*blockIdx.y + threadIdx.y;
95 
96  if (i < m && j < n)
97  {
98  REAL tmp = (x[i + m*j] - means[j]) * (x[i + m*j] - means[j]) / (m-1);
99  atomicAdd(sdevs+j, tmp);
100  }
101  }
102 
103  template <typename REAL>
104  static inline void colsdevs(const gpumat<REAL> &x,
105  const gpuvec<REAL> &means, gpuvec<REAL> &sdevs)
106  {
107  sdevs.fill_zero();
108  kernel_colsdevs<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
109  x.ncols(), x.data_ptr(), means.data_ptr(), sdevs.data_ptr());
110  fml::kernelfuns::kernel_root_abs<<<sdevs.get_griddim(),
111  sdevs.get_blockdim()>>>(sdevs.size(), sdevs.data_ptr());
112  }
113  }
114 
115 
116 
133  template <typename REAL>
134  void rowsums(const gpumat<REAL> &x, gpuvec<REAL> &s)
135  {
136  linalg::err::check_card(x, s);
137  const len_t m = x.nrows();
138  const len_t n = x.ncols();
139 
140  s.resize(m);
141 
142  gpuvec<REAL> ones(s.get_card(), n);
143  ones.fill_val(1);
144 
145  linalg::matmult(false, false, (REAL) 1.0, x, ones, s);
146  }
147 
148 
149 
166  template <typename REAL>
167  void rowmeans(const gpumat<REAL> &x, gpuvec<REAL> &s)
168  {
169  rowsums(x, s);
170  s.scale((REAL) 1.0/x.ncols());
171  }
172 
173 
174 
191  template <typename REAL>
192  void colsums(const gpumat<REAL> &x, gpuvec<REAL> &s)
193  {
194  linalg::err::check_card(x, s);
195  const len_t m = x.nrows();
196  const len_t n = x.ncols();
197 
198  s.resize(n);
199 
200  gpuvec<REAL> ones(s.get_card(), m);
201  ones.fill_val(1);
202 
203  linalg::matmult(true, false, (REAL) 1.0, ones, x, s);
204  }
205 
206 
207 
224  template <typename REAL>
225  void colmeans(const gpumat<REAL> &x, gpuvec<REAL> &s)
226  {
227  colsums(x, s);
228  s.scale((REAL) 1.0/x.nrows());
229  }
230 
231 
232 
233  namespace internals
234  {
235  template <typename REAL>
236  __global__ void kernel_rowsweep(const len_t m, const len_t n,
237  REAL *x, const REAL *s, const sweep_op op)
238  {
239  int i = blockDim.x*blockIdx.x + threadIdx.x;
240  int j = blockDim.y*blockIdx.y + threadIdx.y;
241 
242  if (i < m && j < n)
243  {
244  if (op == dimops::SWEEP_ADD)
245  x[i + m*j] += s[i];
246  else if (op == dimops::SWEEP_SUB)
247  x[i + m*j] -= s[i];
248  else if (op == dimops::SWEEP_MUL)
249  x[i + m*j] *= s[i];
250  else if (op == dimops::SWEEP_DIV)
251  x[i + m*j] /= s[i];
252  }
253  }
254  }
255 
267  template <typename REAL>
268  static inline void rowsweep(gpumat<REAL> &x, const gpuvec<REAL> &s,
269  const sweep_op op)
270  {
271  if (s.size() != x.nrows())
272  throw std::runtime_error("non-conformal arguments");
273 
274  internals::kernel_rowsweep<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
275  x.ncols(), x.data_ptr(), s.data_ptr(), op);
276  }
277 
278 
279 
280  namespace internals
281  {
282  template <typename REAL>
283  __global__ void kernel_colsweep(const len_t m, const len_t n,
284  REAL *x, const REAL *s, const sweep_op op)
285  {
286  int i = blockDim.x*blockIdx.x + threadIdx.x;
287  int j = blockDim.y*blockIdx.y + threadIdx.y;
288 
289  if (i < m && j < n)
290  {
291  if (op == dimops::SWEEP_ADD)
292  x[i + m*j] += s[j];
293  else if (op == dimops::SWEEP_SUB)
294  x[i + m*j] -= s[j];
295  else if (op == dimops::SWEEP_MUL)
296  x[i + m*j] *= s[j];
297  else if (op == dimops::SWEEP_DIV)
298  x[i + m*j] /= s[j];
299  }
300  }
301  }
302 
314  template <typename REAL>
315  static inline void colsweep(gpumat<REAL> &x, const gpuvec<REAL> &s,
316  const sweep_op op)
317  {
318  if (s.size() != x.ncols())
319  throw std::runtime_error("non-conformal arguments");
320 
321  internals::kernel_colsweep<<<x.get_griddim(), x.get_blockdim()>>>(x.nrows(),
322  x.ncols(), x.data_ptr(), s.data_ptr(), op);
323  }
324 
325 
326 
341  template <typename REAL>
342  void scale(const bool rm_mean, const bool rm_sd, gpumat<REAL> &x)
343  {
344  const len_t n = x.ncols();
345 
346  if (rm_mean && rm_sd)
347  {
348  gpuvec<REAL> means(x.get_card(), n);
349  gpuvec<REAL> sdevs(x.get_card(), n);
350  colmeans(x, means);
351  internals::colsdevs(x, means, sdevs);
352 
353  internals::sweep_sub(x, means);
354  internals::sweep_div(x, sdevs);
355  }
356  else if (rm_mean)
357  {
358  gpuvec<REAL> means(x.get_card());
359  colmeans(x, means);
360 
361  internals::sweep_sub(x, means);
362  }
363  else if (rm_sd)
364  {
365  gpuvec<REAL> means(x.get_card());
366  gpuvec<REAL> sdevs(x.get_card(), n);
367  colmeans(x, means);
368  internals::colsdevs(x, means, sdevs);
369 
370  internals::sweep_div(x, sdevs);
371  }
372  }
373 }
374 }
375 
376 
377 #endif
fml::gpuvec::fill_val
void fill_val(const T v)
Set all values to input value.
Definition: gpuvec.hh:401
fml::gpuvec
Vector class for data held on a single GPU.
Definition: gpuvec.hh:32
fml::gpuvec::resize
void resize(len_t size)
Resize the internal object storage.
Definition: gpuvec.hh:225
fml::unimat::nrows
len_t nrows() const
Number of rows.
Definition: unimat.hh:36
fml::dimops::scale
void scale(const bool rm_mean, const bool rm_sd, cpumat< REAL > &x)
Remove the mean and/or the sd from a matrix.
Definition: dimops.hh:376
fml::unimat::ncols
len_t ncols() const
Number of columns.
Definition: unimat.hh:38
fml::gpuvec::scale
void scale(const T s)
Multiply all values by the input value.
Definition: gpuvec.hh:438
fml::dimops::SWEEP_ADD
@ SWEEP_ADD
Definition: dimops.hh:22
fml
Core namespace.
Definition: dimops.hh:10
fml::dimops::sweep_op
sweep_op
Definition: dimops.hh:14
fml::dimops::rowsums
void rowsums(const cpumat< REAL > &x, cpuvec< REAL > &s)
Compute the row sums.
Definition: dimops.hh:72
fml::gpumat
Matrix class for data held on a single GPU.
Definition: gpumat.hh:35
fml::dimops::colsums
void colsums(const cpumat< REAL > &x, cpuvec< REAL > &s)
Compute the column sums.
Definition: dimops.hh:125
fml::dimops::colmeans
void colmeans(const cpumat< REAL > &x, cpuvec< REAL > &s)
Compute the column means.
Definition: dimops.hh:151
fml::linalg::matmult
void matmult(const bool transx, const bool transy, const REAL alpha, const cpumat< REAL > &x, const cpumat< REAL > &y, cpumat< REAL > &ret)
Computes ret = alpha*op(x)*op(y) where op(A) is A or A^T.
Definition: matmult.hh:43
fml::dimops::rowmeans
void rowmeans(const cpumat< REAL > &x, cpuvec< REAL > &s)
Compute the row means.
Definition: dimops.hh:101