fml  0.1-0
Fused Matrix Library
svd.hh
1 // This file is part of fml which is released under the Boost Software
2 // License, Version 1.0. See accompanying file LICENSE or copy at
3 // https://www.boost.org/LICENSE_1_0.txt
4 
5 #ifndef FML_GPU_LINALG_SVD_H
6 #define FML_GPU_LINALG_SVD_H
7 #pragma once
8 
9 
10 #include <stdexcept>
11 
12 #include "../../_internals/linalgutils.hh"
13 
14 #include "../arch/arch.hh"
15 
16 #include "../internals/gpu_utils.hh"
17 #include "../internals/gpuscalar.hh"
18 #include "../internals/kernelfuns.hh"
19 
20 #include "../copy.hh"
21 #include "../gpumat.hh"
22 #include "../gpuvec.hh"
23 
24 #include "crossprod.hh"
25 #include "eigen.hh"
26 #include "matmult.hh"
27 #include "qr.hh"
28 #include "xpose.hh"
29 
30 
31 namespace fml
32 {
33 namespace linalg
34 {
35  namespace
36  {
37  template <typename REAL>
38  int svd_internals(const int nu, const int nv, gpumat<REAL> &x, gpuvec<REAL> &s, gpumat<REAL> &u, gpumat<REAL> &vt)
39  {
40  auto c = x.get_card();
41 
42  const len_t m = x.nrows();
43  const len_t n = x.ncols();
44  const len_t minmn = std::min(m, n);
45 
46  s.resize(minmn);
47 
48  signed char jobu, jobvt;
49  if (nu == 0 && nv == 0)
50  {
51  jobu = 'N';
52  jobvt = 'N';
53  }
54  else //if (nu <= minmn && nv <= minmn)
55  {
56  jobu = 'S';
57  jobvt = 'S';
58 
59  u.resize(m, minmn);
60  vt.resize(minmn, n);
61  }
62 
63  int lwork;
64  gpulapack_status_t check = gpulapack::gesvd_buflen(c->lapack_handle(), m, n,
65  x.data_ptr(), &lwork);
66  gpulapack::err::check_ret(check, "gesvd_bufferSize");
67 
68  gpuvec<REAL> work(c, lwork);
69  gpuvec<REAL> rwork(c, minmn-1);
70 
71  int info = 0;
72  gpuscalar<int> info_device(c, info);
73 
74  check = gpulapack::gesvd(c->lapack_handle(), jobu, jobvt, m, n, x.data_ptr(),
75  m, s.data_ptr(), u.data_ptr(), m, vt.data_ptr(), minmn, work.data_ptr(),
76  lwork, rwork.data_ptr(), info_device.data_ptr());
77 
78  info_device.get_val(&info);
79  gpulapack::err::check_ret(check, "gesvd");
80 
81  return info;
82  }
83  }
84 
105  template <typename REAL>
107  {
108  err::check_card(x, s);
109 
110  gpumat<REAL> ignored(x.get_card());
111  int info = svd_internals(0, 0, x, s, ignored, ignored);
112  fml::linalgutils::check_info(info, "gesvd");
113  }
114 
116  template <typename REAL>
118  {
119  err::check_card(x, s, u, vt);
120 
121  if (x.nrows() >= x.ncols())
122  {
123  int info = svd_internals(1, 1, x, s, u, vt);
124  fml::linalgutils::check_info(info, "gesvd");
125  }
126  else
127  {
128  auto tx = xpose(x);
129  gpumat<REAL> v(x.get_card());
130  int info = svd_internals(1, 1, tx, s, v, u);
131  xpose(v, vt);
132  fml::linalgutils::check_info(info, "gesvd");
133  }
134  }
135 
136 
137 
138  namespace
139  {
140  template <typename REAL>
142  {
143  const len_t m = x.nrows();
144  const len_t n = x.ncols();
145  if (m <= n)
146  throw std::runtime_error("'x' must have more rows than cols");
147 
148  auto c = x.get_card();
149 
150  gpuvec<REAL> qraux(c);
151  gpuvec<REAL> work(c);
152  qr_internals(false, x, qraux, work);
153 
154  gpumat<REAL> R(c, n, n);
155  qr_R(x, R);
156 
157  gpumat<REAL> u_R(c, n, n);
158  int info = svd_internals(1, 1, R, s, u_R, vt);
159  fml::linalgutils::check_info(info, "gesvd");
160 
161  u.resize(m, n);
162  qr_Q(x, qraux, u, work);
163 
164  matmult(false, false, (REAL)1.0, u, u_R, x);
165  copy::gpu2gpu(x, u);
166  }
167 
168  template <typename REAL>
169  void tssvd(gpumat<REAL> &x, gpuvec<REAL> &s)
170  {
171  const len_t m = x.nrows();
172  const len_t n = x.ncols();
173  if (m <= n)
174  throw std::runtime_error("'x' must have more rows than cols");
175 
176  auto c = x.get_card();
177  s.resize(n);
178 
179  gpuvec<REAL> qraux(c);
180  gpuvec<REAL> work(c);
181  qr_internals(false, x, qraux, work);
182 
183  fml::gpu_utils::tri2zero('L', false, n, n, x.data_ptr(), m);
184 
185  int lwork;
186  gpulapack_status_t check = gpulapack::gesvd_buflen(c->lapack_handle(), n, n,
187  x.data_ptr(), &lwork);
188  gpulapack::err::check_ret(check, "gesvd_bufferSize");
189 
190  if (lwork > work.size())
191  work.resize(lwork);
192  if (m-1 > qraux.size())
193  qraux.resize(m-1);
194 
195  int info = 0;
196  gpuscalar<int> info_device(c, info);
197 
198  check = gpulapack::gesvd(c->lapack_handle(), 'N', 'N', n, n, x.data_ptr(),
199  m, s.data_ptr(), NULL, m, NULL, 1, work.data_ptr(), lwork,
200  qraux.data_ptr(), info_device.data_ptr());
201 
202  info_device.get_val(&info);
203  gpulapack::err::check_ret(check, "gesvd");
204  fml::linalgutils::check_info(info, "gesvd");
205  }
206 
207 
208 
209  template <typename REAL>
210  void sfsvd(gpumat<REAL> &x, gpuvec<REAL> &s, gpumat<REAL> &u, gpumat<REAL> &vt)
211  {
212  const len_t m = x.nrows();
213  const len_t n = x.ncols();
214  if (m >= n)
215  throw std::runtime_error("'x' must have more cols than rows");
216 
217  gpumat<REAL> tx = xpose(x);
218  gpumat<REAL> v(x.get_card());
219  tssvd(tx, s, v, u);
220  xpose(v, vt);
221  }
222 
223  template <typename REAL>
224  void sfsvd(gpumat<REAL> &x, gpuvec<REAL> &s)
225  {
226  const len_t m = x.nrows();
227  const len_t n = x.ncols();
228  if (m >= n)
229  throw std::runtime_error("'x' must have more cols than rows");
230 
231  auto tx = xpose(x);
232  tssvd(tx, s);
233  }
234  }
235 
269  template <typename REAL>
271  {
272  err::check_card(x, s, u, vt);
273 
274  if (x.is_square())
275  svd(x, s, u, vt);
276  else if (x.nrows() > x.ncols())
277  tssvd(x, s, u, vt);
278  else
279  sfsvd(x, s, u, vt);
280  }
281 
283  template <typename REAL>
285  {
286  err::check_card(x, s);
287 
288  if (x.is_square())
289  svd(x, s);
290  else if (x.nrows() > x.ncols())
291  tssvd(x, s);
292  else
293  sfsvd(x, s);
294  }
295 
296 
297 
298  namespace
299  {
300  template <typename REAL>
301  __global__ void kernel_sweep_cols_div(const len_t m, const len_t n, REAL *data, const REAL *v)
302  {
303  int i = blockDim.x*blockIdx.x + threadIdx.x;
304  int j = blockDim.y*blockIdx.y + threadIdx.y;
305 
306  if (i < m && j < n)
307  data[i + m*j] /= v[j];
308  }
309  }
310 
336  template <typename REAL>
338  {
339  err::check_card(x, s, u, vt);
340 
341  const len_t m = x.nrows();
342  const len_t n = x.ncols();
343  const len_t minmn = std::min(m, n);
344 
345  auto c = x.get_card();
346 
347  gpumat<REAL> cp(c);
348 
349  if (m >= n)
350  {
351  crossprod((REAL)1.0, x, cp);
352  eigen_sym(cp, s, vt);
353  vt.rev_cols();
354  copy::gpu2gpu(vt, cp);
355  }
356  else
357  {
358  tcrossprod((REAL)1.0, x, cp);
359  eigen_sym(cp, s, u);
360  u.rev_cols();
361  copy::gpu2gpu(u, cp);
362  }
363 
364  s.rev();
365  auto sgrid = s.get_griddim();
366  auto sblock = s.get_blockdim();
367  fml::kernelfuns::kernel_root_abs<<<sgrid, sblock>>>(s.size(), s.data_ptr());
368 
369  REAL *ev_d;
370  if (m >= n)
371  ev_d = vt.data_ptr();
372  else
373  ev_d = cp.data_ptr();
374 
375  auto xgrid = x.get_griddim();
376  auto xblock = x.get_blockdim();
377  kernel_sweep_cols_div<<<xgrid, xblock>>>(minmn, minmn, ev_d, s.data_ptr());
378 
379  if (m >= n)
380  {
381  matmult(false, false, (REAL)1.0, x, vt, u);
382  xpose(cp, vt);
383  }
384  else
385  matmult(true, false, (REAL)1.0, cp, x, vt);
386  }
387 
389  template <typename REAL>
390  void cpsvd(const gpumat<REAL> &x, gpuvec<REAL> &s)
391  {
392  err::check_card(x, s);
393 
394  const len_t m = x.nrows();
395  const len_t n = x.ncols();
396 
397  auto c = x.get_card();
398 
399  gpumat<REAL> cp(c);
400 
401  if (m >= n)
402  crossprod((REAL)1.0, x, cp);
403  else
404  tcrossprod((REAL)1.0, x, cp);
405 
406  eigen_sym(cp, s);
407 
408  s.rev();
409  fml::kernelfuns::kernel_root_abs<<<s.get_griddim(), s.get_blockdim()>>>(s.size(), s.data_ptr());
410  }
411 
412 
413 
414  namespace
415  {
416  template <typename REAL>
417  void rsvd_A(const uint32_t seed, const int k, const int q, gpumat<REAL> &x,
418  gpumat<REAL> &QY)
419  {
420  const len_t m = x.nrows();
421  const len_t n = x.ncols();
422 
423  gpumat<REAL> omega(x.get_card(), n, 2*k);
424  omega.fill_runif(seed);
425 
426  gpumat<REAL> Y(x.get_card(), m, 2*k);
427  gpumat<REAL> Z(x.get_card(), n, 2*k);
428  gpumat<REAL> QZ(x.get_card(), n, 2*k);
429 
430  gpuvec<REAL> qraux(x.get_card());
431  gpuvec<REAL> work(x.get_card());
432 
433  gpumat<REAL> B(x.get_card(), 2*k, n);
434 
435  // Stage A
436  matmult(false, false, (REAL)1.0, x, omega, Y);
437  qr_internals(false, Y, qraux, work);
438  qr_Q(Y, qraux, QY, work);
439 
440  for (int i=0; i<q; i++)
441  {
442  matmult(true, false, (REAL)1.0, x, QY, Z);
443  qr_internals(false, Z, qraux, work);
444  qr_Q(Z, qraux, QZ, work);
445 
446  matmult(false, false, (REAL)1.0, x, QZ, Y);
447  qr_internals(false, Y, qraux, work);
448  qr_Q(Y, qraux, QY, work);
449  }
450  }
451  }
452 
474  template <typename REAL>
475  void rsvd(const uint32_t seed, const int k, const int q, gpumat<REAL> &x,
476  gpuvec<REAL> &s)
477  {
478  const len_t m = x.nrows();
479  const len_t n = x.ncols();
480 
481  gpumat<REAL> QY(x.get_card(), m, 2*k);
482  gpumat<REAL> B(x.get_card(), 2*k, n);
483 
484  // Stage A
485  rsvd_A(seed, k, q, x, QY);
486 
487  // Stage B
488  matmult(true, false, (REAL)1.0, QY, x, B);
489 
490  cpumat<REAL> uB;
491  svd(B, s);
492 
493  s.resize(k);
494  }
495 
497  template <typename REAL>
498  void rsvd(const uint32_t seed, const int k, const int q, gpumat<REAL> &x,
500  {
501  const len_t m = x.nrows();
502  const len_t n = x.ncols();
503 
504  gpumat<REAL> QY(x.get_card(), m, 2*k);
505  gpumat<REAL> B(x.get_card(), 2*k, n);
506 
507  // Stage A
508  rsvd_A(seed, k, q, x, QY);
509 
510  // Stage B
511  matmult(true, false, (REAL)1.0, QY, x, B);
512 
513  gpumat<REAL> uB(x.get_card());
514  qrsvd(B, s, uB, vt);
515 
516  s.resize(k);
517 
518  matmult(false, false, (REAL)1.0, QY, uB, u);
519  u.resize(u.nrows(), k);
520  }
521 }
522 }
523 
524 
525 #endif
fml::cpumat
Matrix class for data held on a single CPU.
Definition: cpumat.hh:36
fml::linalg::crossprod
void crossprod(const REAL alpha, const cpumat< REAL > &x, cpumat< REAL > &ret)
Computes lower triangle of alpha*x^T*x.
Definition: crossprod.hh:37
fml::unimat::is_square
bool is_square() const
Is the matrix square?
Definition: unimat.hh:34
fml::linalg::xpose
void xpose(const cpumat< REAL > &x, cpumat< REAL > &tx)
Computes the transpose out-of-place (i.e. in a copy).
Definition: xpose.hh:37
fml::univec::data_ptr
T * data_ptr()
Pointer to the internal array.
Definition: univec.hh:28
fml::gpuvec
Vector class for data held on a single GPU.
Definition: gpuvec.hh:32
fml::gpuvec::resize
void resize(len_t size)
Resize the internal object storage.
Definition: gpuvec.hh:225
fml::unimat::nrows
len_t nrows() const
Number of rows.
Definition: unimat.hh:36
fml::linalg::rsvd
void rsvd(const uint32_t seed, const int k, const int q, cpumat< REAL > &x, cpuvec< REAL > &s)
Computes the truncated singular value decomposition using the normal projections method of Halko et a...
Definition: svd.hh:462
fml::copy::gpu2gpu
void gpu2gpu(const gpuvec< REAL_IN > &gpu_in, gpuvec< REAL_OUT > &gpu_out)
Copy data from a GPU object to another.
Definition: copy.hh:203
fml::unimat::ncols
len_t ncols() const
Number of columns.
Definition: unimat.hh:38
fml::linalg::qrsvd
void qrsvd(cpumat< REAL > &x, cpuvec< REAL > &s, cpumat< REAL > &u, cpumat< REAL > &vt)
Computes the singular value decomposition by first reducing the rectangular matrix to a square matrix...
Definition: svd.hh:273
fml::linalg::tssvd
void tssvd(parmat_cpu< REAL > &x, cpuvec< REAL > &s)
Computes the singular value decomposition by first reducing the rectangular matrix to a square matrix...
Definition: svd.hh:130
fml::linalg::svd
void svd(cpumat< REAL > &x, cpuvec< REAL > &s)
Computes the singular value decomposition.
Definition: svd.hh:101
fml::unimat::data_ptr
REAL * data_ptr()
Pointer to the internal array.
Definition: unimat.hh:40
fml::gpuvec::rev
void rev()
Reverse the vector.
Definition: gpuvec.hh:462
fml
Core namespace.
Definition: dimops.hh:10
fml::linalg::qr_Q
void qr_Q(const cpumat< REAL > &QR, const cpuvec< REAL > &qraux, cpumat< REAL > &Q, cpuvec< REAL > &work)
Recover the Q matrix from a QR decomposition.
Definition: qr.hh:120
fml::univec::size
len_t size() const
Number of elements in the vector.
Definition: univec.hh:26
fml::linalg::qr_R
void qr_R(const cpumat< REAL > &QR, cpumat< REAL > &R)
Recover the R matrix from a QR decomposition.
Definition: qr.hh:162
fml::gpumat::resize
void resize(len_t nrows, len_t ncols)
Resize the internal object storage.
Definition: gpumat.hh:256
fml::linalg::eigen_sym
void eigen_sym(cpumat< REAL > &x, cpuvec< REAL > &values)
Compute the eigenvalues and optionally the eigenvectors for a symmetric matrix.
Definition: eigen.hh:95
fml::gpumat
Matrix class for data held on a single GPU.
Definition: gpumat.hh:35
fml::linalg::tcrossprod
void tcrossprod(const REAL alpha, const cpumat< REAL > &x, cpumat< REAL > &ret)
Computes lower triangle of alpha*x*x^T.
Definition: crossprod.hh:81
fml::gpumat::rev_cols
void rev_cols()
Reverse the columns of the matrix.
Definition: gpumat.hh:616
fml::linalg::matmult
void matmult(const bool transx, const bool transy, const REAL alpha, const cpumat< REAL > &x, const cpumat< REAL > &y, cpumat< REAL > &ret)
Computes ret = alpha*op(x)*op(y) where op(A) is A or A^T.
Definition: matmult.hh:43
fml::linalg::cpsvd
void cpsvd(const cpumat< REAL > &x, cpuvec< REAL > &s, cpumat< REAL > &u, cpumat< REAL > &vt)
Computes the singular value decomposition using the "crossproducts SVD". This method is not numerical...
Definition: svd.hh:323