fml  0.1-0
Fused Matrix Library
linalg_svd.hh
1 // This file is part of fml which is released under the Boost Software
2 // License, Version 1.0. See accompanying file LICENSE or copy at
3 // https://www.boost.org/LICENSE_1_0.txt
4 
5 #ifndef FML_GPU_LINALG_LINALG_SVD_H
6 #define FML_GPU_LINALG_LINALG_SVD_H
7 #pragma once
8 
9 
10 #include <stdexcept>
11 
12 #include "../../_internals/linalgutils.hh"
13 
14 #include "../arch/arch.hh"
15 
16 #include "../internals/gpu_utils.hh"
17 #include "../internals/gpuscalar.hh"
18 #include "../internals/kernelfuns.hh"
19 
20 #include "../copy.hh"
21 #include "../gpumat.hh"
22 #include "../gpuvec.hh"
23 
24 #include "linalg_blas.hh"
25 #include "linalg_eigen.hh"
26 #include "linalg_qr.hh"
27 
28 
29 namespace fml
30 {
31 namespace linalg
32 {
33  namespace
34  {
35  template <typename REAL>
36  int svd_internals(const int nu, const int nv, gpumat<REAL> &x, gpuvec<REAL> &s, gpumat<REAL> &u, gpumat<REAL> &vt)
37  {
38  auto c = x.get_card();
39 
40  const len_t m = x.nrows();
41  const len_t n = x.ncols();
42  const len_t minmn = std::min(m, n);
43 
44  s.resize(minmn);
45 
46  signed char jobu, jobvt;
47  if (nu == 0 && nv == 0)
48  {
49  jobu = 'N';
50  jobvt = 'N';
51  }
52  else //if (nu <= minmn && nv <= minmn)
53  {
54  jobu = 'S';
55  jobvt = 'S';
56 
57  u.resize(m, minmn);
58  vt.resize(minmn, n);
59  }
60 
61  int lwork;
62  gpulapack_status_t check = gpulapack::gesvd_buflen(c->lapack_handle(), m, n,
63  x.data_ptr(), &lwork);
64  gpulapack::err::check_ret(check, "gesvd_bufferSize");
65 
66  gpuvec<REAL> work(c, lwork);
67  gpuvec<REAL> rwork(c, minmn-1);
68 
69  int info = 0;
70  gpuscalar<int> info_device(c, info);
71 
72  check = gpulapack::gesvd(c->lapack_handle(), jobu, jobvt, m, n, x.data_ptr(),
73  m, s.data_ptr(), u.data_ptr(), m, vt.data_ptr(), minmn, work.data_ptr(),
74  lwork, rwork.data_ptr(), info_device.data_ptr());
75 
76  info_device.get_val(&info);
77  gpulapack::err::check_ret(check, "gesvd");
78 
79  return info;
80  }
81  }
82 
103  template <typename REAL>
105  {
106  err::check_card(x, s);
107 
108  gpumat<REAL> ignored(x.get_card());
109  int info = svd_internals(0, 0, x, s, ignored, ignored);
110  fml::linalgutils::check_info(info, "gesvd");
111  }
112 
114  template <typename REAL>
116  {
117  err::check_card(x, s, u, vt);
118 
119  if (x.nrows() >= x.ncols())
120  {
121  int info = svd_internals(1, 1, x, s, u, vt);
122  fml::linalgutils::check_info(info, "gesvd");
123  }
124  else
125  {
126  auto tx = xpose(x);
127  gpumat<REAL> v(x.get_card());
128  int info = svd_internals(1, 1, tx, s, v, u);
129  xpose(v, vt);
130  fml::linalgutils::check_info(info, "gesvd");
131  }
132  }
133 
134 
135 
136  namespace
137  {
138  template <typename REAL>
140  {
141  const len_t m = x.nrows();
142  const len_t n = x.ncols();
143  if (m <= n)
144  throw std::runtime_error("'x' must have more rows than cols");
145 
146  auto c = x.get_card();
147 
148  gpuvec<REAL> qraux(c);
149  gpuvec<REAL> work(c);
150  qr_internals(false, x, qraux, work);
151 
152  gpumat<REAL> R(c, n, n);
153  qr_R(x, R);
154 
155  gpumat<REAL> u_R(c, n, n);
156  int info = svd_internals(1, 1, R, s, u_R, vt);
157  fml::linalgutils::check_info(info, "gesvd");
158 
159  u.resize(m, n);
160  qr_Q(x, qraux, u, work);
161 
162  matmult(false, false, (REAL)1.0, u, u_R, x);
163  copy::gpu2gpu(x, u);
164  }
165 
166  template <typename REAL>
167  void tssvd(gpumat<REAL> &x, gpuvec<REAL> &s)
168  {
169  const len_t m = x.nrows();
170  const len_t n = x.ncols();
171  if (m <= n)
172  throw std::runtime_error("'x' must have more rows than cols");
173 
174  auto c = x.get_card();
175  s.resize(n);
176 
177  gpuvec<REAL> qraux(c);
178  gpuvec<REAL> work(c);
179  qr_internals(false, x, qraux, work);
180 
181  fml::gpu_utils::tri2zero('L', false, n, n, x.data_ptr(), m);
182 
183  int lwork;
184  gpulapack_status_t check = gpulapack::gesvd_buflen(c->lapack_handle(), n, n,
185  x.data_ptr(), &lwork);
186  gpulapack::err::check_ret(check, "gesvd_bufferSize");
187 
188  if (lwork > work.size())
189  work.resize(lwork);
190  if (m-1 > qraux.size())
191  qraux.resize(m-1);
192 
193  int info = 0;
194  gpuscalar<int> info_device(c, info);
195 
196  check = gpulapack::gesvd(c->lapack_handle(), 'N', 'N', n, n, x.data_ptr(),
197  m, s.data_ptr(), NULL, m, NULL, 1, work.data_ptr(), lwork,
198  qraux.data_ptr(), info_device.data_ptr());
199 
200  info_device.get_val(&info);
201  gpulapack::err::check_ret(check, "gesvd");
202  fml::linalgutils::check_info(info, "gesvd");
203  }
204 
205 
206 
207  template <typename REAL>
208  void sfsvd(gpumat<REAL> &x, gpuvec<REAL> &s, gpumat<REAL> &u, gpumat<REAL> &vt)
209  {
210  const len_t m = x.nrows();
211  const len_t n = x.ncols();
212  if (m >= n)
213  throw std::runtime_error("'x' must have more cols than rows");
214 
215  gpumat<REAL> tx = xpose(x);
216  gpumat<REAL> v(x.get_card());
217  tssvd(tx, s, v, u);
218  xpose(v, vt);
219  }
220 
221  template <typename REAL>
222  void sfsvd(gpumat<REAL> &x, gpuvec<REAL> &s)
223  {
224  const len_t m = x.nrows();
225  const len_t n = x.ncols();
226  if (m >= n)
227  throw std::runtime_error("'x' must have more cols than rows");
228 
229  auto tx = xpose(x);
230  tssvd(tx, s);
231  }
232  }
233 
267  template <typename REAL>
269  {
270  err::check_card(x, s, u, vt);
271 
272  if (x.is_square())
273  svd(x, s, u, vt);
274  else if (x.nrows() > x.ncols())
275  tssvd(x, s, u, vt);
276  else
277  sfsvd(x, s, u, vt);
278  }
279 
281  template <typename REAL>
283  {
284  err::check_card(x, s);
285 
286  if (x.is_square())
287  svd(x, s);
288  else if (x.nrows() > x.ncols())
289  tssvd(x, s);
290  else
291  sfsvd(x, s);
292  }
293 
294 
320  template <typename REAL>
322  {
323  err::check_card(x, s, u, vt);
324 
325  const len_t m = x.nrows();
326  const len_t n = x.ncols();
327  const len_t minmn = std::min(m, n);
328 
329  auto c = x.get_card();
330 
331  gpumat<REAL> cp(c);
332 
333  if (m >= n)
334  {
335  crossprod((REAL)1.0, x, cp);
336  eigen_sym(cp, s, vt);
337  vt.rev_cols();
338  copy::gpu2gpu(vt, cp);
339  }
340  else
341  {
342  tcrossprod((REAL)1.0, x, cp);
343  eigen_sym(cp, s, u);
344  u.rev_cols();
345  copy::gpu2gpu(u, cp);
346  }
347 
348  s.rev();
349  auto sgrid = s.get_griddim();
350  auto sblock = s.get_blockdim();
351  fml::kernelfuns::kernel_root_abs<<<sgrid, sblock>>>(s.size(), s.data_ptr());
352 
353  REAL *ev_d;
354  if (m >= n)
355  ev_d = vt.data_ptr();
356  else
357  ev_d = cp.data_ptr();
358 
359  auto xgrid = x.get_griddim();
360  auto xblock = x.get_blockdim();
361  fml::kernelfuns::kernel_sweep_cols_div<<<xgrid, xblock>>>(minmn, minmn,
362  ev_d, s.data_ptr());
363 
364  if (m >= n)
365  {
366  matmult(false, false, (REAL)1.0, x, vt, u);
367  xpose(cp, vt);
368  }
369  else
370  matmult(true, false, (REAL)1.0, cp, x, vt);
371  }
372 
374  template <typename REAL>
375  void cpsvd(const gpumat<REAL> &x, gpuvec<REAL> &s)
376  {
377  err::check_card(x, s);
378 
379  const len_t m = x.nrows();
380  const len_t n = x.ncols();
381 
382  auto c = x.get_card();
383 
384  gpumat<REAL> cp(c);
385 
386  if (m >= n)
387  crossprod((REAL)1.0, x, cp);
388  else
389  tcrossprod((REAL)1.0, x, cp);
390 
391  eigen_sym(cp, s);
392 
393  s.rev();
394  fml::kernelfuns::kernel_root_abs<<<s.get_griddim(), s.get_blockdim()>>>(s.size(), s.data_ptr());
395  }
396 
397 
398 
399  namespace
400  {
401  template <typename REAL>
402  void rsvd_A(const uint32_t seed, const int k, const int q, gpumat<REAL> &x,
403  gpumat<REAL> &QY)
404  {
405  const len_t m = x.nrows();
406  const len_t n = x.ncols();
407 
408  gpumat<REAL> omega(x.get_card(), n, 2*k);
409  omega.fill_runif(seed);
410 
411  gpumat<REAL> Y(x.get_card(), m, 2*k);
412  gpumat<REAL> Z(x.get_card(), n, 2*k);
413  gpumat<REAL> QZ(x.get_card(), n, 2*k);
414 
415  gpuvec<REAL> qraux(x.get_card());
416  gpuvec<REAL> work(x.get_card());
417 
418  gpumat<REAL> B(x.get_card(), 2*k, n);
419 
420  // Stage A
421  matmult(false, false, (REAL)1.0, x, omega, Y);
422  qr_internals(false, Y, qraux, work);
423  qr_Q(Y, qraux, QY, work);
424 
425  for (int i=0; i<q; i++)
426  {
427  matmult(true, false, (REAL)1.0, x, QY, Z);
428  qr_internals(false, Z, qraux, work);
429  qr_Q(Z, qraux, QZ, work);
430 
431  matmult(false, false, (REAL)1.0, x, QZ, Y);
432  qr_internals(false, Y, qraux, work);
433  qr_Q(Y, qraux, QY, work);
434  }
435  }
436  }
437 
459  template <typename REAL>
460  void rsvd(const uint32_t seed, const int k, const int q, gpumat<REAL> &x,
461  gpuvec<REAL> &s)
462  {
463  const len_t m = x.nrows();
464  const len_t n = x.ncols();
465 
466  gpumat<REAL> QY(x.get_card(), m, 2*k);
467  gpumat<REAL> B(x.get_card(), 2*k, n);
468 
469  // Stage A
470  rsvd_A(seed, k, q, x, QY);
471 
472  // Stage B
473  matmult(true, false, (REAL)1.0, QY, x, B);
474 
475  cpumat<REAL> uB;
476  svd(B, s);
477 
478  s.resize(k);
479  }
480 
482  template <typename REAL>
483  void rsvd(const uint32_t seed, const int k, const int q, gpumat<REAL> &x,
485  {
486  const len_t m = x.nrows();
487  const len_t n = x.ncols();
488 
489  gpumat<REAL> QY(x.get_card(), m, 2*k);
490  gpumat<REAL> B(x.get_card(), 2*k, n);
491 
492  // Stage A
493  rsvd_A(seed, k, q, x, QY);
494 
495  // Stage B
496  matmult(true, false, (REAL)1.0, QY, x, B);
497 
498  gpumat<REAL> uB(x.get_card());
499  qrsvd(B, s, uB, vt);
500 
501  s.resize(k);
502 
503  matmult(false, false, (REAL)1.0, QY, uB, u);
504  u.resize(u.nrows(), k);
505  }
506 }
507 }
508 
509 
510 #endif
fml::cpumat
Matrix class for data held on a single CPU.
Definition: cpumat.hh:36
fml::linalg::crossprod
void crossprod(const REAL alpha, const cpumat< REAL > &x, cpumat< REAL > &ret)
Computes lower triangle of alpha*x^T*x.
Definition: linalg_blas.hh:379
fml::unimat::is_square
bool is_square() const
Is the matrix square?
Definition: unimat.hh:34
fml::linalg::xpose
void xpose(const cpumat< REAL > &x, cpumat< REAL > &tx)
Computes the transpose out-of-place (i.e. in a copy).
Definition: linalg_blas.hh:463
fml::univec::data_ptr
T * data_ptr()
Pointer to the internal array.
Definition: univec.hh:28
fml::gpuvec
Vector class for data held on a single GPU.
Definition: gpuvec.hh:32
fml::gpuvec::resize
void resize(len_t size)
Resize the internal object storage.
Definition: gpuvec.hh:224
fml::unimat::nrows
len_t nrows() const
Number of rows.
Definition: unimat.hh:36
fml::linalg::rsvd
void rsvd(const uint32_t seed, const int k, const int q, cpumat< REAL > &x, cpuvec< REAL > &s)
Computes the truncated singular value decomposition using the normal projections method of Halko et a...
Definition: linalg_svd.hh:458
fml::copy::gpu2gpu
void gpu2gpu(const gpuvec< REAL_IN > &gpu_in, gpuvec< REAL_OUT > &gpu_out)
Copy data from a GPU object to another.
Definition: copy.hh:203
fml::unimat::ncols
len_t ncols() const
Number of columns.
Definition: unimat.hh:38
fml::linalg::qrsvd
void qrsvd(cpumat< REAL > &x, cpuvec< REAL > &s, cpumat< REAL > &u, cpumat< REAL > &vt)
Computes the singular value decomposition by first reducing the rectangular matrix to a square matrix...
Definition: linalg_svd.hh:269
fml::linalg::tssvd
void tssvd(parmat_cpu< REAL > &x, cpuvec< REAL > &s)
Computes the singular value decomposition by first reducing the rectangular matrix to a square matrix...
Definition: svd.hh:130
fml::linalg::svd
void svd(cpumat< REAL > &x, cpuvec< REAL > &s)
Computes the singular value decomposition.
Definition: linalg_svd.hh:97
fml::unimat::data_ptr
REAL * data_ptr()
Pointer to the internal array.
Definition: unimat.hh:40
fml::gpuvec::rev
void rev()
Reverse the vector.
Definition: gpuvec.hh:452
fml
Core namespace.
Definition: dimops.hh:10
fml::linalg::qr_Q
void qr_Q(const cpumat< REAL > &QR, const cpuvec< REAL > &qraux, cpumat< REAL > &Q, cpuvec< REAL > &work)
Recover the Q matrix from a QR decomposition.
Definition: linalg_qr.hh:120
fml::univec::size
len_t size() const
Number of elements in the vector.
Definition: univec.hh:26
fml::linalg::qr_R
void qr_R(const cpumat< REAL > &QR, cpumat< REAL > &R)
Recover the R matrix from a QR decomposition.
Definition: linalg_qr.hh:162
fml::gpumat::resize
void resize(len_t nrows, len_t ncols)
Resize the internal object storage.
Definition: gpumat.hh:253
fml::linalg::eigen_sym
void eigen_sym(cpumat< REAL > &x, cpuvec< REAL > &values)
Compute the eigenvalues and optionally the eigenvectors for a symmetric matrix.
Definition: linalg_eigen.hh:93
fml::gpumat
Matrix class for data held on a single GPU.
Definition: gpumat.hh:35
fml::linalg::tcrossprod
void tcrossprod(const REAL alpha, const cpumat< REAL > &x, cpumat< REAL > &ret)
Computes lower triangle of alpha*x*x^T.
Definition: linalg_blas.hh:423
fml::gpumat::rev_cols
void rev_cols()
Reverse the columns of the matrix.
Definition: gpumat.hh:604
fml::linalg::matmult
void matmult(const bool transx, const bool transy, const REAL alpha, const cpumat< REAL > &x, const cpumat< REAL > &y, cpumat< REAL > &ret)
Computes ret = alpha*op(x)*op(y) where op(A) is A or A^T.
Definition: linalg_blas.hh:257
fml::linalg::cpsvd
void cpsvd(const cpumat< REAL > &x, cpuvec< REAL > &s, cpumat< REAL > &u, cpumat< REAL > &vt)
Computes the singular value decomposition using the "crossproducts SVD". This method is not numerical...
Definition: linalg_svd.hh:319