fml  0.1-0
Fused Matrix Library
linalg_blas.hh
1 // This file is part of fml which is released under the Boost Software
2 // License, Version 1.0. See accompanying file LICENSE or copy at
3 // https://www.boost.org/LICENSE_1_0.txt
4 
5 #ifndef FML_CPU_LINALG_LINALG_BLAS_H
6 #define FML_CPU_LINALG_LINALG_BLAS_H
7 #pragma once
8 
9 
10 #include <cmath>
11 #include <stdexcept>
12 
13 #include "../../_internals/linalgutils.hh"
14 #include "../../_internals/omp.hh"
15 
16 #include "blas.hh"
17 
18 #include "../cpumat.hh"
19 
20 
21 namespace fml
22 {
24 namespace linalg
25 {
39  template <typename REAL>
40  REAL dot(const cpuvec<REAL> &x, const cpuvec<REAL> &y)
41  {
42  const len_t n = std::min(x.size(), y.size());
43  const REAL *x_d = x.data_ptr();
44  const REAL *y_d = y.data_ptr();
45 
46  REAL d = 0;
47  #pragma omp simd reduction(+:d)
48  for (len_t i=0; i<n; i++)
49  d += x_d[i] * y_d[i];
50 
51  return d;
52  }
53 
54  template <typename REAL>
55  REAL dot(const cpuvec<REAL> &x)
56  {
57  return dot(x, x);
58  }
59 
60 
61 
76  template <typename REAL>
77  void add(const bool transx, const bool transy, const REAL alpha,
78  const REAL beta, const cpumat<REAL> &x, const cpumat<REAL> &y,
79  cpumat<REAL> &ret)
80  {
81  len_t m, n;
82  fml::linalgutils::matadd_params(transx, transy, x.nrows(), x.ncols(),
83  y.nrows(), y.ncols(), &m, &n);
84 
85  if (ret.nrows() != m || ret.ncols() != n)
86  ret.resize(m, n);
87 
88  const REAL *x_d = x.data_ptr();
89  const REAL *y_d = y.data_ptr();
90  REAL *ret_d = ret.data_ptr();
91 
92  if (!transx && !transy)
93  {
94  #pragma omp parallel for if(m*n > fml::omp::OMP_MIN_SIZE)
95  for (len_t j=0; j<n; j++)
96  {
97  #pragma omp simd
98  for (len_t i=0; i<m; i++)
99  ret_d[i + m*j] = alpha*x_d[i + m*j] + beta*y_d[i + m*j];
100  }
101  }
102  else if (transx && transy)
103  {
104  #pragma omp parallel for if(m*n > fml::omp::OMP_MIN_SIZE)
105  for (len_t j=0; j<m; j++)
106  {
107  #pragma omp simd
108  for (len_t i=0; i<n; i++)
109  ret_d[j + m*i] = alpha*x_d[i + n*j] + beta*y_d[i + n*j];
110  }
111  }
112  else if (transx && !transy)
113  {
114  #pragma omp parallel for if(m*n > fml::omp::OMP_MIN_SIZE)
115  for (len_t j=0; j<n; j++)
116  {
117  #pragma omp simd
118  for (len_t i=0; i<m; i++)
119  ret_d[i + m*j] = alpha*x_d[j + n*i] + beta*y_d[i + m*j];
120  }
121  }
122  else if (!transx && transy)
123  {
124  #pragma omp parallel for if(m*n > fml::omp::OMP_MIN_SIZE)
125  for (len_t j=0; j<n; j++)
126  {
127  #pragma omp simd
128  for (len_t i=0; i<m; i++)
129  ret_d[i + m*j] = alpha*x_d[i + m*j] + beta*y_d[j + n*i];
130  }
131  }
132  }
133 
135  template <typename REAL>
136  cpumat<REAL> add(const bool transx, const bool transy, const REAL alpha,
137  const REAL beta, const cpumat<REAL> &x, const cpumat<REAL> &y)
138  {
139  len_t m, n;
140  fml::linalgutils::matadd_params(transx, transy, x.nrows(), x.ncols(),
141  y.nrows(), y.ncols(), &m, &n);
142 
143  cpumat<REAL> ret(m, n);
144  add(transx, transy, alpha, beta, x, y, ret);
145  return ret;
146  }
147 
148 
149 
164  template <typename REAL>
165  void prod(const bool transx, const bool transy, const REAL alpha,
166  const REAL beta, const cpumat<REAL> &x, const cpumat<REAL> &y,
167  cpumat<REAL> &ret)
168  {
169  len_t m, n;
170  fml::linalgutils::matadd_params(transx, transy, x.nrows(), x.ncols(),
171  y.nrows(), y.ncols(), &m, &n);
172 
173  if (ret.nrows() != m || ret.ncols() != n)
174  ret.resize(m, n);
175 
176  const REAL *x_d = x.data_ptr();
177  const REAL *y_d = y.data_ptr();
178  REAL *ret_d = ret.data_ptr();
179 
180  if (!transx && !transy)
181  {
182  #pragma omp parallel for if(m*n > fml::omp::OMP_MIN_SIZE)
183  for (len_t j=0; j<n; j++)
184  {
185  #pragma omp simd
186  for (len_t i=0; i<m; i++)
187  ret_d[i + m*j] = alpha*x_d[i + m*j] * beta*y_d[i + m*j];
188  }
189  }
190  else if (transx && transy)
191  {
192  #pragma omp parallel for if(m*n > fml::omp::OMP_MIN_SIZE)
193  for (len_t j=0; j<m; j++)
194  {
195  #pragma omp simd
196  for (len_t i=0; i<n; i++)
197  ret_d[j + m*i] = alpha*x_d[i + n*j] * beta*y_d[i + n*j];
198  }
199  }
200  else if (transx && !transy)
201  {
202  #pragma omp parallel for if(m*n > fml::omp::OMP_MIN_SIZE)
203  for (len_t j=0; j<n; j++)
204  {
205  #pragma omp simd
206  for (len_t i=0; i<m; i++)
207  ret_d[i + m*j] = alpha*x_d[j + n*i] * beta*y_d[i + m*j];
208  }
209  }
210  else if (!transx && transy)
211  {
212  #pragma omp parallel for if(m*n > fml::omp::OMP_MIN_SIZE)
213  for (len_t j=0; j<n; j++)
214  {
215  #pragma omp simd
216  for (len_t i=0; i<m; i++)
217  ret_d[i + m*j] = alpha*x_d[i + m*j] * beta*y_d[j + n*i];
218  }
219  }
220  }
221 
223  template <typename REAL>
224  cpumat<REAL> prod(const bool transx, const bool transy, const REAL alpha,
225  const REAL beta, const cpumat<REAL> &x, const cpumat<REAL> &y)
226  {
227  len_t m, n;
228  fml::linalgutils::matadd_params(transx, transy, x.nrows(), x.ncols(),
229  y.nrows(), y.ncols(), &m, &n);
230 
231  cpumat<REAL> ret(m, n);
232  prod(transx, transy, alpha, beta, x, y, ret);
233  return ret;
234  }
235 
236 
237 
238 
256  template <typename REAL>
257  void matmult(const bool transx, const bool transy, const REAL alpha,
258  const cpumat<REAL> &x, const cpumat<REAL> &y, cpumat<REAL> &ret)
259  {
260  len_t m, n, k;
261  const len_t mx = x.nrows();
262  const len_t my = y.nrows();
263 
264  fml::linalgutils::matmult_params(transx, transy, mx, x.ncols(), my,
265  y.ncols(), &m, &n, &k);
266 
267  if (m != ret.nrows() || n != ret.ncols())
268  ret.resize(m, n);
269 
270  const char ctransx = transx ? 'T' : 'N';
271  const char ctransy = transy ? 'T' : 'N';
272 
273  fml::blas::gemm(ctransx, ctransy, m, n, k, alpha,
274  x.data_ptr(), mx, y.data_ptr(), my,
275  (REAL)0, ret.data_ptr(), m);
276  }
277 
279  template <typename REAL>
280  cpumat<REAL> matmult(const bool transx, const bool transy, const REAL alpha,
281  const cpumat<REAL> &x, const cpumat<REAL> &y)
282  {
283  cpumat<REAL> ret;
284  matmult(transx, transy, alpha, x, y, ret);
285 
286  return ret;
287  }
288 
290  template <typename REAL>
291  void matmult(const bool transx, const bool transy, const REAL alpha,
292  const cpumat<REAL> &x, const cpuvec<REAL> &y, cpuvec<REAL> &ret)
293  {
294  len_t m, n, k;
295  const len_t mx = x.nrows();
296  const len_t my = y.size();
297 
298  fml::linalgutils::matmult_params(transx, transy, mx, x.ncols(), my,
299  1, &m, &n, &k);
300 
301  int len = std::max(m, n);
302  if (len != ret.size())
303  ret.resize(len);
304 
305  const char ctransx = transx ? 'T' : 'N';
306  const char ctransy = transy ? 'T' : 'N';
307 
308  fml::blas::gemm(ctransx, ctransy, m, n, k, alpha,
309  x.data_ptr(), mx, y.data_ptr(), my,
310  (REAL)0, ret.data_ptr(), m);
311  }
312 
314  template <typename REAL>
315  cpuvec<REAL> matmult(const bool transx, const bool transy, const REAL alpha,
316  const cpumat<REAL> &x, const cpuvec<REAL> &y)
317  {
318  cpuvec<REAL> ret;
319  matmult(transx, transy, alpha, x, y, ret);
320 
321  return ret;
322  }
323 
325  template <typename REAL>
326  void matmult(const bool transx, const bool transy, const REAL alpha,
327  const cpuvec<REAL> &x, const cpumat<REAL> &y, cpuvec<REAL> &ret)
328  {
329  len_t m, n, k;
330  const len_t mx = x.size();
331  const len_t my = y.nrows();
332 
333  fml::linalgutils::matmult_params(transx, transy, mx, 1, my,
334  y.ncols(), &m, &n, &k);
335 
336  int len = std::max(m, n);
337  if (len != ret.size())
338  ret.resize(len);
339 
340  const char ctransx = transx ? 'T' : 'N';
341  const char ctransy = transy ? 'T' : 'N';
342 
343  fml::blas::gemm(ctransx, ctransy, m, n, k, alpha,
344  x.data_ptr(), mx, y.data_ptr(), my,
345  (REAL)0, ret.data_ptr(), m);
346  }
347 
349  template <typename REAL>
350  cpuvec<REAL> matmult(const bool transx, const bool transy, const REAL alpha,
351  const cpuvec<REAL> &x, const cpumat<REAL> &y)
352  {
353  cpuvec<REAL> ret;
354  matmult(transx, transy, alpha, x, y, ret);
355 
356  return ret;
357  }
358 
359 
360 
378  template <typename REAL>
379  void crossprod(const REAL alpha, const cpumat<REAL> &x, cpumat<REAL> &ret)
380  {
381  const len_t m = x.nrows();
382  const len_t n = x.ncols();
383 
384  if (n != ret.nrows() || n != ret.ncols())
385  ret.resize(n, n);
386 
387  ret.fill_zero();
388  fml::blas::syrk('L', 'T', n, m, alpha, x.data_ptr(), m, (REAL)0.0, ret.data_ptr(), n);
389  }
390 
392  template <typename REAL>
393  cpumat<REAL> crossprod(const REAL alpha, const cpumat<REAL> &x)
394  {
395  const len_t n = x.ncols();
396  cpumat<REAL> ret(n, n);
397 
398  crossprod(alpha, x, ret);
399 
400  return ret;
401  }
402 
403 
404 
422  template <typename REAL>
423  void tcrossprod(const REAL alpha, const cpumat<REAL> &x, cpumat<REAL> &ret)
424  {
425  const len_t m = x.nrows();
426  const len_t n = x.ncols();
427 
428  if (m != ret.nrows() || m != ret.ncols())
429  ret.resize(m, m);
430 
431  ret.fill_zero();
432  fml::blas::syrk('L', 'N', m, n, alpha, x.data_ptr(), m, (REAL)0.0, ret.data_ptr(), m);
433  }
434 
435  template <typename REAL>
436  cpumat<REAL> tcrossprod(const REAL alpha, const cpumat<REAL> &x)
437  {
438  const len_t m = x.nrows();
439  cpumat<REAL> ret(m, m);
440 
441  tcrossprod(alpha, x, ret);
442 
443  return ret;
444  }
445 
446 
447 
462  template <typename REAL>
463  void xpose(const cpumat<REAL> &x, cpumat<REAL> &tx)
464  {
465  const len_t m = x.nrows();
466  const len_t n = x.ncols();
467 
468  if (m != tx.ncols() || n != tx.nrows())
469  tx.resize(n, m);
470 
471  const int blocksize = 8;
472  const REAL *x_d = x.data_ptr();
473  REAL *tx_d = tx.data_ptr();
474 
475  #pragma omp parallel for shared(tx) schedule(dynamic, 1) if(m*n > fml::omp::OMP_MIN_SIZE)
476  for (int j=0; j<n; j+=blocksize)
477  {
478  for (int i=0; i<m; i+=blocksize)
479  {
480  for (int col=j; col<j+blocksize && col<n; ++col)
481  {
482  for (int row=i; row<i+blocksize && row<m; ++row)
483  tx_d[col + n*row] = x_d[row + m*col];
484  }
485  }
486  }
487  }
488 
490  template <typename REAL>
492  {
493  cpumat<REAL> tx(x.ncols(), x.nrows());
494  xpose(x, tx);
495  return tx;
496  }
497 }
498 }
499 
500 
501 #endif
fml::cpumat
Matrix class for data held on a single CPU.
Definition: cpumat.hh:36
fml::linalg::crossprod
void crossprod(const REAL alpha, const cpumat< REAL > &x, cpumat< REAL > &ret)
Computes lower triangle of alpha*x^T*x.
Definition: linalg_blas.hh:379
fml::linalg::xpose
void xpose(const cpumat< REAL > &x, cpumat< REAL > &tx)
Computes the transpose out-of-place (i.e. in a copy).
Definition: linalg_blas.hh:463
fml::cpumat::fill_zero
void fill_zero()
Set all values to zero.
Definition: cpumat.hh:359
fml::univec::data_ptr
T * data_ptr()
Pointer to the internal array.
Definition: univec.hh:28
fml::unimat::nrows
len_t nrows() const
Number of rows.
Definition: unimat.hh:36
fml::cpuvec::resize
void resize(len_t size)
Resize the internal object storage.
Definition: cpuvec.hh:209
fml::cpumat::resize
void resize(len_t nrows, len_t ncols)
Resize the internal object storage.
Definition: cpumat.hh:230
fml::cpuvec
Vector class for data held on a single CPU.
Definition: cpuvec.hh:31
fml::unimat::ncols
len_t ncols() const
Number of columns.
Definition: unimat.hh:38
fml::unimat::data_ptr
REAL * data_ptr()
Pointer to the internal array.
Definition: unimat.hh:40
fml
Core namespace.
Definition: dimops.hh:10
fml::univec::size
len_t size() const
Number of elements in the vector.
Definition: univec.hh:26
fml::linalg::add
void add(const bool transx, const bool transy, const REAL alpha, const REAL beta, const cpumat< REAL > &x, const cpumat< REAL > &y, cpumat< REAL > &ret)
Returns alpha*op(x) + beta*op(y) where op(A) is A or A^T.
Definition: linalg_blas.hh:77
fml::linalg::prod
void prod(const bool transx, const bool transy, const REAL alpha, const REAL beta, const cpumat< REAL > &x, const cpumat< REAL > &y, cpumat< REAL > &ret)
Returns element-wise alpha*op(x) * beta*op(y) where op(A) is A or A^T.
Definition: linalg_blas.hh:165
fml::linalg::dot
REAL dot(const cpuvec< REAL > &x, const cpuvec< REAL > &y)
Computes the dot product of two vectors, i.e. the sum of the product of the elements.
Definition: linalg_blas.hh:40
fml::linalg::tcrossprod
void tcrossprod(const REAL alpha, const cpumat< REAL > &x, cpumat< REAL > &ret)
Computes lower triangle of alpha*x*x^T.
Definition: linalg_blas.hh:423
fml::linalg::matmult
void matmult(const bool transx, const bool transy, const REAL alpha, const cpumat< REAL > &x, const cpumat< REAL > &y, cpumat< REAL > &ret)
Computes ret = alpha*op(x)*op(y) where op(A) is A or A^T.
Definition: linalg_blas.hh:257