fml  0.1-0
Fused Matrix Library
dimops.hh
1 // This file is part of fml which is released under the Boost Software
2 // License, Version 1.0. See accompanying file LICENSE or copy at
3 // https://www.boost.org/LICENSE_1_0.txt
4 
5 #ifndef FML_MPI_DIMOPS_H
6 #define FML_MPI_DIMOPS_H
7 #pragma once
8 
9 
10 #include <cmath>
11 
12 #include "../_internals/dimops.hh"
13 
14 #include "../cpu/internals/vecops.hh"
15 #include "../cpu/cpuvec.hh"
16 
17 #include "internals/bcutils.hh"
18 #include "mpimat.hh"
19 
20 
21 namespace fml
22 {
24 namespace dimops
25 {
34  template <typename REAL>
35  void rowsums(const mpimat<REAL> &x, cpuvec<REAL> &s)
36  {
37  const REAL *x_d = x.data_ptr();
38  const len_t m = x.nrows();
39  const len_t m_local = x.nrows_local();
40  const len_t n_local = x.ncols_local();
41  const len_t mb = x.bf_rows();
42 
43  const grid g = x.get_grid();
44 
45  s.resize(m);
46  s.fill_zero();
47  REAL *s_d = s.data_ptr();
48 
49  for (len_t j=0; j<n_local; j++)
50  {
51  for (len_t i=0; i<m_local; i++)
52  {
53  const len_t gi = fml::bcutils::l2g(i, mb, g.nprow(), g.myrow());
54  s_d[gi] += x_d[i + m_local*j];
55  }
56  }
57 
58  x.get_grid().allreduce(m, 1, s_d, 'A');
59  }
60 
61 
62 
71  template <typename REAL>
72  void rowmeans(const mpimat<REAL> &x, cpuvec<REAL> &s)
73  {
74  rowsums(x, s);
75  fml::vecops::cpu::sweep_mul((REAL) 1.0/x.ncols(), x.nrows(), s.data_ptr());
76  }
77 
78 
79 
88  template <typename REAL>
89  void colsums(const mpimat<REAL> &x, cpuvec<REAL> &s)
90  {
91  const REAL *x_d = x.data_ptr();
92  const len_t n = x.ncols();
93  const len_t m_local = x.nrows_local();
94  const len_t n_local = x.ncols_local();
95  const len_t nb = x.bf_cols();
96 
97  const grid g = x.get_grid();
98 
99  s.resize(n);
100  s.fill_zero();
101  REAL *s_d = s.data_ptr();
102 
103  #pragma omp parallel for if(m_local*n_local > fml::omp::OMP_MIN_SIZE)
104  for (len_t j=0; j<n_local; j++)
105  {
106  const len_t gj = fml::bcutils::l2g(j, nb, g.npcol(), g.mycol());
107  fml::vecops::cpu::sum(m_local, x_d + m_local*j, s_d[gj]);
108  }
109 
110  x.get_grid().allreduce(n, 1, s_d, 'A');
111  }
112 
113 
114 
123  template <typename REAL>
124  void colmeans(const mpimat<REAL> &x, cpuvec<REAL> &s)
125  {
126  colsums(x, s);
127  fml::vecops::cpu::sweep_mul((REAL) 1.0/x.nrows(), x.ncols(), s.data_ptr());
128  }
129 
130 
131 
145  template <typename REAL>
146  static inline void rowsweep(mpimat<REAL> &x, const cpuvec<REAL> &s, const sweep_op op)
147  {
148  if (s.size() != x.nrows())
149  throw std::runtime_error("non-conformal arguments");
150 
151  const len_t m_local = x.nrows_local();
152  const len_t n_local = x.ncols_local();
153  REAL *x_d = x.data_ptr();
154  const REAL *s_d = s.data_ptr();
155 
156  const auto g = x.get_grid();
157 
158  if (op == SWEEP_ADD)
159  {
160  #pragma omp parallel for if(m_local*n_local > fml::omp::OMP_MIN_SIZE)
161  for (len_t j=0; j<n_local; j++)
162  {
163  #pragma omp simd
164  for (len_t i=0; i<m_local; i++)
165  x_d[i + m_local*j] += s_d[bcutils::l2g(i, x.bf_rows(), g.nprow(), g.myrow())];
166  }
167  }
168  else if (op == SWEEP_SUB)
169  {
170  #pragma omp parallel for if(m_local*n_local > fml::omp::OMP_MIN_SIZE)
171  for (len_t j=0; j<n_local; j++)
172  {
173  #pragma omp simd
174  for (len_t i=0; i<m_local; i++)
175  x_d[i + m_local*j] -= s_d[bcutils::l2g(i, x.bf_rows(), g.nprow(), g.myrow())];
176  }
177  }
178  else if (op == SWEEP_MUL)
179  {
180  #pragma omp parallel for if(m_local*n_local > fml::omp::OMP_MIN_SIZE)
181  for (len_t j=0; j<n_local; j++)
182  {
183  #pragma omp simd
184  for (len_t i=0; i<m_local; i++)
185  x_d[i + m_local*j] *= s_d[bcutils::l2g(i, x.bf_rows(), g.nprow(), g.myrow())];
186  }
187  }
188  else if (op == SWEEP_DIV)
189  {
190  #pragma omp parallel for if(m_local*n_local > fml::omp::OMP_MIN_SIZE)
191  for (len_t j=0; j<n_local; j++)
192  {
193  #pragma omp simd
194  for (len_t i=0; i<m_local; i++)
195  x_d[i + m_local*j] /= s_d[bcutils::l2g(i, x.bf_rows(), g.nprow(), g.myrow())];
196  }
197  }
198  }
199 
200 
201 
215  template <typename REAL>
216  static inline void colsweep(mpimat<REAL> &x, const cpuvec<REAL> &s, const sweep_op op)
217  {
218  if (s.size() != x.ncols())
219  throw std::runtime_error("non-conformal arguments");
220 
221  const len_t m_local = x.nrows_local();
222  const len_t n_local = x.ncols_local();
223  REAL *x_d = x.data_ptr();
224  const REAL *s_d = s.data_ptr();
225 
226  const auto g = x.get_grid();
227 
228  if (op == SWEEP_ADD)
229  {
230  #pragma omp parallel for if(m_local*n_local > fml::omp::OMP_MIN_SIZE)
231  for (len_t j=0; j<n_local; j++)
232  {
233  #pragma omp simd
234  for (len_t i=0; i<m_local; i++)
235  x_d[i + m_local*j] += s_d[bcutils::l2g(j, x.bf_cols(), g.npcol(), g.mycol())];
236  }
237  }
238  else if (op == SWEEP_SUB)
239  {
240  #pragma omp parallel for if(m_local*n_local > fml::omp::OMP_MIN_SIZE)
241  for (len_t j=0; j<n_local; j++)
242  {
243  #pragma omp simd
244  for (len_t i=0; i<m_local; i++)
245  x_d[i + m_local*j] -= s_d[bcutils::l2g(j, x.bf_cols(), g.npcol(), g.mycol())];
246  }
247  }
248  else if (op == SWEEP_MUL)
249  {
250  #pragma omp parallel for if(m_local*n_local > fml::omp::OMP_MIN_SIZE)
251  for (len_t j=0; j<n_local; j++)
252  {
253  #pragma omp simd
254  for (len_t i=0; i<m_local; i++)
255  x_d[i + m_local*j] *= s_d[bcutils::l2g(j, x.bf_cols(), g.npcol(), g.mycol())];
256  }
257  }
258  else if (op == SWEEP_DIV)
259  {
260  #pragma omp parallel for if(m_local*n_local > fml::omp::OMP_MIN_SIZE)
261  for (len_t j=0; j<n_local; j++)
262  {
263  #pragma omp simd
264  for (len_t i=0; i<m_local; i++)
265  x_d[i + m_local*j] /= s_d[bcutils::l2g(j, x.bf_cols(), g.npcol(), g.mycol())];
266  }
267  }
268  }
269 
270 
271 
272  namespace internals
273  {
274  template <typename REAL>
275  static inline void col_mean(const grid g, const len_t j, const len_t m, const len_t m_local, const REAL *x, REAL &mean)
276  {
277  mean = 0;
278  fml::vecops::cpu::sum(m_local, x + m_local*j, mean);
279  g.allreduce(1, 1, &mean, 'C');
280  mean /= (REAL) m;
281  }
282 
283  template <typename REAL>
284  static inline void col_var(const grid g, const len_t j, const len_t m, const len_t m_local, const REAL *x, const REAL &mean, REAL *work, REAL &var)
285  {
286  work[0] = 0;
287  work[1] = 0;
288 
289  for (len_t i = 0; i<m_local; i++)
290  {
291  REAL diff = x[i + m_local*j] - mean;
292  work[0] += diff*diff;
293  work[1] += diff;
294  }
295 
296  g.allreduce(2, 1, work, 'C');
297 
298  var = (work[0] - work[1]*work[1]/m) / (m-1);
299  }
300 
301 
302 
303  template <typename REAL>
304  static inline void center(mpimat<REAL> &x)
305  {
306  REAL *x_d = x.data_ptr();
307  const len_t m = x.nrows();
308  const len_t m_local = x.nrows_local();
309  const len_t n_local = x.ncols_local();
310 
311  grid g = x.get_grid();
312 
313  for (len_t j=0; j<n_local; j++)
314  {
315  REAL mean;
316  col_mean(g, j, m, m_local, x_d, mean);
317  fml::vecops::cpu::sweep_add(-mean, m_local, x_d + m_local*j);
318  }
319  }
320 
321  template <typename REAL>
322  static inline void scale(mpimat<REAL> &x)
323  {
324  REAL *x_d = x.data_ptr();
325  const len_t m = x.nrows();
326  const len_t m_local = x.nrows_local();
327  const len_t n_local = x.ncols_local();
328 
329  grid g = x.get_grid();
330 
331  REAL work[2];
332 
333  for (len_t j=0; j<n_local; j++)
334  {
335  REAL mean;
336  col_mean(g, j, m, m_local, x_d, mean);
337 
338  REAL var;
339  col_var(g, j, m, m_local, x_d, mean, work, var);
340  var = (REAL)1.0/sqrt(var);
341  fml::vecops::cpu::sweep_mul(var, m_local, x_d + m_local*j);
342  }
343  }
344 
345  template <typename REAL>
346  static inline void center_and_scale(mpimat<REAL> &x)
347  {
348  REAL *x_d = x.data_ptr();
349  const len_t m = x.nrows();
350  const len_t m_local = x.nrows_local();
351  const len_t n_local = x.ncols_local();
352 
353  grid g = x.get_grid();
354 
355  REAL work[2];
356 
357  for (len_t j=0; j<n_local; j++)
358  {
359  REAL mean;
360  col_mean(g, j, m, m_local, x_d, mean);
361 
362  REAL var;
363  col_var(g, j, m, m_local, x_d, mean, work, var);
364 
365  #pragma omp simd
366  for (len_t i=0; i<m_local; i++)
367  x_d[i + m_local*j] = (x_d[i + m_local*j] - mean) / sqrt(var);
368  }
369  }
370  }
371 
372 
373 
383  template <typename REAL>
384  void scale(const bool rm_mean, const bool rm_sd, mpimat<REAL> &x)
385  {
386  if (rm_mean && rm_sd)
387  internals::center_and_scale(x);
388  else if (rm_mean)
389  internals::center(x);
390  else if (rm_sd)
391  internals::scale(x);
392  }
393 }
394 }
395 
396 
397 #endif
fml::grid
2-dimensional MPI process grid.
Definition: grid.hh:70
fml::mpimat
Matrix class for data distributed over MPI in the 2-d block cyclic format.
Definition: mpimat.hh:40
fml::grid::mycol
int mycol() const
The process column (0-based index) of the calling process.
Definition: grid.hh:129
fml::univec::data_ptr
T * data_ptr()
Pointer to the internal array.
Definition: univec.hh:28
fml::unimat::nrows
len_t nrows() const
Number of rows.
Definition: unimat.hh:36
fml::cpuvec::resize
void resize(len_t size)
Resize the internal object storage.
Definition: cpuvec.hh:210
fml::cpuvec
Vector class for data held on a single CPU.
Definition: cpuvec.hh:31
fml::dimops::scale
void scale(const bool rm_mean, const bool rm_sd, cpumat< REAL > &x)
Remove the mean and/or the sd from a matrix.
Definition: dimops.hh:376
fml::unimat::ncols
len_t ncols() const
Number of columns.
Definition: unimat.hh:38
fml::unimat::data_ptr
REAL * data_ptr()
Pointer to the internal array.
Definition: unimat.hh:40
fml::dimops::SWEEP_ADD
@ SWEEP_ADD
Definition: dimops.hh:22
fml
Core namespace.
Definition: dimops.hh:10
fml::univec::size
len_t size() const
Number of elements in the vector.
Definition: univec.hh:26
fml::cpuvec::fill_zero
void fill_zero()
Set all values to zero.
Definition: cpuvec.hh:317
fml::dimops::sweep_op
sweep_op
Definition: dimops.hh:14
fml::dimops::rowsums
void rowsums(const cpumat< REAL > &x, cpuvec< REAL > &s)
Compute the row sums.
Definition: dimops.hh:72
fml::grid::npcol
int npcol() const
The number of processes columns in the BLACS context.
Definition: grid.hh:125
fml::dimops::colsums
void colsums(const cpumat< REAL > &x, cpuvec< REAL > &s)
Compute the column sums.
Definition: dimops.hh:125
fml::grid::myrow
int myrow() const
The process row (0-based index) of the calling process.
Definition: grid.hh:127
fml::grid::nprow
int nprow() const
The number of processes rows in the BLACS context.
Definition: grid.hh:123
fml::dimops::colmeans
void colmeans(const cpumat< REAL > &x, cpuvec< REAL > &s)
Compute the column means.
Definition: dimops.hh:151
fml::dimops::rowmeans
void rowmeans(const cpumat< REAL > &x, cpuvec< REAL > &s)
Compute the row means.
Definition: dimops.hh:101