fml  0.1-0
Fused Matrix Library
copy.hh
// This file is part of fml which is released under the Boost Software
// License, Version 1.0. See accompanying file LICENSE or copy at
// https://www.boost.org/LICENSE_1_0.txt

#ifndef FML_MPI_COPY_H
#define FML_MPI_COPY_H
#pragma once


#include <algorithm>
#include <stdexcept>

#include "../_internals/arraytools/src/arraytools.hpp"

#include "../cpu/cpumat.hh"
#include "../cpu/cpuvec.hh"

#include "internals/bcutils.hh"
#include "grid.hh"
#include "mpimat.hh"


namespace fml
{
namespace copy
{
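
  /**
    @brief Copy data from an MPI object to a CPU object. Every process in the
    grid receives a copy of the full global matrix.
    
    @param[in] mpi Input data, distributed in the 2-d block cyclic format.
    @param[out] cpu Output. If its dimensions do not match those of the input,
    it will be resized. The input and output fundamental types need not match;
    the copy casts element-wise.
    
    A minimal usage sketch. The grid argument and the (grid, m, n, mb, nb)
    mpimat constructor used below are assumptions for illustration, not taken
    from this file:
    
    @code
    // assumes an already-initialized fml::grid
    void example(fml::grid &g)
    {
      fml::mpimat<float> x(g, 5, 5, 1, 1);  // 5x5 global matrix, 1x1 blocking (assumed ctor)
      x.fill_zero();
      
      fml::cpumat<double> y;
      fml::copy::mpi2cpu_all(x, y);  // every rank now holds the full 5x5 matrix in y
    }
    @endcode
   */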
  template <typename REAL_IN, typename REAL_OUT>
  void mpi2cpu_all(const mpimat<REAL_IN> &mpi, cpumat<REAL_OUT> &cpu)
  {
    grid g = mpi.get_grid();
    if (!g.ingrid())
      return;
    
    const len_local_t m_local = mpi.nrows_local();
    const len_local_t n_local = mpi.ncols_local();
    
    const int mb = mpi.bf_rows();
    
    const len_t m = mpi.nrows();
    const len_t n = mpi.ncols();
    
    if (m != cpu.nrows() || n != cpu.ncols())
      cpu.resize(m, n);
    
    cpu.fill_zero();
    
    REAL_OUT *gbl = cpu.data_ptr();
    const REAL_IN *sub = mpi.data_ptr();
    
    if (m_local > 0 && n_local > 0)
    {
      #pragma omp parallel for if(m_local*n_local > fml::omp::OMP_MIN_SIZE)
      for (len_local_t j=0; j<n_local; j++)
      {
        const int gj = fml::bcutils::l2g(j, mpi.bf_cols(), g.npcol(), g.mycol());
        
        for (len_local_t i=0; i<m_local; i+=mb)
        {
          const int gi = fml::bcutils::l2g(i, mpi.bf_rows(), g.nprow(), g.myrow());
          
          for (int ii=0; ii<mb && ii+i<m_local; ii++)
            gbl[gi+ii + m*gj] = (REAL_OUT) sub[i+ii + m_local*j];
        }
      }
    }
    
    g.allreduce(m, n, gbl);
  }
  
  
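  /**
    @brief Copy data from an MPI object to a newly constructed CPU object,
    which is returned. Every process in the grid receives a copy of the full
    global matrix.
    
    @param[in] mpi Input data, distributed in the 2-d block cyclic format.
    @return A cpumat copy of the full global matrix.
   */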
  template <typename REAL>
  cpumat<REAL> mpi2cpu_all(const mpimat<REAL> &mpi)
  {
    cpumat<REAL> cpu;
    mpi2cpu_all(mpi, cpu);
    
    return cpu;
  }
  
  
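  /**
    @brief Copy data from an MPI object to a CPU object. Only the process at
    grid position `(rdest, cdest)` receives the full matrix; every other
    process participates by sending its local blocks.
    
    @param[in] mpi Input data, distributed in the 2-d block cyclic format.
    @param[out] cpu Output, only meaningful on the receiving process, where it
    will be resized if its dimensions do not match those of the input.
    @param[in] rdest,cdest Row and column position in the grid of the receiving
    process.
    
    A minimal usage sketch; as above, the grid argument and the mpimat
    constructor are illustrative assumptions:
    
    @code
    // assumes an already-initialized fml::grid
    void example(fml::grid &g)
    {
      fml::mpimat<float> x(g, 4, 4, 1, 1);  // assumed (grid, m, n, mb, nb) constructor
      x.fill_zero();
      
      fml::cpumat<float> y;
      fml::copy::mpi2cpu(x, y);  // only the process at grid position (0, 0) holds the result
    }
    @endcode
   */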
  template <typename REAL_IN, typename REAL_OUT>
  void mpi2cpu(const mpimat<REAL_IN> &mpi, cpumat<REAL_OUT> &cpu, int rdest=0, int cdest=0)
  {
    const grid g = mpi.get_grid();
    if (!g.ingrid())
      return;
    
    const bool i_am_ret = (g.myrow() == rdest && g.mycol() == cdest);
    
    const len_local_t m_local = mpi.nrows_local();
    
    const int mb = mpi.bf_rows();
    const int nb = mpi.bf_cols();
    
    const len_t m = mpi.nrows();
    const len_t n = mpi.ncols();
    
    if (i_am_ret)
    {
      if (m != cpu.nrows() || n != cpu.ncols())
        cpu.resize(m, n);
      
      cpu.fill_zero();
    }
    
    REAL_OUT *gbl = cpu.data_ptr();
    const REAL_IN *sub = mpi.data_ptr();
    
    cpumat<REAL_OUT> tmp(mb, nb);
    REAL_OUT *tmp_d = tmp.data_ptr();
    
    for (len_t gj=0; gj<n; gj+=nb)
    {
      const int pc = fml::bcutils::g2p(gj, nb, g.npcol());
      const len_t j = fml::bcutils::g2l(gj, nb, g.npcol());
      const len_t col_copylen = std::min(nb, n-gj);
      
      for (len_t gi=0; gi<m; gi+=mb)
      {
        const int pr = fml::bcutils::g2p(gi, mb, g.nprow());
        const len_t i = fml::bcutils::g2l(gi, mb, g.nprow());
        const len_t row_copylen = std::min(mb, m-gi);
        
        if (i_am_ret)
        {
          if (pr == g.myrow() && pc == g.mycol())
          {
            for (int jj=0; jj<col_copylen; jj++)
              arraytools::copy(row_copylen, sub + i+m_local*(j+jj), gbl + gi+m*(gj+jj));
          }
          else
            g.recv(row_copylen, col_copylen, m, gbl + gi+m*gj, pr, pc);
        }
        else if (pr == g.myrow() && pc == g.mycol())
        {
          for (len_t jj=0; jj<col_copylen; jj++)
          {
            for (len_t ii=0; ii<row_copylen; ii++)
              tmp_d[ii + mb*jj] = (REAL_OUT) sub[i+ii + m_local*(j+jj)];
          }
          
          g.send(row_copylen, col_copylen, mb, tmp_d, rdest, cdest);
        }
      }
    }
  }
  
  
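  /**
    @brief Copy data from an MPI object to a newly constructed CPU object,
    which is returned. Only the process at grid position `(0, 0)` receives the
    full matrix.
    
    @param[in] mpi Input data, distributed in the 2-d block cyclic format.
    @return A cpumat copy of the full global matrix; only meaningful on the
    process at grid position `(0, 0)`.
   */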
  template <typename REAL>
  cpumat<REAL> mpi2cpu(const mpimat<REAL> &mpi)
  {
    cpumat<REAL> cpu;
    mpi2cpu(mpi, cpu);
    
    return cpu;
  }
  
  
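  /**
    @brief Copy data from a CPU object to an MPI object. Each process keeps
    only the entries of the input that belong to its local block cyclic piece,
    so all processes should pass the same global matrix.
    
    @param[in] cpu Input data.
    @param[out] mpi Output, distributed in the 2-d block cyclic format. If its
    dimensions do not match those of the input, it will be resized.
    
    A minimal usage sketch; the grid argument and the mpimat constructor are
    illustrative assumptions:
    
    @code
    // assumes an already-initialized fml::grid
    void example(fml::grid &g)
    {
      fml::cpumat<double> x(3, 3);
      x.fill_zero();
      
      fml::mpimat<double> y(g, 3, 3, 1, 1);  // assumed (grid, m, n, mb, nb) constructor
      fml::copy::cpu2mpi(x, y);  // y now holds the block cyclic distribution of x
    }
    @endcode
   */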
  template <typename REAL_IN, typename REAL_OUT>
  void cpu2mpi(const cpumat<REAL_IN> &cpu, mpimat<REAL_OUT> &mpi)
  {
    const len_t m = cpu.nrows();
    const len_t n = cpu.ncols();
    
    if (m != mpi.nrows() || n != mpi.ncols())
      mpi.resize(m, n);
    
    mpi.fill_zero();
    
    const grid g = mpi.get_grid();
    
    const len_local_t m_local = mpi.nrows_local();
    const len_local_t n_local = mpi.ncols_local();
    const int mb = mpi.bf_rows();
    
    const REAL_IN *gbl = cpu.data_ptr();
    REAL_OUT *sub = mpi.data_ptr();
    
    if (m_local > 0 && n_local > 0)
    {
      #pragma omp parallel for if(m_local*n_local > fml::omp::OMP_MIN_SIZE)
      for (len_local_t j=0; j<n_local; j++)
      {
        const int gj = fml::bcutils::l2g(j, mpi.bf_cols(), g.npcol(), g.mycol());
        
        for (len_local_t i=0; i<m_local; i+=mb)
        {
          const int gi = fml::bcutils::l2g(i, mpi.bf_rows(), g.nprow(), g.myrow());
          
          for (int ii=0; ii<mb && ii+i<m_local; ii++)
            sub[i+ii + m_local*j] = (REAL_OUT) gbl[gi+ii + m*gj];
        }
      }
    }
  }
  
  
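  /**
    @brief Copy data from one MPI object to another.
    
    @details The copy operates directly on the local storage, so the two
    matrices must be distributed on the same process grid and should use the
    same blocking factors. The output is resized to the dimensions of the
    input.
    
    @param[in] mpi_in Input data.
    @param[out] mpi_out Output.
    
    @exception std::runtime_error Thrown if the two matrices are distributed
    on different process grids.
    
    A minimal usage sketch; the grid argument and the mpimat constructor are
    illustrative assumptions:
    
    @code
    // assumes an already-initialized fml::grid
    void example(fml::grid &g)
    {
      fml::mpimat<float> x(g, 4, 4, 1, 1);   // assumed (grid, m, n, mb, nb) constructor
      x.fill_zero();
      
      fml::mpimat<double> y(g, 4, 4, 1, 1);  // same grid and blocking as x
      fml::copy::mpi2mpi(x, y);              // y now holds a double precision copy of x
    }
    @endcode
   */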
  template <typename REAL_IN, typename REAL_OUT>
  void mpi2mpi(const mpimat<REAL_IN> &mpi_in, mpimat<REAL_OUT> &mpi_out)
  {
    if (mpi_in.get_grid().ictxt() != mpi_out.get_grid().ictxt())
      throw std::runtime_error("mpimat objects must be distributed on the same process grid");
    
    mpi_out.resize(mpi_in.nrows(), mpi_in.ncols());
    
    size_t len = (size_t) mpi_in.nrows_local() * mpi_in.ncols_local();
    arraytools::copy(len, mpi_in.data_ptr(), mpi_out.data_ptr());
  }
}
}


#endif