12 #include "../_internals/arraytools/src/arraytools.hpp"
14 #include "../cpu/cpumat.hh"
15 #include "../cpu/cpuvec.hh"
17 #include "internals/bcutils.hh"
// Fragment of a function template parameterised on the source element type
// REAL_IN and the destination element type REAL_OUT. The function signature,
// the braces, and the declarations of `gbl` and `sub` are not visible in this
// excerpt.
//
// Visible behaviour: every entry of the locally held piece `sub` (indexed
// column-major with leading dimension m_local) is written, cast to REAL_OUT,
// into `gbl` (indexed column-major with leading dimension m) at the position
// given by the l2g() index mapping.
47 template <
typename REAL_IN,
typename REAL_OUT>
// Process grid taken from the distributed matrix `mpi`.
50 grid g = mpi.get_grid();
// Dimensions of the piece held by this process.
54 const len_local_t m_local = mpi.nrows_local();
55 const len_local_t n_local = mpi.ncols_local();
// Row blocking factor; used below as the step of the row loop.
57 const int mb = mpi.bf_rows();
// Dimensions reported by mpi.nrows()/mpi.ncols(); `m` is the leading dimension
// used when indexing `gbl`. `n` is not used in the visible lines.
59 const len_t m = mpi.
nrows();
60 const len_t n = mpi.
ncols();
// Nothing to copy when the local piece is empty in either dimension.
70 if (m_local > 0 && n_local > 0)
// The column loop is run in parallel only when the local element count exceeds
// fml::omp::OMP_MIN_SIZE.
72 #pragma omp parallel for if(m_local*n_local > fml::omp::OMP_MIN_SIZE)
73 for (len_local_t j=0; j<n_local; j++)
// Column index into `gbl` for local column j (presumably a local-to-global
// block-cyclic mapping -- confirm against fml::bcutils::l2g).
75 const int gj = fml::bcutils::l2g(j, mpi.bf_cols(), g.
npcol(), g.
mycol());
// Walk the local rows one row block (mb rows) at a time.
77 for (len_local_t i=0; i<m_local; i+=mb)
// Row index into `gbl` for the first row of this block; l2g() is called once
// per block and the rows inside the block are addressed as gi+ii.
79 const int gi = fml::bcutils::l2g(i, mpi.bf_rows(), g.
nprow(), g.
myrow());
// Copy at most mb rows, stopping early at the end of the local piece.
81 for (
int ii=0; ii<mb && ii+i<m_local; ii++)
// NOTE(review): gi and gj are int, so the offset gi+ii + m*gj is evaluated in
// the common type of int and len_t; if len_t is a 32-bit type this can
// overflow for very large matrices -- confirm.
82 gbl[gi+ii + m*gj] = (REAL_OUT) sub[i+ii + m_local*j];
93 template <
typename REAL>
// Fragment of a function template parameterised on the source element type
// REAL_IN and the destination element type REAL_OUT. The function signature,
// the braces, the branch conditions that select between the three actions
// below, and the declarations of `gbl`, `tmp_d`, `rdest` and `cdest` are not
// visible in this excerpt.
//
// Visible behaviour: the matrix is traversed block by block in global
// coordinates (nb columns by mb rows per step). For each block one of three
// things happens:
//   * the block is copied column by column from `sub` into `gbl`,
//   * the block is received into `gbl` from grid position (pr, pc), or
//   * the block is cast into `tmp_d` and sent to grid position (rdest, cdest).
128 template <
typename REAL_IN,
typename REAL_OUT>
// Process grid taken from the distributed matrix `mpi`.
131 const grid g = mpi.get_grid();
// True exactly when this process sits at grid coordinates (rdest, cdest).
135 bool i_am_ret = (g.
myrow() == rdest && g.
mycol() == cdest) ?
true :
false;
// Row count of the locally held piece; leading dimension when indexing `sub`.
137 const len_local_t m_local = mpi.nrows_local();
// Row and column blocking factors; they are the step sizes of the two loops.
139 const int mb = mpi.bf_rows();
140 const int nb = mpi.bf_cols();
// Dimensions reported by mpi.nrows()/mpi.ncols(); they bound the two loops and
// `m` is the leading dimension used when indexing `gbl`.
142 const len_t m = mpi.
nrows();
143 const len_t n = mpi.
ncols();
// Read-only pointer to the locally held data.
154 const REAL_IN *sub = mpi.
data_ptr();
// Step over the columns one column block at a time.
159 for (len_t gj=0; gj<n; gj+=nb)
// pc is later passed to g.recv() as the source process column, and j is used
// as the column offset into `sub` (presumably the owning process column and
// the local column index of global column gj -- confirm against
// fml::bcutils::g2p / g2l).
161 const int pc = fml::bcutils::g2p(gj, nb, g.
npcol());
162 const len_t j = fml::bcutils::g2l(gj, nb, g.
npcol());
// Width of this column block, clipped for the last (possibly partial) block.
// std::min deduces one type from both arguments, so this compiles only if
// len_t is the same type as int.
163 const len_t col_copylen = std::min(nb, n-gj);
// Step over the rows one row block at a time.
165 for (len_t gi=0; gi<m; gi+=mb)
// Row counterparts of pc and j above.
167 const int pr = fml::bcutils::g2p(gi, mb, g.
nprow());
168 const len_t i = fml::bcutils::g2l(gi, mb, g.
nprow());
// Height of this row block, clipped for the last (possibly partial) block.
169 const len_t row_copylen = std::min(mb, m-gi);
// Action 1: copy the block one column at a time from `sub` (leading dimension
// m_local) to `gbl` (leading dimension m) through arraytools::copy.
175 for (
int jj=0; jj<col_copylen; jj++)
176 arraytools::copy(row_copylen, sub + i+m_local*(j+jj), gbl + gi+m*(gj+jj));
// Action 2: receive the block from grid position (pr, pc) into `gbl` at the
// block's global offset; m is passed as the third argument (presumably the
// leading dimension of the destination -- confirm against grid::recv).
179 g.
recv(row_copylen, col_copylen, m, gbl + gi+m*gj, pr, pc);
// Action 3: pack the block into `tmp_d` with leading dimension mb, casting
// each element to REAL_OUT ...
183 for (len_t jj=0; jj<col_copylen; jj++)
185 for (len_t ii=0; ii<row_copylen; ii++)
186 tmp_d[ii + mb*jj] = (REAL_OUT) sub[i+ii + m_local*(j+jj)];
// ... then send the packed buffer to grid position (rdest, cdest); mb is
// passed as the third argument, matching the packing stride used above.
189 g.
send(row_copylen, col_copylen, mb, tmp_d, rdest, cdest);
198 template <
typename REAL>
// Fragment of a function template parameterised on the source element type
// REAL_IN and the destination element type REAL_OUT. The function signature,
// the braces, and the declaration of `sub` are not visible in this excerpt.
//
// Visible behaviour: every entry of the locally held piece `sub` (indexed
// column-major with leading dimension m_local) is filled, cast to REAL_OUT,
// from `gbl` (indexed column-major with leading dimension m) at the position
// given by the l2g() index mapping.
231 template <
typename REAL_IN,
typename REAL_OUT>
// Dimensions of the source matrix `cpu`; `m` is the leading dimension used
// when indexing `gbl`. `n` is not used in the visible lines.
234 const len_t m = cpu.
nrows();
235 const len_t n = cpu.
ncols();
// Process grid taken from the distributed matrix `mpi`.
242 const grid g = mpi.get_grid();
// Dimensions of the piece held by this process, and the row blocking factor
// used as the step of the row loop.
244 const len_local_t m_local = mpi.nrows_local();
245 const len_local_t n_local = mpi.ncols_local();
246 const int mb = mpi.bf_rows();
// Read-only pointer to the source data.
248 const REAL_IN *gbl = cpu.
data_ptr();
// Nothing to copy when the local piece is empty in either dimension.
251 if (m_local > 0 && n_local > 0)
// The column loop is run in parallel only when the local element count exceeds
// fml::omp::OMP_MIN_SIZE.
253 #pragma omp parallel for if(m_local*n_local > fml::omp::OMP_MIN_SIZE)
254 for (len_local_t j=0; j<n_local; j++)
// Column index into `gbl` for local column j (presumably a local-to-global
// block-cyclic mapping -- confirm against fml::bcutils::l2g).
256 const int gj = fml::bcutils::l2g(j, mpi.bf_cols(), g.
npcol(), g.
mycol());
// Walk the local rows one row block (mb rows) at a time.
258 for (len_local_t i=0; i<m_local; i+=mb)
// Row index into `gbl` for the first row of this block; l2g() is called once
// per block and the rows inside the block are addressed as gi+ii.
260 const int gi = fml::bcutils::l2g(i, mpi.bf_rows(), g.
nprow(), g.
myrow());
// Copy at most mb rows, stopping early at the end of the local piece.
262 for (
int ii=0; ii<mb && ii+i<m_local; ii++)
// NOTE(review): gi and gj are int, so the offset gi+ii + m*gj is evaluated in
// the common type of int and len_t; if len_t is a 32-bit type this can
// overflow for very large matrices -- confirm.
263 sub[i+ii + m_local*j] = (REAL_OUT) gbl[gi+ii + m*gj];
287 template <
typename REAL_IN,
typename REAL_OUT>
290 if (mpi_in.get_grid().ictxt() != mpi_out.get_grid().ictxt())
291 throw std::runtime_error(
"mpimat objects must be distributed on the same process grid");
295 size_t len = (size_t) mpi_in.nrows_local() * mpi_in.ncols_local();