fml  0.1-0
Fused Matrix Library
qr_allreduce.hh
1 // This file is part of fml which is released under the Boost Software
2 // License, Version 1.0. See accompanying file LICENSE or copy at
3 // https://www.boost.org/LICENSE_1_0.txt
4 
5 #ifndef FML_PAR_GPU_LINALG_QR_ALLREDUCE_NOGPUDIRECT_H
6 #define FML_PAR_GPU_LINALG_QR_ALLREDUCE_NOGPUDIRECT_H
7 #pragma once
8 
9 
10 #include "../../../_internals/arraytools/src/arraytools.hpp"
11 #include "../../../_internals/restrict.hh"
12 
13 #include "../../../gpu/card.hh"
14 
15 #include "../../internals/mpi_utils.hh"
16 
17 
18 namespace fml
19 {
20 namespace tsqr
21 {
22  namespace internals
23  {
// NOTE(review): module-level shared state. It exists because custom_op_qr()
// below is invoked by MPI through the MPI_User_function interface, which
// carries no user context pointer — so the reduction's working buffers and
// dimensions must be reachable as globals.
// TODO mark inline when cuda gets C++17 support
fml::card_sp_t c;        // active GPU card; set by qr_global_init()
cusolverStatus_t check;  // status of the most recent cuSOLVER call (TODO: currently unchecked)
dim3 griddim, blockdim;  // launch geometry for kernel_stack
int *info_dev;           // device-side geqrf info flag (single int)

bool badinfo;            // latched true if any geqrf() during the reduction reports info != 0
int _m, _n, minmn, mtb;  // local matrix dims; minmn = min(_m,_n); mtb = 2*_m (stacked height)
int lwork;               // geqrf workspace length, in elements of REAL

// Device buffers; one instantiation per floating-point type.
template <typename REAL>
REAL *tallboy;           // mtb x _n buffer holding the two stacked operands
template <typename REAL>
REAL *work;              // geqrf workspace (lwork elements)
template <typename REAL>
REAL *qraux;             // Householder scalars (minmn elements)

// Device-side copies of the two reduction operands (_m x _n each).
template <typename REAL>
REAL *a;
template <typename REAL>
REAL *b;
45 
46 
47 
48  template <typename REAL>
49  void qr_global_cleanup()
50  {
51  c->mem_free(tallboy<REAL>);
52  tallboy<REAL> = NULL;
53 
54  c->mem_free(work<REAL>);
55  work<REAL> = NULL;
56 
57  c->mem_free(qraux<REAL>);
58  qraux<REAL> = NULL;
59 
60  c->mem_free(info_dev);
61 
62  c.reset();
63  }
64 
65 
66 
67  template <typename REAL>
68  static inline int qrworksize(const int m, const int n)
69  {
70  REAL tmp;
71 
72  int info;
73  check = fml::gpulapack::geqrf_buflen(c->lapack_handle(), m, n, &tmp, m, &lwork);
74 
75  // TODO check
76  (void)info;
77 
78  return std::max(lwork, 1);
79  }
80 
81 
82 
// Allocate and initialize all module-level state used by custom_op_qr():
// the card handle, kernel launch geometry, dimensions, and device buffers.
// Must be paired with qr_global_cleanup().
template <typename REAL>
void qr_global_init(fml::card_sp_t c_, int m, int n)
{
  c = c_;
  blockdim = fml::kernel_launcher::dim_block2();
  griddim = fml::kernel_launcher::dim_grid(m, n);
  
  _m = m;
  _n = n;
  minmn = std::min(_m, _n);
  // "tall boy": two m x n operands stacked vertically -> 2m rows
  mtb = 2*_m;
  
  badinfo = false;
  
  tallboy<REAL> = (REAL*) c->mem_alloc((size_t)mtb*_n*sizeof(REAL));
  
  // NOTE: qrworksize() reads the global 'c' assigned above — order matters
  lwork = qrworksize<REAL>(mtb, _n);
  work<REAL> = (REAL*) c->mem_alloc((size_t)lwork*sizeof(REAL));
  
  qraux<REAL> = (REAL*) c->mem_alloc((size_t)minmn*sizeof(REAL));
  
  info_dev = (int*) c->mem_alloc(sizeof(int));
}
106 
107 
108 
109  template <typename REAL>
110  __global__ void kernel_stack(const len_t m, const len_t n, const len_t mtb,
111  const REAL *a, const REAL *b, REAL *tallboy)
112  {
113  int i = blockDim.x*blockIdx.x + threadIdx.x;
114  int j = blockDim.y*blockIdx.y + threadIdx.y;
115 
116  if (i < m && j < n)
117  {
118  tallboy[i + mtb*j] = a[i + m*j];
119  tallboy[m+i + mtb*j] = b[i + m*j];
120  }
121  }
122 
123 
124 
125  template <typename REAL>
126  void custom_op_qr(void *a_, void *b_, int *len, MPI_Datatype *dtype)
127  {
128  (void)len;
129  (void)dtype;
130 
131  REAL *a_cpu = (REAL*)a_;
132  REAL *b_cpu = (REAL*)b_;
133 
134  c->mem_cpu2gpu(a<REAL>, a_cpu, _m*_n*sizeof(REAL));
135  c->mem_cpu2gpu(b<REAL>, b_cpu, _m*_n*sizeof(REAL));
136  c->synch();
137 
138  kernel_stack<<<griddim, blockdim>>>(_m, _n, mtb, a<REAL>, b<REAL>, tallboy<REAL>);
139 
140  int info;
141  c->mem_set(info_dev, 0, sizeof(int));
142  check = fml::gpulapack::geqrf(c->lapack_handle(), mtb, _n, tallboy<REAL>, mtb, qraux<REAL>, work<REAL>, lwork, info_dev);
143  c->mem_gpu2cpu(&info, info_dev, sizeof(int));
144  // TODO check
145  if (info != 0)
146  badinfo = true;
147 
148  c->mem_set(b<REAL>, 0, (size_t)_m*_n*sizeof(REAL));
149  fml::gpu_utils::lacpy('U', _m, _n, tallboy<REAL>, mtb, b<REAL>, _m);
150 
151  c->mem_gpu2cpu(b_cpu, b<REAL>, _m*_n*sizeof(REAL));
152  }
153  }
154 
155 
156 
157  template <typename REAL>
158  void qr_allreduce(const int root, const int m, const int n,
159  REAL *const restrict a_, REAL *const restrict b_, MPI_Comm comm,
160  fml::card_sp_t c_)
161  {
162  int mpi_ret;
163 
164  internals::qr_global_init<REAL>(c_, m, n);
165 
166  internals::a<REAL> = a_;
167  internals::b<REAL> = b_;
168 
169  REAL *a_cpu, *b_cpu;
170  arraytools::alloc(m, n, &a_cpu);
171  arraytools::alloc(m, n, &b_cpu);
172  arraytools::check_alloc(a_cpu, b_cpu);
173 
174  c_->mem_gpu2cpu(a_cpu, a_, m*n*sizeof(REAL));
175  c_->synch();
176 
177  // custom data type
178  MPI_Datatype mat_type;
179  mpi::contig_type(m*n, a_cpu, &mat_type);
180 
181  // custom op + reduce
182  MPI_Op op;
183  const int commutative = 1;
184 
185  MPI_Op_create((MPI_User_function*) internals::custom_op_qr<REAL>, commutative, &op);
186  if (root == mpi::REDUCE_TO_ALL)
187  mpi_ret = MPI_Allreduce(a_cpu, b_cpu, 1, mat_type, op, comm);
188  else
189  mpi_ret = MPI_Reduce(a_cpu, b_cpu, 1, mat_type, op, root, comm);
190 
191  // cleanup and return
192  c_->mem_cpu2gpu(internals::b<REAL>, b_cpu, m*n*sizeof(REAL));
193 
194  MPI_Op_free(&op);
195  MPI_Type_free(&mat_type);
196 
197  internals::qr_global_cleanup<REAL>();
198 
199  arraytools::free(a_cpu);
200  arraytools::free(b_cpu);
201 
202  mpi::check_MPI_ret(mpi_ret);
203  if (internals::badinfo)
204  throw std::runtime_error("unrecoverable error with LAPACK function geqrf() occurred during reduction");
205  }
206 }
207 }
208 
209 
210 #endif
fml
Core namespace.
Definition: dimops.hh:10