5 #ifndef FML_GPU_LINALG_LINALG_NORM_H
6 #define FML_GPU_LINALG_LINALG_NORM_H
12 #include "../arch/arch.hh"
14 #include "../internals/atomics.hh"
15 #include "../internals/gpu_utils.hh"
16 #include "../internals/gpuscalar.hh"
17 #include "../internals/kernelfuns.hh"
19 #include "../gpumat.hh"
21 #include "linalg_qr.hh"
22 #include "linalg_svd.hh"
31 template <
typename REAL>
32 __global__
void kernel_norm_1(
const len_t m,
const len_t n,
const REAL *x, REAL *macs, REAL *norm)
34 int i = blockDim.x*blockIdx.x + threadIdx.x;
35 int j = blockDim.y*blockIdx.y + threadIdx.y;
39 REAL tmp = fabs(x[i + m*j]);
40 atomicAdd(macs + j, tmp);
41 atomics::atomicMaxf(norm, macs[j]);
60 template <
typename REAL>
68 kernel_norm_1<<<x.get_griddim(), x.get_blockdim()>>>(x.
nrows(),
71 norm_gpu.get_val(&norm);
80 template <
typename REAL>
81 __global__
void kernel_norm_I(
const len_t m,
const len_t n,
const REAL *x, REAL *mars, REAL *norm)
83 int i = blockDim.x*blockIdx.x + threadIdx.x;
84 int j = blockDim.y*blockIdx.y + threadIdx.y;
88 REAL tmp = fabs(x[i + m*j]);
89 atomicAdd(mars + i, tmp);
90 atomics::atomicMaxf(norm, mars[i]);
109 template <
typename REAL>
117 kernel_norm_I<<<x.get_griddim(), x.get_blockdim()>>>(x.
nrows(),
120 norm_gpu.get_val(&norm);
129 template <
typename REAL>
130 __global__
void kernel_norm_F_sq(
const len_t m,
const len_t n,
const REAL *x, REAL *norm)
132 int i = blockDim.x*blockIdx.x + threadIdx.x;
133 int j = blockDim.y*blockIdx.y + threadIdx.y;
137 REAL tmp = x[i + m*j] * x[i + m*j];
138 atomicAdd(norm, tmp);
152 template <
typename REAL>
158 kernel_norm_F_sq<<<x.get_griddim(), x.get_blockdim()>>>(x.
nrows(),
161 norm_gpu.get_val(&norm);
170 template <
typename REAL>
171 __global__
void kernel_norm_M(
const len_t m,
const len_t n,
const REAL *x, REAL *norm)
173 int i = blockDim.x*blockIdx.x + threadIdx.x;
174 int j = blockDim.y*blockIdx.y + threadIdx.y;
178 REAL tmp = fabs(x[i + m*j]);
179 atomics::atomicMaxf(norm, tmp);
193 template <
typename REAL>
199 kernel_norm_M<<<x.get_griddim(), x.get_blockdim()>>>(x.
nrows(),
202 norm_gpu.get_val(&norm);
227 template <
typename REAL>
242 template <
typename REAL>
243 REAL cond_square_internals(
const char norm,
gpumat<REAL> &x)
245 const len_t n = x.
nrows();
264 template <
typename REAL>
265 REAL cond_nonsquare_internals(
const char norm, gpumat<REAL> &x)
267 const len_t m = x.nrows();
268 const len_t n = x.ncols();
270 auto c = x.get_card();
273 gpublas_status_t check;
290 check = gpublas::trsm(c->blas_handle(), GPUBLAS_SIDE_LEFT,
291 GPUBLAS_FILL_U, GPUBLAS_OP_N, GPUBLAS_DIAG_NON_UNIT, n, n,
292 (REAL)1, R.data_ptr(), n, x.data_ptr(), m);
293 gpublas::err::check_ret(check,
"trsm");
295 gpu_utils::lacpy(
'U', n, n, x.data_ptr(), m, R.data_ptr(), n);
315 check = gpublas::trsm(c->blas_handle(), GPUBLAS_SIDE_LEFT,
316 GPUBLAS_FILL_L, GPUBLAS_OP_N, GPUBLAS_DIAG_NON_UNIT, m, m,
317 (REAL)1, L.data_ptr(), m, x.data_ptr(), m);
318 gpublas::err::check_ret(check,
"trsm");
320 gpu_utils::lacpy(
'L', m, m, x.data_ptr(), m, L.data_ptr(), m);
350 template <
typename REAL>
354 return cond_square_internals(
'1', x);
356 return cond_nonsquare_internals(
'1', x);
379 template <
typename REAL>
383 return cond_square_internals(
'I', x);
385 return cond_nonsquare_internals(
'I', x);
407 template <
typename REAL>
420 kernelfuns::kernel_min_nz<<<s.get_griddim(), s.get_blockdim()>>>(s.
size(),
422 min_gpu.get_val(&min);