5 #ifndef FML_GPU_INTERNALS_GPU_UTILS_H
6 #define FML_GPU_INTERNALS_GPU_UTILS_H
#include <cstddef>

#include "../../_internals/types.hh"

#include "../arch/arch.hh"
#include "launcher.hh"
/**
 * Device kernel: copy all or a triangular part of the m-by-n matrix A into B.
 *
 * Each thread handles at most one element. The x launch dimension supplies the
 * row index i and the y launch dimension supplies the column index j; element
 * (i, j) lives at offset i + ld*j of its array.
 *
 * @param uplo 'A' copies every element, 'U' copies elements with i <= j,
 *             'L' copies elements with i >= j. Any other value copies nothing.
 * @param m    Number of rows to consider.
 * @param n    Number of columns to consider.
 * @param A    Source array (read only).
 * @param lda  Stride between consecutive columns of A.
 * @param B    Destination array.
 * @param ldb  Stride between consecutive columns of B.
 */
template <typename REAL>
__global__ void kernel_lacpy(const char uplo, const len_t m, const len_t n,
  const REAL *A, const len_t lda, REAL *B, const len_t ldb)
{
  const len_t i = blockDim.x*blockIdx.x + threadIdx.x;
  const len_t j = blockDim.y*blockIdx.y + threadIdx.y;
  
  // Threads mapped past the matrix edge have nothing to do.
  if (i >= m || j >= n)
    return;
  
  const bool selected = (uplo == 'A')
    || (uplo == 'U' && i <= j)
    || (uplo == 'L' && i >= j);
  
  if (selected)
  {
    // Compute offsets in size_t so that ld*j cannot wrap around should len_t
    // be narrower than the address space (NOTE(review): len_t is declared in
    // types.hh, which is not visible here).
    const size_t row = static_cast<size_t>(i);
    const size_t col = static_cast<size_t>(j);
    
    B[row + static_cast<size_t>(ldb)*col] = A[row + static_cast<size_t>(lda)*col];
  }
}
/**
 * Host wrapper: launch kernel_lacpy to copy all or a triangular part of the
 * m-by-n matrix A into B.
 *
 * The launch is asynchronous and no launch status is checked here.
 *
 * @param uplo Forwarded to the kernel: 'A' (all), 'U' (i <= j) or 'L' (i >= j).
 * @param m    Number of rows.
 * @param n    Number of columns.
 * @param A    Source array; dereferenced by the kernel, so it must be
 *             device-accessible.
 * @param lda  Stride between consecutive columns of A.
 * @param B    Destination array; must be device-accessible.
 * @param ldb  Stride between consecutive columns of B.
 */
template <typename REAL>
void lacpy(const char uplo, const len_t m, const len_t n, const REAL *A,
  const len_t lda, REAL *B, const len_t ldb)
{
  // An empty matrix has nothing to copy. Returning early also avoids issuing
  // a launch whose grid might have a zero extent, which the runtime rejects
  // (presumably dim_grid derives block counts from m and n - confirm in
  // launcher.hh).
  if (m <= 0 || n <= 0)
    return;
  
  auto dim_block = fml::kernel_launcher::dim_block2();
  auto dim_grid = fml::kernel_launcher::dim_grid(m, n);
  internals::kernel_lacpy<<<dim_grid, dim_block>>>(uplo, m, n, A, lda, B, ldb);
}
/**
 * Device kernel: set a triangle (and optionally the diagonal) of the m-by-n
 * matrix A to zero, in place.
 *
 * Each thread handles at most one element. The x launch dimension supplies the
 * row index i and the y launch dimension supplies the column index j; element
 * (i, j) lives at offset i + lda*j.
 *
 * @param uplo 'U' zeroes elements with i < j, 'L' zeroes elements with i > j.
 *             Any other value selects no off-diagonal elements.
 * @param diag If true, elements with i == j are zeroed as well, independently
 *             of uplo.
 * @param m    Number of rows to consider.
 * @param n    Number of columns to consider.
 * @param A    Array modified in place.
 * @param lda  Stride between consecutive columns of A.
 */
template <typename REAL>
__global__ void kernel_tri2zero(const char uplo, const bool diag,
  const len_t m, const len_t n, REAL *A, const len_t lda)
{
  const len_t i = blockDim.x*blockIdx.x + threadIdx.x;
  const len_t j = blockDim.y*blockIdx.y + threadIdx.y;
  
  // Threads mapped past the matrix edge have nothing to do.
  if (i >= m || j >= n)
    return;
  
  const bool selected = (diag && i == j)
    || (uplo == 'U' && i < j)
    || (uplo == 'L' && i > j);
  
  if (selected)
  {
    // Compute the offset in size_t so that lda*j cannot wrap around should
    // len_t be narrower than the address space (NOTE(review): len_t is
    // declared in types.hh, which is not visible here).
    const size_t offset = static_cast<size_t>(i)
      + static_cast<size_t>(lda)*static_cast<size_t>(j);
    
    A[offset] = (REAL) 0.0;
  }
}
/**
 * Host wrapper: launch kernel_tri2zero to zero a triangle (and optionally the
 * diagonal) of the m-by-n matrix A in place.
 *
 * The launch is asynchronous and no launch status is checked here.
 *
 * @param uplo Forwarded to the kernel: 'U' zeroes i < j, 'L' zeroes i > j.
 * @param diag If true, the diagonal (i == j) is zeroed as well.
 * @param m    Number of rows.
 * @param n    Number of columns.
 * @param A    Array modified in place; dereferenced by the kernel, so it must
 *             be device-accessible.
 * @param lda  Stride between consecutive columns of A.
 */
template <typename REAL>
void tri2zero(const char uplo, const bool diag, const len_t m, const len_t n,
  REAL *A, const len_t lda)
{
  // An empty matrix has nothing to zero. Returning early also avoids issuing
  // a launch whose grid might have a zero extent, which the runtime rejects
  // (presumably dim_grid derives block counts from m and n - confirm in
  // launcher.hh).
  if (m <= 0 || n <= 0)
    return;
  
  auto dim_block = fml::kernel_launcher::dim_block2();
  auto dim_grid = fml::kernel_launcher::dim_grid(m, n);
  internals::kernel_tri2zero<<<dim_grid, dim_block>>>(uplo, diag, m, n, A, lda);
}