fml  0.1-0
Fused Matrix Library
gpu_utils.hh
1 // This file is part of fml which is released under the Boost Software
2 // License, Version 1.0. See accompanying file LICENSE or copy at
3 // https://www.boost.org/LICENSE_1_0.txt
4 
5 #ifndef FML_GPU_INTERNALS_GPU_UTILS_H
6 #define FML_GPU_INTERNALS_GPU_UTILS_H
7 #pragma once
8 
9 
#include <cstddef>

#include "../../_internals/types.hh"

#include "../arch/arch.hh"
#include "launcher.hh"
14 
15 
16 namespace fml
17 {
18  namespace gpu_utils
19  {
20  namespace internals
21  {
/**
 * @brief Device kernel that copies all, or one triangle, of A into B.
 *
 * Each thread handles at most one element (i, j): i comes from the x
 * dimension of the launch and j from the y dimension. Elements are addressed
 * as A[i + lda*j] and B[i + ldb*j], i.e. column-major storage with leading
 * dimensions lda and ldb.
 *
 * @param[in] uplo Selects what is copied: 'A' copies every element, 'U' only
 *   elements with i <= j, 'L' only elements with i >= j. Any other value
 *   copies nothing.
 * @param[in] m,n Number of rows and columns considered.
 * @param[in] A Source array, read in device code.
 * @param[in] lda Leading dimension of A.
 * @param[out] B Destination array, written in device code.
 * @param[in] ldb Leading dimension of B.
 *
 * @tparam REAL Element type of A and B.
 */
template <typename REAL>
__global__ void kernel_lacpy(const char uplo, const len_t m,
  const len_t n, const REAL *A, const len_t lda, REAL *B, const len_t ldb)
{
  const len_t i = blockDim.x*blockIdx.x + threadIdx.x;
  const len_t j = blockDim.y*blockIdx.y + threadIdx.y;

  // Threads that fall outside the m x n range have no element to handle.
  if (i >= m || j >= n)
    return;

  const bool selected = (uplo == 'A')
    || (uplo == 'U' && i <= j)
    || (uplo == 'L' && i >= j);

  if (selected)
  {
    // Form the linear offsets in std::size_t: if len_t is a narrow integer
    // type, the product of the leading dimension and the column index can
    // exceed its range for large matrices.
    const std::size_t row = static_cast<std::size_t>(i);
    const std::size_t col = static_cast<std::size_t>(j);
    const std::size_t src = row + static_cast<std::size_t>(lda)*col;
    const std::size_t dst = row + static_cast<std::size_t>(ldb)*col;

    B[dst] = A[src];
  }
}
32  }
33 
/**
 * @brief Copy all, or one triangle, of the m x n array A into B by launching
 * internals::kernel_lacpy.
 *
 * The kernel is launched with only a grid and a block configuration (no
 * explicit stream or shared memory size). This function neither synchronizes
 * with the device nor inspects the launch status afterwards.
 *
 * @param[in] uplo Forwarded to the kernel: 'A' copies every element, 'U'
 *   elements with i <= j, 'L' elements with i >= j.
 * @param[in] m,n Number of rows and columns to consider. If either is not
 *   positive the function returns without launching anything.
 * @param[in] A Source array; dereferenced by the kernel.
 * @param[in] lda Leading dimension of A.
 * @param[out] B Destination array; written by the kernel.
 * @param[in] ldb Leading dimension of B.
 *
 * @tparam REAL Element type of A and B.
 */
template <typename REAL>
void lacpy(const char uplo, const len_t m, const len_t n, const REAL *A,
  const len_t lda, REAL *B, const len_t ldb)
{
  // With an empty range the kernel would write nothing. Returning here also
  // avoids requesting a launch configuration for zero elements, which the
  // runtime rejects when a grid dimension comes out as zero.
  if (m <= 0 || n <= 0)
    return;

  const auto dim_grid = fml::kernel_launcher::dim_grid(m, n);
  const auto dim_block = fml::kernel_launcher::dim_block2();

  internals::kernel_lacpy<<<dim_grid, dim_block>>>(uplo, m, n, A, lda, B, ldb);
}
42 
43 
44 
45  namespace internals
46  {
/**
 * @brief Device kernel that sets one strict triangle of A, and optionally
 * its diagonal, to zero.
 *
 * Each thread handles at most one element (i, j): i comes from the x
 * dimension of the launch and j from the y dimension. Elements are addressed
 * as A[i + lda*j], i.e. column-major storage with leading dimension lda.
 *
 * @param[in] uplo 'U' zeroes elements with i < j, 'L' zeroes elements with
 *   i > j. Any other value selects no off-diagonal elements.
 * @param[in] diag If true, elements with i == j are zeroed as well,
 *   independently of uplo.
 * @param[in] m,n Number of rows and columns considered.
 * @param[in,out] A Array modified in place in device code.
 * @param[in] lda Leading dimension of A.
 *
 * @tparam REAL Element type of A.
 */
template <typename REAL>
__global__ void kernel_tri2zero(const char uplo, const bool diag,
  const len_t m, const len_t n, REAL *A, const len_t lda)
{
  const len_t i = blockDim.x*blockIdx.x + threadIdx.x;
  const len_t j = blockDim.y*blockIdx.y + threadIdx.y;

  // Threads that fall outside the m x n range have no element to handle.
  if (i >= m || j >= n)
    return;

  const bool selected = (diag && i == j)
    || (uplo == 'U' && i < j)
    || (uplo == 'L' && i > j);

  if (selected)
  {
    // Form the linear offset in std::size_t: if len_t is a narrow integer
    // type, lda*j can exceed its range for large matrices.
    const std::size_t offset = static_cast<std::size_t>(i)
      + static_cast<std::size_t>(lda)*static_cast<std::size_t>(j);

    A[offset] = (REAL) 0.0;
  }
}
57  }
58 
/**
 * @brief Zero one strict triangle of the m x n array A, and optionally its
 * diagonal, by launching internals::kernel_tri2zero.
 *
 * The kernel is launched with only a grid and a block configuration (no
 * explicit stream or shared memory size). This function neither synchronizes
 * with the device nor inspects the launch status afterwards.
 *
 * @param[in] uplo Forwarded to the kernel: 'U' zeroes elements with i < j,
 *   'L' zeroes elements with i > j.
 * @param[in] diag If true, elements with i == j are zeroed as well.
 * @param[in] m,n Number of rows and columns to consider. If either is not
 *   positive the function returns without launching anything.
 * @param[in,out] A Array modified in place by the kernel.
 * @param[in] lda Leading dimension of A.
 *
 * @tparam REAL Element type of A.
 */
template <typename REAL>
void tri2zero(const char uplo, const bool diag, const len_t m, const len_t n,
  REAL *A, const len_t lda)
{
  // With an empty range the kernel would write nothing. Returning here also
  // avoids requesting a launch configuration for zero elements, which the
  // runtime rejects when a grid dimension comes out as zero.
  if (m <= 0 || n <= 0)
    return;

  const auto dim_grid = fml::kernel_launcher::dim_grid(m, n);
  const auto dim_block = fml::kernel_launcher::dim_block2();

  internals::kernel_tri2zero<<<dim_grid, dim_block>>>(uplo, diag, m, n, A, lda);
}
67  }
68 }
69 
70 
71 #endif
fml
Core namespace.
Definition: dimops.hh:10