5 #ifndef FML_GPU_INTERNALS_KERNELFUNS_H
6 #define FML_GPU_INTERNALS_KERNELFUNS_H
10 #include <cuda_runtime.h>
12 #include "../../_internals/types.hh"
14 #include "../../_internals/arraytools/src/arraytools.cuh"
16 #include "../internals/atomics.hh"
// Reverse the rows of the m-by-n column-major matrix `data` in place.
// 2D launch: thread (i,j) swaps element (i,j) with (m-i-1,j); only the top
// half of the rows (i < m/2) drives a swap so each pair is exchanged once.
template <typename REAL>
__global__ void kernel_rev_rows(const len_t m, const len_t n, REAL *data)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  // guard the grid tail and restrict to the top half to avoid double-swapping
  if (i < m/2 && j < n)
  {
    REAL tmp = data[i + m*j];
    data[i + m*j] = data[m-i-1 + m*j];
    data[m-i-1 + m*j] = tmp;
  }
}
// Reverse the columns of the m-by-n column-major matrix `data` in place.
// 2D launch: thread (i,j) swaps element (i,j) with (i,n-j-1); only the left
// half of the columns (j < n/2) drives a swap so each pair is exchanged once.
template <typename REAL>
__global__ void kernel_rev_cols(const len_t m, const len_t n, REAL *data)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  // guard the grid tail and restrict to the left half to avoid double-swapping
  if (i < m && j < n/2)
  {
    REAL tmp = data[i + m*j];
    data[i + m*j] = data[i + m*(n-j-1)];
    data[i + m*(n-j-1)] = tmp;
  }
}
// Fill the m-by-n column-major matrix `data` with the identity pattern:
// 1 on the main diagonal, 0 elsewhere. 2D launch, one thread per element.
template <typename REAL>
__global__ void kernel_fill_eye(const len_t m, const len_t n, REAL *data)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  if (i < m && j < n)
  {
    if (i == j)
      data[i + m*j] = (REAL) 1;
    else
      data[i + m*j] = (REAL) 0;
  }
}
// Zero the m-by-n column-major matrix `data` and set its main diagonal from
// the device vector `v` of length `size`, recycling v (v[i % size]) when the
// diagonal is longer than v. 2D launch, one thread per element.
template <typename REAL>
__global__ void kernel_fill_diag(const len_t size, const REAL *v,
  const len_t m, const len_t n, REAL *data)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  if (i < m && j < n)
  {
    if (i == j)
      data[i + m*j] = v[i % size];  // recycle v along the diagonal
    else
      data[i + m*j] = (REAL) 0;
  }
}
// Fill every element of the m-by-n column-major matrix `data` with the
// scalar `v`. 2D launch, one thread per element.
template <typename REAL>
__global__ void kernel_fill_val(const REAL v, const len_t m, const len_t n,
  REAL *data)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  // guard the grid tail before storing
  if (i < m && j < n)
    data[i + m*j] = v;
}
// __half specialization: fill the m-by-n column-major matrix with m*n evenly
// spaced values from `start` to `stop` in column-major (linear index) order.
// The step is computed in float to limit __half round-off in the divisor.
static __global__ void kernel_fill_linspace(const __half start,
  const __half stop, const len_t m, const len_t n, __half *data)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  if (i < m && j < n)
  {
    // step = (stop-start)/(m*n - 1), computed in float precision
    float v_f = ((float)(stop-start))/((float) m*n-1);
    __half v = (__half) v_f;
    const len_t ind = i + m*j;
    data[ind] = v*__int2half_rz(ind) + start;
  }
}
// Fill the m-by-n column-major matrix with m*n evenly spaced values from
// `start` to `stop` in column-major (linear index) order. 2D launch.
template <typename REAL>
__global__ void kernel_fill_linspace(const REAL start, const REAL stop,
  const len_t m, const len_t n, REAL *data)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  if (i < m && j < n)
  {
    // step = (stop-start)/(m*n - 1); element ind gets start + ind*step
    REAL v = (stop-start)/((REAL) m*n-1);
    const len_t ind = i + m*j;
    data[ind] = v*((REAL) ind) + start;
  }
}
// Rescale uniform(0,1) draws already stored in the m-by-n column-major
// matrix `data` to the interval [min, max], in place. 2D launch.
template <typename REAL>
__global__ void kernel_fill_runif_update(const REAL min, const REAL max,
  const len_t m, const len_t n, REAL *data)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  // guard the grid tail before the in-place affine rescale
  if (i < m && j < n)
    data[i + m*j] = min + (max - min)*data[i + m*j];
}
// Extract the main diagonal of the m-by-n column-major matrix `data` into
// the device vector `v` (v[i] = data(i,i)). 2D launch, one thread per element.
template <typename REAL>
__global__ void kernel_diag(const len_t m, const len_t n, const REAL *data,
  REAL *v)
{
  const int gi = blockDim.x*blockIdx.x + threadIdx.x;
  const int gj = blockDim.y*blockIdx.y + threadIdx.y;
  
  // out-of-range threads do nothing
  if (gi >= m || gj >= n)
    return;
  
  if (gi == gj)
    v[gi] = data[gi + m*gj];
}
// Extract the anti-diagonal (bottom-left to top-right) of the m-by-n
// column-major matrix `data` into the device vector `v`:
// v[j] = data(m-1-j, j). 2D launch, one thread per element.
template <typename REAL>
__global__ void kernel_antidiag(const len_t m, const len_t n,
  const REAL *data, REAL *v)
{
  const int gi = blockDim.x*blockIdx.x + threadIdx.x;
  const int gj = blockDim.y*blockIdx.y + threadIdx.y;
  
  // out-of-range threads do nothing
  if (gi >= m || gj >= n)
    return;
  
  if (gj == m-1-gi)
    v[gj] = data[gi + m*gj];
}
// Multiply every element of the m-by-n column-major matrix `data` by the
// scalar `s`, in place. 2D launch, one thread per element.
template <typename REAL>
__global__ void kernel_scale(const REAL s, const len_t m, const len_t n,
  REAL *data)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  // guard the grid tail before the in-place scale
  if (i < m && j < n)
    data[i + m*j] *= s;
}
// Raise every element of the m-by-n column-major matrix `data` to the power
// `p`, in place. 2D launch. Relies on the CUDA math overloads of pow() to
// resolve for the instantiated REAL.
template <typename REAL>
__global__ void kernel_pow(const REAL p, const len_t m, const len_t n,
  REAL *data)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  // guard the grid tail before the in-place power
  if (i < m && j < n)
    data[i + m*j] = pow(data[i + m*j], p);
}
// Accumulate the sum of the length-`len` device vector `data` into *s via
// atomicAdd. 1D launch; *s must be zeroed by the caller beforehand.
// NOTE(review): one atomic per element contends heavily on *s; a block/warp
// partial reduction would scale better — left as-is to preserve behavior.
template <typename REAL>
__global__ void kernel_sum(const len_t len, const REAL *data, REAL *s)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  
  // guard the grid tail
  if (i < len)
    atomicAdd(s, data[i]);
}
// float overload: fold the max of the length-`len` vector into *mx using the
// project-provided float atomic-max helper (see ../internals/atomics.hh).
// 1D launch; caller initializes *mx (e.g. to -inf or data[0]).
static __global__ void kernel_max(const len_t len, const float *data,
  float *mx)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  
  // guard the grid tail
  if (i < len)
    atomics::atomicMaxf(mx, data[i]);
}
// double overload: fold the max of the length-`len` vector into *mx using the
// project-provided atomic-max helper (see ../internals/atomics.hh).
// 1D launch; caller initializes *mx.
static __global__ void kernel_max(const len_t len, const double *data,
  double *mx)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  
  // guard the grid tail
  if (i < len)
    atomics::atomicMaxf(mx, data[i]);
}
// Generic overload (integer types): fold the max of the length-`len` vector
// into *mx with the native atomicMax. 1D launch; caller initializes *mx.
template <typename T>
__global__ void kernel_max(const len_t len, const T *data, T *mx)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  
  // guard the grid tail
  if (i < len)
    atomicMax(mx, data[i]);
}
// float overload: fold the min of the length-`len` vector into *mn using the
// project-provided float atomic-min helper (see ../internals/atomics.hh).
// 1D launch; caller initializes *mn (e.g. to +inf or data[0]).
static __global__ void kernel_min(const len_t len, const float *data,
  float *mn)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  
  // guard the grid tail
  if (i < len)
    atomics::atomicMinf(mn, data[i]);
}
// double overload: fold the min of the length-`len` vector into *mn using the
// project-provided atomic-min helper (see ../internals/atomics.hh).
// 1D launch; caller initializes *mn.
static __global__ void kernel_min(const len_t len, const double *data,
  double *mn)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  
  // guard the grid tail
  if (i < len)
    atomics::atomicMinf(mn, data[i]);
}
// Generic overload (integer types): fold the min of the length-`len` vector
// into *mn with the native atomicMin. 1D launch; caller initializes *mn.
template <typename T>
__global__ void kernel_min(const len_t len, const T *data, T *mn)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  
  // guard the grid tail
  if (i < len)
    atomicMin(mn, data[i]);
}
// Set *has_inf to 1 if any element of the m-by-n column-major matrix `data`
// is +/-infinity; otherwise *has_inf is left untouched. Caller zeroes
// *has_inf beforehand. 2D launch, one thread per element.
template <typename REAL>
__global__ void kernel_any_inf(const len_t m, const len_t n,
  const REAL *data, int *has_inf)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  if (i < m && j < n)
  {
    // atomicMax makes concurrent flag-raises race-free
    if (isinf(data[i + m*j]))
      atomicMax(has_inf, 1);
  }
}
// Set *has_nan to 1 if any element of the m-by-n column-major matrix `data`
// is NaN; otherwise *has_nan is left untouched. Caller zeroes *has_nan
// beforehand. 2D launch, one thread per element.
template <typename REAL>
__global__ void kernel_any_nan(const len_t m, const len_t n,
  const REAL *data, int *has_nan)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  if (i < m && j < n)
  {
    // atomicMax makes concurrent flag-raises race-free
    if (isnan(data[i + m*j]))
      atomicMax(has_nan, 1);
  }
}
// Clear *all_eq to 0 if any corresponding elements of the m-by-n
// column-major matrices x and y compare unequal under the project's
// float-comparison helper (arraytools::fltcmp_gpu::eq). Caller initializes
// *all_eq to 1. 2D launch, one thread per element.
template <typename REAL>
__global__ void kernel_all_eq(const len_t m, const len_t n, const REAL *x,
  const REAL *y, int *all_eq)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  if (i < m && j < n)
  {
    int all_eq_local;
    arraytools::fltcmp_gpu::eq(x + i + m*j, y + i + m*j, &all_eq_local);
    
    // atomicMin makes concurrent flag-clears race-free
    if (!all_eq_local)
      atomicMin(all_eq, 0);
  }
}
// __half -> float copy of an m-by-n column-major matrix, element-wise via
// __half2float. 2D launch, one thread per element.
static __global__ void kernel_copy(len_t m, len_t n, __half *in, float *out)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  // guard the grid tail before the converting store
  if (i < m && j < n)
    out[i + m*j] = __half2float(in[i + m*j]);
}
// float -> __half copy of an m-by-n column-major matrix, element-wise via
// __float2half. 2D launch, one thread per element.
static __global__ void kernel_copy(len_t m, len_t n, float *in, __half *out)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  // guard the grid tail before the converting store
  if (i < m && j < n)
    out[i + m*j] = __float2half(in[i + m*j]);
}
// Generic type-converting copy of an m-by-n column-major matrix: each
// element is cast from REAL_IN to REAL_OUT. 2D launch, one thread per
// element.
template <typename REAL_IN, typename REAL_OUT>
__global__ void kernel_copy(len_t m, len_t n, REAL_IN *in, REAL_OUT *out)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  int j = blockDim.y*blockIdx.y + threadIdx.y;
  
  // guard the grid tail before the converting store
  if (i < m && j < n)
    out[i + m*j] = (REAL_OUT) in[i + m*j];
}
// Copy row `row` of the m-by-n column-major matrix `data` into the device
// vector `v` of length n. 2D launch, one thread per element.
template <typename REAL>
__global__ void kernel_get_row(const len_t row, const len_t m, const len_t n,
  const REAL *data, REAL *v)
{
  const int gi = blockDim.x*blockIdx.x + threadIdx.x;
  const int gj = blockDim.y*blockIdx.y + threadIdx.y;
  
  // out-of-range threads do nothing
  if (gi >= m || gj >= n)
    return;
  
  if (gi == row)
    v[gj] = data[gi + m*gj];
}
// Overwrite row `row` of the m-by-n column-major matrix `data` with the
// device vector `v` of length n. 2D launch, one thread per element.
template <typename REAL>
__global__ void kernel_set_row(const len_t row, const len_t m, const len_t n,
  REAL *data, const REAL *v)
{
  const int gi = blockDim.x*blockIdx.x + threadIdx.x;
  const int gj = blockDim.y*blockIdx.y + threadIdx.y;
  
  // out-of-range threads do nothing
  if (gi >= m || gj >= n)
    return;
  
  if (gi == row)
    data[gi + m*gj] = v[gj];
}
// Copy column `col` of the m-by-n column-major matrix `data` into the device
// vector `v` of length m. 2D launch, one thread per element.
template <typename REAL>
__global__ void kernel_get_col(const len_t col, const len_t m, const len_t n,
  const REAL *data, REAL *v)
{
  const int gi = blockDim.x*blockIdx.x + threadIdx.x;
  const int gj = blockDim.y*blockIdx.y + threadIdx.y;
  
  // out-of-range threads do nothing
  if (gi >= m || gj >= n)
    return;
  
  if (gj == col)
    v[gi] = data[gi + m*gj];
}
// Overwrite column `col` of the m-by-n column-major matrix `data` with the
// device vector `v` of length m. 2D launch, one thread per element.
template <typename REAL>
__global__ void kernel_set_col(const len_t col, const len_t m, const len_t n,
  REAL *data, const REAL *v)
{
  const int gi = blockDim.x*blockIdx.x + threadIdx.x;
  const int gj = blockDim.y*blockIdx.y + threadIdx.y;
  
  // out-of-range threads do nothing
  if (gi >= m || gj >= n)
    return;
  
  if (gj == col)
    data[gi + m*gj] = v[gi];
}
// Replace each element of the length-`len` device vector x with
// sqrt(|x[i]|), in place. 1D launch. Relies on CUDA math overloads of
// sqrt()/fabs() to resolve for the instantiated REAL.
template <typename REAL>
__global__ void kernel_root_abs(const len_t len, REAL *x)
{
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  
  // guard the grid tail before the in-place update
  if (i < len)
    x[i] = sqrt(fabs(x[i]));
}