fml  0.1-0
Fused Matrix Library
kernelfuns.hh
// This file is part of fml which is released under the Boost Software
// License, Version 1.0. See accompanying file LICENSE or copy at
// https://www.boost.org/LICENSE_1_0.txt

#ifndef FML_GPU_INTERNALS_KERNELFUNS_H
#define FML_GPU_INTERNALS_KERNELFUNS_H
#pragma once


#include <cuda_runtime.h>

#include "../../_internals/types.hh"

#include "../../_internals/arraytools/src/arraytools.cuh"

#include "../internals/atomics.hh"


// Internal CUDA kernels backing fml's GPU matrix operations. All matrix
// kernels assume column-major storage (element (i, j) of an m-by-n matrix
// lives at data[i + m*j]) and map the x thread dimension to rows and the
// y thread dimension to columns.
namespace fml
{
  namespace kernelfuns
  {
    // Reverse the rows of an m-by-n matrix in place: the thread at (i, j)
    // with i < m/2 swaps elements (i, j) and (m-i-1, j).
    template <typename REAL>
    __global__ void kernel_rev_rows(const len_t m, const len_t n, REAL *data)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m/2 && j < n)
      {
        REAL tmp = data[i + m*j];
        data[i + m*j] = data[m-i-1 + m*j];
        data[m-i-1 + m*j] = tmp;
      }
    }
    
    
    // Reverse the columns of an m-by-n matrix in place: the thread at (i, j)
    // with j < n/2 swaps elements (i, j) and (i, n-j-1).
    template <typename REAL>
    __global__ void kernel_rev_cols(const len_t m, const len_t n, REAL *data)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n/2)
      {
        REAL tmp = data[i + m*j];
        data[i + m*j] = data[i + m*(n-j-1)];
        data[i + m*(n-j-1)] = tmp;
      }
    }
    
    
    // Fill the matrix with the identity pattern: 1 on the main diagonal and
    // 0 everywhere else (rectangular m-by-n matrices are handled too).
    template <typename REAL>
    __global__ void kernel_fill_eye(const len_t m, const len_t n, REAL *data)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
      {
        if (i == j)
          data[i + m*j] = (REAL) 1;
        else
          data[i + m*j] = (REAL) 0;
      }
    }
    
    
    // Zero the matrix and fill its main diagonal from the length-size
    // vector v, recycling v (via i % size) when v is shorter than the
    // diagonal.
    template <typename REAL>
    __global__ void kernel_fill_diag(const len_t size, const REAL *v, const len_t m, const len_t n, REAL *data)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
      {
        if (i == j)
          data[i + m*j] = v[i % size];
        else
          data[i + m*j] = (REAL) 0;
      }
    }
    
    
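    // Illustration of the recycling (not part of the original header; the
    // values a, b and the 4-by-4 shape are chosen just for this example):
    // with size = 2 and v = {a, b}, diagonal entry i gets v[i % 2], giving
    //
    //   a 0 0 0
    //   0 b 0 0
    //   0 0 a 0
    //   0 0 0 b
    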
    // Fill every element of the matrix with the constant v.
    template <typename REAL>
    __global__ void kernel_fill_val(const REAL v, const len_t m, const len_t n, REAL *data)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
        data[i + m*j] = v;
    }
    
    
    // __half overload of kernel_fill_linspace: the spacing is computed in
    // float precision and then converted back to __half.
    static __global__ void kernel_fill_linspace(const __half start, const __half stop, const len_t m, const len_t n, __half *data)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
      {
        float v_f = ((float)(stop-start))/((float) m*n-1);
        __half v = (__half) v_f;
        len_t ind = i + m*j;
        data[ind] = v*__int2half_rz(ind) + start;
      }
    }
    
    // Fill the matrix with m*n evenly spaced values from start to stop,
    // assigned in column-major order: element ind = i + m*j receives
    // start + ind*(stop - start)/(m*n - 1).
    template <typename REAL>
    __global__ void kernel_fill_linspace(const REAL start, const REAL stop, const len_t m, const len_t n, REAL *data)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
      {
        REAL v = (stop-start)/((REAL) m*n-1);
        len_t ind = i + m*j;
        data[ind] = v*((REAL) ind) + start;
      }
    }
    
    
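    // Worked example (illustrative only): filling a 2-by-3 matrix from
    // start = 0 to stop = 1 gives spacing (1 - 0)/(2*3 - 1) = 0.2, so the
    // column-major result is
    //
    //   0.0  0.4  0.8
    //   0.2  0.6  1.0
    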
    // Affinely rescale values in [0, 1] (e.g. uniform random draws) to the
    // interval [min, max], in place.
    template <typename REAL>
    __global__ void kernel_fill_runif_update(const REAL min, const REAL max, const len_t m, const len_t n, REAL *data)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
        data[i + m*j] = min + (max - min)*data[i + m*j];
    }
    
    
    // Copy the main diagonal into v, which must hold at least min(m, n)
    // elements.
    template <typename REAL>
    __global__ void kernel_diag(const len_t m, const len_t n, const REAL *data, REAL *v)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n && i == j)
        v[i] = data[i + m*j];
    }
    
    
    // Copy the antidiagonal into v, read from the bottom-left element
    // (m-1, 0) toward the top-right.
    template <typename REAL>
    __global__ void kernel_antidiag(const len_t m, const len_t n, const REAL *data, REAL *v)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n && m-1-i == j)
        v[j] = data[i + m*j];
    }
    
    
    // Multiply every element by the scalar s.
    template <typename REAL>
    __global__ void kernel_scale(const REAL s, const len_t m, const len_t n, REAL *data)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
        data[i + m*j] *= s;
    }
    
    
    // Raise every element to the power p.
    template <typename REAL>
    __global__ void kernel_pow(const REAL p, const len_t m, const len_t n, REAL *data)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
        data[i + m*j] = pow(data[i + m*j], p);
    }
    
    
    // Sum all len elements into *s via atomic accumulation; *s should be
    // zero-initialized before launch.
    template <typename REAL>
    __global__ void kernel_sum(const len_t len, const REAL *data, REAL *s)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      
      if (i < len)
        atomicAdd(s, data[i]);
    }
    
    
    // Global max via atomics. CUDA has no built-in atomicMax for floating
    // point types, so the float and double overloads use the library's
    // atomics::atomicMaxf; the generic template covers integer types with
    // CUDA's atomicMax. *mx should be seeded with a value no greater than
    // the true maximum (e.g. the first element) before launch.
    static __global__ void kernel_max(const len_t len, const float *data, float *mx)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      
      if (i < len)
        atomics::atomicMaxf(mx, data[i]);
    }
    
    static __global__ void kernel_max(const len_t len, const double *data, double *mx)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      
      if (i < len)
        atomics::atomicMaxf(mx, data[i]);
    }
    
    template <typename T>
    __global__ void kernel_max(const len_t len, const T *data, T *mx)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      
      if (i < len)
        atomicMax(mx, data[i]);
    }
    
    
    // Global min, mirroring kernel_max: atomics::atomicMinf for float and
    // double, CUDA's atomicMin for integer types. *mn should be seeded with
    // a value no less than the true minimum before launch.
    static __global__ void kernel_min(const len_t len, const float *data, float *mn)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      
      if (i < len)
        atomics::atomicMinf(mn, data[i]);
    }
    
    static __global__ void kernel_min(const len_t len, const double *data, double *mn)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      
      if (i < len)
        atomics::atomicMinf(mn, data[i]);
    }
    
    template <typename T>
    __global__ void kernel_min(const len_t len, const T *data, T *mn)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      
      if (i < len)
        atomicMin(mn, data[i]);
    }
    
    
    // Set *has_inf to 1 if any element is infinite; *has_inf should start
    // at 0.
    template <typename REAL>
    __global__ void kernel_any_inf(const len_t m, const len_t n, const REAL *data, int *has_inf)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
      {
        if (isinf(data[i + m*j]))
          atomicMax(has_inf, 1);
      }
    }
    
    
    // Set *has_nan to 1 if any element is NaN; *has_nan should start at 0.
    template <typename REAL>
    __global__ void kernel_any_nan(const len_t m, const len_t n, const REAL *data, int *has_nan)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
      {
        if (isnan(data[i + m*j]))
          atomicMax(has_nan, 1);
      }
    }
    
    
    // Compare x and y elementwise using the library's floating point
    // comparison (arraytools::fltcmp_gpu::eq). *all_eq should start at 1;
    // it is atomically cleared to 0 on any mismatch.
    template <typename REAL>
    __global__ void kernel_all_eq(const len_t m, const len_t n, const REAL *x, const REAL *y, int *all_eq)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
      {
        bool all_eq_local;
        arraytools::fltcmp_gpu::eq(x + i + m*j, y + i + m*j, &all_eq_local);
        if (!all_eq_local)
          atomicMin(all_eq, 0);
      }
    }
    
    
    // Copy one m-by-n matrix into another, converting the element type.
    // The __half overloads convert through the __half2float/__float2half
    // intrinsics; the generic template casts directly.
    static __global__ void kernel_copy(len_t m, len_t n, __half *in, float *out)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
        out[i + m*j] = __half2float(in[i + m*j]);
    }
    
    static __global__ void kernel_copy(len_t m, len_t n, float *in, __half *out)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
        out[i + m*j] = __float2half(in[i + m*j]);
    }
    
    template <typename REAL_IN, typename REAL_OUT>
    __global__ void kernel_copy(len_t m, len_t n, REAL_IN *in, REAL_OUT *out)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n)
        out[i + m*j] = (REAL_OUT) in[i + m*j];
    }
    
    
    // Copy row `row` of the matrix into the length-n vector v.
    template <typename REAL>
    __global__ void kernel_get_row(const len_t row, const len_t m, const len_t n, const REAL *data, REAL *v)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n && i == row)
        v[j] = data[i + m*j];
    }
    
    // Overwrite row `row` of the matrix with the length-n vector v.
    template <typename REAL>
    __global__ void kernel_set_row(const len_t row, const len_t m, const len_t n, REAL *data, const REAL *v)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n && i == row)
        data[i + m*j] = v[j];
    }
    
    // Copy column `col` of the matrix into the length-m vector v.
    template <typename REAL>
    __global__ void kernel_get_col(const len_t col, const len_t m, const len_t n, const REAL *data, REAL *v)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n && j == col)
        v[i] = data[i + m*j];
    }
    
    // Overwrite column `col` of the matrix with the length-m vector v.
    template <typename REAL>
    __global__ void kernel_set_col(const len_t col, const len_t m, const len_t n, REAL *data, const REAL *v)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      int j = blockDim.y*blockIdx.y + threadIdx.y;
      
      if (i < m && j < n && j == col)
        data[i + m*j] = v[i];
    }
    
    
    // Replace each element with the square root of its absolute value.
    template <typename REAL>
    __global__ void kernel_root_abs(const len_t len, REAL *x)
    {
      int i = blockDim.x*blockIdx.x + threadIdx.x;
      
      if (i < len)
        x[i] = sqrt(fabs(x[i]));
    }
  }
}

#endif
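
// ----------------------------------------------------------------------------
// Usage sketch (illustrative only; not part of the original header). It shows
// the launch geometry the 2D kernels above expect: x covers rows, y covers
// columns, and the grid is rounded up to span an m-by-n matrix. The 16x16
// block shape, the include path, and the standalone main() are assumptions
// made for this example; within fml these kernels are launched internally by
// the library's GPU classes.
//
//   #include <cuda_runtime.h>
//   #include "kernelfuns.hh"
//
//   int main()
//   {
//     const int m = 5, n = 4;
//     float *x;
//     cudaMalloc(&x, m*n*sizeof(*x));
//
//     // one thread per element, grid rounded up to cover the matrix
//     dim3 block(16, 16);
//     dim3 grid((m + block.x - 1)/block.x, (n + block.y - 1)/block.y);
//     fml::kernelfuns::kernel_fill_eye<<<grid, block>>>(m, n, x);
//     cudaDeviceSynchronize();
//
//     cudaFree(x);
//     return 0;
//   }
// ----------------------------------------------------------------------------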