CUB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
block_discontinuity.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include "../util_type.cuh"
37 #include "../util_ptx.cuh"
38 #include "../util_namespace.cuh"
39 
41 CUB_NS_PREFIX
42 
44 namespace cub {
45 
102 template <
103  typename T,
104  int BLOCK_DIM_X,
105  int BLOCK_DIM_Y = 1,
106  int BLOCK_DIM_Z = 1,
107  int PTX_ARCH = CUB_PTX_ARCH>
109 {
110 private:
111 
112  /******************************************************************************
113  * Constants and type definitions
114  ******************************************************************************/
115 
117  enum
118  {
120  BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
121  };
122 
123 
125  struct _TempStorage
126  {
127  T first_items[BLOCK_THREADS];
128  T last_items[BLOCK_THREADS];
129  };
130 
131 
132  /******************************************************************************
133  * Utility methods
134  ******************************************************************************/
135 
137  __device__ __forceinline__ _TempStorage& PrivateStorage()
138  {
139  __shared__ _TempStorage private_storage;
140  return private_storage;
141  }
142 
143 
145  template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
146  struct ApplyOp
147  {
148  // Apply flag operator
149  static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
150  {
151  return flag_op(a, b, idx);
152  }
153  };
154 
156  template <typename FlagOp>
157  struct ApplyOp<FlagOp, false>
158  {
159  // Apply flag operator
160  static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
161  {
162  return flag_op(a, b);
163  }
164  };
165 
167  template <int ITERATION, int MAX_ITERATIONS>
168  struct Iterate
169  {
170  // Head flags
171  template <
172  int ITEMS_PER_THREAD,
173  typename FlagT,
174  typename FlagOp>
175  static __device__ __forceinline__ void FlagHeads(
176  int linear_tid,
177  FlagT (&flags)[ITEMS_PER_THREAD],
178  T (&input)[ITEMS_PER_THREAD],
179  T (&preds)[ITEMS_PER_THREAD],
180  FlagOp flag_op)
181  {
182  preds[ITERATION] = input[ITERATION - 1];
183 
184  flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
185  flag_op,
186  preds[ITERATION],
187  input[ITERATION],
188  (linear_tid * ITEMS_PER_THREAD) + ITERATION);
189 
190  Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
191  }
192 
193  // Tail flags
194  template <
195  int ITEMS_PER_THREAD,
196  typename FlagT,
197  typename FlagOp>
198  static __device__ __forceinline__ void FlagTails(
199  int linear_tid,
200  FlagT (&flags)[ITEMS_PER_THREAD],
201  T (&input)[ITEMS_PER_THREAD],
202  FlagOp flag_op)
203  {
204  flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
205  flag_op,
206  input[ITERATION],
207  input[ITERATION + 1],
208  (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
209 
210  Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
211  }
212 
213  };
214 
216  template <int MAX_ITERATIONS>
217  struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
218  {
219  // Head flags
220  template <
221  int ITEMS_PER_THREAD,
222  typename FlagT,
223  typename FlagOp>
224  static __device__ __forceinline__ void FlagHeads(
225  int linear_tid,
226  FlagT (&flags)[ITEMS_PER_THREAD],
227  T (&input)[ITEMS_PER_THREAD],
228  T (&preds)[ITEMS_PER_THREAD],
229  FlagOp flag_op)
230  {}
231 
232  // Tail flags
233  template <
234  int ITEMS_PER_THREAD,
235  typename FlagT,
236  typename FlagOp>
237  static __device__ __forceinline__ void FlagTails(
238  int linear_tid,
239  FlagT (&flags)[ITEMS_PER_THREAD],
240  T (&input)[ITEMS_PER_THREAD],
241  FlagOp flag_op)
242  {}
243  };
244 
245 
246  /******************************************************************************
247  * Thread fields
248  ******************************************************************************/
249 
251  _TempStorage &temp_storage;
252 
254  int linear_tid;
255 
256 
257 public:
258 
260  struct TempStorage : Uninitialized<_TempStorage> {};
261 
262 
263  /******************************************************************/
267 
271  __device__ __forceinline__ BlockDiscontinuity()
272  :
273  temp_storage(PrivateStorage()),
274  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
275  {}
276 
277 
281  __device__ __forceinline__ BlockDiscontinuity(
282  TempStorage &temp_storage)
283  :
284  temp_storage(temp_storage.Alias()),
285  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
286  {}
287 
288 
290  /******************************************************************/
294 
295 
296 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
297 
298  template <
299  int ITEMS_PER_THREAD,
300  typename FlagT,
301  typename FlagOp>
302  __device__ __forceinline__ void FlagHeads(
303  FlagT (&head_flags)[ITEMS_PER_THREAD],
304  T (&input)[ITEMS_PER_THREAD],
305  T (&preds)[ITEMS_PER_THREAD],
306  FlagOp flag_op)
307  {
308  // Share last item
309  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
310 
311  __syncthreads();
312 
313  if (linear_tid == 0)
314  {
315  // Set flag for first thread-item (preds[0] is undefined)
316  head_flags[0] = 1;
317  }
318  else
319  {
320  preds[0] = temp_storage.last_items[linear_tid - 1];
321  head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
322  }
323 
324  // Set head_flags for remaining items
325  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
326  }
327 
328  template <
329  int ITEMS_PER_THREAD,
330  typename FlagT,
331  typename FlagOp>
332  __device__ __forceinline__ void FlagHeads(
333  FlagT (&head_flags)[ITEMS_PER_THREAD],
334  T (&input)[ITEMS_PER_THREAD],
335  T (&preds)[ITEMS_PER_THREAD],
336  FlagOp flag_op,
337  T tile_predecessor_item)
338  {
339  // Share last item
340  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
341 
342  __syncthreads();
343 
344  // Set flag for first thread-item
345  preds[0] = (linear_tid == 0) ?
346  tile_predecessor_item : // First thread
347  temp_storage.last_items[linear_tid - 1];
348 
349  head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
350 
351  // Set head_flags for remaining items
352  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
353  }
354 
355 #endif // DOXYGEN_SHOULD_SKIP_THIS
356 
357 
407  template <
408  int ITEMS_PER_THREAD,
409  typename FlagT,
410  typename FlagOp>
411  __device__ __forceinline__ void FlagHeads(
412  FlagT (&head_flags)[ITEMS_PER_THREAD],
413  T (&input)[ITEMS_PER_THREAD],
414  FlagOp flag_op)
415  {
416  T preds[ITEMS_PER_THREAD];
417  FlagHeads(head_flags, input, preds, flag_op);
418  }
419 
420 
476  template <
477  int ITEMS_PER_THREAD,
478  typename FlagT,
479  typename FlagOp>
480  __device__ __forceinline__ void FlagHeads(
481  FlagT (&head_flags)[ITEMS_PER_THREAD],
482  T (&input)[ITEMS_PER_THREAD],
483  FlagOp flag_op,
484  T tile_predecessor_item)
485  {
486  T preds[ITEMS_PER_THREAD];
487  FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
488  }
489 
490 
491 
493  /******************************************************************/
497 
498 
549  template <
550  int ITEMS_PER_THREAD,
551  typename FlagT,
552  typename FlagOp>
553  __device__ __forceinline__ void FlagTails(
554  FlagT (&tail_flags)[ITEMS_PER_THREAD],
555  T (&input)[ITEMS_PER_THREAD],
556  FlagOp flag_op)
557  {
558  // Share first item
559  temp_storage.first_items[linear_tid] = input[0];
560 
561  __syncthreads();
562 
563  // Set flag for last thread-item
564  tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
565  1 : // Last thread
566  ApplyOp<FlagOp>::FlagT(
567  flag_op,
568  input[ITEMS_PER_THREAD - 1],
569  temp_storage.first_items[linear_tid + 1],
570  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
571 
572  // Set tail_flags for remaining items
573  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
574  }
575 
576 
633  template <
634  int ITEMS_PER_THREAD,
635  typename FlagT,
636  typename FlagOp>
637  __device__ __forceinline__ void FlagTails(
638  FlagT (&tail_flags)[ITEMS_PER_THREAD],
639  T (&input)[ITEMS_PER_THREAD],
640  FlagOp flag_op,
641  T tile_successor_item)
642  {
643  // Share first item
644  temp_storage.first_items[linear_tid] = input[0];
645 
646  __syncthreads();
647 
648  // Set flag for last thread-item
649  T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
650  tile_successor_item : // Last thread
651  temp_storage.first_items[linear_tid + 1];
652 
653  tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
654  flag_op,
655  input[ITEMS_PER_THREAD - 1],
656  successor_item,
657  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
658 
659  // Set tail_flags for remaining items
660  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
661  }
662 
663 
665  /******************************************************************/
669 
670 
731  template <
732  int ITEMS_PER_THREAD,
733  typename FlagT,
734  typename FlagOp>
735  __device__ __forceinline__ void FlagHeadsAndTails(
736  FlagT (&head_flags)[ITEMS_PER_THREAD],
737  FlagT (&tail_flags)[ITEMS_PER_THREAD],
738  T (&input)[ITEMS_PER_THREAD],
739  FlagOp flag_op)
740  {
741  // Share first and last items
742  temp_storage.first_items[linear_tid] = input[0];
743  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
744 
745  __syncthreads();
746 
747  T preds[ITEMS_PER_THREAD];
748 
749  // Set flag for first thread-item
750  preds[0] = temp_storage.last_items[linear_tid - 1];
751  if (linear_tid == 0)
752  {
753  head_flags[0] = 1;
754  }
755  else
756  {
757  head_flags[0] = ApplyOp<FlagOp>::FlagT(
758  flag_op,
759  preds[0],
760  input[0],
761  linear_tid * ITEMS_PER_THREAD);
762  }
763 
764 
765  // Set flag for last thread-item
766  tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
767  1 : // Last thread
768  ApplyOp<FlagOp>::FlagT(
769  flag_op,
770  input[ITEMS_PER_THREAD - 1],
771  temp_storage.first_items[linear_tid + 1],
772  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
773 
774  // Set head_flags for remaining items
775  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
776 
777  // Set tail_flags for remaining items
778  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
779  }
780 
781 
847  template <
848  int ITEMS_PER_THREAD,
849  typename FlagT,
850  typename FlagOp>
851  __device__ __forceinline__ void FlagHeadsAndTails(
852  FlagT (&head_flags)[ITEMS_PER_THREAD],
853  FlagT (&tail_flags)[ITEMS_PER_THREAD],
854  T tile_successor_item,
855  T (&input)[ITEMS_PER_THREAD],
856  FlagOp flag_op)
857  {
858  // Share first and last items
859  temp_storage.first_items[linear_tid] = input[0];
860  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
861 
862  __syncthreads();
863 
864  T preds[ITEMS_PER_THREAD];
865 
866  // Set flag for first thread-item
867  if (linear_tid == 0)
868  {
869  head_flags[0] = 1;
870  }
871  else
872  {
873  preds[0] = temp_storage.last_items[linear_tid - 1];
874  head_flags[0] = ApplyOp<FlagOp>::FlagT(
875  flag_op,
876  preds[0],
877  input[0],
878  linear_tid * ITEMS_PER_THREAD);
879  }
880 
881  // Set flag for last thread-item
882  T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
883  tile_successor_item : // Last thread
884  temp_storage.first_items[linear_tid + 1];
885 
886  tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
887  flag_op,
888  input[ITEMS_PER_THREAD - 1],
889  successor_item,
890  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
891 
892  // Set head_flags for remaining items
893  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
894 
895  // Set tail_flags for remaining items
896  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
897  }
898 
899 
971  template <
972  int ITEMS_PER_THREAD,
973  typename FlagT,
974  typename FlagOp>
975  __device__ __forceinline__ void FlagHeadsAndTails(
976  FlagT (&head_flags)[ITEMS_PER_THREAD],
977  T tile_predecessor_item,
978  FlagT (&tail_flags)[ITEMS_PER_THREAD],
979  T (&input)[ITEMS_PER_THREAD],
980  FlagOp flag_op)
981  {
982  // Share first and last items
983  temp_storage.first_items[linear_tid] = input[0];
984  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
985 
986  __syncthreads();
987 
988  T preds[ITEMS_PER_THREAD];
989 
990  // Set flag for first thread-item
991  preds[0] = (linear_tid == 0) ?
992  tile_predecessor_item : // First thread
993  temp_storage.last_items[linear_tid - 1];
994 
995  head_flags[0] = ApplyOp<FlagOp>::FlagT(
996  flag_op,
997  preds[0],
998  input[0],
999  linear_tid * ITEMS_PER_THREAD);
1000 
1001  // Set flag for last thread-item
1002  tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1003  1 : // Last thread
1004  ApplyOp<FlagOp>::FlagT(
1005  flag_op,
1006  input[ITEMS_PER_THREAD - 1],
1007  temp_storage.first_items[linear_tid + 1],
1008  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
1009 
1010  // Set head_flags for remaining items
1011  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
1012 
1013  // Set tail_flags for remaining items
1014  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
1015  }
1016 
1017 
1090  template <
1091  int ITEMS_PER_THREAD,
1092  typename FlagT,
1093  typename FlagOp>
1094  __device__ __forceinline__ void FlagHeadsAndTails(
1095  FlagT (&head_flags)[ITEMS_PER_THREAD],
1096  T tile_predecessor_item,
1097  FlagT (&tail_flags)[ITEMS_PER_THREAD],
1098  T tile_successor_item,
1099  T (&input)[ITEMS_PER_THREAD],
1100  FlagOp flag_op)
1101  {
1102  // Share first and last items
1103  temp_storage.first_items[linear_tid] = input[0];
1104  temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
1105 
1106  __syncthreads();
1107 
1108  T preds[ITEMS_PER_THREAD];
1109 
1110  // Set flag for first thread-item
1111  preds[0] = (linear_tid == 0) ?
1112  tile_predecessor_item : // First thread
1113  temp_storage.last_items[linear_tid - 1];
1114 
1115  head_flags[0] = ApplyOp<FlagOp>::FlagT(
1116  flag_op,
1117  preds[0],
1118  input[0],
1119  linear_tid * ITEMS_PER_THREAD);
1120 
1121  // Set flag for last thread-item
1122  T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
1123  tile_successor_item : // Last thread
1124  temp_storage.first_items[linear_tid + 1];
1125 
1126  tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
1127  flag_op,
1128  input[ITEMS_PER_THREAD - 1],
1129  successor_item,
1130  (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
1131 
1132  // Set head_flags for remaining items
1133  Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
1134 
1135  // Set tail_flags for remaining items
1136  Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
1137  }
1138 
1139 
1140 
1141 
1143 
1144 };
1145 
1146 
1147 } // CUB namespace
1148 CUB_NS_POSTFIX // Optional outer namespace(s)