template<
typename T,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
bool WARP_TIME_SLICING = false,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
class cub::BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA thread block.
.
- Template Parameters
-
T | The data type to be exchanged. |
BLOCK_DIM_X | The thread block length in threads along the X dimension |
ITEMS_PER_THREAD | The number of items partitioned onto each thread. |
WARP_TIME_SLICING | [optional] When true , only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) |
BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1) |
BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1) |
PTX_ARCH | [optional] The PTX compute capability for which to to specialize this collective, formatted as per the CUDA_ARCH macro (e.g., 350 for sm_35). Useful for determining the collective's storage requirements for a given device from the host. (Default: the value of CUDA_ARCH during the current compiler pass) |
- Overview
- It is commonplace for blocks of threads to rearrange data items between threads. For example, the device-accessible memory subsystem prefers access patterns where data items are "striped" across threads (where consecutive threads access consecutive items), yet most block-wide operations prefer a "blocked" partitioning of items across threads (where consecutive items belong to a single thread).
- BlockExchange supports the following types of data exchanges:
- For multi-dimensional blocks, threads are linearly ranked in row-major order.
- A Simple Example
- Every thread in the block uses the BlockExchange class by first specializing the BlockExchange type, then instantiating an instance with parameters for communication, and finally invoking one or more collective member functions.
- The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 items.
__global__ void ExampleKernel(int *d_data, ...)
{
__shared__ typename BlockExchange::TempStorage temp_storage;
int thread_data[4];
cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
- Suppose the set of striped input
thread_data
across the block of threads is { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }
. The corresponding output thread_data
in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
.
- Performance Considerations
- Proper device-specific padding ensures zero bank conflicts for most types.
Definition at line 116 of file block_exchange.cuh.
|
|
__device__ __forceinline__ | BlockExchange () |
| Collective constructor using a private static allocation of shared memory as temporary storage.
|
|
__device__ __forceinline__ | BlockExchange (TempStorage &temp_storage) |
| Collective constructor using the specified memory allocation as temporary storage. More...
|
|
|
__device__ __forceinline__ void | StripedToBlocked (T items[ITEMS_PER_THREAD]) |
| Transposes data items from striped arrangement to blocked arrangement. More...
|
|
__device__ __forceinline__ void | BlockedToStriped (T items[ITEMS_PER_THREAD]) |
| Transposes data items from blocked arrangement to striped arrangement. More...
|
|
__device__ __forceinline__ void | WarpStripedToBlocked (T items[ITEMS_PER_THREAD]) |
| Transposes data items from warp-striped arrangement to blocked arrangement. More...
|
|
__device__ __forceinline__ void | BlockedToWarpStriped (T items[ITEMS_PER_THREAD]) |
| Transposes data items from blocked arrangement to warp-striped arrangement. More...
|
|
|
template<typename OffsetT > |
__device__ __forceinline__ void | ScatterToBlocked (T items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
| Exchanges data items annotated by rank into blocked arrangement. More...
|
|
template<typename OffsetT > |
__device__ __forceinline__ void | ScatterToStriped (T items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
| Exchanges data items annotated by rank into striped arrangement. More...
|
|
template<typename OffsetT > |
__device__ __forceinline__ void | ScatterToStripedGuarded (T items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD]) |
| Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. More...
|
|
template<typename OffsetT , typename ValidFlag > |
__device__ __forceinline__ void | ScatterToStriped (T items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], ValidFlag is_valid[ITEMS_PER_THREAD]) |
| Exchanges valid data items annotated by rank into striped arrangement. More...
|
|
template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, bool WARP_TIME_SLICING = false, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
Collective constructor using the specified memory allocation as temporary storage.
- Parameters
-
[in] | temp_storage | Reference to memory allocation having layout type TempStorage |
Definition at line 704 of file block_exchange.cuh.
template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, bool WARP_TIME_SLICING = false, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StripedToBlocked |
( |
T |
items[ITEMS_PER_THREAD]) | |
|
|
inline |
Transposes data items from striped arrangement to blocked arrangement.
- A subsequent
__syncthreads()
threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage
) is to be reused or repurposed.
- Snippet
- The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 items.
__global__ void ExampleKernel(int *d_data, ...)
{
__shared__ typename BlockExchange::TempStorage temp_storage;
int thread_data[4];
cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
- Suppose the set of striped input
thread_data
across the block of threads is { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }
after loading from device-accessible memory. The corresponding output thread_data
in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
.
- Parameters
-
[in,out] | items | Items to exchange, converting between striped and blocked arrangements. |
Definition at line 757 of file block_exchange.cuh.
template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, bool WARP_TIME_SLICING = false, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockedToStriped |
( |
T |
items[ITEMS_PER_THREAD]) | |
|
|
inline |
Transposes data items from blocked arrangement to striped arrangement.
- A subsequent
__syncthreads()
threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage
) is to be reused or repurposed.
- Snippet
- The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 items.
__global__ void ExampleKernel(int *d_data, ...)
{
__shared__ typename BlockExchange::TempStorage temp_storage;
int thread_data[4];
...
cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
- Suppose the set of blocked input
thread_data
across the block of threads is { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
. The corresponding output thread_data
in those threads will be { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }
in preparation for storing to device-accessible memory.
- Parameters
-
[in,out] | items | Items to exchange, converting between blocked and striped arrangements. |
Definition at line 803 of file block_exchange.cuh.
template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, bool WARP_TIME_SLICING = false, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::WarpStripedToBlocked |
( |
T |
items[ITEMS_PER_THREAD]) | |
|
|
inline |
Transposes data items from warp-striped arrangement to blocked arrangement.
- A subsequent
__syncthreads()
threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage
) is to be reused or repurposed.
- Snippet
- The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 items.
__global__ void ExampleKernel(int *d_data, ...)
{
__shared__ typename BlockExchange::TempStorage temp_storage;
int thread_data[4];
cub::LoadSWarptriped<LOAD_DEFAULT>(threadIdx.x, d_data, thread_data);
- Suppose the set of warp-striped input
thread_data
across the block of threads is { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }
after loading from device-accessible memory. (The first 128 items are striped across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) The corresponding output thread_data
in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
.
- Parameters
-
[in,out] | items | Items to exchange, converting between warp-striped and blocked arrangements. |
Definition at line 848 of file block_exchange.cuh.
template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, bool WARP_TIME_SLICING = false, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ void cub::BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockedToWarpStriped |
( |
T |
items[ITEMS_PER_THREAD]) | |
|
|
inline |
Transposes data items from blocked arrangement to warp-striped arrangement.
- A subsequent
__syncthreads()
threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage
) is to be reused or repurposed.
- Snippet
- The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 items.
__global__ void ExampleKernel(int *d_data, ...)
{
__shared__ typename BlockExchange::TempStorage temp_storage;
int thread_data[4];
...
cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
- Suppose the set of blocked input
thread_data
across the block of threads is { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
. The corresponding output thread_data
in those threads will be { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }
in preparation for storing to device-accessible memory. (The first 128 items are striped across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
- Parameters
-
[in,out] | items | Items to exchange, converting between blocked and warp-striped arrangements. |
Definition at line 895 of file block_exchange.cuh.
template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, bool WARP_TIME_SLICING = false, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename OffsetT >
__device__ __forceinline__ void cub::BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ScatterToBlocked |
( |
T |
items[ITEMS_PER_THREAD], |
|
|
OffsetT |
ranks[ITEMS_PER_THREAD] |
|
) |
| |
|
inline |
Exchanges data items annotated by rank into blocked arrangement.
- A subsequent
__syncthreads()
threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage
) is to be reused or repurposed.
- Template Parameters
-
OffsetT | [inferred] Signed integer type for local offsets |
- Parameters
-
[in,out] | items | Items to exchange |
[in] | ranks | Corresponding scatter ranks |
Definition at line 918 of file block_exchange.cuh.
template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, bool WARP_TIME_SLICING = false, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename OffsetT >
__device__ __forceinline__ void cub::BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ScatterToStriped |
( |
T |
items[ITEMS_PER_THREAD], |
|
|
OffsetT |
ranks[ITEMS_PER_THREAD] |
|
) |
| |
|
inline |
Exchanges data items annotated by rank into striped arrangement.
- A subsequent
__syncthreads()
threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage
) is to be reused or repurposed.
- Template Parameters
-
OffsetT | [inferred] Signed integer type for local offsets |
- Parameters
-
[in,out] | items | Items to exchange |
[in] | ranks | Corresponding scatter ranks |
Definition at line 935 of file block_exchange.cuh.
template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, bool WARP_TIME_SLICING = false, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename OffsetT >
__device__ __forceinline__ void cub::BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ScatterToStripedGuarded |
( |
T |
items[ITEMS_PER_THREAD], |
|
|
OffsetT |
ranks[ITEMS_PER_THREAD] |
|
) |
| |
|
inline |
Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged.
- A subsequent
__syncthreads()
threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage
) is to be reused or repurposed.
- Template Parameters
-
OffsetT | [inferred] Signed integer type for local offsets |
- Parameters
-
[in,out] | items | Items to exchange |
[in] | ranks | Corresponding scatter ranks |
Definition at line 952 of file block_exchange.cuh.
template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, bool WARP_TIME_SLICING = false, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename OffsetT , typename ValidFlag >
__device__ __forceinline__ void cub::BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::ScatterToStriped |
( |
T |
items[ITEMS_PER_THREAD], |
|
|
OffsetT |
ranks[ITEMS_PER_THREAD], |
|
|
ValidFlag |
is_valid[ITEMS_PER_THREAD] |
|
) |
| |
|
inline |
Exchanges valid data items annotated by rank into striped arrangement.
- A subsequent
__syncthreads()
threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage
) is to be reused or repurposed.
- Template Parameters
-
OffsetT | [inferred] Signed integer type for local offsets |
ValidFlag | [inferred] FlagT type denoting which items are valid |
- Parameters
-
[in,out] | items | Items to exchange |
[in] | ranks | Corresponding scatter ranks |
[in] | is_valid | Corresponding flag denoting item validity |
Definition at line 986 of file block_exchange.cuh.