//------------------------------------------------------------------------------
// Normal problem size invocation (two-pass)
//------------------------------------------------------------------------------
/// Invoke a two-pass reduction: the first pass (cub::DeviceReduceKernel) writes one
/// privatized partial reduction per thread block, and the second pass
/// (cub::DeviceReduceSingleTileKernel) reduces those partials to the final result
/// with a single thread block.
template <
typename ActivePolicyT, ///< Umbrella policy active for the target device
typename ReduceKernelT, ///< Function type of cub::DeviceReduceKernel
typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t InvokePasses(
ReduceKernelT reduce_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel
SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
{
#ifndef CUB_RUNTIME_ENABLED
(void) reduce_kernel;
(void) single_tile_kernel;
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported);
#else
cudaError error = cudaSuccess;
do
{
// Get device ordinal
int device_ordinal;
if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
// Get SM count
int sm_count;
if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
// Init regular kernel configuration
KernelConfig reduce_config;
if (CubDebug(error = reduce_config.Init<typename ActivePolicyT::ReducePolicy>(reduce_kernel))) break;
int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;
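// Note: reduce_config.sm_occupancy is the maximum number of resident thread blocks
// per SM for reduce_kernel, so reduce_device_occupancy is the number of blocks the
// whole device can keep resident at once.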
// Even-share work distribution
int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version);
GridEvenShare<OffsetT> even_share;
even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);
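// Note: GridEvenShare statically partitions the num_items inputs into at most
// max_blocks contiguous ranges of whole tiles (reduce_config.tile_size items each),
// so every first-pass block receives a near-equal share of the work; the
// CUB_SUBSCRIPTION_FACTOR oversubscription above keeps enough blocks in flight to
// balance load across SMs.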
// Temporary storage allocation requirements
void* allocations[1] = {};
size_t allocation_sizes[1] =
{
max_blocks * sizeof(OutputT) // bytes needed for privatized block reductions
};
// Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
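// Note: AliasTemporaries either carves the requested allocations out of
// d_temp_storage with proper alignment, or, when d_temp_storage is NULL, just
// writes the total required blob size into temp_storage_bytes.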
if (d_temp_storage == NULL)
{
// Return if the caller is simply requesting the size of the storage allocation
return cudaSuccess;
}
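// For reference, the NULL-query convention above is what enables the usual
// two-phase calling pattern at the public API level (illustrative sketch of a
// typical caller, not part of this file):
//
//   void  *d_temp_storage     = NULL;
//   size_t temp_storage_bytes = 0;
//
//   // First call: d_temp_storage is NULL, so only temp_storage_bytes is written
//   cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
//                             d_in, d_out, num_items, reduction_op, init);
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//
//   // Second call: same arguments, now performs the reduction
//   cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
//                             d_in, d_out, num_items, reduction_op, init);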
// Alias the allocation for the privatized per-block reductions
OutputT *d_block_reductions = (OutputT*) allocations[0];
// Get grid size for device_reduce_sweep_kernel
int reduce_grid_size = even_share.grid_size;
// Log device_reduce_sweep_kernel configuration
if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
reduce_grid_size,
ActivePolicyT::ReducePolicy::BLOCK_THREADS,
(long long) stream,
ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
reduce_config.sm_occupancy);
// Invoke DeviceReduceKernel
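// Note: triple_chevron(grid, block, shared_mem, stream).doit(kernel, args...) is
// Thrust's launch wrapper, equivalent here to
//   reduce_kernel<<<reduce_grid_size, BLOCK_THREADS, 0, stream>>>(
//       d_in, d_block_reductions, num_items, even_share, reduction_op);
// Unlike the <<<...>>> syntax, it also works when launching from device code under
// CUB_RUNTIME_ENABLED (CUDA dynamic parallelism).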
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS,
0, stream
).doit(reduce_kernel,
d_in,
d_block_reductions,
num_items,
even_share,
reduction_op);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = DebugSyncStream(stream)))) break;
// Log single_reduce_sweep_kernel configuration
if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
(long long) stream,
ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
// Invoke DeviceReduceSingleTileKernel
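// Note: this pass launches a single thread block that reduces the reduce_grid_size
// privatized partials in d_block_reductions down to one value, seeding the final
// result written to d_out with init.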
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
).doit(single_tile_kernel,
d_block_reductions,
d_out,
reduce_grid_size,
reduction_op,
init);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = DebugSyncStream(stream)))) break;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}