Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/kernel/device/gpu/parallel_active_index.h
| Show All 25 Lines | |||||
| #include "util/atomic.h" | #include "util/atomic.h" | ||||
| #ifdef __HIP__ | #ifdef __HIP__ | ||||
| # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 | # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 | ||||
| #else | #else | ||||
| # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 | # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 | ||||
| #endif | #endif | ||||
| #ifdef __KERNEL_METAL__ | #ifndef __KERNEL_METAL__ | ||||
| struct ActiveIndexContext { | template<uint blocksize, typename IsActiveOp> | ||||
| ActiveIndexContext(int _thread_index, | __device__ | ||||
| int _global_index, | #endif | ||||
| int _threadgroup_size, | void gpu_parallel_active_index_array_impl(const uint num_states, | ||||
| int _simdgroup_size, | |||||
| int _simd_lane_index, | |||||
| int _simd_group_index, | |||||
| int _num_simd_groups, | |||||
| threadgroup int *_simdgroup_offset) | |||||
| : thread_index(_thread_index), | |||||
| global_index(_global_index), | |||||
| blocksize(_threadgroup_size), | |||||
| ccl_gpu_warp_size(_simdgroup_size), | |||||
| thread_warp(_simd_lane_index), | |||||
| warp_index(_simd_group_index), | |||||
| num_warps(_num_simd_groups), | |||||
| warp_offset(_simdgroup_offset) | |||||
| { | |||||
| } | |||||
| const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index, | |||||
| num_warps; | |||||
| threadgroup int *warp_offset; | |||||
| template<uint blocksizeDummy, typename IsActiveOp> | |||||
| void active_index_array(const uint num_states, | |||||
| ccl_global int *indices, | ccl_global int *indices, | ||||
| ccl_global int *num_indices, | ccl_global int *num_indices, | ||||
| IsActiveOp is_active_op) | #ifdef __KERNEL_METAL__ | ||||
| const uint is_active, | |||||
| const uint blocksize, | |||||
| const int thread_index, | |||||
| const uint state_index, | |||||
| const int ccl_gpu_warp_size, | |||||
| const int thread_warp, | |||||
| const int warp_index, | |||||
| const int num_warps, | |||||
| threadgroup int *warp_offset) | |||||
| { | { | ||||
| const uint state_index = global_index; | |||||
| #else | #else | ||||
| template<uint blocksize, typename IsActiveOp> | |||||
| __device__ void gpu_parallel_active_index_array(const uint num_states, | |||||
| ccl_global int *indices, | |||||
| ccl_global int *num_indices, | |||||
| IsActiveOp is_active_op) | IsActiveOp is_active_op) | ||||
| { | { | ||||
| extern ccl_gpu_shared int warp_offset[]; | extern ccl_gpu_shared int warp_offset[]; | ||||
| const uint thread_index = ccl_gpu_thread_idx_x; | const uint thread_index = ccl_gpu_thread_idx_x; | ||||
| const uint thread_warp = thread_index % ccl_gpu_warp_size; | const uint thread_warp = thread_index % ccl_gpu_warp_size; | ||||
| const uint warp_index = thread_index / ccl_gpu_warp_size; | const uint warp_index = thread_index / ccl_gpu_warp_size; | ||||
| const uint num_warps = blocksize / ccl_gpu_warp_size; | const uint num_warps = blocksize / ccl_gpu_warp_size; | ||||
| const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index; | const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index; | ||||
| #endif | |||||
| /* Test if state corresponding to this thread is active. */ | /* Test if state corresponding to this thread is active. */ | ||||
| const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; | const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; | ||||
| #endif | |||||
| /* For each thread within a warp compute how many other active states precede it. */ | /* For each thread within a warp compute how many other active states precede it. */ | ||||
| const uint thread_offset = popcount(ccl_gpu_ballot(is_active) & | const uint thread_offset = popcount(ccl_gpu_ballot(is_active) & | ||||
| ccl_gpu_thread_mask(thread_warp)); | ccl_gpu_thread_mask(thread_warp)); | ||||
| /* Last thread in warp stores number of active states for each warp. */ | /* Last thread in warp stores number of active states for each warp. */ | ||||
| if (thread_warp == ccl_gpu_warp_size - 1) { | if (thread_warp == ccl_gpu_warp_size - 1) { | ||||
| warp_offset[warp_index] = thread_offset + is_active; | warp_offset[warp_index] = thread_offset + is_active; | ||||
| } | } | ||||
| ccl_gpu_syncthreads(); | ccl_gpu_syncthreads(); | ||||
| /* Last thread in block converts per-warp sizes to offsets, increments global size of | /* Last thread in block converts per-warp sizes to offsets, increments global size of | ||||
| * index array and gets offset to write to. */ | * index array and gets offset to write to. */ | ||||
| if (thread_index == blocksize - 1) { | if (thread_index == blocksize - 1) { | ||||
| /* TODO: parallelize this. */ | /* TODO: parallelize this. */ | ||||
| int offset = 0; | int offset = 0; | ||||
| for (int i = 0; i < num_warps; i++) { | for (int i = 0; i < num_warps; i++) { | ||||
| int num_active = warp_offset[i]; | int num_active = warp_offset[i]; | ||||
| warp_offset[i] = offset; | warp_offset[i] = offset; | ||||
| offset += num_active; | offset += num_active; | ||||
| } | } | ||||
| const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; | const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; | ||||
| warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); | warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); | ||||
| } | } | ||||
| ccl_gpu_syncthreads(); | ccl_gpu_syncthreads(); | ||||
| /* Write to index array. */ | /* Write to index array. */ | ||||
| if (is_active) { | if (is_active) { | ||||
| const uint block_offset = warp_offset[num_warps]; | const uint block_offset = warp_offset[num_warps]; | ||||
| indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; | indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; | ||||
| } | } | ||||
| } | } | ||||
| #ifdef __KERNEL_METAL__ | #ifdef __KERNEL_METAL__ | ||||
| }; /* end class ActiveIndexContext */ | |||||
| /* inject the required thread params into a struct, and redirect to its templated member function | # define gpu_parallel_active_index_array(dummy, num_states, indices, num_indices, is_active_op) \ | ||||
| */ | const uint is_active = (ccl_gpu_global_id_x() < num_states) ? is_active_op(ccl_gpu_global_id_x()) : 0; \ | ||||
| # define gpu_parallel_active_index_array \ | gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active, \ | ||||
| ActiveIndexContext(metal_local_id, \ | metal_local_size, metal_local_id, metal_global_id, simdgroup_size, simd_lane_index, \ | ||||
| metal_global_id, \ | simd_group_index, num_simd_groups, simdgroup_offset) | ||||
| metal_local_size, \ | |||||
| simdgroup_size, \ | #else | ||||
| simd_lane_index, \ | |||||
| simd_group_index, \ | # define gpu_parallel_active_index_array(blocksize, num_states, indices, num_indices, is_active_op) \ | ||||
| num_simd_groups, \ | gpu_parallel_active_index_array_impl<blocksize>(num_states, indices, num_indices, is_active_op) | ||||
| simdgroup_offset) \ | |||||
| .active_index_array | |||||
| #endif | #endif | ||||
| CCL_NAMESPACE_END | CCL_NAMESPACE_END | ||||