Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/kernel/kernel_work_stealing.h
| /* | /* | ||||
| * Copyright 2011-2015 Blender Foundation | * Copyright 2011-2015 Blender Foundation | ||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| * You may obtain a copy of the License at | * You may obtain a copy of the License at | ||||
| * | * | ||||
| * http://www.apache.org/licenses/LICENSE-2.0 | * http://www.apache.org/licenses/LICENSE-2.0 | ||||
| * | * | ||||
| * Unless required by applicable law or agreed to in writing, software | * Unless required by applicable law or agreed to in writing, software | ||||
| * distributed under the License is distributed on an "AS IS" BASIS, | * distributed under the License is distributed on an "AS IS" BASIS, | ||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| * See the License for the specific language governing permissions and | * See the License for the specific language governing permissions and | ||||
| * limitations under the License. | * limitations under the License. | ||||
| */ | */ | ||||
| #ifndef __KERNEL_WORK_STEALING_H__ | #pragma once | ||||
| #define __KERNEL_WORK_STEALING_H__ | |||||
| CCL_NAMESPACE_BEGIN | CCL_NAMESPACE_BEGIN | ||||
| /* | /* | ||||
| * Utility functions for work stealing | * Utility functions for work stealing | ||||
| */ | */ | ||||
| /* Map global work index to tile, pixel X/Y and sample. */ | /* Map global work index to tile, pixel X/Y and sample. */ | ||||
| ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, | ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile, | ||||
| uint global_work_index, | uint global_work_index, | ||||
| ccl_private uint *x, | ccl_private uint *x, | ||||
| ccl_private uint *y, | ccl_private uint *y, | ||||
| ccl_private uint *sample) | ccl_private uint *sample) | ||||
| { | { | ||||
| #ifdef __KERNEL_CUDA__ | #if 0 | ||||
| /* Keeping threads for the same pixel together improves performance on CUDA. */ | /* Keep threads for the same sample together. */ | ||||
| uint sample_offset = global_work_index % tile->num_samples; | |||||
| uint pixel_offset = global_work_index / tile->num_samples; | |||||
| #else /* __KERNEL_CUDA__ */ | |||||
| uint tile_pixels = tile->w * tile->h; | uint tile_pixels = tile->w * tile->h; | ||||
| uint sample_offset = global_work_index / tile_pixels; | uint sample_offset = global_work_index / tile_pixels; | ||||
| uint pixel_offset = global_work_index - sample_offset * tile_pixels; | uint pixel_offset = global_work_index - sample_offset * tile_pixels; | ||||
| #endif /* __KERNEL_CUDA__ */ | #else | ||||
| /* Keeping threads for the same pixel together. | |||||
| * Appears to improve performance by a few % on CUDA and OptiX. */ | |||||
| uint sample_offset = global_work_index % tile->num_samples; | |||||
| uint pixel_offset = global_work_index / tile->num_samples; | |||||
| #endif | |||||
| uint y_offset = pixel_offset / tile->w; | uint y_offset = pixel_offset / tile->w; | ||||
| uint x_offset = pixel_offset - y_offset * tile->w; | uint x_offset = pixel_offset - y_offset * tile->w; | ||||
| *x = tile->x + x_offset; | *x = tile->x + x_offset; | ||||
| *y = tile->y + y_offset; | *y = tile->y + y_offset; | ||||
| *sample = tile->start_sample + sample_offset; | *sample = tile->start_sample + sample_offset; | ||||
| } | } | ||||
| #ifdef __KERNEL_OPENCL__ | |||||
| # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable | |||||
| #endif | |||||
| #ifdef __SPLIT_KERNEL__ | |||||
| /* Returns true if there is work */ | |||||
| ccl_device bool get_next_work_item(KernelGlobals *kg, | |||||
| ccl_global uint *work_pools, | |||||
| uint total_work_size, | |||||
| uint ray_index, | |||||
| ccl_private uint *global_work_index) | |||||
| { | |||||
| /* With a small amount of work there may be more threads than work due to | |||||
| * rounding up of global size, stop such threads immediately. */ | |||||
| if (ray_index >= total_work_size) { | |||||
| return false; | |||||
| } | |||||
| /* Increase atomic work index counter in pool. */ | |||||
| uint pool = ray_index / WORK_POOL_SIZE; | |||||
| uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]); | |||||
| /* Map per-pool work index to a global work index. */ | |||||
| uint global_size = ccl_global_size(0) * ccl_global_size(1); | |||||
| kernel_assert(global_size % WORK_POOL_SIZE == 0); | |||||
| kernel_assert(ray_index < global_size); | |||||
| *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + (pool * WORK_POOL_SIZE) + | |||||
| (work_index % WORK_POOL_SIZE); | |||||
| /* Test if all work for this pool is done. */ | |||||
| return (*global_work_index < total_work_size); | |||||
| } | |||||
| ccl_device bool get_next_work(KernelGlobals *kg, | |||||
| ccl_global uint *work_pools, | |||||
| uint total_work_size, | |||||
| uint ray_index, | |||||
| ccl_private uint *global_work_index) | |||||
| { | |||||
| bool got_work = false; | |||||
| if (kernel_data.film.pass_adaptive_aux_buffer) { | |||||
| do { | |||||
| got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index); | |||||
| if (got_work) { | |||||
| ccl_global WorkTile *tile = &kernel_split_params.tile; | |||||
| uint x, y, sample; | |||||
| get_work_pixel(tile, *global_work_index, &x, &y, &sample); | |||||
| uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride; | |||||
| ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; | |||||
| ccl_global float4 *aux = (ccl_global float4 *)(buffer + | |||||
| kernel_data.film.pass_adaptive_aux_buffer); | |||||
| if ((*aux).w == 0.0f) { | |||||
| break; | |||||
| } | |||||
| } | |||||
| } while (got_work); | |||||
| } | |||||
| else { | |||||
| got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index); | |||||
| } | |||||
| return got_work; | |||||
| } | |||||
| #endif | |||||
| CCL_NAMESPACE_END | CCL_NAMESPACE_END | ||||
| #endif /* __KERNEL_WORK_STEALING_H__ */ | |||||