Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/device/opencl/opencl_base.cpp
| Show First 20 Lines • Show All 410 Lines • ▼ Show 20 Lines | if(mem.device_pointer) { | ||||
| } | } | ||||
| mem.device_pointer = 0; | mem.device_pointer = 0; | ||||
| stats.mem_free(mem.device_size); | stats.mem_free(mem.device_size); | ||||
| mem.device_size = 0; | mem.device_size = 0; | ||||
| } | } | ||||
| } | } | ||||
| int OpenCLDeviceBase::mem_address_alignment() | |||||
| { | |||||
| return OpenCLInfo::mem_address_alignment(cdDevice); | |||||
| } | |||||
| device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type) | |||||
| { | |||||
| cl_mem_flags mem_flag; | |||||
| if(type == MEM_READ_ONLY) | |||||
| mem_flag = CL_MEM_READ_ONLY; | |||||
| else if(type == MEM_WRITE_ONLY) | |||||
| mem_flag = CL_MEM_WRITE_ONLY; | |||||
| else | |||||
| mem_flag = CL_MEM_READ_WRITE; | |||||
| cl_buffer_region info; | |||||
| info.origin = mem.memory_elements_size(offset); | |||||
| info.size = mem.memory_elements_size(size); | |||||
| device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), | |||||
| mem_flag, | |||||
| CL_BUFFER_CREATE_TYPE_REGION, | |||||
| &info, | |||||
| &ciErr); | |||||
| opencl_assert_err(ciErr, "clCreateSubBuffer"); | |||||
| return sub_buf; | |||||
| } | |||||
| void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer) | |||||
| { | |||||
| if(device_pointer && device_pointer != null_mem) { | |||||
| opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); | |||||
| } | |||||
| } | |||||
| void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) | void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) | ||||
| { | { | ||||
| ConstMemMap::iterator i = const_mem_map.find(name); | ConstMemMap::iterator i = const_mem_map.find(name); | ||||
| if(i == const_mem_map.end()) { | if(i == const_mem_map.end()) { | ||||
| device_vector<uchar> *data = new device_vector<uchar>(); | device_vector<uchar> *data = new device_vector<uchar>(); | ||||
| data->copy((uchar*)host, size); | data->copy((uchar*)host, size); | ||||
| Show All 37 Lines | |||||
| } | } | ||||
| size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size) | size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size) | ||||
| { | { | ||||
| int r = global_size % group_size; | int r = global_size % group_size; | ||||
| return global_size + ((r == 0)? 0: group_size - r); | return global_size + ((r == 0)? 0: group_size - r); | ||||
| } | } | ||||
| void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) | void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size) | ||||
| { | { | ||||
| size_t workgroup_size, max_work_items[3]; | size_t workgroup_size, max_work_items[3]; | ||||
| clGetKernelWorkGroupInfo(kernel, cdDevice, | clGetKernelWorkGroupInfo(kernel, cdDevice, | ||||
| CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); | CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); | ||||
| clGetDeviceInfo(cdDevice, | clGetDeviceInfo(cdDevice, | ||||
| CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); | CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); | ||||
| if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { | |||||
| workgroup_size = max_workgroup_size; | |||||
| } | |||||
| /* Try to divide evenly over 2 dimensions. */ | /* Try to divide evenly over 2 dimensions. */ | ||||
| size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); | size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); | ||||
| size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size}; | size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size}; | ||||
| /* Some implementations have max size 1 on 2nd dimension. */ | /* Some implementations have max size 1 on 2nd dimension. */ | ||||
| if(local_size[1] > max_work_items[1]) { | if(local_size[1] > max_work_items[1]) { | ||||
| local_size[0] = workgroup_size/max_work_items[1]; | local_size[0] = workgroup_size/max_work_items[1]; | ||||
| local_size[1] = max_work_items[1]; | local_size[1] = max_work_items[1]; | ||||
| ▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines | start_arg_index += kernel_set_args(ckFilmConvertKernel, | ||||
| d_w, | d_w, | ||||
| d_h, | d_h, | ||||
| d_offset, | d_offset, | ||||
| d_stride); | d_stride); | ||||
| enqueue_kernel(ckFilmConvertKernel, d_w, d_h); | enqueue_kernel(ckFilmConvertKernel, d_w, d_h); | ||||
| } | } | ||||
| bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, | |||||
| device_ptr guide_ptr, | |||||
| device_ptr variance_ptr, | |||||
| device_ptr out_ptr, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| int4 rect = task->rect; | |||||
| int w = rect.z-rect.x; | |||||
| int h = rect.w-rect.y; | |||||
brecht: Why 4? | |||||
| int r = task->nlm_state.r; | |||||
| int f = task->nlm_state.f; | |||||
| float a = task->nlm_state.a; | |||||
| float k_2 = task->nlm_state.k_2; | |||||
| cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr); | |||||
| cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr); | |||||
| cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); | |||||
| cl_mem image_mem = CL_MEM_PTR(image_ptr); | |||||
| cl_mem guide_mem = CL_MEM_PTR(guide_ptr); | |||||
| cl_mem variance_mem = CL_MEM_PTR(variance_ptr); | |||||
| cl_mem out_mem = CL_MEM_PTR(out_ptr); | |||||
| mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h); | |||||
| mem_zero_kernel(out_ptr, sizeof(float)*w*h); | |||||
| cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); | |||||
| cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); | |||||
| cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); | |||||
| cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); | |||||
| cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); | |||||
| for(int i = 0; i < (2*r+1)*(2*r+1); i++) { | |||||
| int dy = i / (2*r+1) - r; | |||||
| int dx = i % (2*r+1) - r; | |||||
| int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); | |||||
| kernel_set_args(ckNLMCalcDifference, 0, | |||||
| dx, dy, guide_mem, variance_mem, | |||||
| difference, local_rect, w, 0, a, k_2); | |||||
| kernel_set_args(ckNLMBlur, 0, | |||||
| difference, blurDifference, local_rect, w, f); | |||||
| kernel_set_args(ckNLMCalcWeight, 0, | |||||
| blurDifference, difference, local_rect, w, f); | |||||
| kernel_set_args(ckNLMUpdateOutput, 0, | |||||
| dx, dy, blurDifference, image_mem, | |||||
| out_mem, weightAccum, local_rect, w, f); | |||||
| enqueue_kernel(ckNLMCalcDifference, w, h); | |||||
| enqueue_kernel(ckNLMBlur, w, h); | |||||
| enqueue_kernel(ckNLMCalcWeight, w, h); | |||||
| enqueue_kernel(ckNLMBlur, w, h); | |||||
| enqueue_kernel(ckNLMUpdateOutput, w, h); | |||||
| } | |||||
| int4 local_rect = make_int4(0, 0, w, h); | |||||
| kernel_set_args(ckNLMNormalize, 0, | |||||
| out_mem, weightAccum, local_rect, w); | |||||
| enqueue_kernel(ckNLMNormalize, w, h); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) | |||||
| { | |||||
| cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); | |||||
| cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); | |||||
| cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); | |||||
| cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); | |||||
| kernel_set_args(ckFilterConstructTransform, 0, | |||||
| buffer_mem, | |||||
| transform_mem, | |||||
| rank_mem, | |||||
| task->filter_area, | |||||
| task->rect, | |||||
| task->buffer.pass_stride, | |||||
| task->radius, | |||||
| task->pca_threshold); | |||||
| enqueue_kernel(ckFilterConstructTransform, | |||||
| task->storage.w, | |||||
| task->storage.h, | |||||
| 256); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, | |||||
| device_ptr color_variance_ptr, | |||||
| device_ptr guide_ptr, | |||||
| device_ptr guide_variance_ptr, | |||||
| device_ptr output_ptr, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| mem_zero(task->storage.XtWX); | |||||
| mem_zero(task->storage.XtWY); | |||||
| cl_mem color_mem = CL_MEM_PTR(color_ptr); | |||||
| cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); | |||||
| cl_mem guide_mem = CL_MEM_PTR(guide_ptr); | |||||
| cl_mem guide_variance_mem = CL_MEM_PTR(guide_variance_ptr); | |||||
| cl_mem output_mem = CL_MEM_PTR(output_ptr); | |||||
| cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); | |||||
| cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); | |||||
| cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); | |||||
| cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); | |||||
| cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); | |||||
| cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); | |||||
| cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); | |||||
| cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); | |||||
| cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); | |||||
| cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); | |||||
| cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr); | |||||
| cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr); | |||||
| int r = task->radius; | |||||
| int f = 4; | |||||
| float a = 1.0f; | |||||
| for(int i = 0; i < (2*r+1)*(2*r+1); i++) { | |||||
| int dy = i / (2*r+1) - r; | |||||
| int dx = i % (2*r+1) - r; | |||||
| int local_rect[4] = {max(0, -dx), max(0, -dy), | |||||
| task->reconstruction_state.source_w - max(0, dx), | |||||
| task->reconstruction_state.source_h - max(0, dy)}; | |||||
| kernel_set_args(ckNLMCalcDifference, 0, | |||||
| dx, dy, | |||||
| guide_mem, | |||||
| guide_variance_mem, | |||||
| difference, | |||||
| local_rect, | |||||
| task->buffer.w, | |||||
| task->buffer.pass_stride, | |||||
| a, task->nlm_k_2); | |||||
| enqueue_kernel(ckNLMCalcDifference, | |||||
| task->reconstruction_state.source_w, | |||||
| task->reconstruction_state.source_h); | |||||
| kernel_set_args(ckNLMBlur, 0, | |||||
| difference, | |||||
| blurDifference, | |||||
| local_rect, | |||||
| task->buffer.w, | |||||
| f); | |||||
| enqueue_kernel(ckNLMBlur, | |||||
| task->reconstruction_state.source_w, | |||||
| task->reconstruction_state.source_h); | |||||
| kernel_set_args(ckNLMCalcWeight, 0, | |||||
| blurDifference, | |||||
| difference, | |||||
| local_rect, | |||||
| task->buffer.w, | |||||
| f); | |||||
| enqueue_kernel(ckNLMCalcWeight, | |||||
| task->reconstruction_state.source_w, | |||||
| task->reconstruction_state.source_h); | |||||
| /* Reuse previous arguments. */ | |||||
| enqueue_kernel(ckNLMBlur, | |||||
| task->reconstruction_state.source_w, | |||||
| task->reconstruction_state.source_h); | |||||
| kernel_set_args(ckNLMConstructGramian, 0, | |||||
| dx, dy, | |||||
| blurDifference, | |||||
| buffer_mem, | |||||
| color_mem, | |||||
| color_variance_mem, | |||||
| transform_mem, | |||||
| rank_mem, | |||||
| XtWX_mem, | |||||
| XtWY_mem, | |||||
| local_rect, | |||||
| task->reconstruction_state.filter_rect, | |||||
| task->buffer.w, | |||||
| task->buffer.h, | |||||
| f, | |||||
| task->buffer.pass_stride); | |||||
| enqueue_kernel(ckNLMConstructGramian, | |||||
| task->reconstruction_state.source_w, | |||||
| task->reconstruction_state.source_h, | |||||
| 256); | |||||
| } | |||||
| kernel_set_args(ckFinalize, 0, | |||||
| task->buffer.w, | |||||
| task->buffer.h, | |||||
| output_mem, | |||||
| rank_mem, | |||||
| XtWX_mem, | |||||
| XtWY_mem, | |||||
| task->filter_area, | |||||
| task->reconstruction_state.buffer_params, | |||||
| task->render_buffer.samples); | |||||
| enqueue_kernel(ckFinalize, | |||||
| task->reconstruction_state.source_w, | |||||
| task->reconstruction_state.source_h); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr, | |||||
| device_ptr b_ptr, | |||||
| device_ptr mean_ptr, | |||||
| device_ptr variance_ptr, | |||||
| int r, int4 rect, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| (void) task; | |||||
| cl_mem a_mem = CL_MEM_PTR(a_ptr); | |||||
| cl_mem b_mem = CL_MEM_PTR(b_ptr); | |||||
| cl_mem mean_mem = CL_MEM_PTR(mean_ptr); | |||||
| cl_mem variance_mem = CL_MEM_PTR(variance_ptr); | |||||
| cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); | |||||
| kernel_set_args(ckFilterCombineHalves, 0, | |||||
| mean_mem, | |||||
| variance_mem, | |||||
| a_mem, | |||||
| b_mem, | |||||
| rect, | |||||
| r); | |||||
| enqueue_kernel(ckFilterCombineHalves, | |||||
| task->rect.z-task->rect.x, | |||||
| task->rect.w-task->rect.y); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr, | |||||
| device_ptr b_ptr, | |||||
| device_ptr sample_variance_ptr, | |||||
| device_ptr sv_variance_ptr, | |||||
| device_ptr buffer_variance_ptr, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| (void) task; | |||||
| cl_mem a_mem = CL_MEM_PTR(a_ptr); | |||||
| cl_mem b_mem = CL_MEM_PTR(b_ptr); | |||||
| cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); | |||||
| cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); | |||||
| cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); | |||||
| cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); | |||||
| cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); | |||||
| char split_kernel = is_split_kernel()? 1 : 0; | |||||
| kernel_set_args(ckFilterDivideShadow, 0, | |||||
| task->render_buffer.samples, | |||||
| tiles_mem, | |||||
| a_mem, | |||||
| b_mem, | |||||
| sample_variance_mem, | |||||
| sv_variance_mem, | |||||
| buffer_variance_mem, | |||||
| task->rect, | |||||
| task->render_buffer.pass_stride, | |||||
| task->render_buffer.denoising_data_offset, | |||||
| split_kernel); | |||||
| enqueue_kernel(ckFilterDivideShadow, | |||||
| task->rect.z-task->rect.x, | |||||
| task->rect.w-task->rect.y); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, | |||||
| int variance_offset, | |||||
| device_ptr mean_ptr, | |||||
| device_ptr variance_ptr, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| cl_mem mean_mem = CL_MEM_PTR(mean_ptr); | |||||
| cl_mem variance_mem = CL_MEM_PTR(variance_ptr); | |||||
| cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); | |||||
| cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); | |||||
| char split_kernel = is_split_kernel()? 1 : 0; | |||||
| kernel_set_args(ckFilterGetFeature, 0, | |||||
| task->render_buffer.samples, | |||||
| tiles_mem, | |||||
| mean_offset, | |||||
| variance_offset, | |||||
| mean_mem, | |||||
| variance_mem, | |||||
| task->rect, | |||||
| task->render_buffer.pass_stride, | |||||
| task->render_buffer.denoising_data_offset, | |||||
| split_kernel); | |||||
| enqueue_kernel(ckFilterGetFeature, | |||||
| task->rect.z-task->rect.x, | |||||
| task->rect.w-task->rect.y); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE); | |||||
| mem_copy_to(task->tiles_mem); | |||||
| cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); | |||||
| cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles")); | |||||
| kernel_set_args(ckFilterSetTiles, 0, tiles_mem); | |||||
| for(int i = 0; i < 9; i++) { | |||||
| cl_mem buffer_mem = CL_MEM_PTR(buffers[i]); | |||||
| kernel_set_args(ckFilterSetTiles, i+1, buffer_mem); | |||||
| } | |||||
| enqueue_kernel(ckFilterSetTiles, 1, 1); | |||||
| return true; | |||||
| } | |||||
| void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task) | |||||
| { | |||||
| DenoisingTask denoising(this); | |||||
| denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising); | |||||
| denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); | |||||
| denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising); | |||||
| denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); | |||||
| denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); | |||||
| denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); | |||||
| denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising); | |||||
| denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); | |||||
| denoising.render_buffer.samples = rtile.sample; | |||||
| RenderTile rtiles[9]; | |||||
| rtiles[4] = rtile; | |||||
| task.map_neighbor_tiles(rtiles, this); | |||||
| denoising.tiles_from_rendertiles(rtiles); | |||||
| denoising.init_from_devicetask(task); | |||||
| denoising.run_denoising(); | |||||
| task.unmap_neighbor_tiles(rtiles, this); | |||||
| } | |||||
| void OpenCLDeviceBase::shader(DeviceTask& task) | void OpenCLDeviceBase::shader(DeviceTask& task) | ||||
| { | { | ||||
| /* cast arguments to cl types */ | /* cast arguments to cl types */ | ||||
| cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); | cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); | ||||
| cl_mem d_input = CL_MEM_PTR(task.shader_input); | cl_mem d_input = CL_MEM_PTR(task.shader_input); | ||||
| cl_mem d_output = CL_MEM_PTR(task.shader_output); | cl_mem d_output = CL_MEM_PTR(task.shader_output); | ||||
| cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma); | cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma); | ||||
| cl_int d_shader_eval_type = task.shader_eval_type; | cl_int d_shader_eval_type = task.shader_eval_type; | ||||
| ▲ Show 20 Lines • Show All 255 Lines • Show Last 20 Lines | |||||
Why 4?