Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/device/opencl/device_opencl_impl.cpp
| Show First 20 Lines • Show All 465 Lines • ▼ Show 20 Lines | virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, | ||||
| device_memory &use_queues_flag, | device_memory &use_queues_flag, | ||||
| device_memory &work_pool_wgs) | device_memory &work_pool_wgs) | ||||
| { | { | ||||
| cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; | cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; | ||||
| /* Set the range of samples to be processed for every ray in | /* Set the range of samples to be processed for every ray in | ||||
| * path-regeneration logic. | * path-regeneration logic. | ||||
| */ | */ | ||||
| cl_int start_sample = rtile.start_sample; | cl_int start_sample = rtile.get_start_sample(); | ||||
| cl_int end_sample = rtile.start_sample + rtile.num_samples; | cl_int end_sample = rtile.get_start_sample() + rtile.get_num_samples(); | ||||
| OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); | OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); | ||||
| cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init")); | cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init")); | ||||
| cl_uint start_arg_index = device->kernel_set_args(kernel_data_init, | cl_uint start_arg_index = device->kernel_set_args(kernel_data_init, | ||||
| 0, | 0, | ||||
| kernel_globals, | kernel_globals, | ||||
| kernel_data, | kernel_data, | ||||
| split_data, | split_data, | ||||
| num_global_elements, | num_global_elements, | ||||
| ray_state); | ray_state); | ||||
| device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index); | device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index); | ||||
| start_arg_index += device->kernel_set_args(kernel_data_init, | start_arg_index += device->kernel_set_args(kernel_data_init, | ||||
| start_arg_index, | start_arg_index, | ||||
| start_sample, | start_sample, | ||||
| end_sample, | end_sample, | ||||
| rtile.x, | rtile.get_x(), | ||||
| rtile.y, | rtile.get_y(), | ||||
| rtile.w, | rtile.get_w(), | ||||
| rtile.h, | rtile.get_h(), | ||||
| rtile.offset, | rtile.get_offset(), | ||||
| rtile.stride, | rtile.get_stride(), | ||||
| queue_index, | queue_index, | ||||
| dQueue_size, | dQueue_size, | ||||
| use_queues_flag, | use_queues_flag, | ||||
| work_pool_wgs, | work_pool_wgs, | ||||
| rtile.num_samples, | rtile.get_num_samples(), | ||||
| rtile.buffer); | rtile.get_buffer()); | ||||
| /* Enqueue ckPathTraceKernel_data_init kernel. */ | /* Enqueue ckPathTraceKernel_data_init kernel. */ | ||||
| device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, | device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, | ||||
| kernel_data_init, | kernel_data_init, | ||||
| 2, | 2, | ||||
| NULL, | NULL, | ||||
| dim.global_size, | dim.global_size, | ||||
| dim.local_size, | dim.local_size, | ||||
| Show All 10 Lines | if (device->ciErr != CL_SUCCESS) { | ||||
| return false; | return false; | ||||
| } | } | ||||
| cached_memory.split_data = &split_data; | cached_memory.split_data = &split_data; | ||||
| cached_memory.ray_state = &ray_state; | cached_memory.ray_state = &ray_state; | ||||
| cached_memory.queue_index = &queue_index; | cached_memory.queue_index = &queue_index; | ||||
| cached_memory.use_queues_flag = &use_queues_flag; | cached_memory.use_queues_flag = &use_queues_flag; | ||||
| cached_memory.work_pools = &work_pool_wgs; | cached_memory.work_pools = &work_pool_wgs; | ||||
| cached_memory.buffer = &rtile.buffer; | cached_memory.buffer = &rtile.get_buffer(); | ||||
| cached_memory.id++; | cached_memory.id++; | ||||
| return true; | return true; | ||||
| } | } | ||||
| virtual int2 split_kernel_local_size() | virtual int2 split_kernel_local_size() | ||||
| { | { | ||||
| return make_int2(64, 1); | return make_int2(64, 1); | ||||
| Show All 10 Lines | if (type == CL_DEVICE_TYPE_CPU) { | ||||
| return make_int2(64, 64); | return make_int2(64, 64); | ||||
| } | } | ||||
| cl_ulong max_buffer_size; | cl_ulong max_buffer_size; | ||||
| clGetDeviceInfo( | clGetDeviceInfo( | ||||
| device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); | device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); | ||||
| if (DebugFlags().opencl.mem_limit) { | if (DebugFlags().opencl.mem_limit) { | ||||
| max_buffer_size = min(max_buffer_size, | max_buffer_size = min( | ||||
| cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); | max_buffer_size, cl_ulong(DebugFlags().opencl.mem_limit - device->stats.get_mem_used())); | ||||
| } | } | ||||
| VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size) | VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size) | ||||
| << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; | << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; | ||||
| /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */ | /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */ | ||||
| max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024); | max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024); | ||||
| ▲ Show 20 Lines • Show All 365 Lines • ▼ Show 20 Lines | void OpenCLDevice::mem_alloc(device_memory &mem) | ||||
| size_t size = mem.memory_size(); | size_t size = mem.memory_size(); | ||||
| /* check there is enough memory available for the allocation */ | /* check there is enough memory available for the allocation */ | ||||
| cl_ulong max_alloc_size = 0; | cl_ulong max_alloc_size = 0; | ||||
| clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); | clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); | ||||
| if (DebugFlags().opencl.mem_limit) { | if (DebugFlags().opencl.mem_limit) { | ||||
| max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); | max_alloc_size = min(max_alloc_size, | ||||
| cl_ulong(DebugFlags().opencl.mem_limit - stats.get_mem_used())); | |||||
| } | } | ||||
| if (size > max_alloc_size) { | if (size > max_alloc_size) { | ||||
| string error = "Scene too complex to fit in available memory."; | string error = "Scene too complex to fit in available memory."; | ||||
| if (mem.name != NULL) { | if (mem.name != NULL) { | ||||
| error += string_printf(" (allocating buffer %s failed.)", mem.name); | error += string_printf(" (allocating buffer %s failed.)", mem.name); | ||||
| } | } | ||||
| set_error(error); | set_error(error); | ||||
| ▲ Show 20 Lines • Show All 404 Lines • ▼ Show 20 Lines | if (task.type == DeviceTask::RENDER) { | ||||
| DenoisingTask denoising(this, task); | DenoisingTask denoising(this, task); | ||||
| /* Allocate buffer for kernel globals */ | /* Allocate buffer for kernel globals */ | ||||
| device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); | device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); | ||||
| kgbuffer.alloc_to_device(1); | kgbuffer.alloc_to_device(1); | ||||
| /* Keep rendering tiles until done. */ | /* Keep rendering tiles until done. */ | ||||
| while (task.acquire_tile(this, tile, task.tile_types)) { | while (task.acquire_tile(this, tile, task.tile_types)) { | ||||
| if (tile.task == RenderTile::PATH_TRACE) { | if (tile.get_task() == RenderTile::PATH_TRACE) { | ||||
| assert(tile.task == RenderTile::PATH_TRACE); | assert(tile.get_task() == RenderTile::PATH_TRACE); | ||||
| scoped_timer timer(&tile.buffers->render_time); | scoped_timer timer(&tile.get_buffers()->get_render_time()); | ||||
| split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]); | split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]); | ||||
| /* Complete kernel execution before release tile. */ | /* Complete kernel execution before release tile. */ | ||||
| /* This helps in multi-device render; | /* This helps in multi-device render; | ||||
| * The device that reaches the critical-section function | * The device that reaches the critical-section function | ||||
| * release_tile waits (stalling other devices from entering | * release_tile waits (stalling other devices from entering | ||||
| * release_tile) for all kernels to complete. If device1 (a | * release_tile) for all kernels to complete. If device1 (a | ||||
| * slow-render device) reaches release_tile first then it would | * slow-render device) reaches release_tile first then it would | ||||
| * stall device2 (a fast-render device) from proceeding to render | * stall device2 (a fast-render device) from proceeding to render | ||||
| * next tile. | * next tile. | ||||
| */ | */ | ||||
| clFinish(cqCommandQueue); | clFinish(cqCommandQueue); | ||||
| } | } | ||||
| else if (tile.task == RenderTile::BAKE) { | else if (tile.get_task() == RenderTile::BAKE) { | ||||
| bake(task, tile); | bake(task, tile); | ||||
| } | } | ||||
| else if (tile.task == RenderTile::DENOISE) { | else if (tile.get_task() == RenderTile::DENOISE) { | ||||
| tile.sample = tile.start_sample + tile.num_samples; | tile.get_sample() = tile.get_start_sample() + tile.get_num_samples(); | ||||
| denoise(tile, denoising); | denoise(tile, denoising); | ||||
| task.update_progress(&tile, tile.w * tile.h); | task.update_progress(&tile, tile.get_w() * tile.get_h()); | ||||
| } | } | ||||
| task.release_tile(tile); | task.release_tile(tile); | ||||
| } | } | ||||
| kgbuffer.free(); | kgbuffer.free(); | ||||
| } | } | ||||
| else if (task.type == DeviceTask::SHADER) { | else if (task.type == DeviceTask::SHADER) { | ||||
| shader(task); | shader(task); | ||||
| } | } | ||||
| else if (task.type == DeviceTask::FILM_CONVERT) { | else if (task.type == DeviceTask::FILM_CONVERT) { | ||||
| film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); | film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); | ||||
| } | } | ||||
| else if (task.type == DeviceTask::DENOISE_BUFFER) { | else if (task.type == DeviceTask::DENOISE_BUFFER) { | ||||
| RenderTile tile; | RenderTile tile; | ||||
| tile.x = task.x; | tile.get_x() = task.x; | ||||
| tile.y = task.y; | tile.get_y() = task.y; | ||||
| tile.w = task.w; | tile.get_w() = task.w; | ||||
| tile.h = task.h; | tile.get_h() = task.h; | ||||
| tile.buffer = task.buffer; | tile.get_buffer() = task.buffer; | ||||
| tile.sample = task.sample + task.num_samples; | tile.get_sample() = task.sample + task.num_samples; | ||||
| tile.num_samples = task.num_samples; | tile.get_num_samples() = task.num_samples; | ||||
| tile.start_sample = task.sample; | tile.get_start_sample() = task.sample; | ||||
| tile.offset = task.offset; | tile.get_offset() = task.offset; | ||||
| tile.stride = task.stride; | tile.get_stride() = task.stride; | ||||
| tile.buffers = task.buffers; | tile.get_buffers() = task.buffers; | ||||
| DenoisingTask denoising(this, task); | DenoisingTask denoising(this, task); | ||||
| denoise(tile, denoising); | denoise(tile, denoising); | ||||
| task.update_progress(&tile, tile.w * tile.h); | task.update_progress(&tile, tile.get_w() * tile.get_h()); | ||||
| } | } | ||||
| } | } | ||||
| void OpenCLDevice::film_convert(DeviceTask &task, | void OpenCLDevice::film_convert(DeviceTask &task, | ||||
| device_ptr buffer, | device_ptr buffer, | ||||
| device_ptr rgba_byte, | device_ptr rgba_byte, | ||||
| device_ptr rgba_half) | device_ptr rgba_half) | ||||
| { | { | ||||
| ▲ Show 20 Lines • Show All 421 Lines • ▼ Show 20 Lines | denoising.functions.combine_halves = function_bind( | ||||
| &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); | &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); | ||||
| denoising.functions.get_feature = function_bind( | denoising.functions.get_feature = function_bind( | ||||
| &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); | &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); | ||||
| denoising.functions.write_feature = function_bind( | denoising.functions.write_feature = function_bind( | ||||
| &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); | &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); | ||||
| denoising.functions.detect_outliers = function_bind( | denoising.functions.detect_outliers = function_bind( | ||||
| &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); | &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); | ||||
| denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); | denoising.filter_area = make_int4(rtile.get_x(), rtile.get_y(), rtile.get_w(), rtile.get_h()); | ||||
| denoising.render_buffer.samples = rtile.sample; | denoising.render_buffer.samples = rtile.get_sample(); | ||||
| denoising.buffer.gpu_temporary_mem = true; | denoising.buffer.gpu_temporary_mem = true; | ||||
| denoising.run_denoising(rtile); | denoising.run_denoising(rtile); | ||||
| } | } | ||||
| void OpenCLDevice::shader(DeviceTask &task) | void OpenCLDevice::shader(DeviceTask &task) | ||||
| { | { | ||||
| /* cast arguments to cl types */ | /* cast arguments to cl types */ | ||||
| Show All 35 Lines | for (int sample = 0; sample < task.num_samples; sample++) { | ||||
| clFinish(cqCommandQueue); | clFinish(cqCommandQueue); | ||||
| task.update_progress(NULL); | task.update_progress(NULL); | ||||
| } | } | ||||
| } | } | ||||
| void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile) | void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile) | ||||
| { | { | ||||
| scoped_timer timer(&rtile.buffers->render_time); | scoped_timer timer(&rtile.get_buffers()->get_render_time()); | ||||
| /* Cast arguments to cl types. */ | /* Cast arguments to cl types. */ | ||||
| cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); | cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); | ||||
| cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); | cl_mem d_buffer = CL_MEM_PTR(rtile.get_buffer()); | ||||
| cl_int d_x = rtile.x; | cl_int d_x = rtile.get_x(); | ||||
| cl_int d_y = rtile.y; | cl_int d_y = rtile.get_y(); | ||||
| cl_int d_w = rtile.w; | cl_int d_w = rtile.get_w(); | ||||
| cl_int d_h = rtile.h; | cl_int d_h = rtile.get_h(); | ||||
| cl_int d_offset = rtile.offset; | cl_int d_offset = rtile.get_offset(); | ||||
| cl_int d_stride = rtile.stride; | cl_int d_stride = rtile.get_stride(); | ||||
| bake_program.wait_for_availability(); | bake_program.wait_for_availability(); | ||||
| cl_kernel kernel = bake_program(); | cl_kernel kernel = bake_program(); | ||||
| cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer); | cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer); | ||||
| set_kernel_arg_buffers(kernel, &start_arg_index); | set_kernel_arg_buffers(kernel, &start_arg_index); | ||||
| start_arg_index += kernel_set_args( | start_arg_index += kernel_set_args( | ||||
| kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride); | kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride); | ||||
| int start_sample = rtile.start_sample; | int start_sample = rtile.get_start_sample(); | ||||
| int end_sample = rtile.start_sample + rtile.num_samples; | int end_sample = rtile.get_start_sample() + rtile.get_num_samples(); | ||||
| for (int sample = start_sample; sample < end_sample; sample++) { | for (int sample = start_sample; sample < end_sample; sample++) { | ||||
| if (task.get_cancel()) { | if (task.get_cancel()) { | ||||
| if (task.need_finish_queue == false) | if (task.need_finish_queue == false) | ||||
| break; | break; | ||||
| } | } | ||||
| kernel_set_args(kernel, start_arg_index, sample); | kernel_set_args(kernel, start_arg_index, sample); | ||||
| enqueue_kernel(kernel, d_w, d_h); | enqueue_kernel(kernel, d_w, d_h); | ||||
| clFinish(cqCommandQueue); | clFinish(cqCommandQueue); | ||||
| rtile.sample = sample + 1; | rtile.get_sample() = sample + 1; | ||||
| task.update_progress(&rtile, rtile.w * rtile.h); | task.update_progress(&rtile, rtile.get_w() * rtile.get_h()); | ||||
| } | } | ||||
| } | } | ||||
| static bool kernel_build_opencl_2(cl_device_id cdDevice) | static bool kernel_build_opencl_2(cl_device_id cdDevice) | ||||
| { | { | ||||
| /* Build with OpenCL 2.0 if available, this improves performance | /* Build with OpenCL 2.0 if available, this improves performance | ||||
| * with AMD OpenCL drivers on Windows and Linux (legacy drivers). | * with AMD OpenCL drivers on Windows and Linux (legacy drivers). | ||||
| * Note that OpenCL selects the highest 1.x version by default, | * Note that OpenCL selects the highest 1.x version by default, | ||||
| ▲ Show 20 Lines • Show All 225 Lines • Show Last 20 Lines | |||||