Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/device/opencl/opencl_split.cpp
| Show All 12 Lines | |||||
| * See the License for the specific language governing permissions and | * See the License for the specific language governing permissions and | ||||
| * limitations under the License. | * limitations under the License. | ||||
| */ | */ | ||||
| #ifdef WITH_OPENCL | #ifdef WITH_OPENCL | ||||
| #include "device/opencl/opencl.h" | #include "device/opencl/opencl.h" | ||||
| #include "render/buffers.h" | |||||
| #include "kernel/kernel_types.h" | #include "kernel/kernel_types.h" | ||||
| #include "kernel/split/kernel_split_data_types.h" | #include "kernel/split/kernel_split_data_types.h" | ||||
| #include "device/device_split_kernel.h" | |||||
| #include "util/util_algorithm.h" | #include "util/util_algorithm.h" | ||||
| #include "util/util_debug.h" | #include "util/util_debug.h" | ||||
| #include "util/util_foreach.h" | |||||
| #include "util/util_logging.h" | #include "util/util_logging.h" | ||||
| #include "util/util_md5.h" | #include "util/util_md5.h" | ||||
| #include "util/util_path.h" | #include "util/util_path.h" | ||||
| #include "util/util_time.h" | #include "util/util_time.h" | ||||
| CCL_NAMESPACE_BEGIN | CCL_NAMESPACE_BEGIN | ||||
| class OpenCLSplitKernel; | struct texture_slot_t { | ||||
| texture_slot_t(const string& name, int slot) | |||||
| namespace { | : name(name), | ||||
| slot(slot) { | |||||
| /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to | |||||
| * fetch its size. | |||||
| */ | |||||
| typedef struct KernelGlobalsDummy { | |||||
| ccl_constant KernelData *data; | |||||
| ccl_global char *buffers[8]; | |||||
| #define KERNEL_TEX(type, name) \ | |||||
| TextureInfo name; | |||||
| # include "kernel/kernel_textures.h" | |||||
| #undef KERNEL_TEX | |||||
| SplitData split_data; | |||||
| SplitParams split_param_data; | |||||
| } KernelGlobalsDummy; | |||||
| } // namespace | |||||
| static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features) | |||||
| { | |||||
| string build_options = "-D__SPLIT_KERNEL__ "; | |||||
| build_options += requested_features.get_build_options(); | |||||
| /* Set compute device build option. */ | |||||
| cl_device_type device_type; | |||||
| OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr); | |||||
| assert(device->ciErr == CL_SUCCESS); | |||||
| if(device_type == CL_DEVICE_TYPE_GPU) { | |||||
| build_options += " -D__COMPUTE_DEVICE_GPU__"; | |||||
| } | |||||
| return build_options; | |||||
| } | |||||
| /* OpenCLDeviceSplitKernel's declaration/definition. */ | |||||
| class OpenCLDeviceSplitKernel : public OpenCLDeviceBase | |||||
| { | |||||
| public: | |||||
| DeviceSplitKernel *split_kernel; | |||||
| OpenCLProgram program_data_init; | |||||
| OpenCLProgram program_state_buffer_size; | |||||
| OpenCLProgram program_split; | |||||
| OpenCLProgram program_path_init; | |||||
| OpenCLProgram program_scene_intersect; | |||||
| OpenCLProgram program_lamp_emission; | |||||
| OpenCLProgram program_do_volume; | |||||
| OpenCLProgram program_queue_enqueue; | |||||
| OpenCLProgram program_indirect_background; | |||||
| OpenCLProgram program_shader_setup; | |||||
| OpenCLProgram program_shader_sort; | |||||
| OpenCLProgram program_shader_eval; | |||||
| OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; | |||||
| OpenCLProgram program_subsurface_scatter; | |||||
| OpenCLProgram program_direct_lighting; | |||||
| OpenCLProgram program_shadow_blocked_ao; | |||||
| OpenCLProgram program_shadow_blocked_dl; | |||||
| OpenCLProgram program_enqueue_inactive; | |||||
| OpenCLProgram program_next_iteration_setup; | |||||
| OpenCLProgram program_indirect_subsurface; | |||||
| OpenCLProgram program_buffer_update; | |||||
| OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_); | |||||
| ~OpenCLDeviceSplitKernel() | |||||
| { | |||||
| task_pool.stop(); | |||||
| /* Release kernels */ | |||||
| program_data_init.release(); | |||||
| delete split_kernel; | |||||
| } | |||||
| virtual bool show_samples() const { | |||||
| return true; | |||||
| } | |||||
| virtual BVHLayoutMask get_bvh_layout_mask() const { | |||||
| return BVH_LAYOUT_BVH2; | |||||
| } | |||||
| virtual bool load_kernels(const DeviceRequestedFeatures& requested_features) | |||||
| { | |||||
| if (!OpenCLDeviceBase::load_kernels(requested_features)) { | |||||
| return false; | |||||
| } | |||||
| return split_kernel->load_kernels(requested_features); | |||||
| } | } | ||||
| string name; | |||||
| int slot; | |||||
| }; | |||||
| const string fast_compiled_kernels = | static const string fast_compiled_kernels = | ||||
| "path_init " | "path_init " | ||||
| "scene_intersect " | "scene_intersect " | ||||
| "queue_enqueue " | "queue_enqueue " | ||||
| "shader_setup " | "shader_setup " | ||||
| "shader_sort " | "shader_sort " | ||||
| "enqueue_inactive " | "enqueue_inactive " | ||||
| "next_iteration_setup " | "next_iteration_setup " | ||||
| "indirect_subsurface " | "indirect_subsurface " | ||||
| "buffer_update"; | "buffer_update"; | ||||
| const string get_opencl_program_name(bool single_program, const string& kernel_name) | const string OpenCLDevice::get_opencl_program_name(bool single_program, const string& kernel_name) | ||||
| { | { | ||||
| if (single_program) { | if (single_program) { | ||||
| return "split"; | return "split"; | ||||
| } | } | ||||
| else { | else { | ||||
| if (fast_compiled_kernels.find(kernel_name) != std::string::npos) { | if (fast_compiled_kernels.find(kernel_name) != std::string::npos) { | ||||
| return "split_bundle"; | return "split_bundle"; | ||||
| } | } | ||||
| else { | else { | ||||
| return "split_" + kernel_name; | return "split_" + kernel_name; | ||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| const string get_opencl_program_filename(bool single_program, const string& kernel_name) | const string OpenCLDevice::get_opencl_program_filename(bool single_program, const string& kernel_name) | ||||
| { | { | ||||
| if (single_program) { | if (single_program) { | ||||
| return "kernel_split.cl"; | return "kernel_split.cl"; | ||||
| } | } | ||||
| else { | else { | ||||
| if (fast_compiled_kernels.find(kernel_name) != std::string::npos) { | if (fast_compiled_kernels.find(kernel_name) != std::string::npos) { | ||||
| return "kernel_split_bundle.cl"; | return "kernel_split_bundle.cl"; | ||||
| } | } | ||||
| else { | else { | ||||
| return "kernel_" + kernel_name + ".cl"; | return "kernel_" + kernel_name + ".cl"; | ||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| virtual bool add_kernel_programs(const DeviceRequestedFeatures& requested_features, | string OpenCLDevice::get_build_options(const DeviceRequestedFeatures& requested_features) | ||||
| vector<OpenCLDeviceBase::OpenCLProgram*> &programs) | |||||
| { | { | ||||
| bool single_program = OpenCLInfo::use_single_program(); | string build_options = "-D__SPLIT_KERNEL__ "; | ||||
| program_data_init = OpenCLDeviceBase::OpenCLProgram( | build_options += requested_features.get_build_options(); | ||||
| this, | |||||
| get_opencl_program_name(single_program, "data_init"), | |||||
| get_opencl_program_filename(single_program, "data_init"), | |||||
| get_build_options(this, requested_features)); | |||||
| program_data_init.add_kernel(ustring("path_trace_data_init")); | |||||
| programs.push_back(&program_data_init); | |||||
| program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram( | |||||
| this, | |||||
| get_opencl_program_name(single_program, "state_buffer_size"), | |||||
| get_opencl_program_filename(single_program, "state_buffer_size"), | |||||
| get_build_options(this, requested_features)); | |||||
| program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size")); | |||||
| programs.push_back(&program_state_buffer_size); | |||||
| #define ADD_SPLIT_KERNEL_SINGLE_PROGRAM(kernel_name) program_split.add_kernel(ustring("path_trace_"#kernel_name)); | |||||
| #define ADD_SPLIT_KERNEL_SPLIT_PROGRAM(kernel_name) \ | |||||
| program_##kernel_name = \ | |||||
| OpenCLDeviceBase::OpenCLProgram(this, \ | |||||
| "split_"#kernel_name, \ | |||||
| "kernel_"#kernel_name".cl", \ | |||||
| get_build_options(this, requested_features)); \ | |||||
| program_##kernel_name.add_kernel(ustring("path_trace_"#kernel_name)); \ | |||||
| programs.push_back(&program_##kernel_name); | |||||
| if (single_program) { | |||||
| program_split = OpenCLDeviceBase::OpenCLProgram( | |||||
| this, | |||||
| "split" , | |||||
| "kernel_split.cl", | |||||
| get_build_options(this, requested_features)); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(lamp_emission); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(do_volume); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_background); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_eval); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(holdout_emission_blurring_pathtermination_ao); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(subsurface_scatter); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(direct_lighting); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_ao); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_dl); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update); | |||||
| programs.push_back(&program_split); | |||||
| } | |||||
| else { | |||||
| /* Ordered with most complex kernels first, to reduce overall compile time. */ | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(subsurface_scatter); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(do_volume); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_dl); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_ao); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(holdout_emission_blurring_pathtermination_ao); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(lamp_emission); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(direct_lighting); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(indirect_background); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shader_eval); | |||||
| /* Quick kernels bundled in a single program to reduce overhead of starting | |||||
| * Blender processes. */ | |||||
| program_split = OpenCLDeviceBase::OpenCLProgram( | |||||
| this, | |||||
| "split_bundle" , | |||||
| "kernel_split_bundle.cl", | |||||
| get_build_options(this, requested_features)); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init); | /* Set compute device build option. */ | ||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect); | cl_device_type device_type; | ||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue); | OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr); | ||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup); | assert(this->ciErr == CL_SUCCESS); | ||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort); | if(device_type == CL_DEVICE_TYPE_GPU) { | ||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive); | build_options += " -D__COMPUTE_DEVICE_GPU__"; | ||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update); | |||||
| programs.push_back(&program_split); | |||||
| } | } | ||||
| #undef ADD_SPLIT_KERNEL_SPLIT_PROGRAM | |||||
| #undef ADD_SPLIT_KERNEL_SINGLE_PROGRAM | |||||
| return true; | return build_options; | ||||
| } | } | ||||
| void thread_run(DeviceTask *task) | string OpenCLDevice::get_build_options_for_bake(const DeviceRequestedFeatures& requested_features) | ||||
| { | { | ||||
| flush_texture_buffers(); | return requested_features.get_build_options(); | ||||
| if(task->type == DeviceTask::FILM_CONVERT) { | |||||
| film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); | |||||
| } | |||||
| else if(task->type == DeviceTask::SHADER) { | |||||
| shader(*task); | |||||
| } | } | ||||
| else if(task->type == DeviceTask::RENDER) { | |||||
| RenderTile tile; | |||||
| DenoisingTask denoising(this, *task); | |||||
| /* Allocate buffer for kernel globals */ | |||||
| device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); | |||||
| kgbuffer.alloc_to_device(1); | |||||
| /* Keep rendering tiles until done. */ | namespace { | ||||
| while(task->acquire_tile(this, tile)) { | |||||
| if(tile.task == RenderTile::PATH_TRACE) { | |||||
| assert(tile.task == RenderTile::PATH_TRACE); | |||||
| scoped_timer timer(&tile.buffers->render_time); | |||||
| split_kernel->path_trace(task, | |||||
| tile, | |||||
| kgbuffer, | |||||
| *const_mem_map["__data"]); | |||||
| /* Complete kernel execution before release tile. */ | /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to | ||||
| /* This helps in multi-device render; | * fetch its size. | ||||
| * The device that reaches the critical-section function | |||||
| * release_tile waits (stalling other devices from entering | |||||
| * release_tile) for all kernels to complete. If device1 (a | |||||
| * slow-render device) reaches release_tile first then it would | |||||
| * stall device2 (a fast-render device) from proceeding to render | |||||
| * next tile. | |||||
| */ | */ | ||||
| clFinish(cqCommandQueue); | typedef struct KernelGlobalsDummy { | ||||
| } | ccl_constant KernelData *data; | ||||
| else if(tile.task == RenderTile::DENOISE) { | ccl_global char *buffers[8]; | ||||
| tile.sample = tile.start_sample + tile.num_samples; | |||||
| denoise(tile, denoising); | |||||
| task->update_progress(&tile, tile.w*tile.h); | |||||
| } | |||||
| task->release_tile(tile); | |||||
| } | |||||
| kgbuffer.free(); | |||||
| } | |||||
| } | |||||
| bool is_split_kernel() | |||||
| { | |||||
| return true; | |||||
| } | |||||
| protected: | #define KERNEL_TEX(type, name) \ | ||||
| /* ** Those guys are for workign around some compiler-specific bugs ** */ | TextureInfo name; | ||||
| # include "kernel/kernel_textures.h" | |||||
| #undef KERNEL_TEX | |||||
| SplitData split_data; | |||||
| SplitParams split_param_data; | |||||
| } KernelGlobalsDummy; | |||||
| string build_options_for_bake_program( | } // namespace | ||||
| const DeviceRequestedFeatures& requested_features) | |||||
| { | |||||
| return requested_features.get_build_options(); | |||||
| } | |||||
| friend class OpenCLSplitKernel; | |||||
| friend class OpenCLSplitKernelFunction; | |||||
| }; | |||||
| struct CachedSplitMemory { | struct CachedSplitMemory { | ||||
| int id; | int id; | ||||
| device_memory *split_data; | device_memory *split_data; | ||||
| device_memory *ray_state; | device_memory *ray_state; | ||||
| device_memory *queue_index; | device_memory *queue_index; | ||||
| device_memory *use_queues_flag; | device_memory *use_queues_flag; | ||||
| device_memory *work_pools; | device_memory *work_pools; | ||||
| device_ptr *buffer; | device_ptr *buffer; | ||||
| }; | }; | ||||
| class OpenCLSplitKernelFunction : public SplitKernelFunction { | class OpenCLSplitKernelFunction : public SplitKernelFunction { | ||||
| public: | public: | ||||
| OpenCLDeviceSplitKernel* device; | OpenCLDevice* device; | ||||
| OpenCLDeviceBase::OpenCLProgram program; | OpenCLDevice::OpenCLProgram program; | ||||
| CachedSplitMemory& cached_memory; | CachedSplitMemory& cached_memory; | ||||
| int cached_id; | int cached_id; | ||||
| OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) : | OpenCLSplitKernelFunction(OpenCLDevice* device, CachedSplitMemory& cached_memory) : | ||||
| device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1) | device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1) | ||||
| { | { | ||||
| } | } | ||||
| ~OpenCLSplitKernelFunction() | ~OpenCLSplitKernelFunction() | ||||
| { | { | ||||
| program.release(); | program.release(); | ||||
| } | } | ||||
| ▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines | if(device->ciErr != CL_SUCCESS) { | ||||
| return false; | return false; | ||||
| } | } | ||||
| return true; | return true; | ||||
| } | } | ||||
| }; | }; | ||||
| class OpenCLSplitKernel : public DeviceSplitKernel { | class OpenCLSplitKernel : public DeviceSplitKernel { | ||||
| OpenCLDeviceSplitKernel *device; | OpenCLDevice *device; | ||||
| CachedSplitMemory cached_memory; | CachedSplitMemory cached_memory; | ||||
| public: | public: | ||||
| explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) { | explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) { | ||||
| } | } | ||||
| virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, | virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, | ||||
| const DeviceRequestedFeatures& requested_features) | const DeviceRequestedFeatures& requested_features) | ||||
| { | { | ||||
| OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory); | OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory); | ||||
| bool single_program = OpenCLInfo::use_single_program(); | bool single_program = OpenCLInfo::use_single_program(); | ||||
| kernel->program = | kernel->program = | ||||
| OpenCLDeviceBase::OpenCLProgram(device, | OpenCLDevice::OpenCLProgram(device, | ||||
| device->get_opencl_program_name(single_program, kernel_name), | device->get_opencl_program_name(single_program, kernel_name), | ||||
| device->get_opencl_program_filename(single_program, kernel_name), | device->get_opencl_program_filename(single_program, kernel_name), | ||||
| get_build_options(device, requested_features)); | device->get_build_options(requested_features)); | ||||
| kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); | kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); | ||||
| kernel->program.load(); | kernel->program.load(); | ||||
| if(!kernel->program.is_loaded()) { | if(!kernel->program.is_loaded()) { | ||||
| delete kernel; | delete kernel; | ||||
| return NULL; | return NULL; | ||||
| } | } | ||||
| ▲ Show 20 Lines • Show All 148 Lines • ▼ Show 20 Lines | virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/) | ||||
| size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size); | size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size); | ||||
| int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements)); | int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements)); | ||||
| VLOG(1) << "Global size: " << global_size << "."; | VLOG(1) << "Global size: " << global_size << "."; | ||||
| return global_size; | return global_size; | ||||
| } | } | ||||
| }; | }; | ||||
| OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_) | bool OpenCLDevice::opencl_error(cl_int err) | ||||
| : OpenCLDeviceBase(info, stats, profiler, background_) | { | ||||
| if(err != CL_SUCCESS) { | |||||
| string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); | |||||
| if(error_msg == "") | |||||
| error_msg = message; | |||||
| fprintf(stderr, "%s\n", message.c_str()); | |||||
| return true; | |||||
| } | |||||
| return false; | |||||
| } | |||||
| void OpenCLDevice::opencl_error(const string& message) | |||||
| { | |||||
| if(error_msg == "") | |||||
| error_msg = message; | |||||
| fprintf(stderr, "%s\n", message.c_str()); | |||||
| } | |||||
| void OpenCLDevice::opencl_assert_err(cl_int err, const char* where) | |||||
| { | |||||
| if(err != CL_SUCCESS) { | |||||
| string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where); | |||||
| if(error_msg == "") | |||||
| error_msg = message; | |||||
| fprintf(stderr, "%s\n", message.c_str()); | |||||
| #ifndef NDEBUG | |||||
| abort(); | |||||
| #endif | |||||
| } | |||||
| } | |||||
| OpenCLDevice::OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background) | |||||
| : Device(info, stats, profiler, background), | |||||
| memory_manager(this), | |||||
| texture_info(this, "__texture_info", MEM_TEXTURE) | |||||
| { | { | ||||
| cpPlatform = NULL; | |||||
| cdDevice = NULL; | |||||
| cxContext = NULL; | |||||
| cqCommandQueue = NULL; | |||||
| null_mem = 0; | |||||
| device_initialized = false; | |||||
| textures_need_update = true; | |||||
| vector<OpenCLPlatformDevice> usable_devices; | |||||
| OpenCLInfo::get_usable_devices(&usable_devices); | |||||
| if(usable_devices.size() == 0) { | |||||
| opencl_error("OpenCL: no devices found."); | |||||
| return; | |||||
| } | |||||
| assert(info.num < usable_devices.size()); | |||||
| OpenCLPlatformDevice& platform_device = usable_devices[info.num]; | |||||
| device_num = info.num; | |||||
| cpPlatform = platform_device.platform_id; | |||||
| cdDevice = platform_device.device_id; | |||||
| platform_name = platform_device.platform_name; | |||||
| device_name = platform_device.device_name; | |||||
| VLOG(2) << "Creating new Cycles device for OpenCL platform " | |||||
| << platform_name << ", device " | |||||
| << device_name << "."; | |||||
| { | |||||
| /* try to use cached context */ | |||||
| thread_scoped_lock cache_locker; | |||||
| cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker); | |||||
| if(cxContext == NULL) { | |||||
| /* create context properties array to specify platform */ | |||||
| const cl_context_properties context_props[] = { | |||||
| CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, | |||||
| 0, 0 | |||||
| }; | |||||
| /* create context */ | |||||
| cxContext = clCreateContext(context_props, 1, &cdDevice, | |||||
| context_notify_callback, cdDevice, &ciErr); | |||||
| if(opencl_error(ciErr)) { | |||||
| opencl_error("OpenCL: clCreateContext failed"); | |||||
| return; | |||||
| } | |||||
| /* cache it */ | |||||
| OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker); | |||||
| } | |||||
| } | |||||
| cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); | |||||
| if(opencl_error(ciErr)) { | |||||
| opencl_error("OpenCL: Error creating command queue"); | |||||
| return; | |||||
| } | |||||
| null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); | |||||
| if(opencl_error(ciErr)) { | |||||
| opencl_error("OpenCL: Error creating memory buffer for NULL"); | |||||
| return; | |||||
| } | |||||
| /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */ | |||||
| texture_info.resize(1); | |||||
| memory_manager.alloc("texture_info", texture_info); | |||||
| device_initialized = true; | |||||
| split_kernel = new OpenCLSplitKernel(this); | split_kernel = new OpenCLSplitKernel(this); | ||||
| background = background; | |||||
| } | |||||
| OpenCLDevice::~OpenCLDevice() | |||||
| { | |||||
| task_pool.stop(); | |||||
| memory_manager.free(); | |||||
| if(null_mem) | |||||
| clReleaseMemObject(CL_MEM_PTR(null_mem)); | |||||
| ConstMemMap::iterator mt; | |||||
| for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { | |||||
| delete mt->second; | |||||
| } | |||||
| base_program.release(); | |||||
| bake_program.release(); | |||||
| displace_program.release(); | |||||
| background_program.release(); | |||||
| program_data_init.release(); | |||||
| if(cqCommandQueue) | |||||
| clReleaseCommandQueue(cqCommandQueue); | |||||
| if(cxContext) | |||||
| clReleaseContext(cxContext); | |||||
| delete split_kernel; | |||||
| } | |||||
| void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info, | |||||
| const void * /*private_info*/, size_t /*cb*/, void *user_data) | |||||
| { | |||||
| string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); | |||||
| fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); | |||||
| } | |||||
| bool OpenCLDevice::opencl_version_check() | |||||
| { | |||||
| string error; | |||||
| if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) { | |||||
| opencl_error(error); | |||||
| return false; | |||||
| } | |||||
| if(!OpenCLInfo::device_version_check(cdDevice, &error)) { | |||||
| opencl_error(error); | |||||
| return false; | |||||
| } | |||||
| return true; | |||||
| } | |||||
| string OpenCLDevice::device_md5_hash(string kernel_custom_build_options) | |||||
| { | |||||
| MD5Hash md5; | |||||
| char version[256], driver[256], name[256], vendor[256]; | |||||
| clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL); | |||||
| clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); | |||||
| clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL); | |||||
| clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL); | |||||
| md5.append((uint8_t*)vendor, strlen(vendor)); | |||||
| md5.append((uint8_t*)version, strlen(version)); | |||||
| md5.append((uint8_t*)name, strlen(name)); | |||||
| md5.append((uint8_t*)driver, strlen(driver)); | |||||
| string options = kernel_build_options(); | |||||
| options += kernel_custom_build_options; | |||||
| md5.append((uint8_t*)options.c_str(), options.size()); | |||||
| return md5.get_hex(); | |||||
| } | |||||
| bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures& requested_features) | |||||
| { | |||||
| VLOG(2) << "Loading kernels for platform " << platform_name | |||||
| << ", device " << device_name << "."; | |||||
| /* Verify if device was initialized. */ | |||||
| if(!device_initialized) { | |||||
| fprintf(stderr, "OpenCL: failed to initialize device.\n"); | |||||
| return false; | |||||
| } | |||||
| /* Verify we have right opencl version. */ | |||||
| if(!opencl_version_check()) | |||||
| return false; | |||||
| base_program = OpenCLProgram(this, "base", "kernel_base.cl", ""); | |||||
| base_program.add_kernel(ustring("convert_to_byte")); | |||||
| base_program.add_kernel(ustring("convert_to_half_float")); | |||||
| base_program.add_kernel(ustring("zero_buffer")); | |||||
| bake_program = OpenCLProgram(this, "bake", "kernel_bake.cl", get_build_options_for_bake(requested_features)); | |||||
| bake_program.add_kernel(ustring("bake")); | |||||
| displace_program = OpenCLProgram(this, "displace", "kernel_displace.cl", get_build_options_for_bake(requested_features)); | |||||
| displace_program.add_kernel(ustring("displace")); | |||||
| background_program = OpenCLProgram(this, "background", "kernel_background.cl", get_build_options_for_bake(requested_features)); | |||||
| background_program.add_kernel(ustring("background")); | |||||
| denoising_program = OpenCLProgram(this, "denoising", "filter.cl", ""); | |||||
| denoising_program.add_kernel(ustring("filter_divide_shadow")); | |||||
| denoising_program.add_kernel(ustring("filter_get_feature")); | |||||
| denoising_program.add_kernel(ustring("filter_detect_outliers")); | |||||
| denoising_program.add_kernel(ustring("filter_combine_halves")); | |||||
| denoising_program.add_kernel(ustring("filter_construct_transform")); | |||||
| denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); | |||||
| denoising_program.add_kernel(ustring("filter_nlm_blur")); | |||||
| denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); | |||||
| denoising_program.add_kernel(ustring("filter_nlm_update_output")); | |||||
| denoising_program.add_kernel(ustring("filter_nlm_normalize")); | |||||
| denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); | |||||
| denoising_program.add_kernel(ustring("filter_finalize")); | |||||
| vector<OpenCLProgram*> programs; | |||||
| programs.push_back(&bake_program); | |||||
| programs.push_back(&displace_program); | |||||
| programs.push_back(&background_program); | |||||
| bool single_program = OpenCLInfo::use_single_program(); | |||||
| program_data_init = OpenCLDevice::OpenCLProgram( | |||||
| this, | |||||
| get_opencl_program_name(single_program, "data_init"), | |||||
| get_opencl_program_filename(single_program, "data_init"), | |||||
| get_build_options(requested_features)); | |||||
| program_data_init.add_kernel(ustring("path_trace_data_init")); | |||||
| programs.push_back(&program_data_init); | |||||
| program_state_buffer_size = OpenCLDevice::OpenCLProgram( | |||||
| this, | |||||
| get_opencl_program_name(single_program, "state_buffer_size"), | |||||
| get_opencl_program_filename(single_program, "state_buffer_size"), | |||||
| get_build_options(requested_features)); | |||||
| program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size")); | |||||
| programs.push_back(&program_state_buffer_size); | |||||
| #define ADD_SPLIT_KERNEL_SINGLE_PROGRAM(kernel_name) program_split.add_kernel(ustring("path_trace_"#kernel_name)); | |||||
| #define ADD_SPLIT_KERNEL_SPLIT_PROGRAM(kernel_name) \ | |||||
| program_##kernel_name = \ | |||||
| OpenCLDevice::OpenCLProgram(this, \ | |||||
| "split_"#kernel_name, \ | |||||
| "kernel_"#kernel_name".cl", \ | |||||
| get_build_options(requested_features)); \ | |||||
| program_##kernel_name.add_kernel(ustring("path_trace_"#kernel_name)); \ | |||||
| programs.push_back(&program_##kernel_name); | |||||
| if (single_program) { | |||||
| program_split = OpenCLDevice::OpenCLProgram( | |||||
| this, | |||||
| "split" , | |||||
| "kernel_split.cl", | |||||
| get_build_options(requested_features)); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(lamp_emission); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(do_volume); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_background); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_eval); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(holdout_emission_blurring_pathtermination_ao); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(subsurface_scatter); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(direct_lighting); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_ao); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_dl); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update); | |||||
| programs.push_back(&program_split); | |||||
| } | |||||
| else { | |||||
| /* Ordered with most complex kernels first, to reduce overall compile time. */ | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(subsurface_scatter); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(do_volume); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_dl); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_ao); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(holdout_emission_blurring_pathtermination_ao); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(lamp_emission); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(direct_lighting); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(indirect_background); | |||||
| ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shader_eval); | |||||
| /* Quick kernels bundled in a single program to reduce overhead of starting | |||||
| * Blender processes. */ | |||||
| program_split = OpenCLDevice::OpenCLProgram( | |||||
| this, | |||||
| "split_bundle" , | |||||
| "kernel_split_bundle.cl", | |||||
| get_build_options(requested_features)); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface); | |||||
| ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update); | |||||
| programs.push_back(&program_split); | |||||
| } | |||||
| #undef ADD_SPLIT_KERNEL_SPLIT_PROGRAM | |||||
| #undef ADD_SPLIT_KERNEL_SINGLE_PROGRAM | |||||
| programs.push_back(&base_program); | |||||
| programs.push_back(&denoising_program); | |||||
| /* Parallel compilation of Cycles kernels, this launches multiple | |||||
| * processes to workaround OpenCL frameworks serializing the calls | |||||
| * internally within a single process. */ | |||||
| TaskPool task_pool; | |||||
| foreach(OpenCLProgram *program, programs) { | |||||
| task_pool.push(function_bind(&OpenCLProgram::load, program)); | |||||
| } | |||||
| task_pool.wait_work(); | |||||
| foreach(OpenCLProgram *program, programs) { | |||||
| VLOG(2) << program->get_log(); | |||||
| if(!program->is_loaded()) { | |||||
| program->report_error(); | |||||
| return false; | |||||
| } | |||||
| } | |||||
| return split_kernel->load_kernels(requested_features); | |||||
| } | |||||
| void OpenCLDevice::mem_alloc(device_memory& mem) | |||||
| { | |||||
| if(mem.name) { | |||||
| VLOG(1) << "Buffer allocate: " << mem.name << ", " | |||||
| << string_human_readable_number(mem.memory_size()) << " bytes. (" | |||||
| << string_human_readable_size(mem.memory_size()) << ")"; | |||||
| } | |||||
| size_t size = mem.memory_size(); | |||||
| /* check there is enough memory available for the allocation */ | |||||
| cl_ulong max_alloc_size = 0; | |||||
| clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); | |||||
| if(DebugFlags().opencl.mem_limit) { | |||||
| max_alloc_size = min(max_alloc_size, | |||||
| cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); | |||||
| } | |||||
| if(size > max_alloc_size) { | |||||
| string error = "Scene too complex to fit in available memory."; | |||||
| if(mem.name != NULL) { | |||||
| error += string_printf(" (allocating buffer %s failed.)", mem.name); | |||||
| } | |||||
| set_error(error); | |||||
| return; | |||||
| } | |||||
| cl_mem_flags mem_flag; | |||||
| void *mem_ptr = NULL; | |||||
| if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) | |||||
| mem_flag = CL_MEM_READ_ONLY; | |||||
| else | |||||
| mem_flag = CL_MEM_READ_WRITE; | |||||
| /* Zero-size allocation might be invoked by render, but not really | |||||
| * supported by OpenCL. Using NULL as device pointer also doesn't really | |||||
| * work for some reason, so for the time being we'll use special case | |||||
| * will null_mem buffer. | |||||
| */ | |||||
| if(size != 0) { | |||||
| mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, | |||||
| mem_flag, | |||||
| size, | |||||
| mem_ptr, | |||||
| &ciErr); | |||||
| opencl_assert_err(ciErr, "clCreateBuffer"); | |||||
| } | |||||
| else { | |||||
| mem.device_pointer = null_mem; | |||||
| } | |||||
| stats.mem_alloc(size); | |||||
| mem.device_size = size; | |||||
| } | |||||
| void OpenCLDevice::mem_copy_to(device_memory& mem) | |||||
| { | |||||
| if(mem.type == MEM_TEXTURE) { | |||||
| tex_free(mem); | |||||
| tex_alloc(mem); | |||||
| } | |||||
| else { | |||||
| if(!mem.device_pointer) { | |||||
| mem_alloc(mem); | |||||
| } | |||||
| /* this is blocking */ | |||||
| size_t size = mem.memory_size(); | |||||
| if(size != 0) { | |||||
| opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, | |||||
| CL_MEM_PTR(mem.device_pointer), | |||||
| CL_TRUE, | |||||
| 0, | |||||
| size, | |||||
| mem.host_pointer, | |||||
| 0, | |||||
| NULL, NULL)); | |||||
| } | |||||
| } | |||||
| } | |||||
| void OpenCLDevice::mem_copy_from(device_memory& mem, int y, int w, int h, int elem) | |||||
| { | |||||
| size_t offset = elem*y*w; | |||||
| size_t size = elem*w*h; | |||||
| assert(size != 0); | |||||
| opencl_assert(clEnqueueReadBuffer(cqCommandQueue, | |||||
| CL_MEM_PTR(mem.device_pointer), | |||||
| CL_TRUE, | |||||
| offset, | |||||
| size, | |||||
| (uchar*)mem.host_pointer + offset, | |||||
| 0, | |||||
| NULL, NULL)); | |||||
| } | |||||
| void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size) | |||||
| { | |||||
| cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); | |||||
| size_t global_size[] = {1024, 1024}; | |||||
| size_t num_threads = global_size[0] * global_size[1]; | |||||
| cl_mem d_buffer = CL_MEM_PTR(mem); | |||||
| cl_ulong d_offset = 0; | |||||
| cl_ulong d_size = 0; | |||||
| while(d_offset < size) { | |||||
| d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset); | |||||
| kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); | |||||
| ciErr = clEnqueueNDRangeKernel(cqCommandQueue, | |||||
| ckZeroBuffer, | |||||
| 2, | |||||
| NULL, | |||||
| global_size, | |||||
| NULL, | |||||
| 0, | |||||
| NULL, | |||||
| NULL); | |||||
| opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); | |||||
| d_offset += d_size; | |||||
| } | |||||
| } | |||||
| void OpenCLDevice::mem_zero(device_memory& mem) | |||||
| { | |||||
| if(!mem.device_pointer) { | |||||
| mem_alloc(mem); | |||||
| } | |||||
| if(mem.device_pointer) { | |||||
| if(base_program.is_loaded()) { | |||||
| mem_zero_kernel(mem.device_pointer, mem.memory_size()); | |||||
| } | |||||
| if(mem.host_pointer) { | |||||
| memset(mem.host_pointer, 0, mem.memory_size()); | |||||
| } | |||||
| if(!base_program.is_loaded()) { | |||||
| void* zero = mem.host_pointer; | |||||
| if(!mem.host_pointer) { | |||||
| zero = util_aligned_malloc(mem.memory_size(), 16); | |||||
| memset(zero, 0, mem.memory_size()); | |||||
| } | |||||
| opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, | |||||
| CL_MEM_PTR(mem.device_pointer), | |||||
| CL_TRUE, | |||||
| 0, | |||||
| mem.memory_size(), | |||||
| zero, | |||||
| 0, | |||||
| NULL, NULL)); | |||||
| if(!mem.host_pointer) { | |||||
| util_aligned_free(zero); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void OpenCLDevice::mem_free(device_memory& mem) | |||||
| { | |||||
| if(mem.type == MEM_TEXTURE) { | |||||
| tex_free(mem); | |||||
| } | |||||
| else { | |||||
| if(mem.device_pointer) { | |||||
| if(mem.device_pointer != null_mem) { | |||||
| opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); | |||||
| } | |||||
| mem.device_pointer = 0; | |||||
| stats.mem_free(mem.device_size); | |||||
| mem.device_size = 0; | |||||
| } | |||||
| } | |||||
| } | |||||
| int OpenCLDevice::mem_sub_ptr_alignment() | |||||
| { | |||||
| return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); | |||||
| } | |||||
| device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory& mem, int offset, int size) | |||||
| { | |||||
| cl_mem_flags mem_flag; | |||||
| if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) | |||||
| mem_flag = CL_MEM_READ_ONLY; | |||||
| else | |||||
| mem_flag = CL_MEM_READ_WRITE; | |||||
| cl_buffer_region info; | |||||
| info.origin = mem.memory_elements_size(offset); | |||||
| info.size = mem.memory_elements_size(size); | |||||
| device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), | |||||
| mem_flag, | |||||
| CL_BUFFER_CREATE_TYPE_REGION, | |||||
| &info, | |||||
| &ciErr); | |||||
| opencl_assert_err(ciErr, "clCreateSubBuffer"); | |||||
| return sub_buf; | |||||
| } | |||||
| void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer) | |||||
| { | |||||
| if(device_pointer && device_pointer != null_mem) { | |||||
| opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); | |||||
| } | |||||
| } | |||||
| void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size) | |||||
| { | |||||
| ConstMemMap::iterator i = const_mem_map.find(name); | |||||
| device_vector<uchar> *data; | |||||
| if(i == const_mem_map.end()) { | |||||
| data = new device_vector<uchar>(this, name, MEM_READ_ONLY); | |||||
| data->alloc(size); | |||||
| const_mem_map.insert(ConstMemMap::value_type(name, data)); | |||||
| } | |||||
| else { | |||||
| data = i->second; | |||||
| } | |||||
| memcpy(data->data(), host, size); | |||||
| data->copy_to_device(); | |||||
| } | |||||
| void OpenCLDevice::tex_alloc(device_memory& mem) | |||||
| { | |||||
| VLOG(1) << "Texture allocate: " << mem.name << ", " | |||||
| << string_human_readable_number(mem.memory_size()) << " bytes. (" | |||||
| << string_human_readable_size(mem.memory_size()) << ")"; | |||||
| memory_manager.alloc(mem.name, mem); | |||||
| /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */ | |||||
| mem.device_pointer = 1; | |||||
| textures[mem.name] = &mem; | |||||
| textures_need_update = true; | |||||
| } | |||||
| void OpenCLDevice::tex_free(device_memory& mem) | |||||
| { | |||||
| if(mem.device_pointer) { | |||||
| mem.device_pointer = 0; | |||||
| if(memory_manager.free(mem)) { | |||||
| textures_need_update = true; | |||||
| } | |||||
| foreach(TexturesMap::value_type& value, textures) { | |||||
| if(value.second == &mem) { | |||||
| textures.erase(value.first); | |||||
| break; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| size_t OpenCLDevice::global_size_round_up(int group_size, int global_size) | |||||
| { | |||||
| int r = global_size % group_size; | |||||
| return global_size + ((r == 0)? 0: group_size - r); | |||||
| } | |||||
| void OpenCLDevice::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) | |||||
| { | |||||
| size_t workgroup_size, max_work_items[3]; | |||||
| clGetKernelWorkGroupInfo(kernel, cdDevice, | |||||
| CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); | |||||
| clGetDeviceInfo(cdDevice, | |||||
| CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); | |||||
| if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { | |||||
| workgroup_size = max_workgroup_size; | |||||
| } | |||||
| /* Try to divide evenly over 2 dimensions. */ | |||||
| size_t local_size[2]; | |||||
| if(x_workgroups) { | |||||
| local_size[0] = workgroup_size; | |||||
| local_size[1] = 1; | |||||
| } | |||||
| else { | |||||
| size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); | |||||
| local_size[0] = local_size[1] = sqrt_workgroup_size; | |||||
| } | |||||
| /* Some implementations have max size 1 on 2nd dimension. */ | |||||
| if(local_size[1] > max_work_items[1]) { | |||||
| local_size[0] = workgroup_size/max_work_items[1]; | |||||
| local_size[1] = max_work_items[1]; | |||||
| } | |||||
| size_t global_size[2] = {global_size_round_up(local_size[0], w), | |||||
| global_size_round_up(local_size[1], h)}; | |||||
| /* Vertical size of 1 is coming from bake/shade kernels where we should | |||||
| * not round anything up because otherwise we'll either be doing too | |||||
| * much work per pixel (if we don't check global ID on Y axis) or will | |||||
| * be checking for global ID to always have Y of 0. | |||||
| */ | |||||
| if(h == 1) { | |||||
| global_size[h] = 1; | |||||
| } | |||||
| /* run kernel */ | |||||
| opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); | |||||
| opencl_assert(clFlush(cqCommandQueue)); | |||||
| } | |||||
| void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name) | |||||
| { | |||||
| cl_mem ptr; | |||||
| MemMap::iterator i = mem_map.find(name); | |||||
| if(i != mem_map.end()) { | |||||
| ptr = CL_MEM_PTR(i->second); | |||||
| } | |||||
| else { | |||||
| /* work around NULL not working, even though the spec says otherwise */ | |||||
| ptr = CL_MEM_PTR(null_mem); | |||||
| } | |||||
| opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr)); | |||||
| } | |||||
| void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) | |||||
| { | |||||
| flush_texture_buffers(); | |||||
| memory_manager.set_kernel_arg_buffers(kernel, narg); | |||||
| } | |||||
| void OpenCLDevice::flush_texture_buffers() | |||||
| { | |||||
| if(!textures_need_update) { | |||||
| return; | |||||
| } | |||||
| textures_need_update = false; | |||||
| /* Setup slots for textures. */ | |||||
| int num_slots = 0; | |||||
| vector<texture_slot_t> texture_slots; | |||||
| #define KERNEL_TEX(type, name) \ | |||||
| if(textures.find(#name) != textures.end()) { \ | |||||
| texture_slots.push_back(texture_slot_t(#name, num_slots)); \ | |||||
| } \ | |||||
| num_slots++; | |||||
| #include "kernel/kernel_textures.h" | |||||
| int num_data_slots = num_slots; | |||||
| foreach(TexturesMap::value_type& tex, textures) { | |||||
| string name = tex.first; | |||||
| if(string_startswith(name, "__tex_image")) { | |||||
| int pos = name.rfind("_"); | |||||
| int id = atoi(name.data() + pos + 1); | |||||
| texture_slots.push_back(texture_slot_t(name, | |||||
| num_data_slots + id)); | |||||
| num_slots = max(num_slots, num_data_slots + id + 1); | |||||
| } | |||||
| } | |||||
| /* Realloc texture descriptors buffer. */ | |||||
| memory_manager.free(texture_info); | |||||
| texture_info.resize(num_slots); | |||||
| memory_manager.alloc("texture_info", texture_info); | |||||
| /* Fill in descriptors */ | |||||
| foreach(texture_slot_t& slot, texture_slots) { | |||||
| TextureInfo& info = texture_info[slot.slot]; | |||||
| MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); | |||||
| info.data = desc.offset; | |||||
| info.cl_buffer = desc.device_buffer; | |||||
| if(string_startswith(slot.name, "__tex_image")) { | |||||
| device_memory *mem = textures[slot.name]; | |||||
| info.width = mem->data_width; | |||||
| info.height = mem->data_height; | |||||
| info.depth = mem->data_depth; | |||||
| info.interpolation = mem->interpolation; | |||||
| info.extension = mem->extension; | |||||
| } | |||||
| } | |||||
| /* Force write of descriptors. */ | |||||
| memory_manager.free(texture_info); | |||||
| memory_manager.alloc("texture_info", texture_info); | |||||
| } | |||||
| void OpenCLDevice::thread_run(DeviceTask *task) | |||||
| { | |||||
| flush_texture_buffers(); | |||||
| if(task->type == DeviceTask::FILM_CONVERT) { | |||||
| film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); | |||||
| } | |||||
| else if(task->type == DeviceTask::SHADER) { | |||||
| shader(*task); | |||||
| } | |||||
| else if(task->type == DeviceTask::RENDER) { | |||||
| RenderTile tile; | |||||
| DenoisingTask denoising(this, *task); | |||||
| /* Allocate buffer for kernel globals */ | |||||
| device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); | |||||
| kgbuffer.alloc_to_device(1); | |||||
| /* Keep rendering tiles until done. */ | |||||
| while(task->acquire_tile(this, tile)) { | |||||
| if(tile.task == RenderTile::PATH_TRACE) { | |||||
| assert(tile.task == RenderTile::PATH_TRACE); | |||||
| scoped_timer timer(&tile.buffers->render_time); | |||||
| split_kernel->path_trace(task, | |||||
| tile, | |||||
| kgbuffer, | |||||
| *const_mem_map["__data"]); | |||||
| /* Complete kernel execution before release tile. */ | |||||
| /* This helps in multi-device render; | |||||
| * The device that reaches the critical-section function | |||||
| * release_tile waits (stalling other devices from entering | |||||
| * release_tile) for all kernels to complete. If device1 (a | |||||
| * slow-render device) reaches release_tile first then it would | |||||
| * stall device2 (a fast-render device) from proceeding to render | |||||
| * next tile. | |||||
| */ | |||||
| clFinish(cqCommandQueue); | |||||
| } | |||||
| else if(tile.task == RenderTile::DENOISE) { | |||||
| tile.sample = tile.start_sample + tile.num_samples; | |||||
| denoise(tile, denoising); | |||||
| task->update_progress(&tile, tile.w*tile.h); | |||||
| } | |||||
| task->release_tile(tile); | |||||
| } | |||||
| kgbuffer.free(); | |||||
| } | |||||
| } | |||||
| void OpenCLDevice::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) | |||||
| { | |||||
| /* cast arguments to cl types */ | |||||
| cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); | |||||
| cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half); | |||||
| cl_mem d_buffer = CL_MEM_PTR(buffer); | |||||
| cl_int d_x = task.x; | |||||
| cl_int d_y = task.y; | |||||
| cl_int d_w = task.w; | |||||
| cl_int d_h = task.h; | |||||
| cl_float d_sample_scale = 1.0f/(task.sample + 1); | |||||
| cl_int d_offset = task.offset; | |||||
| cl_int d_stride = task.stride; | |||||
| cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float")); | |||||
| cl_uint start_arg_index = | |||||
| kernel_set_args(ckFilmConvertKernel, | |||||
| 0, | |||||
| d_data, | |||||
| d_rgba, | |||||
| d_buffer); | |||||
| set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); | |||||
| start_arg_index += kernel_set_args(ckFilmConvertKernel, | |||||
| start_arg_index, | |||||
| d_sample_scale, | |||||
| d_x, | |||||
| d_y, | |||||
| d_w, | |||||
| d_h, | |||||
| d_offset, | |||||
| d_stride); | |||||
| enqueue_kernel(ckFilmConvertKernel, d_w, d_h); | |||||
| } | |||||
| bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr, | |||||
| device_ptr guide_ptr, | |||||
| device_ptr variance_ptr, | |||||
| device_ptr out_ptr, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| int stride = task->buffer.stride; | |||||
| int w = task->buffer.width; | |||||
| int h = task->buffer.h; | |||||
| int r = task->nlm_state.r; | |||||
| int f = task->nlm_state.f; | |||||
| float a = task->nlm_state.a; | |||||
| float k_2 = task->nlm_state.k_2; | |||||
| int pass_stride = task->buffer.pass_stride; | |||||
| int num_shifts = (2*r+1)*(2*r+1); | |||||
| int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; | |||||
| device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); | |||||
| device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); | |||||
| device_sub_ptr weightAccum(task->buffer.temporary_mem, 2*pass_stride*num_shifts, pass_stride); | |||||
| cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum); | |||||
| cl_mem difference_mem = CL_MEM_PTR(*difference); | |||||
| cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); | |||||
| cl_mem image_mem = CL_MEM_PTR(image_ptr); | |||||
| cl_mem guide_mem = CL_MEM_PTR(guide_ptr); | |||||
| cl_mem variance_mem = CL_MEM_PTR(variance_ptr); | |||||
| cl_mem out_mem = CL_MEM_PTR(out_ptr); | |||||
| cl_mem scale_mem = NULL; | |||||
| mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride); | |||||
| mem_zero_kernel(out_ptr, sizeof(float)*pass_stride); | |||||
| cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); | |||||
| cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); | |||||
| cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); | |||||
| cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); | |||||
| cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); | |||||
| kernel_set_args(ckNLMCalcDifference, 0, | |||||
| guide_mem, | |||||
| variance_mem, | |||||
| scale_mem, | |||||
| difference_mem, | |||||
| w, h, stride, | |||||
| pass_stride, | |||||
| r, channel_offset, | |||||
| 0, a, k_2); | |||||
| kernel_set_args(ckNLMBlur, 0, | |||||
| difference_mem, | |||||
| blurDifference_mem, | |||||
| w, h, stride, | |||||
| pass_stride, | |||||
| r, f); | |||||
| kernel_set_args(ckNLMCalcWeight, 0, | |||||
| blurDifference_mem, | |||||
| difference_mem, | |||||
| w, h, stride, | |||||
| pass_stride, | |||||
| r, f); | |||||
| kernel_set_args(ckNLMUpdateOutput, 0, | |||||
| blurDifference_mem, | |||||
| image_mem, | |||||
| out_mem, | |||||
| weightAccum_mem, | |||||
| w, h, stride, | |||||
| pass_stride, | |||||
| channel_offset, | |||||
| r, f); | |||||
| enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); | |||||
| enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); | |||||
| enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); | |||||
| enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); | |||||
| enqueue_kernel(ckNLMUpdateOutput, w*h, num_shifts, true); | |||||
| kernel_set_args(ckNLMNormalize, 0, | |||||
| out_mem, weightAccum_mem, w, h, stride); | |||||
| enqueue_kernel(ckNLMNormalize, w, h); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task) | |||||
| { | |||||
| cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); | |||||
| cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); | |||||
| cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); | |||||
| cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); | |||||
| char use_time = task->buffer.use_time? 1 : 0; | |||||
| cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); | |||||
| int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, | |||||
| buffer_mem, | |||||
| tile_info_mem); | |||||
| cl_mem buffers[9]; | |||||
| for(int i = 0; i < 9; i++) { | |||||
| buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); | |||||
| arg_ofs += kernel_set_args(ckFilterConstructTransform, | |||||
| arg_ofs, | |||||
| buffers[i]); | |||||
| } | |||||
| kernel_set_args(ckFilterConstructTransform, | |||||
| arg_ofs, | |||||
| transform_mem, | |||||
| rank_mem, | |||||
| task->filter_area, | |||||
| task->rect, | |||||
| task->buffer.pass_stride, | |||||
| task->buffer.frame_stride, | |||||
| use_time, | |||||
| task->radius, | |||||
| task->pca_threshold); | |||||
| enqueue_kernel(ckFilterConstructTransform, | |||||
| task->storage.w, | |||||
| task->storage.h, | |||||
| 256); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr, | |||||
| device_ptr color_variance_ptr, | |||||
| device_ptr scale_ptr, | |||||
| int frame, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| cl_mem color_mem = CL_MEM_PTR(color_ptr); | |||||
| cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); | |||||
| cl_mem scale_mem = CL_MEM_PTR(scale_ptr); | |||||
| cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); | |||||
| cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); | |||||
| cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); | |||||
| cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); | |||||
| cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); | |||||
| cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); | |||||
| cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); | |||||
| cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); | |||||
| cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); | |||||
| int w = task->reconstruction_state.source_w; | |||||
| int h = task->reconstruction_state.source_h; | |||||
| int stride = task->buffer.stride; | |||||
| int frame_offset = frame * task->buffer.frame_stride; | |||||
| int t = task->tile_info->frames[frame]; | |||||
| char use_time = task->buffer.use_time? 1 : 0; | |||||
| int r = task->radius; | |||||
| int pass_stride = task->buffer.pass_stride; | |||||
| int num_shifts = (2*r+1)*(2*r+1); | |||||
| device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); | |||||
| device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); | |||||
| cl_mem difference_mem = CL_MEM_PTR(*difference); | |||||
| cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); | |||||
| kernel_set_args(ckNLMCalcDifference, 0, | |||||
| color_mem, | |||||
| color_variance_mem, | |||||
| scale_mem, | |||||
| difference_mem, | |||||
| w, h, stride, | |||||
| pass_stride, | |||||
| r, | |||||
| pass_stride, | |||||
| frame_offset, | |||||
| 1.0f, task->nlm_k_2); | |||||
| kernel_set_args(ckNLMBlur, 0, | |||||
| difference_mem, | |||||
| blurDifference_mem, | |||||
| w, h, stride, | |||||
| pass_stride, | |||||
| r, 4); | |||||
| kernel_set_args(ckNLMCalcWeight, 0, | |||||
| blurDifference_mem, | |||||
| difference_mem, | |||||
| w, h, stride, | |||||
| pass_stride, | |||||
| r, 4); | |||||
| kernel_set_args(ckNLMConstructGramian, 0, | |||||
| t, | |||||
| blurDifference_mem, | |||||
| buffer_mem, | |||||
| transform_mem, | |||||
| rank_mem, | |||||
| XtWX_mem, | |||||
| XtWY_mem, | |||||
| task->reconstruction_state.filter_window, | |||||
| w, h, stride, | |||||
| pass_stride, | |||||
| r, 4, | |||||
| frame_offset, | |||||
| use_time); | |||||
| enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); | |||||
| enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); | |||||
| enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); | |||||
| enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); | |||||
| enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDevice::denoising_solve(device_ptr output_ptr, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); | |||||
| cl_mem output_mem = CL_MEM_PTR(output_ptr); | |||||
| cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); | |||||
| cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); | |||||
| cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); | |||||
| int w = task->reconstruction_state.source_w; | |||||
| int h = task->reconstruction_state.source_h; | |||||
| kernel_set_args(ckFinalize, 0, | |||||
| output_mem, | |||||
| rank_mem, | |||||
| XtWX_mem, | |||||
| XtWY_mem, | |||||
| task->filter_area, | |||||
| task->reconstruction_state.buffer_params, | |||||
| task->render_buffer.samples); | |||||
| enqueue_kernel(ckFinalize, w, h); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr, | |||||
| device_ptr b_ptr, | |||||
| device_ptr mean_ptr, | |||||
| device_ptr variance_ptr, | |||||
| int r, int4 rect, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| cl_mem a_mem = CL_MEM_PTR(a_ptr); | |||||
| cl_mem b_mem = CL_MEM_PTR(b_ptr); | |||||
| cl_mem mean_mem = CL_MEM_PTR(mean_ptr); | |||||
| cl_mem variance_mem = CL_MEM_PTR(variance_ptr); | |||||
| cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); | |||||
| kernel_set_args(ckFilterCombineHalves, 0, | |||||
| mean_mem, | |||||
| variance_mem, | |||||
| a_mem, | |||||
| b_mem, | |||||
| rect, | |||||
| r); | |||||
| enqueue_kernel(ckFilterCombineHalves, | |||||
| task->rect.z-task->rect.x, | |||||
| task->rect.w-task->rect.y); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr, | |||||
| device_ptr b_ptr, | |||||
| device_ptr sample_variance_ptr, | |||||
| device_ptr sv_variance_ptr, | |||||
| device_ptr buffer_variance_ptr, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| cl_mem a_mem = CL_MEM_PTR(a_ptr); | |||||
| cl_mem b_mem = CL_MEM_PTR(b_ptr); | |||||
| cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); | |||||
| cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); | |||||
| cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); | |||||
| cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); | |||||
| cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); | |||||
| int arg_ofs = kernel_set_args(ckFilterDivideShadow, 0, | |||||
| task->render_buffer.samples, | |||||
| tile_info_mem); | |||||
| cl_mem buffers[9]; | |||||
| for(int i = 0; i < 9; i++) { | |||||
| buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); | |||||
| arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, | |||||
| buffers[i]); | |||||
| } | |||||
| kernel_set_args(ckFilterDivideShadow, arg_ofs, | |||||
| a_mem, | |||||
| b_mem, | |||||
| sample_variance_mem, | |||||
| sv_variance_mem, | |||||
| buffer_variance_mem, | |||||
| task->rect, | |||||
| task->render_buffer.pass_stride, | |||||
| task->render_buffer.offset); | |||||
| enqueue_kernel(ckFilterDivideShadow, | |||||
| task->rect.z-task->rect.x, | |||||
| task->rect.w-task->rect.y); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDevice::denoising_get_feature(int mean_offset, | |||||
| int variance_offset, | |||||
| device_ptr mean_ptr, | |||||
| device_ptr variance_ptr, | |||||
| float scale, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| cl_mem mean_mem = CL_MEM_PTR(mean_ptr); | |||||
| cl_mem variance_mem = CL_MEM_PTR(variance_ptr); | |||||
| background = background_; | cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); | ||||
| cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); | |||||
| int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, | |||||
| task->render_buffer.samples, | |||||
| tile_info_mem); | |||||
| cl_mem buffers[9]; | |||||
| for(int i = 0; i < 9; i++) { | |||||
| buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); | |||||
| arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, | |||||
| buffers[i]); | |||||
| } | |||||
| kernel_set_args(ckFilterGetFeature, arg_ofs, | |||||
| mean_offset, | |||||
| variance_offset, | |||||
| mean_mem, | |||||
| variance_mem, | |||||
| scale, | |||||
| task->rect, | |||||
| task->render_buffer.pass_stride, | |||||
| task->render_buffer.offset); | |||||
| enqueue_kernel(ckFilterGetFeature, | |||||
| task->rect.z-task->rect.x, | |||||
| task->rect.w-task->rect.y); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDevice::denoising_write_feature(int out_offset, | |||||
| device_ptr from_ptr, | |||||
| device_ptr buffer_ptr, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| cl_mem from_mem = CL_MEM_PTR(from_ptr); | |||||
| cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); | |||||
| cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); | |||||
| kernel_set_args(ckFilterWriteFeature, 0, | |||||
| task->render_buffer.samples, | |||||
| task->reconstruction_state.buffer_params, | |||||
| task->filter_area, | |||||
| from_mem, | |||||
| buffer_mem, | |||||
| out_offset, | |||||
| task->rect); | |||||
| enqueue_kernel(ckFilterWriteFeature, | |||||
| task->filter_area.z, | |||||
| task->filter_area.w); | |||||
| return true; | |||||
| } | |||||
| bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr, | |||||
| device_ptr variance_ptr, | |||||
| device_ptr depth_ptr, | |||||
| device_ptr output_ptr, | |||||
| DenoisingTask *task) | |||||
| { | |||||
| cl_mem image_mem = CL_MEM_PTR(image_ptr); | |||||
| cl_mem variance_mem = CL_MEM_PTR(variance_ptr); | |||||
| cl_mem depth_mem = CL_MEM_PTR(depth_ptr); | |||||
| cl_mem output_mem = CL_MEM_PTR(output_ptr); | |||||
| cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); | |||||
| kernel_set_args(ckFilterDetectOutliers, 0, | |||||
| image_mem, | |||||
| variance_mem, | |||||
| depth_mem, | |||||
| output_mem, | |||||
| task->rect, | |||||
| task->buffer.pass_stride); | |||||
| enqueue_kernel(ckFilterDetectOutliers, | |||||
| task->rect.z-task->rect.x, | |||||
| task->rect.w-task->rect.y); | |||||
| return true; | |||||
| } | |||||
| void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask& denoising) | |||||
| { | |||||
| denoising.functions.construct_transform = function_bind(&OpenCLDevice::denoising_construct_transform, this, &denoising); | |||||
| denoising.functions.accumulate = function_bind(&OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); | |||||
| denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising); | |||||
| denoising.functions.divide_shadow = function_bind(&OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); | |||||
| denoising.functions.non_local_means = function_bind(&OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); | |||||
| denoising.functions.combine_halves = function_bind(&OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); | |||||
| denoising.functions.get_feature = function_bind(&OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); | |||||
| denoising.functions.write_feature = function_bind(&OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); | |||||
| denoising.functions.detect_outliers = function_bind(&OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); | |||||
| denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); | |||||
| denoising.render_buffer.samples = rtile.sample; | |||||
| denoising.buffer.gpu_temporary_mem = true; | |||||
| denoising.run_denoising(&rtile); | |||||
| } | |||||
| void OpenCLDevice::shader(DeviceTask& task) | |||||
| { | |||||
| /* cast arguments to cl types */ | |||||
| cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); | |||||
| cl_mem d_input = CL_MEM_PTR(task.shader_input); | |||||
| cl_mem d_output = CL_MEM_PTR(task.shader_output); | |||||
| cl_int d_shader_eval_type = task.shader_eval_type; | |||||
| cl_int d_shader_filter = task.shader_filter; | |||||
| cl_int d_shader_x = task.shader_x; | |||||
| cl_int d_shader_w = task.shader_w; | |||||
| cl_int d_offset = task.offset; | |||||
| cl_kernel kernel; | |||||
| if(task.shader_eval_type >= SHADER_EVAL_BAKE) { | |||||
| kernel = bake_program(ustring("bake")); | |||||
| } | |||||
| else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) { | |||||
| kernel = displace_program(ustring("displace")); | |||||
| } | |||||
| else { | |||||
| kernel = background_program(ustring("background")); | |||||
| } | |||||
| cl_uint start_arg_index = | |||||
| kernel_set_args(kernel, | |||||
| 0, | |||||
| d_data, | |||||
| d_input, | |||||
| d_output); | |||||
| set_kernel_arg_buffers(kernel, &start_arg_index); | |||||
| start_arg_index += kernel_set_args(kernel, | |||||
| start_arg_index, | |||||
| d_shader_eval_type); | |||||
| if(task.shader_eval_type >= SHADER_EVAL_BAKE) { | |||||
| start_arg_index += kernel_set_args(kernel, | |||||
| start_arg_index, | |||||
| d_shader_filter); | |||||
| } | |||||
| start_arg_index += kernel_set_args(kernel, | |||||
| start_arg_index, | |||||
| d_shader_x, | |||||
| d_shader_w, | |||||
| d_offset); | |||||
| for(int sample = 0; sample < task.num_samples; sample++) { | |||||
| if(task.get_cancel()) | |||||
| break; | |||||
| kernel_set_args(kernel, start_arg_index, sample); | |||||
| enqueue_kernel(kernel, task.shader_w, 1); | |||||
| clFinish(cqCommandQueue); | |||||
| task.update_progress(NULL); | |||||
| } | |||||
| } | |||||
| string OpenCLDevice::kernel_build_options(const string *debug_src) | |||||
| { | |||||
| string build_options = "-cl-no-signed-zeros -cl-mad-enable "; | |||||
| if(platform_name == "NVIDIA CUDA") { | |||||
| build_options += "-D__KERNEL_OPENCL_NVIDIA__ " | |||||
| "-cl-nv-maxrregcount=32 " | |||||
| "-cl-nv-verbose "; | |||||
| uint compute_capability_major, compute_capability_minor; | |||||
| clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, | |||||
| sizeof(cl_uint), &compute_capability_major, NULL); | |||||
| clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, | |||||
| sizeof(cl_uint), &compute_capability_minor, NULL); | |||||
| build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ", | |||||
| compute_capability_major * 100 + | |||||
| compute_capability_minor * 10); | |||||
| } | |||||
| else if(platform_name == "Apple") | |||||
| build_options += "-D__KERNEL_OPENCL_APPLE__ "; | |||||
| else if(platform_name == "AMD Accelerated Parallel Processing") | |||||
| build_options += "-D__KERNEL_OPENCL_AMD__ "; | |||||
| else if(platform_name == "Intel(R) OpenCL") { | |||||
| build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; | |||||
| /* Options for gdb source level kernel debugging. | |||||
| * this segfaults on linux currently. | |||||
| */ | |||||
| if(OpenCLInfo::use_debug() && debug_src) | |||||
| build_options += "-g -s \"" + *debug_src + "\" "; | |||||
| } | |||||
| if(info.has_half_images) { | |||||
| build_options += "-D__KERNEL_CL_KHR_FP16__ "; | |||||
| } | |||||
| if(OpenCLInfo::use_debug()) { | |||||
| build_options += "-D__KERNEL_OPENCL_DEBUG__ "; | |||||
| } | |||||
| #ifdef WITH_CYCLES_DEBUG | |||||
| build_options += "-D__KERNEL_DEBUG__ "; | |||||
| #endif | |||||
| return build_options; | |||||
| } | |||||
| /* TODO(sergey): In the future we can use variadic templates, once | |||||
| * C++0x is allowed. Should allow to clean this up a bit. | |||||
| */ | |||||
| int OpenCLDevice::kernel_set_args(cl_kernel kernel, | |||||
| int start_argument_index, | |||||
| const ArgumentWrapper& arg1, | |||||
| const ArgumentWrapper& arg2, | |||||
| const ArgumentWrapper& arg3, | |||||
| const ArgumentWrapper& arg4, | |||||
| const ArgumentWrapper& arg5, | |||||
| const ArgumentWrapper& arg6, | |||||
| const ArgumentWrapper& arg7, | |||||
| const ArgumentWrapper& arg8, | |||||
| const ArgumentWrapper& arg9, | |||||
| const ArgumentWrapper& arg10, | |||||
| const ArgumentWrapper& arg11, | |||||
| const ArgumentWrapper& arg12, | |||||
| const ArgumentWrapper& arg13, | |||||
| const ArgumentWrapper& arg14, | |||||
| const ArgumentWrapper& arg15, | |||||
| const ArgumentWrapper& arg16, | |||||
| const ArgumentWrapper& arg17, | |||||
| const ArgumentWrapper& arg18, | |||||
| const ArgumentWrapper& arg19, | |||||
| const ArgumentWrapper& arg20, | |||||
| const ArgumentWrapper& arg21, | |||||
| const ArgumentWrapper& arg22, | |||||
| const ArgumentWrapper& arg23, | |||||
| const ArgumentWrapper& arg24, | |||||
| const ArgumentWrapper& arg25, | |||||
| const ArgumentWrapper& arg26, | |||||
| const ArgumentWrapper& arg27, | |||||
| const ArgumentWrapper& arg28, | |||||
| const ArgumentWrapper& arg29, | |||||
| const ArgumentWrapper& arg30, | |||||
| const ArgumentWrapper& arg31, | |||||
| const ArgumentWrapper& arg32, | |||||
| const ArgumentWrapper& arg33) | |||||
| { | |||||
| int current_arg_index = 0; | |||||
| #define FAKE_VARARG_HANDLE_ARG(arg) \ | |||||
| do { \ | |||||
| if(arg.pointer != NULL) { \ | |||||
| opencl_assert(clSetKernelArg( \ | |||||
| kernel, \ | |||||
| start_argument_index + current_arg_index, \ | |||||
| arg.size, arg.pointer)); \ | |||||
| ++current_arg_index; \ | |||||
| } \ | |||||
| else { \ | |||||
| return current_arg_index; \ | |||||
| } \ | |||||
| } while(false) | |||||
| FAKE_VARARG_HANDLE_ARG(arg1); | |||||
| FAKE_VARARG_HANDLE_ARG(arg2); | |||||
| FAKE_VARARG_HANDLE_ARG(arg3); | |||||
| FAKE_VARARG_HANDLE_ARG(arg4); | |||||
| FAKE_VARARG_HANDLE_ARG(arg5); | |||||
| FAKE_VARARG_HANDLE_ARG(arg6); | |||||
| FAKE_VARARG_HANDLE_ARG(arg7); | |||||
| FAKE_VARARG_HANDLE_ARG(arg8); | |||||
| FAKE_VARARG_HANDLE_ARG(arg9); | |||||
| FAKE_VARARG_HANDLE_ARG(arg10); | |||||
| FAKE_VARARG_HANDLE_ARG(arg11); | |||||
| FAKE_VARARG_HANDLE_ARG(arg12); | |||||
| FAKE_VARARG_HANDLE_ARG(arg13); | |||||
| FAKE_VARARG_HANDLE_ARG(arg14); | |||||
| FAKE_VARARG_HANDLE_ARG(arg15); | |||||
| FAKE_VARARG_HANDLE_ARG(arg16); | |||||
| FAKE_VARARG_HANDLE_ARG(arg17); | |||||
| FAKE_VARARG_HANDLE_ARG(arg18); | |||||
| FAKE_VARARG_HANDLE_ARG(arg19); | |||||
| FAKE_VARARG_HANDLE_ARG(arg20); | |||||
| FAKE_VARARG_HANDLE_ARG(arg21); | |||||
| FAKE_VARARG_HANDLE_ARG(arg22); | |||||
| FAKE_VARARG_HANDLE_ARG(arg23); | |||||
| FAKE_VARARG_HANDLE_ARG(arg24); | |||||
| FAKE_VARARG_HANDLE_ARG(arg25); | |||||
| FAKE_VARARG_HANDLE_ARG(arg26); | |||||
| FAKE_VARARG_HANDLE_ARG(arg27); | |||||
| FAKE_VARARG_HANDLE_ARG(arg28); | |||||
| FAKE_VARARG_HANDLE_ARG(arg29); | |||||
| FAKE_VARARG_HANDLE_ARG(arg30); | |||||
| FAKE_VARARG_HANDLE_ARG(arg31); | |||||
| FAKE_VARARG_HANDLE_ARG(arg32); | |||||
| FAKE_VARARG_HANDLE_ARG(arg33); | |||||
| #undef FAKE_VARARG_HANDLE_ARG | |||||
| return current_arg_index; | |||||
| } | |||||
| void OpenCLDevice::release_kernel_safe(cl_kernel kernel) | |||||
| { | |||||
| if(kernel) { | |||||
| clReleaseKernel(kernel); | |||||
| } | |||||
| } | |||||
| void OpenCLDevice::release_mem_object_safe(cl_mem mem) | |||||
| { | |||||
| if(mem != NULL) { | |||||
| clReleaseMemObject(mem); | |||||
| } | |||||
| } | |||||
| void OpenCLDevice::release_program_safe(cl_program program) | |||||
| { | |||||
| if(program) { | |||||
| clReleaseProgram(program); | |||||
| } | |||||
| } | |||||
| /* ** Those guys are for workign around some compiler-specific bugs ** */ | |||||
| cl_program OpenCLDevice::load_cached_kernel( | |||||
| ustring key, | |||||
| thread_scoped_lock& cache_locker) | |||||
| { | |||||
| return OpenCLCache::get_program(cpPlatform, | |||||
| cdDevice, | |||||
| key, | |||||
| cache_locker); | |||||
| } | |||||
| void OpenCLDevice::store_cached_kernel( | |||||
| cl_program program, | |||||
| ustring key, | |||||
| thread_scoped_lock& cache_locker) | |||||
| { | |||||
| OpenCLCache::store_program(cpPlatform, | |||||
| cdDevice, | |||||
| program, | |||||
| key, | |||||
| cache_locker); | |||||
| } | } | ||||
| Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background) | Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background) | ||||
| { | { | ||||
| return new OpenCLDeviceSplitKernel(info, stats, profiler, background); | return new OpenCLDevice(info, stats, profiler, background); | ||||
| } | } | ||||
| CCL_NAMESPACE_END | CCL_NAMESPACE_END | ||||
| #endif /* WITH_OPENCL */ | #endif | ||||