intern/cycles/device/device_cuda.cpp
[… 15 lines not shown …]
 #include <climits>
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "device/device.h"
+#include "device/device_denoising.h"
 #include "device/device_intern.h"
 #include "device/device_split_kernel.h"
 #include "render/buffers.h"
+#include "filter/filter_defines.h"
 #ifdef WITH_CUDA_DYNLOAD
 #  include "cuew.h"
 #else
 #  include "util/util_opengl.h"
 #  include <cuda.h>
 #  include <cudaGL.h>
 #endif
 #include "util/util_debug.h"
[… 71 lines not shown …]
 };
 class CUDADevice : public Device
 {
 public:
 	DedicatedTaskPool task_pool;
 	CUdevice cuDevice;
 	CUcontext cuContext;
-	CUmodule cuModule;
+	CUmodule cuModule, cuFilterModule;
 	map<device_ptr, bool> tex_interp_map;
 	map<device_ptr, uint> tex_bindless_map;
 	int cuDevId;
 	int cuDevArchitecture;
 	bool first_error;
 	struct PixelMem {
 		GLuint cuPBO;
[… 166 lines not shown …] #define cuda_error(stmt) cuda_error_(stmt, #stmt)
 	{
 		return DebugFlags().cuda.split_kernel;
 	}
 	/* Common NVCC flags which stays the same regardless of shading model,
 	 * kernel sources md5 and only depends on compiler or compilation settings.
 	 */
 	string compile_kernel_get_common_cflags(
-	        const DeviceRequestedFeatures& requested_features, bool split=false)
+	        const DeviceRequestedFeatures& requested_features,
+	        bool filter=false, bool split=false)
 	{
 		const int cuda_version = cuewCompilerVersion();
 		const int machine = system_cpu_bits();
 		const string kernel_path = path_get("source/kernel");
 		const string include = path_dirname(kernel_path);
 		string cflags = string_printf("-m%d "
 		                              "--ptxas-options=\"-v\" "
 		                              "--use_fast_math "
 		                              "-DNVCC "
 		                              "-D__KERNEL_CUDA_VERSION__=%d "
 		                              "-I\"%s\"",
 		                              machine,
 		                              cuda_version,
 		                              include.c_str());
-		if(use_adaptive_compilation()) {
+		if(!filter && use_adaptive_compilation()) {
 			cflags += " " + requested_features.get_build_options();
 		}
 		const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
 		if(extra_cflags) {
 			cflags += string(" ") + string(extra_cflags);
 		}
 #ifdef WITH_CYCLES_DEBUG
 		cflags += " -D__KERNEL_DEBUG__";
[… 31 lines not shown …] bool compile_check_compiler() {
 		else if(cuda_version != 80) {
 			printf("CUDA version %d.%d detected, build may succeed but only "
 			       "CUDA 8.0 is officially supported.\n",
 			       major, minor);
 		}
 		return true;
 	}
-	string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
+	string compile_kernel(const DeviceRequestedFeatures& requested_features,
+	                      bool filter=false, bool split=false)
 	{
 		const char *name, *source;
-		if(split) {
+		if(filter) {
+			name = "kernel_filter";
+			source = "filter.cu";
+		}
+		else if(split) {
 			name = "kernel_split";
 			source = "kernel_split.cu";
 		}
 		else {
 			name = "kernel";
 			source = "kernel.cu";
 		}
 		/* Compute cubin name. */
 		int major, minor;
 		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
 		cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
 		/* Attempt to use kernel provided with Blender. */
 		if(!use_adaptive_compilation()) {
 			const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
 			                                            name, major, minor));
 			VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
 			if(path_exists(cubin)) {
 				VLOG(1) << "Using precompiled kernel.";
 				return cubin;
 			}
 		}
 		const string common_cflags =
-		        compile_kernel_get_common_cflags(requested_features, split);
+		        compile_kernel_get_common_cflags(requested_features, filter, split);
 		/* Try to use locally compiled kernel. */
 		const string kernel_path = path_get("source/kernel");
 		const string kernel_md5 = path_files_md5_hash(kernel_path);
 		/* We include cflags into md5 so changing cuda toolkit or changing other
 		 * compiler command line arguments makes sure cubin gets re-built.
 		 */
[… 77 lines not shown …] bool load_kernels(const DeviceRequestedFeatures& requested_features)
 		if(cuContext == 0)
 			return false;
 		/* check if GPU is supported */
 		if(!support_device(requested_features))
 			return false;
 		/* get kernel */
-		string cubin = compile_kernel(requested_features, use_split_kernel());
+		string cubin = compile_kernel(requested_features, false, use_split_kernel());
 		if(cubin == "")
 			return false;
+		string filter_cubin = compile_kernel(requested_features, true, false);
+		if(filter_cubin == "")
+			return false;
 		/* open module */
 		cuda_push_context();
 		string cubin_data;
 		CUresult result;
 		if(path_read_text(cubin, cubin_data))
 			result = cuModuleLoadData(&cuModule, cubin_data.c_str());
 		else
 			result = CUDA_ERROR_FILE_NOT_FOUND;
 		if(cuda_error_(result, "cuModuleLoad"))
 			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
+		if(path_read_text(filter_cubin, cubin_data))
+			result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+		else
+			result = CUDA_ERROR_FILE_NOT_FOUND;
+		if(cuda_error_(result, "cuModuleLoad"))
+			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
 		cuda_pop_context();
 		return (result == CUDA_SUCCESS);
 	}
 	void load_bindless_mapping()
 	{
 		if(info.has_bindless_textures && need_bindless_mapping) {
[… 66 lines not shown …] if(mem.device_pointer) {
 			mem.device_pointer = 0;
 			stats.mem_free(mem.device_size);
 			mem.device_size = 0;
 		}
 	}
+	virtual device_ptr mem_get_offset_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+	{
+		return (device_ptr) (((char*) mem.device_pointer) + mem.memory_num_to_bytes(offset));
+	}
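
Aside (not part of the patch): mem_get_offset_ptr is plain pointer arithmetic, converting an element offset into a byte offset before adding it to the device pointer, so callers can address a sub-range of an existing allocation without a copy. A minimal standalone sketch of the same arithmetic, assuming for illustration that one element is sizeof(float) bytes:

    #include <cstdint>
    #include <cstdio>

    typedef uint64_t example_device_ptr;  /* stand-in for Cycles' device_ptr */

    /* Same idea as mem_get_offset_ptr: element offset -> byte offset -> new pointer. */
    static example_device_ptr offset_ptr(example_device_ptr base, int offset, size_t elem_bytes)
    {
        return base + (example_device_ptr)offset * elem_bytes;
    }

    int main()
    {
        example_device_ptr base = 0x10000;
        /* Element 12 of a float buffer starts 48 bytes past the base pointer. */
        printf("0x%llx\n", (unsigned long long)offset_ptr(base, 12, sizeof(float)));
        return 0;
    }
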
 	void const_copy_to(const char *name, void *host, size_t size)
 	{
 		CUdeviceptr mem;
 		size_t bytes;
 		cuda_push_context();
 		cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
 		//assert(bytes == size);
[… 284 lines not shown …] if(mem.device_pointer) {
 			}
 			else {
 				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
 				mem_free(mem);
 			}
 		}
 	}
+	bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
+	{
+		mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+		TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+		for(int i = 0; i < 9; i++) {
+			tiles->buffers[i] = buffers[i];
+		}
+		mem_copy_to(task->tiles_mem);
+		return !have_error();
+	}
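
Aside (not part of the patch): the nine buffer pointers cover the 3×3 block of neighbouring tiles that the denoiser reads across tile borders; the patch itself only fixes index 4 as the centre tile (see rtiles[4] = rtile in denoise() further down). A standalone sketch of the index arithmetic, under the assumption of row-major 3×3 ordering:

    #include <cstdio>

    int main()
    {
        /* Assumed layout: buffers[i] belongs to the neighbour at offset
         * (nx, ny) with nx = i % 3 - 1 and ny = i / 3 - 1. */
        for(int i = 0; i < 9; i++) {
            int nx = i % 3 - 1;
            int ny = i / 3 - 1;
            printf("buffers[%d] -> tile offset (%+d, %+d)%s\n",
                   i, nx, ny, (i == 4) ? "  <- centre tile being denoised" : "");
        }
        return 0;
    }
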
+#define CUDA_GET_BLOCKSIZE(func, w, h) \
+	int threads_per_block; \
+	cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+	int threads = (int)sqrt((float)threads_per_block); \
+	int xblocks = ((w) + threads - 1)/threads; \
+	int yblocks = ((h) + threads - 1)/threads;
+#define CUDA_LAUNCH_KERNEL(func, args) \
+	cuda_assert(cuLaunchKernel(func, \
+	                           xblocks, yblocks, 1, \
+	                           threads, threads, 1, \
+	                           0, 0, args, 0));
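
Aside (not part of the patch): CUDA_GET_BLOCKSIZE picks a square thread block whose side length is the integer square root of the kernel's CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, and CUDA_LAUNCH_KERNEL then covers the w×h filter window with a ceiling division. A standalone sketch of that arithmetic with no CUDA dependency; the limit of 1024 threads per block is only an assumed device value:

    #include <cmath>
    #include <cstdio>

    /* Same geometry as CUDA_GET_BLOCKSIZE / CUDA_LAUNCH_KERNEL, as a plain function. */
    static void launch_geometry(int max_threads_per_block, int w, int h)
    {
        int threads = (int)sqrt((float)max_threads_per_block);  /* side of a square block */
        int xblocks = (w + threads - 1) / threads;              /* ceil(w / threads) */
        int yblocks = (h + threads - 1) / threads;              /* ceil(h / threads) */
        printf("%dx%d window -> block %dx%d, grid %dx%d\n",
               w, h, threads, threads, xblocks, yblocks);
    }

    int main()
    {
        launch_geometry(1024, 640, 480);  /* -> block 32x32, grid 20x15 */
        return 0;
    }
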
+	bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+	                               DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+		cuda_push_context();
+		int4 rect = task->rect;
+		int w = align_up(rect.z-rect.x, 4);
+		int h = rect.w-rect.y;
+		int r = task->nlm_state.r;
+		int f = task->nlm_state.f;
+		float a = task->nlm_state.a;
+		float k_2 = task->nlm_state.k_2;
+		CUdeviceptr difference = task->nlm_state.temporary_1_ptr;
+		CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
+		CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
+		cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
+		cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
+		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+		cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+		cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+		cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
+		int dx, dy;
+		int4 local_rect;
+		int channel_offset = 0;
+		void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
+		void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
+		void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f};
+		void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+			dy = i / (2*r+1) - r;
+			dx = i % (2*r+1) - r;
+			local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+			CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
+		}
+		local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
+		void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
+		CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+		cuda_assert(cuCtxSynchronize());
+		cuda_pop_context();
+		return !have_error();
+	}
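
Aside (not part of the patch): the loop above flattens the (2r+1)×(2r+1) non-local-means search window into a single index so that every pixel offset (dx, dy) gets its own difference/blur/weight/accumulate pass. A standalone sketch of just that index mapping; r = 2 is chosen only for illustration:

    #include <cstdio>

    int main()
    {
        const int r = 2;  /* search radius, illustration only */
        for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
            int dy = i / (2*r+1) - r;  /* row offset in [-r, r] */
            int dx = i % (2*r+1) - r;  /* column offset in [-r, r] */
            printf("i=%2d -> dx=%+d dy=%+d\n", i, dx, dy);
        }
        return 0;
    }
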
+	bool denoising_construct_transform(DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+		cuda_push_context();
+		CUfunction cuFilterConstructTransform;
+		cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+		CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
+		                   task->storage.w,
+		                   task->storage.h);
+		void *args[] = {&task->render_buffer.samples,
+		                &task->buffer.mem.device_pointer,
+		                &task->storage.transform.device_pointer,
+		                &task->storage.rank.device_pointer,
+		                &task->filter_area,
+		                &task->rect,
+		                &task->radius,
+		                &task->relative_pca,
+		                &task->buffer.pass_stride};
+		CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+		cuda_assert(cuCtxSynchronize());
+		cuda_pop_context();
+		return !have_error();
+	}
+	bool denoising_reconstruct(device_ptr color_ptr,
+	                           device_ptr color_variance_ptr,
+	                           device_ptr guide_ptr,
+	                           device_ptr guide_variance_ptr,
+	                           device_ptr output_ptr,
+	                           DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+		mem_zero(task->storage.XtWX);
+		mem_zero(task->storage.XtWY);
+		cuda_push_context();
+		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+		cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+		cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+		cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+		cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
+		                   task->reconstruction_state.source_w,
+		                   task->reconstruction_state.source_h);
+		CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr;
+		CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
+		int r = task->radius;
+		int f = 4;
+		float a = 1.0f;
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+			int dy = i / (2*r+1) - r;
+			int dx = i % (2*r+1) - r;
+			int local_rect[4] = {max(0, -dx), max(0, -dy),
+			                     task->reconstruction_state.source_w - max(0, dx),
+			                     task->reconstruction_state.source_h - max(0, dy)};
+			void *calc_difference_args[] = {&dx, &dy,
+			                                &guide_ptr,
+			                                &guide_variance_ptr,
+			                                &difference,
+			                                &local_rect,
+			                                &task->buffer.w,
+			                                &task->buffer.pass_stride,
+			                                &a,
+			                                &task->nlm_k_2};
+			CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+			void *blur_args[] = {&difference,
+			                     &blurDifference,
+			                     &local_rect,
+			                     &task->buffer.w,
+			                     &f};
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+			void *calc_weight_args[] = {&blurDifference,
+			                            &difference,
+			                            &local_rect,
+			                            &task->buffer.w,
+			                            &f};
+			CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+			/* Reuse previous arguments. */
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+			void *construct_gramian_args[] = {&dx, &dy,
+			                                  &blurDifference,
+			                                  &task->buffer.mem.device_pointer,
+			                                  &color_ptr,
+			                                  &color_variance_ptr,
+			                                  &task->storage.transform.device_pointer,
+			                                  &task->storage.rank.device_pointer,
+			                                  &task->storage.XtWX.device_pointer,
+			                                  &task->storage.XtWY.device_pointer,
+			                                  &local_rect,
+			                                  &task->reconstruction_state.filter_rect,
+			                                  &task->buffer.w,
+			                                  &task->buffer.h,
+			                                  &f,
+			                                  &task->buffer.pass_stride};
+			CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
+		}
+		void *finalize_args[] = {&task->buffer.w,
+		                         &task->buffer.h,
+		                         &output_ptr,
+		                         &task->storage.rank.device_pointer,
+		                         &task->storage.XtWX.device_pointer,
+		                         &task->storage.XtWY.device_pointer,
+		                         &task->filter_area,
+		                         &task->reconstruction_state.buffer_params.x,
+		                         &task->render_buffer.samples};
+		CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+		cuda_assert(cuCtxSynchronize());
+		cuda_pop_context();
+		return !have_error();
+	}
+	bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+	                              device_ptr mean_ptr, device_ptr variance_ptr,
+	                              int r, int4 rect, DenoisingTask *task)
+	{
+		(void) task;
+		if(have_error())
+			return false;
+		cuda_push_context();
+		CUfunction cuFilterCombineHalves;
+		cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+		void *args[] = {&mean_ptr,
+		                &variance_ptr,
+		                &a_ptr,
+		                &b_ptr,
+		                &rect,
+		                &r};
+		CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+		cuda_assert(cuCtxSynchronize());
+		cuda_pop_context();
+		return !have_error();
+	}
+	bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+	                             device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+	                             device_ptr buffer_variance_ptr, DenoisingTask *task)
+	{
+		(void) task;
+		if(have_error())
+			return false;
+		cuda_push_context();
+		CUfunction cuFilterDivideShadow;
+		cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+		void *args[] = {&task->render_buffer.samples,
+		                &task->tiles_mem.device_pointer,
+		                &a_ptr,
+		                &b_ptr,
+		                &sample_variance_ptr,
+		                &sv_variance_ptr,
+		                &buffer_variance_ptr,
+		                &task->rect,
+		                &task->render_buffer.pass_stride,
+		                &task->render_buffer.denoising_data_offset,
+		                &task->use_split_variance};
+		CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+		cuda_assert(cuCtxSynchronize());
+		cuda_pop_context();
+		return !have_error();
+	}
+	bool denoising_get_feature(int mean_offset,
+	                           int variance_offset,
+	                           device_ptr mean_ptr,
+	                           device_ptr variance_ptr,
+	                           DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+		cuda_push_context();
+		CUfunction cuFilterGetFeature;
+		cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+		void *args[] = {&task->render_buffer.samples,
+		                &task->tiles_mem.device_pointer,
+		                &mean_offset,
+		                &variance_offset,
+		                &mean_ptr,
+		                &variance_ptr,
+		                &task->rect,
+		                &task->render_buffer.pass_stride,
+		                &task->render_buffer.denoising_data_offset,
+		                &task->use_split_variance};
+		CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+		cuda_assert(cuCtxSynchronize());
+		cuda_pop_context();
+		return !have_error();
+	}
+	void denoise(RenderTile &rtile, const DeviceTask &task)
+	{
+		DenoisingTask denoising(this);
+		denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
+		denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+		denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+		denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+		denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
+		denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+		denoising.render_buffer.samples = rtile.sample;
+		RenderTile rtiles[9];
+		rtiles[4] = rtile;
+		task.get_neighbor_tiles(rtiles, this);
+		denoising.tiles_from_rendertiles(rtiles);
+		denoising.init_from_devicetask(task);
+		denoising.run_denoising();
+		task.release_neighbor_tiles(rtiles, this);
+	}
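
Aside (not part of the patch): the function_bind calls register the CUDA implementations as callbacks on the device-agnostic DenoisingTask; the placeholders _1.._6 are filled in with the arguments the task supplies when it invokes each step. A minimal, Cycles-independent sketch of the same pattern using std::function and std::bind; all names below are invented for illustration:

    #include <cstdio>
    #include <functional>

    using namespace std::placeholders;

    struct ExampleTask {
        /* The task only knows the callback signature, not the device type. */
        std::function<bool(int, int)> get_feature;
    };

    struct ExampleDevice {
        bool get_feature_impl(int mean_offset, int variance_offset, ExampleTask *task)
        {
            printf("device-specific kernel for offsets %d/%d (task %p)\n",
                   mean_offset, variance_offset, (void*)task);
            return true;
        }
    };

    int main()
    {
        ExampleDevice device;
        ExampleTask task;
        /* Bind the member function; _1/_2 are supplied later by the task. */
        task.get_feature = std::bind(&ExampleDevice::get_feature_impl, &device, _1, _2, &task);
        task.get_feature(3, 4);
        return 0;
    }
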
 	void path_trace(RenderTile& rtile, int sample, bool branched)
 	{
 		if(have_error())
 			return;
 		cuda_push_context();
 		CUfunction cuPathTrace;
[… 981 lines not shown …]