Changeset View
Standalone View
intern/cycles/device/device_optix.cpp
(191 lines skipped)
 # endif
   OptixDenoiser denoiser = NULL;
   device_only_memory<unsigned char> denoiser_state;
   int denoiser_input_passes = 0;
  public:
   OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
       : CUDADevice(info_, stats_, profiler_, background_),
         sbt_data(this, "__sbt", MEM_READ_ONLY),
-        launch_params(this, "__params"),
-        denoiser_state(this, "__denoiser_state")
+        launch_params(this, "__params", MEM_READ_ONLY),
+        denoiser_state(this, "__denoiser_state", MEM_READ_WRITE)
   {
     // Store number of CUDA streams in device info
     info.cpu_threads = DebugFlags().optix.cuda_streams;
     // Make the CUDA context current
     if (!cuContext) {
       return;  // Do not initialize if CUDA context creation failed already
     }
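For context on the constructor changes above: each named device buffer now also carries an allocation-policy argument. Elsewhere in this diff the third argument is a MemoryType tag (MEM_READ_ONLY / MEM_READ_WRITE) on some buffers and a boolean on device_only_memory; judging by the "not allowed to be on host" comment added later in the change, the boolean appears to control whether an allocation may fall back to host memory. The snippet below is a minimal, self-contained sketch of the policy such arguments can express. Allocation, alloc_with_policy and fake_device_alloc are invented for illustration and are not the Cycles API from device_memory.h.

#include <cstddef>
#include <cstdlib>

enum MemoryType { MEM_READ_ONLY, MEM_READ_WRITE };

struct Allocation {
  void *pointer = nullptr;
  bool on_host = false;  /* True when the buffer fell back to host memory */
};

/* Stand-in for cuMemAlloc: returns nullptr to simulate exhausted VRAM. */
static void *fake_device_alloc(size_t size, bool out_of_memory)
{
  return out_of_memory ? nullptr : std::malloc(size);
}

/* Invented helper mirroring the policy the new arguments express: the access
 * type is a hint to the backend, and allow_host_fallback decides whether an
 * allocation failure aborts or degrades to slower host-visible memory. */
static Allocation alloc_with_policy(size_t size, MemoryType type, bool allow_host_fallback)
{
  (void)type;  /* A real backend could route MEM_READ_ONLY data differently */
  Allocation alloc;
  alloc.pointer = fake_device_alloc(size, /*out_of_memory=*/true);
  if (!alloc.pointer && allow_host_fallback) {
    alloc.pointer = std::malloc(size);  /* Stand-in for pinned host memory */
    alloc.on_host = alloc.pointer != nullptr;
  }
  return alloc;
}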
(663 lines skipped)
 // within: if (task.denoising.type == DENOISER_OPTIX) {
         (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
         (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};
     // Start with the current tile pointer offset
     int input_stride = pixel_stride;
     device_ptr input_ptr = rtile.buffer + pixel_offset;
     // Copy tile data into a common buffer if necessary
-    device_only_memory<float> input(this, "denoiser input");
-    device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_WRITE);
+    device_only_memory<float> input(this, "denoiser input", true);
+    device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY);
     bool contiguous_memory = true;
     for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
       if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) {
         contiguous_memory = false;
       }
     }
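For context, the loop above is the decision point for the gather step: the OptiX denoiser expects one contiguous input, so a copy pass is only needed when some neighboring tile was rendered into a buffer other than the center tile's. Below is a self-contained sketch of that check, with invented names (Tile, tiles_are_contiguous) rather than the real RenderTileNeighbors API:

#include <vector>

struct Tile {
  float *buffer = nullptr;  // Per-tile render buffer; may alias the center tile's
};

// Returns true when every valid neighbor already shares the center buffer,
// so the denoiser can read the render buffer in place using a stride.
static bool tiles_are_contiguous(const std::vector<Tile> &neighbors, const float *center_buffer)
{
  for (const Tile &tile : neighbors) {
    if (tile.buffer && tile.buffer != center_buffer) {
      return false;  // This neighbor lives elsewhere: gather into a common buffer
    }
  }
  return true;
}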
(28 lines skipped)
 // within: if (task.denoising.type == DENOISER_OPTIX) {
       tile_info_mem.copy_to_device();
       void *args[] = {
           &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
       launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);
     }
 #  if OPTIX_DENOISER_NO_PIXEL_STRIDE
-    device_only_memory<float> input_rgb(this, "denoiser input rgb");
+    device_only_memory<float> input_rgb(this, "denoiser input rgb", true);
     input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);
     void *input_args[] = {&input_rgb.device_pointer,
                           &input_ptr,
                           &rect_size.x,
                           &rect_size.y,
                           &input_stride,
                           &task.pass_stride,
(205 lines skipped)
 # endif
     }
   }
   bool build_optix_bvh(BVHOptiX *bvh,
                        OptixBuildOperation operation,
                        const OptixBuildInput &build_input,
                        uint16_t num_motion_steps)
   {
+    /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
+     * from running out of memory (since both original and compacted acceleration structure memory
+     * may be allocated at the same time for the duration of this function). The builds would
+     * otherwise happen on the same CUDA stream anyway. */
+    static thread_mutex mutex;
+    thread_scoped_lock lock(mutex);
     const CUDAContextScope scope(cuContext);
     // Compute memory usage
     OptixAccelBufferSizes sizes = {};
     OptixAccelBuildOptions options;
     options.operation = operation;
     if (background) {
       // Prefer best performance and lowest memory consumption in background
       options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
     }
     else {
       // Prefer fast updates in viewport
       options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
     }
     options.motionOptions.numKeys = num_motion_steps;
     options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
     options.motionOptions.timeBegin = 0.0f;
     options.motionOptions.timeEnd = 1.0f;
     check_result_optix_ret(
         optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
     // Allocate required output buffers
-    device_only_memory<char> temp_mem(this, "optix temp as build mem");
+    device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
     temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
     if (!temp_mem.device_pointer)
       return false;  // Make sure temporary memory allocation succeeded
+    // Acceleration structure memory has to be allocated on the device (not allowed to be on host)
     device_only_memory<char> &out_data = bvh->as_data;
     if (operation == OPTIX_BUILD_OPERATION_BUILD) {
       assert(out_data.device == this);
       out_data.alloc_to_device(sizes.outputSizeInBytes);
       if (!out_data.device_pointer)
         return false;
     }
     else {
(31 lines skipped)
 // within: if (background) {
       check_result_cuda_ret(
           cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
       // Temporary memory is no longer needed, so free it now to make space
       temp_mem.free();
       // There is no point compacting if the size does not change
       if (compacted_size < sizes.outputSizeInBytes) {
-        device_only_memory<char> compacted_data(this, "optix compacted as");
+        device_only_memory<char> compacted_data(this, "optix compacted as", false);
         compacted_data.alloc_to_device(compacted_size);
         if (!compacted_data.device_pointer)
           // Do not compact if memory allocation for compacted acceleration structure fails
           // Can just use the uncompacted one then, so succeed here regardless
           return true;
         check_result_optix_ret(optixAccelCompact(context,
                                                  NULL,
                                                  out_handle,
                                                  compacted_data.device_pointer,
                                                  compacted_size,
                                                  &out_handle));
         bvh->traversable_handle = static_cast<uint64_t>(out_handle);
         // Wait for compaction to finish
         check_result_cuda_ret(cuStreamSynchronize(NULL));
         std::swap(out_data.device_size, compacted_data.device_size);
         std::swap(out_data.device_pointer, compacted_data.device_pointer);
+        // Original acceleration structure memory is freed when 'compacted_data' goes out of scope
       }
     }
     return true;
   }
   void build_bvh(BVH *bvh, Progress &progress, bool refit) override
   {
(621 lines skipped)
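For readers unfamiliar with the compaction dance in build_optix_bvh() above, here is a condensed sketch of the standard OptiX 7 flow it implements: compute sizes, build while emitting an OPTIX_PROPERTY_TYPE_COMPACTED_SIZE property, then compact only when that actually saves memory. It uses the raw OptiX/CUDA driver API instead of Cycles' wrappers and omits the real function's error handling, alignment padding, refit path and motion options; treat it as an assumption-laden illustration, not the patch's code. It assumes a valid OptixDeviceContext and a filled-in OptixBuildInput.

#include <optix.h>
#include <cuda.h>
#include <cstddef>

static OptixTraversableHandle build_and_compact(OptixDeviceContext context,
                                                const OptixBuildInput &build_input)
{
  OptixAccelBuildOptions options = {};
  options.operation = OPTIX_BUILD_OPERATION_BUILD;
  options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;

  // 1. Ask OptiX how much temporary and output memory the build needs
  OptixAccelBufferSizes sizes = {};
  optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes);

  CUdeviceptr temp_mem = 0, out_mem = 0, compacted_size_ptr = 0;
  cuMemAlloc(&temp_mem, sizes.tempSizeInBytes);
  cuMemAlloc(&out_mem, sizes.outputSizeInBytes);
  cuMemAlloc(&compacted_size_ptr, sizeof(size_t));

  // 2. Build, asking OptiX to also write the compacted size to device memory
  OptixAccelEmitDesc compacted_size_prop = {};
  compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
  compacted_size_prop.result = compacted_size_ptr;

  OptixTraversableHandle handle = 0;
  optixAccelBuild(context, 0 /*stream*/, &options, &build_input, 1,
                  temp_mem, sizes.tempSizeInBytes,
                  out_mem, sizes.outputSizeInBytes,
                  &handle, &compacted_size_prop, 1);
  cuStreamSynchronize(0);

  size_t compacted_size = 0;
  cuMemcpyDtoH(&compacted_size, compacted_size_ptr, sizeof(compacted_size));
  cuMemFree(temp_mem);  // Temporary build memory is no longer needed
  cuMemFree(compacted_size_ptr);

  // 3. Only compact when it actually saves memory (the same check as in the diff)
  if (compacted_size < sizes.outputSizeInBytes) {
    CUdeviceptr compacted_mem = 0;
    if (cuMemAlloc(&compacted_mem, compacted_size) == CUDA_SUCCESS) {
      optixAccelCompact(context, 0 /*stream*/, handle, compacted_mem, compacted_size, &handle);
      cuStreamSynchronize(0);  // Wait for the compaction copy before freeing the original
      cuMemFree(out_mem);      // The uncompacted acceleration structure can now be released
      out_mem = compacted_mem;
    }
  }
  return handle;  // Note: out_mem must stay allocated for as long as 'handle' is in use
}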