Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/device/device_optix.cpp
| Show All 22 Lines | |||||
| # include "device/device_denoising.h" | # include "device/device_denoising.h" | ||||
| # include "device/device_intern.h" | # include "device/device_intern.h" | ||||
| # include "render/buffers.h" | # include "render/buffers.h" | ||||
| # include "render/hair.h" | # include "render/hair.h" | ||||
| # include "render/mesh.h" | # include "render/mesh.h" | ||||
| # include "render/object.h" | # include "render/object.h" | ||||
| # include "render/scene.h" | # include "render/scene.h" | ||||
| # include "util/util_debug.h" | # include "util/util_debug.h" | ||||
| # include "util/util_foreach.h" | |||||
| # include "util/util_logging.h" | # include "util/util_logging.h" | ||||
| # include "util/util_md5.h" | # include "util/util_md5.h" | ||||
| # include "util/util_path.h" | # include "util/util_path.h" | ||||
| # include "util/util_progress.h" | # include "util/util_progress.h" | ||||
| # include "util/util_time.h" | # include "util/util_time.h" | ||||
| # ifdef WITH_CUDA_DYNLOAD | # ifdef WITH_CUDA_DYNLOAD | ||||
| # include <cuew.h> | # include <cuew.h> | ||||
| ▲ Show 20 Lines • Show All 660 Lines • ▼ Show 20 Lines | void thread_run(DeviceTask &task, int thread_index) // Main task entry point | ||||
| if (task.type == DeviceTask::RENDER) { | if (task.type == DeviceTask::RENDER) { | ||||
| if (thread_index != 0) { | if (thread_index != 0) { | ||||
| // Only execute denoising in a single thread (see also 'task_add') | // Only execute denoising in a single thread (see also 'task_add') | ||||
| task.tile_types &= ~RenderTile::DENOISE; | task.tile_types &= ~RenderTile::DENOISE; | ||||
| } | } | ||||
| RenderTile tile; | RenderTile tile; | ||||
| while (task.acquire_tile(this, tile, task.tile_types)) { | while (task.acquire_tile(this, tile, task.tile_types)) { | ||||
| if (tile.task == RenderTile::PATH_TRACE) | if (tile.get_task() == RenderTile::PATH_TRACE) | ||||
| launch_render(task, tile, thread_index); | launch_render(task, tile, thread_index); | ||||
| else if (tile.task == RenderTile::BAKE) { | else if (tile.get_task() == RenderTile::BAKE) { | ||||
| // Perform baking using CUDA, since it is not currently implemented in OptiX | // Perform baking using CUDA, since it is not currently implemented in OptiX | ||||
| device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); | device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); | ||||
| CUDADevice::render(task, tile, work_tiles); | CUDADevice::render(task, tile, work_tiles); | ||||
| } | } | ||||
| else if (tile.task == RenderTile::DENOISE) | else if (tile.get_task() == RenderTile::DENOISE) | ||||
| launch_denoise(task, tile); | launch_denoise(task, tile); | ||||
| task.release_tile(tile); | task.release_tile(tile); | ||||
| if (task.get_cancel() && !task.need_finish_queue) | if (task.get_cancel() && !task.need_finish_queue) | ||||
| break; // User requested cancellation | break; // User requested cancellation | ||||
| else if (have_error()) | else if (have_error()) | ||||
| break; // Abort rendering when encountering an error | break; // Abort rendering when encountering an error | ||||
| } | } | ||||
| } | } | ||||
| else if (task.type == DeviceTask::SHADER) { | else if (task.type == DeviceTask::SHADER) { | ||||
| launch_shader_eval(task, thread_index); | launch_shader_eval(task, thread_index); | ||||
| } | } | ||||
| else if (task.type == DeviceTask::DENOISE_BUFFER) { | else if (task.type == DeviceTask::DENOISE_BUFFER) { | ||||
| // Set up a single tile that covers the whole task and denoise it | // Set up a single tile that covers the whole task and denoise it | ||||
| RenderTile tile; | RenderTile tile = RenderTile::from_device_task(task, false); | ||||
| tile.x = task.x; | |||||
| tile.y = task.y; | |||||
| tile.w = task.w; | |||||
| tile.h = task.h; | |||||
| tile.buffer = task.buffer; | |||||
| tile.num_samples = task.num_samples; | |||||
| tile.start_sample = task.sample; | |||||
| tile.offset = task.offset; | |||||
| tile.stride = task.stride; | |||||
| tile.buffers = task.buffers; | |||||
| launch_denoise(task, tile); | launch_denoise(task, tile); | ||||
| } | } | ||||
| } | } | ||||
| void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index) | void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index) | ||||
| { | { | ||||
| assert(thread_index < launch_params.data_size); | assert(thread_index < launch_params.data_size); | ||||
| // Keep track of total render time of this tile | // Keep track of total render time of this tile | ||||
| const scoped_timer timer(&rtile.buffers->render_time); | const scoped_timer timer(&rtile.get_buffers()->get_render_time()); | ||||
| WorkTile wtile; | WorkTile wtile = rtile.work_tile(); | ||||
| wtile.x = rtile.x; | |||||
| wtile.y = rtile.y; | |||||
| wtile.w = rtile.w; | |||||
| wtile.h = rtile.h; | |||||
| wtile.offset = rtile.offset; | |||||
| wtile.stride = rtile.stride; | |||||
| wtile.buffer = (float *)rtile.buffer; | |||||
| const int end_sample = rtile.start_sample + rtile.num_samples; | const int end_sample = rtile.get_start_sample() + rtile.get_num_samples(); | ||||
| // Keep this number reasonable to avoid running into TDRs | // Keep this number reasonable to avoid running into TDRs | ||||
| int step_samples = (info.display_device ? 8 : 32); | int step_samples = (info.display_device ? 8 : 32); | ||||
| if (task.adaptive_sampling.use) { | if (task.adaptive_sampling.use) { | ||||
| step_samples = task.adaptive_sampling.align_static_samples(step_samples); | step_samples = task.adaptive_sampling.align_static_samples(step_samples); | ||||
| } | } | ||||
| // Offset into launch params buffer so that streams use separate data | // Offset into launch params buffer so that streams use separate data | ||||
| device_ptr launch_params_ptr = launch_params.device_pointer + | device_ptr launch_params_ptr = launch_params.device_pointer + | ||||
| thread_index * launch_params.data_elements; | thread_index * launch_params.data_elements; | ||||
| const CUDAContextScope scope(cuContext); | const CUDAContextScope scope(cuContext); | ||||
| for (int sample = rtile.start_sample; sample < end_sample; sample += step_samples) { | for (int sample = rtile.get_start_sample(); sample < end_sample; sample += step_samples) { | ||||
| // Copy work tile information to device | // Copy work tile information to device | ||||
| wtile.num_samples = min(step_samples, end_sample - sample); | wtile.num_samples = min(step_samples, end_sample - sample); | ||||
| wtile.start_sample = sample; | wtile.start_sample = sample; | ||||
| device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); | device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); | ||||
| check_result_cuda( | check_result_cuda( | ||||
| cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index])); | cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index])); | ||||
| OptixShaderBindingTable sbt_params = {}; | OptixShaderBindingTable sbt_params = {}; | ||||
| Show All 28 Lines | # endif | ||||
| if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { | if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { | ||||
| adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]); | adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]); | ||||
| } | } | ||||
| // Wait for launch to finish | // Wait for launch to finish | ||||
| check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); | check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); | ||||
| // Update current sample, so it is displayed correctly | // Update current sample, so it is displayed correctly | ||||
| rtile.sample = wtile.start_sample + wtile.num_samples; | rtile.get_sample() = wtile.start_sample + wtile.num_samples; | ||||
| // Update task progress after the kernel completed rendering | // Update task progress after the kernel completed rendering | ||||
| task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples); | task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples); | ||||
| if (task.get_cancel() && !task.need_finish_queue) | if (task.get_cancel() && !task.need_finish_queue) | ||||
| return; // Cancel rendering | return; // Cancel rendering | ||||
| } | } | ||||
| // Finalize adaptive sampling | // Finalize adaptive sampling | ||||
| if (task.adaptive_sampling.use) { | if (task.adaptive_sampling.use) { | ||||
| device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); | device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); | ||||
| adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]); | adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]); | ||||
| check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); | check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); | ||||
| task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples); | task.update_progress(&rtile, rtile.get_w() * rtile.get_h() * wtile.num_samples); | ||||
| } | } | ||||
| } | } | ||||
| bool launch_denoise(DeviceTask &task, RenderTile &rtile) | bool launch_denoise(DeviceTask &task, RenderTile &rtile) | ||||
| { | { | ||||
| // Update current sample (for display and NLM denoising task) | // Update current sample (for display and NLM denoising task) | ||||
| rtile.sample = rtile.start_sample + rtile.num_samples; | rtile.get_sample() = rtile.get_start_sample() + rtile.get_num_samples(); | ||||
| // Make CUDA context current now, since it is used for both denoising tasks | // Make CUDA context current now, since it is used for both denoising tasks | ||||
| const CUDAContextScope scope(cuContext); | const CUDAContextScope scope(cuContext); | ||||
| // Choose between OptiX and NLM denoising | // Choose between OptiX and NLM denoising | ||||
| if (task.denoising.type == DENOISER_OPTIX) { | if (task.denoising.type == DENOISER_OPTIX) { | ||||
| // Map neighboring tiles onto this device, indices are as follows: | // Map neighboring tiles onto this device, indices are as follows: | ||||
| // Where index 4 is the center tile and index 9 is the target for the result. | // Where index 4 is the center tile and index 9 is the target for the result. | ||||
| // 0 1 2 | // 0 1 2 | ||||
| // 3 4 5 | // 3 4 5 | ||||
| // 6 7 8 9 | // 6 7 8 9 | ||||
| RenderTileNeighbors neighbors(rtile); | RenderTileNeighbors neighbors(rtile); | ||||
| task.map_neighbor_tiles(neighbors, this); | task.map_neighbor_tiles(neighbors, this); | ||||
| RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; | RenderTile &center_tile = neighbors.get_tiles()[RenderTileNeighbors::CENTER]; | ||||
| RenderTile &target_tile = neighbors.target; | RenderTile &target_tile = neighbors.get_target(); | ||||
| rtile = center_tile; // Tile may have been modified by mapping code | rtile = center_tile; // Tile may have been modified by mapping code | ||||
| // Calculate size of the tile to denoise (including overlap) | // Calculate size of the tile to denoise (including overlap) | ||||
| int4 rect = center_tile.bounds(); | int4 rect = center_tile.bounds(); | ||||
| // Overlap between tiles has to be at least 64 pixels | // Overlap between tiles has to be at least 64 pixels | ||||
| // TODO(pmours): Query this value from OptiX | // TODO(pmours): Query this value from OptiX | ||||
| rect = rect_expand(rect, 64); | rect = rect_expand(rect, 64); | ||||
| int4 clip_rect = neighbors.bounds(); | int4 clip_rect = neighbors.bounds(); | ||||
| rect = rect_clip(rect, clip_rect); | rect = rect_clip(rect, clip_rect); | ||||
| int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); | int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); | ||||
| int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y); | int2 overlap_offset = make_int2(rtile.get_x() - rect.x, rtile.get_y() - rect.y); | ||||
| // Calculate byte offsets and strides | // Calculate byte offsets and strides | ||||
| int pixel_stride = task.pass_stride * (int)sizeof(float); | int pixel_stride = task.pass_stride * (int)sizeof(float); | ||||
| int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride; | int pixel_offset = (rtile.get_offset() + rtile.get_x() + | ||||
| rtile.get_y() * rtile.get_stride()) * | |||||
| pixel_stride; | |||||
| const int pass_offset[3] = { | const int pass_offset[3] = { | ||||
| (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float), | (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float), | ||||
| (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float), | (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float), | ||||
| (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)}; | (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)}; | ||||
| // Start with the current tile pointer offset | // Start with the current tile pointer offset | ||||
| int input_stride = pixel_stride; | int input_stride = pixel_stride; | ||||
| device_ptr input_ptr = rtile.buffer + pixel_offset; | device_ptr input_ptr = rtile.get_buffer() + pixel_offset; | ||||
| // Copy tile data into a common buffer if necessary | // Copy tile data into a common buffer if necessary | ||||
| device_only_memory<float> input(this, "denoiser input"); | device_only_memory<float> input(this, "denoiser input"); | ||||
| device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_WRITE); | device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_WRITE); | ||||
| bool contiguous_memory = true; | bool contiguous_memory = true; | ||||
| for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { | foreach (RenderTile &ntile, neighbors.get_tiles()) { | ||||
| if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) { | if (ntile.get_buffer() && ntile.get_buffer() != rtile.get_buffer()) { | ||||
| contiguous_memory = false; | contiguous_memory = false; | ||||
| } | } | ||||
| } | } | ||||
| if (contiguous_memory) { | if (contiguous_memory) { | ||||
| // Tiles are in contiguous memory, so can just subtract overlap offset | // Tiles are in contiguous memory, so can just subtract overlap offset | ||||
| input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride; | input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.get_stride()) * pixel_stride; | ||||
| // Stride covers the whole width of the image and not just a single tile | // Stride covers the whole width of the image and not just a single tile | ||||
| input_stride *= rtile.stride; | input_stride *= rtile.get_stride(); | ||||
| } | } | ||||
| else { | else { | ||||
| // Adjacent tiles are in separate memory regions, so need to copy them into a single one | // Adjacent tiles are in separate memory regions, so need to copy them into a single one | ||||
| input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride); | input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride); | ||||
| // Start with the new input buffer | // Start with the new input buffer | ||||
| input_ptr = input.device_pointer; | input_ptr = input.device_pointer; | ||||
| // Stride covers the width of the new input buffer, which includes tile width and overlap | // Stride covers the width of the new input buffer, which includes tile width and overlap | ||||
| input_stride *= rect_size.x; | input_stride *= rect_size.x; | ||||
| TileInfo *tile_info = tile_info_mem.alloc(1); | TileInfo *tile_info = tile_info_mem.alloc(1); | ||||
| for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { | neighbors.fill_tile_info(tile_info); | ||||
| tile_info->offsets[i] = neighbors.tiles[i].offset; | |||||
| tile_info->strides[i] = neighbors.tiles[i].stride; | |||||
| tile_info->buffers[i] = neighbors.tiles[i].buffer; | |||||
| } | |||||
| tile_info->x[0] = neighbors.tiles[3].x; | |||||
| tile_info->x[1] = neighbors.tiles[4].x; | |||||
| tile_info->x[2] = neighbors.tiles[5].x; | |||||
| tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; | |||||
| tile_info->y[0] = neighbors.tiles[1].y; | |||||
| tile_info->y[1] = neighbors.tiles[4].y; | |||||
| tile_info->y[2] = neighbors.tiles[7].y; | |||||
| tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; | |||||
| tile_info_mem.copy_to_device(); | tile_info_mem.copy_to_device(); | ||||
| void *args[] = { | void *args[] = { | ||||
| &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride}; | &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride}; | ||||
| launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args); | launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args); | ||||
| } | } | ||||
| # if OPTIX_DENOISER_NO_PIXEL_STRIDE | # if OPTIX_DENOISER_NO_PIXEL_STRIDE | ||||
| device_only_memory<float> input_rgb(this, "denoiser input rgb"); | device_only_memory<float> input_rgb(this, "denoiser input rgb"); | ||||
| input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes); | input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes); | ||||
| void *input_args[] = {&input_rgb.device_pointer, | void *input_args[] = {&input_rgb.device_pointer, | ||||
| &input_ptr, | &input_ptr, | ||||
| &rect_size.x, | &rect_size.x, | ||||
| &rect_size.y, | &rect_size.y, | ||||
| &input_stride, | &input_stride, | ||||
| &task.pass_stride, | &task.pass_stride, | ||||
| const_cast<int *>(pass_offset), | const_cast<int *>(pass_offset), | ||||
| &task.denoising.input_passes, | &task.denoising.input_passes, | ||||
| &rtile.sample}; | &rtile.get_sample()}; | ||||
| launch_filter_kernel( | launch_filter_kernel( | ||||
| "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args); | "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args); | ||||
| input_ptr = input_rgb.device_pointer; | input_ptr = input_rgb.device_pointer; | ||||
| pixel_stride = 3 * sizeof(float); | pixel_stride = 3 * sizeof(float); | ||||
| input_stride = rect_size.x * pixel_stride; | input_stride = rect_size.x * pixel_stride; | ||||
| # endif | # endif | ||||
| ▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines | # if OPTIX_DENOISER_NO_PIXEL_STRIDE | ||||
| output_layers[0].data = input_ptr; | output_layers[0].data = input_ptr; | ||||
| output_layers[0].width = rect_size.x; | output_layers[0].width = rect_size.x; | ||||
| output_layers[0].height = rect_size.y; | output_layers[0].height = rect_size.y; | ||||
| output_layers[0].rowStrideInBytes = input_stride; | output_layers[0].rowStrideInBytes = input_stride; | ||||
| output_layers[0].pixelStrideInBytes = pixel_stride; | output_layers[0].pixelStrideInBytes = pixel_stride; | ||||
| int2 output_offset = overlap_offset; | int2 output_offset = overlap_offset; | ||||
| overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually | overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually | ||||
| # else | # else | ||||
| output_layers[0].data = target_tile.buffer + pixel_offset; | output_layers[0].data = target_tile.get_buffer() + pixel_offset; | ||||
| output_layers[0].width = target_tile.w; | output_layers[0].width = target_tile.get_w(); | ||||
| output_layers[0].height = target_tile.h; | output_layers[0].height = target_tile.get_h(); | ||||
| output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride; | output_layers[0].rowStrideInBytes = target_tile.get_stride() * pixel_stride; | ||||
| output_layers[0].pixelStrideInBytes = pixel_stride; | output_layers[0].pixelStrideInBytes = pixel_stride; | ||||
| # endif | # endif | ||||
| output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3; | output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3; | ||||
| // Finally run denoising | // Finally run denoising | ||||
| OptixDenoiserParams params = {}; // All parameters are disabled/zero | OptixDenoiserParams params = {}; // All parameters are disabled/zero | ||||
| check_result_optix_ret(optixDenoiserInvoke(denoiser, | check_result_optix_ret(optixDenoiserInvoke(denoiser, | ||||
| 0, | 0, | ||||
| &params, | &params, | ||||
| denoiser_state.device_pointer, | denoiser_state.device_pointer, | ||||
| scratch_offset, | scratch_offset, | ||||
| input_layers, | input_layers, | ||||
| task.denoising.input_passes, | task.denoising.input_passes, | ||||
| overlap_offset.x, | overlap_offset.x, | ||||
| overlap_offset.y, | overlap_offset.y, | ||||
| output_layers, | output_layers, | ||||
| denoiser_state.device_pointer + scratch_offset, | denoiser_state.device_pointer + scratch_offset, | ||||
| scratch_size)); | scratch_size)); | ||||
| # if OPTIX_DENOISER_NO_PIXEL_STRIDE | # if OPTIX_DENOISER_NO_PIXEL_STRIDE | ||||
| void *output_args[] = {&input_ptr, | void *output_args[] = {&input_ptr, | ||||
| &target_tile.buffer, | &target_tile.get_buffer(), | ||||
| &output_offset.x, | &output_offset.x, | ||||
| &output_offset.y, | &output_offset.y, | ||||
| &rect_size.x, | &rect_size.x, | ||||
| &rect_size.y, | &rect_size.y, | ||||
| &target_tile.x, | &target_tile.get_x(), | ||||
| &target_tile.y, | &target_tile.get_y(), | ||||
| &target_tile.w, | &target_tile.get_w(), | ||||
| &target_tile.h, | &target_tile.get_h(), | ||||
| &target_tile.offset, | &target_tile.get_offset(), | ||||
| &target_tile.stride, | &target_tile.get_stride(), | ||||
| &task.pass_stride, | &task.pass_stride, | ||||
| &rtile.sample}; | &rtile.get_sample()}; | ||||
| launch_filter_kernel( | launch_filter_kernel("kernel_cuda_filter_convert_from_rgb", | ||||
| "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args); | target_tile.get_w(), | ||||
| target_tile.get_h(), | |||||
| output_args); | |||||
| # endif | # endif | ||||
| check_result_cuda_ret(cuStreamSynchronize(0)); | check_result_cuda_ret(cuStreamSynchronize(0)); | ||||
| task.unmap_neighbor_tiles(neighbors, this); | task.unmap_neighbor_tiles(neighbors, this); | ||||
| } | } | ||||
| else { | else { | ||||
| // Run CUDA denoising kernels | // Run CUDA denoising kernels | ||||
| DenoisingTask denoising(this, task); | DenoisingTask denoising(this, task); | ||||
| CUDADevice::denoise(rtile, denoising); | CUDADevice::denoise(rtile, denoising); | ||||
| } | } | ||||
| // Update task progress after the denoiser completed processing | // Update task progress after the denoiser completed processing | ||||
| task.update_progress(&rtile, rtile.w * rtile.h); | task.update_progress(&rtile, rtile.get_w() * rtile.get_h()); | ||||
| return true; | return true; | ||||
| } | } | ||||
| void launch_shader_eval(DeviceTask &task, int thread_index) | void launch_shader_eval(DeviceTask &task, int thread_index) | ||||
| { | { | ||||
| unsigned int rgen_index = PG_BACK; | unsigned int rgen_index = PG_BACK; | ||||
| if (task.shader_eval_type >= SHADER_EVAL_BAKE) | if (task.shader_eval_type >= SHADER_EVAL_BAKE) | ||||
| ▲ Show 20 Lines • Show All 183 Lines • ▼ Show 20 Lines | if (!bvh->params.top_level) { | ||||
| } | } | ||||
| else { | else { | ||||
| bvh_optix->as_data.free(); | bvh_optix->as_data.free(); | ||||
| bvh_optix->traversable_handle = 0; | bvh_optix->traversable_handle = 0; | ||||
| } | } | ||||
| // Build bottom level acceleration structures (BLAS) | // Build bottom level acceleration structures (BLAS) | ||||
| Geometry *const geom = bvh->geometry[0]; | Geometry *const geom = bvh->geometry[0]; | ||||
| if (geom->geometry_type == Geometry::HAIR) { | if (geom->is_hair()) { | ||||
| // Build BLAS for curve primitives | // Build BLAS for curve primitives | ||||
| Hair *const hair = static_cast<Hair *const>(geom); | Hair *const hair = static_cast<Hair *const>(geom); | ||||
| if (hair->num_curves() == 0) { | if (hair->num_curves() == 0) { | ||||
| return; | return; | ||||
| } | } | ||||
| const size_t num_segments = hair->num_segments(); | const size_t num_segments = hair->num_segments(); | ||||
| size_t num_motion_steps = 1; | size_t num_motion_steps = 1; | ||||
| Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); | Attribute *motion_keys = hair->get_attributes().find(ATTR_STD_MOTION_VERTEX_POSITION); | ||||
| if (motion_blur && hair->get_use_motion_blur() && motion_keys) { | if (motion_blur && hair->get_use_motion_blur() && motion_keys) { | ||||
| num_motion_steps = hair->get_motion_steps(); | num_motion_steps = hair->get_motion_steps(); | ||||
| } | } | ||||
| device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY); | device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY); | ||||
| # if OPTIX_ABI_VERSION >= 36 | # if OPTIX_ABI_VERSION >= 36 | ||||
| device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); | device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); | ||||
| device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); | device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); | ||||
| ▲ Show 20 Lines • Show All 125 Lines • ▼ Show 20 Lines | # endif | ||||
| build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; | build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; | ||||
| # if OPTIX_ABI_VERSION < 23 | # if OPTIX_ABI_VERSION < 23 | ||||
| build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); | build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); | ||||
| build_input.aabbArray.numPrimitives = num_segments; | build_input.aabbArray.numPrimitives = num_segments; | ||||
| build_input.aabbArray.strideInBytes = sizeof(OptixAabb); | build_input.aabbArray.strideInBytes = sizeof(OptixAabb); | ||||
| build_input.aabbArray.flags = &build_flags; | build_input.aabbArray.flags = &build_flags; | ||||
| build_input.aabbArray.numSbtRecords = 1; | build_input.aabbArray.numSbtRecords = 1; | ||||
| build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset; | build_input.aabbArray.primitiveIndexOffset = hair->get_optix_prim_offset(); | ||||
| # else | # else | ||||
| build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); | build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); | ||||
| build_input.customPrimitiveArray.numPrimitives = num_segments; | build_input.customPrimitiveArray.numPrimitives = num_segments; | ||||
| build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); | build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); | ||||
| build_input.customPrimitiveArray.flags = &build_flags; | build_input.customPrimitiveArray.flags = &build_flags; | ||||
| build_input.customPrimitiveArray.numSbtRecords = 1; | build_input.customPrimitiveArray.numSbtRecords = 1; | ||||
| build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; | build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; | ||||
| # endif | # endif | ||||
| } | } | ||||
| if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { | if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { | ||||
| progress.set_error("Failed to build OptiX acceleration structure"); | progress.set_error("Failed to build OptiX acceleration structure"); | ||||
| } | } | ||||
| } | } | ||||
| else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) { | else if (geom->is_mesh() || geom->is_volume()) { | ||||
| // Build BLAS for triangle primitives | // Build BLAS for triangle primitives | ||||
| Mesh *const mesh = static_cast<Mesh *const>(geom); | Mesh *const mesh = static_cast<Mesh *const>(geom); | ||||
| if (mesh->num_triangles() == 0) { | if (mesh->num_triangles() == 0) { | ||||
| return; | return; | ||||
| } | } | ||||
| const size_t num_verts = mesh->get_verts().size(); | const size_t num_verts = mesh->get_verts().size(); | ||||
| size_t num_motion_steps = 1; | size_t num_motion_steps = 1; | ||||
| Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); | Attribute *motion_keys = mesh->get_attributes().find(ATTR_STD_MOTION_VERTEX_POSITION); | ||||
| if (motion_blur && mesh->get_use_motion_blur() && motion_keys) { | if (motion_blur && mesh->get_use_motion_blur() && motion_keys) { | ||||
| num_motion_steps = mesh->get_motion_steps(); | num_motion_steps = mesh->get_motion_steps(); | ||||
| } | } | ||||
| device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); | device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); | ||||
| index_data.alloc(mesh->get_triangles().size()); | index_data.alloc(mesh->get_triangles().size()); | ||||
| memcpy(index_data.data(), | memcpy(index_data.data(), | ||||
| mesh->get_triangles().data(), | mesh->get_triangles().data(), | ||||
| Show All 36 Lines | # endif | ||||
| build_input.triangleArray.numIndexTriplets = mesh->num_triangles(); | build_input.triangleArray.numIndexTriplets = mesh->num_triangles(); | ||||
| build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; | build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; | ||||
| build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int); | build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int); | ||||
| build_input.triangleArray.flags = &build_flags; | build_input.triangleArray.flags = &build_flags; | ||||
| // The SBT does not store per primitive data since Cycles already allocates separate | // The SBT does not store per primitive data since Cycles already allocates separate | ||||
| // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in | // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in | ||||
| // one and rely on that having the same meaning in this case. | // one and rely on that having the same meaning in this case. | ||||
| build_input.triangleArray.numSbtRecords = 1; | build_input.triangleArray.numSbtRecords = 1; | ||||
| build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset; | build_input.triangleArray.primitiveIndexOffset = mesh->get_optix_prim_offset(); | ||||
| if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { | if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { | ||||
| progress.set_error("Failed to build OptiX acceleration structure"); | progress.set_error("Failed to build OptiX acceleration structure"); | ||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| else { | else { | ||||
| unsigned int num_instances = 0; | unsigned int num_instances = 0; | ||||
| ▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines | # endif | ||||
| bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size); | bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size); | ||||
| } | } | ||||
| for (Object *ob : bvh->objects) { | for (Object *ob : bvh->objects) { | ||||
| // Skip non-traceable objects | // Skip non-traceable objects | ||||
| if (!ob->is_traceable()) | if (!ob->is_traceable()) | ||||
| continue; | continue; | ||||
| BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh); | BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->get_bvh()); | ||||
| OptixTraversableHandle handle = blas->traversable_handle; | OptixTraversableHandle handle = blas->traversable_handle; | ||||
| # if OPTIX_ABI_VERSION < 41 | # if OPTIX_ABI_VERSION < 41 | ||||
| OptixAabb &aabb = aabbs[num_instances]; | OptixAabb &aabb = aabbs[num_instances]; | ||||
| aabb.minX = ob->bounds.min.x; | aabb.minX = ob->get_bounds().min.x; | ||||
| aabb.minY = ob->bounds.min.y; | aabb.minY = ob->get_bounds().min.y; | ||||
| aabb.minZ = ob->bounds.min.z; | aabb.minZ = ob->get_bounds().min.z; | ||||
| aabb.maxX = ob->bounds.max.x; | aabb.maxX = ob->get_bounds().max.x; | ||||
| aabb.maxY = ob->bounds.max.y; | aabb.maxY = ob->get_bounds().max.y; | ||||
| aabb.maxZ = ob->bounds.max.z; | aabb.maxZ = ob->get_bounds().max.z; | ||||
| # endif | # endif | ||||
| OptixInstance &instance = instances[num_instances++]; | OptixInstance &instance = instances[num_instances++]; | ||||
| memset(&instance, 0, sizeof(instance)); | memset(&instance, 0, sizeof(instance)); | ||||
| // Clear transform to identity matrix | // Clear transform to identity matrix | ||||
| instance.transform[0] = 1.0f; | instance.transform[0] = 1.0f; | ||||
| instance.transform[5] = 1.0f; | instance.transform[5] = 1.0f; | ||||
| instance.transform[10] = 1.0f; | instance.transform[10] = 1.0f; | ||||
| // Set user instance ID to object index (but leave low bit blank) | // Set user instance ID to object index (but leave low bit blank) | ||||
| instance.instanceId = ob->get_device_index() << 1; | instance.instanceId = ob->get_device_index() << 1; | ||||
| // Have to have at least one bit in the mask, or else instance would always be culled | // Have to have at least one bit in the mask, or else instance would always be culled | ||||
| instance.visibilityMask = 1; | instance.visibilityMask = 1; | ||||
| if (ob->get_geometry()->has_volume) { | if (ob->get_geometry()->get_has_volume()) { | ||||
| // Volumes have a special bit set in the visibility mask so a trace can mask only volumes | // Volumes have a special bit set in the visibility mask so a trace can mask only volumes | ||||
| instance.visibilityMask |= 2; | instance.visibilityMask |= 2; | ||||
| } | } | ||||
| if (ob->get_geometry()->geometry_type == Geometry::HAIR) { | if (ob->get_geometry()->is_hair()) { | ||||
| // Same applies to curves (so they can be skipped in local trace calls) | // Same applies to curves (so they can be skipped in local trace calls) | ||||
| instance.visibilityMask |= 4; | instance.visibilityMask |= 4; | ||||
| # if OPTIX_ABI_VERSION >= 36 | # if OPTIX_ABI_VERSION >= 36 | ||||
| if (motion_blur && ob->get_geometry()->has_motion_blur() && | if (motion_blur && ob->get_geometry()->has_motion_blur() && | ||||
| DebugFlags().optix.curves_api && | DebugFlags().optix.curves_api && | ||||
| static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { | static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { | ||||
| // Select between motion blur and non-motion blur built-in intersection module | // Select between motion blur and non-motion blur built-in intersection module | ||||
| ▲ Show 20 Lines • Show All 263 Lines • Show Last 20 Lines | |||||