Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/device/cuda/device_cuda_impl.cpp
| Show First 20 Lines • Show All 1,792 Lines • ▼ Show 20 Lines | denoising.functions.combine_halves = function_bind( | ||||
| &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); | &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); | ||||
| denoising.functions.get_feature = function_bind( | denoising.functions.get_feature = function_bind( | ||||
| &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); | &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); | ||||
| denoising.functions.write_feature = function_bind( | denoising.functions.write_feature = function_bind( | ||||
| &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); | &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); | ||||
| denoising.functions.detect_outliers = function_bind( | denoising.functions.detect_outliers = function_bind( | ||||
| &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); | &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); | ||||
| denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); | denoising.filter_area = make_int4(rtile.get_x(), rtile.get_y(), rtile.get_w(), rtile.get_h()); | ||||
| denoising.render_buffer.samples = rtile.sample; | denoising.render_buffer.samples = rtile.get_sample(); | ||||
| denoising.buffer.gpu_temporary_mem = true; | denoising.buffer.gpu_temporary_mem = true; | ||||
| denoising.run_denoising(rtile); | denoising.run_denoising(rtile); | ||||
| } | } | ||||
| void CUDADevice::adaptive_sampling_filter(uint filter_sample, | void CUDADevice::adaptive_sampling_filter(uint filter_sample, | ||||
| WorkTile *wtile, | WorkTile *wtile, | ||||
| CUdeviceptr d_wtile, | CUdeviceptr d_wtile, | ||||
| ▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines | |||||
/* Enqueue the adaptive_scale_samples kernel for one work tile.
 *
 * Each row below shows the old (left) and new (right) revision of the same
 * statement. Both revisions build the same launch:
 *   - kernel arguments: the device work tile pointer, the start sample and
 *     current sample of rtile, and the number of work items;
 *   - a one-dimensional grid of divide_up(total_work_size, num_threads_per_block)
 *     blocks with num_threads_per_block threads each, zero bytes of dynamic
 *     shared memory, submitted on the given stream.
 *
 * No synchronization is issued here; the launch is only enqueued, so the
 * caller has to synchronize before relying on the result.
 *
 * NOTE(review): divide_up is presumably a round-up integer division, so the
 * last block may be partially filled -- confirm the kernel bounds-checks
 * against total_work_size. */
| void CUDADevice::adaptive_sampling_post(RenderTile &rtile, | void CUDADevice::adaptive_sampling_post(RenderTile &rtile, | ||||
| WorkTile *wtile, | WorkTile *wtile, | ||||
| CUdeviceptr d_wtile, | CUdeviceptr d_wtile, | ||||
| CUstream stream) | CUstream stream) | ||||
| { | { | ||||
| const int num_threads_per_block = functions.adaptive_num_threads_per_block; | const int num_threads_per_block = functions.adaptive_num_threads_per_block; | ||||
/* One work item per pixel of the work tile (height times width). */
| uint total_work_size = wtile->h * wtile->w; | uint total_work_size = wtile->h * wtile->w; | ||||
/* cuLaunchKernel takes an array of pointers to the argument values.
 * NOTE(review): the right-hand revision takes the address of accessor results,
 * which only compiles if those accessors return non-const lvalue references
 * into rtile -- confirm against the RenderTile declaration. */
| void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size}; | void *args[] = {&d_wtile, &rtile.get_start_sample(), &rtile.get_sample(), &total_work_size}; | ||||
| uint num_blocks = divide_up(total_work_size, num_threads_per_block); | uint num_blocks = divide_up(total_work_size, num_threads_per_block); | ||||
/* Argument order: function, grid x/y/z, block x/y/z, shared memory bytes,
 * stream, kernel parameters, extra options. */
| cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples, | cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples, | ||||
| num_blocks, | num_blocks, | ||||
| 1, | 1, | ||||
| 1, | 1, | ||||
| num_threads_per_block, | num_threads_per_block, | ||||
| 1, | 1, | ||||
| 1, | 1, | ||||
| 0, | 0, | ||||
| stream, | stream, | ||||
| args, | args, | ||||
| 0)); | 0)); | ||||
| } | } | ||||
/* Render the samples of one tile on the device, in steps of several samples.
 *
 * Each row below shows the old (left) and new (right) revision of the same
 * statement; the new revision replaces direct RenderTile field access with
 * accessor calls and a single work_tile() conversion.
 *
 * Visible flow:
 *   1. Time the call into the render time slot of the tile buffers and bail
 *      out if the device already has an error.
 *   2. Look up the kernel by name: bake, branched path trace or path trace.
 *   3. Fill one host-side WorkTile from rtile.
 *   4. Derive the number of samples per step from the occupancy query.
 *   5. Loop over the sample range, copying the work tile to the device each
 *      step, synchronizing the context, and reporting progress.
 *   6. When adaptive sampling is enabled, run the post pass and synchronize.
 *
 * task       - supplies integrator mode, adaptive sampling settings, progress
 *              reporting and cancellation.
 * rtile      - tile to render; its current sample is advanced after each step.
 * work_tiles - device vector used as a one-element staging area for the
 *              WorkTile passed to the kernels. */
| void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles) | void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles) | ||||
| { | { | ||||
| scoped_timer timer(&rtile.buffers->render_time); | scoped_timer timer(&rtile.get_buffers()->get_render_time()); | ||||
| if (have_error()) | if (have_error()) | ||||
| return; | return; | ||||
| CUDAContextScope scope(this); | CUDAContextScope scope(this); | ||||
| CUfunction cuRender; | CUfunction cuRender; | ||||
| /* Get kernel function. */ | /* Get kernel function. */ | ||||
| if (rtile.task == RenderTile::BAKE) { | if (rtile.get_task() == RenderTile::BAKE) { | ||||
| cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); | cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); | ||||
| } | } | ||||
| else if (task.integrator_branched) { | else if (task.integrator_branched) { | ||||
| cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); | cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); | ||||
| } | } | ||||
| else { | else { | ||||
| cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); | cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); | ||||
| } | } | ||||
/* The lookup above reports failure through the device error state, so check
 * it again before using cuRender. */
| if (have_error()) { | if (have_error()) { | ||||
| return; | return; | ||||
| } | } | ||||
| cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); | cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); | ||||
| /* Allocate work tile. */ | /* Allocate work tile. */ | ||||
| work_tiles.alloc(1); | work_tiles.alloc(1); | ||||
| WorkTile *wtile = work_tiles.data(); | WorkTile *wtile = work_tiles.data(); | ||||
/* Old revision copies x, y, w, h, offset, stride and buffer field by field;
 * new revision assigns the result of rtile.work_tile().
 * NOTE(review): presumably work_tile() fills exactly these seven fields --
 * confirm against its definition. */
| wtile->x = rtile.x; | *wtile = rtile.work_tile(); | ||||
| wtile->y = rtile.y; | |||||
| wtile->w = rtile.w; | |||||
| wtile->h = rtile.h; | |||||
| wtile->offset = rtile.offset; | |||||
| wtile->stride = rtile.stride; | |||||
| wtile->buffer = (float *)(CUdeviceptr)rtile.buffer; | |||||
| /* Prepare work size. More step samples render faster, but for now we | /* Prepare work size. More step samples render faster, but for now we | ||||
| * remain conservative for GPUs connected to a display to avoid driver | * remain conservative for GPUs connected to a display to avoid driver | ||||
| * timeouts and display freezing. */ | * timeouts and display freezing. */ | ||||
| int min_blocks, num_threads_per_block; | int min_blocks, num_threads_per_block; | ||||
| cuda_assert( | cuda_assert( | ||||
| cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); | cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); | ||||
/* Devices that do not drive a display get eight times the minimum grid. */
| if (!info.display_device) { | if (!info.display_device) { | ||||
| min_blocks *= 8; | min_blocks *= 8; | ||||
| } | } | ||||
/* Samples per step: target thread count divided by the pixel count of the tile.
 * NOTE(review): assumes the tile area (w times h) is non-zero, otherwise the
 * divisor is zero -- confirm tiles are never empty here. */
| uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); | uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); | ||||
| if (task.adaptive_sampling.use) { | if (task.adaptive_sampling.use) { | ||||
| step_samples = task.adaptive_sampling.align_static_samples(step_samples); | step_samples = task.adaptive_sampling.align_static_samples(step_samples); | ||||
| } | } | ||||
| /* Render all samples. */ | /* Render all samples. */ | ||||
| int start_sample = rtile.start_sample; | int start_sample = rtile.get_start_sample(); | ||||
| int end_sample = rtile.start_sample + rtile.num_samples; | int end_sample = rtile.get_start_sample() + rtile.get_num_samples(); | ||||
| for (int sample = start_sample; sample < end_sample; sample += step_samples) { | for (int sample = start_sample; sample < end_sample; sample += step_samples) { | ||||
| /* Setup and copy work tile to device. */ | /* Setup and copy work tile to device. */ | ||||
| wtile->start_sample = sample; | wtile->start_sample = sample; | ||||
/* The final step is clamped so the loop never renders past end_sample. */
| wtile->num_samples = min(step_samples, end_sample - sample); | wtile->num_samples = min(step_samples, end_sample - sample); | ||||
| work_tiles.copy_to_device(); | work_tiles.copy_to_device(); | ||||
| CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; | CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; | ||||
/* NOTE(review): ten source lines are collapsed at this point in this view;
 * presumably they launch cuRender on d_work_tiles -- confirm against the
 * full file. */
Show All 10 Lines | for (int sample = start_sample; sample < end_sample; sample += step_samples) { | ||||
/* Index of the last sample rendered by this step. */
| uint filter_sample = sample + wtile->num_samples - 1; | uint filter_sample = sample + wtile->num_samples - 1; | ||||
| if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { | if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { | ||||
| adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); | adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); | ||||
| } | } | ||||
/* Wait for all work queued in this step before reporting progress. */
| cuda_assert(cuCtxSynchronize()); | cuda_assert(cuCtxSynchronize()); | ||||
| /* Update progress. */ | /* Update progress. */ | ||||
| rtile.sample = sample + wtile->num_samples; | rtile.get_sample() = sample + wtile->num_samples; | ||||
| task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); | task.update_progress(&rtile, rtile.get_w() * rtile.get_h() * wtile->num_samples); | ||||
/* On cancel, stop early unless the task asks for the queue to be finished. */
| if (task.get_cancel()) { | if (task.get_cancel()) { | ||||
| if (task.need_finish_queue == false) | if (task.need_finish_queue == false) | ||||
| break; | break; | ||||
| } | } | ||||
| } | } | ||||
| /* Finalize adaptive sampling. */ | /* Finalize adaptive sampling. */ | ||||
| if (task.adaptive_sampling.use) { | if (task.adaptive_sampling.use) { | ||||
| CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; | CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; | ||||
| adaptive_sampling_post(rtile, wtile, d_work_tiles); | adaptive_sampling_post(rtile, wtile, d_work_tiles); | ||||
| cuda_assert(cuCtxSynchronize()); | cuda_assert(cuCtxSynchronize()); | ||||
/* NOTE(review): progress is reported once more here using the sample count
 * of the last step, in addition to the per-step report inside the loop --
 * confirm this is intended. */
| task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); | task.update_progress(&rtile, rtile.get_w() * rtile.get_h() * wtile->num_samples); | ||||
| } | } | ||||
| } | } | ||||
| void CUDADevice::film_convert(DeviceTask &task, | void CUDADevice::film_convert(DeviceTask &task, | ||||
| device_ptr buffer, | device_ptr buffer, | ||||
| device_ptr rgba_byte, | device_ptr rgba_byte, | ||||
| device_ptr rgba_half) | device_ptr rgba_half) | ||||
| { | { | ||||
| ▲ Show 20 Lines • Show All 395 Lines • ▼ Show 20 Lines | if (task.type == DeviceTask::RENDER) { | ||||
| device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); | device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); | ||||
| /* keep rendering tiles until done */ | /* keep rendering tiles until done */ | ||||
| RenderTile tile; | RenderTile tile; | ||||
| DenoisingTask denoising(this, task); | DenoisingTask denoising(this, task); | ||||
| while (task.acquire_tile(this, tile, task.tile_types)) { | while (task.acquire_tile(this, tile, task.tile_types)) { | ||||
| if (tile.task == RenderTile::PATH_TRACE) { | if (tile.get_task() == RenderTile::PATH_TRACE) { | ||||
| if (use_split_kernel()) { | if (use_split_kernel()) { | ||||
| device_only_memory<uchar> void_buffer(this, "void_buffer"); | device_only_memory<uchar> void_buffer(this, "void_buffer"); | ||||
| split_kernel->path_trace(task, tile, void_buffer, void_buffer); | split_kernel->path_trace(task, tile, void_buffer, void_buffer); | ||||
| } | } | ||||
| else { | else { | ||||
| render(task, tile, work_tiles); | render(task, tile, work_tiles); | ||||
| } | } | ||||
| } | } | ||||
| else if (tile.task == RenderTile::BAKE) { | else if (tile.get_task() == RenderTile::BAKE) { | ||||
| render(task, tile, work_tiles); | render(task, tile, work_tiles); | ||||
| } | } | ||||
| else if (tile.task == RenderTile::DENOISE) { | else if (tile.get_task() == RenderTile::DENOISE) { | ||||
| tile.sample = tile.start_sample + tile.num_samples; | tile.get_sample() = tile.get_start_sample() + tile.get_num_samples(); | ||||
| denoise(tile, denoising); | denoise(tile, denoising); | ||||
| task.update_progress(&tile, tile.w * tile.h); | task.update_progress(&tile, tile.get_w() * tile.get_h()); | ||||
| } | } | ||||
| task.release_tile(tile); | task.release_tile(tile); | ||||
| if (task.get_cancel()) { | if (task.get_cancel()) { | ||||
| if (task.need_finish_queue == false) | if (task.need_finish_queue == false) | ||||
| break; | break; | ||||
| } | } | ||||
| } | } | ||||
| work_tiles.free(); | work_tiles.free(); | ||||
| } | } | ||||
| else if (task.type == DeviceTask::SHADER) { | else if (task.type == DeviceTask::SHADER) { | ||||
| shader(task); | shader(task); | ||||
| cuda_assert(cuCtxSynchronize()); | cuda_assert(cuCtxSynchronize()); | ||||
| } | } | ||||
| else if (task.type == DeviceTask::DENOISE_BUFFER) { | else if (task.type == DeviceTask::DENOISE_BUFFER) { | ||||
| RenderTile tile; | RenderTile tile = RenderTile::from_device_task(task, true); | ||||
| tile.x = task.x; | |||||
| tile.y = task.y; | |||||
| tile.w = task.w; | |||||
| tile.h = task.h; | |||||
| tile.buffer = task.buffer; | |||||
| tile.sample = task.sample + task.num_samples; | |||||
| tile.num_samples = task.num_samples; | |||||
| tile.start_sample = task.sample; | |||||
| tile.offset = task.offset; | |||||
| tile.stride = task.stride; | |||||
| tile.buffers = task.buffers; | |||||
| DenoisingTask denoising(this, task); | DenoisingTask denoising(this, task); | ||||
| denoise(tile, denoising); | denoise(tile, denoising); | ||||
| task.update_progress(&tile, tile.w * tile.h); | task.update_progress(&tile, tile.get_w() * tile.get_h()); | ||||
| } | } | ||||
| } | } | ||||
| void CUDADevice::task_add(DeviceTask &task) | void CUDADevice::task_add(DeviceTask &task) | ||||
| { | { | ||||
| CUDAContextScope scope(this); | CUDAContextScope scope(this); | ||||
| /* Load texture info. */ | /* Load texture info. */ | ||||
| ▲ Show 20 Lines • Show All 153 Lines • ▼ Show 20 Lines | bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, | ||||
| CUDAContextScope scope(device); | CUDAContextScope scope(device); | ||||
| CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer; | CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer; | ||||
| CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer; | CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer; | ||||
| CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer; | CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer; | ||||
| CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer; | CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer; | ||||
| CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer; | CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer; | ||||
| CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer; | CUdeviceptr d_buffer = (CUdeviceptr)rtile.get_buffer(); | ||||
| int end_sample = rtile.start_sample + rtile.num_samples; | int end_sample = rtile.get_start_sample() + rtile.get_num_samples(); | ||||
| int queue_size = dim.global_size[0] * dim.global_size[1]; | int queue_size = dim.global_size[0] * dim.global_size[1]; | ||||
| struct args_t { | struct args_t { | ||||
| CUdeviceptr *split_data_buffer; | CUdeviceptr *split_data_buffer; | ||||
| int *num_elements; | int *num_elements; | ||||
| CUdeviceptr *ray_state; | CUdeviceptr *ray_state; | ||||
| int *start_sample; | int *start_sample; | ||||
| int *end_sample; | int *end_sample; | ||||
| Show All 9 Lines | struct args_t { | ||||
| CUdeviceptr *work_pool_wgs; | CUdeviceptr *work_pool_wgs; | ||||
| int *num_samples; | int *num_samples; | ||||
| CUdeviceptr *buffer; | CUdeviceptr *buffer; | ||||
| }; | }; | ||||
| args_t args = {&d_split_data, | args_t args = {&d_split_data, | ||||
| &num_global_elements, | &num_global_elements, | ||||
| &d_ray_state, | &d_ray_state, | ||||
| &rtile.start_sample, | &rtile.get_start_sample(), | ||||
| &end_sample, | &end_sample, | ||||
| &rtile.x, | &rtile.get_x(), | ||||
| &rtile.y, | &rtile.get_y(), | ||||
| &rtile.w, | &rtile.get_w(), | ||||
| &rtile.h, | &rtile.get_h(), | ||||
| &rtile.offset, | &rtile.get_offset(), | ||||
| &rtile.stride, | &rtile.get_stride(), | ||||
| &d_queue_index, | &d_queue_index, | ||||
| &queue_size, | &queue_size, | ||||
| &d_use_queues_flag, | &d_use_queues_flag, | ||||
| &d_work_pool_wgs, | &d_work_pool_wgs, | ||||
| &rtile.num_samples, | &rtile.get_num_samples(), | ||||
| &d_buffer}; | &d_buffer}; | ||||
| CUfunction data_init; | CUfunction data_init; | ||||
| cuda_assert( | cuda_assert( | ||||
| cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); | cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); | ||||
| if (device->have_error()) { | if (device->have_error()) { | ||||
| return false; | return false; | ||||
| } | } | ||||
| ▲ Show 20 Lines • Show All 52 Lines • Show Last 20 Lines | |||||