Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/device/cuda/device_cuda_impl.cpp
| Show First 20 Lines • Show All 580 Lines • ▼ Show 20 Lines | void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features) | ||||
| * needed for kernel launches, so that we can reliably figure out when | * needed for kernel launches, so that we can reliably figure out when | ||||
| * to allocate scene data in mapped host memory. */ | * to allocate scene data in mapped host memory. */ | ||||
| CUDAContextScope scope(this); | CUDAContextScope scope(this); | ||||
| size_t total = 0, free_before = 0, free_after = 0; | size_t total = 0, free_before = 0, free_after = 0; | ||||
| cuMemGetInfo(&free_before, &total); | cuMemGetInfo(&free_before, &total); | ||||
| /* Get kernel function. */ | /* Get kernel function. */ | ||||
| CUfunction cuPathTrace; | CUfunction cuRender; | ||||
| if (requested_features.use_integrator_branched) { | if (requested_features.use_baking) { | ||||
| cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); | cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); | ||||
| } | |||||
| else if (requested_features.use_integrator_branched) { | |||||
| cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); | |||||
| } | } | ||||
| else { | else { | ||||
| cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); | cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); | ||||
| } | } | ||||
| cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); | cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); | ||||
| int min_blocks, num_threads_per_block; | int min_blocks, num_threads_per_block; | ||||
| cuda_assert(cuOccupancyMaxPotentialBlockSize( | cuda_assert(cuOccupancyMaxPotentialBlockSize( | ||||
| &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); | &min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); | ||||
| /* Launch kernel, using just 1 block appears sufficient to reserve | /* Launch kernel, using just 1 block appears sufficient to reserve | ||||
| * memory for all multiprocessors. It would be good to do this in | * memory for all multiprocessors. It would be good to do this in | ||||
| * parallel for the multi GPU case still to make it faster. */ | * parallel for the multi GPU case still to make it faster. */ | ||||
| CUdeviceptr d_work_tiles = 0; | CUdeviceptr d_work_tiles = 0; | ||||
| uint total_work_size = 0; | uint total_work_size = 0; | ||||
| void *args[] = {&d_work_tiles, &total_work_size}; | void *args[] = {&d_work_tiles, &total_work_size}; | ||||
| cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); | cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); | ||||
| cuda_assert(cuCtxSynchronize()); | cuda_assert(cuCtxSynchronize()); | ||||
| cuMemGetInfo(&free_after, &total); | cuMemGetInfo(&free_after, &total); | ||||
| VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) | VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) | ||||
| << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; | << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; | ||||
| # if 0 | # if 0 | ||||
| ▲ Show 20 Lines • Show All 1,154 Lines • ▼ Show 20 Lines | cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples, | ||||
| 1, | 1, | ||||
| 1, | 1, | ||||
| 0, | 0, | ||||
| stream, | stream, | ||||
| args, | args, | ||||
| 0)); | 0)); | ||||
| } | } | ||||
| void CUDADevice::path_trace(DeviceTask &task, | void CUDADevice::render(DeviceTask &task, | ||||
| RenderTile &rtile, | RenderTile &rtile, | ||||
| device_vector<WorkTile> &work_tiles) | device_vector<WorkTile> &work_tiles) | ||||
| { | { | ||||
| scoped_timer timer(&rtile.buffers->render_time); | scoped_timer timer(&rtile.buffers->render_time); | ||||
| if (have_error()) | if (have_error()) | ||||
| return; | return; | ||||
| CUDAContextScope scope(this); | CUDAContextScope scope(this); | ||||
| CUfunction cuPathTrace; | CUfunction cuRender; | ||||
| /* Get kernel function. */ | /* Get kernel function. */ | ||||
| if (task.integrator_branched) { | if (rtile.task == RenderTile::BAKE) { | ||||
| cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); | cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); | ||||
| } | |||||
| else if (task.integrator_branched) { | |||||
| cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); | |||||
| } | } | ||||
| else { | else { | ||||
| cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); | cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); | ||||
| } | } | ||||
| if (have_error()) { | if (have_error()) { | ||||
| return; | return; | ||||
| } | } | ||||
| cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); | cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); | ||||
| /* Allocate work tile. */ | /* Allocate work tile. */ | ||||
| work_tiles.alloc(1); | work_tiles.alloc(1); | ||||
| WorkTile *wtile = work_tiles.data(); | WorkTile *wtile = work_tiles.data(); | ||||
| wtile->x = rtile.x; | wtile->x = rtile.x; | ||||
| wtile->y = rtile.y; | wtile->y = rtile.y; | ||||
| wtile->w = rtile.w; | wtile->w = rtile.w; | ||||
| wtile->h = rtile.h; | wtile->h = rtile.h; | ||||
| wtile->offset = rtile.offset; | wtile->offset = rtile.offset; | ||||
| wtile->stride = rtile.stride; | wtile->stride = rtile.stride; | ||||
| wtile->buffer = (float *)(CUdeviceptr)rtile.buffer; | wtile->buffer = (float *)(CUdeviceptr)rtile.buffer; | ||||
| /* Prepare work size. More step samples render faster, but for now we | /* Prepare work size. More step samples render faster, but for now we | ||||
| * remain conservative for GPUs connected to a display to avoid driver | * remain conservative for GPUs connected to a display to avoid driver | ||||
| * timeouts and display freezing. */ | * timeouts and display freezing. */ | ||||
| int min_blocks, num_threads_per_block; | int min_blocks, num_threads_per_block; | ||||
| cuda_assert(cuOccupancyMaxPotentialBlockSize( | cuda_assert(cuOccupancyMaxPotentialBlockSize( | ||||
| &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); | &min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); | ||||
| if (!info.display_device) { | if (!info.display_device) { | ||||
| min_blocks *= 8; | min_blocks *= 8; | ||||
| } | } | ||||
| uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); | uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); | ||||
| if (task.adaptive_sampling.use) { | if (task.adaptive_sampling.use) { | ||||
| step_samples = task.adaptive_sampling.align_static_samples(step_samples); | step_samples = task.adaptive_sampling.align_static_samples(step_samples); | ||||
| } | } | ||||
| Show All 11 Lines | for (int sample = start_sample; sample < end_sample; sample += step_samples) { | ||||
| CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; | CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; | ||||
| uint total_work_size = wtile->w * wtile->h * wtile->num_samples; | uint total_work_size = wtile->w * wtile->h * wtile->num_samples; | ||||
| uint num_blocks = divide_up(total_work_size, num_threads_per_block); | uint num_blocks = divide_up(total_work_size, num_threads_per_block); | ||||
| /* Launch kernel. */ | /* Launch kernel. */ | ||||
| void *args[] = {&d_work_tiles, &total_work_size}; | void *args[] = {&d_work_tiles, &total_work_size}; | ||||
| cuda_assert( | cuda_assert( | ||||
| cuLaunchKernel(cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); | cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); | ||||
| /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ | /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ | ||||
| uint filter_sample = sample + wtile->num_samples - 1; | uint filter_sample = sample + wtile->num_samples - 1; | ||||
| if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { | if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { | ||||
| adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); | adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); | ||||
| } | } | ||||
| cuda_assert(cuCtxSynchronize()); | cuda_assert(cuCtxSynchronize()); | ||||
| ▲ Show 20 Lines • Show All 89 Lines • ▼ Show 20 Lines | void CUDADevice::shader(DeviceTask &task) | ||||
| CUDAContextScope scope(this); | CUDAContextScope scope(this); | ||||
| CUfunction cuShader; | CUfunction cuShader; | ||||
| CUdeviceptr d_input = (CUdeviceptr)task.shader_input; | CUdeviceptr d_input = (CUdeviceptr)task.shader_input; | ||||
| CUdeviceptr d_output = (CUdeviceptr)task.shader_output; | CUdeviceptr d_output = (CUdeviceptr)task.shader_output; | ||||
| /* get kernel function */ | /* get kernel function */ | ||||
| if (task.shader_eval_type >= SHADER_EVAL_BAKE) { | if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { | ||||
| cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake")); | |||||
| } | |||||
| else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { | |||||
| cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); | cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); | ||||
| } | } | ||||
| else { | else { | ||||
| cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); | cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); | ||||
| } | } | ||||
| /* do tasks in smaller chunks, so we can cancel it */ | /* do tasks in smaller chunks, so we can cancel it */ | ||||
| const int shader_chunk_size = 65536; | const int shader_chunk_size = 65536; | ||||
| ▲ Show 20 Lines • Show All 320 Lines • ▼ Show 20 Lines | if (task->type == DeviceTask::RENDER) { | ||||
| while (task->acquire_tile(this, tile, task->tile_types)) { | while (task->acquire_tile(this, tile, task->tile_types)) { | ||||
| if (tile.task == RenderTile::PATH_TRACE) { | if (tile.task == RenderTile::PATH_TRACE) { | ||||
| if (use_split_kernel()) { | if (use_split_kernel()) { | ||||
| device_only_memory<uchar> void_buffer(this, "void_buffer"); | device_only_memory<uchar> void_buffer(this, "void_buffer"); | ||||
| split_kernel->path_trace(task, tile, void_buffer, void_buffer); | split_kernel->path_trace(task, tile, void_buffer, void_buffer); | ||||
| } | } | ||||
| else { | else { | ||||
| path_trace(*task, tile, work_tiles); | render(*task, tile, work_tiles); | ||||
| } | |||||
| } | } | ||||
| else if (tile.task == RenderTile::BAKE) { | |||||
| render(*task, tile, work_tiles); | |||||
| } | } | ||||
| else if (tile.task == RenderTile::DENOISE) { | else if (tile.task == RenderTile::DENOISE) { | ||||
| tile.sample = tile.start_sample + tile.num_samples; | tile.sample = tile.start_sample + tile.num_samples; | ||||
| denoise(tile, denoising); | denoise(tile, denoising); | ||||
| task->update_progress(&tile, tile.w * tile.h); | task->update_progress(&tile, tile.w * tile.h); | ||||
| } | } | ||||
| ▲ Show 20 Lines • Show All 311 Lines • Show Last 20 Lines | |||||