intern/cycles/device/device_cuda.cpp
[590 lines skipped]
@@ … @@ void reserve_local_memory(const DeviceRequestedFeatures& requested_features)
 	 * needed for kernel launches, so that we can reliably figure out when
 	 * to allocate scene data in mapped host memory. */
 	CUDAContextScope scope(this);
 	size_t total = 0, free_before = 0, free_after = 0;
 	cuMemGetInfo(&free_before, &total);
 	/* Get kernel function. */
-	CUfunction cuPathTrace;
+	CUfunction cuRender;
-	if(requested_features.use_integrator_branched) {
-		cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+	if(requested_features.use_baking) {
+		cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
+	}
+	else if(requested_features.use_integrator_branched) {
+		cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
 	}
 	else {
-		cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+		cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
 	}
-	cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+	cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
 	int min_blocks, num_threads_per_block;
-	cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+	cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
 	/* Launch kernel, using just 1 block appears sufficient to reserve
 	 * memory for all multiprocessors. It would be good to do this in
 	 * parallel for the multi GPU case still to make it faster. */
 	CUdeviceptr d_work_tiles = 0;
 	uint total_work_size = 0;
 	void *args[] = {&d_work_tiles,
 	                &total_work_size};
-	cuda_assert(cuLaunchKernel(cuPathTrace,
+	cuda_assert(cuLaunchKernel(cuRender,
 	                           1, 1, 1,
 	                           num_threads_per_block, 1, 1,
 	                           0, 0, args, 0));
 	cuda_assert(cuCtxSynchronize());
 	cuMemGetInfo(&free_after, &total);
 	VLOG(1) << "Local memory reserved "
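CUDA commits per-thread local (spill) memory lazily, on a kernel's first launch, so the hunk above launches the selected render kernel once over an empty work tile purely to force that allocation up front, then reads the before/after delta from cuMemGetInfo. A standalone sketch of the same trick against the driver API, assuming an already-initialized context and a loaded CUmodule whose kernel takes the same (work tiles pointer, work size) argument pair as the render kernels here; the CU_CHECK macro is a hypothetical stand-in for cuda_assert:

#include <cuda.h>
#include <cstdio>
#include <cstdlib>

#define CU_CHECK(expr) \
	do { CUresult r_ = (expr); if (r_ != CUDA_SUCCESS) { \
		fprintf(stderr, "CUDA error %d at %s\n", (int)r_, #expr); abort(); } } while (0)

/* Launch `kernel_name` once with zero work so the driver commits local
 * memory for every multiprocessor, then report how much was reserved. */
static void reserve_local_memory(CUmodule module, const char *kernel_name)
{
	size_t total = 0, free_before = 0, free_after = 0;
	CU_CHECK(cuMemGetInfo(&free_before, &total));

	CUfunction func;
	CU_CHECK(cuModuleGetFunction(&func, module, kernel_name));
	CU_CHECK(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));

	/* Ask the occupancy calculator for a good block size; a single block
	 * of that size is enough to trigger the allocation. */
	int min_blocks = 0, num_threads_per_block = 0;
	CU_CHECK(cuOccupancyMaxPotentialBlockSize(
	    &min_blocks, &num_threads_per_block, func, NULL, 0, 0));

	CUdeviceptr d_work_tiles = 0;      /* no tile data */
	unsigned int total_work_size = 0;  /* zero work: kernel returns immediately */
	void *args[] = {&d_work_tiles, &total_work_size};
	CU_CHECK(cuLaunchKernel(func, 1, 1, 1,
	                        num_threads_per_block, 1, 1,
	                        0, 0, args, 0));
	CU_CHECK(cuCtxSynchronize());

	cuMemGetInfo(&free_after, &total);
	printf("Local memory reserved: %zu bytes\n", free_before - free_after);
}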
[1,003 lines skipped]
@@ … @@ void denoise(RenderTile &rtile, DenoisingTask& denoising, const DeviceTask &task)
 	denoising.init_from_devicetask(task);
 	denoising.run_denoising();
 	task.unmap_neighbor_tiles(rtiles, this);
 }

-void path_trace(DeviceTask& task, RenderTile& rtile, device_vector<WorkTile>& work_tiles)
+void render(DeviceTask& task, RenderTile& rtile, device_vector<WorkTile>& work_tiles)
 {
 	scoped_timer timer(&rtile.buffers->render_time);
 	if(have_error())
 		return;
 	CUDAContextScope scope(this);
-	CUfunction cuPathTrace;
+	CUfunction cuRender;
 	/* Get kernel function. */
-	if(task.integrator_branched) {
-		cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+	if(rtile.task == RenderTile::BAKE) {
+		cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
+	}
+	else if(task.integrator_branched) {
+		cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
 	}
 	else {
-		cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+		cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
 	}
 	if(have_error()) {
 		return;
 	}
-	cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+	cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
 	/* Allocate work tile. */
 	work_tiles.alloc(1);
 	WorkTile *wtile = work_tiles.data();
 	wtile->x = rtile.x;
 	wtile->y = rtile.y;
 	wtile->w = rtile.w;
 	wtile->h = rtile.h;
 	wtile->offset = rtile.offset;
 	wtile->stride = rtile.stride;
 	wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
 	/* Prepare work size. More step samples render faster, but for now we
 	 * remain conservative for GPUs connected to a display to avoid driver
 	 * timeouts and display freezing. */
 	int min_blocks, num_threads_per_block;
-	cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+	cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
 	if(!info.display_device) {
 		min_blocks *= 8;
 	}
 	uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
 	/* Render all samples. */
 	int start_sample = rtile.start_sample;
 	int end_sample = rtile.start_sample + rtile.num_samples;
 	for(int sample = start_sample; sample < end_sample; sample += step_samples) {
 		/* Setup and copy work tile to device. */
 		wtile->start_sample = sample;
 		wtile->num_samples = min(step_samples, end_sample - sample);
 		work_tiles.copy_to_device();
 		CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
 		uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
 		uint num_blocks = divide_up(total_work_size, num_threads_per_block);
 		/* Launch kernel. */
 		void *args[] = {&d_work_tiles,
 		                &total_work_size};
-		cuda_assert(cuLaunchKernel(cuPathTrace,
+		cuda_assert(cuLaunchKernel(cuRender,
 		                           num_blocks, 1, 1,
 		                           num_threads_per_block, 1, 1,
 		                           0, 0, args, 0));
 		cuda_assert(cuCtxSynchronize());
 		/* Update progress. */
 		rtile.sample = sample + wtile->num_samples;
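The sampling loop above is worth spelling out: each cuLaunchKernel covers step_samples samples over the whole tile, where step_samples is derived from the occupancy result so that a single launch has at least min_blocks * num_threads_per_block work items, and the eightfold bump is applied only on non-display devices, where a long launch cannot trip the display watchdog. A minimal sketch of that arithmetic with illustrative tile and occupancy numbers (all values hypothetical):

#include <algorithm>
#include <cstdio>

static unsigned int divide_up(unsigned int x, unsigned int y)
{
	return (x + y - 1) / y;  /* ceiling division, as in Cycles' util math */
}

int main()
{
	const unsigned int tile_w = 64, tile_h = 64;      /* hypothetical tile size */
	unsigned int min_blocks = 40;                     /* as if from cuOccupancyMaxPotentialBlockSize */
	const unsigned int num_threads_per_block = 256;
	const bool display_device = false;

	if (!display_device)
		min_blocks *= 8;  /* longer launches are safe without a watchdog */

	/* Enough samples per launch to fill the suggested number of blocks. */
	const unsigned int step_samples =
	    divide_up(min_blocks * num_threads_per_block, tile_w * tile_h);

	const int start_sample = 0, end_sample = 128;     /* hypothetical range */
	for (int sample = start_sample; sample < end_sample; sample += step_samples) {
		const unsigned int num_samples =
		    std::min(step_samples, (unsigned int)(end_sample - sample));
		const unsigned int total_work_size = tile_w * tile_h * num_samples;
		const unsigned int num_blocks = divide_up(total_work_size, num_threads_per_block);
		printf("launch: samples %d..%d in %u blocks\n",
		       sample, sample + (int)num_samples - 1, num_blocks);
	}
	return 0;
}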
[67 lines skipped]
@@ … @@ void shader(DeviceTask& task)
 	CUDAContextScope scope(this);
 	CUfunction cuShader;
 	CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
 	CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
 	/* get kernel function */
-	if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
-		cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
-	}
-	else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+	if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
 		cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
 	}
 	else {
 		cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
 	}
 	/* do tasks in smaller chunks, so we can cancel it */
 	const int shader_chunk_size = 65536;
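The chunking constant at the end of this hunk is what keeps shader evaluation responsive: rather than one launch over every shading point, work is dispatched in slices of at most 65536 points so the host can check for cancellation between launches. A minimal sketch of the pattern; the get_cancel and launch_chunk callbacks are hypothetical stand-ins for the task's cancel flag and the cuLaunchKernel call over one slice:

#include <algorithm>
#include <functional>

/* Dispatch [0, num_points) in slices so cancellation is checked between
 * kernel launches rather than only after all work is done. */
static void eval_in_chunks(int num_points,
                           const std::function<bool()> &get_cancel,
                           const std::function<void(int offset, int count)> &launch_chunk)
{
	const int shader_chunk_size = 65536;

	for (int offset = 0; offset < num_points; offset += shader_chunk_size) {
		if (get_cancel())
			return;  /* host regains control between launches */

		const int count = std::min(shader_chunk_size, num_points - offset);
		launch_chunk(offset, count);  /* one synchronous launch per slice */
	}
}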
[275 lines skipped]
@@ … @@ if(task->type == DeviceTask::RENDER) {
 		while(task->acquire_tile(this, tile)) {
 			if(tile.task == RenderTile::PATH_TRACE) {
 				if(use_split_kernel()) {
 					device_only_memory<uchar> void_buffer(this, "void_buffer");
 					split_kernel->path_trace(task, tile, void_buffer, void_buffer);
 				}
 				else {
-					path_trace(*task, tile, work_tiles);
+					render(*task, tile, work_tiles);
 				}
 			}
+			else if(tile.task == RenderTile::BAKE) {
+				render(*task, tile, work_tiles);
+			}
 			else if(tile.task == RenderTile::DENOISE) {
 				tile.sample = tile.start_sample + tile.num_samples;
 				denoise(tile, denoising, *task);
 				task->update_progress(&tile, tile.w*tile.h);
 			}
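After this hunk the tile loop has three cases: PATH_TRACE tiles keep their split-kernel branch, the new BAKE tiles go straight to the unified render() entry point (which selects kernel_cuda_bake from rtile.task, per the earlier hunk), and DENOISE tiles are unchanged. Stripped of the device plumbing, the control flow reduces to the following sketch (types simplified, split-kernel branch folded away, helper names hypothetical):

#include <cstdio>

enum class TileTask { PATH_TRACE, BAKE, DENOISE };

/* Hypothetical stand-ins for the device methods in the patch. */
static void render_tile(TileTask t) { printf("render() for tile task %d\n", (int)t); }
static void denoise_tile()          { printf("denoise()\n"); }

/* Simplified shape of the per-tile dispatch after the change: path
 * tracing and baking share one entry point, denoising keeps its own. */
static void dispatch_tile(TileTask task)
{
	if (task == TileTask::PATH_TRACE || task == TileTask::BAKE)
		render_tile(task);  /* render() picks the kernel from the tile task */
	else if (task == TileTask::DENOISE)
		denoise_tile();
}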
[566 lines skipped]