Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/device/device_cuda.cpp
| Show First 20 Lines • Show All 1,275 Lines • ▼ Show 20 Lines | void denoise(RenderTile &rtile, const DeviceTask &task) | ||||
| denoising.init_from_devicetask(task); | denoising.init_from_devicetask(task); | ||||
| denoising.run_denoising(); | denoising.run_denoising(); | ||||
| task.unmap_neighbor_tiles(rtiles, this); | task.unmap_neighbor_tiles(rtiles, this); | ||||
| } | } | ||||
| void path_trace(RenderTile& rtile, int sample, bool branched) | void path_trace(DeviceTask& task, RenderTile& rtile) | ||||
| { | { | ||||
| if(have_error()) | if(have_error()) | ||||
| return; | return; | ||||
| CUDAContextScope scope(this); | CUDAContextScope scope(this); | ||||
| CUfunction cuPathTrace; | CUfunction cuPathTrace; | ||||
| /* get kernel function */ | /* Get kernel function. */ | ||||
| if(branched) { | if(task.integrator_branched) { | ||||
| cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); | cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); | ||||
| } | } | ||||
| else { | else { | ||||
| cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); | cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); | ||||
| } | } | ||||
| if(have_error()) { | if(have_error()) { | ||||
| return; | return; | ||||
| } | } | ||||
| cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); | cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); | ||||
| /* allocate work tile */ | /* Allocate work tile. */ | ||||
| device_vector<WorkTile> work_tiles; | device_vector<WorkTile> work_tiles; | ||||
| work_tiles.resize(1); | work_tiles.resize(1); | ||||
| WorkTile *wtile = work_tiles.get_data(); | WorkTile *wtile = work_tiles.get_data(); | ||||
| wtile->x = rtile.x; | wtile->x = rtile.x; | ||||
| wtile->y = rtile.y; | wtile->y = rtile.y; | ||||
| wtile->w = rtile.w; | wtile->w = rtile.w; | ||||
| wtile->h = rtile.h; | wtile->h = rtile.h; | ||||
| wtile->offset = rtile.offset; | wtile->offset = rtile.offset; | ||||
| wtile->stride = rtile.stride; | wtile->stride = rtile.stride; | ||||
| wtile->start_sample = sample; | |||||
| wtile->num_samples = 1; | |||||
| wtile->buffer = (float*)cuda_device_ptr(rtile.buffer); | wtile->buffer = (float*)cuda_device_ptr(rtile.buffer); | ||||
| mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY); | mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY); | ||||
| mem_copy_to(work_tiles); | |||||
| CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); | CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); | ||||
| /* Prepare work size. More step samples render faster, but for now we | |||||
| * remain conservative to avoid driver timeouts. */ | |||||
| int min_blocks, num_threads_per_block; | |||||
| cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); | |||||
| uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);; | |||||
| /* Render all samples. */ | |||||
| int start_sample = rtile.start_sample; | |||||
| int end_sample = rtile.start_sample + rtile.num_samples; | |||||
| for(int sample = start_sample; sample < end_sample; sample += step_samples) { | |||||
| /* Setup and copy work tile to device. */ | |||||
| wtile->start_sample = sample; | |||||
| wtile->num_samples = min(step_samples, end_sample - sample);; | |||||
| mem_copy_to(work_tiles); | |||||
| uint total_work_size = wtile->w * wtile->h * wtile->num_samples; | uint total_work_size = wtile->w * wtile->h * wtile->num_samples; | ||||
| uint num_blocks = divide_up(total_work_size, num_threads_per_block); | |||||
| /* pass in parameters */ | /* Launch kernel. */ | ||||
| void *args[] = {&d_work_tiles, | void *args[] = {&d_work_tiles, | ||||
| &total_work_size}; | &total_work_size}; | ||||
| /* launch kernel */ | |||||
| int num_threads_per_block; | |||||
| cuda_assert(cuFuncGetAttribute(&num_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace)); | |||||
| int num_blocks = divide_up(total_work_size, num_threads_per_block); | |||||
| cuda_assert(cuLaunchKernel(cuPathTrace, | cuda_assert(cuLaunchKernel(cuPathTrace, | ||||
| num_blocks, 1, 1, | num_blocks, 1, 1, | ||||
| num_threads_per_block, 1, 1, | num_threads_per_block, 1, 1, | ||||
| 0, 0, args, 0)); | 0, 0, args, 0)); | ||||
| cuda_assert(cuCtxSynchronize()); | cuda_assert(cuCtxSynchronize()); | ||||
| /* Update progress. */ | |||||
| rtile.sample = sample + wtile->num_samples; | |||||
| task.update_progress(&rtile, rtile.w*rtile.h); | |||||
| if(task.get_cancel()) { | |||||
| if(task.need_finish_queue == false) | |||||
| break; | |||||
| } | |||||
| } | |||||
| mem_free(work_tiles); | mem_free(work_tiles); | ||||
| } | } | ||||
| void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) | void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) | ||||
| { | { | ||||
| if(have_error()) | if(have_error()) | ||||
| return; | return; | ||||
| ▲ Show 20 Lines • Show All 342 Lines • ▼ Show 20 Lines | #define CUDA_LAUNCH_KERNEL(func, args) \ | ||||
| void thread_run(DeviceTask *task) | void thread_run(DeviceTask *task) | ||||
| { | { | ||||
| CUDAContextScope scope(this); | CUDAContextScope scope(this); | ||||
| if(task->type == DeviceTask::RENDER) { | if(task->type == DeviceTask::RENDER) { | ||||
| RenderTile tile; | RenderTile tile; | ||||
| bool branched = task->integrator_branched; | |||||
| /* Upload Bindless Mapping */ | /* Upload Bindless Mapping */ | ||||
| load_bindless_mapping(); | load_bindless_mapping(); | ||||
| DeviceRequestedFeatures requested_features; | DeviceRequestedFeatures requested_features; | ||||
| if(use_split_kernel()) { | if(use_split_kernel()) { | ||||
| if(!use_adaptive_compilation()) { | if(!use_adaptive_compilation()) { | ||||
| requested_features.max_closure = 64; | requested_features.max_closure = 64; | ||||
| } | } | ||||
| if(split_kernel == NULL) { | if(split_kernel == NULL) { | ||||
| split_kernel = new CUDASplitKernel(this); | split_kernel = new CUDASplitKernel(this); | ||||
| split_kernel->load_kernels(requested_features); | split_kernel->load_kernels(requested_features); | ||||
| } | } | ||||
| } | } | ||||
| /* keep rendering tiles until done */ | /* keep rendering tiles until done */ | ||||
| while(task->acquire_tile(this, tile)) { | while(task->acquire_tile(this, tile)) { | ||||
| if(tile.task == RenderTile::PATH_TRACE) { | if(tile.task == RenderTile::PATH_TRACE) { | ||||
| if(use_split_kernel()) { | if(use_split_kernel()) { | ||||
| device_memory void_buffer; | device_memory void_buffer; | ||||
| split_kernel->path_trace(task, tile, void_buffer, void_buffer); | split_kernel->path_trace(task, tile, void_buffer, void_buffer); | ||||
| } | } | ||||
| else { | else { | ||||
| int start_sample = tile.start_sample; | path_trace(*task, tile); | ||||
| int end_sample = tile.start_sample + tile.num_samples; | |||||
| for(int sample = start_sample; sample < end_sample; sample++) { | |||||
| if(task->get_cancel()) { | |||||
| if(task->need_finish_queue == false) | |||||
| break; | |||||
| } | |||||
| path_trace(tile, sample, branched); | |||||
| tile.sample = sample + 1; | |||||
| task->update_progress(&tile, tile.w*tile.h); | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| else if(tile.task == RenderTile::DENOISE) { | else if(tile.task == RenderTile::DENOISE) { | ||||
| tile.sample = tile.start_sample + tile.num_samples; | tile.sample = tile.start_sample + tile.num_samples; | ||||
| denoise(tile, *task); | denoise(tile, *task); | ||||
| task->update_progress(&tile, tile.w*tile.h); | task->update_progress(&tile, tile.w*tile.h); | ||||
| ▲ Show 20 Lines • Show All 547 Lines • Show Last 20 Lines | |||||