Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/device/device_optix.cpp
| Show First 20 Lines • Show All 171 Lines • ▼ Show 20 Lines | # endif | ||||
| bool motion_blur = false; | bool motion_blur = false; | ||||
| bool need_texture_info = false; | bool need_texture_info = false; | ||||
| device_vector<SbtRecord> sbt_data; | device_vector<SbtRecord> sbt_data; | ||||
| device_vector<TextureInfo> texture_info; | device_vector<TextureInfo> texture_info; | ||||
| device_only_memory<KernelParams> launch_params; | device_only_memory<KernelParams> launch_params; | ||||
| vector<device_only_memory<uint8_t>> blas; | vector<device_only_memory<uint8_t>> blas; | ||||
| OptixTraversableHandle tlas_handle = 0; | OptixTraversableHandle tlas_handle = 0; | ||||
| // TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually | |||||
| int can_map_host = 0; | |||||
| size_t map_host_used = 0; | |||||
| size_t map_host_limit = 0; | |||||
| size_t device_working_headroom = 32 * 1024 * 1024LL; // 32MB | |||||
| size_t device_texture_headroom = 128 * 1024 * 1024LL; // 128MB | |||||
| map<device_memory *, CUDAMem> cuda_mem_map; | map<device_memory *, CUDAMem> cuda_mem_map; | ||||
| bool move_texture_to_host = false; | |||||
| public: | public: | ||||
| OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) | OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) | ||||
| : Device(info_, stats_, profiler_, background_), | : Device(info_, stats_, profiler_, background_), | ||||
| sbt_data(this, "__sbt", MEM_READ_ONLY), | sbt_data(this, "__sbt", MEM_READ_ONLY), | ||||
| texture_info(this, "__texture_info", MEM_TEXTURE), | texture_info(this, "__texture_info", MEM_TEXTURE), | ||||
| launch_params(this, "__params") | launch_params(this, "__params") | ||||
| { | { | ||||
| // Store number of CUDA streams in device info | // Store number of CUDA streams in device info | ||||
| info.cpu_threads = DebugFlags().optix.cuda_streams; | info.cpu_threads = DebugFlags().optix.cuda_streams; | ||||
| // Initialize CUDA driver API | // Initialize CUDA driver API | ||||
| check_result_cuda(cuInit(0)); | check_result_cuda(cuInit(0)); | ||||
| // Retrieve the primary CUDA context for this device | // Retrieve the primary CUDA context for this device | ||||
| check_result_cuda(cuDeviceGet(&cuda_device, info.num)); | check_result_cuda(cuDeviceGet(&cuda_device, info.num)); | ||||
| check_result_cuda(cuDevicePrimaryCtxRetain(&cuda_context, cuda_device)); | check_result_cuda(cuDevicePrimaryCtxRetain(&cuda_context, cuda_device)); | ||||
| // Make that CUDA context current | // Make that CUDA context current | ||||
| const CUDAContextScope scope(cuda_context); | const CUDAContextScope scope(cuda_context); | ||||
| // Limit amount of host mapped memory (see init_host_memory in device_cuda.cpp) | |||||
| size_t default_limit = 4 * 1024 * 1024 * 1024LL; | |||||
| size_t system_ram = system_physical_ram(); | |||||
| if (system_ram > 0) { | |||||
| if (system_ram / 2 > default_limit) { | |||||
| map_host_limit = system_ram - default_limit; | |||||
| } | |||||
| else { | |||||
| map_host_limit = system_ram / 2; | |||||
| } | |||||
| } | |||||
| else { | |||||
| VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; | |||||
| } | |||||
| // Check device support for pinned host memory | |||||
| check_result_cuda( | |||||
| cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuda_device)); | |||||
| // Create OptiX context for this device | // Create OptiX context for this device | ||||
| OptixDeviceContextOptions options = {}; | OptixDeviceContextOptions options = {}; | ||||
| # ifdef WITH_CYCLES_LOGGING | # ifdef WITH_CYCLES_LOGGING | ||||
| options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4 | options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4 | ||||
| options.logCallbackFunction = | options.logCallbackFunction = | ||||
| [](unsigned int level, const char *, const char *message, void *) { | [](unsigned int level, const char *, const char *message, void *) { | ||||
| switch (level) { | switch (level) { | ||||
| case 1: | case 1: | ||||
| ▲ Show 20 Lines • Show All 611 Lines • ▼ Show 20 Lines | bool build_optix_bvh(const OptixBuildInput &build_input, | ||||
| check_result_optix_ret( | check_result_optix_ret( | ||||
| optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); | optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); | ||||
| // Allocate required output buffers | // Allocate required output buffers | ||||
| device_only_memory<char> temp_mem(this, "temp_build_mem"); | device_only_memory<char> temp_mem(this, "temp_build_mem"); | ||||
| temp_mem.alloc_to_device(sizes.tempSizeInBytes); | temp_mem.alloc_to_device(sizes.tempSizeInBytes); | ||||
| out_data.type = MEM_DEVICE_ONLY; | |||||
| out_data.data_type = TYPE_UNKNOWN; | out_data.data_type = TYPE_UNKNOWN; | ||||
| out_data.data_elements = 1; | out_data.data_elements = 1; | ||||
| out_data.data_size = sizes.outputSizeInBytes; | out_data.data_size = sizes.outputSizeInBytes; | ||||
| mem_alloc(out_data); | mem_alloc(out_data); | ||||
| // Finally build the acceleration structure | // Finally build the acceleration structure | ||||
| check_result_optix_ret(optixAccelBuild(context, | check_result_optix_ret(optixAccelBuild(context, | ||||
| NULL, | NULL, | ||||
| ▲ Show 20 Lines • Show All 326 Lines • ▼ Show 20 Lines | void update_launch_params(const char *name, size_t offset, void *data, size_t data_size) | ||||
| CUdeviceptr mem = 0; | CUdeviceptr mem = 0; | ||||
| check_result_cuda(cuModuleGetGlobal(&mem, &bytes, cuda_module, name)); | check_result_cuda(cuModuleGetGlobal(&mem, &bytes, cuda_module, name)); | ||||
| assert(mem != NULL && bytes == data_size); | assert(mem != NULL && bytes == data_size); | ||||
| check_result_cuda(cuMemcpyHtoD(mem, data, data_size)); | check_result_cuda(cuMemcpyHtoD(mem, data, data_size)); | ||||
| } | } | ||||
| void mem_alloc(device_memory &mem) override | void mem_alloc(device_memory &mem) override | ||||
| { | { | ||||
| const CUDAContextScope scope(cuda_context); | if (mem.type == MEM_PIXELS && !background) { | ||||
| assert(!"mem_alloc not supported for pixels."); | |||||
| } | |||||
| else if (mem.type == MEM_TEXTURE) { | |||||
| assert(!"mem_alloc not supported for textures."); | |||||
| } | |||||
| else { | |||||
| generic_alloc(mem); | |||||
| } | |||||
| } | |||||
| mem.device_size = mem.memory_size(); | CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0) | ||||
| { | |||||
| CUDAContextScope scope(cuda_context); | |||||
| CUdeviceptr device_pointer = 0; | |||||
| size_t size = mem.memory_size() + pitch_padding; | |||||
| CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; | |||||
| const char *status = ""; | |||||
| /* First try allocating in device memory, respecting headroom. We make | |||||
| * an exception for texture info. It is small and frequently accessed, | |||||
| * so treat it as working memory. | |||||
| * | |||||
| * If there is not enough room for working memory, we will try to move | |||||
| * textures to host memory, assuming the performance impact would have | |||||
| * been worse for working memory. */ | |||||
| bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); | |||||
| bool is_image = is_texture && (mem.data_height > 1); | |||||
| size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; | |||||
| if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) { | size_t total = 0, free = 0; | ||||
| CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information | cuMemGetInfo(&free, &total); | ||||
| CUDA_TEXTURE_DESC tex_desc = {}; | /* Move textures to host memory if needed. */ | ||||
| tex_desc.flags = CU_TRSF_NORMALIZED_COORDINATES; | if (!move_texture_to_host && !is_image && (size + headroom) >= free) { | ||||
| CUDA_RESOURCE_DESC res_desc = {}; | move_textures_to_host(size + headroom - free, is_texture); | ||||
| cuMemGetInfo(&free, &total); | |||||
| } | |||||
| /* Allocate in device memory. */ | |||||
| if (!move_texture_to_host && (size + headroom) < free) { | |||||
| mem_alloc_result = cuMemAlloc(&device_pointer, size); | |||||
| if (mem_alloc_result == CUDA_SUCCESS) { | |||||
| status = " in device memory"; | |||||
| } | |||||
| } | |||||
| /* Fall back to mapped host memory if needed and possible. */ | |||||
| void *map_host_pointer = 0; | |||||
| bool free_map_host = false; | |||||
| if (mem_alloc_result != CUDA_SUCCESS && can_map_host && | |||||
| map_host_used + size < map_host_limit) { | |||||
| if (mem.shared_pointer) { | |||||
| /* Another device already allocated host memory. */ | |||||
| mem_alloc_result = CUDA_SUCCESS; | |||||
| map_host_pointer = mem.shared_pointer; | |||||
| } | |||||
| else { | |||||
| /* Allocate host memory ourselves. */ | |||||
| mem_alloc_result = cuMemHostAlloc( | |||||
| &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); | |||||
| mem.shared_pointer = map_host_pointer; | |||||
| free_map_host = true; | |||||
| } | |||||
| if (mem_alloc_result == CUDA_SUCCESS) { | |||||
| cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0); | |||||
| map_host_used += size; | |||||
| status = " in host memory"; | |||||
| /* Replace host pointer with our host allocation. Only works if | |||||
| * CUDA memory layout is the same and has no pitch padding. Also | |||||
| * does not work if we move textures to host during a render, | |||||
| * since other devices might be using the memory. */ | |||||
| if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && | |||||
| mem.host_pointer != mem.shared_pointer) { | |||||
| memcpy(mem.shared_pointer, mem.host_pointer, size); | |||||
| mem.host_free(); | |||||
| mem.host_pointer = mem.shared_pointer; | |||||
| } | |||||
| } | |||||
| else { | |||||
| status = " failed, out of host memory"; | |||||
| } | |||||
| } | |||||
| else if (mem_alloc_result != CUDA_SUCCESS) { | |||||
| status = " failed, out of device and host memory"; | |||||
| } | |||||
| if (mem.name) { | |||||
| VLOG(1) << "Buffer allocate: " << mem.name << ", " | |||||
| << string_human_readable_number(mem.memory_size()) << " bytes. (" | |||||
| << string_human_readable_size(mem.memory_size()) << ")" << status; | |||||
| } | |||||
| if (mem_alloc_result != CUDA_SUCCESS) { | |||||
| set_error(string_printf("Buffer allocate %s", status)); | |||||
| return NULL; | |||||
| } | |||||
| mem.device_pointer = (device_ptr)device_pointer; | |||||
| mem.device_size = size; | |||||
| stats.mem_alloc(size); | |||||
| if (!mem.device_pointer) { | |||||
| return NULL; | |||||
| } | |||||
| /* Insert into map of allocations. */ | |||||
| CUDAMem *cmem = &cuda_mem_map[&mem]; | |||||
| cmem->map_host_pointer = map_host_pointer; | |||||
| cmem->free_map_host = free_map_host; | |||||
| return cmem; | |||||
| } | |||||
| void tex_alloc(device_memory &mem) | |||||
| { | |||||
| CUDAContextScope scope(cuda_context); | |||||
| /* General variables for both architectures */ | |||||
| string bind_name = mem.name; | |||||
| size_t dsize = datatype_size(mem.data_type); | |||||
| size_t size = mem.memory_size(); | |||||
| CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; | |||||
| switch (mem.extension) { | switch (mem.extension) { | ||||
| default: | |||||
| assert(0); | |||||
| case EXTENSION_REPEAT: | case EXTENSION_REPEAT: | ||||
| tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] = | address_mode = CU_TR_ADDRESS_MODE_WRAP; | ||||
| CU_TR_ADDRESS_MODE_WRAP; | |||||
| break; | break; | ||||
| case EXTENSION_EXTEND: | case EXTENSION_EXTEND: | ||||
| tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] = | address_mode = CU_TR_ADDRESS_MODE_CLAMP; | ||||
| CU_TR_ADDRESS_MODE_CLAMP; | |||||
| break; | break; | ||||
| case EXTENSION_CLIP: | case EXTENSION_CLIP: | ||||
| tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] = | address_mode = CU_TR_ADDRESS_MODE_BORDER; | ||||
| CU_TR_ADDRESS_MODE_BORDER; | break; | ||||
| default: | |||||
| assert(0); | |||||
| break; | break; | ||||
| } | } | ||||
| switch (mem.interpolation) { | CUfilter_mode filter_mode; | ||||
| default: // Default to linear for unsupported interpolation types | if (mem.interpolation == INTERPOLATION_CLOSEST) { | ||||
| case INTERPOLATION_LINEAR: | filter_mode = CU_TR_FILTER_MODE_POINT; | ||||
| tex_desc.filterMode = CU_TR_FILTER_MODE_LINEAR; | } | ||||
| break; | else { | ||||
| case INTERPOLATION_CLOSEST: | filter_mode = CU_TR_FILTER_MODE_LINEAR; | ||||
| tex_desc.filterMode = CU_TR_FILTER_MODE_POINT; | } | ||||
| break; | |||||
| /* Data Storage */ | |||||
| if (mem.interpolation == INTERPOLATION_NONE) { | |||||
| generic_alloc(mem); | |||||
| generic_copy_to(mem); | |||||
| // Update data storage pointers in launch parameters | |||||
| # define KERNEL_TEX(data_type, tex_name) \ | |||||
| if (strcmp(mem.name, #tex_name) == 0) \ | |||||
| update_launch_params( \ | |||||
| mem.name, offsetof(KernelParams, tex_name), &mem.device_pointer, sizeof(device_ptr)); | |||||
| # include "kernel/kernel_textures.h" | |||||
| # undef KERNEL_TEX | |||||
| return; | |||||
| } | } | ||||
| CUarray_format format; | /* Image Texture Storage */ | ||||
| CUarray_format_enum format; | |||||
| switch (mem.data_type) { | switch (mem.data_type) { | ||||
| default: | |||||
| assert(0); | |||||
| case TYPE_UCHAR: | case TYPE_UCHAR: | ||||
| format = CU_AD_FORMAT_UNSIGNED_INT8; | format = CU_AD_FORMAT_UNSIGNED_INT8; | ||||
| break; | break; | ||||
| case TYPE_UINT16: | case TYPE_UINT16: | ||||
| format = CU_AD_FORMAT_UNSIGNED_INT16; | format = CU_AD_FORMAT_UNSIGNED_INT16; | ||||
| break; | break; | ||||
| case TYPE_UINT: | case TYPE_UINT: | ||||
| format = CU_AD_FORMAT_UNSIGNED_INT32; | format = CU_AD_FORMAT_UNSIGNED_INT32; | ||||
| break; | break; | ||||
| case TYPE_INT: | case TYPE_INT: | ||||
| format = CU_AD_FORMAT_SIGNED_INT32; | format = CU_AD_FORMAT_SIGNED_INT32; | ||||
| break; | break; | ||||
| case TYPE_FLOAT: | case TYPE_FLOAT: | ||||
| format = CU_AD_FORMAT_FLOAT; | format = CU_AD_FORMAT_FLOAT; | ||||
| break; | break; | ||||
| case TYPE_HALF: | case TYPE_HALF: | ||||
| format = CU_AD_FORMAT_HALF; | format = CU_AD_FORMAT_HALF; | ||||
| break; | break; | ||||
| default: | |||||
| assert(0); | |||||
| return; | |||||
| } | } | ||||
| if (mem.data_depth > 1) { /* 3D texture using array. */ | CUDAMem *cmem = NULL; | ||||
| CUarray array_3d = NULL; | |||||
| size_t src_pitch = mem.data_width * dsize * mem.data_elements; | |||||
| size_t dst_pitch = src_pitch; | |||||
| if (mem.data_depth > 1) { | |||||
| /* 3D texture using array, there is no API for linear memory. */ | |||||
| CUDA_ARRAY3D_DESCRIPTOR desc; | CUDA_ARRAY3D_DESCRIPTOR desc; | ||||
| desc.Width = mem.data_width; | desc.Width = mem.data_width; | ||||
| desc.Height = mem.data_height; | desc.Height = mem.data_height; | ||||
| desc.Depth = mem.data_depth; | desc.Depth = mem.data_depth; | ||||
| desc.Format = format; | desc.Format = format; | ||||
| desc.NumChannels = mem.data_elements; | desc.NumChannels = mem.data_elements; | ||||
| desc.Flags = 0; | desc.Flags = 0; | ||||
| check_result_cuda(cuArray3DCreate(&cmem.array, &desc)); | VLOG(1) << "Array 3D allocate: " << mem.name << ", " | ||||
| mem.device_pointer = (device_ptr)cmem.array; | << string_human_readable_number(mem.memory_size()) << " bytes. (" | ||||
| << string_human_readable_size(mem.memory_size()) << ")"; | |||||
| res_desc.resType = CU_RESOURCE_TYPE_ARRAY; | check_result_cuda(cuArray3DCreate(&array_3d, &desc)); | ||||
| res_desc.res.array.hArray = cmem.array; | |||||
| if (!array_3d) { | |||||
| return; | |||||
| } | } | ||||
| else if (mem.data_height > 0) { /* 2D texture using array. */ | |||||
| CUDA_ARRAY_DESCRIPTOR desc; | |||||
| desc.Width = mem.data_width; | |||||
| desc.Height = mem.data_height; | |||||
| desc.Format = format; | |||||
| desc.NumChannels = mem.data_elements; | |||||
| check_result_cuda(cuArrayCreate(&cmem.array, &desc)); | CUDA_MEMCPY3D param; | ||||
| mem.device_pointer = (device_ptr)cmem.array; | memset(&param, 0, sizeof(param)); | ||||
| param.dstMemoryType = CU_MEMORYTYPE_ARRAY; | |||||
| param.dstArray = array_3d; | |||||
| param.srcMemoryType = CU_MEMORYTYPE_HOST; | |||||
| param.srcHost = mem.host_pointer; | |||||
| param.srcPitch = src_pitch; | |||||
| param.WidthInBytes = param.srcPitch; | |||||
| param.Height = mem.data_height; | |||||
| param.Depth = mem.data_depth; | |||||
| check_result_cuda(cuMemcpy3D(&param)); | |||||
| res_desc.resType = CU_RESOURCE_TYPE_ARRAY; | mem.device_pointer = (device_ptr)array_3d; | ||||
| res_desc.res.array.hArray = cmem.array; | mem.device_size = size; | ||||
| stats.mem_alloc(size); | |||||
| cmem = &cuda_mem_map[&mem]; | |||||
| cmem->texobject = 0; | |||||
| cmem->array = array_3d; | |||||
| } | } | ||||
| else { | else if (mem.data_height > 0) { | ||||
| check_result_cuda(cuMemAlloc((CUdeviceptr *)&mem.device_pointer, mem.device_size)); | /* 2D texture, using pitch aligned linear memory. */ | ||||
| int alignment = 0; | |||||
| check_result_cuda(cuDeviceGetAttribute( | |||||
| &alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuda_device)); | |||||
| dst_pitch = align_up(src_pitch, alignment); | |||||
| size_t dst_size = dst_pitch * mem.data_height; | |||||
| res_desc.resType = CU_RESOURCE_TYPE_LINEAR; | cmem = generic_alloc(mem, dst_size - mem.memory_size()); | ||||
| res_desc.res.linear.devPtr = (CUdeviceptr)mem.device_pointer; | if (!cmem) { | ||||
| res_desc.res.linear.format = format; | return; | ||||
| res_desc.res.linear.numChannels = mem.data_elements; | |||||
| res_desc.res.linear.sizeInBytes = mem.device_size; | |||||
| } | } | ||||
| check_result_cuda(cuTexObjectCreate(&cmem.texobject, &res_desc, &tex_desc, NULL)); | CUDA_MEMCPY2D param; | ||||
| memset(&param, 0, sizeof(param)); | |||||
| param.dstMemoryType = CU_MEMORYTYPE_DEVICE; | |||||
| param.dstDevice = mem.device_pointer; | |||||
| param.dstPitch = dst_pitch; | |||||
| param.srcMemoryType = CU_MEMORYTYPE_HOST; | |||||
| param.srcHost = mem.host_pointer; | |||||
| param.srcPitch = src_pitch; | |||||
| param.WidthInBytes = param.srcPitch; | |||||
| param.Height = mem.data_height; | |||||
| check_result_cuda(cuMemcpy2DUnaligned(&param)); | |||||
| } | |||||
| else { | |||||
| /* 1D texture, using linear memory. */ | |||||
| cmem = generic_alloc(mem); | |||||
| if (!cmem) { | |||||
| return; | |||||
| } | |||||
| check_result_cuda(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); | |||||
| } | |||||
| /* Kepler+, bindless textures. */ | |||||
| int flat_slot = 0; | int flat_slot = 0; | ||||
| if (string_startswith(mem.name, "__tex_image")) { | if (string_startswith(mem.name, "__tex_image")) { | ||||
| flat_slot = atoi(mem.name + string(mem.name).rfind("_") + 1); | int pos = string(mem.name).rfind("_"); | ||||
| flat_slot = atoi(mem.name + pos + 1); | |||||
| } | |||||
| else { | |||||
| assert(0); | |||||
| } | } | ||||
| if (flat_slot >= texture_info.size()) | CUDA_RESOURCE_DESC resDesc; | ||||
| memset(&resDesc, 0, sizeof(resDesc)); | |||||
| if (array_3d) { | |||||
| resDesc.resType = CU_RESOURCE_TYPE_ARRAY; | |||||
| resDesc.res.array.hArray = array_3d; | |||||
| resDesc.flags = 0; | |||||
| } | |||||
| else if (mem.data_height > 0) { | |||||
| resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; | |||||
| resDesc.res.pitch2D.devPtr = mem.device_pointer; | |||||
| resDesc.res.pitch2D.format = format; | |||||
| resDesc.res.pitch2D.numChannels = mem.data_elements; | |||||
| resDesc.res.pitch2D.height = mem.data_height; | |||||
| resDesc.res.pitch2D.width = mem.data_width; | |||||
| resDesc.res.pitch2D.pitchInBytes = dst_pitch; | |||||
| } | |||||
| else { | |||||
| resDesc.resType = CU_RESOURCE_TYPE_LINEAR; | |||||
| resDesc.res.linear.devPtr = mem.device_pointer; | |||||
| resDesc.res.linear.format = format; | |||||
| resDesc.res.linear.numChannels = mem.data_elements; | |||||
| resDesc.res.linear.sizeInBytes = mem.device_size; | |||||
| } | |||||
| CUDA_TEXTURE_DESC texDesc; | |||||
| memset(&texDesc, 0, sizeof(texDesc)); | |||||
| texDesc.addressMode[0] = address_mode; | |||||
| texDesc.addressMode[1] = address_mode; | |||||
| texDesc.addressMode[2] = address_mode; | |||||
| texDesc.filterMode = filter_mode; | |||||
| texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; | |||||
| check_result_cuda(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); | |||||
| /* Resize once */ | |||||
| if (flat_slot >= texture_info.size()) { | |||||
| /* Allocate some slots in advance, to reduce amount | |||||
| * of re-allocations. */ | |||||
| texture_info.resize(flat_slot + 128); | texture_info.resize(flat_slot + 128); | ||||
| } | |||||
| /* Set Mapping and tag that we need to (re-)upload to device */ | |||||
| TextureInfo &info = texture_info[flat_slot]; | TextureInfo &info = texture_info[flat_slot]; | ||||
| info.data = (uint64_t)cmem.texobject; | info.data = (uint64_t)cmem->texobject; | ||||
| info.cl_buffer = 0; | info.cl_buffer = 0; | ||||
| info.interpolation = mem.interpolation; | info.interpolation = mem.interpolation; | ||||
| info.extension = mem.extension; | info.extension = mem.extension; | ||||
| info.width = mem.data_width; | info.width = mem.data_width; | ||||
| info.height = mem.data_height; | info.height = mem.data_height; | ||||
| info.depth = mem.data_depth; | info.depth = mem.data_depth; | ||||
| // Texture information has changed and needs an update, delay this to next launch | |||||
| need_texture_info = true; | need_texture_info = true; | ||||
| } | } | ||||
| else { | |||||
| // This is not a texture but simple linear memory | |||||
| check_result_cuda(cuMemAlloc((CUdeviceptr *)&mem.device_pointer, mem.device_size)); | |||||
| // Update data storage pointers in launch parameters | |||||
| # define KERNEL_TEX(data_type, tex_name) \ | |||||
| if (strcmp(mem.name, #tex_name) == 0) \ | |||||
| update_launch_params( \ | |||||
| mem.name, offsetof(KernelParams, tex_name), &mem.device_pointer, sizeof(device_ptr)); | |||||
| # include "kernel/kernel_textures.h" | |||||
| # undef KERNEL_TEX | |||||
| } | |||||
| stats.mem_alloc(mem.device_size); | |||||
| } | |||||
| void mem_copy_to(device_memory &mem) override | void mem_copy_to(device_memory &mem) override | ||||
| { | { | ||||
| if (!mem.host_pointer || mem.host_pointer == mem.shared_pointer) | if (mem.type == MEM_PIXELS) { | ||||
| return; | assert(!"mem_copy_to not supported for pixels."); | ||||
| if (!mem.device_pointer) | |||||
| mem_alloc(mem); // Need to allocate memory first if it does not exist yet | |||||
| const CUDAContextScope scope(cuda_context); | |||||
| if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) { | |||||
| const CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information | |||||
| size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements; | |||||
| if (mem.data_depth > 1) { | |||||
| CUDA_MEMCPY3D param; | |||||
| memset(&param, 0, sizeof(param)); | |||||
| param.dstMemoryType = CU_MEMORYTYPE_ARRAY; | |||||
| param.dstArray = cmem.array; | |||||
| param.srcMemoryType = CU_MEMORYTYPE_HOST; | |||||
| param.srcHost = mem.host_pointer; | |||||
| param.srcPitch = src_pitch; | |||||
| param.WidthInBytes = param.srcPitch; | |||||
| param.Height = mem.data_height; | |||||
| param.Depth = mem.data_depth; | |||||
| check_result_cuda(cuMemcpy3D(&param)); | |||||
| } | } | ||||
| else if (mem.data_height > 0) { | else if (mem.type == MEM_TEXTURE) { | ||||
| CUDA_MEMCPY2D param; | tex_free(mem); | ||||
| memset(&param, 0, sizeof(param)); | tex_alloc(mem); | ||||
| param.dstMemoryType = CU_MEMORYTYPE_ARRAY; | |||||
| param.dstArray = cmem.array; | |||||
| param.srcMemoryType = CU_MEMORYTYPE_HOST; | |||||
| param.srcHost = mem.host_pointer; | |||||
| param.srcPitch = src_pitch; | |||||
| param.WidthInBytes = param.srcPitch; | |||||
| param.Height = mem.data_height; | |||||
| check_result_cuda(cuMemcpy2D(&param)); | |||||
| } | } | ||||
| else { | else { | ||||
| check_result_cuda( | if (!mem.device_pointer) { | ||||
| cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.device_size)); | generic_alloc(mem); | ||||
| } | } | ||||
| generic_copy_to(mem); | |||||
| } | } | ||||
| else { | } | ||||
| // This is not a texture but simple linear memory | |||||
| void generic_copy_to(device_memory &mem) | |||||
| { | |||||
| if (mem.host_pointer && mem.device_pointer) { | |||||
| CUDAContextScope scope(cuda_context); | |||||
| if (mem.host_pointer != mem.shared_pointer) { | |||||
| check_result_cuda( | check_result_cuda( | ||||
| cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.device_size)); | cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size())); | ||||
| } | |||||
| } | } | ||||
| } | } | ||||
| void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override | void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override | ||||
| { | { | ||||
| if (mem.type == MEM_PIXELS && !background) { | |||||
| assert(!"mem_copy_from not supported for pixels."); | |||||
| } | |||||
| else if (mem.type == MEM_TEXTURE) { | |||||
| assert(!"mem_copy_from not supported for textures."); | |||||
| } | |||||
| else { | |||||
| // Calculate linear memory offset and size | // Calculate linear memory offset and size | ||||
| const size_t size = elem * w * h; | const size_t size = elem * w * h; | ||||
| const size_t offset = elem * y * w; | const size_t offset = elem * y * w; | ||||
| if (mem.host_pointer && mem.device_pointer) { | if (mem.host_pointer && mem.device_pointer) { | ||||
| const CUDAContextScope scope(cuda_context); | const CUDAContextScope scope(cuda_context); | ||||
| check_result_cuda(cuMemcpyDtoH( | check_result_cuda(cuMemcpyDtoH( | ||||
| (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); | (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); | ||||
| } | } | ||||
| else if (mem.host_pointer) { | else if (mem.host_pointer) { | ||||
| memset((char *)mem.host_pointer + offset, 0, size); | memset((char *)mem.host_pointer + offset, 0, size); | ||||
| } | } | ||||
| } | } | ||||
| } | |||||
| void mem_zero(device_memory &mem) override | void mem_zero(device_memory &mem) override | ||||
| { | { | ||||
| if (mem.host_pointer) | if (mem.host_pointer) | ||||
| memset(mem.host_pointer, 0, mem.memory_size()); | memset(mem.host_pointer, 0, mem.memory_size()); | ||||
| if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) | if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) | ||||
| return; // This is shared host memory, so no device memory to update | return; // This is shared host memory, so no device memory to update | ||||
| if (!mem.device_pointer) | if (!mem.device_pointer) | ||||
| mem_alloc(mem); // Need to allocate memory first if it does not exist yet | mem_alloc(mem); // Need to allocate memory first if it does not exist yet | ||||
| const CUDAContextScope scope(cuda_context); | const CUDAContextScope scope(cuda_context); | ||||
| check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size())); | check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size())); | ||||
| } | } | ||||
| void mem_free(device_memory &mem) override | void mem_free(device_memory &mem) override | ||||
| { | { | ||||
| assert(mem.device_pointer); | if (mem.type == MEM_PIXELS && !background) { | ||||
| assert(!"mem_free not supported for pixels."); | |||||
| const CUDAContextScope scope(cuda_context); | } | ||||
| else if (mem.type == MEM_TEXTURE) { | |||||
| tex_free(mem); | |||||
| } | |||||
| else { | |||||
| generic_free(mem); | |||||
| } | |||||
| } | |||||
| if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) { | void generic_free(device_memory &mem) | ||||
| CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information | { | ||||
| if (mem.device_pointer) { | |||||
| CUDAContextScope scope(cuda_context); | |||||
| const CUDAMem &cmem = cuda_mem_map[&mem]; | |||||
| if (cmem.array) | if (cmem.map_host_pointer) { | ||||
| cuArrayDestroy(cmem.array); | /* Free host memory. */ | ||||
| else | if (cmem.free_map_host) { | ||||
| cuMemFree((CUdeviceptr)mem.device_pointer); | cuMemFreeHost(cmem.map_host_pointer); | ||||
| if (mem.host_pointer == mem.shared_pointer) { | |||||
| mem.host_pointer = 0; | |||||
| } | |||||
| mem.shared_pointer = 0; | |||||
| } | |||||
| if (cmem.texobject) | map_host_used -= mem.device_size; | ||||
| cuTexObjectDestroy(cmem.texobject); | |||||
| } | } | ||||
| else { | else { | ||||
| // This is not a texture but simple linear memory | /* Free device memory. */ | ||||
| cuMemFree((CUdeviceptr)mem.device_pointer); | cuMemFree(mem.device_pointer); | ||||
| } | } | ||||
| stats.mem_free(mem.device_size); | stats.mem_free(mem.device_size); | ||||
| mem.device_pointer = 0; | |||||
| mem.device_size = 0; | mem.device_size = 0; | ||||
| cuda_mem_map.erase(cuda_mem_map.find(&mem)); | |||||
| } | |||||
| } | |||||
| void tex_free(device_memory &mem) | |||||
| { | |||||
| if (mem.device_pointer) { | |||||
| CUDAContextScope scope(cuda_context); | |||||
| const CUDAMem &cmem = cuda_mem_map[&mem]; | |||||
| if (cmem.texobject) { | |||||
| /* Free bindless texture. */ | |||||
| cuTexObjectDestroy(cmem.texobject); | |||||
| } | |||||
| if (cmem.array) { | |||||
| /* Free array. */ | |||||
| cuArrayDestroy(cmem.array); | |||||
| stats.mem_free(mem.device_size); | |||||
| mem.device_pointer = 0; | mem.device_pointer = 0; | ||||
| mem.device_size = 0; | |||||
| cuda_mem_map.erase(cuda_mem_map.find(&mem)); | |||||
| } | |||||
| else { | |||||
| generic_free(mem); | |||||
| } | |||||
| } | |||||
| } | |||||
| void move_textures_to_host(size_t size, bool for_texture) | |||||
| { | |||||
| /* Signal to reallocate textures in host memory only. */ | |||||
| move_texture_to_host = true; | |||||
| while (size > 0) { | |||||
| /* Find suitable memory allocation to move. */ | |||||
| device_memory *max_mem = NULL; | |||||
| size_t max_size = 0; | |||||
| bool max_is_image = false; | |||||
| foreach (auto &pair, cuda_mem_map) { | |||||
| device_memory &mem = *pair.first; | |||||
| CUDAMem *cmem = &pair.second; | |||||
| bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); | |||||
| bool is_image = is_texture && (mem.data_height > 1); | |||||
| /* Can't move this type of memory. */ | |||||
| if (!is_texture || cmem->array) { | |||||
| continue; | |||||
| } | |||||
| /* Already in host memory. */ | |||||
| if (cmem->map_host_pointer) { | |||||
| continue; | |||||
| } | |||||
| /* For other textures, only move image textures. */ | |||||
| if (for_texture && !is_image) { | |||||
| continue; | |||||
| } | |||||
| /* Try to move largest allocation, prefer moving images. */ | |||||
| if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { | |||||
| max_is_image = is_image; | |||||
| max_size = mem.device_size; | |||||
| max_mem = &mem; | |||||
| } | |||||
| } | |||||
| /* Move to host memory. This part is mutex protected since | |||||
| * multiple CUDA devices could be moving the memory. The | |||||
| * first one will do it, and the rest will adopt the pointer. */ | |||||
| if (max_mem) { | |||||
| VLOG(1) << "Move memory from device to host: " << max_mem->name; | |||||
| static thread_mutex move_mutex; | |||||
| thread_scoped_lock lock(move_mutex); | |||||
| /* Preserve the original device pointer, in case of multi device | |||||
| * we can't change it because the pointer mapping would break. */ | |||||
| device_ptr prev_pointer = max_mem->device_pointer; | |||||
| size_t prev_size = max_mem->device_size; | |||||
| tex_free(*max_mem); | |||||
| tex_alloc(*max_mem); | |||||
| size = (max_size >= size) ? 0 : size - max_size; | |||||
| max_mem->device_pointer = prev_pointer; | |||||
| max_mem->device_size = prev_size; | |||||
| } | |||||
| else { | |||||
| break; | |||||
| } | |||||
| } | |||||
| /* Update texture info array with new pointers. */ | |||||
| update_texture_info(); | |||||
| move_texture_to_host = false; | |||||
| } | } | ||||
| void const_copy_to(const char *name, void *host, size_t size) override | void const_copy_to(const char *name, void *host, size_t size) override | ||||
| { | { | ||||
| if (strcmp(name, "__data") == 0) { | if (strcmp(name, "__data") == 0) { | ||||
| assert(size <= sizeof(KernelData)); | assert(size <= sizeof(KernelData)); | ||||
| // Fix traversable handle on multi devices | // Fix traversable handle on multi devices | ||||
| ▲ Show 20 Lines • Show All 551 Lines • Show Last 20 Lines | |||||