Changeset View
Standalone View
intern/cycles/device/device_cuda.cpp
| Show First 20 Lines • Show All 79 Lines • ▼ Show 20 Lines | |||||
| class CUDADevice : public Device | class CUDADevice : public Device | ||||
| { | { | ||||
| public: | public: | ||||
| DedicatedTaskPool task_pool; | DedicatedTaskPool task_pool; | ||||
| CUdevice cuDevice; | CUdevice cuDevice; | ||||
| CUcontext cuContext; | CUcontext cuContext; | ||||
| CUmodule cuModule; | CUmodule cuModule; | ||||
| map<device_ptr, bool> tex_interp_map; | map<device_ptr, bool> tex_interp_map; | ||||
| int cuDevId; | int cuDevId; | ||||
sergey: Eventually we can merge these two maps into one like this:
struct MemInfo {
bool… | |||||
| int cuDevArchitecture; | int cuDevArchitecture; | ||||
| bool first_error; | bool first_error; | ||||
| bool use_texture_storage; | |||||
| struct PixelMem { | struct PixelMem { | ||||
| GLuint cuPBO; | GLuint cuPBO; | ||||
| CUgraphicsResource cuPBOresource; | CUgraphicsResource cuPBOresource; | ||||
| GLuint cuTexId; | GLuint cuTexId; | ||||
| int w, h; | int w, h; | ||||
| }; | }; | ||||
| map<device_ptr, PixelMem> pixel_mem_map; | map<device_ptr, PixelMem> pixel_mem_map; | ||||
| /* Bindless Textures */ | |||||
| device_vector<uint> bindless_mapping; | |||||
| bool sync_bindless_mapping; | |||||
Done Inline ActionsWe don't need two arrays, we can have just a single device_vector<uint> bindless_mapping;. And then to add slots: if (flat_slot >= bindless_mapping.size())
bindless_mapping.resize(max(bindless_mapping.size() * 2, 1));
bindless_mapping[flat_slot] = tex;brecht: We don't need two arrays, we can have just a single `device_vector<uint> bindless_mapping;`. | |||||
Done Inline ActionsUse need_ prefix to distinguish flag from method? sergey: Use `need_` prefix to distinguish flag from method? | |||||
| CUdeviceptr cuda_device_ptr(device_ptr mem) | CUdeviceptr cuda_device_ptr(device_ptr mem) | ||||
| { | { | ||||
| return (CUdeviceptr)mem; | return (CUdeviceptr)mem; | ||||
| } | } | ||||
| static bool have_precompiled_kernels() | static bool have_precompiled_kernels() | ||||
| { | { | ||||
| string cubins_path = path_get("lib"); | string cubins_path = path_get("lib"); | ||||
| ▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines | void cuda_pop_context() | ||||
| cuda_assert(cuCtxSetCurrent(NULL)); | cuda_assert(cuCtxSetCurrent(NULL)); | ||||
| } | } | ||||
| CUDADevice(DeviceInfo& info, Stats &stats, bool background_) | CUDADevice(DeviceInfo& info, Stats &stats, bool background_) | ||||
| : Device(info, stats, background_) | : Device(info, stats, background_) | ||||
| { | { | ||||
| first_error = true; | first_error = true; | ||||
| background = background_; | background = background_; | ||||
| use_texture_storage = true; | |||||
| cuDevId = info.num; | cuDevId = info.num; | ||||
| cuDevice = 0; | cuDevice = 0; | ||||
| cuContext = 0; | cuContext = 0; | ||||
| sync_bindless_mapping = false; | |||||
| /* intialize */ | /* intialize */ | ||||
| if(cuda_error(cuInit(0))) | if(cuda_error(cuInit(0))) | ||||
| return; | return; | ||||
| /* setup device and context */ | /* setup device and context */ | ||||
| if(cuda_error(cuDeviceGet(&cuDevice, cuDevId))) | if(cuda_error(cuDeviceGet(&cuDevice, cuDevId))) | ||||
| return; | return; | ||||
| Show All 13 Lines | : Device(info, stats, background_) | ||||
| if(cuda_error_(result, "cuCtxCreate")) | if(cuda_error_(result, "cuCtxCreate")) | ||||
| return; | return; | ||||
| int major, minor; | int major, minor; | ||||
| cuDeviceComputeCapability(&major, &minor, cuDevId); | cuDeviceComputeCapability(&major, &minor, cuDevId); | ||||
| cuDevArchitecture = major*100 + minor*10; | cuDevArchitecture = major*100 + minor*10; | ||||
| /* In order to use full 6GB of memory on Titan cards, use arrays instead | |||||
| * of textures. On earlier cards this seems slower, but on Titan it is | |||||
| * actually slightly faster in tests. */ | |||||
| use_texture_storage = (cuDevArchitecture < 300); | |||||
| cuda_pop_context(); | cuda_pop_context(); | ||||
| } | } | ||||
| ~CUDADevice() | ~CUDADevice() | ||||
| { | { | ||||
| task_pool.stop(); | task_pool.stop(); | ||||
| if(info.has_bindless_textures) | |||||
| tex_free(bindless_mapping, -1); | |||||
Done Inline ActionsWill it work correctly if the texture was never allocated? Use parenthesis btw. sergey: Will it work correctly if the texture was never allocated?
Use parenthesis btw. | |||||
Not Done Inline ActionsYes, because in that case there is no valid mem.device_pointer and the function will do nothing. dingto: Yes, because in that case there is no valid mem.device_pointer and the function will do nothing. | |||||
| cuda_assert(cuCtxDestroy(cuContext)); | cuda_assert(cuCtxDestroy(cuContext)); | ||||
| } | } | ||||
| bool support_device(const DeviceRequestedFeatures& /*requested_features*/) | bool support_device(const DeviceRequestedFeatures& /*requested_features*/) | ||||
| { | { | ||||
| int major, minor; | int major, minor; | ||||
| cuDeviceComputeCapability(&major, &minor, cuDevId); | cuDeviceComputeCapability(&major, &minor, cuDevId); | ||||
| ▲ Show 20 Lines • Show All 161 Lines • ▼ Show 20 Lines | bool load_kernels(const DeviceRequestedFeatures& requested_features) | ||||
| if(cuda_error_(result, "cuModuleLoad")) | if(cuda_error_(result, "cuModuleLoad")) | ||||
| cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); | cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); | ||||
| cuda_pop_context(); | cuda_pop_context(); | ||||
| return (result == CUDA_SUCCESS); | return (result == CUDA_SUCCESS); | ||||
| } | } | ||||
| void load_bindless_mapping() | |||||
| { | |||||
| if(info.has_bindless_textures && sync_bindless_mapping) { | |||||
| tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT, 0); | |||||
brechtUnsubmitted Not Done Inline ActionsCall tex_free(bindless_mapping) here before tex_alloc(), so we don't leak memory on reallocation. brecht: Call `tex_free(bindless_mapping)` here before `tex_alloc()`, so we don't leak memory on… | |||||
| sync_bindless_mapping = false; | |||||
| } | |||||
| } | |||||
| void mem_alloc(device_memory& mem, MemoryType /*type*/) | void mem_alloc(device_memory& mem, MemoryType /*type*/) | ||||
| { | { | ||||
| cuda_push_context(); | cuda_push_context(); | ||||
| CUdeviceptr device_pointer; | CUdeviceptr device_pointer; | ||||
| size_t size = mem.memory_size(); | size_t size = mem.memory_size(); | ||||
| cuda_assert(cuMemAlloc(&device_pointer, size)); | cuda_assert(cuMemAlloc(&device_pointer, size)); | ||||
| mem.device_pointer = (device_ptr)device_pointer; | mem.device_pointer = (device_ptr)device_pointer; | ||||
| mem.device_size = size; | mem.device_size = size; | ||||
| ▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines | void const_copy_to(const char *name, void *host, size_t size) | ||||
| //assert(bytes == size); | //assert(bytes == size); | ||||
| cuda_assert(cuMemcpyHtoD(mem, host, size)); | cuda_assert(cuMemcpyHtoD(mem, host, size)); | ||||
| cuda_pop_context(); | cuda_pop_context(); | ||||
| } | } | ||||
| void tex_alloc(const char *name, | void tex_alloc(const char *name, | ||||
| device_memory& mem, | device_memory& mem, | ||||
| InterpolationType interpolation, | InterpolationType interpolation, | ||||
| ExtensionType extension) | ExtensionType extension, | ||||
| int flat_slot) | |||||
| { | { | ||||
| VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes."; | VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes."; | ||||
| string bind_name = name; | /* Check if we are on sm_30 or above. | ||||
Done Inline ActionsIt's a bit unclear and misleading. Better to have has_bindless_textures or supports_bindless_textures. Otherwise: a) After some time it'll be unclear why Kepler is so special; you'll constantly have to remember that it was the first card which supported bindless textures. sergey: It's a bit unclear and misleading. Better to have `has_bindless_textures` or… | |||||
| if(mem.data_depth > 1) { | * We use arrays and bindles textures for storage there */ | ||||
| /* Kernel uses different bind names for 2d and 3d float textures, | bool has_bindless_textures = info.has_bindless_textures; | ||||
| * so we have to adjust couple of things here. | |||||
| */ | |||||
| vector<string> tokens; | |||||
| string_split(tokens, name, "_"); | |||||
| bind_name = string_printf("__tex_image_%s_3d_%s", | |||||
| tokens[2].c_str(), | |||||
| tokens[3].c_str()); | |||||
| } | |||||
| /* determine format */ | /* General variables for both architectures */ | ||||
| CUarray_format_enum format; | string bind_name = name; | ||||
| size_t dsize = datatype_size(mem.data_type); | size_t dsize = datatype_size(mem.data_type); | ||||
| size_t size = mem.memory_size(); | size_t size = mem.memory_size(); | ||||
| bool use_texture = (interpolation != INTERPOLATION_NONE) || use_texture_storage; | |||||
| if(use_texture) { | CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; | ||||
| switch(extension) { | |||||
| case EXTENSION_REPEAT: | |||||
| address_mode = CU_TR_ADDRESS_MODE_WRAP; | |||||
| break; | |||||
| case EXTENSION_EXTEND: | |||||
| address_mode = CU_TR_ADDRESS_MODE_CLAMP; | |||||
| break; | |||||
| case EXTENSION_CLIP: | |||||
| address_mode = CU_TR_ADDRESS_MODE_BORDER; | |||||
| break; | |||||
| default: | |||||
| assert(0); | |||||
| break; | |||||
| } | |||||
| CUfilter_mode filter_mode; | |||||
| if(interpolation == INTERPOLATION_CLOSEST) { | |||||
| filter_mode = CU_TR_FILTER_MODE_POINT; | |||||
| } | |||||
| else { | |||||
| filter_mode = CU_TR_FILTER_MODE_LINEAR; | |||||
| } | |||||
| CUarray_format_enum format; | |||||
| switch(mem.data_type) { | switch(mem.data_type) { | ||||
| case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; | case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; | ||||
| case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; | case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; | ||||
| case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; | case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; | ||||
| case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; | case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; | ||||
| default: assert(0); return; | default: assert(0); return; | ||||
| } | } | ||||
| /* General variables for Fermi */ | |||||
| CUtexref texref = NULL; | CUtexref texref = NULL; | ||||
| if(!has_bindless_textures) { | |||||
| if(mem.data_depth > 1) { | |||||
| /* Kernel uses different bind names for 2d and 3d float textures, | |||||
| * so we have to adjust couple of things here. | |||||
| */ | |||||
| vector<string> tokens; | |||||
| string_split(tokens, name, "_"); | |||||
| bind_name = string_printf("__tex_image_%s_3d_%s", | |||||
Done Inline ActionsIndentation here seems to be weird, don't use tabs to indent past the parent line. sergey: Indentation here seems to be weird, don't use tabs to indent past the parent line. | |||||
| tokens[2].c_str(), | |||||
| tokens[3].c_str()); | |||||
| } | |||||
| cuda_push_context(); | cuda_push_context(); | ||||
Done Inline ActionsPush/pops don't seem to be matched anymore, or at least the logic is hard to follow. Better to do it more often than have confusing logic. brecht: Push/pops don't seem to be matched anymore, or at least the logic is hard to follow. Better to… | |||||
| cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str())); | cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str())); | ||||
| if(!texref) { | if(!texref) { | ||||
| cuda_pop_context(); | cuda_pop_context(); | ||||
| return; | return; | ||||
| } | } | ||||
| if(interpolation != INTERPOLATION_NONE) { | cuda_pop_context(); | ||||
brechtUnsubmitted Not Done Inline ActionsJust have a single cuda_pop_context() above the if(). brecht: Just have a single `cuda_pop_context()` above the `if()`. | |||||
| } | |||||
| /* Data Storage */ | |||||
| if(interpolation == INTERPOLATION_NONE) { | |||||
| if(has_bindless_textures) { | |||||
| cuda_pop_context(); | |||||
brechtUnsubmitted Not Done Inline ActionsRemove this, it doesn't correspond to any push. brecht: Remove this, it doesn't correspond to any push. | |||||
dingtoAuthorUnsubmitted Not Done Inline ActionsDone, and removed in the else branch below too. dingto: Done, and removed in the else branch below too. | |||||
| mem_alloc(mem, MEM_READ_ONLY); | |||||
| mem_copy_to(mem); | |||||
| cuda_push_context(); | |||||
| CUdeviceptr cumem; | |||||
| size_t cubytes; | |||||
| cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); | |||||
| if(cubytes == 8) { | |||||
| /* 64 bit device pointer */ | |||||
| uint64_t ptr = mem.device_pointer; | |||||
| cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); | |||||
| } | |||||
| else { | |||||
| /* 32 bit device pointer */ | |||||
| uint32_t ptr = (uint32_t)mem.device_pointer; | |||||
| cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); | |||||
| } | |||||
| cuda_pop_context(); | |||||
| } | |||||
| else { | |||||
| cuda_pop_context(); | |||||
| mem_alloc(mem, MEM_READ_ONLY); | |||||
| mem_copy_to(mem); | |||||
| cuda_push_context(); | |||||
| cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size)); | |||||
| cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)); | |||||
| cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER)); | |||||
| cuda_pop_context(); | |||||
| } | |||||
| } | |||||
| /* Texture Storage */ | |||||
| else { | |||||
| CUarray handle = NULL; | CUarray handle = NULL; | ||||
| cuda_push_context(); | |||||
| if(mem.data_depth > 1) { | if(mem.data_depth > 1) { | ||||
| CUDA_ARRAY3D_DESCRIPTOR desc; | CUDA_ARRAY3D_DESCRIPTOR desc; | ||||
| desc.Width = mem.data_width; | desc.Width = mem.data_width; | ||||
| desc.Height = mem.data_height; | desc.Height = mem.data_height; | ||||
| desc.Depth = mem.data_depth; | desc.Depth = mem.data_depth; | ||||
| desc.Format = format; | desc.Format = format; | ||||
| desc.NumChannels = mem.data_elements; | desc.NumChannels = mem.data_elements; | ||||
| desc.Flags = 0; | desc.Flags = 0; | ||||
| cuda_assert(cuArray3DCreate(&handle, &desc)); | cuda_assert(cuArray3DCreate(&handle, &desc)); | ||||
| } | } | ||||
| else { | else { | ||||
| CUDA_ARRAY_DESCRIPTOR desc; | CUDA_ARRAY_DESCRIPTOR desc; | ||||
| desc.Width = mem.data_width; | desc.Width = mem.data_width; | ||||
| desc.Height = mem.data_height; | desc.Height = mem.data_height; | ||||
| desc.Format = format; | desc.Format = format; | ||||
| desc.NumChannels = mem.data_elements; | desc.NumChannels = mem.data_elements; | ||||
Not Done Inline ActionsThis has to be double-checked actually. sergey: This has to be double-checked actually. | |||||
Not Done Inline ActionsNot sure what's wrong here? dingto: Not sure what's wrong here? | |||||
| cuda_assert(cuArrayCreate(&handle, &desc)); | cuda_assert(cuArrayCreate(&handle, &desc)); | ||||
| } | } | ||||
| if(!handle) { | if(!handle) { | ||||
| cuda_pop_context(); | cuda_pop_context(); | ||||
| return; | return; | ||||
| } | } | ||||
| /* Allocate 3D, 2D or 1D memory */ | |||||
| if(mem.data_depth > 1) { | if(mem.data_depth > 1) { | ||||
| CUDA_MEMCPY3D param; | CUDA_MEMCPY3D param; | ||||
| memset(¶m, 0, sizeof(param)); | memset(¶m, 0, sizeof(param)); | ||||
| param.dstMemoryType = CU_MEMORYTYPE_ARRAY; | param.dstMemoryType = CU_MEMORYTYPE_ARRAY; | ||||
| param.dstArray = handle; | param.dstArray = handle; | ||||
| param.srcMemoryType = CU_MEMORYTYPE_HOST; | param.srcMemoryType = CU_MEMORYTYPE_HOST; | ||||
| param.srcHost = (void*)mem.data_pointer; | param.srcHost = (void*)mem.data_pointer; | ||||
| param.srcPitch = mem.data_width*dsize*mem.data_elements; | param.srcPitch = mem.data_width*dsize*mem.data_elements; | ||||
| param.WidthInBytes = param.srcPitch; | param.WidthInBytes = param.srcPitch; | ||||
| param.Height = mem.data_height; | param.Height = mem.data_height; | ||||
Done Inline ActionsHere we should add a check that tex fits into uint and print a warning or abort rendering otherwise. CUtexObject is unsigned long long after all, which is not necessarily going to fit into a smaller type. sergey: Here we should add a check that `tex` fits into `uint` and print a warning or abort rendering… | |||||
| param.Depth = mem.data_depth; | param.Depth = mem.data_depth; | ||||
| cuda_assert(cuMemcpy3D(¶m)); | cuda_assert(cuMemcpy3D(¶m)); | ||||
| } | } | ||||
| else if(mem.data_height > 1) { | else if(mem.data_height > 1) { | ||||
Not Done Inline ActionsTried using CUDA_RESOURCE_DESC resDesc = {0};? sergey: Tried using `CUDA_RESOURCE_DESC resDesc = {0};`? | |||||
Not Done Inline ActionsThis is giving me a compile error. error: invalid conversion from ‘int’ to ‘CUresourcetype {aka CUresourcetype_enum}’ [-fpermissive]dingto: This is giving me a compile error.
error: invalid conversion from ‘int’ to ‘CUresourcetype… | |||||
Done Inline ActionsScrew it then for now. sergey: Screw it then for now. | |||||
| CUDA_MEMCPY2D param; | CUDA_MEMCPY2D param; | ||||
| memset(¶m, 0, sizeof(param)); | memset(¶m, 0, sizeof(param)); | ||||
| param.dstMemoryType = CU_MEMORYTYPE_ARRAY; | param.dstMemoryType = CU_MEMORYTYPE_ARRAY; | ||||
| param.dstArray = handle; | param.dstArray = handle; | ||||
| param.srcMemoryType = CU_MEMORYTYPE_HOST; | param.srcMemoryType = CU_MEMORYTYPE_HOST; | ||||
| param.srcHost = (void*)mem.data_pointer; | param.srcHost = (void*)mem.data_pointer; | ||||
| param.srcPitch = mem.data_width*dsize*mem.data_elements; | param.srcPitch = mem.data_width*dsize*mem.data_elements; | ||||
| param.WidthInBytes = param.srcPitch; | param.WidthInBytes = param.srcPitch; | ||||
| param.Height = mem.data_height; | param.Height = mem.data_height; | ||||
| cuda_assert(cuMemcpy2D(¶m)); | cuda_assert(cuMemcpy2D(¶m)); | ||||
| } | } | ||||
| else | else | ||||
| cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size)); | cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size)); | ||||
| cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT)); | /* Bindless Textures - Kepler */ | ||||
| if(has_bindless_textures) { | |||||
| CUDA_RESOURCE_DESC resDesc; | |||||
| memset(&resDesc, 0, sizeof(resDesc)); | |||||
| resDesc.resType = CU_RESOURCE_TYPE_ARRAY; | |||||
| resDesc.res.array.hArray = handle; | |||||
| resDesc.flags = 0; | |||||
| CUDA_TEXTURE_DESC texDesc; | |||||
| memset(&texDesc, 0, sizeof(texDesc)); | |||||
| texDesc.addressMode[0] = address_mode; | |||||
| texDesc.addressMode[1] = address_mode; | |||||
| texDesc.addressMode[2] = address_mode; | |||||
| texDesc.filterMode = filter_mode; | |||||
| texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; | |||||
| CUtexObject tex = 0; | |||||
| cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL)); | |||||
| if(flat_slot >= bindless_mapping.size()) | |||||
| bindless_mapping.resize(4096); /*TODO(dingto): Make this a variable */ | |||||
| bindless_mapping.get_data()[flat_slot] = (uint)tex; | |||||
| if(interpolation == INTERPOLATION_CLOSEST) { | sync_bindless_mapping = true; | ||||
| cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)); | |||||
| } | |||||
| else if(interpolation == INTERPOLATION_LINEAR) { | |||||
| cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR)); | |||||
| } | |||||
| else {/* CUBIC and SMART are unsupported for CUDA */ | |||||
| cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR)); | |||||
| } | } | ||||
| /* Regular Textures - Fermi */ | |||||
| else { | |||||
| cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT)); | |||||
| cuda_assert(cuTexRefSetFilterMode(texref, filter_mode)); | |||||
| cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)); | cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)); | ||||
| } | |||||
| cuda_pop_context(); | |||||
| /* Fermi and Kepler */ | |||||
| mem.device_pointer = (device_ptr)handle; | mem.device_pointer = (device_ptr)handle; | ||||
| mem.device_size = size; | mem.device_size = size; | ||||
| stats.mem_alloc(size); | stats.mem_alloc(size); | ||||
| } | } | ||||
| else { | |||||
| cuda_pop_context(); | |||||
| mem_alloc(mem, MEM_READ_ONLY); | |||||
| mem_copy_to(mem); | |||||
| /* Fermi, Data and Image Textures */ | |||||
| if(!has_bindless_textures) { | |||||
| cuda_push_context(); | cuda_push_context(); | ||||
| cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size)); | |||||
| cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)); | |||||
| cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER)); | |||||
| } | |||||
| CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; | |||||
| switch(extension) { | |||||
| case EXTENSION_REPEAT: | |||||
| address_mode = CU_TR_ADDRESS_MODE_WRAP; | |||||
| break; | |||||
| case EXTENSION_EXTEND: | |||||
| address_mode = CU_TR_ADDRESS_MODE_CLAMP; | |||||
| break; | |||||
| case EXTENSION_CLIP: | |||||
| address_mode = CU_TR_ADDRESS_MODE_BORDER; | |||||
| break; | |||||
| default: | |||||
| assert(0); | |||||
| break; | |||||
| } | |||||
| cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode)); | cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode)); | ||||
| cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode)); | cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode)); | ||||
| if(mem.data_depth > 1) { | if(mem.data_depth > 1) { | ||||
| cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode)); | cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode)); | ||||
| } | } | ||||
| cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements)); | cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements)); | ||||
| cuda_pop_context(); | cuda_pop_context(); | ||||
| } | } | ||||
| else { | |||||
| mem_alloc(mem, MEM_READ_ONLY); | |||||
| mem_copy_to(mem); | |||||
| cuda_push_context(); | |||||
| CUdeviceptr cumem; | |||||
| size_t cubytes; | |||||
| cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); | |||||
| if(cubytes == 8) { | |||||
| /* 64 bit device pointer */ | |||||
| uint64_t ptr = mem.device_pointer; | |||||
| cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); | |||||
| } | |||||
| else { | |||||
| /* 32 bit device pointer */ | |||||
| uint32_t ptr = (uint32_t)mem.device_pointer; | |||||
| cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); | |||||
| } | |||||
| cuda_pop_context(); | |||||
| } | |||||
| /* Fermi and Kepler */ | |||||
| tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE); | tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE); | ||||
| } | } | ||||
| void tex_free(device_memory& mem) | void tex_free(device_memory& mem, int flat_slot) | ||||
| { | { | ||||
| if(mem.device_pointer) { | if(mem.device_pointer) { | ||||
| if(tex_interp_map[mem.device_pointer]) { | if(tex_interp_map[mem.device_pointer]) { | ||||
| cuda_push_context(); | cuda_push_context(); | ||||
| cuArrayDestroy((CUarray)mem.device_pointer); | cuArrayDestroy((CUarray)mem.device_pointer); | ||||
| cuda_pop_context(); | cuda_pop_context(); | ||||
| tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); | tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); | ||||
| mem.device_pointer = 0; | mem.device_pointer = 0; | ||||
| stats.mem_free(mem.device_size); | stats.mem_free(mem.device_size); | ||||
| mem.device_size = 0; | mem.device_size = 0; | ||||
| } | } | ||||
| else { | else { | ||||
| tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); | tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); | ||||
| mem_free(mem); | mem_free(mem); | ||||
| } | } | ||||
| } | } | ||||
| /* Free CUtexObject (Bindless Textures) */ | |||||
| if(info.has_bindless_textures && flat_slot != -1) { | |||||
| cuTexObjectDestroy(bindless_mapping.get_data()[flat_slot]); | |||||
| } | |||||
| } | } | ||||
| void path_trace(RenderTile& rtile, int sample, bool branched) | void path_trace(RenderTile& rtile, int sample, bool branched) | ||||
| { | { | ||||
| if(have_error()) | if(have_error()) | ||||
| return; | return; | ||||
| /* Upload bindless_mapping vector */ | |||||
| load_bindless_mapping(); | |||||
Done Inline ActionsThis architecture check is unnecessary as we should never have sync_bindless_mapping = true with other architectures. And the copying code can be simplified if we have a single array. Also, this code should be moved to a separate function and called also from shader(), since baking can access textures too. brecht: This architecture check is unnecessary as we should never have `sync_bindless_mapping = true`… | |||||
Done Inline ActionsShall we do a call from thread_run() instead? path_trace() is run for each sample, and trying to load bindless mapping for each of them is a bit redundant. sergey: Shall we do a call from `thread_run()` instead? `path_trace()` is run for each sample, and… | |||||
| cuda_push_context(); | cuda_push_context(); | ||||
| CUfunction cuPathTrace; | CUfunction cuPathTrace; | ||||
| CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer); | CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer); | ||||
| CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state); | CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state); | ||||
| /* get kernel function */ | /* get kernel function */ | ||||
| if(branched) { | if(branched) { | ||||
| ▲ Show 20 Lines • Show All 98 Lines • ▼ Show 20 Lines | void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) | ||||
| cuda_pop_context(); | cuda_pop_context(); | ||||
| } | } | ||||
| void shader(DeviceTask& task) | void shader(DeviceTask& task) | ||||
| { | { | ||||
| if(have_error()) | if(have_error()) | ||||
| return; | return; | ||||
| /* Upload bindless_mapping vector */ | |||||
| load_bindless_mapping(); | |||||
| cuda_push_context(); | cuda_push_context(); | ||||
| CUfunction cuShader; | CUfunction cuShader; | ||||
| CUdeviceptr d_input = cuda_device_ptr(task.shader_input); | CUdeviceptr d_input = cuda_device_ptr(task.shader_input); | ||||
| CUdeviceptr d_output = cuda_device_ptr(task.shader_output); | CUdeviceptr d_output = cuda_device_ptr(task.shader_output); | ||||
| CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma); | CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma); | ||||
| /* get kernel function */ | /* get kernel function */ | ||||
| ▲ Show 20 Lines • Show All 451 Lines • ▼ Show 20 Lines | for(int num = 0; num < count; num++) { | ||||
| DeviceInfo info; | DeviceInfo info; | ||||
| info.type = DEVICE_CUDA; | info.type = DEVICE_CUDA; | ||||
| info.description = string(name); | info.description = string(name); | ||||
| info.id = string_printf("CUDA_%d", num); | info.id = string_printf("CUDA_%d", num); | ||||
| info.num = num; | info.num = num; | ||||
| info.advanced_shading = (major >= 2); | info.advanced_shading = (major >= 2); | ||||
| info.extended_images = (major >= 3); | info.has_bindless_textures = (major >= 3); | ||||
| info.pack_images = false; | info.pack_images = false; | ||||
| /* if device has a kernel timeout, assume it is used for display */ | /* if device has a kernel timeout, assume it is used for display */ | ||||
| if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) { | if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) { | ||||
| info.display_device = true; | info.display_device = true; | ||||
| display_devices.push_back(info); | display_devices.push_back(info); | ||||
| } | } | ||||
| else | else | ||||
| ▲ Show 20 Lines • Show All 139 Lines • Show Last 20 Lines | |||||
Eventually we can merge these two maps into one like this:
struct MemInfo { bool is_array_data; CUtexObject bindless_texture; }; map<device_ptr, MemInfo> tex_meminfo_map; This can be done later. For now I would only strongly suggest using CUtexObject instead of uint for non-kernel code.