Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/kernel/device/cuda/compat.h
- This file was moved from intern/cycles/kernel/kernel_compat_cuda.h.
| /* | /* | ||||
| * Copyright 2011-2013 Blender Foundation | * Copyright 2011-2013 Blender Foundation | ||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| * You may obtain a copy of the License at | * You may obtain a copy of the License at | ||||
| * | * | ||||
| * http://www.apache.org/licenses/LICENSE-2.0 | * http://www.apache.org/licenses/LICENSE-2.0 | ||||
| * | * | ||||
| * Unless required by applicable law or agreed to in writing, software | * Unless required by applicable law or agreed to in writing, software | ||||
| * distributed under the License is distributed on an "AS IS" BASIS, | * distributed under the License is distributed on an "AS IS" BASIS, | ||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| * See the License for the specific language governing permissions and | * See the License for the specific language governing permissions and | ||||
| * limitations under the License. | * limitations under the License. | ||||
| */ | */ | ||||
| #ifndef __KERNEL_COMPAT_CUDA_H__ | #pragma once | ||||
| #define __KERNEL_COMPAT_CUDA_H__ | |||||
| #define __KERNEL_GPU__ | #define __KERNEL_GPU__ | ||||
| #define __KERNEL_CUDA__ | #define __KERNEL_CUDA__ | ||||
| #define CCL_NAMESPACE_BEGIN | #define CCL_NAMESPACE_BEGIN | ||||
| #define CCL_NAMESPACE_END | #define CCL_NAMESPACE_END | ||||
| /* Selective nodes compilation. */ | #ifndef ATTR_FALLTHROUGH | ||||
| #ifndef __NODES_MAX_GROUP__ | # define ATTR_FALLTHROUGH | ||||
| # define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX | |||||
| #endif | |||||
| #ifndef __NODES_FEATURES__ | |||||
| # define __NODES_FEATURES__ NODE_FEATURE_ALL | |||||
| #endif | #endif | ||||
| /* Manual definitions so we can compile without CUDA toolkit. */ | /* Manual definitions so we can compile without CUDA toolkit. */ | ||||
| #ifdef __CUDACC_RTC__ | #ifdef __CUDACC_RTC__ | ||||
| typedef unsigned int uint32_t; | typedef unsigned int uint32_t; | ||||
| typedef unsigned long long uint64_t; | typedef unsigned long long uint64_t; | ||||
| #else | #else | ||||
| # include <stdint.h> | # include <stdint.h> | ||||
| #endif | #endif | ||||
| typedef unsigned short half; | |||||
| typedef unsigned long long CUtexObject; | |||||
| #ifdef CYCLES_CUBIN_CC | #ifdef CYCLES_CUBIN_CC | ||||
| # define FLT_MIN 1.175494350822287507969e-38f | # define FLT_MIN 1.175494350822287507969e-38f | ||||
| # define FLT_MAX 340282346638528859811704183484516925440.0f | # define FLT_MAX 340282346638528859811704183484516925440.0f | ||||
| # define FLT_EPSILON 1.192092896e-07F | # define FLT_EPSILON 1.192092896e-07F | ||||
| #endif | #endif | ||||
| __device__ half __float2half(const float f) | /* Qualifiers */ | ||||
| { | |||||
| half val; | |||||
| asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); | |||||
| return val; | |||||
| } | |||||
| /* Qualifier wrappers for different names on different devices */ | |||||
| #define ccl_device __device__ __inline__ | #define ccl_device __device__ __inline__ | ||||
| #if __CUDA_ARCH__ < 500 | #if __CUDA_ARCH__ < 500 | ||||
| # define ccl_device_inline __device__ __forceinline__ | # define ccl_device_inline __device__ __forceinline__ | ||||
| # define ccl_device_forceinline __device__ __forceinline__ | # define ccl_device_forceinline __device__ __forceinline__ | ||||
| #else | #else | ||||
| # define ccl_device_inline __device__ __inline__ | # define ccl_device_inline __device__ __inline__ | ||||
| # define ccl_device_forceinline __device__ __forceinline__ | # define ccl_device_forceinline __device__ __forceinline__ | ||||
| #endif | #endif | ||||
| #define ccl_device_noinline __device__ __noinline__ | #define ccl_device_noinline __device__ __noinline__ | ||||
| #define ccl_device_noinline_cpu ccl_device | #define ccl_device_noinline_cpu ccl_device | ||||
| #define ccl_global | #define ccl_global | ||||
| #define ccl_static_constant __constant__ | #define ccl_static_constant __constant__ | ||||
| #define ccl_device_constant __constant__ __device__ | |||||
| #define ccl_constant const | #define ccl_constant const | ||||
| #define ccl_local __shared__ | #define ccl_gpu_shared __shared__ | ||||
| #define ccl_local_param | |||||
| #define ccl_private | #define ccl_private | ||||
| #define ccl_may_alias | #define ccl_may_alias | ||||
| #define ccl_addr_space | #define ccl_addr_space | ||||
| #define ccl_restrict __restrict__ | #define ccl_restrict __restrict__ | ||||
| #define ccl_loop_no_unroll | #define ccl_loop_no_unroll | ||||
| /* TODO(sergey): In theory we might use references with CUDA, however | |||||
| * performance impact yet to be investigated. | |||||
| */ | |||||
| #define ccl_ref | |||||
| #define ccl_align(n) __align__(n) | #define ccl_align(n) __align__(n) | ||||
| #define ccl_optional_struct_init | #define ccl_optional_struct_init | ||||
| #define ATTR_FALLTHROUGH | |||||
| #define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH) | |||||
| /* No assert supported for CUDA */ | /* No assert supported for CUDA */ | ||||
| #define kernel_assert(cond) | #define kernel_assert(cond) | ||||
| /* Types */ | /* GPU thread, block, grid size and index */ | ||||
| #include "util/util_half.h" | #define ccl_gpu_thread_idx_x (threadIdx.x) | ||||
| #include "util/util_types.h" | #define ccl_gpu_block_dim_x (blockDim.x) | ||||
| #define ccl_gpu_block_idx_x (blockIdx.x) | |||||
| #define ccl_gpu_grid_dim_x (gridDim.x) | |||||
| #define ccl_gpu_warp_size (warpSize) | |||||
| /* Work item functions */ | #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) | ||||
| #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) | |||||
| ccl_device_inline uint ccl_local_id(uint d) | /* GPU warp synchronizaton */ | ||||
| { | |||||
| switch (d) { | |||||
| case 0: | |||||
| return threadIdx.x; | |||||
| case 1: | |||||
| return threadIdx.y; | |||||
| case 2: | |||||
| return threadIdx.z; | |||||
| default: | |||||
| return 0; | |||||
| } | |||||
| } | |||||
| #define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d)) | #define ccl_gpu_syncthreads() __syncthreads() | ||||
| #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) | |||||
| #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla) | |||||
| #define ccl_gpu_popc(x) __popc(x) | |||||
| ccl_device_inline uint ccl_local_size(uint d) | /* GPU texture objects */ | ||||
| { | |||||
| switch (d) { | |||||
| case 0: | |||||
| return blockDim.x; | |||||
| case 1: | |||||
| return blockDim.y; | |||||
| case 2: | |||||
| return blockDim.z; | |||||
| default: | |||||
| return 0; | |||||
| } | |||||
| } | |||||
| #define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d)) | typedef unsigned long long CUtexObject; | ||||
| typedef CUtexObject ccl_gpu_tex_object; | |||||
| ccl_device_inline uint ccl_group_id(uint d) | template<typename T> | ||||
| ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj, | |||||
| const float x, | |||||
| const float y) | |||||
| { | { | ||||
| switch (d) { | return tex2D<T>(texobj, x, y); | ||||
| case 0: | |||||
| return blockIdx.x; | |||||
| case 1: | |||||
| return blockIdx.y; | |||||
| case 2: | |||||
| return blockIdx.z; | |||||
| default: | |||||
| return 0; | |||||
| } | |||||
| } | } | ||||
| ccl_device_inline uint ccl_num_groups(uint d) | template<typename T> | ||||
| ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj, | |||||
| const float x, | |||||
| const float y, | |||||
| const float z) | |||||
| { | { | ||||
| switch (d) { | return tex3D<T>(texobj, x, y, z); | ||||
| case 0: | |||||
| return gridDim.x; | |||||
| case 1: | |||||
| return gridDim.y; | |||||
| case 2: | |||||
| return gridDim.z; | |||||
| default: | |||||
| return 0; | |||||
| } | |||||
| } | } | ||||
| /* Textures */ | |||||
| /* Use arrays for regular data. */ | |||||
| #define kernel_tex_fetch(t, index) t[(index)] | |||||
| #define kernel_tex_array(t) (t) | |||||
| #define kernel_data __data | |||||
| /* Use fast math functions */ | /* Use fast math functions */ | ||||
| #define cosf(x) __cosf(((float)(x))) | #define cosf(x) __cosf(((float)(x))) | ||||
| #define sinf(x) __sinf(((float)(x))) | #define sinf(x) __sinf(((float)(x))) | ||||
| #define powf(x, y) __powf(((float)(x)), ((float)(y))) | #define powf(x, y) __powf(((float)(x)), ((float)(y))) | ||||
| #define tanf(x) __tanf(((float)(x))) | #define tanf(x) __tanf(((float)(x))) | ||||
| #define logf(x) __logf(((float)(x))) | #define logf(x) __logf(((float)(x))) | ||||
| #define expf(x) __expf(((float)(x))) | #define expf(x) __expf(((float)(x))) | ||||
| #endif /* __KERNEL_COMPAT_CUDA_H__ */ | /* Half */ | ||||
| typedef unsigned short half; | |||||
| __device__ half __float2half(const float f) | |||||
| { | |||||
| half val; | |||||
| asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); | |||||
| return val; | |||||
| } | |||||
| /* Types */ | |||||
| #include "util/util_half.h" | |||||
| #include "util/util_types.h" | |||||