Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/util/util_half.h
| Show All 22 Lines | |||||
| #if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__) | #if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__) | ||||
| # include "util/util_simd.h" | # include "util/util_simd.h" | ||||
| #endif | #endif | ||||
| CCL_NAMESPACE_BEGIN | CCL_NAMESPACE_BEGIN | ||||
| /* Half Floats */ | /* Half Floats */ | ||||
| #ifdef __KERNEL_OPENCL__ | |||||
| # define float4_store_half(h, f, scale) vstore_half4(f *(scale), 0, h); | |||||
| #else | |||||
| /* CUDA has its own half data type, so there is no need to define one here. */ | /* CUDA has its own half data type, so there is no need to define one here. */ | ||||
| # ifndef __KERNEL_CUDA__ | #ifndef __KERNEL_CUDA__ | ||||
/* Storage type for the raw 16-bit pattern of a half float.
 *
 * Implemented as a class rather than a typedef so that the compiler can tell it apart from
 * unsigned shorts. The class only stores and hands back the bit pattern; it performs no
 * float conversion itself. */
class half {
 public:
  /* Start out with an all-zero bit pattern. */
  half() : v(0)
  {
  }

  /* Wrap a raw 16-bit pattern. Deliberately not `explicit`, so integer results of bit
   * manipulation can be assigned or passed directly. */
  half(const unsigned short &i) : v(i)
  {
  }

  /* Read back the raw bit pattern. Const-qualified so that it also works on const objects
   * and const references. */
  operator unsigned short() const
  {
    return v;
  }

  /* Replace the stored bit pattern. */
  half &operator=(const unsigned short &i)
  {
    v = i;
    return *this;
  }

 private:
  /* Raw 16-bit storage. */
  unsigned short v;
};
| # endif | #endif | ||||
| struct half4 { | struct half4 { | ||||
| half x, y, z, w; | half x, y, z, w; | ||||
| }; | }; | ||||
| # ifdef __KERNEL_CUDA__ | #ifdef __KERNEL_CUDA__ | ||||
| ccl_device_inline void float4_store_half(half *h, float4 f, float scale) | ccl_device_inline void float4_store_half(half *h, float4 f) | ||||
| { | { | ||||
| h[0] = __float2half(f.x * scale); | h[0] = __float2half(f.x); | ||||
| h[1] = __float2half(f.y * scale); | h[1] = __float2half(f.y); | ||||
| h[2] = __float2half(f.z * scale); | h[2] = __float2half(f.z); | ||||
| h[3] = __float2half(f.w * scale); | h[3] = __float2half(f.w); | ||||
| } | } | ||||
| # else | #else | ||||
| ccl_device_inline void float4_store_half(half *h, float4 f, float scale) | ccl_device_inline void float4_store_half(half *h, float4 f) | ||||
| { | { | ||||
| # ifndef __KERNEL_SSE2__ | # ifndef __KERNEL_SSE2__ | ||||
| for (int i = 0; i < 4; i++) { | for (int i = 0; i < 4; i++) { | ||||
| /* optimized float to half for pixels: | /* optimized float to half for pixels: | ||||
| * assumes no negative, no nan, no inf, and sets denormal to 0 */ | * assumes no negative, no nan, no inf, and sets denormal to 0 */ | ||||
| union { | union { | ||||
| uint i; | uint i; | ||||
| float f; | float f; | ||||
| } in; | } in; | ||||
| float fscale = f[i] * scale; | in.f = (f[i] > 0.0f) ? ((f[i] < 65504.0f) ? f[i] : 65504.0f) : 0.0f; | ||||
| in.f = (fscale > 0.0f) ? ((fscale < 65504.0f) ? fscale : 65504.0f) : 0.0f; | |||||
| int x = in.i; | int x = in.i; | ||||
| int absolute = x & 0x7FFFFFFF; | int absolute = x & 0x7FFFFFFF; | ||||
| int Z = absolute + 0xC8000000; | int Z = absolute + 0xC8000000; | ||||
| int result = (absolute < 0x38800000) ? 0 : Z; | int result = (absolute < 0x38800000) ? 0 : Z; | ||||
| int rshift = (result >> 13); | int rshift = (result >> 13); | ||||
| h[i] = (rshift & 0x7FFF); | h[i] = (rshift & 0x7FFF); | ||||
| } | } | ||||
| # else | # else | ||||
| /* same as above with SSE */ | /* same as above with SSE */ | ||||
| ssef fscale = load4f(f) * scale; | ssef x = min(max(load4f(f), 0.0f), 65504.0f); | ||||
| ssef x = min(max(fscale, 0.0f), 65504.0f); | |||||
| # ifdef __KERNEL_AVX2__ | # ifdef __KERNEL_AVX2__ | ||||
| ssei rpack = _mm_cvtps_ph(x, 0); | ssei rpack = _mm_cvtps_ph(x, 0); | ||||
| # else | # else | ||||
| ssei absolute = cast(x) & 0x7FFFFFFF; | ssei absolute = cast(x) & 0x7FFFFFFF; | ||||
| ssei Z = absolute + 0xC8000000; | ssei Z = absolute + 0xC8000000; | ||||
| ssei result = andnot(absolute < 0x38800000, Z); | ssei result = andnot(absolute < 0x38800000, Z); | ||||
| ssei rshift = (result >> 13) & 0x7FFF; | ssei rshift = (result >> 13) & 0x7FFF; | ||||
| ssei rpack = _mm_packs_epi32(rshift, rshift); | ssei rpack = _mm_packs_epi32(rshift, rshift); | ||||
| # endif | # endif | ||||
| _mm_storel_pi((__m64 *)h, _mm_castsi128_ps(rpack)); | _mm_storel_pi((__m64 *)h, _mm_castsi128_ps(rpack)); | ||||
| # endif | # endif | ||||
| } | } | ||||
| ccl_device_inline float half_to_float(half h) | ccl_device_inline float half_to_float(half h) | ||||
| { | { | ||||
| float f; | float f; | ||||
| *((int *)&f) = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13); | *((int *)&f) = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13); | ||||
| Show All 29 Lines | ccl_device_inline half float_to_half(float f) | ||||
| /* Clamp-to-max. */ | /* Clamp-to-max. */ | ||||
| value_bits = (exponent_bits > 0x47000000) ? 0x7bff : value_bits; | value_bits = (exponent_bits > 0x47000000) ? 0x7bff : value_bits; | ||||
| /* Denormals-as-zero. */ | /* Denormals-as-zero. */ | ||||
| value_bits = (exponent_bits == 0 ? 0 : value_bits); | value_bits = (exponent_bits == 0 ? 0 : value_bits); | ||||
| /* Re-insert sign bit and return. */ | /* Re-insert sign bit and return. */ | ||||
| return (value_bits | sign_bit); | return (value_bits | sign_bit); | ||||
| } | } | ||||
| # endif | #endif | ||||
| #endif | |||||
| CCL_NAMESPACE_END | CCL_NAMESPACE_END | ||||
| #endif /* __UTIL_HALF_H__ */ | #endif /* __UTIL_HALF_H__ */ | ||||