Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/kernel/filter/filter_features_sse.h
- This file was added.
| /* | |||||
| * Copyright 2011-2017 Blender Foundation | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| CCL_NAMESPACE_BEGIN | |||||
| #define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride) | |||||
| /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. | |||||
| * pixel_buffer always points to the first of the 4 current pixel in the first pass. | |||||
| * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */ | |||||
| #define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ | |||||
| for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ | |||||
| __m128 y4 = _mm_set1_ps(pixel.y); \ | |||||
| for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ | |||||
| __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \ | |||||
| __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x)); | |||||
| #define END_FOR_PIXEL_WINDOW_SSE } \ | |||||
| pixel_buffer += buffer_w - (pixel.x - low.x); \ | |||||
| } | |||||
| ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_readonly_ptr buffer, __m128 *features, __m128 ccl_readonly_ptr mean, int pass_stride) | |||||
| { | |||||
| features[0] = x; | |||||
| features[1] = y; | |||||
| features[2] = ccl_get_feature_sse(0); | |||||
| features[3] = ccl_get_feature_sse(1); | |||||
| features[4] = ccl_get_feature_sse(2); | |||||
| features[5] = ccl_get_feature_sse(3); | |||||
| features[6] = ccl_get_feature_sse(4); | |||||
| features[7] = ccl_get_feature_sse(5); | |||||
| features[8] = ccl_get_feature_sse(6); | |||||
| features[9] = ccl_get_feature_sse(7); | |||||
| if(mean) { | |||||
| for(int i = 0; i < DENOISE_FEATURES; i++) | |||||
| features[i] = _mm_sub_ps(features[i], mean[i]); | |||||
| } | |||||
| for(int i = 0; i < DENOISE_FEATURES; i++) | |||||
| features[i] = _mm_mask_ps(features[i], active_pixels); | |||||
| } | |||||
| ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_readonly_ptr buffer, __m128 *scales, __m128 ccl_readonly_ptr mean, int pass_stride) | |||||
| { | |||||
| scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); | |||||
| scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); | |||||
| scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(0), mean[2])), active_pixels); | |||||
| __m128 diff, scale; | |||||
| diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]); | |||||
| scale = _mm_mul_ps(diff, diff); | |||||
| diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]); | |||||
| scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); | |||||
| diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]); | |||||
| scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); | |||||
| scales[3] = _mm_mask_ps(scale, active_pixels); | |||||
| scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels); | |||||
| diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]); | |||||
| scale = _mm_mul_ps(diff, diff); | |||||
| diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]); | |||||
| scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); | |||||
| diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]); | |||||
| scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); | |||||
| scales[5] = _mm_mask_ps(scale, active_pixels); | |||||
| } | |||||
| ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) | |||||
| { | |||||
| scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); | |||||
| scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); | |||||
| scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); | |||||
| scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); | |||||
| scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); | |||||
| scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); | |||||
| } | |||||
| CCL_NAMESPACE_END | |||||