Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/kernel/filter/filter_nlm_cpu.h
| Show All 39 Lines | ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, | ||||
| for(int y = rect.y; y < rect.w; y++) { | for(int y = rect.y; y < rect.w; y++) { | ||||
| int idx_p = y*stride + aligned_lowx; | int idx_p = y*stride + aligned_lowx; | ||||
| int idx_q = (y+dy)*stride + aligned_lowx + dx + frame_offset; | int idx_q = (y+dy)*stride + aligned_lowx + dx + frame_offset; | ||||
| for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { | for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { | ||||
| float4 diff = make_float4(0.0f); | float4 diff = make_float4(0.0f); | ||||
| float4 scale_fac; | float4 scale_fac; | ||||
| if(scale_image) { | if(scale_image) { | ||||
| scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q), | scale_fac = (idx_q < 0) ? make_float4(0.f) : clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q), | ||||
| make_float4(0.25f), make_float4(4.0f)); | make_float4(0.25f), make_float4(4.0f)); | ||||
| } | } | ||||
| else { | else { | ||||
| scale_fac = make_float4(1.0f); | scale_fac = make_float4(1.0f); | ||||
| } | } | ||||
| for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) { | for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) { | ||||
| /* idx_p is guaranteed to be aligned, but idx_q isn't. */ | /* idx_p is guaranteed to be aligned, but idx_q isn't. */ | ||||
| float4 color_p = load4_a(weight_image, idx_p + chan_ofs); | float4 color_p = load4_a(weight_image, idx_p + chan_ofs); | ||||
| float4 color_q = scale_fac*load4_u(weight_image, idx_q + chan_ofs); | float4 color_q = ((idx_q + chan_ofs) < 0) ? make_float4(0.f) : scale_fac*load4_u(weight_image, idx_q + chan_ofs); | ||||
| float4 cdiff = color_p - color_q; | float4 cdiff = color_p - color_q; | ||||
| float4 var_p = load4_a(variance_image, idx_p + chan_ofs); | float4 var_p = load4_a(variance_image, idx_p + chan_ofs); | ||||
| float4 var_q = sqr(scale_fac)*load4_u(variance_image, idx_q + chan_ofs); | float4 var_q = ((idx_q + chan_ofs) < 0) ? make_float4(0.f) : sqr(scale_fac)*load4_u(variance_image, idx_q + chan_ofs); | ||||
| diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q)); | diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q)); | ||||
| } | } | ||||
| load4_a(difference_image, idx_p) = diff*channel_fac; | load4_a(difference_image, idx_p) = diff*channel_fac; | ||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image, | ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image, | ||||
| Show All 39 Lines | for(int dx = -f; dx <= f; dx++) { | ||||
| int highx = rect.z - max(0, dx); | int highx = rect.z - max(0, dx); | ||||
| int4 lowx4 = make_int4(rect.x - min(0, dx)); | int4 lowx4 = make_int4(rect.x - min(0, dx)); | ||||
| int4 highx4 = make_int4(rect.z - max(0, dx)); | int4 highx4 = make_int4(rect.z - max(0, dx)); | ||||
| for(int y = rect.y; y < rect.w; y++) { | for(int y = rect.y; y < rect.w; y++) { | ||||
| for(int x = aligned_lowx; x < highx; x += 4) { | for(int x = aligned_lowx; x < highx; x += 4) { | ||||
| int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); | int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); | ||||
| int4 active = (x4 >= lowx4) & (x4 < highx4); | int4 active = (x4 >= lowx4) & (x4 < highx4); | ||||
| float4 diff = load4_u(difference_image, y*stride + x + dx); | float4 diff = ((x + dx) < 0) ? make_float4(0.f) : load4_u(difference_image, y*stride + x + dx); | ||||
| load4_a(out_image, y*stride + x) += mask(active, diff); | load4_a(out_image, y*stride + x) += mask(active, diff); | ||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| aligned_lowx = round_down(rect.x, 4); | aligned_lowx = round_down(rect.x, 4); | ||||
| for(int y = rect.y; y < rect.w; y++) { | for(int y = rect.y; y < rect.w; y++) { | ||||
| for(int x = aligned_lowx; x < rect.z; x += 4) { | for(int x = aligned_lowx; x < rect.z; x += 4) { | ||||
| Show All 40 Lines | for(int x = aligned_lowx; x < rect.z; x += 4) { | ||||
| int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); | int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); | ||||
| int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z)); | int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z)); | ||||
| int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); | int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); | ||||
| float4 weight = load4_a(temp_image, idx_p); | float4 weight = load4_a(temp_image, idx_p); | ||||
| load4_a(accum_image, idx_p) += mask(active, weight); | load4_a(accum_image, idx_p) += mask(active, weight); | ||||
| float4 val = load4_u(image, idx_q); | float4 val = (idx_q < 0) ? make_float4(0.f) : load4_u(image, idx_q); | ||||
| if(channel_offset) { | if(channel_offset) { | ||||
| val += load4_u(image, idx_q + channel_offset); | val += ((idx_q + channel_offset) < 0) ? make_float4(0.f) : load4_u(image, idx_q + channel_offset); | ||||
| val += load4_u(image, idx_q + 2*channel_offset); | val += ((idx_q + 2 * channel_offset) < 0) ? make_float4(0.f) : load4_u(image, idx_q + 2*channel_offset); | ||||
| val *= 1.0f/3.0f; | val *= 1.0f/3.0f; | ||||
| } | } | ||||
| load4_a(out_image, idx_p) += mask(active, weight*val); | load4_a(out_image, idx_p) += mask(active, weight*val); | ||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| ▲ Show 20 Lines • Show All 61 Lines • Show Last 20 Lines | |||||