Changeset View
Changeset View
Standalone View
Standalone View
intern/cycles/device/device_cpu.cpp
| Show First 20 Lines • Show All 72 Lines • ▼ Show 20 Lines | |||||
| #endif | #endif | ||||
| /* do now to avoid thread issues */ | /* do now to avoid thread issues */ | ||||
| system_cpu_support_sse2(); | system_cpu_support_sse2(); | ||||
| system_cpu_support_sse3(); | system_cpu_support_sse3(); | ||||
| system_cpu_support_sse41(); | system_cpu_support_sse41(); | ||||
| system_cpu_support_avx(); | system_cpu_support_avx(); | ||||
| system_cpu_support_avx2(); | system_cpu_support_avx2(); | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 | |||||
| if(system_cpu_support_avx2()) { | |||||
| VLOG(1) << "Will be using AVX2 kernels."; | |||||
| } | |||||
| else | |||||
| #endif | |||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX | |||||
| if(system_cpu_support_avx()) { | |||||
| VLOG(1) << "Will be using AVX kernels."; | |||||
| } | |||||
| else | |||||
| #endif | |||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 | |||||
| if(system_cpu_support_sse41()) { | |||||
| VLOG(1) << "Will be using SSE4.1 kernels."; | |||||
| } | |||||
| else | |||||
| #endif | |||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 | |||||
| if(system_cpu_support_sse3()) { | |||||
| VLOG(1) << "Will be using SSE3 kernels."; | |||||
| } | |||||
| else | |||||
| #endif | |||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 | |||||
| if(system_cpu_support_sse2()) { | |||||
| VLOG(1) << "Will be using SSE2 kernels."; | |||||
| } | |||||
| else | |||||
| #endif | |||||
| { | |||||
| VLOG(1) << "Will be using regular kernels."; | |||||
| } | |||||
| } | } | ||||
| ~CPUDevice() | ~CPUDevice() | ||||
| { | { | ||||
| task_pool.stop(); | task_pool.stop(); | ||||
| } | } | ||||
| void mem_alloc(device_memory& mem, MemoryType /*type*/) | void mem_alloc(device_memory& mem, MemoryType /*type*/) | ||||
| ▲ Show 20 Lines • Show All 87 Lines • ▼ Show 20 Lines | public: | ||||
| : DeviceTask(task) | : DeviceTask(task) | ||||
| { | { | ||||
| run = function_bind(&CPUDevice::thread_run, device, this); | run = function_bind(&CPUDevice::thread_run, device, this); | ||||
| } | } | ||||
| }; | }; | ||||
| void thread_path_trace(DeviceTask& task) | void thread_path_trace(DeviceTask& task) | ||||
| { | { | ||||
| static bool cpu_type_logged = false; | |||||
| if(task_pool.canceled()) { | if(task_pool.canceled()) { | ||||
| if(task.need_finish_queue == false) | if(task.need_finish_queue == false) | ||||
| return; | return; | ||||
| } | } | ||||
| KernelGlobals kg = kernel_globals; | KernelGlobals kg = kernel_globals; | ||||
| #ifdef WITH_OSL | #ifdef WITH_OSL | ||||
| OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); | OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); | ||||
| #endif | #endif | ||||
| RenderTile tile; | RenderTile tile; | ||||
| void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int); | void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int); | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 | ||||
| if(system_cpu_support_avx2()) { | if(system_cpu_support_avx2()) { | ||||
| path_trace_kernel = kernel_cpu_avx2_path_trace; | path_trace_kernel = kernel_cpu_avx2_path_trace; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Path tracing using AVX2 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX | ||||
| if(system_cpu_support_avx()) { | if(system_cpu_support_avx()) { | ||||
| path_trace_kernel = kernel_cpu_avx_path_trace; | path_trace_kernel = kernel_cpu_avx_path_trace; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Path tracing using AVX kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 | ||||
| if(system_cpu_support_sse41()) { | if(system_cpu_support_sse41()) { | ||||
| path_trace_kernel = kernel_cpu_sse41_path_trace; | path_trace_kernel = kernel_cpu_sse41_path_trace; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Path tracing using SSE4.1 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 | ||||
| if(system_cpu_support_sse3()) { | if(system_cpu_support_sse3()) { | ||||
| path_trace_kernel = kernel_cpu_sse3_path_trace; | path_trace_kernel = kernel_cpu_sse3_path_trace; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Path tracing using SSE3 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 | ||||
| if(system_cpu_support_sse2()) { | if(system_cpu_support_sse2()) { | ||||
| path_trace_kernel = kernel_cpu_sse2_path_trace; | path_trace_kernel = kernel_cpu_sse2_path_trace; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Path tracing using SSE2 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| { | { | ||||
| path_trace_kernel = kernel_cpu_path_trace; | path_trace_kernel = kernel_cpu_path_trace; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Path tracing using regular kernel."; | |||||
| } | } | ||||
| while(task.acquire_tile(this, tile)) { | while(task.acquire_tile(this, tile)) { | ||||
| float *render_buffer = (float*)tile.buffer; | float *render_buffer = (float*)tile.buffer; | ||||
| uint *rng_state = (uint*)tile.rng_state; | uint *rng_state = (uint*)tile.rng_state; | ||||
| int start_sample = tile.start_sample; | int start_sample = tile.start_sample; | ||||
| int end_sample = tile.start_sample + tile.num_samples; | int end_sample = tile.start_sample + tile.num_samples; | ||||
| Show All 25 Lines | |||||
| #ifdef WITH_OSL | #ifdef WITH_OSL | ||||
| OSLShader::thread_free(&kg); | OSLShader::thread_free(&kg); | ||||
| #endif | #endif | ||||
| } | } | ||||
| void thread_film_convert(DeviceTask& task) | void thread_film_convert(DeviceTask& task) | ||||
| { | { | ||||
| static bool cpu_type_logged = false; | |||||
| float sample_scale = 1.0f/(task.sample + 1); | float sample_scale = 1.0f/(task.sample + 1); | ||||
| if(task.rgba_half) { | if(task.rgba_half) { | ||||
| void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); | void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 | ||||
| if(system_cpu_support_avx2()) { | if(system_cpu_support_avx2()) { | ||||
| convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float; | convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Converting to half float using AVX2 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX | ||||
| if(system_cpu_support_avx()) { | if(system_cpu_support_avx()) { | ||||
| convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float; | convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Converting to half float using AVX kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 | ||||
| if(system_cpu_support_sse41()) { | if(system_cpu_support_sse41()) { | ||||
| convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float; | convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Converting to half float using SSE4.1 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 | ||||
| if(system_cpu_support_sse3()) { | if(system_cpu_support_sse3()) { | ||||
| convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float; | convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Converting to half float using SSE3 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 | ||||
| if(system_cpu_support_sse2()) { | if(system_cpu_support_sse2()) { | ||||
| convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float; | convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Converting to half float using SSE2 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| { | { | ||||
| convert_to_half_float_kernel = kernel_cpu_convert_to_half_float; | convert_to_half_float_kernel = kernel_cpu_convert_to_half_float; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Converting to half float using regular kernel."; | |||||
| } | } | ||||
| for(int y = task.y; y < task.y + task.h; y++) | for(int y = task.y; y < task.y + task.h; y++) | ||||
| for(int x = task.x; x < task.x + task.w; x++) | for(int x = task.x; x < task.x + task.w; x++) | ||||
| convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, | convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, | ||||
| sample_scale, x, y, task.offset, task.stride); | sample_scale, x, y, task.offset, task.stride); | ||||
| } | } | ||||
| else { | else { | ||||
| void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); | void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 | ||||
| if(system_cpu_support_avx2()) { | if(system_cpu_support_avx2()) { | ||||
| convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte; | convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Converting to byte using AVX2 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX | ||||
| if(system_cpu_support_avx()) { | if(system_cpu_support_avx()) { | ||||
| convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte; | convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Converting to byte using AVX kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 | ||||
| if(system_cpu_support_sse41()) { | if(system_cpu_support_sse41()) { | ||||
| convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte; | convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Converting to byte using SSE4.1 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 | ||||
| if(system_cpu_support_sse3()) { | if(system_cpu_support_sse3()) { | ||||
| convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte; | convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Converting to byte using SSE3 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 | ||||
| if(system_cpu_support_sse2()) { | if(system_cpu_support_sse2()) { | ||||
| convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte; | convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Converting to byte using SSE2 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| { | { | ||||
| convert_to_byte_kernel = kernel_cpu_convert_to_byte; | convert_to_byte_kernel = kernel_cpu_convert_to_byte; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Converting to byte using regular kernel."; | |||||
| } | } | ||||
| for(int y = task.y; y < task.y + task.h; y++) | for(int y = task.y; y < task.y + task.h; y++) | ||||
| for(int x = task.x; x < task.x + task.w; x++) | for(int x = task.x; x < task.x + task.w; x++) | ||||
| convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, | convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, | ||||
| sample_scale, x, y, task.offset, task.stride); | sample_scale, x, y, task.offset, task.stride); | ||||
| } | } | ||||
| } | } | ||||
| void thread_shader(DeviceTask& task) | void thread_shader(DeviceTask& task) | ||||
| { | { | ||||
| KernelGlobals kg = kernel_globals; | KernelGlobals kg = kernel_globals; | ||||
| static bool cpu_type_logged = false; | |||||
| #ifdef WITH_OSL | #ifdef WITH_OSL | ||||
| OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); | OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); | ||||
| #endif | #endif | ||||
| void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int); | void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int); | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 | ||||
| if(system_cpu_support_avx2()) { | if(system_cpu_support_avx2()) { | ||||
| shader_kernel = kernel_cpu_avx2_shader; | shader_kernel = kernel_cpu_avx2_shader; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Shading using AVX2 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX | ||||
| if(system_cpu_support_avx()) { | if(system_cpu_support_avx()) { | ||||
| shader_kernel = kernel_cpu_avx_shader; | shader_kernel = kernel_cpu_avx_shader; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Shading using AVX kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 | ||||
| if(system_cpu_support_sse41()) { | if(system_cpu_support_sse41()) { | ||||
| shader_kernel = kernel_cpu_sse41_shader; | shader_kernel = kernel_cpu_sse41_shader; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Shading using SSE4.1 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 | ||||
| if(system_cpu_support_sse3()) { | if(system_cpu_support_sse3()) { | ||||
| shader_kernel = kernel_cpu_sse3_shader; | shader_kernel = kernel_cpu_sse3_shader; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Shading using SSE3 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 | #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 | ||||
| if(system_cpu_support_sse2()) { | if(system_cpu_support_sse2()) { | ||||
| shader_kernel = kernel_cpu_sse2_shader; | shader_kernel = kernel_cpu_sse2_shader; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Shading using SSE2 kernel."; | |||||
| } | } | ||||
| else | else | ||||
| #endif | #endif | ||||
| { | { | ||||
| shader_kernel = kernel_cpu_shader; | shader_kernel = kernel_cpu_shader; | ||||
| VLOG_ONCE(1, cpu_type_logged) << "Shading using regular kernel."; | |||||
| } | } | ||||
| for(int sample = 0; sample < task.num_samples; sample++) { | for(int sample = 0; sample < task.num_samples; sample++) { | ||||
| for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) | for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) | ||||
| shader_kernel(&kg, | shader_kernel(&kg, | ||||
| (uint4*)task.shader_input, | (uint4*)task.shader_input, | ||||
| (float4*)task.shader_output, | (float4*)task.shader_output, | ||||
| (float*)task.shader_output_luma, | (float*)task.shader_output_luma, | ||||
| ▲ Show 20 Lines • Show All 83 Lines • Show Last 20 Lines | |||||