source/blender/gpu/intern/gpu_codegen.cc
| Show First 20 Lines • Show All 89 Lines • ▼ Show 20 Lines | struct GPUPass { | ||||
| GPUShader *shader; | GPUShader *shader; | ||||
| GPUCodegenCreateInfo *create_info = nullptr; | GPUCodegenCreateInfo *create_info = nullptr; | ||||
| /** Orphaned GPUPasses get freed by the garbage collector. */ | /** Orphaned GPUPasses get freed by the garbage collector. */ | ||||
| uint refcount; | uint refcount; | ||||
| /** Identity hash generated from all GLSL code. */ | /** Identity hash generated from all GLSL code. */ | ||||
| uint32_t hash; | uint32_t hash; | ||||
| /** Did we already try to compile the attached GPUShader? */ | /** Did we already try to compile the attached GPUShader? */ | ||||
| bool compiled; | bool compiled; | ||||
| /** Hint that an optimized variant of this pass should be created based on a complexity heuristic | |||||
| * during pass code generation. */ | |||||
| bool should_optimize; | |||||
| }; | }; | ||||
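A minimal self-contained sketch of the ref-counting contract described by the GPUPass comments above. All names here (`DemoPass`, `demo_acquire`, `demo_release`) are invented for illustration and are not the Blender API:

#include <cstdint>
#include <cstdio>

struct DemoPass {
  uint32_t hash = 0;            /* Identity hash of the generated GLSL source. */
  unsigned int refcount = 0;    /* Orphaned passes (refcount == 0) are garbage collected. */
  bool compiled = false;        /* Whether compiling the attached shader was already attempted. */
  bool should_optimize = false; /* Hint set from the codegen complexity heuristic. */
};

static void demo_acquire(DemoPass &pass)
{
  pass.refcount += 1;
}

/* Returns true once the pass is orphaned and may be freed by the collector. */
static bool demo_release(DemoPass &pass)
{
  return --pass.refcount == 0;
}

int main()
{
  DemoPass pass;
  demo_acquire(pass);
  std::printf("orphaned: %d\n", demo_release(pass) ? 1 : 0); /* Prints "orphaned: 1". */
  return 0;
}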
| /* -------------------------------------------------------------------- */ | /* -------------------------------------------------------------------- */ | ||||
| /** \name GPUPass Cache | /** \name GPUPass Cache | ||||
| * | * | ||||
| * Internal shader cache: This prevents the shader recompilation / stall when | * Internal shader cache: This prevents the shader recompilation / stall when | ||||
| * using undo/redo AND also allows for GPUPass reuse if the Shader code is the | * using undo/redo AND also allows for GPUPass reuse if the Shader code is the | ||||
| * same for 2 different Materials. Unused GPUPasses are freed by garbage collection. | * same for 2 different Materials. Unused GPUPasses are freed by garbage collection. | ||||
| ▲ Show 20 Lines • Show All 131 Lines • ▼ Show 20 Lines | public: | ||||
| GPUCodegenCreateInfo *create_info = nullptr; | GPUCodegenCreateInfo *create_info = nullptr; | ||||
| private: | private: | ||||
| uint32_t hash_ = 0; | uint32_t hash_ = 0; | ||||
| BLI_HashMurmur2A hm2a_; | BLI_HashMurmur2A hm2a_; | ||||
| ListBase ubo_inputs_ = {nullptr, nullptr}; | ListBase ubo_inputs_ = {nullptr, nullptr}; | ||||
| GPUInput *cryptomatte_input_ = nullptr; | GPUInput *cryptomatte_input_ = nullptr; | ||||
| /** Cache parameters for complexity heuristic. */ | |||||
| uint nodes_total_ = 0; | |||||
| uint textures_total_ = 0; | |||||
| uint uniforms_total_ = 0; | |||||
| public: | public: | ||||
| GPUCodegen(GPUMaterial *mat_, GPUNodeGraph *graph_) : mat(*mat_), graph(*graph_) | GPUCodegen(GPUMaterial *mat_, GPUNodeGraph *graph_) : mat(*mat_), graph(*graph_) | ||||
| { | { | ||||
| BLI_hash_mm2a_init(&hm2a_, GPU_material_uuid_get(&mat)); | BLI_hash_mm2a_init(&hm2a_, GPU_material_uuid_get(&mat)); | ||||
| BLI_hash_mm2a_add_int(&hm2a_, GPU_material_flag(&mat)); | BLI_hash_mm2a_add_int(&hm2a_, GPU_material_flag(&mat)); | ||||
| create_info = new GPUCodegenCreateInfo("codegen"); | create_info = new GPUCodegenCreateInfo("codegen"); | ||||
| output.create_info = reinterpret_cast<GPUShaderCreateInfo *>( | output.create_info = reinterpret_cast<GPUShaderCreateInfo *>( | ||||
| static_cast<ShaderCreateInfo *>(create_info)); | static_cast<ShaderCreateInfo *>(create_info)); | ||||
| Show All 24 Lines | public: | ||||
| void generate_resources(); | void generate_resources(); | ||||
| void generate_library(); | void generate_library(); | ||||
| uint32_t hash_get() const | uint32_t hash_get() const | ||||
| { | { | ||||
| return hash_; | return hash_; | ||||
| } | } | ||||
| /* Heuristic determined during pass codegen for whether a | |||||
| * more optimal variant of this material should be compiled. */ | |||||
| bool should_optimize_heuristic() const | |||||
| { | |||||
| bool do_optimize = (nodes_total_ >= 100 || textures_total_ >= 4 || uniforms_total_ >= 64); | |||||
| return do_optimize; | |||||
| } | |||||
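For reference, a standalone sketch of the complexity heuristic above, with the thresholds copied from the patch. The free function `should_optimize` and the sample counts are illustrative only:

#include <cstdio>

/* Mirrors should_optimize_heuristic(): a pass is worth an optimized recompilation
 * once the generated graph is "complex enough". */
static bool should_optimize(unsigned int nodes, unsigned int textures, unsigned int uniforms)
{
  return nodes >= 100 || textures >= 4 || uniforms >= 64;
}

int main()
{
  std::printf("%d\n", should_optimize(12, 1, 8));   /* 0: small graph, keep the generic pass. */
  std::printf("%d\n", should_optimize(250, 2, 16)); /* 1: node count alone triggers optimization. */
  return 0;
}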
| private: | private: | ||||
| void set_unique_ids(); | void set_unique_ids(); | ||||
| void node_serialize(std::stringstream &eval_ss, const GPUNode *node); | void node_serialize(std::stringstream &eval_ss, const GPUNode *node); | ||||
| char *graph_serialize(eGPUNodeTag tree_tag, GPUNodeLink *output_link); | char *graph_serialize(eGPUNodeTag tree_tag, GPUNodeLink *output_link); | ||||
| char *graph_serialize(eGPUNodeTag tree_tag); | char *graph_serialize(eGPUNodeTag tree_tag); | ||||
| static char *extract_c_str(std::stringstream &stream) | static char *extract_c_str(std::stringstream &stream) | ||||
| ▲ Show 20 Lines • Show All 105 Lines • ▼ Show 20 Lines | else if (tex->tiled_mapping_name[0] != '\0') { | ||||
| info.sampler(slot++, ImageType::FLOAT_1D_ARRAY, name_mapping, Frequency::BATCH); | info.sampler(slot++, ImageType::FLOAT_1D_ARRAY, name_mapping, Frequency::BATCH); | ||||
| } | } | ||||
| else { | else { | ||||
| const char *name = info.name_buffer.append_sampler_name(tex->sampler_name); | const char *name = info.name_buffer.append_sampler_name(tex->sampler_name); | ||||
| info.sampler(slot++, ImageType::FLOAT_2D, name, Frequency::BATCH); | info.sampler(slot++, ImageType::FLOAT_2D, name, Frequency::BATCH); | ||||
| } | } | ||||
| } | } | ||||
| /* Increment heuristic. */ | |||||
| textures_total_ = slot; | |||||
| if (!BLI_listbase_is_empty(&ubo_inputs_)) { | if (!BLI_listbase_is_empty(&ubo_inputs_)) { | ||||
| /* NOTE: generate_uniform_buffer() should have sorted the inputs before this. */ | /* NOTE: generate_uniform_buffer() should have sorted the inputs before this. */ | ||||
| ss << "struct NodeTree {\n"; | ss << "struct NodeTree {\n"; | ||||
| LISTBASE_FOREACH (LinkData *, link, &ubo_inputs_) { | LISTBASE_FOREACH (LinkData *, link, &ubo_inputs_) { | ||||
| GPUInput *input = (GPUInput *)(link->data); | GPUInput *input = (GPUInput *)(link->data); | ||||
| if (input->source == GPU_SOURCE_CRYPTOMATTE) { | if (input->source == GPU_SOURCE_CRYPTOMATTE) { | ||||
| ss << input->type << " crypto_hash;\n"; | ss << input->type << " crypto_hash;\n"; | ||||
| } | } | ||||
| Show All 21 Lines | void GPUCodegen::generate_resources() | ||||
| info.typedef_source_generated = ss.str(); | info.typedef_source_generated = ss.str(); | ||||
| } | } | ||||
| void GPUCodegen::generate_library() | void GPUCodegen::generate_library() | ||||
| { | { | ||||
| GPUCodegenCreateInfo &info = *create_info; | GPUCodegenCreateInfo &info = *create_info; | ||||
| void *value; | void *value; | ||||
| GSetIterState pop_state = {}; | /* Iterate over libraries. We need to keep this struct intact in case | ||||
| while (BLI_gset_pop(graph.used_libraries, &pop_state, &value)) { | * it is required for the optimization pass. */ | ||||
| GHashIterator *ihash = BLI_ghashIterator_new((GHash *)graph.used_libraries); | |||||
| while (!BLI_ghashIterator_done(ihash)) { | |||||
| value = BLI_ghashIterator_getKey(ihash); | |||||
| auto deps = gpu_shader_dependency_get_resolved_source((const char *)value); | auto deps = gpu_shader_dependency_get_resolved_source((const char *)value); | ||||
| info.dependencies_generated.extend_non_duplicates(deps); | info.dependencies_generated.extend_non_duplicates(deps); | ||||
| BLI_ghashIterator_step(ihash); | |||||
| } | } | ||||
| BLI_ghashIterator_free(ihash); | |||||
| } | } | ||||
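The switch from BLI_gset_pop() to a GHash iterator matters because popping empties used_libraries, which the later optimization pass still needs. A minimal sketch of the same idea, using std::unordered_set purely for illustration (library names are made up; the real code uses BLI's GSet/GHash):

#include <cstdio>
#include <string>
#include <unordered_set>

int main()
{
  std::unordered_set<std::string> used_libraries = {"demo_math_lib", "demo_noise_lib"};

  /* Old approach: pop entries until the container is empty, destroying it.
   * New approach: iterate without erasing, so a later (optimization) pass can reuse it. */
  for (const std::string &lib : used_libraries) {
    std::printf("resolve dependencies for: %s\n", lib.c_str());
  }
  std::printf("libraries still tracked: %zu\n", used_libraries.size()); /* Prints 2. */
  return 0;
}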
| void GPUCodegen::node_serialize(std::stringstream &eval_ss, const GPUNode *node) | void GPUCodegen::node_serialize(std::stringstream &eval_ss, const GPUNode *node) | ||||
| { | { | ||||
| /* Declare constants. */ | /* Declare constants. */ | ||||
| LISTBASE_FOREACH (GPUInput *, input, &node->inputs) { | LISTBASE_FOREACH (GPUInput *, input, &node->inputs) { | ||||
| switch (input->source) { | switch (input->source) { | ||||
| case GPU_SOURCE_FUNCTION_CALL: | case GPU_SOURCE_FUNCTION_CALL: | ||||
| ▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines | void GPUCodegen::node_serialize(std::stringstream &eval_ss, const GPUNode *node) | ||||
| /* Output arguments. */ | /* Output arguments. */ | ||||
| LISTBASE_FOREACH (GPUOutput *, output, &node->outputs) { | LISTBASE_FOREACH (GPUOutput *, output, &node->outputs) { | ||||
| eval_ss << output; | eval_ss << output; | ||||
| if (output->next) { | if (output->next) { | ||||
| eval_ss << ", "; | eval_ss << ", "; | ||||
| } | } | ||||
| } | } | ||||
| eval_ss << ");\n\n"; | eval_ss << ");\n\n"; | ||||
| /* Increment heuristic. */ | |||||
| nodes_total_++; | |||||
| } | } | ||||
| char *GPUCodegen::graph_serialize(eGPUNodeTag tree_tag, GPUNodeLink *output_link) | char *GPUCodegen::graph_serialize(eGPUNodeTag tree_tag, GPUNodeLink *output_link) | ||||
| { | { | ||||
| if (output_link == nullptr) { | if (output_link == nullptr) { | ||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| ▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines | |||||
| void GPUCodegen::generate_uniform_buffer() | void GPUCodegen::generate_uniform_buffer() | ||||
| { | { | ||||
| /* Extract uniform inputs. */ | /* Extract uniform inputs. */ | ||||
| LISTBASE_FOREACH (GPUNode *, node, &graph.nodes) { | LISTBASE_FOREACH (GPUNode *, node, &graph.nodes) { | ||||
| LISTBASE_FOREACH (GPUInput *, input, &node->inputs) { | LISTBASE_FOREACH (GPUInput *, input, &node->inputs) { | ||||
| if (input->source == GPU_SOURCE_UNIFORM && !input->link) { | if (input->source == GPU_SOURCE_UNIFORM && !input->link) { | ||||
| /* We handle the UBO uniforms separately. */ | /* We handle the UBO uniforms separately. */ | ||||
| BLI_addtail(&ubo_inputs_, BLI_genericNodeN(input)); | BLI_addtail(&ubo_inputs_, BLI_genericNodeN(input)); | ||||
| uniforms_total_++; | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| if (!BLI_listbase_is_empty(&ubo_inputs_)) { | if (!BLI_listbase_is_empty(&ubo_inputs_)) { | ||||
| /* This sorts the inputs based on size. */ | /* This sorts the inputs based on size. */ | ||||
| GPU_material_uniform_buffer_create(&mat, &ubo_inputs_); | GPU_material_uniform_buffer_create(&mat, &ubo_inputs_); | ||||
| } | } | ||||
| } | } | ||||
| Show All 11 Lines | LISTBASE_FOREACH (GPUNode *, node, &graph.nodes) { | ||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| void GPUCodegen::generate_graphs() | void GPUCodegen::generate_graphs() | ||||
| { | { | ||||
| set_unique_ids(); | set_unique_ids(); | ||||
| /* Serialize graph. */ | |||||
| output.surface = graph_serialize(GPU_NODE_TAG_SURFACE | GPU_NODE_TAG_AOV, graph.outlink_surface); | output.surface = graph_serialize(GPU_NODE_TAG_SURFACE | GPU_NODE_TAG_AOV, graph.outlink_surface); | ||||
| output.volume = graph_serialize(GPU_NODE_TAG_VOLUME, graph.outlink_volume); | output.volume = graph_serialize(GPU_NODE_TAG_VOLUME, graph.outlink_volume); | ||||
| output.displacement = graph_serialize(GPU_NODE_TAG_DISPLACEMENT, graph.outlink_displacement); | output.displacement = graph_serialize(GPU_NODE_TAG_DISPLACEMENT, graph.outlink_displacement); | ||||
| output.thickness = graph_serialize(GPU_NODE_TAG_THICKNESS, graph.outlink_thickness); | output.thickness = graph_serialize(GPU_NODE_TAG_THICKNESS, graph.outlink_thickness); | ||||
| if (!BLI_listbase_is_empty(&graph.outlink_compositor)) { | if (!BLI_listbase_is_empty(&graph.outlink_compositor)) { | ||||
| output.composite = graph_serialize(GPU_NODE_TAG_COMPOSITOR); | output.composite = graph_serialize(GPU_NODE_TAG_COMPOSITOR); | ||||
| } | } | ||||
| Show All 19 Lines | |||||
| /* -------------------------------------------------------------------- */ | /* -------------------------------------------------------------------- */ | ||||
| /** \name GPUPass | /** \name GPUPass | ||||
| * \{ */ | * \{ */ | ||||
| GPUPass *GPU_generate_pass(GPUMaterial *material, | GPUPass *GPU_generate_pass(GPUMaterial *material, | ||||
| GPUNodeGraph *graph, | GPUNodeGraph *graph, | ||||
| GPUCodegenCallbackFn finalize_source_cb, | GPUCodegenCallbackFn finalize_source_cb, | ||||
| void *thunk) | void *thunk, | ||||
| bool optimize_graph) | |||||
| { | { | ||||
| gpu_node_graph_prune_unused(graph); | gpu_node_graph_prune_unused(graph); | ||||
| /* If the optimize_graph flag is passed in, we are generating an optimized | |||||
| * variant of the GPUMaterial's GPUPass. */ | |||||
| if (optimize_graph) { | |||||
| gpu_node_graph_optimize(graph); | |||||
| } | |||||
| /* Extract attributes before compiling so the generated VBOs are ready to accept the future | /* Extract attributes before compiling so the generated VBOs are ready to accept the future | ||||
| * shader. */ | * shader. */ | ||||
| gpu_node_graph_finalize_uniform_attrs(graph); | gpu_node_graph_finalize_uniform_attrs(graph); | ||||
| GPUCodegen codegen(material, graph); | GPUCodegen codegen(material, graph); | ||||
| codegen.generate_graphs(); | codegen.generate_graphs(); | ||||
| codegen.generate_cryptomatte(); | codegen.generate_cryptomatte(); | ||||
| GPUPass *pass_hash = nullptr; | |||||
| if (!optimize_graph) { | |||||
| /* The optimized version of the shader should not re-generate a UBO. | |||||
| * The UBO will not be used for this variant. */ | |||||
| codegen.generate_uniform_buffer(); | codegen.generate_uniform_buffer(); | ||||
| /* Cache lookup: Reuse shaders already compiled. */ | /** Cache lookup: Reuse shaders already compiled. | ||||
| GPUPass *pass_hash = gpu_pass_cache_lookup(codegen.hash_get()); | * NOTE: We only perform cache look-up for non-optimized shader | ||||
| * graphs, as baked constant data amongst other optimizations will generate too many | |||||
| * shader source permutations, with minimal re-usability. */ | |||||
| pass_hash = gpu_pass_cache_lookup(codegen.hash_get()); | |||||
| /* FIXME(fclem): This is broken. Since we only check for the hash and not the full source | /* FIXME(fclem): This is broken. Since we only check for the hash and not the full source | ||||
| * there is no way to have a collision currently. Some advocated to only use a bigger hash. */ | * there is no way to have a collision currently. Some advocated to only use a bigger hash. */ | ||||
| if (pass_hash && (pass_hash->next == nullptr || pass_hash->next->hash != codegen.hash_get())) { | if (pass_hash && (pass_hash->next == nullptr || pass_hash->next->hash != codegen.hash_get())) { | ||||
| if (!gpu_pass_is_valid(pass_hash)) { | if (!gpu_pass_is_valid(pass_hash)) { | ||||
| /* Shader has already been created but failed to compile. */ | /* Shader has already been created but failed to compile. */ | ||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| /* No collision, just return the pass. */ | /* No collision, just return the pass. */ | ||||
| BLI_spin_lock(&pass_cache_spin); | BLI_spin_lock(&pass_cache_spin); | ||||
| pass_hash->refcount += 1; | pass_hash->refcount += 1; | ||||
| BLI_spin_unlock(&pass_cache_spin); | BLI_spin_unlock(&pass_cache_spin); | ||||
| return pass_hash; | return pass_hash; | ||||
| } | } | ||||
| } | |||||
| /* Either the shader is not compiled or there is a hash collision... | /* Either the shader is not compiled or there is a hash collision... | ||||
| * continue generating the shader strings. */ | * continue generating the shader strings. */ | ||||
| codegen.generate_attribs(); | codegen.generate_attribs(); | ||||
| codegen.generate_resources(); | codegen.generate_resources(); | ||||
| codegen.generate_library(); | codegen.generate_library(); | ||||
| /* Make engine add its own code and implement the generated functions. */ | /* Make engine add its own code and implement the generated functions. */ | ||||
| Show All 20 Lines | else { | ||||
| /* We still create a pass even if shader compilation | /* We still create a pass even if shader compilation | ||||
| * fails to avoid trying to compile again and again. */ | * fails to avoid trying to compile again and again. */ | ||||
| pass = (GPUPass *)MEM_callocN(sizeof(GPUPass), "GPUPass"); | pass = (GPUPass *)MEM_callocN(sizeof(GPUPass), "GPUPass"); | ||||
| pass->shader = nullptr; | pass->shader = nullptr; | ||||
| pass->refcount = 1; | pass->refcount = 1; | ||||
| pass->create_info = codegen.create_info; | pass->create_info = codegen.create_info; | ||||
| pass->hash = codegen.hash_get(); | pass->hash = codegen.hash_get(); | ||||
| pass->compiled = false; | pass->compiled = false; | ||||
| /* Only flag the pass optimization hint if this is the first generated pass for a material. | |||||
| * Optimized passes cannot be optimized further, even if the heuristic would still | |||||
| * flag them as candidates. */ | |||||
| pass->should_optimize = (!optimize_graph) && codegen.should_optimize_heuristic(); | |||||
| codegen.create_info = nullptr; | codegen.create_info = nullptr; | ||||
| /* Only insert non-optimized graphs into cache. | |||||
| * Optimized graphs will continuously be recompiled with new unique source during material | |||||
| * editing, causing the cache to fill up quickly with materials offering minimal | |||||
| * re-use. */ | |||||
| if (!optimize_graph) { | |||||
| gpu_pass_cache_insert_after(pass_hash, pass); | gpu_pass_cache_insert_after(pass_hash, pass); | ||||
| } | } | ||||
| } | |||||
| return pass; | return pass; | ||||
| } | } | ||||
| bool GPU_pass_should_optimize(GPUPass *pass) | |||||
| { | |||||
| /* Return the optimization heuristic prepared during | |||||
| * initial codegen. */ | |||||
| return pass->should_optimize; | |||||
| } | |||||
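A minimal mock of how this hint is meant to be consumed: the first, generic pass can be used right away while an optimized variant is queued for deferred compilation. The types and functions below are invented for illustration and are not the Blender API:

#include <cstdio>

struct MockPass {
  bool should_optimize = false; /* Set once from the codegen heuristic at creation time. */
};

/* Mirrors GPU_pass_should_optimize(): it only exposes the precomputed hint. */
static bool mock_pass_should_optimize(const MockPass &pass)
{
  return pass.should_optimize;
}

int main()
{
  MockPass pass;
  pass.should_optimize = true;
  if (mock_pass_should_optimize(pass)) {
    /* A real caller would re-run pass generation with optimize_graph = true here,
     * then swap in the shader once the optimized variant finishes compiling. */
    std::printf("queue optimized shader variant\n");
  }
  return 0;
}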
| /** \} */ | /** \} */ | ||||
| /* -------------------------------------------------------------------- */ | /* -------------------------------------------------------------------- */ | ||||
| /** \name Compilation | /** \name Compilation | ||||
| * \{ */ | * \{ */ | ||||
| static int count_active_texture_sampler(GPUPass *pass, GPUShader *shader) | static int count_active_texture_sampler(GPUPass *pass, GPUShader *shader) | ||||
| { | { | ||||
| ▲ Show 20 Lines • Show All 153 Lines • Show Last 20 Lines | |||||