Page MenuHome
Paste P2285

D11993 alternative (per-thread vertex range), 501bca9f5b1265e6f336bc7c1878b9b50a70356a
ActivePublic

Authored by Campbell Barton (campbellbarton) on Jul 26 2021, 9:02 AM.
diff --git a/source/blender/blenkernel/intern/mesh_normals.cc b/source/blender/blenkernel/intern/mesh_normals.cc
index f496d6eada1..a02c6f484e2 100644
--- a/source/blender/blenkernel/intern/mesh_normals.cc
+++ b/source/blender/blenkernel/intern/mesh_normals.cc
@@ -201,10 +201,10 @@ void BKE_mesh_calc_normals_mapping_ex(MVert *mverts,
struct MeshCalcNormalsData {
const MPoly *mpolys;
+ uint mpolys_len;
const MLoop *mloop;
MVert *mverts;
float (*pnors)[3];
- float (*lnors_weighted)[3];
float (*vnors)[3];
};
@@ -218,66 +218,76 @@ static void mesh_calc_normals_poly_cb(void *__restrict userdata,
BKE_mesh_calc_poly_normal(mp, data->mloop + mp->loopstart, data->mverts, data->pnors[pidx]);
}
-static void mesh_calc_normals_poly_prepare_cb(void *__restrict userdata,
- const int pidx,
- const TaskParallelTLS *__restrict UNUSED(tls))
+static void mesh_calc_normals_poly_accum_for_poly(const MVert *mverts,
+ const MLoop *ml,
+ const uint totloop,
+ const uint vertex_range_min,
+ const uint vertex_range_max,
+ const float pnor[3],
+ float (*vnors)[3])
{
- MeshCalcNormalsData *data = (MeshCalcNormalsData *)userdata;
- const MPoly *mp = &data->mpolys[pidx];
- const MLoop *ml = &data->mloop[mp->loopstart];
- const MVert *mverts = data->mverts;
+ float(*edgevecbuf)[3] = (float(*)[3])BLI_array_alloca(edgevecbuf, (size_t)totloop);
+ const uint vertex_range = vertex_range_max - vertex_range_min;
+ bool found_any = false;
- float pnor_temp[3];
- float *pnor = data->pnors ? data->pnors[pidx] : pnor_temp;
- float(*lnors_weighted)[3] = data->lnors_weighted;
+#define VERT_INDEX_IN_RANGE(v) (((uint)((v)-vertex_range_min)) < vertex_range)
- const int nverts = mp->totloop;
- float(*edgevecbuf)[3] = (float(*)[3])BLI_array_alloca(edgevecbuf, (size_t)nverts);
-
- /* Polygon Normal and edge-vector */
- /* inline version of #BKE_mesh_calc_poly_normal, also does edge-vectors */
{
- int i_prev = nverts - 1;
+ int i_prev = totloop - 1;
+ bool v_prev_in_range = VERT_INDEX_IN_RANGE(ml[i_prev].v);
const float *v_prev = mverts[ml[i_prev].v].co;
const float *v_curr;
- zero_v3(pnor);
- /* Newell's Method */
- for (int i = 0; i < nverts; i++) {
- v_curr = mverts[ml[i].v].co;
- add_newell_cross_v3_v3v3(pnor, v_prev, v_curr);
-
- /* Unrelated to normalize, calculate edge-vector */
- sub_v3_v3v3(edgevecbuf[i_prev], v_prev, v_curr);
- normalize_v3(edgevecbuf[i_prev]);
+ for (int i = 0; i < totloop; i++) {
+ const uint vidx = ml[i].v;
+ const bool v_curr_in_range = VERT_INDEX_IN_RANGE(vidx);
+ v_curr = mverts[vidx].co;
+ if (UNLIKELY(v_curr_in_range || v_prev_in_range)) {
+ sub_v3_v3v3(edgevecbuf[i_prev], v_prev, v_curr);
+ normalize_v3(edgevecbuf[i_prev]);
+ found_any = true;
+ }
i_prev = i;
-
v_prev = v_curr;
- }
- if (UNLIKELY(normalize_v3(pnor) == 0.0f)) {
- pnor[2] = 1.0f; /* other axes set to 0.0 */
+ v_prev_in_range = v_curr_in_range;
}
}
- /* accumulate angle weighted face normal */
- /* inline version of #accumulate_vertex_normals_poly_v3,
- * split between this threaded callback and #mesh_calc_normals_poly_accum_cb. */
- {
- const float *prev_edge = edgevecbuf[nverts - 1];
+ /* Accumulate angle weighted face normal,
+ * inline version of #accumulate_vertex_normals_poly_v3. */
+ if (found_any) {
+ int i_prev = totloop - 1;
+ for (int i = 0; i < totloop; i++) {
+ const uint vidx = ml[i].v;
+ if (UNLIKELY(VERT_INDEX_IN_RANGE(vidx))) {
+ /* Calculate angle between the two poly edges incident on this vertex. */
+ const float fac = saacos(-dot_v3v3(edgevecbuf[i], edgevecbuf[i_prev]));
+ madd_v3_v3fl(vnors[vidx], pnor, fac);
+ }
+ i_prev = i;
+ }
+ }
- for (int i = 0; i < nverts; i++) {
- const int lidx = mp->loopstart + i;
- const float *cur_edge = edgevecbuf[i];
+#undef VERT_INDEX_IN_RANGE
+}
- /* calculate angle between the two poly edges incident on
- * this vertex */
- const float fac = saacos(-dot_v3v3(cur_edge, prev_edge));
+static void mesh_calc_normals_poly_accum(TaskPool *__restrict pool, void *taskdata)
+{
+ struct MeshCalcNormalsData *data = (struct MeshCalcNormalsData *)BLI_task_pool_user_data(pool);
+ const uint *vertex_range_data = (const uint *)taskdata;
+ const uint vertex_range_min = vertex_range_data[0];
+ const uint vertex_range_max = vertex_range_data[1];
+ const MVert *mverts = data->mverts;
+ const MPoly *mp = data->mpolys;
+ const MLoop *ml = data->mloop;
- /* Store for later accumulation */
- mul_v3_v3fl(lnors_weighted[lidx], pnor, fac);
+ const float(*pnors)[3] = data->pnors;
+ float(*vnors)[3] = data->vnors;
- prev_edge = cur_edge;
- }
+ for (int pidx = 0; pidx < data->mpolys_len; pidx++, mp++) {
+ mesh_calc_normals_poly_accum_for_poly(
+ mverts, ml, mp->totloop, vertex_range_min, vertex_range_max, pnors[pidx], vnors);
+ ml += mp->totloop;
}
}
@@ -303,19 +313,23 @@ void BKE_mesh_calc_normals_poly(MVert *mverts,
int numVerts,
const MLoop *mloop,
const MPoly *mpolys,
- int numLoops,
+ int UNUSED(numLoops),
int numPolys,
float (*r_polynors)[3],
const bool only_face_normals)
{
- float(*pnors)[3] = r_polynors;
-
TaskParallelSettings settings;
BLI_parallel_range_settings_defaults(&settings);
settings.min_iter_per_thread = 1024;
- if (only_face_normals) {
- BLI_assert((pnors != nullptr) || (numPolys == 0));
+ bool free_pnors = false;
+ float(*pnors)[3] = r_polynors;
+ if (pnors == nullptr) {
+ pnors = (float(*)[3])MEM_malloc_arrayN((size_t)numPolys, sizeof(*pnors), __func__);
+ free_pnors = true;
+ }
+
+ {
BLI_assert(r_vertnors == nullptr);
MeshCalcNormalsData data;
@@ -325,49 +339,57 @@ void BKE_mesh_calc_normals_poly(MVert *mverts,
data.pnors = pnors;
BLI_task_parallel_range(0, numPolys, &data, mesh_calc_normals_poly_cb, &settings);
- return;
}
- float(*vnors)[3] = r_vertnors;
- float(*lnors_weighted)[3] = (float(*)[3])MEM_malloc_arrayN(
- (size_t)numLoops, sizeof(*lnors_weighted), __func__);
- bool free_vnors = false;
+ if (only_face_normals == false) {
+ float(*vnors)[3] = r_vertnors;
+ bool free_vnors = false;
- /* first go through and calculate normals for all the polys */
- if (vnors == nullptr) {
- vnors = (float(*)[3])MEM_calloc_arrayN((size_t)numVerts, sizeof(*vnors), __func__);
- free_vnors = true;
- }
- else {
- memset(vnors, 0, sizeof(*vnors) * (size_t)numVerts);
- }
+ /* first go through and calculate normals for all the polys */
+ if (vnors == nullptr) {
+ vnors = (float(*)[3])MEM_calloc_arrayN((size_t)numVerts, sizeof(*vnors), __func__);
+ free_vnors = true;
+ }
+ else {
+ memset(vnors, 0, sizeof(*vnors) * (size_t)numVerts);
+ }
+
+ MeshCalcNormalsData data;
+ data.mpolys = mpolys;
+ data.mpolys_len = numPolys;
+ data.mloop = mloop;
+ data.mverts = mverts;
+ data.pnors = pnors;
+ data.vnors = vnors;
- MeshCalcNormalsData data;
- data.mpolys = mpolys;
- data.mloop = mloop;
- data.mverts = mverts;
- data.pnors = pnors;
- data.lnors_weighted = lnors_weighted;
- data.vnors = vnors;
+ /* Compute weighted loop values, accumulating poly normals into vertex normals. */
+ {
+ const int num_threads = BLI_task_scheduler_num_threads();
+ TaskPool *task_pool = BLI_task_pool_create(&data, TASK_PRIORITY_HIGH);
+ uint(*vertex_range_data)[2] = BLI_array_alloca(vertex_range_data, (size_t)num_threads);
+ const int vertex_range = numVerts / num_threads;
+ for (int i = 0; i < num_threads; i++) {
+ vertex_range_data[i][0] = vertex_range * i;
+ vertex_range_data[i][1] = vertex_range * (i + 1);
+ BLI_task_pool_push(
+ task_pool, mesh_calc_normals_poly_accum, vertex_range_data[i], false, nullptr);
+ }
- /* Compute poly normals, and prepare weighted loop normals. */
- BLI_task_parallel_range(0, numPolys, &data, mesh_calc_normals_poly_prepare_cb, &settings);
+ BLI_task_pool_work_and_wait(task_pool);
+ BLI_task_pool_free(task_pool);
+ }
- /* Actually accumulate weighted loop normals into vertex ones. */
- /* Unfortunately, not possible to thread that
- * (not in a reasonable, totally lock- and barrier-free fashion),
- * since several loops will point to the same vertex... */
- for (int lidx = 0; lidx < numLoops; lidx++) {
- add_v3_v3(vnors[mloop[lidx].v], data.lnors_weighted[lidx]);
- }
+ /* Normalize and validate computed vertex normals. */
+ BLI_task_parallel_range(0, numVerts, &data, mesh_calc_normals_poly_finalize_cb, &settings);
- /* Normalize and validate computed vertex normals. */
- BLI_task_parallel_range(0, numVerts, &data, mesh_calc_normals_poly_finalize_cb, &settings);
+ if (free_vnors) {
+ MEM_freeN(vnors);
+ }
+ }
- if (free_vnors) {
- MEM_freeN(vnors);
+ if (free_pnors) {
+ MEM_freeN(pnors);
}
- MEM_freeN(lnors_weighted);
}
void BKE_mesh_ensure_normals(Mesh *mesh)

Event Timeline

This could be improved:

  • You should not have to finalize the vnors in a second threaded pass, since at the end of mesh_calc_normals_poly_accum you know all the vertices in your range are completely accumulated, you can do normalization/copying to float there directly.
  • Doing a quick loop over loops in mesh_calc_normals_poly_accum and only calling mesh_calc_normals_poly_accum_for_poly on first matching/valid loop if any.

Combined, those give me about 10% improvements... But this is still way slower than original code, so agree this is a dead end. ;)

@Bastien Montagne (mont29) thanks for looking into this, feel free to update this paste (or submit new paste).

My concern with this method - generally is that some threads take longer than others, so the more work that's moved into each function call could hold up others.

Nevertheless if you managed to get a speedup over this patch, it could be worth including for reference.