core/decode: default to processing 1 transmit with decode - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: 9b71d439faf4ff9a5e4f5a92a036e0ffd0be29b9
Parent: 4747c52afb2223adac069e10294f00eb7df75c79
Author: Randy Palamar
Date:   Mon, 16 Mar 2026 13:27:51 -0600

core/decode: default to processing 1 transmit with decode

Unlike when OpenGL was compiling the shader, processing a single
transmit here just performs better. The to_process option is kept
around because I haven't remeasured decoding with 48 transmits.

Diffstat:
M beamformer_core.c  | 4 +---

1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/beamformer_core.c b/beamformer_core.c
@@ -363,6 +363,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 				}
 
 				db->dilate_output = run_cuda_hilbert;
+				db->to_process    = 1;
 
 				if (db->decode_mode == BeamformerDecodeMode_None) {
 					sd->layout = (uv3){{subgroup_size, 1, 1}};
@@ -372,7 +373,6 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 					sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
 				} else if (db->transmit_count > 40) {
 					db->use_shared_memory = 1;
-					db->to_process        = 2;
 
 					if (db->transmit_count == 48)
 						db->to_process = db->transmit_count / 16;
@@ -385,8 +385,6 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 					sd->dispatch.y = (u32)ceil_f32((f32)channel_chunk_count              / (f32)sd->layout.y);
 					sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->to_process);
 				} else {
-					db->to_process = 1;
-
 					/* NOTE(rnp): register caching. using more threads will cause the compiler to do
 					 * contortions to avoid spilling registers. using less gives higher performance */
 					sd->layout = (uv3){{subgroup_size / 2, 1, 1}};