Commit: 9b71d439faf4ff9a5e4f5a92a036e0ffd0be29b9
Parent: 4747c52afb2223adac069e10294f00eb7df75c79
Author: Randy Palamar
Date: Mon, 16 Mar 2026 13:27:51 -0600
core/decode: default to processing 1 transmit with decode
Unlike when OpenGL was compiling the shader, processing a single
element here simply performs better. The to_process option is kept
around because decoding with 48 transmits has not been remeasured yet.
Diffstat:
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/beamformer_core.c b/beamformer_core.c
@@ -363,6 +363,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
}
db->dilate_output = run_cuda_hilbert;
+ db->to_process = 1;
if (db->decode_mode == BeamformerDecodeMode_None) {
sd->layout = (uv3){{subgroup_size, 1, 1}};
@@ -372,7 +373,6 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
} else if (db->transmit_count > 40) {
db->use_shared_memory = 1;
- db->to_process = 2;
if (db->transmit_count == 48)
db->to_process = db->transmit_count / 16;
@@ -385,8 +385,6 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
sd->dispatch.y = (u32)ceil_f32((f32)channel_chunk_count / (f32)sd->layout.y);
sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->to_process);
} else {
- db->to_process = 1;
-
/* NOTE(rnp): register caching. using more threads will cause the compiler to do
* contortions to avoid spilling registers. using less gives higher performance */
sd->layout = (uv3){{subgroup_size / 2, 1, 1}};