core: implement global channel chunk processing - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: 36abc76bf3aaa47ba2d99de2f8c6afcf8348a97e
Parent: f33f8f7270186a95011e8cf201acb3b50733cd4f
Author: Randy Palamar
Date:   Mon,  4 May 2026 13:39:03 -0600

core: implement global channel chunk processing

Every channel in the RF data is processed entirely independently. By
processing channels in blocks of 16 we can provide a better chance
of data remaining in the GPU's L2 cache across the whole pipeline.
This also reduces the amount of temporary space (ping pong
buffer) we need to reserve since only portions of the data are
used at a time.

This provides a nice speed boost for the Demodulation stage but
seems to have little effect on the following stages. While this
brings the performance closer to the OpenGL version it still lags
behind. This is because DAS is taking ~10ms longer than before.
That will need further investigation. I suspect it's because in
current Mesa the compiler used for OpenGL is different than the
one used for Vulkan. Specifically the Vulkan one is emitting 4
8-byte loads instead of 2 16-byte loads when doing cubic
interpolation. The OpenGL compiled shader is still faster when not
using Cubic interpolation but the data upload rate is worse in
OpenGL due to additional driver overhead.

Diffstat:
M beamformer.meta  | 4 +++-
M beamformer_core.c  | 88 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M beamformer_internal.h  | 4 ++++
M generated/beamformer.meta.c  | 11 +++++++----
M shaders/das.glsl  | 46 +++++++++++++++++++++++-----------------------
M ui.c  | 4 ++--
M vulkan.c  | 4 +++-

7 files changed, 92 insertions(+), 69 deletions(-)
diff --git a/beamformer.meta b/beamformer.meta
@@ -1,3 +1,4 @@
+@Constant(16)   ChannelChunkCount
 @Constant(4)    FilterSlots
 @Constant(4096) MaxBacklogFrames
 @Constant(256)  MaxChannelCount
@@ -347,6 +348,7 @@
 			[AcquisitionCount           acquisition_count            U32]
 			[AcquisitionKind            acquisition_kind             U32]
 			[ChannelCount               channel_count                U32]
+			[ChannelChunkCount          channel_chunk_count          U32]
 			[InterpolationMode          interpolation_mode           U32]
 			[SampleCount                sample_count                 U32]
 			[TransmitReceiveOrientation transmit_receive_orientation U32]
@@ -373,7 +375,7 @@
 			[output_size_y     U32]
 			[output_size_z     U32]
 			[cycle_t           U32]
-			[channel_t         S32]
+			[channel_offset    S32]
 		}
 	}
 
diff --git a/beamformer_core.c b/beamformer_core.c
@@ -314,7 +314,13 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 		sampling_frequency /= 2 * (f32)decimation_rate;
 	}
 
-	cp->rf_size = sample_count * pb->parameters.channel_count * pb->parameters.acquisition_count;
+	cp->raw_channel_byte_stride  = pb->parameters.sample_count * pb->parameters.acquisition_count * beamformer_data_kind_byte_size[data_kind];
+
+	cp->channel_count = pb->parameters.channel_count;
+
+	u32 channel_chunk_count = Min(cp->channel_count, BeamformerChannelChunkCount);
+
+	cp->rf_size = sample_count * pb->parameters.acquisition_count * channel_chunk_count;
 	if (cp->iq_pipeline) cp->rf_size *= 8;
 	else                 cp->rf_size *= 4;
 
@@ -326,6 +332,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 
 	u32 subgroup_size = vk_gpu_info()->subgroup_size;
 
+	cp->first_image_shader_index = 0;
 	cp->pipeline.shader_count = 0;
 	for (u32 i = 0; i < pb->pipeline.shader_count; i++) {
 		BeamformerShaderParameters *sp = pb->pipeline.parameters + i;
@@ -386,7 +393,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 					sd->layout = (uv3){{subgroup_size, 1, 1}};
 
 					sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
-					sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count     / (f32)sd->layout.y);
+					sd->dispatch.y = (u32)ceil_f32((f32)channel_chunk_count              / (f32)sd->layout.y);
 					sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
 				} else if (db->transmit_count > 40) {
 					db->use_shared_memory = 1;
@@ -400,18 +407,17 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 					sd->layout = (uv3){{4, 1, use_16z? 16 : 32}};
 
 					sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
-					sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count     / (f32)sd->layout.y);
+					sd->dispatch.y = (u32)ceil_f32((f32)channel_chunk_count              / (f32)sd->layout.y);
 					sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->to_process);
 				} else {
 					db->to_process = 1;
 
 					/* NOTE(rnp): register caching. using more threads will cause the compiler to do
 					 * contortions to avoid spilling registers. using less gives higher performance */
-					/* TODO(rnp): may need to be adjusted to 16 on NVIDIA */
 					sd->layout = (uv3){{subgroup_size / 2, 1, 1}};
 
-					sd->dispatch.x = (u32)ceil_f32((f32)sample_count                 / (f32)sd->layout.x);
-					sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
+					sd->dispatch.x = (u32)ceil_f32((f32)sample_count        / (f32)sd->layout.x);
+					sd->dispatch.y = (u32)ceil_f32((f32)channel_chunk_count / (f32)sd->layout.y);
 					sd->dispatch.z = 1;
 				}
 
@@ -490,16 +496,17 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 					fb->sample_count           = sample_count;
 				}
 
-				/* TODO(rnp): filter may need a different dispatch layout */
-				sd->layout     = (uv3){{128, 1, 1}};
+				sd->layout     = (uv3){{subgroup_size, 1, 1}};
 				sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
-				sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count     / (f32)sd->layout.y);
+				sd->dispatch.y = (u32)ceil_f32((f32)channel_chunk_count              / (f32)sd->layout.y);
 				sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
 			}
 		}break;
 
 		case BeamformerShaderKind_DAS:{
 			if (compute_plan_push_shader(cp, shader, sp)) {
+				cp->first_image_shader_index = cp->pipeline.shader_count;
+
 				BeamformerDASBakeParameters *db = &sd->bake.DAS;
 				db->data_kind = BeamformerDataKind_Float32;
 				if (cp->iq_pipeline) db->data_kind = BeamformerDataKind_Float32Complex;
@@ -520,6 +527,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 				db->transmit_angle         = pb->parameters.focal_vector.E[0];
 				db->focus_depth            = pb->parameters.focal_vector.E[1];
 				db->transmit_receive_orientation = pb->parameters.transmit_receive_orientation;
+				db->channel_chunk_count    = channel_chunk_count;
 
 				// NOTE(rnp): old gcc will miscompile an assignment
 				mem_copy(cp->xdc_transform.E, pb->parameters.xdc_transform.E, sizeof(cp->xdc_transform));
@@ -564,6 +572,9 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 		}
 	}
 	cp->pipeline.data_kind = data_kind;
+
+	if (cp->first_image_shader_index == 0)
+		cp->first_image_shader_index = cp->pipeline.shader_count;
 }
 
 function void
@@ -782,7 +793,7 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp,
 
 function void
 do_compute_shader(BeamformerCtx *ctx, VulkanHandle cmd, BeamformerComputePlan *cp, BeamformerFrame *frame,
-                  u32 shader_slot, Arena arena, u64 rf_pointer)
+                  u32 shader_slot, u32 channel_offset, u64 rf_pointer, Arena arena)
 {
 	BeamformerComputeContext *cc = &ctx->compute_context;
 
@@ -890,23 +901,13 @@ do_compute_shader(BeamformerCtx *ctx, VulkanHandle cmd, BeamformerComputePlan *c
 			.output_size_y      = cp->output_points.y,
 			.output_size_z      = cp->output_points.z,
 			.cycle_t            = das_cycle_t++,
+			.channel_offset     = channel_offset,
 		};
 		mem_copy(pc.voxel_transform.E, cp->voxel_transform.E, sizeof(pc.voxel_transform));
 		mem_copy(pc.xdc_transform.E,   cp->xdc_transform.E,   sizeof(pc.xdc_transform));
 
 		b32 coherent = cp->shader_descriptors[shader_slot].bake.DAS.coherency_weighting;
 
-		i32 loop_end;
-		if (cp->acquisition_kind == BeamformerAcquisitionKind_RCA_VLS ||
-		    cp->acquisition_kind == BeamformerAcquisitionKind_RCA_TPW)
-		{
-			/* NOTE(rnp): to avoid repeatedly sampling the whole focal vectors
-			 * texture we loop over transmits for VLS/TPW */
-			loop_end = (i32)cp->acquisition_count;
-		} else {
-			loop_end = (i32)cp->shader_descriptors[shader_slot].bake.DAS.channel_count;
-		}
-
 		GPUMemoryBarrierInfo memory_barriers[2] = {
 			{
 				.gpu_buffer = b,
@@ -924,15 +925,8 @@ do_compute_shader(BeamformerCtx *ctx, VulkanHandle cmd, BeamformerComputePlan *c
 		vk_command_buffer_memory_barriers(cmd, memory_barriers, 1 + coherent);
 
 		vk_command_push_constants(cmd, 0, sizeof(pc), &pc);
-		for (i32 index = 0; index < loop_end; index++) {
-			if (index != 0) {
-				pc.channel_t = index;
-				vk_command_push_constants(cmd, offsetof(BeamformerDASPushConstants, channel_t),
-				                          sizeof(pc.channel_t), &pc.channel_t);
-			}
-			vk_command_dispatch_compute(cmd, dispatch);
-			vk_command_buffer_memory_barriers(cmd, memory_barriers, 1 + coherent);
-		}
+		vk_command_dispatch_compute(cmd, dispatch);
+		vk_command_buffer_memory_barriers(cmd, memory_barriers, 1 + coherent);
 	}break;
 
 	case BeamformerShaderKind_CoherencyWeighting:{
@@ -1171,9 +1165,20 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena)
 				slot = (rf->compute_index - 1) % countof(rf->upload_complete_values);
 			}
 
-			for (u32 i = 0; i < cp->pipeline.shader_count; i++) {
-				do_compute_shader(ctx, cmd, cp, frame, i, *arena,
-				                  rf->buffer.gpu_pointer + slot * rf->active_rf_size);
+			for (u32 channel_offset = 0;
+			     channel_offset < cp->channel_count;
+			     channel_offset += BeamformerChannelChunkCount)
+			{
+				u64 rf_pointer = rf->buffer.gpu_pointer + slot * rf->active_rf_size;
+				rf_pointer += cp->raw_channel_byte_stride * channel_offset;
+				for (u32 i = 0; i < cp->first_image_shader_index; i++) {
+					do_compute_shader(ctx, cmd, cp, frame, i, channel_offset, rf_pointer, *arena);
+					vk_command_timestamp(cmd);
+				}
+			}
+
+			for (u32 i = cp->first_image_shader_index; i < cp->pipeline.shader_count; i++) {
+				do_compute_shader(ctx, cmd, cp, frame, i, 0, 0, *arena);
 				vk_command_timestamp(cmd);
 			}
 
@@ -1188,10 +1193,13 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena)
 			{
 				Arena scratch    = *arena;
 				/* NOTE(rnp): this blocks until work completes */
-				u64 * timestamps = vk_command_read_timestamps(VulkanTimeline_Compute, &scratch);
+				u64 *timestamps  = vk_command_read_timestamps(VulkanTimeline_Compute, &scratch);
 
-				u64 last_time    = timestamps[0] > 0 ? timestamps[1] : 0;
+				i32 steps        = ((i32)cp->channel_count / BeamformerChannelChunkCount) - 1;
+				i32 step         = 0;
 				u32 shader_index = 0;
+				u64 last_time    = timestamps[0] > 0 ? timestamps[1] : 0;
+
 				for (u64 i = 2; i < timestamps[0] + 1; i++) {
 					push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){
 						.kind        = ComputeTimingInfoKind_Shader,
@@ -1200,7 +1208,12 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena)
 						.timer_count = timestamps[i] - last_time,
 					});
 					last_time = timestamps[i];
+
 					shader_index++;
+					if (shader_index == cp->first_image_shader_index && step < steps) {
+						shader_index = 0;
+						step++;
+					}
 				}
 			}
 
@@ -1241,7 +1254,7 @@ coalesce_timing_table(ComputeTimingTable *t, ComputeShaderStats *stats)
 	 * info item. this could result in garbage entries but they shouldn't really matter */
 
 	u32 target = atomic_load_u32(&t->write_index);
-	u32 stats_index = (stats->latest_frame_index + 1) % countof(stats->table.times);
+	u32 stats_index = stats->latest_frame_index;
 
 	b32 has_rf = 0;
 	f32 gpu_clocks_to_nano = 1.0e-9f * vk_gpu_info()->timestamp_period_ns;
@@ -1263,8 +1276,7 @@ coalesce_timing_table(ComputeTimingTable *t, ComputeShaderStats *stats)
 		case ComputeTimingInfoKind_ComputeFrameEnd:{
 			assert(t->compute_frame_active == 1);
 			t->compute_frame_active = 0;
-			stats->latest_frame_index = stats_index;
-			stats_index = (stats_index + 1) % countof(stats->table.times);
+			stats_index = stats->latest_frame_index = (stats_index + 1) % countof(stats->table.times);
 			stats->table.shader_count = t->in_flight_shader_count;
 			mem_copy(stats->table.shader_ids, t->in_flight_shader_ids, sizeof(t->in_flight_shader_ids));
 		}break;
diff --git a/beamformer_internal.h b/beamformer_internal.h
@@ -269,6 +269,10 @@ struct BeamformerComputePlan {
 
 	VulkanHandle vulkan_pipelines[BeamformerMaxComputeShaderStages];
 
+	u32 first_image_shader_index;
+	u32 channel_count;
+	u32 raw_channel_byte_stride;
+
 	u32 dirty_programs;
 
 	BeamformerAcquisitionKind acquisition_kind;
diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c
@@ -3,6 +3,7 @@
 // GENERATED CODE
 
 // NOTE: Constants (Integer)
+#define BeamformerChannelChunkCount        (16)
 #define BeamformerFilterSlots              (4)
 #define BeamformerMaxBacklogFrames         (4096)
 #define BeamformerMaxChannelCount          (256)
@@ -153,6 +154,7 @@ typedef struct {
 	u32 acquisition_count;
 	u32 acquisition_kind;
 	u32 channel_count;
+	u32 channel_chunk_count;
 	u32 interpolation_mode;
 	u32 sample_count;
 	u32 transmit_receive_orientation;
@@ -195,7 +197,7 @@ typedef struct {
 	u32 output_size_y;
 	u32 output_size_z;
 	u32 cycle_t;
-	i32 channel_t;
+	i32 channel_offset;
 } BeamformerDASPushConstants;
 
 typedef struct {
@@ -616,7 +618,7 @@ read_only global s8 beamformer_shader_global_header_strings[] = {
 	"  uint32_t output_size_y;\n"
 	"  uint32_t output_size_z;\n"
 	"  uint32_t cycle_t;\n"
-	"  int32_t  channel_t;\n"
+	"  int32_t  channel_offset;\n"
 	"};\n"
 	"\n"),
 	s8_comp(""
@@ -749,6 +751,7 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
 		s8_comp("AcquisitionCount"),
 		s8_comp("AcquisitionKind"),
 		s8_comp("ChannelCount"),
+		s8_comp("ChannelChunkCount"),
 		s8_comp("InterpolationMode"),
 		s8_comp("SampleCount"),
 		s8_comp("TransmitReceiveOrientation"),
@@ -772,7 +775,7 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
 read_only global u32 beamformer_shader_bake_parameter_float_bits[] = {
 	0x00000000UL,
 	0x00006000UL,
-	0x0003f800UL,
+	0x0007f000UL,
 	0x00000000UL,
 	0x00000000UL,
 	0x00000000UL,
@@ -783,7 +786,7 @@ read_only global u32 beamformer_shader_bake_parameter_float_bits[] = {
 read_only global u8 beamformer_shader_bake_parameter_counts[] = {
 	12,
 	15,
-	18,
+	19,
 	0,
 	0,
 	1,
diff --git a/shaders/das.glsl b/shaders/das.glsl
@@ -5,14 +5,16 @@
     #define RESULT_COHERENT_CAST(a)   (a).x
     #define RESULT_INCOHERENT_CAST(a) (a).y
   #endif
-  #define SAMPLE_TYPE float
+  #define SAMPLE_TYPE  float
+  #define SAMPLE_BYTES 4
 #elif DataKind == DataKind_Float32Complex
   #if CoherencyWeighting
     #define RESULT_TYPE               vec3
     #define RESULT_COHERENT_CAST(a)   (a).xy
     #define RESULT_INCOHERENT_CAST(a) (a).z
   #endif
-  #define SAMPLE_TYPE vec2
+  #define SAMPLE_TYPE  vec2
+  #define SAMPLE_BYTES 8
 #else
   #error DataKind unsupported for DAS
 #endif
@@ -66,7 +68,7 @@ vec2 rotate_iq(const vec2 iq, const float time)
 #endif
 
 /* NOTE: See: https://cubic.org/docs/hermite.htm */
-SAMPLE_TYPE cubic(const int base_index, const float t)
+SAMPLE_TYPE cubic(const RF rf, const float t)
 {
 	const mat4 h = mat4(
 		 2, -3,  0, 1,
@@ -76,10 +78,10 @@ SAMPLE_TYPE cubic(const int base_index, const float t)
 	);
 
 	SAMPLE_TYPE samples[4] = {
-		RF(rf_data).values[base_index + 0],
-		RF(rf_data).values[base_index + 1],
-		RF(rf_data).values[base_index + 2],
-		RF(rf_data).values[base_index + 3],
+		rf.values[0],
+		rf.values[1],
+		rf.values[2],
+		rf.values[3],
 	};
 
 	vec4        S  = vec4(t * t * t, t * t, t, 1);
@@ -101,23 +103,24 @@ SAMPLE_TYPE cubic(const int base_index, const float t)
 SAMPLE_TYPE sample_rf(const int rf_offset, const float index)
 {
 	SAMPLE_TYPE result = SAMPLE_TYPE(0);
+	RF rf = RF(rf_data + SAMPLE_BYTES * rf_offset);
 	switch (InterpolationMode) {
 	case InterpolationMode_Nearest:{
 		if (int(index) >= 0 && int(round(index)) < SampleCount)
-			result = rotate_iq(RF(rf_data).values[rf_offset + int(round(index))], index / SamplingFrequency);
+			result = rotate_iq(rf.values[int(round(index))], index / SamplingFrequency);
 	}break;
 	case InterpolationMode_Linear:{
 		if (int(index) >= 0 && int(index) < SampleCount - 1) {
 			float tk, t = modf(index, tk);
-			int n = rf_offset + int(tk);
-			result = (1 - t) * RF(rf_data).values[n] + t * RF(rf_data).values[n + 1];
+			int n = int(tk);
+			result = (1 - t) * rf.values[n] + t * rf.values[n + 1];
 			result = rotate_iq(result, index / SamplingFrequency);
 		}
 	}break;
 	case InterpolationMode_Cubic:{
 		if (int(index) > 0 && int(index) < SampleCount - 2) {
 			float tk, t = modf(index, tk);
-			result = rotate_iq(cubic(rf_offset + int(index), t), index / SamplingFrequency);
+			result = rotate_iq(cubic(RF(rf_data + SAMPLE_BYTES * (rf_offset + int(index))), t), index / SamplingFrequency);
 		}
 	}break;
 	}
@@ -204,10 +207,8 @@ float rca_transmit_distance(const vec3 world_point, const vec2 focal_vector, con
 
 RESULT_TYPE RCA(const vec3 world_point)
 {
-	const int16_t acquisition_start = int16_t(channel_t);
-	const int16_t acquisition_end   = int16_t(channel_t + 1);
 	RESULT_TYPE result = RESULT_TYPE(0);
-	for (int16_t acquisition = acquisition_start; acquisition < acquisition_end; acquisition++) {
+	for (int16_t acquisition = int16_t(0); acquisition < int16_t(AcquisitionCount); acquisition++) {
 		const uint16_t tx_rx_orientation = tx_rx_orientation_for_acquisition(acquisition);
 		const bool     rx_rows           = RX_ORIENTATION(tx_rx_orientation) == RCAOrientation_Rows;
 		const vec2     focal_vector      = focal_vector_for_acquisition(acquisition);
@@ -216,7 +217,8 @@ RESULT_TYPE RCA(const vec3 world_point)
 
 		int rf_offset  = acquisition * SampleCount;
 		rf_offset     -= int(InterpolationMode == InterpolationMode_Cubic);
-		for (int rx_channel = 0; rx_channel < ChannelCount; rx_channel++) {
+		for (int chunk_channel = 0; chunk_channel < ChannelChunkCount; chunk_channel++) {
+			int   rx_channel     = channel_offset + chunk_channel;
 			vec3  rx_center      = vec3(rx_channel * xdc_element_pitch, 0);
 			vec2  receive_vector = xdc_world_point - rca_plane_projection(rx_center, rx_rows);
 			float a_arg          = abs(FNumber * receive_vector.x / abs(xdc_world_point.y));
@@ -246,9 +248,9 @@ RESULT_TYPE HERCULES(const vec3 world_point)
 	const float apodization_test = 0.25f / (f_number_over_z * f_number_over_z);
 
 	RESULT_TYPE result = RESULT_TYPE(0);
-	const int rx_channel = channel_t;
-	{
-		int rf_offset   = rx_channel * SampleCount * AcquisitionCount + Sparse * SampleCount;
+	for (float chunk_channel = 0; chunk_channel < float(ChannelChunkCount); chunk_channel += 1.0f) {
+		float rx_channel = float(channel_offset) + chunk_channel;
+		int rf_offset   = int(chunk_channel) * SampleCount * AcquisitionCount + Sparse * SampleCount;
 		rf_offset      -= int(InterpolationMode == InterpolationMode_Cubic);
 
 		// NOTE(rnp): this wouldn't be so messy if we just forced an orientation like with FORCES
@@ -288,21 +290,19 @@ RESULT_TYPE HERCULES(const vec3 world_point)
 
 RESULT_TYPE FORCES(const vec3 xdc_world_point)
 {
-	const int16_t rx_channel_start = int16_t(channel_t);
-	const int16_t rx_channel_end   = int16_t(channel_t + 1);
-
 	RESULT_TYPE result = RESULT_TYPE(0);
 
 	float z_delta_squared     = xdc_world_point.z * xdc_world_point.z;
 	float transmit_y_delta    = xdc_world_point.y - xdc_element_pitch.y * ChannelCount / 2;
 	float transmit_yz_squared = transmit_y_delta * transmit_y_delta + z_delta_squared;
 
-	for (int16_t rx_channel = rx_channel_start; rx_channel < rx_channel_end; rx_channel++) {
+	for (float chunk_channel = 0; chunk_channel < float(ChannelChunkCount); chunk_channel += 1.0f) {
+		float rx_channel      = float(channel_offset) + chunk_channel;
 		float receive_x_delta = xdc_world_point.x - rx_channel * xdc_element_pitch.x;
 		float a_arg           = abs(FNumber * receive_x_delta / xdc_world_point.z);
 
 		if (a_arg < 0.5f) {
-			int rf_offset  = rx_channel * SampleCount * AcquisitionCount + Sparse * SampleCount;
+			int rf_offset  = int(chunk_channel) * SampleCount * AcquisitionCount + Sparse * SampleCount;
 			rf_offset     -= int(InterpolationMode == InterpolationMode_Cubic);
 
 			float receive_index = sample_index(sqrt(receive_x_delta * receive_x_delta + z_delta_squared));
diff --git a/ui.c b/ui.c
@@ -2770,7 +2770,7 @@ draw_compute_stats_bar_view(BeamformerUI *ui, Arena arena, ComputeShaderStats *s
 	for (u32 i = 0; i < countof(frame_labels); i++) {
 		TableCell *cells = table_push_row(table, &arena, TRK_CELLS)->data;
 		cells[0].text = frame_labels[i];
-		u32 frame_index = (stats->latest_frame_index - i) % countof(stats->table.times);
+		u32 frame_index = (stats->latest_frame_index - i - 1) % countof(stats->table.times);
 		for (u32 j = 0; j < stages; j++)
 			total_times[i] += stats->table.times[frame_index][j];
 	}
@@ -2794,7 +2794,7 @@ draw_compute_stats_bar_view(BeamformerUI *ui, Arena arena, ComputeShaderStats *s
 		ts.limits.size.w = cr.size.w;
 		draw_table_cell(ui, arena, (TableCell *)row->data, cr, table->alignment[0], ts, mouse);
 
-		u32 frame_index = (stats->latest_frame_index - row_index) % countof(stats->table.times);
+		u32 frame_index = (stats->latest_frame_index - row_index - 1) % countof(stats->table.times);
 		f32 total_width = average_width * total_times[row_index] / compute_time_sum;
 		Rect rect;
 		rect.pos  = v2_add(cr.pos, (v2){{cr.size.w + table->cell_pad.w , cr.size.h * 0.15f}});
diff --git a/vulkan.c b/vulkan.c
@@ -13,7 +13,7 @@
 #define ValidVulkanHandle(h) ((h).value[0] != 0)
 
 #define MaxCommandBuffersInFlight  BeamformerMaxRawDataFramesInFlight
-#define MaxCommandBufferTimestamps (64)
+#define MaxCommandBufferTimestamps (1024)
 
 typedef enum {
 	VulkanQueueKind_Graphics,
@@ -2467,6 +2467,8 @@ vk_command_read_timestamps(VulkanTimeline timeline, Arena *arena)
 				result = push_array(arena, u64, count + 1);
 				result[0] = count;
 
+				vk_host_wait_timeline(timeline, vq->command_buffer_submission_values[index], -1ULL);
+
 				vkGetQueryPoolResults(vk->device, vq->query_pool, index * MaxCommandBufferTimestamps, count,
 				                      count * sizeof(u64), result + 1, 8, VK_QUERY_RESULT_WAIT_BIT);
 			}

M	beamformer.meta	\|	4	+++-
M	beamformer_core.c	\|	88	+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M	beamformer_internal.h	\|	4	++++
M	generated/beamformer.meta.c	\|	11	+++++++----
M	shaders/das.glsl	\|	46	+++++++++++++++++++++++-----------------------
M	ui.c	\|	4	++--
M	vulkan.c	\|	4	+++-