core: plan_compute: use DAG to ensure compatible data layouts between stages - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: cac06ceb44ff0bf8626c0fcc95a8292998237d58
Parent: da92b5ec7915c6ad2fd1401bc40349e2706c9c2c
Author: Randy Palamar
Date:   Wed, 25 Mar 2026 07:23:09 -0600

core: plan_compute: use DAG to ensure compatible data layouts between stages

The existing code was becoming very difficult to follow. Instead
we can build up a graph over a couple passes to automatically
ensure compatible data layout and data kind between stages.

The first pass builds a graph based on what was provided through
the API for configuration. At this point any stages which require
a fixed data layout fill that into their node. Extraneous stages
are removed and internal shaders such as CoherencyWeighting are
added if required.

The second pass ensures compatibility. The root node always has a
fixed data layout (the input rf data provided through the API).
All nodes after the root look at the previous node's output and
decide if it's compatible. If both have a don't care layout the
previous node's input layout is copied forward. If only one node
has a required layout the other node uses the required layout. If
both the output of the previous node and the input of the current
node have required layouts then a Reshape node is inserted between
them. The same applies to each node's data kind.

The third pass builds the final pipeline with bake parameters
specified. This is almost the same as before except each node no
longer needs to care about where it is located relative to other
nodes.

There may still be some bugs but they should be much easier to
resolve now. Furthermore this sort of design is also applicable to
the vulkan compute chain and a similar simplification should
happen there later.

Diffstat:
M beamformer.meta  | 45 +++++++++++++++------------------------------
M beamformer_core.c  | 582 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M beamformer_internal.h  | 3 ++-
M generated/beamformer.meta.c  | 101 ++++++++++++++++++++++++++++++++++---------------------------------------------
M lib/ogl_beamformer_lib.c  | 4 +---
M math.c  | 24 ++++++++++++++++++++++++
M shaders/coherency_weighting.glsl  | 4 ++--
M shaders/das.glsl  | 20 ++++++++++----------
M shaders/decode.glsl  | 101 +++++++++++++++++++++++++++++++++----------------------------------------------
M shaders/filter.glsl  | 78 ++++++++++++++++++------------------------------------------------------------
M shaders/reshape.glsl  | 63 ++++++++++++---------------------------------------------------
M util.h  | 8 ++++++++

12 files changed, 513 insertions(+), 520 deletions(-)
diff --git a/beamformer.meta b/beamformer.meta
@@ -37,14 +37,14 @@
 	4X
 }
 
-@Table([name size elements complex]) DataKindTable
+@Table([name size elements complex glsl]) DataKindTable
 {
-	[Int16          2 1 0]
-	[Int16Complex   2 2 1]
-	[Float32        4 1 0]
-	[Float32Complex 4 2 1]
-	[Float16        2 1 0]
-	[Float16Complex 2 2 1]
+	[Int16          2 1 0 int16_t  ]
+	[Int16Complex   2 2 1 i16vec2  ]
+	[Float32        4 1 0 float32_t]
+	[Float32Complex 4 2 1 f32vec2  ]
+	[Float16        2 1 0 float16_t]
+	[Float16Complex 2 2 1 f16vec2  ]
 }
 @Expand(DataKindTable) @Enumeration(`$(name)`) DataKind
 
@@ -247,6 +247,14 @@
 	@Expand(DataKindTable) `	$(complex),`
 	`};`
 	``
+	`read_only global s8 beamformer_data_kind_glsl_type[] = {`
+	@Expand(DataKindTable) `	s8_comp("$(glsl)"),`
+	`};`
+	``
+	`read_only global s8 beamformer_data_kind_s8[] = {`
+	@Expand(DataKindTable) `	s8_comp("$(name)"),`
+	`};`
+	``
 	`read_only global u8 beamformer_contrast_mode_samples[] = {`
 	@Expand(ContrastModeTable) `	$(samples),`
 	`};`
@@ -283,12 +291,10 @@
 
 	@Shader(decode.glsl) Decode
 	{
-		@Enumeration DataKind
 		@Enumeration DecodeMode
 
 		@Bake
 		{
-			[DataKind             data_kind              U32]
 			[UseSharedMemory      use_shared_memory      B32]
 			[DecodeMode           decode_mode            U32]
 			[OutputChannelStride  output_channel_stride  U32]
@@ -308,14 +314,11 @@
 			[hadamard_buffer  U64]
 			[rf_buffer        U64]
 			[output_buffer    U64]
-			[output_rf_buffer U64]
-			[first_pass       B32]
 		}
 	}
 
 	@Shader(filter.glsl) Filter
 	{
-		@Enumeration DataKind
 		@Enumeration ShaderBufferSlot
 		@Enumeration ShaderResourceKind
 
@@ -323,10 +326,8 @@
 
 		@Bake
 		{
-			[DataKind              data_kind              U32]
 			[Demodulate            demodulate             U32]
 			[ComplexFilter         complex_filter         U32]
-			[OutputFloats          output_floats          U32]
 			[DecimationRate        decimation_rate        U32]
 			[FilterLength          filter_length          U32]
 			[InputChannelStride    input_channel_stride   U32]
@@ -355,7 +356,6 @@
 		@Constant    MaxChannelCount
 
 		@Enumeration AcquisitionKind
-		@Enumeration DataKind
 		@Enumeration InterpolationMode
 		@Enumeration RCAOrientation
 		@Enumeration ShaderBufferSlot
@@ -365,7 +365,6 @@
 
 		@Bake
 		{
-			[DataKind                   data_kind                    U32]
 			[CoherencyWeighting         coherency_weighting          U32]
 			[SingleFocus                single_focus                 U32]
 			[SingleOrientation          single_orientation           U32]
@@ -406,7 +405,6 @@
 
 	@Shader(sum.glsl) Sum
 	{
-		@Enumeration DataKind
 		@PushConstants
 		{
 			[output_data    U64]
@@ -425,13 +423,6 @@
 {
 	@Shader(coherency_weighting.glsl) CoherencyWeighting
 	{
-		@Enumeration DataKind
-
-		@Bake
-		{
-			[DataKind data_kind U32]
-		}
-
 		@PushConstants
 		{
 			[left_side_buffer  U64]
@@ -445,12 +436,8 @@
 
 	@Shader(reshape.glsl) Reshape
 	{
-		@Enumeration DataKind
-
 		@Bake
 		{
-			[InputDataKind  input_data_kind  U32]
-			[OutputDataKind output_data_kind U32]
 			[SizeX          size_x           U32]
 			[SizeY          size_y           U32]
 			[SizeZ          size_z           U32]
@@ -491,8 +478,6 @@
 {
 	@RenderShader RenderBeamformed
 	{
-		@Enumeration DataKind
-
 		@VertexShader(render_3d.vert.glsl)
 		@FragmentShader(render_3d.frag.glsl)
 
diff --git a/beamformer_core.c b/beamformer_core.c
@@ -1,7 +1,7 @@
 /* See LICENSE for license details. */
 /* TODO(rnp):
  * [ ]: bug? HERCULES might be broken, we may need to to chunk on transmits instead of channels
- * [ ]: refactor: plan_compute should build its own "command graph" which tracks
+ * [ ]: refactor: do_compute should build its own "command graph" which tracks
  *      dependencies better. It is very important that unnecessary barriers are
  *      not placed between compute stages which requires knowledge of the entire
  *      graph.
@@ -9,10 +9,6 @@
  *      use below to spin wait in library
  * [ ]: utilize umonitor/umwait (intel), monitorx/mwaitx (amd), and wfe/sev (aarch64)
  *      for power efficient low latency waiting
- * [ ]: refactor: split decode into reshape and decode
- *      - the check for first pass reshaping is the last non constant check
- *        in the shader
- *      - this will also remove the need for the channel mapping in the decode shader
  * [ ]: BeamformWorkQueue -> BeamformerWorkQueue
  * [ ]: refactor: work queue needs a cleanup, we should only have a single one
  *      - that queue isn't really considered hot so a lock is probably fine
@@ -29,6 +25,27 @@
 
 global f32 dt_for_frame;
 
+typedef struct BeamformerComputeGraphNode BeamformerComputeGraphNode;
+struct BeamformerComputeGraphNode {
+	// NOTE(rnp): will be BeamformerShaderKind_Count for root node
+	BeamformerShaderKind kind;
+
+	// NOTE(rnp): when any of input or output stride is assigned it is assumed that
+	// the shader requires a fixed layout for input, output, or both. When two adjacent
+	// nodes require incompatible layouts the second pass over the graph will insert
+	// Reshape shaders in between.
+	BeamformerDataKind input_data_kind;
+	iv3                input_stride;
+
+	BeamformerDataKind output_data_kind;
+	iv3                output_stride;
+
+	i32                user_pipeline_index;
+
+	BeamformerComputeGraphNode *prev;
+	BeamformerComputeGraphNode *next;
+};
+
 read_only global u32 beamformer_compute_array_parameter_sizes[] = {
 	#define X(k, type, elements) sizeof(type) * elements,
 	BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST
@@ -246,32 +263,39 @@ dispatch_for_output(uv3 layout, iv3 points)
 	return result;
 }
 
-function uv3
-decode_data_stride(b32 input, u32 samples, u32 channels, u32 acquisitions)
-{
-	uv3 result;
-	result.x = input ? channels * acquisitions : 1;
-	result.y = input ? acquisitions            : samples * acquisitions;
-	result.z = input ? 1                       : samples;
-	return result;
-}
-
 function b32
-compute_plan_push_shader(BeamformerComputePlan *p, BeamformerShaderKind shader, BeamformerShaderParameters *sp)
+compute_plan_push_shader(BeamformerComputePlan *p, BeamformerComputeGraphNode *node, BeamformerShaderParameters *sp)
 {
 	b32 result = 0;
 	if (p->pipeline.shader_count < countof(p->pipeline.shaders)) {
 		u32 index = p->pipeline.shader_count++;
-		p->pipeline.shaders[index]    = shader;
-		p->pipeline.parameters[index] = *sp;
+		p->pipeline.shaders[index]    = node->kind;
 		zero_struct(p->shader_descriptors + index);
+		p->pipeline.parameters[index] = sp ? *sp : (BeamformerShaderParameters){0};
+
+		p->shader_descriptors[index].input_data_kind  = node->input_data_kind;
+		p->shader_descriptors[index].output_data_kind = node->output_data_kind;
+
 		result = 1;
 	}
 	return result;
 }
 
+function BeamformerComputeGraphNode *
+push_compute_graph_node(BeamformerComputeGraphNode *root, BeamformerShaderKind kind, Arena *arena)
+{
+	BeamformerComputeGraphNode *result = push_struct(arena, BeamformerComputeGraphNode);
+	DLLPushEnd(root, result);
+	result->kind = kind;
+	result->user_pipeline_index = -1;
+	// NOTE(rnp): initially don't care data kind
+	result->input_data_kind  = BeamformerDataKind_Count;
+	result->output_data_kind = BeamformerDataKind_Count;
+	return result;
+}
+
 function void
-plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
+plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb, Arena scratch)
 {
 	b32 run_cuda_hilbert = 0;
 	b32 demodulate       = 0;
@@ -286,13 +310,38 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 
 	if (demodulate) run_cuda_hilbert = 0;
 
+	f32 sampling_frequency = pb->parameters.sampling_frequency;
+	u32 input_sample_count = pb->parameters.sample_count;
+	u32 acquisition_count  = pb->parameters.acquisition_count;
+	u32 decimation_rate    = Max(pb->parameters.decimation_rate, 1);
+
+	cp->raw_channel_byte_stride = pb->parameters.sample_count * pb->parameters.acquisition_count
+	                              * beamformer_data_kind_byte_size[pb->pipeline.data_kind];
+
 	BeamformerDataKind input_data_kind = pb->pipeline.data_kind;
-	cp->iq_pipeline = beamformer_data_kind_complex[input_data_kind] || demodulate || run_cuda_hilbert;
+	if (demodulate) {
+		switch (input_data_kind) {
+		case BeamformerDataKind_Int16:{  input_data_kind = BeamformerDataKind_Int16Complex;  }break;
+		case BeamformerDataKind_Float16:{input_data_kind = BeamformerDataKind_Float16Complex;}break;
+		case BeamformerDataKind_Float32:{input_data_kind = BeamformerDataKind_Float32Complex;}break;
+		default:{}break;
+		}
+		input_sample_count /= (2 * decimation_rate);
+		sampling_frequency /= (2 * decimation_rate);
+	}
+
+	cp->iq_pipeline = beamformer_data_kind_complex[input_data_kind] || run_cuda_hilbert;
 
 	BeamformerDataKind das_data_kind = cp->iq_pipeline ? BeamformerDataKind_Float32Complex
 	                                                   : BeamformerDataKind_Float32;
 
-	read_only local_persist BeamformerDataKind input_to_intermediate_data_kind[] = {
+	cp->channel_count = pb->parameters.channel_count;
+	u32 chunk_channel_count = Min(cp->channel_count, BeamformerChunkChannelCount);
+
+	cp->rf_size = input_sample_count * pb->parameters.acquisition_count * chunk_channel_count
+	              * beamformer_data_kind_byte_size[das_data_kind];
+
+	read_only local_persist BeamformerDataKind data_kind_to_element_kind[] = {
 		[BeamformerDataKind_Int16]          = BeamformerDataKind_Float16,
 		[BeamformerDataKind_Float16]        = BeamformerDataKind_Float16,
 		[BeamformerDataKind_Float32]        = BeamformerDataKind_Float32,
@@ -300,132 +349,188 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 		[BeamformerDataKind_Float16Complex] = BeamformerDataKind_Float16,
 		[BeamformerDataKind_Float32Complex] = BeamformerDataKind_Float32,
 	};
-	read_only local_persist b8 input_needs_deinterleave[] = {
-		[BeamformerDataKind_Int16]          = 0,
-		[BeamformerDataKind_Float16]        = 0,
-		[BeamformerDataKind_Float32]        = 0,
-		[BeamformerDataKind_Int16Complex]   = 1,
-		[BeamformerDataKind_Float16Complex] = 1,
-		[BeamformerDataKind_Float32Complex] = 1,
-	};
-	BeamformerDataKind intermediate_data_kind = input_to_intermediate_data_kind[input_data_kind];
 
-	cp->raw_channel_byte_stride = pb->parameters.sample_count * pb->parameters.acquisition_count
-	                              * beamformer_data_kind_byte_size[input_data_kind];
+	//////////////////////////////////////
+	// NOTE(rnp): First Pass: build initial graph and insert hard layout constraints
+	BeamformerComputeGraphNode *root_node = push_struct(&scratch, BeamformerComputeGraphNode);
+	root_node->kind = BeamformerShaderKind_Count;
+	root_node->input_data_kind  = input_data_kind;
+	root_node->input_stride.x   = 1;                                      // Sample Stride
+	root_node->input_stride.y   = input_sample_count * acquisition_count; // Channel Stride
+	root_node->input_stride.z   = input_sample_count;                     // Receive Event Stride
+	root_node->output_data_kind = input_data_kind;
+	root_node->output_stride.x  = 1;                                      // Sample Stride
+	root_node->output_stride.y  = input_sample_count * acquisition_count; // Channel Stride
+	root_node->output_stride.z  = input_sample_count;                     // Receive Event Stride
+	root_node->next = root_node->prev = root_node;
+
+	for EachIndex(pb->pipeline.shader_count, it) {
+		// NOTE(rnp): skip unnecessary shaders
+		switch (pb->pipeline.shaders[it]) {
+		case BeamformerShaderKind_CudaHilbert:{if (!run_cuda_hilbert) continue;}break;
 
-	f32 sampling_frequency = pb->parameters.sampling_frequency;
-	u32 decimation_rate = Max(pb->parameters.decimation_rate, 1);
-	u32 sample_count    = pb->parameters.sample_count;
-	if (demodulate) {
-		sample_count       /= (2 * decimation_rate);
-		sampling_frequency /= 2 * (f32)decimation_rate;
+		case BeamformerShaderKind_Decode:{
+			if (pb->parameters.decode_mode == BeamformerDecodeMode_None)
+				continue;
+		}break;
+
+		case BeamformerShaderKind_CudaDecode:
+		case BeamformerShaderKind_Sum:
+		case BeamformerShaderKind_MinMax:
+		{
+			// NOTE(rnp): currently unsupported
+			continue;
+		}break;
+
+		default:{}break;
+		}
+
+		BeamformerComputeGraphNode *node = push_compute_graph_node(root_node, pb->pipeline.shaders[it],
+		                                                           &scratch);
+		node->user_pipeline_index = (i32)it;
+		switch (pb->pipeline.shaders[it]) {
+		case BeamformerShaderKind_Decode:{
+			b32 low_precision   = beamformer_data_kind_element_size[input_data_kind] < 4;
+			b32 use_coop_matrix = vk_gpu_info()->cooperative_matrix &&
+			                      low_precision &&
+			                      (acquisition_count   % 16 == 0) &&
+			                      (chunk_channel_count % 16 == 0);
+
+			// NOTE(rnp): fixed input layout required for reasonable performance
+			if (low_precision && beamformer_data_kind_complex[input_data_kind])
+				node->input_data_kind = BeamformerDataKind_Float16Complex;
+			node->input_stride.x = chunk_channel_count * acquisition_count;
+			node->input_stride.y = acquisition_count;
+			node->input_stride.z = 1;
+
+			if (use_coop_matrix) {
+				node->input_data_kind  = BeamformerDataKind_Float16;
+				node->output_data_kind = data_kind_to_element_kind[das_data_kind];
+				node->output_stride    = node->input_stride;
+			}
+		}break;
+
+		case BeamformerShaderKind_DAS:{
+			node->input_data_kind  = das_data_kind;
+			node->input_stride.x   = 1;                                      // Sample Stride
+			node->input_stride.y   = input_sample_count * acquisition_count; // Channel Stride
+			node->input_stride.z   = input_sample_count;                     // Receive Event Stride
+			node->output_stride.x  = 1;
+			node->output_stride.y  = cp->output_points.x;
+			node->output_stride.z  = cp->output_points.x * cp->output_points.y;
+			node->output_data_kind = cp->iq_pipeline ? BeamformerDataKind_Float32Complex
+			                                         : BeamformerDataKind_Float32;
+
+			// NOTE(rnp): insert implicit CoherencyWeighting node
+			if (pb->parameters.coherency_weighting)
+				node = push_compute_graph_node(root_node, BeamformerShaderKind_CoherencyWeighting, &scratch);
+		}break;
+
+		default:{}break;
+		}
 	}
 
-	cp->channel_count = pb->parameters.channel_count;
+	//////////////////////////////////////
+	// NOTE(rnp): Second Pass: resolve layout constraints
+	for (BeamformerComputeGraphNode *node = root_node->next;
+	     node != root_node;
+	     node = node->next)
+	{
+		b32 needs_reshape = 0;
 
-	u32 chunk_channel_count = Min(cp->channel_count, BeamformerChunkChannelCount);
+		// NOTE(rnp): data strides
+		{
+			b32 input_dont_care       = bv3_any(iv3_equal(node->input_stride, (iv3){0}));
+			b32 prev_output_dont_care = bv3_any(iv3_equal(node->prev->output_stride, (iv3){0}));
 
-	cp->rf_size = sample_count * pb->parameters.acquisition_count * chunk_channel_count;
-	if (cp->iq_pipeline) cp->rf_size *= 8;
-	else                 cp->rf_size *= 4;
+			if (prev_output_dont_care && !input_dont_care)
+				node->prev->output_stride = node->input_stride;
 
-	u32 das_sample_stride   = 1;
-	u32 das_transmit_stride = sample_count;
-	u32 das_channel_stride  = sample_count * pb->parameters.acquisition_count;
+			if (!prev_output_dont_care && input_dont_care)
+				node->input_stride = node->prev->output_stride;
 
-	f32 time_offset = pb->parameters.time_offset;
+			if (prev_output_dont_care && input_dont_care)
+				node->input_stride = node->prev->output_stride = node->prev->input_stride;
 
-	u32 subgroup_size = vk_gpu_info()->subgroup_size;
+			needs_reshape |= !bv3_all(iv3_equal(node->input_stride, node->prev->output_stride));
+		}
 
-	cp->first_image_shader_index = 0;
-	cp->pipeline.shader_count = 0;
-	for (u32 i = 0; i < pb->pipeline.shader_count; i++) {
-		BeamformerShaderParameters *sp = pb->pipeline.parameters + i;
-		u32 slot   = cp->pipeline.shader_count;
-		u32 shader = pb->pipeline.shaders[i];
+		// NOTE(rnp): data kinds
+		{
+			b32 input_dont_care       = node->input_data_kind        == BeamformerDataKind_Count;
+			b32 prev_output_dont_care = node->prev->output_data_kind == BeamformerDataKind_Count;
 
-		BeamformerShaderDescriptor *ld = cp->shader_descriptors + slot - 1;
-		BeamformerShaderDescriptor *sd = cp->shader_descriptors + slot;
+			if (prev_output_dont_care && !input_dont_care)
+				node->prev->output_data_kind = node->input_data_kind;
 
-		switch (shader) {
+			if (!prev_output_dont_care && input_dont_care)
+				node->input_data_kind = node->prev->output_data_kind;
 
-		case BeamformerShaderKind_CudaHilbert:{
-			if (run_cuda_hilbert)
-				compute_plan_push_shader(cp, shader, sp);
-		}break;
+			if (prev_output_dont_care && input_dont_care)
+				node->input_data_kind = node->prev->output_data_kind = node->prev->input_data_kind;
 
-		case BeamformerShaderKind_Decode:{
-			/* TODO(rnp): rework decode first and demodulate after */
-			b32 first = slot == 0;
+			needs_reshape |= node->input_data_kind != node->prev->output_data_kind;
+		}
 
-			BeamformerShaderKind *last_shader = cp->pipeline.shaders + slot - 1;
-			assert(first || ((*last_shader == BeamformerShaderKind_Demodulate ||
-			                  *last_shader == BeamformerShaderKind_Filter)));
-			b32 decode = pb->parameters.decode_mode != BeamformerDecodeMode_None;
-			if (first && compute_plan_push_shader(cp, BeamformerShaderKind_Reshape, sp)) {
-				sd = cp->shader_descriptors + cp->pipeline.shader_count - 1;
+		// NOTE(rnp): insert reshape if needed
+		if (needs_reshape) {
+			BeamformerComputeGraphNode *new = push_compute_graph_node(node, BeamformerShaderKind_Reshape,
+			                                                          &scratch);
+			new->input_data_kind  = new->prev->output_data_kind;
+			new->input_stride     = new->prev->output_stride;
 
-				sd->layout = (uv3){{subgroup_size, 1, 1}};
+			new->output_data_kind = node->input_data_kind;
+			new->output_stride    = node->input_stride;
+		}
+	}
 
-				sd->dispatch.x = (u32)(ceil_f32((f32)pb->parameters.sample_count / sd->layout.x));
-				sd->dispatch.y = chunk_channel_count;
-				sd->dispatch.z = pb->parameters.acquisition_count;
+	f32 time_offset   = pb->parameters.time_offset;
+	u32 subgroup_size = vk_gpu_info()->subgroup_size;
 
-				uv3 output_stride = decode_data_stride(decode, pb->parameters.sample_count,
-				                                       chunk_channel_count, pb->parameters.acquisition_count);
+	cp->first_image_shader_index = 0;
+	cp->pipeline.shader_count = 0;
 
-				BeamformerReshapeBakeParameters *rb = &sd->bake.Reshape;
-				rb->input_data_kind  = input_data_kind;
-				rb->output_data_kind = decode ? intermediate_data_kind : BeamformerDataKind_Float32;
-				rb->size_x           = pb->parameters.sample_count;
-				rb->size_y           = chunk_channel_count;
-				rb->size_z           = pb->parameters.acquisition_count;
-				rb->input_stride_x   = 1;
-				rb->input_stride_y   = pb->parameters.acquisition_count * pb->parameters.sample_count;
-				rb->input_stride_z   = pb->parameters.sample_count;
-				rb->output_stride_x  = output_stride.x;
-				rb->output_stride_y  = output_stride.y;
-				rb->output_stride_z  = output_stride.z;
-				rb->interleave       = 0;
-				rb->deinterleave     = decode ? input_needs_deinterleave[input_data_kind] : 0;
-			}
+	for (BeamformerComputeGraphNode *node = root_node->next;
+	     node != root_node;
+	     node = node->next)
+	{
+		assert(node->prev->output_data_kind == node->input_data_kind);
+		assert(bv3_all(iv3_equal(node->prev->output_stride, node->input_stride)));
+
+		BeamformerShaderParameters *sp = 0;
+		if (node->user_pipeline_index >= 0)
+			sp = pb->pipeline.parameters + node->user_pipeline_index;
 
-			if (decode && compute_plan_push_shader(cp, shader, sp)) {
-				sd = cp->shader_descriptors + cp->pipeline.shader_count - 1;
+		if (compute_plan_push_shader(cp, node, sp)) {
+			BeamformerShaderDescriptor *sd = cp->shader_descriptors + cp->pipeline.shader_count - 1;
 
+			switch (node->kind) {
+			case BeamformerShaderKind_Decode:{
 				BeamformerDecodeBakeParameters *db = &sd->bake.Decode;
-				db->data_kind = intermediate_data_kind;
 
-				u32 decode_sample_count = demodulate ? 2 * sample_count : sample_count;
+				u32 decode_sample_count = input_sample_count;
 				db->decode_mode         = pb->parameters.decode_mode;
 				db->transmit_count      = pb->parameters.acquisition_count;
 				db->chunk_channel_count = chunk_channel_count;
 
 				// NOTE(rnp): ignored when using coop matrices
-				db->output_sample_stride   = das_sample_stride;
-				db->output_channel_stride  = das_channel_stride;
-				db->output_transmit_stride = das_transmit_stride;
+				db->output_sample_stride   = node->output_stride.x;
+				db->output_channel_stride  = node->output_stride.y;
+				db->output_transmit_stride = node->output_stride.z;
 
 				db->to_process = 1;
 
 				b32 use_coop_matrix = vk_gpu_info()->cooperative_matrix &&
-				                      db->data_kind == BeamformerDataKind_Float16 &&
+				                      node->input_data_kind == BeamformerDataKind_Float16 &&
 				                      (db->transmit_count % 16 == 0) &&
 				                      (chunk_channel_count % 16 == 0);
-				b32 extra_reshape   = 0;
-
-				if (db->decode_mode == BeamformerDecodeMode_None) {
-					sd->layout = (uv3){{subgroup_size, 1, 1}};
-
-					sd->dispatch.x = (u32)ceil_f32((f32)decode_sample_count              / (f32)sd->layout.x);
-					sd->dispatch.y = (u32)ceil_f32((f32)chunk_channel_count              / (f32)sd->layout.y);
-					sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
-				} else if (use_coop_matrix) {
-					extra_reshape = 1;
+				if (use_coop_matrix) {
 					// TODO(rnp): shared memory for larger sizes
-
 					sd->layout = (uv3){{subgroup_size, 1, 1}};
 
+					if (demodulate)
+						decode_sample_count *= 2;
+
 					db->cooperative_matrix   = 1;
 					db->cooperative_matrix_m = 16;
 					db->cooperative_matrix_n = 16;
@@ -440,13 +545,15 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 					if (db->transmit_count == 48)
 						db->to_process = db->transmit_count / 16;
 
-					b32 use_16z  = db->transmit_count == 48 || db->transmit_count == 80 ||
+					b32 use_16x  = db->transmit_count == 48 || db->transmit_count == 80 ||
 					               db->transmit_count == 96 || db->transmit_count == 160;
-					sd->layout = (uv3){{4, 1, use_16z? 16 : 32}};
+					sd->layout.x = use_16x ? 16 : 32;
+					sd->layout.y = 4;
+					sd->layout.z = 1;
 
-					sd->dispatch.x = (u32)ceil_f32((f32)decode_sample_count              / (f32)sd->layout.x);
+					sd->dispatch.x = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.x / (f32)db->to_process);
 					sd->dispatch.y = (u32)ceil_f32((f32)chunk_channel_count              / (f32)sd->layout.y);
-					sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->to_process);
+					sd->dispatch.z = (u32)ceil_f32((f32)decode_sample_count              / (f32)sd->layout.z);
 				} else {
 					/* NOTE(rnp): register caching. using more threads will cause the compiler to do
 					 * contortions to avoid spilling registers. using less gives higher performance */
@@ -456,43 +563,12 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 					sd->dispatch.y = (u32)ceil_f32((f32)chunk_channel_count / (f32)sd->layout.y);
 					sd->dispatch.z = 1;
 				}
+			}break;
 
-				if (extra_reshape && compute_plan_push_shader(cp, BeamformerShaderKind_Reshape, sp)) {
-					cp->q_rf_data_offset = chunk_channel_count * sample_count * pb->parameters.acquisition_count *
-					                       beamformer_data_kind_byte_size[BeamformerDataKind_Float32];
-
-					sd = cp->shader_descriptors + cp->pipeline.shader_count - 1;
-					sd->layout.x = Min(subgroup_size, db->transmit_count);
-					sd->layout.y = subgroup_size / sd->layout.x;
-					sd->layout.z = 1;
-
-					sd->dispatch.x = (u32)(ceil_f32((f32)db->transmit_count  / sd->layout.x));
-					sd->dispatch.y = (u32)(ceil_f32((f32)chunk_channel_count / sd->layout.y));
-					sd->dispatch.z = sample_count;
-
-					BeamformerReshapeBakeParameters *rb = &sd->bake.Reshape;
-					rb->input_data_kind  = BeamformerDataKind_Float32;
-					rb->output_data_kind = BeamformerDataKind_Float32;
-					rb->size_x           = db->transmit_count;
-					rb->size_y           = chunk_channel_count;
-					rb->size_z           = sample_count;
-					rb->input_stride_x   = 1;
-					rb->input_stride_y   = db->transmit_count;
-					rb->input_stride_z   = chunk_channel_count * db->transmit_count;
-					rb->output_stride_x  = das_transmit_stride;
-					rb->output_stride_y  = das_channel_stride;
-					rb->output_stride_z  = das_sample_stride;
-					rb->interleave       = cp->iq_pipeline;
-				}
-			}
-		}break;
-
-		case BeamformerShaderKind_Demodulate:
-		case BeamformerShaderKind_Filter:
-		{
-			if (compute_plan_push_shader(cp, shader, sp)) {
-				b32 first = slot == 0;
-				b32 demod = shader == BeamformerShaderKind_Demodulate;
+			case BeamformerShaderKind_Demodulate:
+			case BeamformerShaderKind_Filter:
+			{
+				b32 demod = node->kind == BeamformerShaderKind_Demodulate;
 				BeamformerFilter *f = cp->filters + sp->filter_slot;
 
 				time_offset += f->time_delay;
@@ -502,12 +578,21 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 				fb->demodulate     = demod;
 				fb->complex_filter = f->parameters.complex;
 
-				// NOTE(rnp): if we are decoding we need to deinterleave I and Q channels
-				if (pb->parameters.decode_mode != BeamformerDecodeMode_None)
-					fb->batch_sample_count = chunk_channel_count * sample_count * pb->parameters.acquisition_count;
+				fb->sample_count    = input_sample_count;
+				fb->decimation_rate = demod ? decimation_rate : 1;
+
+				b32 deinterleave =  beamformer_data_kind_complex[node->input_data_kind] &&
+				                   !beamformer_data_kind_complex[node->output_data_kind];
+				if (deinterleave)
+					fb->batch_sample_count = chunk_channel_count * input_sample_count * pb->parameters.acquisition_count;
+
+				fb->output_sample_stride   = node->output_stride.x;
+				fb->output_channel_stride  = node->output_stride.y;
+				fb->output_transmit_stride = node->output_stride.z;
 
-				fb->data_kind = input_data_kind;
-				if (!first) fb->data_kind = intermediate_data_kind;
+				fb->input_sample_stride    = node->input_stride.x;
+				fb->input_channel_stride   = node->input_stride.y;
+				fb->input_transmit_stride  = node->input_stride.z;
 
 				/* NOTE(rnp): when we are demodulating we pretend that the sampler was alternating
 				 * between sampling the I portion and the Q portion of an IQ signal. Therefore there
@@ -521,63 +606,25 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 				if (demod) {
 					fb->demodulation_frequency = pb->parameters.demodulation_frequency;
 					fb->sampling_frequency     = pb->parameters.sampling_frequency / 2;
-					fb->decimation_rate        = decimation_rate;
-					fb->sample_count           = pb->parameters.sample_count;
-
-					fb->output_channel_stride  = das_channel_stride;
-					fb->output_sample_stride   = das_sample_stride;
-					fb->output_transmit_stride = das_transmit_stride;
-
-					if (first) {
-						fb->input_channel_stride  = pb->parameters.sample_count * pb->parameters.acquisition_count / 2;
-						fb->input_sample_stride   = 1;
-						fb->input_transmit_stride = pb->parameters.sample_count / 2;
-
-						if (pb->parameters.decode_mode == BeamformerDecodeMode_None) {
-							fb->output_floats = 1;
-						} else {
-							/* NOTE(rnp): output optimized layout for decoding */
-							fb->output_channel_stride  = pb->parameters.acquisition_count;
-							fb->output_sample_stride   = pb->parameters.acquisition_count * chunk_channel_count;
-							fb->output_transmit_stride = 1;
-						}
-					} else {
-						assert(cp->pipeline.shaders[slot - 1] == BeamformerShaderKind_Decode);
-						fb->input_channel_stride  = ld->bake.Decode.output_channel_stride;
-						fb->input_sample_stride   = ld->bake.Decode.output_sample_stride;
-						fb->input_transmit_stride = ld->bake.Decode.output_transmit_stride;
-					}
-				} else {
-					fb->decimation_rate        = 1;
-					fb->output_channel_stride  = sample_count * pb->parameters.acquisition_count;
-					fb->output_sample_stride   = 1;
-					fb->output_transmit_stride = sample_count;
-					fb->input_channel_stride   = sample_count * pb->parameters.acquisition_count;
-					fb->input_sample_stride    = 1;
-					fb->input_transmit_stride  = sample_count;
-					fb->sample_count           = sample_count;
 				}
 
 				sd->layout     = (uv3){{subgroup_size, 1, 1}};
-				sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
+				sd->dispatch.x = (u32)ceil_f32((f32)input_sample_count               / (f32)sd->layout.x);
 				sd->dispatch.y = (u32)ceil_f32((f32)chunk_channel_count              / (f32)sd->layout.y);
 				sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
-			}
-		}break;
+			}break;
 
-		case BeamformerShaderKind_DAS:{
-			if (compute_plan_push_shader(cp, shader, sp)) {
+			case BeamformerShaderKind_DAS:{
 				cp->first_image_shader_index = cp->pipeline.shader_count;
 
 				BeamformerDASBakeParameters *db = &sd->bake.DAS;
-				db->data_kind              = das_data_kind;
 				db->sampling_frequency     = sampling_frequency;
 				db->demodulation_frequency = pb->parameters.demodulation_frequency;
 				db->speed_of_sound         = pb->parameters.speed_of_sound;
 				db->time_offset            = time_offset;
 				db->f_number               = pb->parameters.f_number;
 				db->acquisition_kind       = pb->parameters.acquisition_kind;
-				db->sample_count           = sample_count;
+				db->sample_count           = input_sample_count;
 				db->channel_count          = pb->parameters.channel_count;
 				db->acquisition_count      = pb->parameters.acquisition_count;
 				db->chunk_channel_count    = chunk_channel_count;
@@ -603,34 +650,61 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 
 				sd->layout   = layout_for_output(cp->output_points);
 				sd->dispatch = dispatch_for_output(sd->layout, cp->output_points);
+			}break;
 
-				if (pb->parameters.coherency_weighting &&
-				    compute_plan_push_shader(cp, BeamformerShaderKind_CoherencyWeighting, sp))
-				{
-					BeamformerShaderDescriptor *shader_descriptor = cp->shader_descriptors + cp->pipeline.shader_count - 1;
-					shader_descriptor->layout   = sd->layout;
-					shader_descriptor->dispatch = sd->dispatch;
-					shader_descriptor->bake.CoherencyWeighting.data_kind = db->data_kind;
-				}
-			}
-		}break;
+			case BeamformerShaderKind_CoherencyWeighting:{
+				sd->layout   = layout_for_output(cp->output_points);
+				sd->dispatch = dispatch_for_output(sd->layout, cp->output_points);
+			}break;
 
-		#if 0
-		case BeamformerShaderKind_Sum:{
-			sd->bake.data_kind = BeamformerDataKind_Float32;
-			if (cp->iq_pipeline)
-				sd->bake.data_kind = BeamformerDataKind_Float32Complex;
+			case BeamformerShaderKind_Reshape:{
+				BeamformerReshapeBakeParameters *rb = &sd->bake.Reshape;
+				rb->deinterleave =  beamformer_data_kind_complex[node->input_data_kind] &&
+				                   !beamformer_data_kind_complex[node->output_data_kind];
+				rb->interleave   = !beamformer_data_kind_complex[node->input_data_kind] &&
+				                    beamformer_data_kind_complex[node->output_data_kind];
+				assert(rb->interleave == 0 || (rb->interleave != rb->deinterleave));
+
+				rb->input_stride_x   = node->input_stride.x;
+				rb->input_stride_y   = node->input_stride.y;
+				rb->input_stride_z   = node->input_stride.z;
+				rb->output_stride_x  = node->output_stride.x;
+				rb->output_stride_y  = node->output_stride.y;
+				rb->output_stride_z  = node->output_stride.z;
+
+				// NOTE(rnp): order doesn't really matter here but it must match the dispatch layout
+				rb->size_x           = input_sample_count;
+				rb->size_y           = chunk_channel_count;
+				rb->size_z           = acquisition_count;
 
-			sd->layout   = layout_for_output(cp->output_points);
-			sd->dispatch = dispatch_for_output(sd->layout, cp->output_points);
+				sd->layout.x = 1;
+				sd->layout.z = Min(subgroup_size, rb->size_z);
+				sd->layout.y = subgroup_size / sd->layout.z;
 
-			commit = 1;
-		}break;
-		#endif
+				sd->dispatch.x = (u32)(ceil_f32((f32)rb->size_x / sd->layout.x));
+				sd->dispatch.y = (u32)(ceil_f32((f32)rb->size_y / sd->layout.y));
+				sd->dispatch.z = (u32)(ceil_f32((f32)rb->size_z / sd->layout.z));
+			}break;
 
-		default:{}break;
+			default:{}break;
+
+			#if 0
+			case BeamformerShaderKind_Sum:{
+				sd->bake.data_kind = BeamformerDataKind_Float32;
+				if (cp->iq_pipeline)
+					sd->bake.data_kind = BeamformerDataKind_Float32Complex;
+
+				sd->layout   = layout_for_output(cp->output_points);
+				sd->dispatch = dispatch_for_output(sd->layout, cp->output_points);
+
+				commit = 1;
+			}break;
+			#endif
+
+			}
 		}
 	}
+
 	cp->pipeline.data_kind = input_data_kind;
 
 	if (cp->first_image_shader_index == 0)
@@ -669,7 +743,35 @@ stream_append_shader_header(Stream *s, i32 reloadable_index, BeamformerShaderDes
 		stream_append_s8(s,  s8(") in;\n\n"));
 	}
 
+	{
+		u32 max_length = 0;
+		for EachElement(beamformer_data_kind_s8, it)
+			max_length = Max(max_length, (u32)beamformer_data_kind_s8[it].len);
+
+		for EachElement(beamformer_data_kind_s8, it) {
+			stream_append_s8s(s, s8("#define DataKind_"), beamformer_data_kind_s8[it]);
+			stream_pad(s, ' ', max_length - beamformer_data_kind_s8[it].len + 1);
+			stream_append_u64(s, it);
+			stream_append_byte(s, '\n');
+		}
+		stream_append_byte(s, '\n');
+	}
+
 	if (sd) {
+		if (sd->input_data_kind != BeamformerDataKind_Count) {
+			stream_append_s8s(s, s8("#define InputDataType  "),
+			                  beamformer_data_kind_glsl_type[sd->input_data_kind], s8("\n"));
+			stream_append_s8s(s, s8("#define InputDataKind  DataKind_"),
+			                  beamformer_data_kind_s8[sd->input_data_kind], s8("\n"));
+		}
+		if (sd->output_data_kind != BeamformerDataKind_Count) {
+			stream_append_s8s(s, s8("#define OutputDataType "),
+			                  beamformer_data_kind_glsl_type[sd->output_data_kind], s8("\n"));
+			stream_append_s8s(s, s8("#define OutputDataKind DataKind_"),
+			                  beamformer_data_kind_s8[sd->output_data_kind], s8("\n"));
+		}
+		stream_append_byte(s, '\n');
+
 		u32 *parameters = (u32 *)&sd->bake;
 		s8  *names      = beamformer_shader_bake_parameter_names[reloadable_index];
 		u32  float_bits = beamformer_shader_bake_parameter_float_bits[reloadable_index];
@@ -788,7 +890,7 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp,
 			cp->output_points  = das_valid_points(pb->parameters.output_points.xyz);
 			cp->average_frames = pb->parameters.output_points.E[3];
 
-			plan_compute_pipeline(cp, pb);
+			plan_compute_pipeline(cp, pb, arena);
 
 			/* NOTE(rnp): these are both handled by plan_compute_pipeline() */
 			u32 mask = 1 << BeamformerParameterBlockRegion_ComputePipeline |
@@ -892,28 +994,14 @@ do_compute_shader(BeamformerCtx *ctx, VulkanHandle cmd, BeamformerComputePlan *c
 	switch (cp->pipeline.shaders[shader_slot]) {
 
 	case BeamformerShaderKind_Decode:{
-		BeamformerDecodeMode mode = cp->shader_descriptors[shader_slot].bake.Decode.decode_mode;
 		BeamformerDecodePushConstants pc = {
 			.hadamard_buffer = cp->array_parameters.gpu_pointer + offsetof(BeamformerComputeArrayParameters, Hadamard),
+			.rf_buffer       = pp_input_pointer,
 		};
 
 		if ((shader_slot + 1) == das_index) pc.output_buffer = pp_das_pointer;
 		else                                pc.output_buffer = pp_output_pointer;
 
-		if (shader_slot == 0 && mode != BeamformerDecodeMode_None) {
-			pc.output_rf_buffer = pp_input_pointer;
-			pc.rf_buffer        = rf_pointer;
-			pc.first_pass       = 1;
-
-			vk_command_push_constants(cmd, 0, sizeof(pc), &pc);
-			vk_command_dispatch_compute(cmd, dispatch);
-
-			pc.output_rf_buffer = 0;
-			pc.first_pass       = 0;
-		}
-
-		pc.rf_buffer = pp_input_pointer;
-
 		GPUMemoryBarrierInfo memory_barriers[]= {
 			// NOTE(rnp): first pass or last stage output
 			{
@@ -952,23 +1040,19 @@ do_compute_shader(BeamformerCtx *ctx, VulkanHandle cmd, BeamformerComputePlan *c
 	case BeamformerShaderKind_Filter:
 	case BeamformerShaderKind_Demodulate:
 	{
-		b32 demod = cp->pipeline.shaders[shader_slot] == BeamformerShaderKind_Demodulate;
-
-		BeamformerDataKind output_data_kind = cp->shader_descriptors[shader_slot].bake.Filter.data_kind;
-		if (demod) output_data_kind = BeamformerDataKind_Float16Complex;
-		if (cp->shader_descriptors[shader_slot].bake.Filter.output_floats) {
-			output_data_kind = demod ? BeamformerDataKind_Float32Complex
-			                         : BeamformerDataKind_Float32;
-		}
+		BeamformerDataKind output_data_kind = cp->shader_descriptors[shader_slot].output_data_kind;
 
 		u64 element_size = beamformer_data_kind_byte_size[output_data_kind];
 		u32 filter_slot  = cp->pipeline.parameters[shader_slot].filter_slot;
 		BeamformerFilterPushConstants pc = {
 			.filter_coefficients   = cp->filters[filter_slot].buffer.gpu_pointer,
 			.input_data            = shader_slot == 0 ? rf_pointer : pp_input_pointer,
-			.output_element_offset = 2 * output_index * pp_size / element_size,
+			.output_element_offset = output_index * pp_size / element_size,
 		};
 
+		if ((shader_slot + 1) == das_index)
+			pc.output_element_offset = das_output_index * pp_size / element_size;
+
 		GPUMemoryBarrierInfo memory_barriers[] = {
 			// NOTE(rnp): last stage output
 			{
@@ -991,7 +1075,7 @@ do_compute_shader(BeamformerCtx *ctx, VulkanHandle cmd, BeamformerComputePlan *c
 			barrier_count--;
 		}
 
-		if ((shader_slot + 1) != das_index || channel_offset == 0)
+		if ((shader_slot + 1) != das_index)
 			barrier_count--;
 
 		if (barrier_count)
@@ -1010,7 +1094,7 @@ do_compute_shader(BeamformerCtx *ctx, VulkanHandle cmd, BeamformerComputePlan *c
 
 		u64 frame_size   = beamformer_frame_byte_size(frame->points, frame->data_kind);
 		u64 iframe_size  = frame_size / beamformer_data_kind_element_count[frame->data_kind];
-		u64 element_size = beamformer_data_kind_byte_size[cp->shader_descriptors[shader_slot].bake.DAS.data_kind];
+		u64 element_size = beamformer_data_kind_byte_size[cp->shader_descriptors[shader_slot].input_data_kind];
 
 		BeamformerDASPushConstants pc = {
 			.xdc_element_pitch = cp->xdc_element_pitch,
@@ -1091,9 +1175,13 @@ do_compute_shader(BeamformerCtx *ctx, VulkanHandle cmd, BeamformerComputePlan *c
 	}break;
 
 	case BeamformerShaderKind_Reshape:{
+		BeamformerDataKind input_data_kind = cp->shader_descriptors[shader_slot].input_data_kind;
+		BeamformerReshapeBakeParameters *rb = &cp->shader_descriptors[shader_slot].bake.Reshape;
+		u64 input_pointer = shader_slot == 0 ? rf_pointer : pp_input_pointer;
 		BeamformerReshapePushConstants pc = {
-			.left_input_buffer  = pp_input_pointer,
-			.right_input_buffer = pp_input_pointer + cp->q_rf_data_offset,
+			.left_input_buffer  = input_pointer,
+			.right_input_buffer = input_pointer + rb->size_x * rb->size_y * rb->size_z
+			                                      * beamformer_data_kind_byte_size[input_data_kind],
 		};
 
 		if ((shader_slot + 1) == das_index) pc.output_buffer = pp_das_pointer;
diff --git a/beamformer_internal.h b/beamformer_internal.h
@@ -283,6 +283,8 @@ typedef struct {BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST} BeamformerComputeArray
 typedef struct { /* per-stage description of one compute shader in a planned pipeline */
 	uv3 layout;   /* plan_compute_pipeline divides each axis extent by this to size the dispatch */
 	uv3 dispatch; /* per-axis count, computed as ceil(extent / layout) */
+	BeamformerDataKind input_data_kind;  /* kind of data the stage reads; BeamformerDataKind_Count makes stream_append_shader_header skip the InputDataType/InputDataKind defines */
+	BeamformerDataKind output_data_kind; /* kind of data the stage writes; BeamformerDataKind_Count likewise skips the OutputDataType/OutputDataKind defines */
 	BeamformerShaderBakeParameters bake; /* union of per-shader bake parameters; the header writer walks it as an array of u32 */
 } BeamformerShaderDescriptor;
 
@@ -304,7 +306,6 @@ struct BeamformerComputePlan {
 	u32 rf_size;
 	i32 hadamard_order;
 	b32 iq_pipeline;
-	u32 q_rf_data_offset;
 
 	m4  voxel_transform;
 	m4  ui_voxel_transform;
diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c
@@ -125,7 +125,6 @@ typedef enum {
 } BeamformerShaderKind;
 
 typedef struct {
-	u32 data_kind;
 	b32 use_shared_memory;
 	u32 decode_mode;
 	u32 output_channel_stride;
@@ -141,10 +140,8 @@ typedef struct {
 } BeamformerDecodeBakeParameters;
 
 typedef struct {
-	u32 data_kind;
 	u32 demodulate;
 	u32 complex_filter;
-	u32 output_floats;
 	u32 decimation_rate;
 	u32 filter_length;
 	u32 input_channel_stride;
@@ -160,7 +157,6 @@ typedef struct {
 } BeamformerFilterBakeParameters;
 
 typedef struct {
-	u32 data_kind;
 	u32 coherency_weighting;
 	u32 single_focus;
 	u32 single_orientation;
@@ -182,12 +178,6 @@ typedef struct {
 } BeamformerDASBakeParameters;
 
 typedef struct {
-	u32 data_kind;
-} BeamformerCoherencyWeightingBakeParameters;
-
-typedef struct {
-	u32 input_data_kind;
-	u32 output_data_kind;
 	u32 size_x;
 	u32 size_y;
 	u32 size_z;
@@ -205,8 +195,6 @@ typedef struct {
 	u64 hadamard_buffer;
 	u64 rf_buffer;
 	u64 output_buffer;
-	u64 output_rf_buffer;
-	b32 first_pass;
 } BeamformerDecodePushConstants;
 
 typedef struct {
@@ -421,11 +409,10 @@ typedef struct {
 } BeamformerDASArrayParameters;
 
 typedef union {
-	BeamformerDecodeBakeParameters             Decode;
-	BeamformerFilterBakeParameters             Filter;
-	BeamformerDASBakeParameters                DAS;
-	BeamformerCoherencyWeightingBakeParameters CoherencyWeighting;
-	BeamformerReshapeBakeParameters            Reshape;
+	BeamformerDecodeBakeParameters  Decode;
+	BeamformerFilterBakeParameters  Filter;
+	BeamformerDASBakeParameters     DAS;
+	BeamformerReshapeBakeParameters Reshape;
 } BeamformerShaderBakeParameters;
 
 read_only global u8 beamformer_data_kind_element_size[] = {
@@ -464,6 +451,24 @@ read_only global b8 beamformer_data_kind_complex[] = {
 	1,
 };
 
+read_only global s8 beamformer_data_kind_glsl_type[] = { /* GLSL type name per data kind; indexed with a BeamformerDataKind when emitting the InputDataType/OutputDataType defines */
+	s8_comp("int16_t"),
+	s8_comp("i16vec2"),
+	s8_comp("float32_t"),
+	s8_comp("f32vec2"),
+	s8_comp("float16_t"),
+	s8_comp("f16vec2"),
+}; /* entry order parallels beamformer_data_kind_s8 */
+
+read_only global s8 beamformer_data_kind_s8[] = { /* data kind names; stream_append_shader_header emits "#define DataKind_<name> <index>" for every entry */
+	s8_comp("Int16"),
+	s8_comp("Int16Complex"),
+	s8_comp("Float32"),
+	s8_comp("Float32Complex"),
+	s8_comp("Float16"),
+	s8_comp("Float16Complex"),
+}; /* the array index is the value written into the generated define, so order is significant */
+
 read_only global u8 beamformer_contrast_mode_samples[] = {
 	1,
 	3,
@@ -596,14 +601,6 @@ read_only global i32 beamformer_reloadable_render_shader_info_indices[] = {
 
 read_only global s8 beamformer_shader_global_header_strings[] = {
 	s8_comp(""
-	"#define DataKind_Int16          0\n"
-	"#define DataKind_Int16Complex   1\n"
-	"#define DataKind_Float32        2\n"
-	"#define DataKind_Float32Complex 3\n"
-	"#define DataKind_Float16        4\n"
-	"#define DataKind_Float16Complex 5\n"
-	"\n"),
-	s8_comp(""
 	"#define DecodeMode_None     0\n"
 	"#define DecodeMode_Hadamard 1\n"
 	"\n"),
@@ -612,8 +609,6 @@ read_only global s8 beamformer_shader_global_header_strings[] = {
 	"  uint64_t hadamard_buffer;\n"
 	"  uint64_t rf_buffer;\n"
 	"  uint64_t output_buffer;\n"
-	"  uint64_t output_rf_buffer;\n"
-	"  bool     first_pass;\n"
 	"};\n"
 	"\n"),
 	s8_comp(""
@@ -754,32 +749,31 @@ read_only global b8 beamformer_shader_primitive_is_vertex[] = {
 };
 
 read_only global i32 *beamformer_shader_header_vectors[] = {
-	(i32 []){0, 1, 2},
-	(i32 []){0, 3, 4, 5},
-	(i32 []){6, 7, 0, 8, 9, 3, 4, 10, 11},
-	(i32 []){0, 12},
-	0,
-	(i32 []){0, 13},
-	(i32 []){0, 14},
+	(i32 []){0, 1},
+	(i32 []){2, 3, 4},
+	(i32 []){5, 6, 7, 8, 2, 3, 9, 10},
+	(i32 []){11},
+	0,
+	(i32 []){12},
+	(i32 []){13},
+	(i32 []){14},
 	(i32 []){15},
-	(i32 []){0, 16},
 };
 
 read_only global i32 beamformer_shader_header_vector_lengths[] = {
-	3,
-	4,
-	9,
 	2,
+	3,
+	8,
+	1,
 	0,
-	2,
-	2,
 	1,
-	2,
+	1,
+	1,
+	1,
 };
 
 read_only global s8 *beamformer_shader_bake_parameter_names[] = {
 	(s8 []){
-		s8_comp("DataKind"),
 		s8_comp("UseSharedMemory"),
 		s8_comp("DecodeMode"),
 		s8_comp("OutputChannelStride"),
@@ -794,10 +788,8 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
 		s8_comp("CooperativeMatrixK"),
 	},
 	(s8 []){
-		s8_comp("DataKind"),
 		s8_comp("Demodulate"),
 		s8_comp("ComplexFilter"),
-		s8_comp("OutputFloats"),
 		s8_comp("DecimationRate"),
 		s8_comp("FilterLength"),
 		s8_comp("InputChannelStride"),
@@ -812,7 +804,6 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
 		s8_comp("SamplingFrequency"),
 	},
 	(s8 []){
-		s8_comp("DataKind"),
 		s8_comp("CoherencyWeighting"),
 		s8_comp("SingleFocus"),
 		s8_comp("SingleOrientation"),
@@ -834,12 +825,8 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
 	},
 	0,
 	0,
+	0,
 	(s8 []){
-		s8_comp("DataKind"),
-	},
-	(s8 []){
-		s8_comp("InputDataKind"),
-		s8_comp("OutputDataKind"),
 		s8_comp("SizeX"),
 		s8_comp("SizeY"),
 		s8_comp("SizeZ"),
@@ -858,8 +845,8 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
 
 read_only global u32 beamformer_shader_bake_parameter_float_bits[] = {
 	0x00000000UL,
-	0x0000c000UL,
-	0x0007f000UL,
+	0x00003000UL,
+	0x0003f800UL,
 	0x00000000UL,
 	0x00000000UL,
 	0x00000000UL,
@@ -869,13 +856,13 @@ read_only global u32 beamformer_shader_bake_parameter_float_bits[] = {
 };
 
 read_only global u8 beamformer_shader_bake_parameter_counts[] = {
-	13,
-	16,
-	19,
+	12,
+	14,
+	18,
 	0,
 	0,
-	1,
-	13,
+	0,
+	11,
 	0,
 	0,
 };
diff --git a/lib/ogl_beamformer_lib.c b/lib/ogl_beamformer_lib.c
@@ -273,9 +273,7 @@ validate_parameters(BeamformerParameters *bp)
 function b32
 validate_pipeline(i32 *shaders, u32 shader_count, BeamformerDataKind data_kind)
 {
-	b32 data_kind_test = Between(data_kind, 0, BeamformerDataKind_Count - 1) &&
-	                     data_kind != BeamformerDataKind_Float16 &&
-	                     data_kind != BeamformerDataKind_Float16Complex;
+	b32 data_kind_test = Between(data_kind, 0, BeamformerDataKind_Count - 1);
 	if (!lib_error_check(data_kind_test, InvalidDataKind))
 		return 0;
 
diff --git a/math.c b/math.c
@@ -160,6 +160,30 @@ iv3_dimension(iv3 points)
 	return result;
 }
 
+function bv3
+iv3_equal(iv3 a, iv3 b) /* per-component comparison of a and b */
+{
+	bv3 result;
+	result.x = a.x == b.x; /* each field becomes 1 where the components match, 0 otherwise */
+	result.y = a.y == b.y;
+	result.z = a.z == b.z;
+	return result; /* reduce with bv3_all()/bv3_any() to get a single truth value */
+}
+
+function b32
+bv3_all(bv3 a) /* returns 1 only when x, y and z of a are all nonzero */
+{
+	b32 result = a.x != 0 && a.y != 0 && a.z != 0;
+	return result;
+}
+
+function b32
+bv3_any(bv3 a) /* returns 1 when at least one of x, y, z of a is nonzero */
+{
+	b32 result = a.x != 0 || a.y != 0 || a.z != 0;
+	return result;
+}
+
 function v2
 clamp_v2_rect(v2 v, Rect r)
 {
diff --git a/shaders/coherency_weighting.glsl b/shaders/coherency_weighting.glsl
@@ -15,10 +15,10 @@ layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Flo
 	vec2 values[];
 };
 
-#if   DataKind == DataKind_Float32
+#if   InputDataKind == DataKind_Float32
   #define COHERENT_SAMPLE(index)    Float32(left_side_buffer).values[index]
   #define INCOHERENT_SAMPLE(index)  Float32(right_side_buffer).values[index]
-#elif DataKind == DataKind_Float32Complex
+#elif InputDataKind == DataKind_Float32Complex
   #define COHERENT_SAMPLE(index)    Float32Complex(left_side_buffer).values[index]
   #define INCOHERENT_SAMPLE(index)  Float32(right_side_buffer).values[index]
 #else
diff --git a/shaders/das.glsl b/shaders/das.glsl
@@ -1,12 +1,12 @@
 /* See LICENSE for license details. */
-#if   DataKind == DataKind_Float32
+#if   InputDataKind == DataKind_Float32
   #if CoherencyWeighting
     #define RESULT_TYPE               vec2
     #define RESULT_COHERENT_CAST(a)   (a).x
     #define RESULT_INCOHERENT_CAST(a) (a).y
   #endif
   #define SAMPLE_TYPE f32
-#elif DataKind == DataKind_Float32Complex
+#elif InputDataKind == DataKind_Float32Complex
   #if CoherencyWeighting
     #define RESULT_TYPE               vec3
     #define RESULT_COHERENT_CAST(a)   (a).xy
@@ -14,7 +14,7 @@
   #endif
   #define SAMPLE_TYPE f32vec2
 #else
-  #error DataKind unsupported for DAS
+  #error InputDataKind unsupported for DAS
 #endif
 
 #ifndef RESULT_TYPE
@@ -32,7 +32,7 @@
 #endif
 
 layout(set = ShaderResourceKind_Buffer, binding = ShaderBufferSlot_PingPong) readonly buffer RF {
-	SAMPLE_TYPE rf[];
+	InputDataType rf[];
 };
 
 layout(std430, buffer_reference) restrict readonly buffer ArrayParameters {
@@ -40,7 +40,7 @@ layout(std430, buffer_reference) restrict readonly buffer ArrayParameters {
 };
 
 layout(std430, buffer_reference) buffer Output {
-	SAMPLE_TYPE x[];
+	OutputDataType x[];
 };
 
 layout(std430, buffer_reference) buffer IncoherentOutput {
@@ -52,7 +52,7 @@ layout(std430, buffer_reference) buffer IncoherentOutput {
 
 #define C_SPLINE 0.5
 
-#if DataKind == DataKind_Float32Complex
+#if InputDataKind == DataKind_Float32Complex
 vec2 rotate_iq(const vec2 iq, const float time)
 {
 	float arg    = radians(360) * DemodulationFrequency * time;
@@ -88,12 +88,12 @@ SAMPLE_TYPE cubic(const int offset, const float t)
 	SAMPLE_TYPE T1 = C_SPLINE * (P2 - samples[0]);
 	SAMPLE_TYPE T2 = C_SPLINE * (samples[3] - P1);
 
-	#if   DataKind == DataKind_Float32
+	#if   InputDataKind == DataKind_Float32
 	vec4 C = vec4(P1.x, P2.x, T1.x, T2.x);
-	float result = dot(S, h * C);
-	#elif DataKind == DataKind_Float32Complex
+	SAMPLE_TYPE result = dot(S, h * C);
+	#elif InputDataKind == DataKind_Float32Complex
 	mat2x4 C = mat2x4(vec4(P1.x, P2.x, T1.x, T2.x), vec4(P1.y, P2.y, T1.y, T2.y));
-	vec2 result = S * h * C;
+	SAMPLE_TYPE result = S * h * C;
 	#endif
 	return result;
 }
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -5,74 +5,57 @@
 #extension GL_KHR_memory_scope_semantics : require
 #endif
 
-#if   DataKind == DataKind_Float32
-  #define INPUT_DATA_TYPE  f32
-#elif DataKind == DataKind_Float16
-  #define INPUT_DATA_TYPE  f16
-#elif DataKind == DataKind_Int16
-  #define INPUT_DATA_TYPE  s16
-#else
-  #error unsupported data kind for Decode
-#endif
-
-#define SAMPLE_DATA_TYPE f32
-
 layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer RF {
-	INPUT_DATA_TYPE values[];
-};
-
-layout(std430, buffer_reference, buffer_reference_align = 64) restrict writeonly buffer OutputRF {
-	INPUT_DATA_TYPE values[];
+	InputDataType x[];
 };
 
 layout(std430, buffer_reference, buffer_reference_align = 64) restrict writeonly buffer Output {
-	SAMPLE_DATA_TYPE values[];
+	OutputDataType x[];
 };
 
 layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer Hadamard {
-	f16 values[];
+	f16 x[];
 };
 
-SAMPLE_DATA_TYPE sample_rf_data(uint index)
+OutputDataType sample_rf_data(u32 index)
 {
-	SAMPLE_DATA_TYPE result = SAMPLE_DATA_TYPE(RF(rf_buffer).values[index]);
+	OutputDataType result = OutputDataType(RF(rf_buffer).x[index]);
 	return result;
 }
 
 #if UseSharedMemory
 
-shared INPUT_DATA_TYPE rf[gl_WorkGroupSize.x * TransmitCount];
+shared InputDataType rf[gl_WorkGroupSize.y][TransmitCount];
 void run_decode_large(void)
 {
-	uint time_sample = gl_GlobalInvocationID.x;
-	uint channel     = gl_GlobalInvocationID.y;
-	uint transmit    = gl_GlobalInvocationID.z * ToProcess;
+	u32 transmit    = gl_GlobalInvocationID.x * ToProcess;
+	u32 channel     = gl_GlobalInvocationID.y;
+	u32 time_sample = gl_GlobalInvocationID.z;
 
-	uint thread_count = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
-	uint thread_index = gl_LocalInvocationIndex;
+	const u32 samples_per_thread = TransmitCount / gl_WorkGroupSize.x;
+	const u32 leftover_samples   = TransmitCount % gl_WorkGroupSize.x;
 
-	uint samples_per_thread  = rf.length() / thread_count;
-	uint leftover_samples    = rf.length() % thread_count;
-	uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples);
+	u32 thread_index_x      = gl_LocalInvocationID.x;
+	u32 samples_this_thread = samples_per_thread + u32(thread_index_x < leftover_samples);
 
-	u32 rf_offset = TransmitCount * ChunkChannelCount * gl_WorkGroupID.x * gl_WorkGroupSize.x + TransmitCount * channel;
+	u32 rf_offset = TransmitCount * ChunkChannelCount * gl_WorkGroupID.z + TransmitCount * channel;
 
-	for (uint i = 0; i < samples_this_thread; i++) {
-		uint index = i * thread_count + thread_index;
-		rf[index] = RF(rf_buffer).values[rf_offset + index];
+	for (u32 i = 0; i < samples_this_thread; i++) {
+		u32 index = i * gl_WorkGroupSize.x + thread_index_x;
+		rf[gl_LocalInvocationID.y][index] = RF(rf_buffer).x[rf_offset + index];
 	}
 
 	barrier();
 
-	SAMPLE_DATA_TYPE result[ToProcess];
+	OutputDataType result[ToProcess];
 	if (time_sample < OutputTransmitStride) {
-		for (uint i = 0; i < ToProcess; i++)
-			result[i] = SAMPLE_DATA_TYPE(0);
+		for (s32 i = 0; i < ToProcess; i++)
+			result[i] = OutputDataType(0);
 
-		for (int j = 0; j < TransmitCount; j++) {
-			SAMPLE_DATA_TYPE s = SAMPLE_DATA_TYPE(rf[gl_LocalInvocationID.x * TransmitCount + j]);
-			for (uint i = 0; i < ToProcess; i++)
-				result[i] += s * Hadamard(hadamard_buffer).values[TransmitCount * j + (i + transmit)];
+		for (s32 j = 0; j < TransmitCount; j++) {
+			OutputDataType s = OutputDataType(rf[gl_LocalInvocationID.y][j]);
+			for (s32 i = 0; i < ToProcess; i++)
+				result[i] += s * Hadamard(hadamard_buffer).x[TransmitCount * j + (i + transmit)];
 		}
 
 		for (uint i = 0; i < ToProcess; i++)
@@ -87,8 +70,8 @@ void run_decode_large(void)
 		               OutputSampleStride   * time_sample;
 
 		for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride)
-			if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount)
-				Output(output_buffer).values[out_off] = result[i];
+			if (TransmitCount % (gl_WorkGroupSize.x * ToProcess) == 0 || transmit + i < TransmitCount)
+				Output(output_buffer).x[out_off] = result[i];
 	}
 }
 #endif
@@ -116,12 +99,12 @@ void run_decode_coop(void)
 	for (u32 k = 0; k < TransmitCount; k += CooperativeMatrixK) {
 		u32 rf_tile_row = CooperativeMatrixM * tile_index.y;
 		u32 rf_tile_col = k;
-		coopMatLoad(rf_matrix, RF(rf_buffer).values, offset + TransmitCount * rf_tile_row + rf_tile_col,
+		coopMatLoad(rf_matrix, RF(rf_buffer).x, offset + TransmitCount * rf_tile_row + rf_tile_col,
 		            TransmitCount, gl_CooperativeMatrixLayoutRowMajor);
 
 		u32 hadamard_tile_row = k;
 		u32 hadamard_tile_col = CooperativeMatrixN * tile_index.x;
-		coopMatLoad(hadamard_matrix, Hadamard(hadamard_buffer).values,
+		coopMatLoad(hadamard_matrix, Hadamard(hadamard_buffer).x,
 		            TransmitCount * hadamard_tile_row + hadamard_tile_col, TransmitCount,
 		            gl_CooperativeMatrixLayoutRowMajor);
 
@@ -132,7 +115,7 @@ void run_decode_coop(void)
 		result[i] = result[i] / f32(TransmitCount);
 
 	Output out_buffer = Output(output_buffer);
-	coopMatStore(result, out_buffer.values, offset + TransmitCount * result_row + result_col,
+	coopMatStore(result, out_buffer.x, offset + TransmitCount * result_row + result_col,
 	             TransmitCount, gl_CooperativeMatrixLayoutRowMajor);
 	#endif
 }
@@ -145,18 +128,18 @@ void run_decode_small(void)
 	u32 rf_offset   = TransmitCount * ChunkChannelCount * time_sample + TransmitCount * channel;
 
 	if (time_sample < OutputTransmitStride) {
-		INPUT_DATA_TYPE rf[TransmitCount];
-		for (int j = 0; j < TransmitCount; j++)
-			rf[j] = RF(rf_buffer).values[rf_offset + j];
-
-		SAMPLE_DATA_TYPE result[TransmitCount];
-		for (int j = 0; j < TransmitCount; j++)
-			result[j] = SAMPLE_DATA_TYPE(0);
-
-		for (int i = 0; i < TransmitCount; i++) {
-			SAMPLE_DATA_TYPE s = SAMPLE_DATA_TYPE(rf[i]);
-			for (int j = 0; j < TransmitCount; j++) {
-				result[j] += s * Hadamard(hadamard_buffer).values[TransmitCount * i + j];
+		InputDataType rf[TransmitCount];
+		for (s32 j = 0; j < TransmitCount; j++)
+			rf[j] = RF(rf_buffer).x[rf_offset + j];
+
+		OutputDataType result[TransmitCount];
+		for (s32 j = 0; j < TransmitCount; j++)
+			result[j] = OutputDataType(0);
+
+		for (s32 i = 0; i < TransmitCount; i++) {
+			OutputDataType s = OutputDataType(rf[i]);
+			for (s32 j = 0; j < TransmitCount; j++) {
+				result[j] += s * Hadamard(hadamard_buffer).x[TransmitCount * i + j];
 			}
 		}
 
@@ -166,7 +149,7 @@ void run_decode_small(void)
 		uint out_off = OutputChannelStride  * channel +
 		               OutputSampleStride   * time_sample;
 		for (int i = 0; i < TransmitCount; i++, out_off += OutputTransmitStride)
-			Output(output_buffer).values[out_off] = result[i];
+			Output(output_buffer).x[out_off] = result[i];
 	}
 }
 
diff --git a/shaders/filter.glsl b/shaders/filter.glsl
@@ -1,63 +1,21 @@
 /* See LICENSE for license details. */
-#if   DataKind == DataKind_Float32Complex || (DataKind == DataKind_Float32 && Demodulate)
-  #define INPUT_TYPE  f32vec2
-  #define SAMPLE_TYPE f32vec2
-  #if BatchSampleCount
-    #define OUTPUT_TYPE f32
-  #else
-    #define OUTPUT_TYPE f32vec2
-  #endif
-#elif DataKind == DataKind_Float32
-  #define INPUT_TYPE  f32
-  #define SAMPLE_TYPE f32
-  #define OUTPUT_TYPE f32
-#elif DataKind == DataKind_Float16Complex || (DataKind == DataKind_Float16 && Demodulate)
-  #define INPUT_TYPE  f16vec2
-  #define SAMPLE_TYPE f16vec2
-  #if OutputFloats
-    #if BatchSampleCount
-      #define OUTPUT_TYPE f32
-    #else
-      #define OUTPUT_TYPE f32vec2
-    #endif
-  #else
-    #if BatchSampleCount
-      #define OUTPUT_TYPE f16
-    #else
-      #define OUTPUT_TYPE f16vec2
-    #endif
-  #endif
-#elif DataKind == DataKind_Float16
-  #define INPUT_TYPE  f16
-  #define SAMPLE_TYPE f16
-  #define OUTPUT_TYPE f16
-#elif DataKind == DataKind_Int16Complex   || (DataKind == DataKind_Int16   && Demodulate)
-  #define INPUT_TYPE  s16vec2
+#if  (InputDataKind == DataKind_Int16Complex          || \
+      (InputDataKind == DataKind_Int16 && Demodulate) || \
+      (InputDataKind == DataKind_Float16 && Demodulate))
   #define SAMPLE_TYPE f16vec2
-  #if OutputFloats
-    #if BatchSampleCount
-      #define OUTPUT_TYPE f32
-    #else
-      #define OUTPUT_TYPE f32vec2
-    #endif
-  #else
-    #if BatchSampleCount
-      #define OUTPUT_TYPE f16
-    #else
-      #define OUTPUT_TYPE f16vec2
-    #endif
-  #endif
-#elif DataKind == DataKind_Int16
-  #define INPUT_TYPE  s16
+#elif InputDataKind == DataKind_Int16
   #define SAMPLE_TYPE f16
-  #define OUTPUT_TYPE f16
-#else
-  #error unsupported data kind
+#elif InputDataKind == DataKind_Float32 && Demodulate
+  #define SAMPLE_TYPE f32vec2
+#endif
+
+#ifndef SAMPLE_TYPE
+  #define SAMPLE_TYPE InputDataType
 #endif
 
-#define ComplexSampleType (DataKind == DataKind_Float32Complex || \
-                           DataKind == DataKind_Float16Complex || \
-                           DataKind == DataKind_Int16Complex || \
+#define ComplexSampleType (InputDataKind == DataKind_Float32Complex || \
+                           InputDataKind == DataKind_Float16Complex || \
+                           InputDataKind == DataKind_Int16Complex   || \
                            Demodulate)
 #if ComplexSampleType
   #define RESULT_TYPE f32vec2
@@ -78,11 +36,11 @@
 #endif
 
 layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer Input {
-	INPUT_TYPE x[];
+	InputDataType x[];
 };
 
 layout(set = ShaderResourceKind_Buffer, binding = ShaderBufferSlot_PingPong) buffer Output {
-	OUTPUT_TYPE output_data[];
+	OutputDataType output_data[];
 };
 
 layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer Filter {
@@ -162,11 +120,11 @@ void main()
 
 		#if BatchSampleCount
 		// NOTE(rnp): deinterleave
-		output_data[output_element_offset + out_offset] = OUTPUT_TYPE(result.x);
+		output_data[output_element_offset + out_offset] = OutputDataType(result.x);
 		out_offset += BatchSampleCount;
-		output_data[output_element_offset + out_offset] = OUTPUT_TYPE(result.y);
+		output_data[output_element_offset + out_offset] = OutputDataType(result.y);
 		#else
-		output_data[output_element_offset + out_offset] = OUTPUT_TYPE(result);
+		output_data[output_element_offset + out_offset] = OutputDataType(result);
 		#endif
 	}
 }
diff --git a/shaders/reshape.glsl b/shaders/reshape.glsl
@@ -13,47 +13,21 @@
 #endif
 
 #if   OutputDataKind == DataKind_Float32Complex
-  #if Interleave
-    #define InterleaveWide 1
-    #define Output     Float32V4
-    #define OutputKind f32vec4
-  #else
-    #define Output     Float32Complex
-    #define OutputKind f32vec2
-  #endif
+  #define Output     Float32Complex
+  #define OutputKind f32vec2
 #elif OutputDataKind == DataKind_Float32
-  #if Interleave
-    #define Output     Float32Complex
-    #define OutputKind f32vec2
-  #else
-    #define Output     Float32
-    #define OutputKind f32
-  #endif
+  #define Output     Float32
+  #define OutputKind f32
 #elif OutputDataKind == DataKind_Float16Complex || OutputDataKind == DataKind_Int16Complex
-  #if Interleave
-    #define InterleaveWide 1
-    #define Output     Int16V4
-    #define OutputKind i16vec4
-  #else
-    #define Output     Int16Complex
-    #define OutputKind s16vec2
-  #endif
+  #define Output     Int16Complex
+  #define OutputKind s16vec2
 #elif OutputDataKind == DataKind_Float16 || OutputDataKind == DataKind_Int16
-  #if Interleave
-    #define Output     Int16Complex
-    #define OutputKind s16vec2
-  #else
-    #define Output     Int16
-    #define OutputKind s16
-  #endif
+  #define Output     Int16
+  #define OutputKind s16
 #else
   #error unsupported data kind for Reshape
 #endif
 
-#ifndef InterleaveWide
-  #define InterleaveWide 0
-#endif
-
 layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Int16 {
 	s16 x[];
 };
@@ -62,10 +36,6 @@ layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Int
 	s16vec2 x[];
 };
 
-layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Int16V4 {
-	i16vec4 x[];
-};
-
 layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Float32 {
 	f32 x[];
 };
@@ -74,10 +44,6 @@ layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Flo
 	f32vec2 x[];
 };
 
-layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Float32V4 {
-	f32vec4 x[];
-};
-
 void main(void)
 {
 	if (all(lessThan(gl_GlobalInvocationID, uvec3(SizeX, SizeY, SizeZ)))) {
@@ -90,16 +56,11 @@ void main(void)
 
 		OutputKind out_value = OutputKind(0);
 
-		#if Interleave && InterleaveWide
-			out_value.xy = Input(left_input_buffer).x[input_index];
-			out_value.zw = Input(right_input_buffer).x[input_index];
+		#if Interleave
+		out_value[0] = Input(left_input_buffer).x[input_index];
+		out_value[1] = Input(right_input_buffer).x[input_index];
 		#else
-			#if Interleave
-			out_value[0] = Input(left_input_buffer).x[input_index];
-			out_value[1] = Input(right_input_buffer).x[input_index];
-			#else
-			out_value = Input(left_input_buffer).x[input_index];
-			#endif
+		out_value = Input(left_input_buffer).x[input_index];
 		#endif
 
 		Output(output_buffer).x[output_index] = out_value;
diff --git a/util.h b/util.h
@@ -168,6 +168,9 @@ typedef u64      uptr;
 	asan_poison_region((v), sizeof(*(v))); \
 } while(0)
 
+#define DLLPushEnd(l, n) ((n)->prev = (l)->prev, ((l)->prev ? (l)->prev->next = (n) : (0)), (l)->prev = (n), (n)->next = (l))
+
+// TODO(rnp): cleanup
 #define DLLPushDown(v, list) do { \
 	(v)->next = (list);                   \
 	if ((v)->next) (v)->next->prev = (v); \
@@ -288,6 +291,11 @@ typedef union {
 } uv4;
 
 typedef union {
+	struct { b32 x, y, z; };
+	b32 E[3];
+} bv3;
+
+typedef union {
 	struct { f32 x, y; };
 	struct { f32 w, h; };
 	f32 E[2];

M	beamformer.meta	\|	45	+++++++++++++++------------------------------
M	beamformer_core.c	\|	582	+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M	beamformer_internal.h	\|	3	++-
M	generated/beamformer.meta.c	\|	101	++++++++++++++++++++++++++++++++++---------------------------------------------
M	lib/ogl_beamformer_lib.c	\|	4	+---
M	math.c	\|	24	++++++++++++++++++++++++
M	shaders/coherency_weighting.glsl	\|	4	++--
M	shaders/das.glsl	\|	20	++++++++++----------
M	shaders/decode.glsl	\|	101	+++++++++++++++++++++++++++++++++----------------------------------------------
M	shaders/filter.glsl	\|	78	++++++++++++++++++------------------------------------------------------------
M	shaders/reshape.glsl	\|	63	++++++++++++---------------------------------------------------
M	util.h	\|	8	++++++++