core/api: drop Cuda shader prefix, remove CudaDecode - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: 4f052280896bf6c43608992e9c14721ab2acbd97
Parent: 423409c940fc65a55e6306c9c2a482aec33f5d98
Author: Randy Palamar
Date:   Wed, 29 Apr 2026 14:41:43 -0600

core/api: drop Cuda shader prefix, remove CudaDecode

I would prefer that the operation of these be an implementation
detail rather than an API contract. If in the future there is a
compelling reason to use CUDA for decoding, that choice should be
made internally and not by the API user.

Diffstat:
M beamformer.meta  | 5 ++---
M beamformer_core.c  | 21 ++++++++-------------
M beamformer_internal.h  | 5 -----
M beamformer_shared_memory.c  | 2 +-
M generated/beamformer.meta.c  | 35 ++++++++++++++++-------------------
M lib/ogl_beamformer_lib.c  | 2 +-
M tests/decode.c  | 13 +++----------
M tests/throughput.c  | 10 ++--------
M ui.c  | 6 +++---

9 files changed, 36 insertions(+), 63 deletions(-)
diff --git a/beamformer.meta b/beamformer.meta
@@ -286,9 +286,6 @@
 
 @ShaderGroup Compute
 {
-	@Shader CudaDecode
-	@Shader CudaHilbert
-
 	@Shader(decode.glsl) Decode
 	{
 		@Enumeration DecodeMode
@@ -415,6 +412,8 @@
 	}
 
 	@Shader(min_max.glsl) MinMax
+
+	@Shader Hilbert
 }
 
 // NOTE: shaders which need to be baked into the beamforming pipeline
diff --git a/beamformer_core.c b/beamformer_core.c
@@ -297,18 +297,18 @@ push_compute_graph_node(BeamformerComputeGraphNode *root, BeamformerShaderKind k
 function void
 plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb, Arena scratch)
 {
-	b32 run_cuda_hilbert = 0;
-	b32 demodulate       = 0;
+	b32 run_hilbert = 0;
+	b32 demodulate  = 0;
 
 	for (u32 i = 0; i < pb->pipeline.shader_count; i++) {
 		switch (pb->pipeline.shaders[i]) {
-		case BeamformerShaderKind_CudaHilbert:{ run_cuda_hilbert = 1; }break;
-		case BeamformerShaderKind_Demodulate:{  demodulate = 1;       }break;
+		case BeamformerShaderKind_Hilbert:{run_hilbert = 1;}break;
+		case BeamformerShaderKind_Demodulate:{demodulate = 1;}break;
 		default:{}break;
 		}
 	}
 
-	if (demodulate) run_cuda_hilbert = 0;
+	if (demodulate) run_hilbert = 0;
 
 	f32 sampling_frequency = pb->parameters.sampling_frequency;
 	u32 input_sample_count = pb->parameters.sample_count;
@@ -330,7 +330,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb, A
 		sampling_frequency /= (2 * decimation_rate);
 	}
 
-	cp->iq_pipeline = beamformer_data_kind_complex[input_data_kind] || run_cuda_hilbert;
+	cp->iq_pipeline = beamformer_data_kind_complex[input_data_kind] || run_hilbert;
 
 	BeamformerDataKind das_data_kind = cp->iq_pipeline ? BeamformerDataKind_Float32Complex
 	                                                   : BeamformerDataKind_Float32;
@@ -367,14 +367,13 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb, A
 	for EachIndex(pb->pipeline.shader_count, it) {
 		// NOTE(rnp): skip unnecessary shaders
 		switch (pb->pipeline.shaders[it]) {
-		case BeamformerShaderKind_CudaHilbert:{if (!run_cuda_hilbert) continue;}break;
+		case BeamformerShaderKind_Hilbert:{if (!run_hilbert) continue;}break;
 
 		case BeamformerShaderKind_Decode:{
 			if (pb->parameters.decode_mode == BeamformerDecodeMode_None)
 				continue;
 		}break;
 
-		case BeamformerShaderKind_CudaDecode:
 		case BeamformerShaderKind_Sum:
 		case BeamformerShaderKind_MinMax:
 		{
@@ -1030,11 +1029,7 @@ do_compute_shader(BeamformerCtx *ctx, VulkanHandle cmd, BeamformerComputePlan *c
 		cc->ping_pong_input_index = !cc->ping_pong_input_index;
 	}break;
 
-	case BeamformerShaderKind_CudaDecode:{
-		cuda_decode(0, output_index, 0);
-		cc->ping_pong_input_index = !cc->ping_pong_input_index;
-	}break;
-	case BeamformerShaderKind_CudaHilbert:{
+	case BeamformerShaderKind_Hilbert:{
 		cuda_hilbert(input_index, output_index);
 		cc->ping_pong_input_index = !cc->ping_pong_input_index;
 	}break;
diff --git a/beamformer_internal.h b/beamformer_internal.h
@@ -225,10 +225,6 @@ CUDA_INIT_FN(cuda_init_stub) {}
 typedef CUDA_REGISTER_BUFFERS_FN(cuda_register_buffers_fn);
 CUDA_REGISTER_BUFFERS_FN(cuda_register_buffers_stub) {}
 
-#define CUDA_DECODE_FN(name) void name(size_t input_offset, u32 output_buffer_idx, u32 rf_channel_offset)
-typedef CUDA_DECODE_FN(cuda_decode_fn);
-CUDA_DECODE_FN(cuda_decode_stub) {}
-
 #define CUDA_HILBERT_FN(name) void name(u32 input_buffer_idx, u32 output_buffer_idx)
 typedef CUDA_HILBERT_FN(cuda_hilbert_fn);
 CUDA_HILBERT_FN(cuda_hilbert_stub) {}
@@ -238,7 +234,6 @@ typedef CUDA_SET_CHANNEL_MAPPING_FN(cuda_set_channel_mapping_fn);
 CUDA_SET_CHANNEL_MAPPING_FN(cuda_set_channel_mapping_stub) {}
 
 #define CUDALibraryProcedureList \
-	X(decode,              "cuda_decode")              \
 	X(hilbert,             "cuda_hilbert")             \
 	X(init,                "init_cuda_configuration")  \
 	X(register_buffers,    "register_cuda_buffers")    \
diff --git a/beamformer_shared_memory.c b/beamformer_shared_memory.c
@@ -1,5 +1,5 @@
 /* See LICENSE for license details. */
-#define BEAMFORMER_SHARED_MEMORY_VERSION (29UL)
+#define BEAMFORMER_SHARED_MEMORY_VERSION (30UL)
 
 typedef enum {
 	BeamformerWorkKind_Compute,
diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c
@@ -96,23 +96,22 @@ typedef enum {
 } BeamformerAcquisitionKind;
 
 typedef enum {
-	BeamformerShaderKind_CudaDecode         = 0,
-	BeamformerShaderKind_CudaHilbert        = 1,
-	BeamformerShaderKind_Decode             = 2,
-	BeamformerShaderKind_Filter             = 3,
-	BeamformerShaderKind_Demodulate         = 4,
-	BeamformerShaderKind_DAS                = 5,
-	BeamformerShaderKind_Sum                = 6,
-	BeamformerShaderKind_MinMax             = 7,
-	BeamformerShaderKind_CoherencyWeighting = 8,
-	BeamformerShaderKind_Reshape            = 9,
-	BeamformerShaderKind_BufferClear        = 10,
-	BeamformerShaderKind_RenderBeamformed   = 11,
+	BeamformerShaderKind_Decode             = 0,
+	BeamformerShaderKind_Filter             = 1,
+	BeamformerShaderKind_Demodulate         = 2,
+	BeamformerShaderKind_DAS                = 3,
+	BeamformerShaderKind_Sum                = 4,
+	BeamformerShaderKind_MinMax             = 5,
+	BeamformerShaderKind_Hilbert            = 6,
+	BeamformerShaderKind_CoherencyWeighting = 7,
+	BeamformerShaderKind_Reshape            = 8,
+	BeamformerShaderKind_BufferClear        = 9,
+	BeamformerShaderKind_RenderBeamformed   = 10,
 	BeamformerShaderKind_Count,
 
-	BeamformerShaderKind_ComputeFirst         = BeamformerShaderKind_CudaDecode,
-	BeamformerShaderKind_ComputeLast          = BeamformerShaderKind_MinMax,
-	BeamformerShaderKind_ComputeCount         = 8,
+	BeamformerShaderKind_ComputeFirst         = BeamformerShaderKind_Decode,
+	BeamformerShaderKind_ComputeLast          = BeamformerShaderKind_Hilbert,
+	BeamformerShaderKind_ComputeCount         = 7,
 	BeamformerShaderKind_ComputeHelpersFirst  = BeamformerShaderKind_CoherencyWeighting,
 	BeamformerShaderKind_ComputeHelpersLast   = BeamformerShaderKind_Reshape,
 	BeamformerShaderKind_ComputeHelpersCount  = 2,
@@ -525,14 +524,13 @@ read_only global s8 game_shader_buffer_slot_strings[] = {
 };
 
 read_only global s8 beamformer_shader_names[] = {
-	s8_comp("CudaDecode"),
-	s8_comp("CudaHilbert"),
 	s8_comp("Decode"),
 	s8_comp("Filter"),
 	s8_comp("Demodulate"),
 	s8_comp("DAS"),
 	s8_comp("Sum"),
 	s8_comp("MinMax"),
+	s8_comp("Hilbert"),
 	s8_comp("CoherencyWeighting"),
 	s8_comp("Reshape"),
 	s8_comp("BufferClear"),
@@ -564,14 +562,13 @@ read_only global s8 *beamformer_reloadable_shader_files[] = {
 };
 
 read_only global i32 beamformer_shader_reloadable_index_by_shader[] = {
-	-1,
-	-1,
 	0,
 	1,
 	1,
 	2,
 	3,
 	4,
+	-1,
 	5,
 	6,
 	7,
diff --git a/lib/ogl_beamformer_lib.c b/lib/ogl_beamformer_lib.c
@@ -654,7 +654,7 @@ beamformer_beamform_data(BeamformerSimpleParameters *bp, void *data, uint32_t da
 		b32 complex = 0;
 		for (u32 stage = 0; stage < bp->compute_stages_count; stage++) {
 			BeamformerShaderKind shader = (BeamformerShaderKind)bp->compute_stages[stage];
-			complex |= shader == BeamformerShaderKind_Demodulate || shader == BeamformerShaderKind_CudaHilbert;
+			complex |= shader == BeamformerShaderKind_Demodulate || shader == BeamformerShaderKind_Hilbert;
 		}
 
 		u64 output_size = output_points.x * output_points.y * output_points.z * sizeof(f32);
diff --git a/tests/decode.c b/tests/decode.c
@@ -17,7 +17,6 @@ read_only global u32 decode_transmit_counts[] = {
 
 typedef struct {
 	b32 loop;
-	b32 cuda;
 	b32 once;
 	b32 dump;
 	b32 full_aperture;
@@ -77,11 +76,10 @@ os_make_directory(char *name)
 function void
 usage(char *argv0)
 {
-	die("%s [--loop] [--once] [--full-aperture] [--cuda] [--warmup n] [--dump dir]\n"
+	die("%s [--loop] [--once] [--full-aperture] [--warmup n] [--dump dir]\n"
 	    "    --loop:          reupload data forever\n"
 	    "    --once:          only run a single frame\n"
 	    "    --full-aperture: recieve on full 256 channel aperture\n"
-	    "    --cuda:          use cuda for decoding\n"
 	    "    --warmup:        warmup with n runs\n"
 	    "    --dump:          dump output stats files to dir\n",
 	    argv0);
@@ -103,8 +101,6 @@ parse_argv(i32 argc, char *argv[])
 			result.loop = 1;
 		} else if (s8_equal(arg, s8("--full-aperture"))) {
 			result.full_aperture = 1;
-		} else if (s8_equal(arg, s8("--cuda"))) {
-			result.cuda = 1;
 		} else if (s8_equal(arg, s8("--dump"))) {
 			if (argc) {
 				result.outdir = *argv;
@@ -181,7 +177,6 @@ dump_stats(BeamformerComputeStatsTable *stats, Options *options, u32 transmit_co
 	char path_buffer[1024];
 	Stream sb = {.data = (u8 *)path_buffer, .cap = sizeof(path_buffer)};
 	stream_append_s8s(&sb, c_str_to_s8(options->outdir), s8(OS_PATH_SEPARATOR), s8("decode_"));
-	if (options->cuda) stream_append_s8(&sb, s8("cuda_"));
 	stream_append_u64(&sb, transmit_count);
 	stream_append_s8(&sb, s8(".bin"));
 	stream_append_byte(&sb, 0);
@@ -223,10 +218,8 @@ send_parameters(Options *options, u32 transmit_count)
 	};
 	beamformer_push_channel_mapping(channel_mapping, countof(channel_mapping));
 
-	i32 shader_stages[1];
-	if (options->cuda) shader_stages[0] = BeamformerShaderKind_CudaDecode;
-	else               shader_stages[0] = BeamformerShaderKind_Decode;
-	beamformer_push_pipeline(shader_stages, 1, BeamformerDataKind_Int16);
+	i32 shader_stages = BeamformerShaderKind_Decode;
+	beamformer_push_pipeline(&shader_stages, 1, BeamformerDataKind_Int16);
 	beamformer_set_global_timeout(1000);
 }
 
diff --git a/tests/throughput.c b/tests/throughput.c
@@ -21,7 +21,6 @@ global f32 g_f_number         = 0.5f;
 
 typedef struct {
 	b32 loop;
-	b32 cuda;
 	u32 frame_number;
 
 	char **remaining;
@@ -377,9 +376,8 @@ beamformer_simple_parameters_from_zbp_file(BeamformerSimpleParameters *bp, char 
 function void
 usage(char *argv0)
 {
-	die("%s [--loop] [--cuda] [--frame n] parameters_file\n"
+	die("%s [--loop] [--frame n] parameters_file\n"
 	    "    --loop:    reupload data forever\n"
-	    "    --cuda:    use cuda for decoding\n"
 	    "    --frame n: use frame n of the data for display\n",
 	    argv0);
 }
@@ -398,9 +396,6 @@ parse_argv(i32 argc, char *argv[])
 		if (s8_equal(arg, s8("--loop"))) {
 			shift(argv, argc);
 			result.loop = 1;
-		} else if (s8_equal(arg, s8("--cuda"))) {
-			shift(argv, argc);
-			result.cuda = 1;
 		} else if (s8_equal(arg, s8("--frame"))) {
 			shift(argv, argc);
 			if (argc) {
@@ -459,8 +454,7 @@ execute_study(Arena arena, Stream path, Options *options)
 	{
 		bp.compute_stages[bp.compute_stages_count++] = BeamformerShaderKind_Demodulate;
 	}
-	if (options->cuda) bp.compute_stages[bp.compute_stages_count++] = BeamformerShaderKind_CudaDecode;
-	else               bp.compute_stages[bp.compute_stages_count++] = BeamformerShaderKind_Decode;
+	bp.compute_stages[bp.compute_stages_count++] = BeamformerShaderKind_Decode;
 	bp.compute_stages[bp.compute_stages_count++] = BeamformerShaderKind_DAS;
 
 	{
diff --git a/ui.c b/ui.c
@@ -2928,9 +2928,9 @@ draw_compute_stats_view(BeamformerUI *ui, Arena arena, Variable *view, Rect r, v
 			cell_rect.size.w = t->widths[column];
 			text_spec.limits.size.w = r.size.w - (cell_rect.pos.x - it->start_x);
 
-			if (column == 0 && row_index < stages && vk_pipeline_valid(cp->vulkan_pipelines[row_index]) == 0 &&
-			    stats->table.shader_ids[row_index] != BeamformerShaderKind_CudaHilbert &&
-			    stats->table.shader_ids[row_index] != BeamformerShaderKind_CudaDecode)
+			if (column == 0 && row_index < stages &&
+			    vk_pipeline_valid(cp->vulkan_pipelines[row_index]) == 0 &&
+			    stats->table.shader_ids[row_index] != BeamformerShaderKind_Hilbert)
 			{
 				text_spec.colour = v4_lerp(FG_COLOUR, FOCUSED_COLOUR, ease_in_out_quartic(csv->blink.t));
 			} else {

M beamformer.meta  | 5 ++---
M beamformer_core.c  | 21 ++++++++-------------
M beamformer_internal.h  | 5 -----
M beamformer_shared_memory.c  | 2 +-
M generated/beamformer.meta.c  | 35 ++++++++++++++++-------------------
M lib/ogl_beamformer_lib.c  | 2 +-
M tests/decode.c  | 13 +++----------
M tests/throughput.c  | 10 ++--------
M ui.c  | 6 +++---