ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

ogl_beamformer_lib.c (26804B)


      1 /* See LICENSE for license details. */
      2 #include "../compiler.h"
      3 
      4 #define BEAMFORMER_IMPORT static
      5 
      6 #include "../beamformer.h"
      7 
      8 #include "../util.h"
      9 
     10 #include "../generated/beamformer.meta.c"
     11 #include "../beamformer_parameters.h"
     12 #include "ogl_beamformer_lib_base.h"
     13 
     14 #if OS_LINUX
     15 #include "../os_linux.c"
     16 #elif OS_WINDOWS
     17 #include "../os_win32.c"
     18 
     19 W32(iptr) OpenFileMappingA(u32, b32, c8 *);
     20 
     21 #else
     22 #error Unsupported Platform
     23 #endif
     24 
     25 #include "../util_os.c"
     26 #include "../beamformer_shared_memory.c"
     27 
/* Library-wide state shared by every function in this file. */
global struct {
	/* mapped shared memory region; stays 0 until check_shared_memory() attaches to it */
	BeamformerSharedMemory *bp;
	/* lock timeout used by the exported calls; set via beamformer_set_global_timeout() */
	i32                     timeout_ms;
	/* most recent failure recorded by lib_error_check_() */
	BeamformerLibErrorKind  last_error;
	/* length of the mapping `bp` points at, as reported by os_open_shared_memory_area() */
	i64                     shared_memory_size;
} g_beamformer_library_context;
     34 
     35 #if OS_LINUX
     36 
     37 function s8
     38 os_open_shared_memory_area(char *name)
     39 {
     40 	s8 result = {0};
     41 	i32 fd = shm_open(name, O_RDWR, S_IRUSR|S_IWUSR);
     42 	if (fd > 0) {
     43 		struct stat sb;
     44 		if (fstat(fd, &sb) != -1) {
     45 			void *new = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
     46 			if (new != MAP_FAILED) {
     47 				result.data = new;
     48 				result.len  = sb.st_size;
     49 			}
     50 		}
     51 		close(fd);
     52 	}
     53 	return result;
     54 }
     55 
     56 function void
     57 os_close_shared_memory_area(void *memory, i64 size)
     58 {
     59 	munmap(memory, size);
     60 }
     61 
     62 #elif OS_WINDOWS
     63 
     64 W32(u64) VirtualQuery(void *base_address, void *memory_basic_info, u64 memory_basic_info_size);
     65 W32(b32) UnmapViewOfFile(void *);
     66 
     67 function b32
     68 os_reserve_region_locks(void)
     69 {
     70 	u8 buffer[1024];
     71 	Stream sb = {.data = buffer, .cap = countof(buffer)};
     72 	stream_append_s8(&sb, s8(OS_SHARED_MEMORY_NAME "_lock_"));
     73 
     74 	i32 start_index    = sb.widx;
     75 	u32 reserved_count = 0;
     76 	for EachElement(os_w32_shared_memory_semaphores, it) {
     77 		stream_reset(&sb, start_index);
     78 		stream_append_u64(&sb, it);
     79 		stream_append_byte(&sb, 0);
     80 		os_w32_shared_memory_semaphores[it] = os_w32_create_semaphore((c8 *)sb.data, 1, 1);
     81 		if InvalidHandle(os_w32_shared_memory_semaphores[it])
     82 			break;
     83 		reserved_count++;
     84 	}
     85 
     86 	b32 result = reserved_count == countof(os_w32_shared_memory_semaphores);
     87 	if (!result) {
     88 		for (u32 i = 0; i < reserved_count; i++)
     89 			CloseHandle(os_w32_shared_memory_semaphores[i].value[0]);
     90 	}
     91 
     92 	return result;
     93 }
     94 
     95 function s8
     96 os_open_shared_memory_area(char *name)
     97 {
     98 	struct alignas(16) {
     99 		void *BaseAddress;
    100 		void *AllocationBase;
    101 		u32   AllocationProtect;
    102 		u32   __alignment1;
    103 		u64   RegionSize;
    104 		u32   State;
    105 		u32   Protect;
    106 		u32   Type;
    107 		u32   __alignment2;
    108 	} memory_basic_info;
    109 
    110 	s8 result = {0};
    111 	iptr h = OpenFileMappingA(FILE_MAP_ALL_ACCESS, 0, name);
    112 	if (h != INVALID_FILE) {
    113 		// NOTE(rnp): a size of 0 maps the whole region, we can determine its size after
    114 		void *new = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, 0);
    115 		if (new &&
    116 		    VirtualQuery(new, &memory_basic_info, sizeof(memory_basic_info)) == sizeof(memory_basic_info) &&
    117 		    os_reserve_region_locks())
    118 		{
    119 			result.data = new;
    120 			result.len  = (i64)memory_basic_info.RegionSize;
    121 		}
    122 
    123 		if (new && !result.data)
    124 			UnmapViewOfFile(new);
    125 
    126 		CloseHandle(h);
    127 	}
    128 	return result;
    129 }
    130 
    131 function void
    132 os_close_shared_memory_area(void *memory, i64 size)
    133 {
    134 	UnmapViewOfFile(memory);
    135 }
    136 
    137 #endif
    138 
    139 #define lib_error_check(c, e) lib_error_check_(c, BeamformerLibErrorKind_##e)
    140 function b32
    141 lib_error_check_(b32 condition, BeamformerLibErrorKind error_kind)
    142 {
    143 	b32 result = condition;
    144 	if (!result) g_beamformer_library_context.last_error = error_kind;
    145 	assert(result);
    146 	return result;
    147 }
    148 
    149 function b32
    150 check_shared_memory(void)
    151 {
    152 	b32 result = g_beamformer_library_context.bp != 0;
    153 	if unlikely(!g_beamformer_library_context.bp) {
    154 		s8 shared_memory = os_open_shared_memory_area(OS_SHARED_MEMORY_NAME);
    155 		if (lib_error_check(shared_memory.data != 0, SharedMemory)) {
    156 			BeamformerSharedMemory *bp = (BeamformerSharedMemory *)shared_memory.data;
    157 			result = lib_error_check(bp->version == BEAMFORMER_SHARED_MEMORY_VERSION, VersionMismatch);
    158 			if (result) {
    159 				g_beamformer_library_context.bp                 = bp;
    160 				g_beamformer_library_context.shared_memory_size = shared_memory.len;
    161 			} else {
    162 				os_close_shared_memory_area(shared_memory.data, shared_memory.len);
    163 			}
    164 		}
    165 	}
    166 
    167 	if likely(g_beamformer_library_context.bp)
    168 		result = lib_error_check(likely(!g_beamformer_library_context.bp->invalid), InvalidAccess);
    169 	return result;
    170 }
    171 
    172 function b32
    173 valid_parameter_block(u32 block)
    174 {
    175 	b32 result = check_shared_memory();
    176 	if (result) {
    177 		result = lib_error_check(block < g_beamformer_library_context.bp->reserved_parameter_blocks,
    178 		                         ParameterBlockUnallocated);
    179 	}
    180 	return result;
    181 }
    182 
    183 function BeamformWork *
    184 try_push_work_queue(void)
    185 {
    186 	BeamformWork *result = beamform_work_queue_push(&g_beamformer_library_context.bp->external_work_queue);
    187 	lib_error_check(result != 0, WorkQueueFull);
    188 	return result;
    189 }
    190 
    191 function b32
    192 lib_try_lock(i32 lock, i32 timeout_ms)
    193 {
    194 	b32 result = beamformer_shared_memory_take_lock(g_beamformer_library_context.bp, lock, (u32)timeout_ms);
    195 	lib_error_check(result, SyncVariable);
    196 	return result;
    197 }
    198 
    199 function void
    200 lib_release_lock(i32 lock)
    201 {
    202 	beamformer_shared_memory_release_lock(g_beamformer_library_context.bp, lock);
    203 }
    204 
    205 u32
    206 beamformer_get_api_version(void)
    207 {
    208 	return BEAMFORMER_SHARED_MEMORY_VERSION;
    209 }
    210 
    211 const char *
    212 beamformer_error_string(BeamformerLibErrorKind kind)
    213 {
    214 	#define X(type, num, string) string,
    215 	local_persist const char *error_string_table[] = {BEAMFORMER_LIB_ERRORS "invalid error kind"};
    216 	#undef X
    217 	return error_string_table[MIN(kind, countof(error_string_table) - 1)];
    218 }
    219 
    220 BeamformerLibErrorKind
    221 beamformer_get_last_error(void)
    222 {
    223 	return g_beamformer_library_context.last_error;
    224 }
    225 
    226 const char *
    227 beamformer_get_last_error_string(void)
    228 {
    229 	return beamformer_error_string(beamformer_get_last_error());
    230 }
    231 
    232 void
    233 beamformer_set_global_timeout(u32 timeout_ms)
    234 {
    235 	g_beamformer_library_context.timeout_ms = timeout_ms;
    236 }
    237 
    238 b32
    239 beamformer_reserve_parameter_blocks(uint32_t count)
    240 {
    241 	b32 result = 0;
    242 	if (check_shared_memory() &&
    243 	    lib_error_check(count <= BeamformerMaxParameterBlocks, ParameterBlockOverflow))
    244 	{
    245 		g_beamformer_library_context.bp->reserved_parameter_blocks = count;
    246 		result = 1;
    247 	}
    248 	return result;
    249 }
    250 
    251 function b32
    252 validate_parameters(BeamformerParameters *bp)
    253 {
    254 	if (!lib_error_check(Between(bp->contrast_mode, 0, BeamformerContrastMode_Count - 1), InvalidContrastMode))
    255 		return 0;
    256 
    257 	u32 contrast_raw_sample_count = bp->acquisition_count * bp->sample_count * beamformer_contrast_mode_samples[bp->contrast_mode];
    258 	if (!lib_error_check(contrast_raw_sample_count <= bp->raw_data_dimensions.x, DataSizeMismatch))
    259 		return 0;
    260 
    261 	// NOTE(rnp): frame size checks
    262 	{
    263 		// TODO(rnp): this check is overly conservative, what if we are exporting something smaller than Float32Complex
    264 		u64 buffer_size     = g_beamformer_library_context.bp->beamformed_frame_buffer_size;
    265 		u64 frame_size      = Max(1, bp->output_points.x) * Max(1, bp->output_points.y) * Max(1, bp->output_points.z)
    266 		                      * beamformer_data_kind_byte_size[BeamformerDataKind_Float32Complex];
    267 		u64 incoherent_size = frame_size / 2;
    268 		if (bp->coherency_weighting)
    269 			buffer_size -= incoherent_size;
    270 
    271 		if (!lib_error_check(frame_size <= buffer_size, FrameSizeOverflow))
    272 			return 0;
    273 	}
    274 
    275 	return 1;
    276 }
    277 
    278 function b32
    279 validate_pipeline(i32 *shaders, u32 shader_count, BeamformerDataKind data_kind)
    280 {
    281 	b32 data_kind_test = Between(data_kind, 0, BeamformerDataKind_Count - 1);
    282 	if (!lib_error_check(data_kind_test, InvalidDataKind))
    283 		return 0;
    284 
    285 	if (!lib_error_check(shader_count <= BeamformerMaxComputeShaderStages, ComputeStageOverflow))
    286 		return 0;
    287 
    288 	for (u32 i = 0; i < shader_count; i++) {
    289 		b32 stage_test = Between(shaders[i], BeamformerShaderKind_ComputeFirst, BeamformerShaderKind_ComputeLast);
    290 		if (!lib_error_check(stage_test, InvalidComputeStage))
    291 			return 0;
    292 
    293 		if (shaders[i] == BeamformerShaderKind_Hilbert &&
    294 		    !lib_error_check(g_beamformer_library_context.bp->capabilities.hilbert != 0, InvalidComputeStage))
    295 			return 0;
    296 
    297 		if (shaders[i] == BeamformerShaderKind_Demodulate &&
    298 		    !lib_error_check(!beamformer_data_kind_complex[data_kind], InvalidDemodulationDataKind))
    299 		{
    300 			return 0;
    301 		}
    302 	}
    303 
    304 	b32 start_stage_test = shaders[0] == BeamformerShaderKind_Demodulate ||
    305 	                       shaders[0] == BeamformerShaderKind_Decode;
    306 	if (!lib_error_check(start_stage_test, InvalidStartShader))
    307 		return 0;
    308 
    309 	return 1;
    310 }
    311 
    312 u64
    313 beamformer_maximum_rf_data_size(void)
    314 {
    315 	u64 result = U64_MAX;
    316 	if (check_shared_memory()) {
    317 		Arena sm = beamformer_shared_memory_scratch_arena(g_beamformer_library_context.bp,
    318 		                                                  g_beamformer_library_context.shared_memory_size);
    319 		result = Min((u64)arena_capacity(&sm, u8), g_beamformer_library_context.bp->capabilities.max_rf_data_size);
    320 	}
    321 	return result;
    322 }
    323 
    324 u64
    325 beamformer_maximum_frames_for_parameters(BeamformerParameters *bp)
    326 {
    327 	u64 result = U64_MAX;
    328 	if (check_shared_memory() && validate_parameters(bp)) {
    329 		// TODO(rnp): overly conservative frame size check
    330 		u64 buffer_size     = g_beamformer_library_context.bp->beamformed_frame_buffer_size;
    331 		u64 frame_size      = Max(1, bp->output_points.x) * Max(1, bp->output_points.y) * Max(1, bp->output_points.z)
    332 		                      * beamformer_data_kind_byte_size[BeamformerDataKind_Float32Complex];
    333 		u64 incoherent_size = frame_size / 2;
    334 		if (bp->coherency_weighting)
    335 			buffer_size -= incoherent_size;
    336 		result = buffer_size / frame_size;
    337 	}
    338 	return result;
    339 }
    340 
    341 u64
    342 beamformer_maximum_frames_for_simple_parameters(BeamformerSimpleParameters *bp)
    343 {
    344 	u64 result = beamformer_maximum_frames_for_parameters((BeamformerParameters *)bp);
    345 	return result;
    346 }
    347 
    348 function b32
    349 parameter_block_region_upload(void *data, u32 size, u32 block, BeamformerParameterBlockRegions region_id,
    350                               u32 block_offset, i32 timeout_ms)
    351 {
    352 	i32 lock   = BeamformerSharedMemoryLockKind_Count + (i32)block;
    353 	b32 result = valid_parameter_block(block) && lib_try_lock(lock, timeout_ms);
    354 	if (result) {
    355 		mem_copy((u8 *)beamformer_parameter_block(g_beamformer_library_context.bp, block) + block_offset,
    356 		         data, size);
    357 		mark_parameter_block_region_dirty(g_beamformer_library_context.bp, block, region_id);
    358 		lib_release_lock(lock);
    359 	}
    360 	return result;
    361 }
    362 
    363 b32
    364 beamformer_set_pipeline_stage_parameters_at(u32 stage_index, i32 parameter, u32 block)
    365 {
    366 	u32 offset  = BeamformerParameterBlockRegionOffsets[BeamformerParameterBlockRegion_ComputePipeline];
    367 	offset     += offsetof(BeamformerComputePipeline, parameters);
    368 	offset     += (stage_index % BeamformerMaxComputeShaderStages) * sizeof(BeamformerShaderParameters);
    369 	b32 result  = parameter_block_region_upload(&parameter, sizeof(BeamformerShaderParameters), block,
    370 	                                            BeamformerParameterBlockRegion_ComputePipeline, offset,
    371 	                                            g_beamformer_library_context.timeout_ms);
    372 	return result;
    373 }
    374 
    375 b32
    376 beamformer_set_pipeline_stage_parameters(u32 stage_index, i32 parameter)
    377 {
    378 	b32 result = beamformer_set_pipeline_stage_parameters_at(stage_index, parameter, 0);
    379 	return result;
    380 }
    381 
    382 b32
    383 beamformer_push_pipeline_at(i32 *shaders, u32 shader_count, BeamformerDataKind data_kind, u32 block)
    384 {
    385 	b32 result = 0;
    386 	if (check_shared_memory() && validate_pipeline(shaders, shader_count, data_kind)) {
    387 		i32 lock = BeamformerSharedMemoryLockKind_Count + (i32)block;
    388 		if (valid_parameter_block(block) && lib_try_lock(lock, g_beamformer_library_context.timeout_ms)) {
    389 			BeamformerParameterBlock *b = beamformer_parameter_block(g_beamformer_library_context.bp, block);
    390 			mem_copy(&b->pipeline.shaders, shaders, shader_count * sizeof(*shaders));
    391 			mark_parameter_block_region_dirty(g_beamformer_library_context.bp, block,
    392 			                                  BeamformerParameterBlockRegion_ComputePipeline);
    393 			b->pipeline.shader_count = shader_count;
    394 			b->pipeline.data_kind    = data_kind;
    395 			lib_release_lock(lock);
    396 			result = 1;
    397 		}
    398 	}
    399 	return result;
    400 }
    401 
    402 b32
    403 beamformer_push_pipeline(i32 *shaders, u32 shader_count, BeamformerDataKind data_kind)
    404 {
    405 	b32 result = beamformer_push_pipeline_at(shaders, shader_count, data_kind, 0);
    406 	return result;
    407 }
    408 
    409 b32
    410 beamformer_create_filter(BeamformerFilterParameters *filter, u8 filter_slot, u8 parameter_block)
    411 {
    412 	b32 result = 0;
    413 	if (lib_error_check(filter->kind >= 0 && filter->kind < BeamformerFilterKind_Count, InvalidFilterKind)) {
    414 		if (check_shared_memory()) {
    415 			BeamformWork *work = try_push_work_queue();
    416 			if (work) {
    417 				BeamformerCreateFilterContext *ctx = &work->create_filter_context;
    418 				work->kind = BeamformerWorkKind_CreateFilter;
    419 				ctx->parameters      = *filter;
    420 				ctx->filter_slot     = filter_slot     % BeamformerFilterSlots;
    421 				ctx->parameter_block = parameter_block % BeamformerMaxParameterBlocks;
    422 				beamform_work_queue_push_commit(&g_beamformer_library_context.bp->external_work_queue);
    423 				result = 1;
    424 			}
    425 		}
    426 	}
    427 	return result;
    428 }
    429 
    430 function void
    431 beamformer_flush_commands(void)
    432 {
    433 	i32 lock = BeamformerSharedMemoryLockKind_DispatchCompute;
    434 	beamformer_shared_memory_take_lock(g_beamformer_library_context.bp, lock, 0);
    435 }
    436 
    437 #define BEAMFORMER_UPLOAD_FNS \
    438 	X(channel_mapping,               i16, 1, ChannelMapping) \
    439 	X(focal_vectors,                 f32, 2, FocalVectors)   \
    440 	X(sparse_elements,               i16, 1, SparseElements) \
    441 	X(transmit_receive_orientations, u8,  1, TransmitReceiveOrientations)
    442 
    443 #define X(name, dtype, elements, region_name) \
    444 b32 beamformer_push_##name ##_at(dtype *data, u32 count, u32 block) { \
    445 	b32 result = 0; \
    446 	if (lib_error_check(count <= countof(((BeamformerParameterBlock *)0)->name), BufferOverflow)) { \
    447 		result = parameter_block_region_upload(data, count * elements * sizeof(dtype), block, \
    448 		                                       BeamformerParameterBlockRegion_##region_name,  \
    449 		                                       offsetof(BeamformerParameterBlock, name),      \
    450 		                                       g_beamformer_library_context.timeout_ms);      \
    451 	} \
    452 	return result; \
    453 }
    454 BEAMFORMER_UPLOAD_FNS
    455 #undef X
    456 
    457 #define X(name, dtype, ...) \
    458 b32 beamformer_push_##name (dtype *data, u32 count) { \
    459 	b32 result = beamformer_push_##name ##_at(data, count, 0); \
    460 	return result; \
    461 }
    462 BEAMFORMER_UPLOAD_FNS
    463 #undef X
    464 
    465 #define BEAMFORMER_REDUCE_A1S2_CONTRAST_FN(name) void name(void *restrict output_v, \
    466                                                            void *restrict input_v, \
    467                                                            u32 sample_count)
    468 typedef BEAMFORMER_REDUCE_A1S2_CONTRAST_FN(beamformer_reduce_a1s2_contrast_fn);
    469 
    470 #define BEAMFORMER_REDUCE_A1S2_CONTRAST_LIST \
    471 	X(i16) \
    472 	X(f32) \
    473 	X(f16) \
    474 
    475 static_assert(BeamformerDataKind_Float16Complex == (BeamformerDataKind_Count - 1), "");
    476 
    477 #define X(type, ...) \
    478 function BEAMFORMER_REDUCE_A1S2_CONTRAST_FN(beamformer_reduce_a1s2_contrast_##type) \
    479 { \
    480 	type *input_a = (type *)input_v + 0 * sample_count; \
    481 	type *input_b = (type *)input_v + 1 * sample_count; \
    482 	type *input_c = (type *)input_v + 2 * sample_count; \
    483 	type *output  = (type *)output_v; \
    484 	for (u32 sample = 0; sample < sample_count; sample++) \
    485 		output[sample] = input_a[sample] - input_b[sample] - input_c[sample]; \
    486 }
    487 BEAMFORMER_REDUCE_A1S2_CONTRAST_LIST
    488 #undef X
    489 
    490 function b32
    491 beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, u32 block)
    492 {
    493 	b32 result = 0;
    494 	Arena scratch = beamformer_shared_memory_scratch_arena(g_beamformer_library_context.bp,
    495 	                                                       g_beamformer_library_context.shared_memory_size);
    496 	BeamformerParameterBlock *b  = beamformer_parameter_block(g_beamformer_library_context.bp, block);
    497 	BeamformerParameters     *bp = &b->parameters;
    498 	BeamformerDataKind     data_kind     = b->pipeline.data_kind;
    499 	BeamformerContrastMode contrast_mode = bp->contrast_mode;
    500 
    501 
    502 	u64 max_rf_size = g_beamformer_library_context.bp->capabilities.max_rf_data_size;
    503 	u32 rf_size     = bp->acquisition_count * bp->sample_count * bp->channel_count * beamformer_data_kind_byte_size[data_kind];
    504 	u32 raw_size    = bp->raw_data_dimensions.x * bp->raw_data_dimensions.y * beamformer_data_kind_byte_size[data_kind];
    505 
    506 	// TODO(rnp): support multi push upload so that max_rf_size is actual limit
    507 	if (lib_error_check(rf_size <= arena_capacity(&scratch, u8), BufferOverflow) &&
    508 	    lib_error_check(rf_size <= max_rf_size, RFDataSizeOverflow) &&
    509 	    lib_error_check(rf_size <= data_size && data_size == raw_size, DataSizeMismatch))
    510 	{
    511 		if (lib_try_lock(BeamformerSharedMemoryLockKind_UploadRF, timeout_ms)) {
    512 			if (lib_try_lock(BeamformerSharedMemoryLockKind_ScratchSpace, 0)) {
    513 				u32 channel_count      = bp->channel_count;
    514 				u32 out_channel_stride = beamformer_data_kind_byte_size[data_kind] * bp->sample_count * bp->acquisition_count;
    515 				u32 in_channel_stride  = beamformer_data_kind_byte_size[data_kind] * bp->raw_data_dimensions.x;
    516 
    517 				for (u32 channel = 0; channel < channel_count; channel++) {
    518 					u16 data_channel = (u16)b->channel_mapping[channel];
    519 					u32 out_off = out_channel_stride * channel;
    520 					u32 in_off  = in_channel_stride  * data_channel;
    521 					switch (contrast_mode) {
    522 					default:{
    523 						/* NOTE(rnp): non temporal copy would be better, but we can't ensure
    524 						 * 64 byte boundaries. */
    525 						memory_copy(scratch.beg + out_off, (u8 *)data + in_off, out_channel_stride);
    526 					}break;
    527 
    528 					case BeamformerContrastMode_A1S2:{
    529 						read_only local_persist u8 reduce_a1s2_index_map[] = {
    530 							[BeamformerDataKind_Int16]          = 0,
    531 							[BeamformerDataKind_Int16Complex]   = 0,
    532 							[BeamformerDataKind_Float32]        = 1,
    533 							[BeamformerDataKind_Float32Complex] = 1,
    534 							[BeamformerDataKind_Float16]        = 2,
    535 							[BeamformerDataKind_Float16Complex] = 2,
    536 						};
    537 						static_assert(BeamformerDataKind_Float16Complex == (BeamformerDataKind_Count - 1), "");
    538 
    539 						read_only local_persist beamformer_reduce_a1s2_contrast_fn *reduce_a1s2_fn_table[] = {
    540 							#define X(type, ...) beamformer_reduce_a1s2_contrast_##type,
    541 							BEAMFORMER_REDUCE_A1S2_CONTRAST_LIST
    542 							#undef X
    543 						};
    544 
    545 						// TODO(rnp): HACK: for some unknown reason loading contrast data after loading
    546 						// non-contrast data causes the dataset to not be stored correctly (it looks
    547 						// like mix of the old and new dataset). Putting this here fixes the issue.
    548 						// Counter-intuitively this improves throughput on my zen4 test computer,
    549 						// however it obviously should not be needed.
    550 						memory_clear(scratch.beg + out_off, 0, out_channel_stride);
    551 
    552 						u32 sample_count = bp->sample_count * beamformer_data_kind_element_count[data_kind];
    553 						reduce_a1s2_fn_table[reduce_a1s2_index_map[data_kind]](scratch.beg + out_off,
    554 						                                                       (u8 *)data + in_off,
    555 						                                                       sample_count);
    556 					}break;
    557 					}
    558 				}
    559 
    560 				lib_release_lock(BeamformerSharedMemoryLockKind_ScratchSpace);
    561 				/* TODO(rnp): need a better way to communicate this */
    562 				u64 rf_block_rf_size = (u64)block << 32ULL | (u64)rf_size;
    563 				atomic_store_u64(&g_beamformer_library_context.bp->rf_block_rf_size, rf_block_rf_size);
    564 				result = 1;
    565 			}
    566 		}
    567 	}
    568 	return result;
    569 }
    570 
    571 b32
    572 beamformer_push_data_with_compute(void *data, u32 data_size, u32 image_plane_tag, u32 parameter_slot)
    573 {
    574 	b32 result = 0;
    575 	if (check_shared_memory()) {
    576 		u32 reserved_blocks = g_beamformer_library_context.bp->reserved_parameter_blocks;
    577 		if (lib_error_check(image_plane_tag < BeamformerViewPlaneTag_Count, InvalidImagePlane) &&
    578 		    lib_error_check(parameter_slot < reserved_blocks, ParameterBlockUnallocated) &&
    579 		    beamformer_push_data_base(data, data_size, g_beamformer_library_context.timeout_ms, parameter_slot))
    580 		{
    581 			BeamformWork *work = try_push_work_queue();
    582 			if (work) {
    583 				work->kind = BeamformerWorkKind_ComputeIndirect;
    584 				work->compute_context.view_plane      = image_plane_tag;
    585 				work->compute_context.parameter_block = parameter_slot;
    586 				beamform_work_queue_push_commit(&g_beamformer_library_context.bp->external_work_queue);
    587 				beamformer_flush_commands();
    588 				result = 1;
    589 			}
    590 		}
    591 	}
    592 	return result;
    593 }
    594 
    595 b32
    596 beamformer_push_parameters_at(BeamformerParameters *bp, u32 block)
    597 {
    598 	b32 result = check_shared_memory() && validate_parameters(bp);
    599 	if (result) {
    600 		result = parameter_block_region_upload(bp, sizeof(*bp), block,
    601 		                                       BeamformerParameterBlockRegion_Parameters,
    602 		                                       offsetof(BeamformerParameterBlock, parameters),
    603 		                                       g_beamformer_library_context.timeout_ms);
    604 		if (result) {
    605 			BeamformerParameterBlock *pb = beamformer_parameter_block(g_beamformer_library_context.bp, block);
    606 			atomic_or_u32(&pb->region_update_flags, 1u << BeamformerParameterRegionFlag_NotifyUI);
    607 		}
    608 	}
    609 	return result;
    610 }
    611 
    612 b32
    613 beamformer_push_parameters(BeamformerParameters *bp)
    614 {
    615 	b32 result = beamformer_push_parameters_at(bp, 0);
    616 	return result;
    617 }
    618 
    619 b32
    620 beamformer_push_simple_parameters_at(BeamformerSimpleParameters *bp, u32 block)
    621 {
    622 	b32 result = check_shared_memory();
    623 	if (result) {
    624 		alignas(64) v2 focal_vectors[countof(bp->steering_angles)];
    625 		for (u32 i = 0; i < countof(bp->steering_angles); i++)
    626 			focal_vectors[i] = (v2){{bp->steering_angles[i], bp->focal_depths[i]}};
    627 
    628 		result &= beamformer_push_parameters_at((BeamformerParameters *)bp, block);
    629 		result &= beamformer_push_pipeline_at(bp->compute_stages, bp->compute_stages_count, (BeamformerDataKind)bp->data_kind, block);
    630 		result &= beamformer_push_channel_mapping_at(bp->channel_mapping, bp->channel_count, block);
    631 		result &= beamformer_push_focal_vectors_at((f32 *)focal_vectors, countof(focal_vectors), block);
    632 		result &= beamformer_push_transmit_receive_orientations_at(bp->transmit_receive_orientations,
    633 		                                                           bp->acquisition_count, block);
    634 
    635 		if (bp->acquisition_kind == BeamformerAcquisitionKind_UFORCES ||
    636 		    bp->acquisition_kind == BeamformerAcquisitionKind_UHERCULES)
    637 		{
    638 			result &= beamformer_push_sparse_elements_at(bp->sparse_elements, bp->acquisition_count, block);
    639 		}
    640 
    641 		for (u32 stage = 0; stage < bp->compute_stages_count; stage++)
    642 			result &= beamformer_set_pipeline_stage_parameters_at(stage, bp->compute_stage_parameters[stage], block);
    643 	}
    644 	return result;
    645 }
    646 
    647 b32
    648 beamformer_push_simple_parameters(BeamformerSimpleParameters *bp)
    649 {
    650 	b32 result = beamformer_push_simple_parameters_at(bp, 0);
    651 	return result;
    652 }
    653 
    654 function b32
    655 beamformer_export_buffer(BeamformerExportContext export_context)
    656 {
    657 	BeamformWork *work = try_push_work_queue();
    658 	b32 result = work && lib_try_lock(BeamformerSharedMemoryLockKind_ExportSync, 0);
    659 	if (result) {
    660 		work->export_context = export_context;
    661 		work->kind = BeamformerWorkKind_ExportBuffer;
    662 		work->lock = BeamformerSharedMemoryLockKind_ScratchSpace;
    663 		beamform_work_queue_push_commit(&g_beamformer_library_context.bp->external_work_queue);
    664 	}
    665 	return result;
    666 }
    667 
    668 function b32
    669 beamformer_export(BeamformerExportContext export, void *out, i32 timeout_ms)
    670 {
    671 	b32 result = 0;
    672 	if (beamformer_export_buffer(export)) {
    673 		/* NOTE(rnp): if this fails it just means that the work from push_data hasn't
    674 		 * started yet. This is here to catch the other case where the work started
    675 		 * and finished before we finished queuing the export work item */
    676 		beamformer_flush_commands();
    677 
    678 		if (lib_try_lock(BeamformerSharedMemoryLockKind_ExportSync, timeout_ms)) {
    679 			if (lib_try_lock(BeamformerSharedMemoryLockKind_ScratchSpace, 0)) {
    680 				Arena scratch = beamformer_shared_memory_scratch_arena(g_beamformer_library_context.bp,
    681 				                                                       g_beamformer_library_context.shared_memory_size);
    682 				mem_copy(out, scratch.beg, export.size);
    683 				lib_release_lock(BeamformerSharedMemoryLockKind_ScratchSpace);
    684 				result = 1;
    685 			}
    686 			lib_release_lock(BeamformerSharedMemoryLockKind_ExportSync);
    687 		}
    688 	}
    689 	return result;
    690 }
    691 
    692 b32
    693 beamformer_beamform_data(BeamformerSimpleParameters *bp, void *data, uint32_t data_size,
    694                          void *out_data, int32_t timeout_ms)
    695 {
    696 	b32 result = beamformer_push_simple_parameters(bp);
    697 	if (result) {
    698 		iv3 output_points = bp->output_points.xyz;
    699 		output_points.E[0] = Max(1, output_points.E[0]);
    700 		output_points.E[1] = Max(1, output_points.E[1]);
    701 		output_points.E[2] = Max(1, output_points.E[2]);
    702 
    703 		b32 complex = 0;
    704 		for (u32 stage = 0; stage < bp->compute_stages_count; stage++) {
    705 			BeamformerShaderKind shader = (BeamformerShaderKind)bp->compute_stages[stage];
    706 			complex |= shader == BeamformerShaderKind_Demodulate || shader == BeamformerShaderKind_Hilbert;
    707 		}
    708 
    709 		u64 output_size = output_points.x * output_points.y * output_points.z * sizeof(f32);
    710 		if (complex) output_size *= 2;
    711 
    712 		Arena scratch = beamformer_shared_memory_scratch_arena(g_beamformer_library_context.bp,
    713 		                                                       g_beamformer_library_context.shared_memory_size);
    714 		if (result && out_data) result &= lib_error_check((iz)output_size <= arena_capacity(&scratch, u8), ExportSpaceOverflow);
    715 
    716 		if (result) {
    717 			result = beamformer_push_data_with_compute(data, data_size, 0, 0);
    718 			if (result && out_data) {
    719 				BeamformerExportContext export;
    720 				export.kind = BeamformerExportKind_BeamformedData;
    721 				export.size = (u32)output_size;
    722 				result = beamformer_export(export, out_data, timeout_ms);
    723 			}
    724 		}
    725 	}
    726 	return result;
    727 }
    728 
    729 b32
    730 beamformer_compute_timings(BeamformerComputeStatsTable *output, i32 timeout_ms)
    731 {
    732 	b32 result = 0;
    733 	if (check_shared_memory()) {
    734 		Arena scratch = beamformer_shared_memory_scratch_arena(g_beamformer_library_context.bp,
    735 		                                                       g_beamformer_library_context.shared_memory_size);
    736 		if (lib_error_check((iz)sizeof(*output) <= arena_capacity(&scratch, u8), ExportSpaceOverflow)) {
    737 			BeamformerExportContext export;
    738 			export.kind = BeamformerExportKind_Stats;
    739 			export.size = sizeof(*output);
    740 			result = beamformer_export(export, output, timeout_ms);
    741 		}
    742 	}
    743 	return result;
    744 }
    745 
    746 i32
    747 beamformer_live_parameters_get_dirty_flag(void)
    748 {
    749 	i32 result = -1;
    750 	if (check_shared_memory()) {
    751 		u32 flag = ctz_u64(g_beamformer_library_context.bp->live_imaging_dirty_flags);
    752 		if (flag != 64) {
    753 			atomic_and_u32(&g_beamformer_library_context.bp->live_imaging_dirty_flags, ~(1u << flag));
    754 			result = (i32)flag;
    755 		}
    756 	}
    757 	return result;
    758 }
    759 
    760 BeamformerLiveImagingParameters *
    761 beamformer_get_live_parameters(void)
    762 {
    763 	BeamformerLiveImagingParameters *result = 0;
    764 	if (check_shared_memory()) result = &g_beamformer_library_context.bp->live_imaging_parameters;
    765 	return result;
    766 }
    767 
    768 b32
    769 beamformer_set_live_parameters(BeamformerLiveImagingParameters *new)
    770 {
    771 	b32 result = 0;
    772 	if (check_shared_memory()) {
    773 		mem_copy(&g_beamformer_library_context.bp->live_imaging_parameters, new, sizeof(*new));
    774 		store_fence();
    775 		result = 1;
    776 	}
    777 	return result;
    778 }