ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

beamformer.c (14265B)


      1 /* See LICENSE for license details. */
      2 
      3 #include "beamformer_internal.h"
      4 
      5 /* NOTE(rnp): magic variables to force discrete GPU usage on laptops with multiple devices */
      6 EXPORT i32 NvOptimusEnablement = 1;
      7 EXPORT i32 AmdPowerXpressRequestHighPerformance = 1;
      8 
      9 #if !BEAMFORMER_DEBUG
     10 #include "beamformer_core.c"
     11 #else
     12 
     13 typedef void beamformer_frame_step_fn(BeamformerInput *);
     14 
     15 #define BEAMFORMER_DEBUG_ENTRY_POINTS \
     16 	X(beamformer_debug_ui_deinit)  \
     17 	X(beamformer_complete_compute) \
     18 	X(beamformer_frame_step)       \
     19 	X(beamformer_rf_upload)        \
     20 
     21 #define X(name) global name ##_fn *name;
     22 BEAMFORMER_DEBUG_ENTRY_POINTS
     23 #undef X
     24 
     25 BEAMFORMER_EXPORT void
     26 beamformer_debug_hot_release(BeamformerInput *input)
     27 {
     28 	BeamformerCtx *ctx = BeamformerContextMemory(input->memory);
     29 
     30 	// TODO(rnp): this will deadlock if live imaging is active
     31 	/* NOTE(rnp): spin until compute thread finishes its work (we will probably
     32 	 * never reload while compute is in progress but just incase). */
     33 	spin_wait(atomic_load_u32(&ctx->upload_worker.awake));
     34 	spin_wait(atomic_load_u32(&ctx->compute_worker.awake));
     35 }
     36 
     37 BEAMFORMER_EXPORT void
     38 beamformer_debug_hot_reload(OSLibrary library, BeamformerInput *input)
     39 {
     40 	#define X(name) name = os_lookup_symbol(library, #name);
     41 	BEAMFORMER_DEBUG_ENTRY_POINTS
     42 	#undef X
     43 
     44 	s8 info = beamformer_info("reloaded main executable");
     45 	os_console_log(info.data, info.len);
     46 }
     47 
     48 #endif /* BEAMFORMER_DEBUG */
     49 
     50 function no_return void
     51 fatal(s8 message)
     52 {
     53 	os_fatal(message.data, message.len);
     54 	unreachable();
     55 }
     56 
     57 #include "vulkan.c"
     58 
     59 // TODO(rnp): this doesn't belong here, but will be removed
     60 // once vulkan migration is complete
     61 void * glfwGetProcAddress(char *);
     62 
     63 function void
     64 gl_debug_logger(u32 src, u32 type, u32 id, u32 lvl, i32 len, const char *msg, const void *userctx)
     65 {
     66 	Stream *e = (Stream *)userctx;
     67 	stream_append_s8s(e, s8("[OpenGL] "), (s8){.len = len, .data = (u8 *)msg}, s8("\n"));
     68 	os_console_log(e->data, e->widx);
     69 	stream_reset(e, 0);
     70 }
     71 
     72 function void
     73 load_gl(Stream *err)
     74 {
     75 	#define X(name, ret, params) name = (name##_fn *)glfwGetProcAddress(#name);
     76 	OGLProcedureList
     77 	OGLRequiredExtensionProcedureList
     78 	#undef X
     79 
     80 	stream_reset(err, 0);
     81 	#define X(name, ret, params) if (!name) stream_append_s8(err, s8("missing required GL function: " #name "\n"));
     82 	OGLProcedureList
     83 	OGLRequiredExtensionProcedureListBase
     84 	#if OS_WINDOWS
     85 	  OGLRequiredExtensionProcedureListW32
     86 	#else
     87 	  OGLRequiredExtensionProcedureListLinux
     88 	#endif
     89 	#undef X
     90 
     91 	if (err->widx) fatal(stream_to_s8(err));
     92 }
     93 
     94 function void
     95 beamformer_load_cuda_library(BeamformerCtx *ctx, OSLibrary cuda, Arena arena)
     96 {
     97 	/* TODO(rnp): (25.10.30) registering the rf buffer with CUDA is currently
     98 	 * causing a major performance regression. for now we are disabling its use
     99 	 * altogether. it will be reenabled once the issue can be fixed */
    100 	b32 result = 0 && vk_gpu_info()->vendor == GPUVendor_NVIDIA && ValidHandle(cuda);
    101 	if (result) {
    102 		Stream err = arena_stream(arena);
    103 
    104 		stream_append_s8(&err, beamformer_info("loading CUDA library functions"));
    105 		#define X(name, symname) cuda_## name = os_lookup_symbol(cuda, symname);
    106 		CUDALibraryProcedureList
    107 		#undef X
    108 
    109 		os_console_log(err.data, err.widx);
    110 	}
    111 
    112 	#define X(name, symname) if (!cuda_## name) cuda_## name = cuda_ ## name ## _stub;
    113 	CUDALibraryProcedureList
    114 	#undef X
    115 }
    116 
    117 function void
    118 worker_thread_sleep(GLWorkerThreadContext *ctx, BeamformerSharedMemory *sm)
    119 {
    120 	for (;;) {
    121 		i32 expected = 0;
    122 		if (atomic_cas_u32(&ctx->sync_variable, &expected, 1) ||
    123 		    atomic_load_u32(&sm->live_imaging_parameters.active))
    124 		{
    125 			break;
    126 		}
    127 
    128 		/* TODO(rnp): clean this crap up; we shouldn't need two values to communicate this */
    129 		atomic_store_u32(&ctx->awake, 0);
    130 		os_wait_on_address(&ctx->sync_variable, 1, (u32)-1);
    131 		atomic_store_u32(&ctx->awake, 1);
    132 	}
    133 }
    134 
    135 function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point)
    136 {
    137 	GLWorkerThreadContext *ctx = user_context;
    138 
    139 	BeamformerCtx *beamformer = (BeamformerCtx *)ctx->user_context;
    140 
    141 	for (;;) {
    142 		worker_thread_sleep(ctx, beamformer->shared_memory);
    143 		asan_poison_region(ctx->arena.beg, ctx->arena.end - ctx->arena.beg);
    144 		beamformer_complete_compute(beamformer, &ctx->arena);
    145 	}
    146 
    147 	unreachable();
    148 
    149 	return 0;
    150 }
    151 
    152 function OS_THREAD_ENTRY_POINT_FN(beamformer_upload_entry_point)
    153 {
    154 	GLWorkerThreadContext         *ctx = user_context;
    155 	BeamformerUploadThreadContext *up  = (typeof(up))ctx->user_context;
    156 
    157 	for (;;) {
    158 		worker_thread_sleep(ctx, up->shared_memory);
    159 		beamformer_rf_upload(up);
    160 	}
    161 
    162 	unreachable();
    163 
    164 	return 0;
    165 }
    166 
    167 BEAMFORMER_EXPORT void
    168 beamformer_init(BeamformerInput *input)
    169 {
    170 	Arena  memory        = arena_from_memory(input->memory, input->memory_size);
    171 	Arena  compute_arena = sub_arena_end(&memory, MB(2), KB(4));
    172 	Arena  upload_arena  = sub_arena_end(&memory, KB(4), KB(4));
    173 	Arena  ui_arena      = sub_arena_end(&memory, MB(2), KB(4));
    174 	Stream error         = arena_stream(sub_arena_end(&memory, MB(1), 1));
    175 
    176 	BeamformerCtx *ctx   = push_struct(&memory, BeamformerCtx);
    177 
    178 	Arena scratch = {.beg = memory.end - 4096L, .end = memory.end};
    179 	memory.end = scratch.beg;
    180 
    181 	ctx->window_size           = (iv2){{1280, 840}};
    182 	ctx->error_stream          = error;
    183 	ctx->ui_backing_store      = ui_arena;
    184 	ctx->compute_worker.arena  = compute_arena;
    185 	ctx->upload_worker.arena   = upload_arena;
    186 
    187 	#if BEAMFORMER_RENDERDOC_HOOKS
    188 	start_frame_capture       = input->renderdoc_start_frame_capture;
    189 	end_frame_capture         = input->renderdoc_end_frame_capture;
    190 	set_capture_path_template = input->renderdoc_set_capture_file_path_template;
    191 	#endif
    192 
    193 	vk_load(input->vulkan_library_handle, &memory, &ctx->error_stream);
    194 
    195 	BeamformerComputeContext *cs = &ctx->compute_context;
    196 
    197 	// NOTE(rnp): allocate beamformed image ring buffer
    198 	{
    199 		u64 gpu_heap_size = vk_gpu_info()->gpu_heap_size;
    200 		u64 trial_sizes[] = {
    201 			GB(4),
    202 			GB(2),
    203 			GB(1) + MB(512),
    204 			GB(1),
    205 		};
    206 
    207 		u32 base_index = 0;
    208 		for EachElement(trial_sizes, it) {
    209 			if (gpu_heap_size >= 2 * trial_sizes[it])
    210 				break;
    211 			base_index++;
    212 		}
    213 
    214 		for (u32 i = base_index; i < countof(trial_sizes); i++) {
    215 			// TODO(rnp): it may be better to download data from this using the transfer queue
    216 			VulkanTimeline timelines[] = {VulkanTimeline_Compute, VulkanTimeline_Graphics};
    217 			GPUBufferAllocateInfo allocate_info = {
    218 				.size            = trial_sizes[i],
    219 				.flags           = VulkanUsageFlag_TransferSource|VulkanUsageFlag_HostReadWrite,
    220 				.timeline_count  = countof(timelines),
    221 				.timelines_used  = timelines,
    222 				.label           = s8("BeamformedData"),
    223 			};
    224 			vk_buffer_allocate(cs->backlog.buffer, &allocate_info);
    225 			if (cs->backlog.buffer->size > 0)
    226 				break;
    227 		}
    228 		if (cs->backlog.buffer->size == 0) {
    229 			// NOTE(rnp): if this becomes an issue we may be able to get by in some other way
    230 			fatal(s8("Failed to allocate space for beamformed data\n"));
    231 		}
    232 
    233 		BeamformerShaderResourceInfo shader_resource_infos[] = {
    234 			{
    235 				.kind   = BeamformerShaderResourceKind_Buffer,
    236 				.handle = cs->backlog.buffer->handle,
    237 				.slot   = BeamformerShaderBufferSlot_BeamformedData,
    238 			},
    239 		};
    240 		vk_bind_shader_resources(shader_resource_infos, countof(shader_resource_infos));
    241 	}
    242 
    243 	beamformer_load_cuda_library(ctx, input->cuda_library_handle, memory);
    244 
    245 	SetConfigFlags(FLAG_VSYNC_HINT|FLAG_WINDOW_ALWAYS_RUN);
    246 	InitWindow(ctx->window_size.w, ctx->window_size.h, "OGL Beamformer");
    247 	/* NOTE: do this after initing so that the window starts out floating in tiling wm */
    248 	SetWindowState(FLAG_WINDOW_RESIZABLE);
    249 	SetWindowMinSize(840, ctx->window_size.h);
    250 
    251 	load_gl(&ctx->error_stream);
    252 
    253 	ctx->shared_memory      = input->shared_memory;
    254 	ctx->shared_memory_size = input->shared_memory_size;
    255 	if (ctx->shared_memory_size < (i64)sizeof(*ctx->shared_memory))
    256 		fatal(s8("Get more ram lol\n"));
    257 	zero_struct(ctx->shared_memory);
    258 
    259 	ctx->shared_memory->version = BEAMFORMER_SHARED_MEMORY_VERSION;
    260 	ctx->shared_memory->reserved_parameter_blocks = 1;
    261 
    262 	ctx->shared_memory->beamformed_frame_buffer_size = cs->backlog.buffer->size;
    263 
    264 	// TODO(rnp): dynamic rf data buffer slot usage
    265 	// NOTE(rnp): will be same as the max size we were able to get for the frame buffer
    266 	ctx->shared_memory->capabilities.max_rf_data_size = cs->backlog.buffer->size
    267 	                                                    / BeamformerMaxRawDataFramesInFlight;
    268 
    269 	ctx->shared_memory->capabilities.cuda    = cuda_init != cuda_init_stub;
    270 	// TODO(rnp): re-enable hilbert support, with and without cuda
    271 	ctx->shared_memory->capabilities.hilbert = 0;
    272 
    273 	/* TODO(rnp): I'm not sure if its a good idea to pre-reserve a bunch of semaphores
    274 	 * on w32 but thats what we are doing for now */
    275 	#if OS_WINDOWS
    276 	{
    277 		Stream sb = arena_stream(memory);
    278 		stream_append(&sb, input->shared_memory_name, input->shared_memory_name_length);
    279 		stream_append_s8(&sb, s8("_lock_"));
    280 		i32 start_index = sb.widx;
    281 		for EachElement(os_w32_shared_memory_semaphores, it) {
    282 			stream_reset(&sb, start_index);
    283 			stream_append_u64(&sb, it);
    284 			stream_append_byte(&sb, 0);
    285 			os_w32_shared_memory_semaphores[it] = os_w32_create_semaphore((c8 *)sb.data, 1, 1);
    286 			if InvalidHandle(os_w32_shared_memory_semaphores[it])
    287 				fatal(beamformer_info("init: failed to create w32 shared memory semaphore\n"));
    288 
    289 			/* NOTE(rnp): hacky garbage because CreateSemaphore will just open an existing
    290 			 * semaphore without any indication. Sometimes the other side of the shared memory
    291 			 * will provide incorrect parameters or will otherwise fail and its faster to
    292 			 * restart this program than to get that application to release the semaphores */
    293 			/* TODO(rnp): figure out something more robust */
    294 			os_w32_semaphore_release(os_w32_shared_memory_semaphores[it], 1);
    295 		}
    296 	}
    297 	#endif
    298 
    299 	GLWorkerThreadContext *worker = &ctx->compute_worker;
    300 	/* TODO(rnp): we should lock this down after we have something working */
    301 	worker->user_context = (iptr)ctx;
    302 	worker->handle       = os_create_thread("[compute]", worker, compute_worker_thread_entry_point);
    303 
    304 	GLWorkerThreadContext         *upload = &ctx->upload_worker;
    305 	BeamformerUploadThreadContext *upctx  = push_struct(&memory, typeof(*upctx));
    306 	upload->user_context        = (iptr)upctx;
    307 	upctx->rf_buffer            = &cs->rf_buffer;
    308 	upctx->shared_memory        = ctx->shared_memory;
    309 	upctx->shared_memory_size   = ctx->shared_memory_size;
    310 	upctx->compute_timing_table = ctx->compute_timing_table;
    311 	upctx->compute_worker_sync  = &ctx->compute_worker.sync_variable;
    312 	upload->handle = os_create_thread("[upload]", upload, beamformer_upload_entry_point);
    313 
    314 	/* NOTE: set up OpenGL debug logging */
    315 	Stream *gl_error_stream = push_struct(&memory, Stream);
    316 	*gl_error_stream        = stream_alloc(&memory, 1024);
    317 	glDebugMessageCallback(gl_debug_logger, gl_error_stream);
    318 #ifdef _DEBUG
    319 	glEnable(GL_DEBUG_OUTPUT);
    320 #endif
    321 
    322 	if (!BakeShaders)
    323 	{
    324 		for EachElement(beamformer_reloadable_compute_shader_info_indices, it) {
    325 			i32   index = beamformer_reloadable_compute_shader_info_indices[it];
    326 			Arena temp  = scratch;
    327 			s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"),
    328 			                             beamformer_reloadable_shader_files[index][0]);
    329 			BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
    330 			frc->kind                 = BeamformerFileReloadKind_ComputeShader;
    331 			frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index];
    332 			os_add_file_watch((char *)file.data, file.len, frc);
    333 		}
    334 
    335 		for EachElement(beamformer_reloadable_compute_helpers_shader_info_indices, it) {
    336 			i32   index = beamformer_reloadable_compute_helpers_shader_info_indices[it];
    337 			Arena temp  = scratch;
    338 			s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"),
    339 			                             beamformer_reloadable_shader_files[index][0]);
    340 			BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
    341 			frc->kind                 = BeamformerFileReloadKind_ComputeShader;
    342 			frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index];
    343 			os_add_file_watch((char *)file.data, file.len, frc);
    344 		}
    345 
    346 		for EachElement(beamformer_reloadable_compute_internal_shader_info_indices, it) {
    347 			i32   index = beamformer_reloadable_compute_internal_shader_info_indices[it];
    348 			Arena temp  = scratch;
    349 			s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"),
    350 			                             beamformer_reloadable_shader_files[index][0]);
    351 			BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
    352 			frc->kind                   = BeamformerFileReloadKind_ComputeInternalShader;
    353 			frc->shader_reload.shader   = beamformer_reloadable_shader_kinds[index];
    354 			frc->shader_reload.pipeline = cs->compute_internal_pipelines + it;
    355 			os_add_file_watch((char *)file.data, file.len, frc);
    356 		}
    357 	}
    358 
    359 	memory.end = scratch.end;
    360 	ctx->arena = memory;
    361 	ctx->state = BeamformerState_Running;
    362 }
    363 
    364 BEAMFORMER_EXPORT void
    365 beamformer_terminate(BeamformerInput *input)
    366 {
    367 	/* NOTE(rnp): work around pebkac when the beamformer is closed while we are doing live
    368 	 * imaging. if the verasonics is blocked in an external function (calling the library
    369 	 * to start compute) it is impossible for us to get it to properly shut down which
    370 	 * will sometimes result in us needing to power cycle the system. set the shared memory
    371 	 * into an error state and release dispatch lock so that future calls will error instead
    372 	 * of blocking.
    373 	 */
    374 	BeamformerCtx *          ctx = BeamformerContextMemory(input->memory);
    375 	BeamformerSharedMemory * sm  = input->shared_memory;
    376 	if (ctx->state != BeamformerState_Terminated) {
    377 		if (sm) {
    378 			BeamformerSharedMemoryLockKind lock = BeamformerSharedMemoryLockKind_DispatchCompute;
    379 			atomic_store_u32(&sm->invalid, 1);
    380 			atomic_store_u32(&sm->external_work_queue.ridx, sm->external_work_queue.widx);
    381 			DEBUG_DECL(if (sm->locks[lock])) {
    382 				beamformer_shared_memory_release_lock(sm, (i32)lock);
    383 			}
    384 
    385 			atomic_or_u32(&sm->live_imaging_dirty_flags, BeamformerLiveImagingDirtyFlags_StopImaging);
    386 		}
    387 
    388 		beamformer_debug_ui_deinit(ctx);
    389 
    390 		ctx->state = BeamformerState_Terminated;
    391 	}
    392 }
    393 
    394 BEAMFORMER_EXPORT u32
    395 beamformer_should_close(BeamformerInput *input)
    396 {
    397 	BeamformerCtx * ctx = BeamformerContextMemory(input->memory);
    398 	if (ctx->state == BeamformerState_ShouldClose)
    399 		beamformer_terminate(input);
    400 	return ctx->state == BeamformerState_Terminated;
    401 }