beamformer.c (14265B)
/* See LICENSE for license details. */

#include "beamformer_internal.h"

/* NOTE(rnp): magic variables to force discrete GPU usage on laptops with multiple devices */
EXPORT i32 NvOptimusEnablement = 1;
EXPORT i32 AmdPowerXpressRequestHighPerformance = 1;

/* NOTE: in non-debug builds the core is compiled directly into this translation unit.
 * In debug builds the core entry points listed below are instead held in function
 * pointers which are filled in by beamformer_debug_hot_reload(). */
#if !BEAMFORMER_DEBUG
#include "beamformer_core.c"
#else

typedef void beamformer_frame_step_fn(BeamformerInput *);

#define BEAMFORMER_DEBUG_ENTRY_POINTS \
	X(beamformer_debug_ui_deinit) \
	X(beamformer_complete_compute) \
	X(beamformer_frame_step) \
	X(beamformer_rf_upload) \

/* NOTE: declares one `name##_fn *name` function pointer per debug entry point */
#define X(name) global name ##_fn *name;
BEAMFORMER_DEBUG_ENTRY_POINTS
#undef X

/* Called before the debug library is released: waits on the `awake` flags of the
 * upload and compute workers (in that order) using spin_wait().
 * NOTE(review): spin_wait() is defined elsewhere; presumably it spins while its
 * argument evaluates non-zero - confirm. */
BEAMFORMER_EXPORT void
beamformer_debug_hot_release(BeamformerInput *input)
{
	BeamformerCtx *ctx = BeamformerContextMemory(input->memory);

	// TODO(rnp): this will deadlock if live imaging is active
	/* NOTE(rnp): spin until compute thread finishes its work (we will probably
	 * never reload while compute is in progress but just in case). */
	spin_wait(atomic_load_u32(&ctx->upload_worker.awake));
	spin_wait(atomic_load_u32(&ctx->compute_worker.awake));
}

/* Re-resolves every entry in BEAMFORMER_DEBUG_ENTRY_POINTS from `library` by symbol
 * name and logs a message to the console. `input` is not used here.
 * NOTE(review): the result of os_lookup_symbol() is stored without being checked. */
BEAMFORMER_EXPORT void
beamformer_debug_hot_reload(OSLibrary library, BeamformerInput *input)
{
	#define X(name) name = os_lookup_symbol(library, #name);
	BEAMFORMER_DEBUG_ENTRY_POINTS
	#undef X

	s8 info = beamformer_info("reloaded main executable");
	os_console_log(info.data, info.len);
}

#endif /* BEAMFORMER_DEBUG */

/* Forwards `message` to os_fatal(); declared no_return and marked unreachable
 * after the call. */
function no_return void
fatal(s8 message)
{
	os_fatal(message.data, message.len);
	unreachable();
}

#include "vulkan.c"

// TODO(rnp): this doesn't belong here, but will be removed
// once vulkan migration is complete
void * glfwGetProcAddress(char *);

/* OpenGL debug message callback. `userctx` is the Stream registered alongside this
 * callback in beamformer_init(); it is used as a temporary buffer to build
 * "[OpenGL] <msg>\n", which is logged to the console, and then reset.
 * `src`, `type`, `id` and `lvl` are ignored. */
function void
gl_debug_logger(u32 src, u32 type, u32 id, u32 lvl, i32 len, const char *msg, const void *userctx)
{
	Stream *e = (Stream *)userctx;
	stream_append_s8s(e, s8("[OpenGL] "), (s8){.len = len, .data = (u8 *)msg}, s8("\n"));
	os_console_log(e->data, e->widx);
	stream_reset(e, 0);
}

/* Resolves the OpenGL function pointers through glfwGetProcAddress() and then
 * verifies them. One line per missing function is appended to `err` (which is reset
 * first); if anything was appended the accumulated text is passed to fatal(). */
function void
load_gl(Stream *err)
{
	#define X(name, ret, params) name = (name##_fn *)glfwGetProcAddress(#name);
	OGLProcedureList
	OGLRequiredExtensionProcedureList
	#undef X

	stream_reset(err, 0);
	/* NOTE: the check pass uses the Base list plus the platform specific list */
	#define X(name, ret, params) if (!name) stream_append_s8(err, s8("missing required GL function: " #name "\n"));
	OGLProcedureList
	OGLRequiredExtensionProcedureListBase
	#if OS_WINDOWS
	OGLRequiredExtensionProcedureListW32
	#else
	OGLRequiredExtensionProcedureListLinux
	#endif
	#undef X

	if (err->widx) fatal(stream_to_s8(err));
}

/* Fills in the cuda_* function pointers listed in CUDALibraryProcedureList.
 * Lookup from the `cuda` library is currently compiled out (see the TODO below), so
 * every pointer that is still null afterwards is set to its `_stub` counterpart.
 * `arena` is taken by value and only backs the temporary log stream; `ctx` is not
 * used. */
function void
beamformer_load_cuda_library(BeamformerCtx *ctx, OSLibrary cuda, Arena arena)
{
	/* TODO(rnp): (25.10.30) registering the rf buffer with CUDA is currently
	 * causing a major performance regression. for now we are disabling its use
	 * altogether. it will be reenabled once the issue can be fixed */
	b32 result = 0 && vk_gpu_info()->vendor == GPUVendor_NVIDIA && ValidHandle(cuda);
	if (result) {
		Stream err = arena_stream(arena);

		stream_append_s8(&err, beamformer_info("loading CUDA library functions"));
		#define X(name, symname) cuda_## name = os_lookup_symbol(cuda, symname);
		CUDALibraryProcedureList
		#undef X

		os_console_log(err.data, err.widx);
	}

	#define X(name, symname) if (!cuda_## name) cuda_## name = cuda_ ## name ## _stub;
	CUDALibraryProcedureList
	#undef X
}

/* Parks a worker thread until it has work to do. Returns as soon as either the
 * compare-and-swap on ctx->sync_variable (expected 0, desired 1) succeeds or live
 * imaging is flagged active in shared memory. Otherwise the worker clears its
 * `awake` flag, waits on the address of sync_variable, sets `awake` again and
 * retries.
 * NOTE(review): assumes atomic_cas_u32() returns non-zero on a successful exchange
 * and that (u32)-1 is an "infinite" timeout for os_wait_on_address() - confirm. */
function void
worker_thread_sleep(GLWorkerThreadContext *ctx, BeamformerSharedMemory *sm)
{
	for (;;) {
		i32 expected = 0;
		if (atomic_cas_u32(&ctx->sync_variable, &expected, 1) ||
		    atomic_load_u32(&sm->live_imaging_parameters.active))
		{
			break;
		}

		/* TODO(rnp): clean this crap up; we shouldn't need two values to communicate this */
		atomic_store_u32(&ctx->awake, 0);
		os_wait_on_address(&ctx->sync_variable, 1, (u32)-1);
		atomic_store_u32(&ctx->awake, 1);
	}
}

/* Compute worker thread body. `user_context` is the GLWorkerThreadContext passed to
 * os_create_thread(); its own user_context holds the BeamformerCtx. Loops forever:
 * sleep until woken, poison the whole worker arena for ASAN, then run
 * beamformer_complete_compute(). */
function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point)
{
	GLWorkerThreadContext *ctx = user_context;

	BeamformerCtx *beamformer = (BeamformerCtx *)ctx->user_context;

	for (;;) {
		worker_thread_sleep(ctx, beamformer->shared_memory);
		asan_poison_region(ctx->arena.beg, ctx->arena.end - ctx->arena.beg);
		beamformer_complete_compute(beamformer, &ctx->arena);
	}

	unreachable();

	return 0;
}

/* Upload worker thread body. `user_context` is the GLWorkerThreadContext passed to
 * os_create_thread(); its own user_context holds the BeamformerUploadThreadContext.
 * Loops forever: sleep until woken, then run beamformer_rf_upload(). */
function OS_THREAD_ENTRY_POINT_FN(beamformer_upload_entry_point)
{
	GLWorkerThreadContext *ctx = user_context;
	BeamformerUploadThreadContext *up = (typeof(up))ctx->user_context;

	for (;;) {
		worker_thread_sleep(ctx, up->shared_memory);
		beamformer_rf_upload(up);
	}

	unreachable();

	return 0;
}

/* One time initialisation of the beamformer. In order:
 *   1. splits input->memory into the worker/UI/error sub-arenas, the BeamformerCtx
 *      and a 4096 byte scratch region,
 *   2. loads Vulkan and allocates the beamformed data buffer,
 *   3. resolves (currently: stubs) the CUDA functions,
 *   4. opens the window and loads the OpenGL functions,
 *   5. resets and fills in the shared memory header,
 *   6. (w32 only) creates the shared memory semaphores,
 *   7. starts the compute and upload worker threads,
 *   8. registers the OpenGL debug callback,
 *   9. registers shader file watches when shaders are not baked in.
 * Unrecoverable failures go through fatal(). On return ctx->state is
 * BeamformerState_Running. */
BEAMFORMER_EXPORT void
beamformer_init(BeamformerInput *input)
{
	Arena  memory        = arena_from_memory(input->memory, input->memory_size);
	Arena  compute_arena = sub_arena_end(&memory, MB(2), KB(4));
	Arena  upload_arena  = sub_arena_end(&memory, KB(4), KB(4));
	Arena  ui_arena      = sub_arena_end(&memory, MB(2), KB(4));
	Stream error         = arena_stream(sub_arena_end(&memory, MB(1), 1));

	/* NOTE(review): BeamformerContextMemory(input->memory) is used elsewhere in this
	 * file to find the context again, so this is presumably expected to be the first
	 * allocation from `memory` - confirm. */
	BeamformerCtx *ctx = push_struct(&memory, BeamformerCtx);

	/* NOTE: reserve the last 4096 bytes as scratch for the duration of init; the
	 * region is handed back to `memory` at the bottom of this function */
	Arena scratch = {.beg = memory.end - 4096L, .end = memory.end};
	memory.end = scratch.beg;

	ctx->window_size  = (iv2){{1280, 840}};
	ctx->error_stream = error;
	ctx->ui_backing_store = ui_arena;

	ctx->compute_worker.arena = compute_arena;
	ctx->upload_worker.arena  = upload_arena;

	#if BEAMFORMER_RENDERDOC_HOOKS
	start_frame_capture       = input->renderdoc_start_frame_capture;
	end_frame_capture         = input->renderdoc_end_frame_capture;
	set_capture_path_template = input->renderdoc_set_capture_file_path_template;
	#endif

	vk_load(input->vulkan_library_handle, &memory, &ctx->error_stream);

	BeamformerComputeContext *cs = &ctx->compute_context;

	// NOTE(rnp): allocate beamformed image ring buffer
	{
		u64 gpu_heap_size = vk_gpu_info()->gpu_heap_size;
		/* NOTE: candidate sizes, largest first */
		u64 trial_sizes[] = {
			GB(4),
			GB(2),
			GB(1) + MB(512),
			GB(1),
		};

		/* NOTE: skip candidates larger than half of the reported GPU heap size */
		u32 base_index = 0;
		for EachElement(trial_sizes, it) {
			if (gpu_heap_size >= 2 * trial_sizes[it])
				break;
			base_index++;
		}

		/* NOTE: try the remaining candidates in order and stop at the first one for
		 * which the buffer reports a non-zero size */
		for (u32 i = base_index; i < countof(trial_sizes); i++) {
			// TODO(rnp): it may be better to download data from this using the transfer queue
			VulkanTimeline timelines[] = {VulkanTimeline_Compute, VulkanTimeline_Graphics};
			GPUBufferAllocateInfo allocate_info = {
				.size           = trial_sizes[i],
				.flags          = VulkanUsageFlag_TransferSource|VulkanUsageFlag_HostReadWrite,
				.timeline_count = countof(timelines),
				.timelines_used = timelines,
				.label          = s8("BeamformedData"),
			};
			vk_buffer_allocate(cs->backlog.buffer, &allocate_info);
			if (cs->backlog.buffer->size > 0)
				break;
		}
		/* NOTE(review): when no candidate passes the heap size check above the loop
		 * body never runs; this relies on the buffer size starting out as 0 - confirm */
		if (cs->backlog.buffer->size == 0) {
			// NOTE(rnp): if this becomes an issue we may be able to get by in some other way
			fatal(s8("Failed to allocate space for beamformed data\n"));
		}

		BeamformerShaderResourceInfo shader_resource_infos[] = {
			{
				.kind   = BeamformerShaderResourceKind_Buffer,
				.handle = cs->backlog.buffer->handle,
				.slot   = BeamformerShaderBufferSlot_BeamformedData,
			},
		};
		vk_bind_shader_resources(shader_resource_infos, countof(shader_resource_infos));
	}

	/* NOTE: `memory` is passed by value so anything allocated from it inside is
	 * not retained by this function's copy of the arena */
	beamformer_load_cuda_library(ctx, input->cuda_library_handle, memory);

	SetConfigFlags(FLAG_VSYNC_HINT|FLAG_WINDOW_ALWAYS_RUN);
	InitWindow(ctx->window_size.w, ctx->window_size.h, "OGL Beamformer");
	/* NOTE: do this after initing so that the window starts out floating in tiling wm */
	SetWindowState(FLAG_WINDOW_RESIZABLE);
	SetWindowMinSize(840, ctx->window_size.h);

	load_gl(&ctx->error_stream);

	ctx->shared_memory      = input->shared_memory;
	ctx->shared_memory_size = input->shared_memory_size;
	/* NOTE: the provided region must be able to hold the shared memory struct */
	if (ctx->shared_memory_size < (i64)sizeof(*ctx->shared_memory))
		fatal(s8("Get more ram lol\n"));
	zero_struct(ctx->shared_memory);

	ctx->shared_memory->version = BEAMFORMER_SHARED_MEMORY_VERSION;
	ctx->shared_memory->reserved_parameter_blocks = 1;

	ctx->shared_memory->beamformed_frame_buffer_size = cs->backlog.buffer->size;

	// TODO(rnp): dynamic rf data buffer slot usage
	// NOTE(rnp): will be same as the max size we were able to get for the frame buffer
	ctx->shared_memory->capabilities.max_rf_data_size = cs->backlog.buffer->size
	                                                    / BeamformerMaxRawDataFramesInFlight;

	/* NOTE: CUDA is reported as available only when cuda_init is not the stub */
	ctx->shared_memory->capabilities.cuda = cuda_init != cuda_init_stub;
	// TODO(rnp): re-enable hilbert support, with and without cuda
	ctx->shared_memory->capabilities.hilbert = 0;

	/* TODO(rnp): I'm not sure if it's a good idea to pre-reserve a bunch of semaphores
	 * on w32 but that's what we are doing for now */
	#if OS_WINDOWS
	{
		/* NOTE: semaphore names are "<shared_memory_name>_lock_<index>", NUL terminated;
		 * the stream is rewound to just after "_lock_" for each index */
		Stream sb = arena_stream(memory);
		stream_append(&sb, input->shared_memory_name, input->shared_memory_name_length);
		stream_append_s8(&sb, s8("_lock_"));
		i32 start_index = sb.widx;
		for EachElement(os_w32_shared_memory_semaphores, it) {
			stream_reset(&sb, start_index);
			stream_append_u64(&sb, it);
			stream_append_byte(&sb, 0);
			os_w32_shared_memory_semaphores[it] = os_w32_create_semaphore((c8 *)sb.data, 1, 1);
			if InvalidHandle(os_w32_shared_memory_semaphores[it])
				fatal(beamformer_info("init: failed to create w32 shared memory semaphore\n"));

			/* NOTE(rnp): hacky garbage because CreateSemaphore will just open an existing
			 * semaphore without any indication. Sometimes the other side of the shared memory
			 * will provide incorrect parameters or will otherwise fail and it's faster to
			 * restart this program than to get that application to release the semaphores */
			/* TODO(rnp): figure out something more robust */
			os_w32_semaphore_release(os_w32_shared_memory_semaphores[it], 1);
		}
	}
	#endif

	/* NOTE: the worker context itself is handed to the thread; the thread entry
	 * point reads the BeamformerCtx back out of worker->user_context */
	GLWorkerThreadContext *worker = &ctx->compute_worker;
	/* TODO(rnp): we should lock this down after we have something working */
	worker->user_context = (iptr)ctx;
	worker->handle = os_create_thread("[compute]", worker, compute_worker_thread_entry_point);

	/* NOTE: the upload thread gets its own context struct which must be fully
	 * filled in before the thread is created */
	GLWorkerThreadContext         *upload = &ctx->upload_worker;
	BeamformerUploadThreadContext *upctx  = push_struct(&memory, typeof(*upctx));
	upload->user_context = (iptr)upctx;
	upctx->rf_buffer     = &cs->rf_buffer;
	upctx->shared_memory = ctx->shared_memory;
	upctx->shared_memory_size   = ctx->shared_memory_size;
	upctx->compute_timing_table = ctx->compute_timing_table;
	upctx->compute_worker_sync  = &ctx->compute_worker.sync_variable;
	upload->handle = os_create_thread("[upload]", upload, beamformer_upload_entry_point);

	/* NOTE: set up OpenGL debug logging */
	/* NOTE: the callback is always registered; debug output is only enabled
	 * when _DEBUG is defined */
	Stream *gl_error_stream = push_struct(&memory, Stream);
	*gl_error_stream        = stream_alloc(&memory, 1024);
	glDebugMessageCallback(gl_debug_logger, gl_error_stream);
	#ifdef _DEBUG
	glEnable(GL_DEBUG_OUTPUT);
	#endif

	/* NOTE: when shaders are not baked in, register a file watch for every
	 * reloadable shader. The path is built in a copy of `scratch` (so each
	 * iteration starts from the same scratch state) while the reload context is
	 * allocated from `memory`. */
	if (!BakeShaders)
	{
		for EachElement(beamformer_reloadable_compute_shader_info_indices, it) {
			i32   index = beamformer_reloadable_compute_shader_info_indices[it];
			Arena temp  = scratch;
			s8    file  = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"),
			                                 beamformer_reloadable_shader_files[index][0]);
			BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
			frc->kind                 = BeamformerFileReloadKind_ComputeShader;
			frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index];
			os_add_file_watch((char *)file.data, file.len, frc);
		}

		/* NOTE: helper shaders are registered with the same reload kind as the
		 * compute shaders above */
		for EachElement(beamformer_reloadable_compute_helpers_shader_info_indices, it) {
			i32   index = beamformer_reloadable_compute_helpers_shader_info_indices[it];
			Arena temp  = scratch;
			s8    file  = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"),
			                                 beamformer_reloadable_shader_files[index][0]);
			BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
			frc->kind                 = BeamformerFileReloadKind_ComputeShader;
			frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index];
			os_add_file_watch((char *)file.data, file.len, frc);
		}

		/* NOTE: internal shaders additionally record which pipeline slot they
		 * belong to (indexed by position in the indices table) */
		for EachElement(beamformer_reloadable_compute_internal_shader_info_indices, it) {
			i32   index = beamformer_reloadable_compute_internal_shader_info_indices[it];
			Arena temp  = scratch;
			s8    file  = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"),
			                                 beamformer_reloadable_shader_files[index][0]);
			BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
			frc->kind                   = BeamformerFileReloadKind_ComputeInternalShader;
			frc->shader_reload.shader   = beamformer_reloadable_shader_kinds[index];
			frc->shader_reload.pipeline = cs->compute_internal_pipelines + it;
			os_add_file_watch((char *)file.data, file.len, frc);
		}
	}

	/* NOTE: give the scratch region back and store what is left as the context arena */
	memory.end = scratch.end;
	ctx->arena = memory;
	ctx->state = BeamformerState_Running;
}

/* Shuts the beamformer down. Does nothing when the state is already
 * BeamformerState_Terminated. Otherwise, if shared memory is present, it is marked
 * invalid, the external work queue read index is set to the write index, the
 * dispatch compute lock is released and the stop-imaging dirty flag is set. Then
 * the debug UI is deinitialised and the state becomes BeamformerState_Terminated. */
BEAMFORMER_EXPORT void
beamformer_terminate(BeamformerInput *input)
{
	/* NOTE(rnp): work around pebkac when the beamformer is closed while we are doing live
	 * imaging. if the verasonics is blocked in an external function (calling the library
	 * to start compute) it is impossible for us to get it to properly shut down which
	 * will sometimes result in us needing to power cycle the system. set the shared memory
	 * into an error state and release dispatch lock so that future calls will error instead
	 * of blocking.
	 */
	BeamformerCtx *          ctx = BeamformerContextMemory(input->memory);
	BeamformerSharedMemory * sm  = input->shared_memory;
	if (ctx->state != BeamformerState_Terminated) {
		if (sm) {
			BeamformerSharedMemoryLockKind lock = BeamformerSharedMemoryLockKind_DispatchCompute;
			atomic_store_u32(&sm->invalid, 1);
			atomic_store_u32(&sm->external_work_queue.ridx, sm->external_work_queue.widx);
			/* NOTE(review): DEBUG_DECL is defined elsewhere; presumably the lock check
			 * only exists in debug builds and the release is unconditional otherwise
			 * - confirm */
			DEBUG_DECL(if (sm->locks[lock])) {
				beamformer_shared_memory_release_lock(sm, (i32)lock);
			}

			atomic_or_u32(&sm->live_imaging_dirty_flags, BeamformerLiveImagingDirtyFlags_StopImaging);
		}

		beamformer_debug_ui_deinit(ctx);

		ctx->state = BeamformerState_Terminated;
	}
}

/* Returns non-zero once the beamformer has terminated. When the state is
 * BeamformerState_ShouldClose this first calls beamformer_terminate(). */
BEAMFORMER_EXPORT u32
beamformer_should_close(BeamformerInput *input)
{
	BeamformerCtx * ctx = BeamformerContextMemory(input->memory);
	if (ctx->state == BeamformerState_ShouldClose)
		beamformer_terminate(input);
	return ctx->state == BeamformerState_Terminated;
}