beamformer_core.c (67161B)
1 /* See LICENSE for license details. */ 2 /* TODO(rnp): 3 * [ ]: bug? HERCULES might be broken, we may need to to chunk on transmits instead of channels 4 * [ ]: refactor: do_compute should build its own "command graph" which tracks 5 * dependencies better. It is very important that unnecessary barriers are 6 * not placed between compute stages which requires knowledge of the entire 7 * graph. 8 * [ ]: refactor: replace UploadRF with just the scratch_rf_size variable, 9 * use below to spin wait in library 10 * [ ]: utilize umonitor/umwait (intel), monitorx/mwaitx (amd), and wfe/sev (aarch64) 11 * for power efficient low latency waiting 12 * [ ]: BeamformWorkQueue -> BeamformerWorkQueue 13 * [ ]: refactor: work queue needs a cleanup, we should only have a single one 14 * - that queue isn't really considered hot so a lock is probably fine 15 * [ ]: bug: reinit cuda on hot-reload 16 */ 17 18 #include "compiler.h" 19 20 #if defined(BEAMFORMER_DEBUG) && !defined(BEAMFORMER_EXPORT) && OS_WINDOWS 21 #define BEAMFORMER_EXPORT __declspec(dllexport) 22 #endif 23 24 #include "beamformer_internal.h" 25 26 global f32 dt_for_frame; 27 28 typedef struct BeamformerComputeGraphNode BeamformerComputeGraphNode; 29 struct BeamformerComputeGraphNode { 30 // NOTE(rnp): will be BeamformerShaderKind_Count for root node 31 BeamformerShaderKind kind; 32 33 // NOTE(rnp): when any of input or output stride is assigned it is assumed that 34 // the shader requires a fixed layout for input, output, or both. When two adjacent 35 // nodes require incompatible layouts the second pass over the graph will insert 36 // Reshape shaders in between. 37 BeamformerDataKind input_data_kind; 38 iv3 input_stride; 39 40 BeamformerDataKind output_data_kind; 41 iv3 output_stride; 42 43 i32 user_pipeline_index; 44 45 BeamformerComputeGraphNode *prev; 46 BeamformerComputeGraphNode *next; 47 }; 48 49 typedef struct { 50 BeamformerComputeGraphNode *first; 51 BeamformerComputeGraphNode *last; 52 u64 count; 53 } BeamformerComputeGraph; 54 55 read_only global u32 beamformer_compute_array_parameter_sizes[] = { 56 #define X(k, type, elements) sizeof(type) * elements, 57 BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST 58 #undef X 59 }; 60 61 read_only global u32 beamformer_compute_array_parameter_offsets[] = { 62 #define X(k, ...) offsetof(BeamformerComputeArrayParameters, k), 63 BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST 64 #undef X 65 }; 66 67 function void 68 beamformer_compute_plan_release(BeamformerComputeContext *cc, u32 block) 69 { 70 assert(block < countof(cc->compute_plans)); 71 BeamformerComputePlan *cp = cc->compute_plans[block]; 72 if (cp) { 73 vk_buffer_release(&cp->array_parameters); 74 for (u32 i = 0; i < countof(cp->filters); i++) 75 vk_buffer_release(&cp->filters[i].buffer); 76 cc->compute_plans[block] = 0; 77 SLLPushFreelist(cp, cc->compute_plan_freelist); 78 } 79 } 80 81 function BeamformerComputePlan * 82 beamformer_compute_plan_for_block(BeamformerComputeContext *cc, u32 block, Arena *arena) 83 { 84 assert(block < countof(cc->compute_plans)); 85 BeamformerComputePlan *result = cc->compute_plans[block]; 86 if (!result) { 87 result = SLLPopFreelist(cc->compute_plan_freelist); 88 if (!result) result = push_struct_no_zero(arena, BeamformerComputePlan); 89 zero_struct(result); 90 cc->compute_plans[block] = result; 91 92 result->ui_voxel_transform = m4_identity(); 93 94 Stream label = arena_stream(*arena); 95 stream_append_s8(&label, s8("ComputeParameterArray[")); 96 stream_append_u64(&label, block); 97 stream_append_s8(&label, s8("]")); 98 stream_append_byte(&label, 0); 99 100 GPUBufferAllocateInfo allocate_info = { 101 .size = sizeof(BeamformerComputeArrayParameters), 102 .flags = VulkanUsageFlag_HostReadWrite, 103 .label = stream_to_s8(&label), 104 }; 105 vk_buffer_allocate(&result->array_parameters, &allocate_info); 106 assert((result->array_parameters.gpu_pointer & 63) == 0); 107 } 108 return result; 109 } 110 111 function void 112 beamformer_filter_update(BeamformerFilter *f, BeamformerFilterParameters fp, u32 block, u32 slot, Arena arena) 113 { 114 Stream sb = arena_stream(arena); 115 stream_append_s8s(&sb, 116 beamformer_filter_kind_strings[fp.kind % countof(beamformer_filter_kind_strings)], 117 s8("Filter[")); 118 stream_append_u64(&sb, block); 119 stream_append_s8(&sb, s8("][")); 120 stream_append_u64(&sb, slot); 121 stream_append_byte(&sb, ']'); 122 s8 label = arena_stream_commit(&arena, &sb); 123 124 void *filter = 0; 125 switch (fp.kind) { 126 case BeamformerFilterKind_Kaiser:{ 127 /* TODO(rnp): this should also support complex */ 128 /* TODO(rnp): implement this as an IFIR filter instead to reduce computation */ 129 filter = kaiser_low_pass_filter(&arena, fp.kaiser.cutoff_frequency, fp.sampling_frequency, 130 fp.kaiser.beta, (i32)fp.kaiser.length); 131 f->length = (i32)fp.kaiser.length; 132 f->time_delay = (f32)f->length / 2.0f / fp.sampling_frequency; 133 }break; 134 case BeamformerFilterKind_MatchedChirp:{ 135 typeof(fp.matched_chirp) *mc = &fp.matched_chirp; 136 f32 fs = fp.sampling_frequency; 137 f->length = (i32)(mc->duration * fs); 138 if (fp.complex) { 139 filter = baseband_chirp(&arena, mc->min_frequency, mc->max_frequency, fs, f->length, 1, 0.5f); 140 f->time_delay = complex_filter_first_moment(filter, f->length, fs); 141 } else { 142 filter = rf_chirp(&arena, mc->min_frequency, mc->max_frequency, fs, f->length, 1); 143 f->time_delay = real_filter_first_moment(filter, f->length, fs); 144 } 145 }break; 146 InvalidDefaultCase; 147 } 148 149 f->parameters = fp; 150 151 u32 byte_size = f->length * (i32)sizeof(f32) * (fp.complex? 2 : 1); 152 if (f->buffer.size < byte_size) { 153 GPUBufferAllocateInfo allocate_info = { 154 .size = byte_size, 155 .flags = VulkanUsageFlag_HostReadWrite, 156 .label = label, 157 }; 158 vk_buffer_allocate(&f->buffer, &allocate_info); 159 } 160 vk_buffer_range_upload(&f->buffer, filter, 0, byte_size, 0); 161 } 162 163 function iv3 164 das_valid_points(iv3 points) 165 { 166 iv3 result; 167 result.x = Max(points.x, 1); 168 result.y = Max(points.y, 1); 169 result.z = Max(points.z, 1); 170 return result; 171 } 172 173 function void 174 update_hadamard(BeamformerComputePlan *cp, i32 order, b32 row_major, Arena arena) 175 { 176 f16 *hadamard = make_hadamard_transpose(&arena, order, row_major); 177 if (hadamard) { 178 u64 offset = offsetof(BeamformerComputeArrayParameters, Hadamard); 179 u64 size = sizeof(*((BeamformerComputeArrayParameters *)0)->Hadamard) * order * order; 180 vk_buffer_range_upload(&cp->array_parameters, hadamard, offset, size, 0); 181 cp->hadamard_order = order; 182 } 183 } 184 185 function u64 186 beamformer_frame_byte_size(iv3 points, BeamformerDataKind kind) 187 { 188 u64 result = points.x * points.y * points.z * beamformer_data_kind_byte_size[kind]; 189 result = round_up_to(result, 64); 190 return result; 191 } 192 193 function BeamformerFrame * 194 beamformer_frame_next(BeamformerComputeContext *cc, iv3 output_points, b32 complex, u64 reserved_size) 195 { 196 BeamformerFrameBacklog *bl = &cc->backlog; 197 198 BeamformerDataKind kind = complex ? BeamformerDataKind_Float32Complex : BeamformerDataKind_Float32; 199 u64 frame_size = beamformer_frame_byte_size(output_points, kind); 200 201 // TODO(rnp): handle this somewhat gracefully (even it produces garbled output) 202 assert(frame_size + reserved_size <= (u64)bl->buffer->size); 203 204 if (bl->next_offset > (u64)bl->buffer->size - frame_size - reserved_size) 205 bl->next_offset = 0; 206 207 u64 id = bl->counter++; 208 209 BeamformerFrame *result = bl->frames + (id % countof(bl->frames)); 210 atomic_store_u64(&result->timeline_valid_value, -1ULL); 211 result->id = id & U32_MAX; 212 result->buffer_offset = bl->next_offset; 213 result->points = output_points; 214 result->data_kind = kind; 215 216 bl->next_offset += frame_size; 217 218 return result; 219 } 220 221 function void 222 push_compute_timing_info(ComputeTimingTable *t, ComputeTimingInfo info) 223 { 224 u32 index = atomic_add_u32(&t->write_index, 1) % countof(t->buffer); 225 t->buffer[index] = info; 226 } 227 228 function uv3 229 layout_for_output(iv3 points) 230 { 231 uv3 result = {{1, 1, 1}}; 232 233 b32 has_x = points.x > 1; 234 b32 has_y = points.y > 1; 235 b32 has_z = points.z > 1; 236 237 u32 subgroup_size = vk_gpu_info()->subgroup_size; 238 u32 grid_3d_z_size = Max(1, subgroup_size / (4 * 4)); 239 u32 grid_2d_y_size = Max(1, subgroup_size / 8); 240 241 switch (iv3_dimension(points)) { 242 case 1:{ 243 if (has_x) result.x = subgroup_size; 244 if (has_y) result.y = subgroup_size; 245 if (has_z) result.z = subgroup_size; 246 }break; 247 248 case 2:{ 249 if (has_x && has_y) {result.x = 8; result.y = grid_2d_y_size;} 250 if (has_x && has_z) {result.x = 8; result.z = grid_2d_y_size;} 251 if (has_y && has_z) {result.y = 8; result.z = grid_2d_y_size;} 252 }break; 253 254 case 3:{result = (uv3){{4, 4, grid_3d_z_size}};}break; 255 256 InvalidDefaultCase; 257 } 258 259 return result; 260 } 261 262 function uv3 263 dispatch_for_output(uv3 layout, iv3 points) 264 { 265 uv3 result; 266 result.x = (u32)ceil_f32((f32)points.x / layout.x); 267 result.y = (u32)ceil_f32((f32)points.y / layout.y); 268 result.z = (u32)ceil_f32((f32)points.z / layout.z); 269 return result; 270 } 271 272 function b32 273 compute_plan_push_shader(BeamformerComputePlan *p, BeamformerComputeGraphNode *node, BeamformerShaderParameters *sp) 274 { 275 b32 result = 0; 276 if (p->pipeline.shader_count < countof(p->pipeline.shaders)) { 277 u32 index = p->pipeline.shader_count++; 278 p->pipeline.shaders[index] = node->kind; 279 zero_struct(p->shader_descriptors + index); 280 p->pipeline.parameters[index] = sp ? *sp : (BeamformerShaderParameters){0}; 281 282 p->shader_descriptors[index].input_data_kind = node->input_data_kind; 283 p->shader_descriptors[index].output_data_kind = node->output_data_kind; 284 285 result = 1; 286 } 287 return result; 288 } 289 290 function BeamformerComputeGraphNode * 291 push_compute_graph_node(BeamformerComputeGraph *graph, BeamformerShaderKind kind, Arena *arena) 292 { 293 BeamformerComputeGraphNode *result = push_struct(arena, BeamformerComputeGraphNode); 294 if (graph) { 295 DLLInsertLast(0, graph->first, graph->last, result, next, prev); 296 graph->count++; 297 } 298 result->kind = kind; 299 result->user_pipeline_index = -1; 300 // NOTE(rnp): initially don't care data kind 301 result->input_data_kind = BeamformerDataKind_Count; 302 result->output_data_kind = BeamformerDataKind_Count; 303 return result; 304 } 305 306 function void 307 plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb, Arena scratch) 308 { 309 b32 run_hilbert = 0; 310 b32 demodulate = 0; 311 312 for (u32 i = 0; i < pb->pipeline.shader_count; i++) { 313 switch (pb->pipeline.shaders[i]) { 314 case BeamformerShaderKind_Hilbert:{run_hilbert = 1;}break; 315 case BeamformerShaderKind_Demodulate:{demodulate = 1;}break; 316 default:{}break; 317 } 318 } 319 320 if (demodulate) run_hilbert = 0; 321 322 f32 sampling_frequency = pb->parameters.sampling_frequency; 323 u32 input_sample_count = pb->parameters.sample_count; 324 u32 acquisition_count = pb->parameters.acquisition_count; 325 u32 decimation_rate = Max(pb->parameters.decimation_rate, 1); 326 327 cp->raw_channel_byte_stride = pb->parameters.sample_count * pb->parameters.acquisition_count 328 * beamformer_data_kind_byte_size[pb->pipeline.data_kind]; 329 330 BeamformerDataKind input_data_kind = pb->pipeline.data_kind; 331 if (demodulate) { 332 switch (input_data_kind) { 333 case BeamformerDataKind_Int16:{ input_data_kind = BeamformerDataKind_Int16Complex; }break; 334 case BeamformerDataKind_Float16:{input_data_kind = BeamformerDataKind_Float16Complex;}break; 335 case BeamformerDataKind_Float32:{input_data_kind = BeamformerDataKind_Float32Complex;}break; 336 default:{}break; 337 } 338 input_sample_count /= (2 * decimation_rate); 339 sampling_frequency /= (2 * decimation_rate); 340 } 341 342 cp->iq_pipeline = beamformer_data_kind_complex[input_data_kind] || run_hilbert; 343 344 BeamformerDataKind das_data_kind = cp->iq_pipeline ? BeamformerDataKind_Float32Complex 345 : BeamformerDataKind_Float32; 346 347 cp->channel_count = pb->parameters.channel_count; 348 u32 chunk_channel_count = Min(cp->channel_count, BeamformerChunkChannelCount); 349 350 cp->rf_size = input_sample_count * pb->parameters.acquisition_count * chunk_channel_count 351 * beamformer_data_kind_byte_size[das_data_kind]; 352 353 read_only local_persist BeamformerDataKind data_kind_to_element_kind[] = { 354 [BeamformerDataKind_Int16] = BeamformerDataKind_Float16, 355 [BeamformerDataKind_Float16] = BeamformerDataKind_Float16, 356 [BeamformerDataKind_Float32] = BeamformerDataKind_Float32, 357 [BeamformerDataKind_Int16Complex] = BeamformerDataKind_Float16, 358 [BeamformerDataKind_Float16Complex] = BeamformerDataKind_Float16, 359 [BeamformerDataKind_Float32Complex] = BeamformerDataKind_Float32, 360 }; 361 362 ////////////////////////////////////// 363 // NOTE(rnp): First Pass: build initial graph and insert hard layout constraints 364 BeamformerComputeGraph graph = {0}; 365 BeamformerComputeGraphNode *root_node = push_compute_graph_node(&graph, BeamformerShaderKind_Count, &scratch); 366 root_node->input_data_kind = input_data_kind; 367 root_node->input_stride.x = 1; // Sample Stride 368 root_node->input_stride.y = pb->parameters.sample_count * acquisition_count; // Channel Stride 369 root_node->input_stride.z = pb->parameters.sample_count; // Receive Event Stride 370 root_node->output_data_kind = input_data_kind; 371 root_node->output_stride.x = 1; // Sample Stride 372 root_node->output_stride.y = pb->parameters.sample_count * acquisition_count; // Channel Stride 373 root_node->output_stride.z = pb->parameters.sample_count; // Receive Event Stride 374 375 for EachIndex(pb->pipeline.shader_count, it) { 376 // NOTE(rnp): skip unnecessary shaders 377 switch (pb->pipeline.shaders[it]) { 378 case BeamformerShaderKind_Hilbert:{if (!run_hilbert) continue;}break; 379 380 case BeamformerShaderKind_Decode:{ 381 if (pb->parameters.decode_mode == BeamformerDecodeMode_None) 382 continue; 383 }break; 384 385 case BeamformerShaderKind_Sum: 386 case BeamformerShaderKind_MinMax: 387 { 388 // NOTE(rnp): currently unsupported 389 continue; 390 }break; 391 392 default:{}break; 393 } 394 395 BeamformerComputeGraphNode *node = push_compute_graph_node(&graph, pb->pipeline.shaders[it], &scratch); 396 node->user_pipeline_index = (i32)it; 397 switch (pb->pipeline.shaders[it]) { 398 case BeamformerShaderKind_Decode:{ 399 b32 low_precision = beamformer_data_kind_element_size[input_data_kind] < 4; 400 b32 use_coop_matrix = vk_gpu_info()->cooperative_matrix && 401 low_precision && 402 (acquisition_count % 16 == 0) && 403 (chunk_channel_count % 16 == 0); 404 405 // NOTE(rnp): fixed input layout required for reasonable performance 406 if (low_precision && beamformer_data_kind_complex[input_data_kind]) 407 node->input_data_kind = BeamformerDataKind_Float16Complex; 408 node->input_stride.x = chunk_channel_count * acquisition_count; 409 node->input_stride.y = acquisition_count; 410 node->input_stride.z = 1; 411 412 if (use_coop_matrix) { 413 node->input_data_kind = BeamformerDataKind_Float16; 414 node->output_data_kind = data_kind_to_element_kind[das_data_kind]; 415 node->output_stride = node->input_stride; 416 } 417 }break; 418 419 case BeamformerShaderKind_DAS:{ 420 node->input_data_kind = das_data_kind; 421 node->input_stride.x = 1; // Sample Stride 422 node->input_stride.y = input_sample_count * acquisition_count; // Channel Stride 423 node->input_stride.z = input_sample_count; // Receive Event Stride 424 node->output_stride.x = 1; 425 node->output_stride.y = cp->output_points.x; 426 node->output_stride.z = cp->output_points.x * cp->output_points.y; 427 node->output_data_kind = cp->iq_pipeline ? BeamformerDataKind_Float32Complex 428 : BeamformerDataKind_Float32; 429 430 // NOTE(rnp): insert implicit CoherencyWeighting node 431 if (pb->parameters.coherency_weighting) 432 node = push_compute_graph_node(&graph, BeamformerShaderKind_CoherencyWeighting, &scratch); 433 }break; 434 435 default:{}break; 436 } 437 } 438 439 ////////////////////////////////////// 440 // NOTE(rnp): Second Pass: resolve layout constraints 441 for (BeamformerComputeGraphNode *node = root_node->next; node; node = node->next) { 442 b32 needs_reshape = 0; 443 444 // NOTE(rnp): data strides 445 { 446 b32 input_dont_care = bv3_any(iv3_equal(node->input_stride, (iv3){0})); 447 b32 prev_output_dont_care = bv3_any(iv3_equal(node->prev->output_stride, (iv3){0})); 448 449 if (prev_output_dont_care && !input_dont_care) 450 node->prev->output_stride = node->input_stride; 451 452 if (!prev_output_dont_care && input_dont_care) 453 node->input_stride = node->prev->output_stride; 454 455 if (prev_output_dont_care && input_dont_care) 456 node->input_stride = node->prev->output_stride = node->prev->input_stride; 457 458 needs_reshape |= !bv3_all(iv3_equal(node->input_stride, node->prev->output_stride)); 459 } 460 461 // NOTE(rnp): data kinds 462 { 463 b32 input_dont_care = node->input_data_kind == BeamformerDataKind_Count; 464 b32 prev_output_dont_care = node->prev->output_data_kind == BeamformerDataKind_Count; 465 466 if (prev_output_dont_care && !input_dont_care) 467 node->prev->output_data_kind = node->input_data_kind; 468 469 if (!prev_output_dont_care && input_dont_care) 470 node->input_data_kind = node->prev->output_data_kind; 471 472 if (prev_output_dont_care && input_dont_care) 473 node->input_data_kind = node->prev->output_data_kind = node->prev->input_data_kind; 474 475 needs_reshape |= node->input_data_kind != node->prev->output_data_kind; 476 } 477 478 // NOTE(rnp): insert reshape if needed 479 if (needs_reshape) { 480 BeamformerComputeGraphNode *new = push_compute_graph_node(0, BeamformerShaderKind_Reshape, &scratch); 481 BeamformerComputeGraphNode *last = node->prev; 482 DLLInsertLast(0, node, last, new, next, prev); 483 graph.count++; 484 new->input_data_kind = new->prev->output_data_kind; 485 new->input_stride = new->prev->output_stride; 486 new->output_data_kind = new->next->input_data_kind; 487 new->output_stride = new->next->input_stride; 488 } 489 } 490 491 f32 time_offset = pb->parameters.time_offset; 492 u32 subgroup_size = vk_gpu_info()->subgroup_size; 493 494 cp->first_image_shader_index = 0; 495 cp->pipeline.shader_count = 0; 496 497 for (BeamformerComputeGraphNode *node = root_node->next; node; node = node->next) { 498 assert(node->prev->output_data_kind == node->input_data_kind); 499 assert(bv3_all(iv3_equal(node->prev->output_stride, node->input_stride))); 500 501 BeamformerShaderParameters *sp = 0; 502 if (node->user_pipeline_index >= 0) 503 sp = pb->pipeline.parameters + node->user_pipeline_index; 504 505 if (compute_plan_push_shader(cp, node, sp)) { 506 BeamformerShaderDescriptor *sd = cp->shader_descriptors + cp->pipeline.shader_count - 1; 507 508 switch (node->kind) { 509 case BeamformerShaderKind_Decode:{ 510 BeamformerDecodeBakeParameters *db = &sd->bake.Decode; 511 512 u32 decode_sample_count = input_sample_count; 513 db->decode_mode = pb->parameters.decode_mode; 514 db->transmit_count = pb->parameters.acquisition_count; 515 db->chunk_channel_count = chunk_channel_count; 516 517 // NOTE(rnp): ignored when using coop matrices 518 db->output_sample_stride = node->output_stride.x; 519 db->output_channel_stride = node->output_stride.y; 520 db->output_transmit_stride = node->output_stride.z; 521 522 db->to_process = 1; 523 524 b32 use_coop_matrix = vk_gpu_info()->cooperative_matrix && 525 node->input_data_kind == BeamformerDataKind_Float16 && 526 (db->transmit_count % 16 == 0) && 527 (chunk_channel_count % 16 == 0); 528 if (use_coop_matrix) { 529 // TODO(rnp): shared memory for larger sizes 530 sd->layout = (uv3){{subgroup_size, 1, 1}}; 531 532 if (demodulate) 533 decode_sample_count *= 2; 534 535 db->cooperative_matrix = 1; 536 db->cooperative_matrix_m = 16; 537 db->cooperative_matrix_n = 16; 538 db->cooperative_matrix_k = 16; 539 540 sd->dispatch.x = db->transmit_count / db->cooperative_matrix_n; 541 sd->dispatch.y = chunk_channel_count / db->cooperative_matrix_m; 542 sd->dispatch.z = decode_sample_count; 543 } else if (db->transmit_count > 40) { 544 db->use_shared_memory = 1; 545 546 if (db->transmit_count == 48) 547 db->to_process = db->transmit_count / 16; 548 549 b32 use_16x = db->transmit_count == 48 || db->transmit_count == 80 || 550 db->transmit_count == 96 || db->transmit_count == 160; 551 sd->layout.x = use_16x ? 16 : 32; 552 sd->layout.y = 4; 553 sd->layout.z = 1; 554 555 sd->dispatch.x = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.x / (f32)db->to_process); 556 sd->dispatch.y = (u32)ceil_f32((f32)chunk_channel_count / (f32)sd->layout.y); 557 sd->dispatch.z = (u32)ceil_f32((f32)decode_sample_count / (f32)sd->layout.z); 558 } else { 559 /* NOTE(rnp): register caching. using more threads will cause the compiler to do 560 * contortions to avoid spilling registers. using less gives higher performance */ 561 sd->layout = (uv3){{subgroup_size / 2, 1, 1}}; 562 563 sd->dispatch.x = (u32)ceil_f32((f32)decode_sample_count / (f32)sd->layout.x); 564 sd->dispatch.y = (u32)ceil_f32((f32)chunk_channel_count / (f32)sd->layout.y); 565 sd->dispatch.z = 1; 566 } 567 }break; 568 569 case BeamformerShaderKind_Demodulate: 570 case BeamformerShaderKind_Filter: 571 { 572 b32 demod = node->kind == BeamformerShaderKind_Demodulate; 573 BeamformerFilter *f = cp->filters + sp->filter_slot; 574 575 time_offset += f->time_delay; 576 577 BeamformerFilterBakeParameters *fb = &sd->bake.Filter; 578 fb->filter_length = (u32)f->length; 579 fb->demodulate = demod; 580 fb->complex_filter = f->parameters.complex; 581 582 fb->sample_count = input_sample_count; 583 fb->decimation_rate = demod ? decimation_rate : 1; 584 585 b32 deinterleave = beamformer_data_kind_complex[node->input_data_kind] && 586 !beamformer_data_kind_complex[node->output_data_kind]; 587 if (deinterleave) 588 fb->batch_sample_count = chunk_channel_count * input_sample_count * pb->parameters.acquisition_count; 589 590 fb->output_sample_stride = node->output_stride.x; 591 fb->output_channel_stride = node->output_stride.y; 592 fb->output_transmit_stride = node->output_stride.z; 593 594 fb->input_sample_stride = node->input_stride.x; 595 fb->input_channel_stride = node->input_stride.y; 596 fb->input_transmit_stride = node->input_stride.z; 597 598 /* NOTE(rnp): when we are demodulating we pretend that the sampler was alternating 599 * between sampling the I portion and the Q portion of an IQ signal. Therefore there 600 * is an implicit decimation factor of 2 which must always be included. All code here 601 * assumes that the signal was sampled in such a way that supports this operation. 602 * To recover IQ[n] from the sampled data (RF[n]) we do the following: 603 * I[n] = RF[n] 604 * Q[n] = RF[n + 1] 605 * IQ[n] = I[n] - j*Q[n] 606 */ 607 if (demod) { 608 fb->demodulation_frequency = pb->parameters.demodulation_frequency; 609 fb->sampling_frequency = pb->parameters.sampling_frequency / 2; 610 } 611 612 sd->layout = (uv3){{subgroup_size, 1, 1}}; 613 sd->dispatch.x = (u32)ceil_f32((f32)input_sample_count / (f32)sd->layout.x); 614 sd->dispatch.y = (u32)ceil_f32((f32)chunk_channel_count / (f32)sd->layout.y); 615 sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z); 616 }break; 617 618 case BeamformerShaderKind_DAS:{ 619 cp->first_image_shader_index = cp->pipeline.shader_count; 620 621 BeamformerDASBakeParameters *db = &sd->bake.DAS; 622 db->sampling_frequency = sampling_frequency; 623 db->demodulation_frequency = pb->parameters.demodulation_frequency; 624 db->speed_of_sound = pb->parameters.speed_of_sound; 625 db->time_offset = time_offset; 626 db->f_number = pb->parameters.f_number; 627 db->acquisition_kind = pb->parameters.acquisition_kind; 628 db->sample_count = input_sample_count; 629 db->channel_count = pb->parameters.channel_count; 630 db->acquisition_count = pb->parameters.acquisition_count; 631 db->chunk_channel_count = chunk_channel_count; 632 db->interpolation_mode = pb->parameters.interpolation_mode; 633 db->transmit_angle = pb->parameters.focal_vector.E[0]; 634 db->focus_depth = pb->parameters.focal_vector.E[1]; 635 db->transmit_receive_orientation = pb->parameters.transmit_receive_orientation; 636 637 // NOTE(rnp): old gcc will miscompile an assignment 638 mem_copy(cp->xdc_transform.E, pb->parameters.xdc_transform.E, sizeof(cp->xdc_transform)); 639 640 cp->voxel_transform = m4_mul(cp->ui_voxel_transform, pb->parameters.das_voxel_transform); 641 cp->xdc_element_pitch = pb->parameters.xdc_element_pitch; 642 643 u32 id = pb->parameters.acquisition_kind; 644 if (id == BeamformerAcquisitionKind_UFORCES || id == BeamformerAcquisitionKind_FORCES) 645 cp->voxel_transform = m4_mul(cp->xdc_transform, cp->voxel_transform); 646 647 db->sparse = id == BeamformerAcquisitionKind_UFORCES || id == BeamformerAcquisitionKind_UHERCULES; 648 db->single_focus = pb->parameters.single_focus; 649 db->single_orientation = pb->parameters.single_orientation; 650 db->coherency_weighting = pb->parameters.coherency_weighting; 651 652 sd->layout = layout_for_output(cp->output_points); 653 sd->dispatch = dispatch_for_output(sd->layout, cp->output_points); 654 }break; 655 656 case BeamformerShaderKind_CoherencyWeighting:{ 657 sd->layout = layout_for_output(cp->output_points); 658 sd->dispatch = dispatch_for_output(sd->layout, cp->output_points); 659 }break; 660 661 case BeamformerShaderKind_Reshape:{ 662 BeamformerReshapeBakeParameters *rb = &sd->bake.Reshape; 663 rb->deinterleave = beamformer_data_kind_complex[node->input_data_kind] && 664 !beamformer_data_kind_complex[node->output_data_kind]; 665 rb->interleave = !beamformer_data_kind_complex[node->input_data_kind] && 666 beamformer_data_kind_complex[node->output_data_kind]; 667 assert(rb->interleave == 0 || (rb->interleave != rb->deinterleave)); 668 669 rb->input_stride_x = node->input_stride.x; 670 rb->input_stride_y = node->input_stride.y; 671 rb->input_stride_z = node->input_stride.z; 672 rb->output_stride_x = node->output_stride.x; 673 rb->output_stride_y = node->output_stride.y; 674 rb->output_stride_z = node->output_stride.z; 675 676 // NOTE(rnp): order doesn't really matter here but it must match the dispatch layout 677 rb->size_x = input_sample_count; 678 rb->size_y = chunk_channel_count; 679 rb->size_z = acquisition_count; 680 681 sd->layout.x = 1; 682 sd->layout.z = Min(subgroup_size, rb->size_z); 683 sd->layout.y = subgroup_size / sd->layout.z; 684 685 sd->dispatch.x = (u32)(ceil_f32((f32)rb->size_x / sd->layout.x)); 686 sd->dispatch.y = (u32)(ceil_f32((f32)rb->size_y / sd->layout.y)); 687 sd->dispatch.z = (u32)(ceil_f32((f32)rb->size_z / sd->layout.z)); 688 }break; 689 690 default:{}break; 691 692 #if 0 693 case BeamformerShaderKind_Sum:{ 694 sd->bake.data_kind = BeamformerDataKind_Float32; 695 if (cp->iq_pipeline) 696 sd->bake.data_kind = BeamformerDataKind_Float32Complex; 697 698 sd->layout = layout_for_output(cp->output_points); 699 sd->dispatch = dispatch_for_output(sd->layout, cp->output_points); 700 701 commit = 1; 702 }break; 703 #endif 704 705 } 706 } 707 } 708 709 cp->pipeline.data_kind = input_data_kind; 710 711 if (cp->first_image_shader_index == 0) 712 cp->first_image_shader_index = cp->pipeline.shader_count; 713 } 714 715 function void 716 stream_append_shader_header(Stream *s, i32 reloadable_index, BeamformerShaderDescriptor *sd, uv3 layout) 717 { 718 stream_append_s8s(s, s8("#version 460 core\n\n" 719 "#extension GL_EXT_buffer_reference : require\n" 720 "#extension GL_EXT_shader_16bit_storage : require\n" 721 "#extension GL_EXT_shader_explicit_arithmetic_types : require\n\n" 722 "#define f32 float32_t\n" 723 "#define f16 float16_t\n" 724 "#define s32 int32_t\n" 725 "#define u64 uint64_t\n" 726 "#define u32 uint32_t\n" 727 "#define s16 int16_t\n" 728 "#define u16 uint16_t\n" 729 "#define s32vec2 i32vec2\n" 730 "#define s16vec2 i16vec2\n" 731 "\n")); 732 733 i32 header_vector_length = beamformer_shader_header_vector_lengths[reloadable_index]; 734 i32 *header_vector = beamformer_shader_header_vectors[reloadable_index]; 735 for (i32 index = 0; index < header_vector_length; index++) 736 stream_append_s8(s, beamformer_shader_global_header_strings[header_vector[index]]); 737 738 if (layout.x != 0) { 739 stream_append_s8(s, s8("layout(local_size_x = ")); 740 stream_append_u64(s, layout.x); 741 stream_append_s8(s, s8(", local_size_y = ")); 742 stream_append_u64(s, layout.y); 743 stream_append_s8(s, s8(", local_size_z = ")); 744 stream_append_u64(s, layout.z); 745 stream_append_s8(s, s8(") in;\n\n")); 746 } 747 748 { 749 u32 max_length = 0; 750 for EachElement(beamformer_data_kind_s8, it) 751 max_length = Max(max_length, (u32)beamformer_data_kind_s8[it].len); 752 753 for EachElement(beamformer_data_kind_s8, it) { 754 stream_append_s8s(s, s8("#define DataKind_"), beamformer_data_kind_s8[it]); 755 stream_pad(s, ' ', max_length - beamformer_data_kind_s8[it].len + 1); 756 stream_append_u64(s, it); 757 stream_append_byte(s, '\n'); 758 } 759 stream_append_byte(s, '\n'); 760 } 761 762 if (sd) { 763 BeamformerDataKind data_kinds[] = {sd->input_data_kind, sd->output_data_kind}; 764 s8 line_prefixes[] = {s8_comp("Input"), s8_comp("Output")}; 765 for EachElement(data_kinds, it) { 766 if (data_kinds[it] != BeamformerDataKind_Count) { 767 stream_append_s8s(s, s8("#define "), line_prefixes[it], s8("DataType "), 768 beamformer_data_kind_glsl_type[data_kinds[it]], 769 s8("\n#define "), line_prefixes[it], s8("DataKind DataKind_"), 770 beamformer_data_kind_s8[data_kinds[it]], 771 s8("\n#define "), line_prefixes[it], s8("DataKindByteSize ")); 772 stream_append_u64(s, beamformer_data_kind_byte_size[data_kinds[it]]); 773 stream_append_byte(s, '\n'); 774 } 775 } 776 stream_append_byte(s, '\n'); 777 778 u32 *parameters = (u32 *)&sd->bake; 779 s8 *names = beamformer_shader_bake_parameter_names[reloadable_index]; 780 u32 float_bits = beamformer_shader_bake_parameter_float_bits[reloadable_index]; 781 i32 count = beamformer_shader_bake_parameter_counts[reloadable_index]; 782 783 for (i32 index = 0; index < count; index++) { 784 stream_append_s8s(s, s8("#define "), names[index], 785 (float_bits & (1 << index))? s8(" uintBitsToFloat") : s8(" "), s8("(0x")); 786 stream_append_hex_u64(s, parameters[index]); 787 stream_append_s8(s, s8(")\n")); 788 } 789 } 790 791 if (!renderdoc_attached()) 792 stream_append_s8(s, s8("\n\n#line 1\n")); 793 } 794 795 function void 796 beamformer_reload_pipeline(VulkanHandle *pipeline, BeamformerShaderReloadInfo *sris, u32 count, Arena arena) 797 { 798 assume(count <= 2); 799 s8 paths[2]; 800 VulkanPipelineCreateInfo infos[2]; 801 802 if (!BakeShaders) { 803 for (u32 i = 0; i < count; i++) 804 paths[i] = push_s8_from_parts(&arena, os_path_separator(), s8("shaders"), sris[i].filename_or_data); 805 } 806 807 u32 push_constants_size = 0; 808 for (u32 i = 0; i < count; i++) { 809 Stream shader_stream = arena_stream(arena); 810 i32 reloadable_index = beamformer_shader_reloadable_index_by_shader[sris[i].shader]; 811 if (i == 0) push_constants_size = beamformer_shader_push_constant_sizes[reloadable_index]; 812 else assert(push_constants_size == beamformer_shader_push_constant_sizes[reloadable_index]); 813 814 stream_append_shader_header(&shader_stream, reloadable_index, sris[i].shader_descriptor, sris[i].layout); 815 816 if (BakeShaders) { 817 stream_append_s8(&shader_stream, sris[i].filename_or_data); 818 } else { 819 shader_stream.widx += os_read_entire_file((c8 *)paths[i].data, 820 shader_stream.data + shader_stream.widx, 821 shader_stream.cap - shader_stream.widx); 822 } 823 824 infos[i].kind = sris[i].shader_kind; 825 infos[i].text = arena_stream_commit_zero(&arena, &shader_stream); 826 infos[i].name = beamformer_shader_names[sris[i].shader]; 827 828 //s8 line = s8("---------------\n"); 829 //s8 nl = s8("\n"); 830 //os_console_log(line.data, line.len); 831 //os_console_log(infos[i].name.data, infos[i].name.len); 832 //os_console_log(nl.data, nl.len); 833 //os_console_log(line.data, line.len); 834 //os_console_log(infos[i].text.data, infos[i].text.len); 835 //os_console_log(line.data, line.len); 836 } 837 838 vk_pipeline_release(*pipeline); 839 *pipeline = vk_pipeline(infos, count, push_constants_size); 840 } 841 842 function void 843 beamformer_reload_render_pipeline(VulkanHandle *pipeline, BeamformerShaderKind shader, Arena arena) 844 { 845 i32 index = beamformer_shader_reloadable_index_by_shader[shader]; 846 BeamformerShaderReloadInfo infos[2] = { 847 { 848 .shader = shader, 849 .shader_kind = beamformer_shader_primitive_is_vertex[index] ? VulkanShaderKind_Vertex : VulkanShaderKind_Mesh, 850 .filename_or_data = BakeShaders ? beamformer_shader_data[index][0] 851 : beamformer_reloadable_shader_files[index][0], 852 }, 853 { 854 .shader = shader, 855 .shader_kind = VulkanShaderKind_Fragment, 856 .filename_or_data = BakeShaders ? beamformer_shader_data[index][1] 857 : beamformer_reloadable_shader_files[index][1], 858 }, 859 }; 860 beamformer_reload_pipeline(pipeline, infos, countof(infos), arena); 861 } 862 863 function void 864 beamformer_reload_compute_pipeline(VulkanHandle *pipeline, BeamformerShaderKind shader, 865 BeamformerShaderDescriptor *shader_descriptor, Arena arena) 866 { 867 i32 index = beamformer_shader_reloadable_index_by_shader[shader]; 868 uv3 layout = shader_descriptor ? shader_descriptor->layout : (uv3){{vk_gpu_info()->subgroup_size, 1, 1}}; 869 BeamformerShaderReloadInfo info = { 870 .shader = shader, 871 .shader_kind = VulkanShaderKind_Compute, 872 .shader_descriptor = shader_descriptor, 873 .filename_or_data = BakeShaders ? beamformer_shader_data[index][0] 874 : beamformer_reloadable_shader_files[index][0], 875 .layout = layout, 876 }; 877 beamformer_reload_pipeline(pipeline, &info, 1, arena); 878 } 879 880 function void 881 beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 block, Arena arena) 882 { 883 BeamformerParameterBlock *pb = beamformer_parameter_block_lock(ctx->shared_memory, block, -1); 884 for EachBit(pb->region_update_flags, region) { 885 switch (region) { 886 case BeamformerParameterRegionFlag_NotifyUI:{ 887 atomic_store_u32(&ctx->ui_dirty_parameter_blocks, 1u << block); 888 }break; 889 890 case BeamformerParameterRegionFlag_ComputePipeline: 891 case BeamformerParameterRegionFlag_Parameters: 892 { 893 cp->output_points = das_valid_points(pb->parameters.output_points.xyz); 894 cp->average_frames = pb->parameters.output_points.E[3]; 895 896 plan_compute_pipeline(cp, pb, arena); 897 898 /* NOTE(rnp): these are both handled by plan_compute_pipeline() */ 899 u32 mask = 1 << BeamformerParameterBlockRegion_ComputePipeline | 900 1 << BeamformerParameterBlockRegion_Parameters; 901 pb->region_update_flags &= ~mask; 902 903 for (u32 shader_slot = 0; shader_slot < cp->pipeline.shader_count; shader_slot++) { 904 u128 hash = u128_hash_from_data(cp->shader_descriptors + shader_slot, sizeof(BeamformerShaderDescriptor)); 905 if (!u128_equal(hash, cp->shader_hashes[shader_slot])) 906 cp->dirty_programs |= 1 << shader_slot; 907 cp->shader_hashes[shader_slot] = hash; 908 } 909 910 cp->acquisition_count = pb->parameters.acquisition_count; 911 cp->acquisition_kind = pb->parameters.acquisition_kind; 912 913 i64 buffer_size = PING_PONG_BUFFER_SLOTS * round_up_to(cp->rf_size, 64); 914 if (ctx->compute_context.ping_pong_buffer.size < buffer_size) { 915 GPUBufferAllocateInfo allocate_info = {.size = buffer_size, .label = s8("PingPongBuffer")}; 916 vk_buffer_allocate(&ctx->compute_context.ping_pong_buffer, &allocate_info); 917 918 BeamformerShaderResourceInfo shader_resource_infos[] = { 919 { 920 .kind = BeamformerShaderResourceKind_Buffer, 921 .handle = ctx->compute_context.ping_pong_buffer.handle, 922 .slot = BeamformerShaderBufferSlot_PingPong, 923 }, 924 }; 925 vk_bind_shader_resources(shader_resource_infos, countof(shader_resource_infos)); 926 // TODO(rnp): figure out how to share with CUDA 927 } 928 929 if (cp->hadamard_order != (i32)cp->acquisition_count) 930 update_hadamard(cp, (i32)cp->acquisition_count, vk_gpu_info()->cooperative_matrix, arena); 931 }break; 932 933 case BeamformerParameterBlockRegion_ChannelMapping:{ 934 cuda_set_channel_mapping(pb->channel_mapping); 935 }break; 936 case BeamformerParameterRegionFlag_TransmitReceiveOrientations:{ 937 GPUBuffer *b = &cp->array_parameters; 938 u32 kind = BeamformerComputeArrayParameterKind_TransmitReceiveOrientations; 939 u64 offset = beamformer_compute_array_parameter_offsets[kind]; 940 u64 size = beamformer_compute_array_parameter_sizes[kind]; 941 { 942 Arena scratch = arena; 943 u16 *u16s = push_array(&scratch, u16, countof(pb->transmit_receive_orientations)); 944 for (u32 i = 0; i < countof(pb->transmit_receive_orientations); i++) 945 u16s[i] = pb->transmit_receive_orientations[i]; 946 947 vk_buffer_range_upload(b, u16s, offset, size, 0); 948 } 949 }break; 950 case BeamformerParameterRegionFlag_FocalVectors: 951 case BeamformerParameterRegionFlag_SparseElements: 952 { 953 u32 kind = BeamformerComputeArrayParameterKind_Count; 954 switch (region) { 955 case BeamformerParameterBlockRegion_FocalVectors:{ 956 kind = BeamformerComputeArrayParameterKind_FocalVectors; 957 }break; 958 case BeamformerParameterBlockRegion_SparseElements:{ 959 kind = BeamformerComputeArrayParameterKind_SparseElements; 960 }break; 961 InvalidDefaultCase; 962 } 963 964 if (kind != BeamformerComputeArrayParameterKind_Count) { 965 GPUBuffer *b = &cp->array_parameters; 966 u64 offset = beamformer_compute_array_parameter_offsets[kind]; 967 u64 size = beamformer_compute_array_parameter_sizes[kind]; 968 vk_buffer_range_upload(b, (u8 *)pb + BeamformerParameterBlockRegionOffsets[region], offset, size, 0); 969 } 970 }break; 971 } 972 } 973 beamformer_parameter_block_unlock(ctx->shared_memory, block); 974 } 975 976 function void 977 do_compute_shader(BeamformerCtx *ctx, VulkanHandle cmd, BeamformerComputePlan *cp, BeamformerFrame *frame, 978 u32 shader_slot, u32 channel_offset, u64 rf_pointer, Arena arena) 979 { 980 BeamformerComputeContext *cc = &ctx->compute_context; 981 982 u32 output_index = !cc->ping_pong_input_index; 983 u32 input_index = cc->ping_pong_input_index; 984 u32 das_output_index = PING_PONG_BUFFER_SLOTS - 1; 985 986 u64 pp_size = cc->ping_pong_buffer.size / PING_PONG_BUFFER_SLOTS; 987 u64 pp_input_pointer = cc->ping_pong_buffer.gpu_pointer + input_index * pp_size; 988 u64 pp_output_pointer = cc->ping_pong_buffer.gpu_pointer + output_index * pp_size; 989 u64 pp_das_pointer = cc->ping_pong_buffer.gpu_pointer + das_output_index * pp_size; 990 991 u32 das_index = cp->first_image_shader_index - 1; 992 993 uv3 dispatch = cp->shader_descriptors[shader_slot].dispatch; 994 995 vk_command_bind_pipeline(cmd, cp->vulkan_pipelines[shader_slot]); 996 997 switch (cp->pipeline.shaders[shader_slot]) { 998 999 case BeamformerShaderKind_Decode:{ 1000 BeamformerDecodePushConstants pc = { 1001 .hadamard_buffer = cp->array_parameters.gpu_pointer + offsetof(BeamformerComputeArrayParameters, Hadamard), 1002 .rf_buffer = pp_input_pointer, 1003 }; 1004 1005 if ((shader_slot + 1) == das_index) pc.output_buffer = pp_das_pointer; 1006 else pc.output_buffer = pp_output_pointer; 1007 1008 GPUMemoryBarrierInfo memory_barriers[]= { 1009 // NOTE(rnp): first pass or last stage output 1010 { 1011 .gpu_buffer = &cc->ping_pong_buffer, 1012 .offset = pp_input_pointer - cc->ping_pong_buffer.gpu_pointer, 1013 .size = pp_size, 1014 }, 1015 // NOTE(rnp): output for DAS 1016 { 1017 .gpu_buffer = &cc->ping_pong_buffer, 1018 .offset = pp_das_pointer - cc->ping_pong_buffer.gpu_pointer, 1019 .size = pp_size, 1020 }, 1021 }; 1022 1023 u32 barrier_count = 1; 1024 if (shader_slot + 1 == das_index) 1025 barrier_count++; 1026 1027 vk_command_buffer_memory_barriers(cmd, memory_barriers, barrier_count); 1028 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1029 vk_command_dispatch_compute(cmd, dispatch); 1030 1031 cc->ping_pong_input_index = !cc->ping_pong_input_index; 1032 }break; 1033 1034 case BeamformerShaderKind_Hilbert:{ 1035 cuda_hilbert(input_index, output_index); 1036 cc->ping_pong_input_index = !cc->ping_pong_input_index; 1037 }break; 1038 1039 case BeamformerShaderKind_Filter: 1040 case BeamformerShaderKind_Demodulate: 1041 { 1042 BeamformerDataKind output_data_kind = cp->shader_descriptors[shader_slot].output_data_kind; 1043 1044 u64 element_size = beamformer_data_kind_byte_size[output_data_kind]; 1045 u32 filter_slot = cp->pipeline.parameters[shader_slot].filter_slot; 1046 BeamformerFilterPushConstants pc = { 1047 .filter_coefficients = cp->filters[filter_slot].buffer.gpu_pointer, 1048 .input_data = shader_slot == 0 ? rf_pointer : pp_input_pointer, 1049 .output_element_offset = output_index * pp_size / element_size, 1050 }; 1051 1052 if ((shader_slot + 1) == das_index) 1053 pc.output_element_offset = das_output_index * pp_size / element_size; 1054 1055 GPUMemoryBarrierInfo memory_barriers[] = { 1056 // NOTE(rnp): last stage output 1057 { 1058 .gpu_buffer = &cc->ping_pong_buffer, 1059 .offset = pp_input_pointer - cc->ping_pong_buffer.gpu_pointer, 1060 .size = pp_size, 1061 }, 1062 // NOTE(rnp): output for DAS 1063 { 1064 .gpu_buffer = &cc->ping_pong_buffer, 1065 .offset = pp_das_pointer - cc->ping_pong_buffer.gpu_pointer, 1066 .size = pp_size, 1067 }, 1068 }; 1069 GPUMemoryBarrierInfo *barriers = memory_barriers; 1070 1071 u32 barrier_count = 2; 1072 if (shader_slot == 0) { 1073 barriers++; 1074 barrier_count--; 1075 } 1076 1077 if ((shader_slot + 1) != das_index) 1078 barrier_count--; 1079 1080 if (barrier_count) 1081 vk_command_buffer_memory_barriers(cmd, barriers, barrier_count); 1082 1083 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1084 vk_command_dispatch_compute(cmd, dispatch); 1085 1086 cc->ping_pong_input_index = !cc->ping_pong_input_index; 1087 }break; 1088 1089 case BeamformerShaderKind_DAS:{ 1090 local_persist u32 das_cycle_t = 0; 1091 1092 GPUBuffer *b = cc->backlog.buffer; 1093 1094 u64 frame_size = beamformer_frame_byte_size(frame->points, frame->data_kind); 1095 u64 iframe_size = frame_size / beamformer_data_kind_element_count[frame->data_kind]; 1096 u64 element_size = beamformer_data_kind_byte_size[cp->shader_descriptors[shader_slot].input_data_kind]; 1097 1098 BeamformerDASPushConstants pc = { 1099 .xdc_element_pitch = cp->xdc_element_pitch, 1100 .rf_element_offset = das_output_index * pp_size / element_size, 1101 .output_frame = b->gpu_pointer + frame->buffer_offset, 1102 .incoherent_frame = b->gpu_pointer + b->size - iframe_size, 1103 .output_size_x = cp->output_points.x, 1104 .output_size_y = cp->output_points.y, 1105 .output_size_z = cp->output_points.z, 1106 .cycle_t = das_cycle_t++, 1107 .channel_offset = channel_offset, 1108 .array_parameters = cp->array_parameters.gpu_pointer + offsetof(BeamformerComputeArrayParameters, FocalVectors), 1109 }; 1110 mem_copy(pc.voxel_transform.E, cp->voxel_transform.E, sizeof(pc.voxel_transform)); 1111 mem_copy(pc.xdc_transform.E, cp->xdc_transform.E, sizeof(pc.xdc_transform)); 1112 1113 b32 coherent = cp->shader_descriptors[shader_slot].bake.DAS.coherency_weighting; 1114 1115 GPUMemoryBarrierInfo memory_barriers[] = { 1116 // NOTE(rnp): last stage data output barrier 1117 { 1118 .gpu_buffer = &cc->ping_pong_buffer, 1119 .offset = pp_das_pointer - cc->ping_pong_buffer.gpu_pointer, 1120 .size = pp_size, 1121 }, 1122 // NOTE(rnp): output clearing pipeline barriers or last DAS pipeline write barriers 1123 { 1124 .gpu_buffer = b, 1125 .offset = frame->buffer_offset, 1126 .size = frame_size, 1127 }, 1128 { 1129 .gpu_buffer = b, 1130 .offset = pc.incoherent_frame - b->gpu_pointer, 1131 .size = iframe_size, 1132 }, 1133 }; 1134 1135 u32 barrier_count = countof(memory_barriers); 1136 if (!coherent) barrier_count--; 1137 1138 vk_command_buffer_memory_barriers(cmd, memory_barriers, barrier_count); 1139 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1140 vk_command_dispatch_compute(cmd, dispatch); 1141 }break; 1142 1143 case BeamformerShaderKind_CoherencyWeighting:{ 1144 GPUBuffer *b = cc->backlog.buffer; 1145 1146 u64 frame_size = beamformer_frame_byte_size(frame->points, frame->data_kind); 1147 u64 iframe_size = frame_size / beamformer_data_kind_element_count[frame->data_kind]; 1148 1149 BeamformerCoherencyWeightingPushConstants pc = { 1150 .left_side_buffer = b->gpu_pointer + frame->buffer_offset, 1151 .right_side_buffer = b->gpu_pointer + b->size - iframe_size, 1152 .scale = 1.0f, 1153 .output_size_x = cp->output_points.x, 1154 .output_size_y = cp->output_points.y, 1155 .output_size_z = cp->output_points.z, 1156 }; 1157 1158 GPUMemoryBarrierInfo memory_barriers[] = { 1159 { 1160 .gpu_buffer = b, 1161 .offset = frame->buffer_offset, 1162 .size = frame_size, 1163 }, 1164 { 1165 .gpu_buffer = b, 1166 .offset = pc.right_side_buffer - b->gpu_pointer, 1167 .size = iframe_size, 1168 }, 1169 }; 1170 1171 vk_command_buffer_memory_barriers(cmd, memory_barriers, countof(memory_barriers)); 1172 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1173 vk_command_dispatch_compute(cmd, dispatch); 1174 }break; 1175 1176 case BeamformerShaderKind_Reshape:{ 1177 BeamformerDataKind input_data_kind = cp->shader_descriptors[shader_slot].input_data_kind; 1178 BeamformerReshapeBakeParameters *rb = &cp->shader_descriptors[shader_slot].bake.Reshape; 1179 u64 input_pointer = shader_slot == 0 ? rf_pointer : pp_input_pointer; 1180 BeamformerReshapePushConstants pc = { 1181 .left_input_buffer = input_pointer, 1182 .right_input_buffer = input_pointer + rb->size_x * rb->size_y * rb->size_z 1183 * beamformer_data_kind_byte_size[input_data_kind], 1184 }; 1185 1186 if ((shader_slot + 1) == das_index) pc.output_buffer = pp_das_pointer; 1187 else pc.output_buffer = pp_output_pointer; 1188 1189 GPUMemoryBarrierInfo memory_barriers[]= { 1190 // NOTE(rnp): first pass or last stage output 1191 { 1192 .gpu_buffer = &cc->ping_pong_buffer, 1193 .offset = pp_input_pointer - cc->ping_pong_buffer.gpu_pointer, 1194 .size = pp_size, 1195 }, 1196 // NOTE(rnp): output for DAS 1197 { 1198 .gpu_buffer = &cc->ping_pong_buffer, 1199 .offset = pp_das_pointer - cc->ping_pong_buffer.gpu_pointer, 1200 .size = pp_size, 1201 }, 1202 }; 1203 1204 u32 barrier_count = 1; 1205 if (shader_slot + 1 == das_index) 1206 barrier_count++; 1207 1208 vk_command_buffer_memory_barriers(cmd, memory_barriers, barrier_count); 1209 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1210 vk_command_dispatch_compute(cmd, dispatch); 1211 1212 cc->ping_pong_input_index = !cc->ping_pong_input_index; 1213 }break; 1214 1215 // NOTE(rnp): invalid stages should be filtered in planning phase 1216 InvalidDefaultCase; 1217 } 1218 1219 #if 0 1220 switch (shader) { 1221 case BeamformerShaderKind_MinMax:{ 1222 for (u32 i = 1; i < frame->image.mip_map_levels; i++) { 1223 glBindImageTexture(0, frame->texture, i - 1, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); 1224 glBindImageTexture(1, frame->texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); 1225 glProgramUniform1i(program, MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i); 1226 1227 u32 width = (u32)frame->dim.x >> i; 1228 u32 height = (u32)frame->dim.y >> i; 1229 u32 depth = (u32)frame->dim.z >> i; 1230 glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32)); 1231 glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); 1232 } 1233 }break; 1234 case BeamformerShaderKind_Sum:{ 1235 u32 aframe_index = ctx->averaged_frame_index % countof(ctx->averaged_frames); 1236 BeamformerFrame *aframe = ctx->averaged_frames + aframe_index; 1237 aframe->id = ctx->averaged_frame_index; 1238 atomic_store_u32(&aframe->ready_to_present, 0); 1239 /* TODO(rnp): hack we need a better way of specifying which frames to sum; 1240 * this is fine for rolling averaging but what if we want to do something else */ 1241 assert(frame >= ctx->beamform_frames); 1242 assert(frame < ctx->beamform_frames + countof(ctx->beamform_frames)); 1243 u32 base_index = (u32)(frame - ctx->beamform_frames); 1244 u32 to_average = (u32)cp->average_frames; 1245 u32 frame_count = 0; 1246 u32 *in_textures = push_array(&arena, u32, BeamformerMaxBacklogFrames); 1247 ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average, to_average); 1248 for (BeamformerFrame *it = frame_next(&cfi); it; it = frame_next(&cfi)) 1249 in_textures[frame_count++] = it->texture; 1250 1251 assert(to_average == frame_count); 1252 1253 glProgramUniform1f(program, SUM_PRESCALE_UNIFORM_LOC, 1 / (f32)frame_count); 1254 /* NOTE: zero output before summing */ 1255 glClearTexImage(aframe->texture, 0, GL_RED, GL_FLOAT, 0); 1256 glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT); 1257 1258 glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F); 1259 for (u32 i = 0; i < in_texture_count; i++) { 1260 glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); 1261 glDispatchCompute(dispatch.x, dispatch.y, dispatch.z); 1262 glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); 1263 } 1264 1265 mem_copy(aframe->voxel_transform.E, frame->voxel_transform.E, sizeof(frame->voxel_transform)); 1266 aframe->compound_count = frame->compound_count; 1267 aframe->acquisition_kind = frame->acquisition_kind; 1268 }break; 1269 } 1270 #endif 1271 } 1272 1273 function void 1274 complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena) 1275 { 1276 BeamformerComputeContext * cs = &ctx->compute_context; 1277 BeamformerSharedMemory * sm = ctx->shared_memory; 1278 1279 for (BeamformWork *work = beamform_work_queue_pop(q); 1280 work; 1281 beamform_work_queue_pop_commit(q), work = beamform_work_queue_pop(q)) 1282 { 1283 switch (work->kind) { 1284 1285 case BeamformerWorkKind_ExportBuffer:{ 1286 /* TODO(rnp): better way of handling DispatchCompute barrier */ 1287 post_sync_barrier(ctx->shared_memory, BeamformerSharedMemoryLockKind_DispatchCompute); 1288 beamformer_shared_memory_take_lock(ctx->shared_memory, (i32)work->lock, (u32)-1); 1289 BeamformerExportContext *ec = &work->export_context; 1290 switch (ec->kind) { 1291 case BeamformerExportKind_BeamformedData:{ 1292 BeamformerFrame *f = ctx->latest_frame; 1293 if (f) { 1294 u64 frame_size = beamformer_frame_byte_size(f->points, f->data_kind); 1295 assert((frame_size & 63) == 0); 1296 if (frame_size <= ec->size) { 1297 vk_host_wait_timeline(VulkanTimeline_Compute, f->timeline_valid_value, -1ULL); 1298 vk_buffer_range_download(beamformer_shared_memory_scratch_arena(sm, ctx->shared_memory_size).beg, 1299 ctx->compute_context.backlog.buffer, f->buffer_offset, 1300 frame_size, 1); 1301 } 1302 } 1303 }break; 1304 case BeamformerExportKind_Stats:{ 1305 ComputeTimingTable *table = ctx->compute_timing_table; 1306 /* NOTE(rnp): do a little spin to let this finish updating */ 1307 spin_wait(table->write_index != atomic_load_u32(&table->read_index)); 1308 ComputeShaderStats *stats = ctx->compute_shader_stats; 1309 if (sizeof(stats->table) <= ec->size) 1310 mem_copy(beamformer_shared_memory_scratch_arena(sm, ctx->shared_memory_size).beg, 1311 &stats->table, sizeof(stats->table)); 1312 }break; 1313 InvalidDefaultCase; 1314 } 1315 beamformer_shared_memory_release_lock(ctx->shared_memory, work->lock); 1316 post_sync_barrier(ctx->shared_memory, BeamformerSharedMemoryLockKind_ExportSync); 1317 }break; 1318 1319 case BeamformerWorkKind_CreateFilter:{ 1320 /* TODO(rnp): this should probably get deleted and moved to lazy loading */ 1321 BeamformerCreateFilterContext *fctx = &work->create_filter_context; 1322 u32 block = fctx->parameter_block; 1323 u32 slot = fctx->filter_slot; 1324 BeamformerComputePlan *cp = beamformer_compute_plan_for_block(cs, block, arena); 1325 beamformer_filter_update(cp->filters + slot, fctx->parameters, block, slot, *arena); 1326 }break; 1327 1328 case BeamformerWorkKind_ComputeIndirect: 1329 case BeamformerWorkKind_Compute: 1330 { 1331 push_compute_timing_info(ctx->compute_timing_table, 1332 (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameBegin}); 1333 1334 BeamformerComputePlan *cp = beamformer_compute_plan_for_block(cs, work->compute_context.parameter_block, arena); 1335 if unlikely(beamformer_parameter_block_dirty(sm, work->compute_context.parameter_block)) { 1336 u32 block = work->compute_context.parameter_block; 1337 beamformer_commit_parameter_block(ctx, cp, block, *arena); 1338 } 1339 1340 post_sync_barrier(ctx->shared_memory, BeamformerSharedMemoryLockKind_DispatchCompute); 1341 1342 u32 dirty_programs = atomic_swap_u32(&cp->dirty_programs, 0); 1343 static_assert(IsPowerOfTwo(BeamformerMaxComputeShaderStages), ""); 1344 assert((dirty_programs & ~((u32)BeamformerMaxComputeShaderStages - 1)) == 0); 1345 if unlikely(dirty_programs) { 1346 for EachBit(dirty_programs, slot) { 1347 beamformer_reload_compute_pipeline(cp->vulkan_pipelines + slot, 1348 cp->pipeline.shaders[slot], 1349 cp->shader_descriptors + slot, *arena); 1350 } 1351 } 1352 1353 atomic_store_u32(&cs->processing_compute, 1); 1354 1355 start_renderdoc_capture(); 1356 1357 i32 das_index = -1; 1358 b32 has_sum = 0; 1359 for (u32 i = 0; i < cp->pipeline.shader_count; i++) { 1360 has_sum |= cp->pipeline.shaders[i] == BeamformerShaderKind_Sum; 1361 if (cp->pipeline.shaders[i] == BeamformerShaderKind_DAS) 1362 das_index = (i32)i; 1363 } 1364 1365 b32 das_coherent = das_index >= 0 && cp->shader_descriptors[das_index].bake.DAS.coherency_weighting; 1366 u64 reserved_frame_size = 0; 1367 1368 if (has_sum) 1369 reserved_frame_size += beamformer_frame_byte_size(cp->output_points, cp->iq_pipeline ? 1370 BeamformerDataKind_Float32Complex : 1371 BeamformerDataKind_Float32); 1372 1373 // TODO(rnp): incoherent sum for different data kinds 1374 if (das_coherent) 1375 reserved_frame_size += beamformer_frame_byte_size(cp->output_points, BeamformerDataKind_Float32); 1376 1377 BeamformerFrame *frame = beamformer_frame_next(cs, cp->output_points, cp->iq_pipeline, reserved_frame_size); 1378 frame->acquisition_kind = cp->acquisition_kind; 1379 frame->compound_count = cp->acquisition_count; 1380 frame->view_plane_tag = work->compute_context.view_plane; 1381 mem_copy(frame->voxel_transform.E, cp->voxel_transform.E, sizeof(cp->voxel_transform)); 1382 1383 VulkanHandle cmd = vk_command_begin(VulkanTimeline_Compute); 1384 vk_command_timestamp(cmd); 1385 1386 if (das_index >= 0) { 1387 GPUBuffer *backlog = cs->backlog.buffer; 1388 u32 subgroup_size = vk_gpu_info()->subgroup_size; 1389 BeamformerBufferClearPushConstants pc = { 1390 .data = backlog->gpu_pointer + frame->buffer_offset, 1391 .clear_v4 = (uv4){{0}}, 1392 .bins = beamformer_frame_byte_size(frame->points, frame->data_kind) / sizeof(uv4), 1393 }; 1394 1395 u32 index = BeamformerShaderKind_BufferClear - BeamformerShaderKind_ComputeInternalFirst; 1396 vk_command_bind_pipeline(cmd, cs->compute_internal_pipelines[index]); 1397 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1398 vk_command_dispatch_compute(cmd, (uv3){{(u32)ceil_f32((f32)pc.bins / subgroup_size), 1, 1}}); 1399 1400 if (das_coherent) { 1401 assert((pc.bins % beamformer_data_kind_element_count[frame->data_kind]) == 0); 1402 pc.bins = pc.bins / beamformer_data_kind_element_count[frame->data_kind]; 1403 pc.data = backlog->gpu_pointer + backlog->size - sizeof(uv4) * pc.bins; 1404 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1405 vk_command_dispatch_compute(cmd, (uv3){{(u32)ceil_f32((f32)pc.bins / subgroup_size), 1, 1}}); 1406 } 1407 } 1408 1409 BeamformerRFBuffer *rf = &cs->rf_buffer; 1410 u32 compute_index = rf->compute_index; 1411 u32 slot = compute_index % countof(rf->upload_complete_values); 1412 1413 if (work->kind == BeamformerWorkKind_ComputeIndirect) { 1414 // TODO(rnp): this shouldn't be necessary, there should be a way of communicating 1415 // what the value will be so that the only the command wait is needed. 1416 spin_wait(atomic_load_u64(&rf->insertion_index) <= compute_index); 1417 1418 /* NOTE(rnp): if the GPU supports BAR there may be no need to synchronize 1419 * other than the above spin */ 1420 if (vk_buffer_needs_sync(&rf->buffer)) 1421 vk_command_wait_timeline(cmd, VulkanTimeline_Transfer, rf->upload_complete_values[slot]); 1422 } else { 1423 slot = (rf->compute_index - 1) % countof(rf->upload_complete_values); 1424 } 1425 1426 for (u32 channel_offset = 0; 1427 channel_offset < cp->channel_count; 1428 channel_offset += BeamformerChunkChannelCount) 1429 { 1430 u64 rf_pointer = rf->buffer.gpu_pointer + slot * rf->active_rf_size; 1431 rf_pointer += cp->raw_channel_byte_stride * channel_offset; 1432 for (u32 i = 0; i < cp->first_image_shader_index; i++) { 1433 do_compute_shader(ctx, cmd, cp, frame, i, channel_offset, rf_pointer, *arena); 1434 vk_command_timestamp(cmd); 1435 } 1436 } 1437 1438 for (u32 i = cp->first_image_shader_index; i < cp->pipeline.shader_count; i++) { 1439 do_compute_shader(ctx, cmd, cp, frame, i, 0, 0, *arena); 1440 vk_command_timestamp(cmd); 1441 } 1442 1443 u64 end_timeline_value = vk_command_end(cmd, (VulkanHandle){0}, (VulkanHandle){0}); 1444 if (work->kind == BeamformerWorkKind_ComputeIndirect) { 1445 atomic_store_u64(rf->compute_complete_values + slot, end_timeline_value); 1446 atomic_add_u64(&rf->compute_index, 1); 1447 } 1448 1449 atomic_store_u64(&frame->timeline_valid_value, end_timeline_value); 1450 1451 { 1452 Arena scratch = *arena; 1453 /* NOTE(rnp): this blocks until work completes */ 1454 u64 *timestamps = vk_command_read_timestamps(VulkanTimeline_Compute, &scratch); 1455 1456 i32 steps = ((i32)cp->channel_count / BeamformerChunkChannelCount) - 1; 1457 i32 step = 0; 1458 u32 shader_index = 0; 1459 u64 last_time = timestamps[0] > 0 ? timestamps[1] : 0; 1460 1461 for (u64 i = 2; i < timestamps[0] + 1; i++) { 1462 push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){ 1463 .kind = ComputeTimingInfoKind_Shader, 1464 .shader = cp->pipeline.shaders[shader_index], 1465 .shader_slot = shader_index, 1466 .timer_count = timestamps[i] - last_time, 1467 }); 1468 last_time = timestamps[i]; 1469 1470 shader_index++; 1471 if (shader_index == cp->first_image_shader_index && step < steps) { 1472 shader_index = 0; 1473 step++; 1474 } 1475 } 1476 } 1477 1478 cs->processing_progress = 1; 1479 1480 if (has_sum) { 1481 #if 0 1482 u32 aframe_index = ((ctx->averaged_frame_index++) % countof(ctx->averaged_frames)); 1483 ctx->averaged_frames[aframe_index].view_plane_tag = frame->view_plane_tag; 1484 ctx->averaged_frames[aframe_index].ready_to_present = 1; 1485 atomic_store_u64((u64 *)&ctx->latest_frame, (u64)(ctx->averaged_frames + aframe_index)); 1486 #endif 1487 } else { 1488 atomic_store_u64((u64 *)&ctx->latest_frame, (u64)frame); 1489 } 1490 1491 atomic_store_u32(&cs->processing_compute, 0); 1492 1493 push_compute_timing_info(ctx->compute_timing_table, 1494 (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameEnd}); 1495 1496 end_renderdoc_capture(); 1497 }break; 1498 InvalidDefaultCase; 1499 } 1500 } 1501 } 1502 1503 function void 1504 coalesce_timing_table(ComputeTimingTable *t, ComputeShaderStats *stats) 1505 { 1506 /* TODO(rnp): we do not currently do anything to handle the potential for a half written 1507 * info item. this could result in garbage entries but they shouldn't really matter */ 1508 1509 u32 target = atomic_load_u32(&t->write_index); 1510 u32 stats_index = stats->latest_frame_index; 1511 1512 b32 has_rf = 0; 1513 f32 gpu_clocks_to_nano = 1.0e-9f * vk_gpu_info()->timestamp_period_ns; 1514 1515 // NOTE(rnp): not equal (the index may wrap) 1516 while (t->read_index != target) { 1517 ComputeTimingInfo info = t->buffer[t->read_index % countof(t->buffer)]; 1518 switch (info.kind) { 1519 1520 case ComputeTimingInfoKind_ComputeFrameBegin:{ 1521 assert(t->compute_frame_active == 0); 1522 t->compute_frame_active = 1; 1523 /* NOTE(rnp): allow multiple instances of same shader to accumulate */ 1524 t->in_flight_shader_count = 0; 1525 memory_clear(t->in_flight_shader_ids, 0, sizeof(t->in_flight_shader_ids)); 1526 memory_clear(stats->table.times[stats_index], 0, sizeof(stats->table.times[stats_index])); 1527 }break; 1528 1529 case ComputeTimingInfoKind_ComputeFrameEnd:{ 1530 assert(t->compute_frame_active == 1); 1531 t->compute_frame_active = 0; 1532 stats_index = stats->latest_frame_index = (stats_index + 1) % countof(stats->table.times); 1533 stats->table.shader_count = t->in_flight_shader_count; 1534 mem_copy(stats->table.shader_ids, t->in_flight_shader_ids, sizeof(t->in_flight_shader_ids)); 1535 }break; 1536 1537 case ComputeTimingInfoKind_Shader:{ 1538 t->in_flight_shader_count = Max(t->in_flight_shader_count, info.shader_slot + 1u); 1539 t->in_flight_shader_ids[info.shader_slot] = info.shader; 1540 stats->table.times[stats_index][info.shader_slot] += info.timer_count * gpu_clocks_to_nano; 1541 }break; 1542 1543 case ComputeTimingInfoKind_RF_Data:{ 1544 stats->latest_rf_index = (stats->latest_rf_index + 1) % countof(stats->table.rf_time_deltas); 1545 f32 delta = info.timer_count / (f32)os_system_info()->timer_frequency; 1546 stats->table.rf_time_deltas[stats->latest_rf_index] = delta; 1547 has_rf = 1; 1548 }break; 1549 } 1550 /* NOTE(rnp): do this at the end so that stats table is always in a consistent state */ 1551 t->read_index++; 1552 } 1553 1554 for (u32 i = 0; i < stats->table.shader_count; i++) { 1555 f32 sum = 0; 1556 for EachElement(stats->table.times, it) 1557 sum += stats->table.times[it][i]; 1558 stats->average_times[i] = sum / countof(stats->table.times); 1559 } 1560 1561 if (has_rf) { 1562 f32 sum = 0; 1563 for EachElement(stats->table.rf_time_deltas, i) 1564 sum += stats->table.rf_time_deltas[i]; 1565 stats->rf_time_delta_average = sum / countof(stats->table.rf_time_deltas); 1566 } 1567 } 1568 1569 DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute) 1570 { 1571 BeamformerSharedMemory *sm = ctx->shared_memory; 1572 complete_queue(ctx, &sm->external_work_queue, arena); 1573 complete_queue(ctx, ctx->beamform_work_queue, arena); 1574 } 1575 1576 DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload) 1577 { 1578 BeamformerSharedMemory *sm = ctx->shared_memory; 1579 BeamformerSharedMemoryLockKind scratch_lock = BeamformerSharedMemoryLockKind_ScratchSpace; 1580 BeamformerSharedMemoryLockKind upload_lock = BeamformerSharedMemoryLockKind_UploadRF; 1581 1582 u64 rf_block_rf_size; 1583 if (atomic_load_u32(sm->locks + upload_lock) && 1584 (rf_block_rf_size = atomic_swap_u64(&sm->rf_block_rf_size, 0))) 1585 { 1586 beamformer_shared_memory_take_lock(ctx->shared_memory, (i32)scratch_lock, (u32)-1); 1587 1588 BeamformerRFBuffer *rf = ctx->rf_buffer; 1589 1590 rf->active_rf_size = vk_round_up_to_sync_size(rf_block_rf_size & 0xFFFFFFFFULL, 64); 1591 if unlikely(rf->buffer.size < countof(rf->upload_complete_values) * rf->active_rf_size) { 1592 GPUBufferAllocateInfo allocate_info = { 1593 .size = countof(rf->upload_complete_values) * rf->active_rf_size, 1594 .flags = VulkanUsageFlag_HostReadWrite, 1595 .label = s8("RawRFBuffer"), 1596 }; 1597 vk_buffer_allocate(&rf->buffer, &allocate_info); 1598 } 1599 1600 u64 slot = rf->insertion_index % countof(rf->upload_complete_values); 1601 1602 /* NOTE(rnp): don't overwrite slot if the compute thread hasn't processed it */ 1603 spin_wait(atomic_load_u64(&rf->compute_index) < rf->insertion_index); 1604 vk_host_wait_timeline(VulkanTimeline_Compute, rf->compute_complete_values[slot], -1ULL); 1605 1606 vk_buffer_range_upload(&rf->buffer, beamformer_shared_memory_scratch_arena(sm, ctx->shared_memory_size).beg, 1607 slot * rf->active_rf_size, rf->active_rf_size, 1); 1608 store_fence(); 1609 1610 beamformer_shared_memory_release_lock(ctx->shared_memory, (i32)scratch_lock); 1611 post_sync_barrier(ctx->shared_memory, upload_lock); 1612 1613 atomic_store_u64(rf->upload_complete_values + slot, vk_host_signal_timeline(VulkanTimeline_Transfer)); 1614 atomic_add_u64(&rf->insertion_index, 1); 1615 1616 os_wake_all_waiters(ctx->compute_worker_sync); 1617 1618 u64 current_time = os_timer_count(); 1619 push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){ 1620 .kind = ComputeTimingInfoKind_RF_Data, 1621 .timer_count = current_time - rf->timestamp, 1622 }); 1623 rf->timestamp = current_time; 1624 } 1625 } 1626 1627 function void 1628 beamformer_queue_compute(BeamformerCtx *ctx, BeamformerFrame *frame, u32 parameter_block) 1629 { 1630 BeamformerSharedMemory *sm = ctx->shared_memory; 1631 BeamformerSharedMemoryLockKind dispatch_lock = BeamformerSharedMemoryLockKind_DispatchCompute; 1632 if (!sm->live_imaging_parameters.active && beamformer_shared_memory_take_lock(sm, (i32)dispatch_lock, 0)) 1633 { 1634 BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue); 1635 if (work) { 1636 work->kind = BeamformerWorkKind_Compute; 1637 work->compute_context.view_plane = frame ? frame->view_plane_tag : 0; 1638 work->compute_context.parameter_block = parameter_block; 1639 beamform_work_queue_push_commit(ctx->beamform_work_queue); 1640 } 1641 } 1642 os_wake_all_waiters(&ctx->compute_worker.sync_variable); 1643 } 1644 1645 #include "ui.c" 1646 1647 function void 1648 beamformer_process_input_events(BeamformerCtx *ctx, BeamformerInput *input, 1649 BeamformerInputEvent *events, u32 event_count) 1650 { 1651 for (u32 index = 0; index < event_count; index++) { 1652 BeamformerInputEvent *event = events + index; 1653 switch (event->kind) { 1654 1655 case BeamformerInputEventKind_ExecutableReload:{ 1656 ui_init(ctx, ctx->ui_backing_store); 1657 1658 if (!vk_pipeline_valid(ctx->compute_context.compute_internal_pipelines[0])) { 1659 for EachElement(ctx->compute_context.compute_internal_pipelines, it) { 1660 beamformer_reload_compute_pipeline(ctx->compute_context.compute_internal_pipelines + it, 1661 BeamformerShaderKind_ComputeInternalFirst + it, 0, 1662 ctx->arena); 1663 } 1664 } 1665 }break; 1666 1667 case BeamformerInputEventKind_FileEvent:{ 1668 BeamformerFileReloadContext *frc = event->file_watch_user_context; 1669 switch (frc->kind) { 1670 case BeamformerFileReloadKind_ComputeInternalShader:{ 1671 // TODO(rnp): this could stall, better to push it onto compute once queue is better 1672 beamformer_reload_compute_pipeline(frc->shader_reload.pipeline, frc->shader_reload.shader, 0, ctx->arena); 1673 }break; 1674 1675 case BeamformerFileReloadKind_ComputeShader:{ 1676 for EachElement(ctx->compute_context.compute_plans, block) { 1677 BeamformerComputePlan *cp = ctx->compute_context.compute_plans[block]; 1678 for (u32 slot = 0; cp && slot < cp->pipeline.shader_count; slot++) { 1679 i32 shader_index = beamformer_shader_reloadable_index_by_shader[cp->pipeline.shaders[slot]]; 1680 if (beamformer_reloadable_shader_kinds[shader_index] == frc->shader_reload.shader) 1681 atomic_or_u32(&cp->dirty_programs, 1 << slot); 1682 } 1683 } 1684 1685 // TODO(rnp): track latest parameter block 1686 if (ctx->latest_frame) 1687 beamformer_queue_compute(ctx, ctx->latest_frame, 0); 1688 }break; 1689 1690 case BeamformerFileReloadKind_RenderShader:{ 1691 beamformer_reload_render_pipeline(frc->shader_reload.pipeline, frc->shader_reload.shader, ctx->arena); 1692 ctx->render_shader_updated = 1; 1693 }break; 1694 1695 InvalidDefaultCase; 1696 } 1697 }break; 1698 1699 InvalidDefaultCase; 1700 } 1701 } 1702 } 1703 1704 BEAMFORMER_EXPORT void 1705 beamformer_frame_step(BeamformerInput *input) 1706 { 1707 BeamformerCtx *ctx = BeamformerContextMemory(input->memory); 1708 1709 u64 current_time = os_timer_count(); 1710 dt_for_frame = (f64)(current_time - ctx->frame_timestamp) / os_system_info()->timer_frequency; 1711 ctx->frame_timestamp = current_time; 1712 1713 if (IsWindowResized()) { 1714 ctx->window_size.h = GetScreenHeight(); 1715 ctx->window_size.w = GetScreenWidth(); 1716 } 1717 1718 coalesce_timing_table(ctx->compute_timing_table, ctx->compute_shader_stats); 1719 1720 beamformer_process_input_events(ctx, input, input->event_queue, input->event_count); 1721 1722 BeamformerSharedMemory *sm = ctx->shared_memory; 1723 if (atomic_load_u32(sm->locks + BeamformerSharedMemoryLockKind_UploadRF)) 1724 os_wake_all_waiters(&ctx->upload_worker.sync_variable); 1725 if (atomic_load_u32(sm->locks + BeamformerSharedMemoryLockKind_DispatchCompute)) 1726 os_wake_all_waiters(&ctx->compute_worker.sync_variable); 1727 1728 BeamformerFrame *frame = ctx->latest_frame; 1729 BeamformerViewPlaneTag tag = frame? frame->view_plane_tag : 0; 1730 draw_ui(ctx, input, frame, tag); 1731 1732 ctx->render_shader_updated = 0; 1733 }