ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: f33f8f7270186a95011e8cf201acb3b50733cd4f
Parent: 3e4bea29377e32bee5ef97cc5efef02310587b1b
Author: Randy Palamar
Date:   Thu,  7 May 2026 15:28:11 -0600

core: migrate to vulkan compute

Most things are working here. The only relevant thing currently
missing is frame averaging, which no one really uses.
This commit has a minor performance regression, which I saw before
when I tried switching the images in OpenGL to a large SSBO. It can
be solved by not doing the "DAS Fast" thing (running a single channel
at a time), but this may cause issues on lower-end devices. The
next commit will implement a new optimization which should solve
this universally.

Diffstat:
Mbeamformer.c | 338++++++++++++++++++-------------------------------------------------------------
Mbeamformer.h | 1+
Mbeamformer.meta | 124++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mbeamformer_core.c | 1620++++++++++++++++++++++++++++++++++++++++---------------------------------------
Mbeamformer_internal.h | 383+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mbeamformer_parameters.h | 6++++--
Mbeamformer_shared_memory.c | 9++++-----
Mbuild.c | 204++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
Dexternal/include/raylib_extended.h | 2--
Dexternal/rcore_extended.c | 8--------
Mgenerated/beamformer.meta.c | 316++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
Mlib/ogl_beamformer_lib.c | 15+++++++++++++--
Mlib/ogl_beamformer_lib_base.h | 4++++
Mmain_linux.c | 11+----------
Mmain_w32.c | 11+----------
Mmath.c | 20+++-----------------
Mopengl.h | 131++++++++++++-------------------------------------------------------------------
Ashaders/buffer_clear.glsl | 11+++++++++++
Ashaders/coherency_weighting.glsl | 41+++++++++++++++++++++++++++++++++++++++++
Mshaders/das.glsl | 161++++++++++++++++++++++++++++++++++++++++++-------------------------------------
Mshaders/decode.glsl | 108++++++++++++++++++++++++++++++++++---------------------------------------------
Mshaders/filter.glsl | 24+++++++++++++-----------
Mshaders/render_3d.frag.glsl | 60++++++++++++++++++++++++++++++++++++++++++++----------------
Ashaders/render_3d.vert.glsl | 19+++++++++++++++++++
Mui.c | 528+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Mutil.c | 8--------
Mutil.h | 16+++++++++-------
Dutil_gl.c | 69---------------------------------------------------------------------
Mutil_os.c | 18++++++++++++++++++
Mvulkan.c | 1838++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
Mvulkan.h | 1179+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
31 files changed, 4942 insertions(+), 2341 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -56,30 +56,10 @@ fatal(s8 message) #include "vulkan.c" -// TODO(rnp): none of this belongs here, but will be removed +// TODO(rnp): this doesn't belong here, but will be removed // once vulkan migration is complete -#define GLFW_VISIBLE 0x00020004 -void glfwWindowHint(i32, i32); -iptr glfwCreateWindow(i32, i32, char *, iptr, iptr); -void glfwMakeContextCurrent(iptr); -iptr glfwGetGLXContext(iptr); -iptr glfwGetWGLContext(iptr); void * glfwGetProcAddress(char *); -#if OS_WINDOWS -function iptr -os_get_native_gl_context(iptr window) -{ - return glfwGetWGLContext(window); -} -#else -function iptr -os_get_native_gl_context(iptr window) -{ - return glfwGetGLXContext(window); -} -#endif - function void gl_debug_logger(u32 src, u32 type, u32 id, u32 lvl, i32 len, const char *msg, const void *userctx) { @@ -100,7 +80,12 @@ load_gl(Stream *err) stream_reset(err, 0); #define X(name, ret, params) if (!name) stream_append_s8(err, s8("missing required GL function: " #name "\n")); OGLProcedureList - OGLRequiredExtensionProcedureList + OGLRequiredExtensionProcedureListBase + #if OS_WINDOWS + OGLRequiredExtensionProcedureListW32 + #else + OGLRequiredExtensionProcedureListLinux + #endif #undef X if (err->widx) fatal(stream_to_s8(err)); @@ -129,41 +114,6 @@ beamformer_load_cuda_library(BeamformerCtx *ctx, OSLibrary cuda, Arena arena) #undef X } -function BeamformerRenderModel -render_model_from_arrays(f32 *vertices, f32 *normals, i32 vertices_size, u16 *indices, i32 index_count) -{ - BeamformerRenderModel result = {0}; - - i32 buffer_size = vertices_size * 2 + index_count * (i32)sizeof(u16); - i32 indices_offset = vertices_size * 2; - i32 indices_size = index_count * (i32)sizeof(u16); - - result.elements = index_count; - result.elements_offset = indices_offset; - - glCreateBuffers(1, &result.buffer); - glNamedBufferStorage(result.buffer, buffer_size, 0, GL_DYNAMIC_STORAGE_BIT); - glNamedBufferSubData(result.buffer, 0, vertices_size, 
vertices); - glNamedBufferSubData(result.buffer, vertices_size, vertices_size, normals); - glNamedBufferSubData(result.buffer, indices_offset, indices_size, indices); - - glCreateVertexArrays(1, &result.vao); - glVertexArrayVertexBuffer(result.vao, 0, result.buffer, 0, 3 * sizeof(f32)); - glVertexArrayVertexBuffer(result.vao, 1, result.buffer, vertices_size, 3 * sizeof(f32)); - glVertexArrayElementBuffer(result.vao, result.buffer); - - glEnableVertexArrayAttrib(result.vao, 0); - glEnableVertexArrayAttrib(result.vao, 1); - - glVertexArrayAttribFormat(result.vao, 0, 3, GL_FLOAT, 0, 0); - glVertexArrayAttribFormat(result.vao, 1, 3, GL_FLOAT, 0, (u32)vertices_size); - - glVertexArrayAttribBinding(result.vao, 0, 0); - glVertexArrayAttribBinding(result.vao, 1, 0); - - return result; -} - function void worker_thread_sleep(GLWorkerThreadContext *ctx, BeamformerSharedMemory *sm) { @@ -186,17 +136,12 @@ function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point) { GLWorkerThreadContext *ctx = user_context; - glfwMakeContextCurrent(ctx->window_handle); - ctx->gl_context = os_get_native_gl_context(ctx->window_handle); - BeamformerCtx *beamformer = (BeamformerCtx *)ctx->user_context; - glCreateQueries(GL_TIME_ELAPSED, countof(beamformer->compute_context.shader_timer_ids), - beamformer->compute_context.shader_timer_ids); for (;;) { worker_thread_sleep(ctx, beamformer->shared_memory); asan_poison_region(ctx->arena.beg, ctx->arena.end - ctx->arena.beg); - beamformer_complete_compute(ctx->user_context, &ctx->arena, ctx->gl_context); + beamformer_complete_compute(beamformer, &ctx->arena); } unreachable(); @@ -206,31 +151,8 @@ function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point) function OS_THREAD_ENTRY_POINT_FN(beamformer_upload_entry_point) { - GLWorkerThreadContext *ctx = user_context; - glfwMakeContextCurrent(ctx->window_handle); - ctx->gl_context = os_get_native_gl_context(ctx->window_handle); - - BeamformerUploadThreadContext *up = 
(typeof(up))ctx->user_context; - BeamformerRFBuffer *rf = up->rf_buffer; - glCreateQueries(GL_TIMESTAMP, 1, &rf->data_timestamp_query); - /* NOTE(rnp): start this here so we don't have to worry about it being started or not */ - glQueryCounter(rf->data_timestamp_query, GL_TIMESTAMP); - - glGenSemaphoresEXT(countof(rf->gl_upload_semaphores), rf->gl_upload_semaphores); - for EachElement(rf->vk_upload_semaphores, it) { - OSHandle export = {0}; - rf->vk_upload_semaphores[it] = vk_semaphore_create(rf->upload_semaphores_handles + it); - - if (OS_WINDOWS) { - glImportSemaphoreWin32HandleEXT(rf->gl_upload_semaphores[it], GL_HANDLE_TYPE_OPAQUE_WIN32_EXT, - (void *)export.value[0]); - // NOTE(rnp): w32 does not transfer ownership from handle back to driver - rf->upload_semaphores_handles[it] = export; - } else { - glImportSemaphoreFdEXT(rf->gl_upload_semaphores[it], GL_HANDLE_TYPE_OPAQUE_FD_EXT, export.value[0]); - rf->upload_semaphores_handles[it].value[0] = OSInvalidHandleValue; - } - } + GLWorkerThreadContext *ctx = user_context; + BeamformerUploadThreadContext *up = (typeof(up))ctx->user_context; for (;;) { worker_thread_sleep(ctx, up->shared_memory); @@ -264,6 +186,45 @@ beamformer_init(BeamformerInput *input) vk_load(input->vulkan_library_handle, &memory, &ctx->error_stream); + BeamformerComputeContext *cs = &ctx->compute_context; + + // NOTE(rnp): allocate beamformed image ring buffer + { + u64 gpu_heap_size = vk_gpu_info()->gpu_heap_size; + u64 trial_sizes[] = { + GB(4), + GB(2), + GB(1) + MB(512), + GB(1), + }; + + u32 base_index = 0; + for EachElement(trial_sizes, it) { + if (gpu_heap_size >= 2 * trial_sizes[it]) + break; + base_index++; + } + + for (u32 i = base_index; i < countof(trial_sizes); i++) { + // TODO(rnp): it may be better to download data from this using the transfer queue + VulkanTimeline timelines[] = {VulkanTimeline_Compute, VulkanTimeline_Graphics}; + GPUBufferAllocateInfo allocate_info = { + .size = trial_sizes[i], + .flags = 
VulkanUsageFlag_TransferSource|VulkanUsageFlag_HostReadWrite, + .timeline_count = countof(timelines), + .timelines_used = timelines, + .label = s8("BeamformedData"), + }; + vk_buffer_allocate(cs->backlog.buffer, &allocate_info); + if (cs->backlog.buffer->size > 0) + break; + } + if (cs->backlog.buffer->size == 0) { + // NOTE(rnp): if this becomes an issue we may be able to get by in some other way + fatal(s8("Failed to allocate space for beamformed data\n")); + } + } + beamformer_load_cuda_library(ctx, input->cuda_library_handle, memory); SetConfigFlags(FLAG_VSYNC_HINT|FLAG_WINDOW_ALWAYS_RUN); @@ -272,15 +233,8 @@ beamformer_init(BeamformerInput *input) SetWindowState(FLAG_WINDOW_RESIZABLE); SetWindowMinSize(840, ctx->window_size.h); - glfwWindowHint(GLFW_VISIBLE, 0); - iptr raylib_window_handle = (iptr)GetPlatformWindowHandle(); - load_gl(&ctx->error_stream); - ctx->beamform_work_queue = push_struct(&memory, BeamformWorkQueue); - ctx->compute_shader_stats = push_struct(&memory, ComputeShaderStats); - ctx->compute_timing_table = push_struct(&memory, ComputeTimingTable); - ctx->shared_memory = input->shared_memory; ctx->shared_memory_size = input->shared_memory_size; if (ctx->shared_memory_size < (i64)sizeof(*ctx->shared_memory)) @@ -289,6 +243,7 @@ beamformer_init(BeamformerInput *input) ctx->shared_memory->version = BEAMFORMER_SHARED_MEMORY_VERSION; ctx->shared_memory->reserved_parameter_blocks = 1; + ctx->shared_memory->max_beamformed_data_size = cs->backlog.buffer->size; /* TODO(rnp): I'm not sure if its a good idea to pre-reserve a bunch of semaphores * on w32 but thats what we are doing for now */ @@ -316,14 +271,10 @@ beamformer_init(BeamformerInput *input) } #endif - BeamformerComputeContext *cs = &ctx->compute_context; - cs->rf_buffer.export_handle = (OSHandle){OSInvalidHandleValue}; - GLWorkerThreadContext *worker = &ctx->compute_worker; /* TODO(rnp): we should lock this down after we have something working */ - worker->user_context = (iptr)ctx; - 
worker->window_handle = glfwCreateWindow(1, 1, "", 0, raylib_window_handle); - worker->handle = os_create_thread("[compute]", worker, compute_worker_thread_entry_point); + worker->user_context = (iptr)ctx; + worker->handle = os_create_thread("[compute]", worker, compute_worker_thread_entry_point); GLWorkerThreadContext *upload = &ctx->upload_worker; BeamformerUploadThreadContext *upctx = push_struct(&memory, typeof(*upctx)); @@ -333,10 +284,7 @@ beamformer_init(BeamformerInput *input) upctx->shared_memory_size = ctx->shared_memory_size; upctx->compute_timing_table = ctx->compute_timing_table; upctx->compute_worker_sync = &ctx->compute_worker.sync_variable; - upload->window_handle = glfwCreateWindow(1, 1, "", 0, raylib_window_handle); - upload->handle = os_create_thread("[upload]", upload, beamformer_upload_entry_point); - - glfwMakeContextCurrent(raylib_window_handle); + upload->handle = os_create_thread("[upload]", upload, beamformer_upload_entry_point); /* NOTE: set up OpenGL debug logging */ Stream *gl_error_stream = push_struct(&memory, Stream); @@ -352,171 +300,37 @@ beamformer_init(BeamformerInput *input) i32 index = beamformer_reloadable_compute_shader_info_indices[it]; Arena temp = scratch; s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"), - beamformer_reloadable_shader_files[index]); + beamformer_reloadable_shader_files[index][0]); BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc)); - frc->kind = BeamformerFileReloadKind_ComputeShader; - frc->compute_shader_kind = beamformer_reloadable_shader_kinds[index]; + frc->kind = BeamformerFileReloadKind_ComputeShader; + frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index]; os_add_file_watch((char *)file.data, file.len, frc); } - } - FrameViewRenderContext *fvr = &ctx->frame_view_render_context; - glCreateFramebuffers(countof(fvr->framebuffers), fvr->framebuffers); - LABEL_GL_OBJECT(GL_FRAMEBUFFER, fvr->framebuffers[0], s8("Frame View Framebuffer")); - 
LABEL_GL_OBJECT(GL_FRAMEBUFFER, fvr->framebuffers[1], s8("Frame View Resolving Framebuffer")); - - glCreateRenderbuffers(countof(fvr->renderbuffers), fvr->renderbuffers); - u32 msaa_samples = vk_gpu_info()->max_msaa_samples; - glNamedRenderbufferStorageMultisample(fvr->renderbuffers[0], msaa_samples, GL_RGBA8, - FRAME_VIEW_RENDER_TARGET_SIZE); - glNamedRenderbufferStorageMultisample(fvr->renderbuffers[1], msaa_samples, GL_DEPTH_COMPONENT24, - FRAME_VIEW_RENDER_TARGET_SIZE); - - static_assert(countof(beamformer_reloadable_render_shader_info_indices) == 1, - "only a single render shader is currently handled"); - i32 render_rsi_index = beamformer_reloadable_render_shader_info_indices[0]; - - // TODO(rnp): leaks when BakeShaders is true - Arena *arena = &memory; - BeamformerShaderReloadContext *render_3d = push_struct(arena, typeof(*render_3d)); - render_3d->reloadable_info_index = render_rsi_index; - render_3d->gl_type = GL_FRAGMENT_SHADER; - render_3d->header = s8("" - "layout(location = 0) in vec3 normal;\n" - "layout(location = 1) in vec3 texture_coordinate;\n\n" - "layout(location = 2) in vec3 test_texture_coordinate;\n\n" - "layout(location = 0) out vec4 out_colour;\n\n" - "layout(location = " str(FRAME_VIEW_DYNAMIC_RANGE_LOC) ") uniform float u_db_cutoff = 60;\n" - "layout(location = " str(FRAME_VIEW_THRESHOLD_LOC) ") uniform float u_threshold = 40;\n" - "layout(location = " str(FRAME_VIEW_GAMMA_LOC) ") uniform float u_gamma = 1;\n" - "layout(location = " str(FRAME_VIEW_LOG_SCALE_LOC) ") uniform bool u_log_scale;\n" - "layout(location = " str(FRAME_VIEW_BB_COLOUR_LOC) ") uniform vec4 u_bb_colour = vec4(" str(FRAME_VIEW_BB_COLOUR) ");\n" - "layout(location = " str(FRAME_VIEW_BB_FRACTION_LOC) ") uniform float u_bb_fraction = " str(FRAME_VIEW_BB_FRACTION) ";\n" - "layout(location = " str(FRAME_VIEW_SOLID_BB_LOC) ") uniform bool u_solid_bb;\n" - "\n" - "layout(binding = 0) uniform sampler3D u_texture;\n"); - - render_3d->link = push_struct(arena, 
typeof(*render_3d)); - render_3d->link->reloadable_info_index = -1; - render_3d->link->gl_type = GL_VERTEX_SHADER; - render_3d->link->link = render_3d; - render_3d->link->header = s8("" - "layout(location = 0) in vec3 v_position;\n" - "layout(location = 1) in vec3 v_normal;\n" - "\n" - "layout(location = 0) out vec3 f_normal;\n" - "layout(location = 1) out vec3 f_texture_coordinate;\n" - "layout(location = 2) out vec3 f_orig_texture_coordinate;\n" - "\n" - "layout(location = " str(FRAME_VIEW_MODEL_MATRIX_LOC) ") uniform mat4 u_model;\n" - "layout(location = " str(FRAME_VIEW_VIEW_MATRIX_LOC) ") uniform mat4 u_view;\n" - "layout(location = " str(FRAME_VIEW_PROJ_MATRIX_LOC) ") uniform mat4 u_projection;\n" - "\n" - "\n" - "void main()\n" - "{\n" - "\tvec3 pos = v_position;\n" - "\tf_orig_texture_coordinate = (2 * v_position + 1) / 2;\n" - //"\tif (v_position.y == -1) pos.x = clamp(v_position.x, -u_clip_fraction, u_clip_fraction);\n" - "\tvec3 tex_coord = (2 * pos + 1) / 2;\n" - "\tf_texture_coordinate = tex_coord;\n" - //"\tf_texture_coordinate = u_swizzle? tex_coord.xzy : tex_coord;\n" - //"\tf_normal = normalize(mat3(u_model) * v_normal);\n" - "\tf_normal = v_normal;\n" - "\tgl_Position = u_projection * u_view * u_model * vec4(pos, 1);\n" - "}\n"); - - // TODO(rnp): this is probably not expected by the platform, refactor so that all - // needed context (eg. 
headers) are available outside of here and push initial load - // into ui_init - { - BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc)); - frc->kind = BeamformerFileReloadKind_Shader; - frc->shader_reload_context = render_3d; - input->event_queue[input->event_count++] = (BeamformerInputEvent){ - .kind = BeamformerInputEventKind_FileEvent, - .file_watch_user_context = frc, - }; + for EachElement(beamformer_reloadable_compute_helpers_shader_info_indices, it) { + i32 index = beamformer_reloadable_compute_helpers_shader_info_indices[it]; + Arena temp = scratch; + s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"), + beamformer_reloadable_shader_files[index][0]); + BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc)); + frc->kind = BeamformerFileReloadKind_ComputeShader; + frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index]; + os_add_file_watch((char *)file.data, file.len, frc); + } - s8 render_file = {0}; - if (!BakeShaders) { - render_file = push_s8_from_parts(&scratch, os_path_separator(), s8("shaders"), - beamformer_reloadable_shader_files[render_rsi_index]); - os_add_file_watch((char *)render_file.data, render_file.len, frc); + for EachElement(beamformer_reloadable_compute_internal_shader_info_indices, it) { + i32 index = beamformer_reloadable_compute_internal_shader_info_indices[it]; + Arena temp = scratch; + s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"), + beamformer_reloadable_shader_files[index][0]); + BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc)); + frc->kind = BeamformerFileReloadKind_ComputeInternalShader; + frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index]; + frc->shader_reload.pipeline = cs->compute_internal_pipelines + it; + os_add_file_watch((char *)file.data, file.len, frc); } } - f32 unit_cube_vertices[] = { - 0.5f, 0.5f, -0.5f, - 0.5f, 0.5f, -0.5f, - 0.5f, 0.5f, -0.5f, - 0.5f, -0.5f, -0.5f, - 0.5f, -0.5f, 
-0.5f, - 0.5f, -0.5f, -0.5f, - 0.5f, 0.5f, 0.5f, - 0.5f, 0.5f, 0.5f, - 0.5f, 0.5f, 0.5f, - 0.5f, -0.5f, 0.5f, - 0.5f, -0.5f, 0.5f, - 0.5f, -0.5f, 0.5f, - -0.5f, 0.5f, -0.5f, - -0.5f, 0.5f, -0.5f, - -0.5f, 0.5f, -0.5f, - -0.5f, -0.5f, -0.5f, - -0.5f, -0.5f, -0.5f, - -0.5f, -0.5f, -0.5f, - -0.5f, 0.5f, 0.5f, - -0.5f, 0.5f, 0.5f, - -0.5f, 0.5f, 0.5f, - -0.5f, -0.5f, 0.5f, - -0.5f, -0.5f, 0.5f, - -0.5f, -0.5f, 0.5f - }; - f32 unit_cube_normals[] = { - 0.0f, 0.0f, -1.0f, - 0.0f, 1.0f, 0.0f, - 1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, -1.0f, - 0.0f, -1.0f, 0.0f, - 1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, - 0.0f, 1.0f, 0.0f, - 1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, - 0.0f, -1.0f, 0.0f, - 1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, -1.0f, - 0.0f, 1.0f, 0.0f, - -1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, -1.0f, - 0.0f, -1.0f, 0.0f, - -1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, - 0.0f, 1.0f, 0.0f, - -1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, - 0.0f, -1.0f, 0.0f, - -1.0f, 0.0f, 0.0f - }; - u16 unit_cube_indices[] = { - 1, 13, 19, - 1, 19, 7, - 9, 6, 18, - 9, 18, 21, - 23, 20, 14, - 23, 14, 17, - 16, 4, 10, - 16, 10, 22, - 5, 2, 8, - 5, 8, 11, - 15, 12, 0, - 15, 0, 3 - }; - - cs->unit_cube_model = render_model_from_arrays(unit_cube_vertices, unit_cube_normals, - sizeof(unit_cube_vertices), - unit_cube_indices, countof(unit_cube_indices)); - memory.end = scratch.end; ctx->arena = memory; ctx->state = BeamformerState_Running; diff --git a/beamformer.h b/beamformer.h @@ -182,6 +182,7 @@ typedef struct { #if BEAMFORMER_RENDERDOC_HOOKS void *renderdoc_start_frame_capture; void *renderdoc_end_frame_capture; + void *renderdoc_set_capture_file_path_template; #endif } BeamformerInput; diff --git a/beamformer.meta b/beamformer.meta @@ -1,5 +1,5 @@ @Constant(4) FilterSlots -@Constant(16) MaxBacklogFrames +@Constant(4096) MaxBacklogFrames @Constant(256) MaxChannelCount @Constant(256) MaxEmissionsCount @Constant(16) MaxComputeShaderStages @@ -210,6 +210,13 @@ @Library @Struct SimpleParameters @MATLAB @Struct SimpleParameters +@Struct 
DASArrayParameters +{ + [focal_vectors V2 MaxChannelCount] + [sparse_elements S16 MaxChannelCount] + [transmit_receive_orientations U16 MaxChannelCount] +} + @Emit { `read_only global u8 beamformer_data_kind_element_size[] = {` @@ -274,6 +281,15 @@ [ToProcess to_process U32] [TransmitCount transmit_count U32] } + + @PushConstants + { + [hadamard_buffer U64] + [rf_buffer U64] + [output_buffer U64] + [output_rf_buffer U64] + [first_pass B32] + } } @Shader(filter.glsl) Filter @@ -301,22 +317,32 @@ [DemodulationFrequency demodulation_frequency F32] [SamplingFrequency sampling_frequency F32] } + + @PushConstants + { + [input_data U64] + [output_data U64] + [filter_coefficients U64] + } } @Shader(das.glsl) DAS { + @Constant MaxChannelCount + @Enumeration AcquisitionKind @Enumeration DataKind @Enumeration InterpolationMode @Enumeration RCAOrientation + @Struct DASArrayParameters + @Bake { [DataKind data_kind U32] [CoherencyWeighting coherency_weighting U32] [SingleFocus single_focus U32] [SingleOrientation single_orientation U32] - [Fast fast U32] [Sparse sparse U32] [AcquisitionCount acquisition_count U32] [AcquisitionKind acquisition_kind U32] @@ -336,17 +362,101 @@ @PushConstants { - [xdc_transform M4] - [voxel_transform M4] - [xdc_element_pitch V2] + [xdc_transform M4] + [voxel_transform M4] + [xdc_element_pitch V2] + [rf_data U64] + [output_data U64] + [incoherent_output U64] + [array_parameters U64] + [output_size_x U32] + [output_size_y U32] + [output_size_z U32] + [cycle_t U32] + [channel_t S32] } } - @Shader(min_max.glsl) MinMax @Shader(sum.glsl) Sum + { + @Enumeration DataKind + @PushConstants + { + [output_data U64] + [input_data U64] + [image_elements U32] + [scale F32] + } + } + + @Shader(min_max.glsl) MinMax +} + +// NOTE: shaders which need to be baked into the beamforming pipeline +// but should not be visible to the external interface +@ShaderGroup ComputeHelpers +{ + @Shader(coherency_weighting.glsl) CoherencyWeighting + { + @Enumeration DataKind + + 
@Bake + { + [DataKind data_kind U32] + } + + @PushConstants + { + [left_side_buffer U64] + [right_side_buffer U64] + [elements U32] + [scale F32] + [output_size_x U32] + [output_size_y U32] + [output_size_z U32] + } + } +} + +// NOTE: general compute shaders which do not need baking +@ShaderGroup ComputeInternal +{ + @Shader(buffer_clear.glsl) BufferClear + { + @PushConstants + { + [data U64] + [clear_word U32] + [words U32] + } + } } @ShaderGroup Render { - @Shader(render_3d.frag.glsl) Render3D + @RenderShader RenderBeamformed + { + @Enumeration DataKind + + @VertexShader(render_3d.vert.glsl) + @FragmentShader(render_3d.frag.glsl) + + @PushConstants + { + [mvp_matrix M4] + [positions U64] + [normals U64] + + [bounding_box_colour V4] + [bounding_box_fraction F32] + [db_cutoff F32] + [threshold F32] + [gamma F32] + [input_data U64] + [input_size_x U32] + [input_size_y U32] + [input_size_z U32] + [data_kind U32] + } + } } diff --git a/beamformer_core.c b/beamformer_core.c @@ -11,14 +11,9 @@ * - the check for first pass reshaping is the last non constant check * in the shader * - this will also remove the need for the channel mapping in the decode shader - * [X]: refactor: ui: reload only shader which is affected by the interaction * [ ]: BeamformWorkQueue -> BeamformerWorkQueue - * [ ]: need to keep track of gpu memory in some way - * - want to be able to store more than 16 2D frames but limit 3D frames - * - maybe keep track of how much gpu memory is committed for beamformed images - * and use that to determine when to loop back over existing textures - * - to do this maybe use a circular linked list instead of a flat array - * - then have a way of querying how many frames are available for a specific point count + * [ ]: refactor: work queue needs a cleanup, we should only have a single one + * - that queue isn't really considered hot so a lock is probably fine * [ ]: bug: reinit cuda on hot-reload */ @@ -32,33 +27,33 @@ global f32 dt_for_frame; -#define 
DECODE_FIRST_PASS_UNIFORM_LOC 1 - -#define DAS_CYCLE_T_UNIFORM_LOC 2 -#define DAS_FAST_CHANNEL_UNIFORM_LOC 3 - -#define MIN_MAX_MIPS_LEVEL_UNIFORM_LOC 1 -#define SUM_PRESCALE_UNIFORM_LOC 1 - #if !BEAMFORMER_RENDERDOC_HOOKS #define start_renderdoc_capture(...) #define end_renderdoc_capture(...) #define renderdoc_attached(...) (0) #else -global renderdoc_start_frame_capture_fn *start_frame_capture; -global renderdoc_end_frame_capture_fn *end_frame_capture; -#define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0) -#define end_renderdoc_capture(gl) if (end_frame_capture) end_frame_capture(gl, 0) +global renderdoc_start_frame_capture_fn *start_frame_capture; +global renderdoc_set_capture_path_template_fn *set_capture_path_template; +global renderdoc_end_frame_capture_fn *end_frame_capture; +#define start_renderdoc_capture() do { \ + if (set_capture_path_template) set_capture_path_template("captures/ogl.rdc"); \ + if (start_frame_capture) start_frame_capture(vk_renderdoc_instance_handle(), 0); \ +} while(0) +#define end_renderdoc_capture() if (end_frame_capture) end_frame_capture(vk_renderdoc_instance_handle(), 0) #define renderdoc_attached(...) (start_frame_capture != 0) #endif -typedef struct { - BeamformerFrame *frames; - u32 capacity; - u32 offset; - u32 cursor; - u32 needed_frames; -} ComputeFrameIterator; +read_only global u32 beamformer_compute_array_parameter_sizes[] = { + #define X(k, type, elements) sizeof(type) * elements, + BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST + #undef X +}; + +read_only global u32 beamformer_compute_array_parameter_offsets[] = { + #define X(k, ...) 
offsetof(BeamformerComputeArrayParameters, k), + BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST + #undef X +}; function void beamformer_compute_plan_release(BeamformerComputeContext *cc, u32 block) @@ -66,10 +61,9 @@ beamformer_compute_plan_release(BeamformerComputeContext *cc, u32 block) assert(block < countof(cc->compute_plans)); BeamformerComputePlan *cp = cc->compute_plans[block]; if (cp) { - glDeleteBuffers(countof(cp->ubos), cp->ubos); - glDeleteTextures(countof(cp->textures), cp->textures); + vk_buffer_release(&cp->array_parameters); for (u32 i = 0; i < countof(cp->filters); i++) - glDeleteBuffers(1, &cp->filters[i].ssbo); + vk_buffer_release(&cp->filters[i].buffer); cc->compute_plans[block] = 0; SLLPushFreelist(cp, cc->compute_plan_freelist); } @@ -88,39 +82,19 @@ beamformer_compute_plan_for_block(BeamformerComputeContext *cc, u32 block, Arena result->ui_voxel_transform = m4_identity(); - glCreateBuffers(countof(result->ubos), result->ubos); - Stream label = arena_stream(*arena); - #define X(k, t, ...) \ - glNamedBufferStorage(result->ubos[BeamformerComputeUBOKind_##k], sizeof(t), \ - 0, GL_DYNAMIC_STORAGE_BIT); \ - stream_append_s8(&label, s8(#t "[")); \ - stream_append_u64(&label, block); \ - stream_append_byte(&label, ']'); \ - glObjectLabel(GL_BUFFER, result->ubos[BeamformerComputeUBOKind_##k], \ - label.widx, (c8 *)label.data); \ - label.widx = 0; - BEAMFORMER_COMPUTE_UBO_LIST - #undef X - - #define X(_k, t, ...) t, - GLenum gl_kind[] = {BEAMFORMER_COMPUTE_TEXTURE_LIST_FULL}; - #undef X - read_only local_persist s8 tex_prefix[] = { - #define X(k, ...) 
s8_comp(#k "["), - BEAMFORMER_COMPUTE_TEXTURE_LIST_FULL - #undef X + stream_append_s8(&label, s8("ComputeParameterArray[")); + stream_append_u64(&label, block); + stream_append_s8(&label, s8("]")); + stream_append_byte(&label, 0); + + GPUBufferAllocateInfo allocate_info = { + .size = sizeof(BeamformerComputeArrayParameters), + .flags = VulkanUsageFlag_HostReadWrite, + .label = stream_to_s8(&label), }; - glCreateTextures(GL_TEXTURE_1D, BeamformerComputeTextureKind_Count - 1, result->textures); - for (u32 i = 0; i < BeamformerComputeTextureKind_Count - 1; i++) { - /* TODO(rnp): this could be predicated on channel count for this compute plan */ - glTextureStorage1D(result->textures[i], 1, gl_kind[i], BeamformerMaxChannelCount); - stream_append_s8(&label, tex_prefix[i]); - stream_append_u64(&label, block); - stream_append_byte(&label, ']'); - glObjectLabel(GL_TEXTURE, result->textures[i], label.widx, (c8 *)label.data); - label.widx = 0; - } + vk_buffer_allocate(&result->array_parameters, &allocate_info); + assert((result->array_parameters.gpu_pointer & 63) == 0); } return result; } @@ -165,42 +139,16 @@ beamformer_filter_update(BeamformerFilter *f, BeamformerFilterParameters fp, u32 f->parameters = fp; - glDeleteBuffers(1, &f->ssbo); - glCreateBuffers(1, &f->ssbo); - glNamedBufferStorage(f->ssbo, f->length * (i32)sizeof(f32) * (fp.complex? 
2 : 1), filter, 0); - glObjectLabel(GL_BUFFER, f->ssbo, (i32)label.len, (c8 *)label.data); -} - -function ComputeFrameIterator -compute_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames) -{ - start_index = start_index % countof(ctx->beamform_frames); - - ComputeFrameIterator result; - result.frames = ctx->beamform_frames; - result.offset = start_index; - result.capacity = countof(ctx->beamform_frames); - result.cursor = 0; - result.needed_frames = needed_frames; - return result; -} - -function BeamformerFrame * -frame_next(ComputeFrameIterator *bfi) -{ - BeamformerFrame *result = 0; - if (bfi->cursor != bfi->needed_frames) { - u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity; - result = bfi->frames + index; + u32 byte_size = f->length * (i32)sizeof(f32) * (fp.complex? 2 : 1); + if (f->buffer.size < byte_size) { + GPUBufferAllocateInfo allocate_info = { + .size = byte_size, + .flags = VulkanUsageFlag_HostReadWrite, + .label = label, + }; + vk_buffer_allocate(&f->buffer, &allocate_info); } - return result; -} - -function b32 -beamformer_frame_compatible(BeamformerFrame *f, iv3 dim, GLenum gl_kind) -{ - b32 result = gl_kind == f->gl_kind && iv3_equal(dim, f->dim); - return result; + vk_buffer_range_upload(&f->buffer, filter, 0, byte_size, 0); } function iv3 @@ -214,83 +162,51 @@ das_valid_points(iv3 points) } function void -alloc_beamform_frame(BeamformerFrame *out, iv3 out_dim, GLenum gl_kind, s8 name, Arena arena) +update_hadamard(BeamformerComputePlan *cp, i32 order, b32 row_major, Arena arena) { - out->dim = das_valid_points(out_dim); + f16 *hadamard = make_hadamard_transpose(&arena, order, row_major); + if (hadamard) { + u64 offset = offsetof(BeamformerComputeArrayParameters, Hadamard); + u64 size = sizeof(*((BeamformerComputeArrayParameters *)0)->Hadamard) * order * order; + vk_buffer_range_upload(&cp->array_parameters, hadamard, offset, size, 0); + cp->hadamard_order = order; + } +} - /* NOTE: allocate storage for beamformed output 
data; - * this is shared between compute and fragment shaders */ - u32 max_dim = (u32)Max(out->dim.x, Max(out->dim.y, out->dim.z)); - out->mips = (i32)ctz_u64(round_up_power_of_two(max_dim)) + 1; +function u64 +beamformer_frame_byte_size(iv3 points, BeamformerDataKind kind) +{ + u64 result = points.x * points.y * points.z * beamformer_data_kind_byte_size[kind]; + result = round_up_to(result, 64); + return result; +} - out->gl_kind = gl_kind; +function BeamformerFrame * +beamformer_frame_next(BeamformerComputeContext *cc, iv3 output_points, b32 complex, u64 reserved_size) +{ + BeamformerFrameBacklog *bl = &cc->backlog; - Stream label = arena_stream(arena); - stream_append_s8(&label, name); - stream_append_byte(&label, '['); - stream_append_hex_u64(&label, out->id); - stream_append_byte(&label, ']'); + BeamformerDataKind kind = complex ? BeamformerDataKind_Float32Complex : BeamformerDataKind_Float32; + u64 frame_size = beamformer_frame_byte_size(output_points, kind); - glDeleteTextures(1, &out->texture); - glCreateTextures(GL_TEXTURE_3D, 1, &out->texture); - glTextureStorage3D(out->texture, out->mips, gl_kind, out->dim.x, out->dim.y, out->dim.z); + // TODO(rnp): handle this somewhat gracefully (even it produces garbled output) + assert(frame_size + reserved_size <= (u64)bl->buffer->size); - glTextureParameteri(out->texture, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTextureParameteri(out->texture, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + if (bl->next_offset > (u64)bl->buffer->size - frame_size - reserved_size) + bl->next_offset = 0; - LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label)); -} + u64 id = bl->counter++; -function void -update_hadamard_texture(BeamformerComputePlan *cp, i32 order, b32 row_major, Arena arena) -{ - f16 *hadamard = make_hadamard_transpose(&arena, order, row_major); - if (hadamard) { - cp->hadamard_order = order; - u32 *texture = cp->textures + BeamformerComputeTextureKind_Hadamard; - glDeleteTextures(1, texture); - 
glCreateTextures(GL_TEXTURE_2D, 1, texture); - glTextureStorage2D(*texture, 1, GL_R16F, order, order); - glTextureSubImage2D(*texture, 0, 0, 0, order, order, GL_RED, GL_SHORT, hadamard); - - Stream label = arena_stream(arena); - stream_append_s8(&label, s8("Hadamard")); - stream_append_i64(&label, order); - LABEL_GL_OBJECT(GL_TEXTURE, *texture, stream_to_s8(&label)); - } -} + BeamformerFrame *result = bl->frames + (id % countof(bl->frames)); + atomic_store_u64(&result->timeline_valid_value, -1ULL); + result->id = id & U32_MAX; + result->buffer_offset = bl->next_offset; + result->points = output_points; + result->data_kind = kind; -function void -alloc_shader_storage(BeamformerCtx *ctx, u32 decoded_data_size, Arena arena) -{ - BeamformerComputeContext *cc = &ctx->compute_context; - glDeleteBuffers(countof(cc->ping_pong_ssbos), cc->ping_pong_ssbos); - glCreateBuffers(countof(cc->ping_pong_ssbos), cc->ping_pong_ssbos); - - cc->ping_pong_ssbo_size = decoded_data_size; - - Stream label = arena_stream(arena); - stream_append_s8(&label, s8("PingPongSSBO[")); - i32 s_widx = label.widx; - for (i32 i = 0; i < countof(cc->ping_pong_ssbos); i++) { - glNamedBufferStorage(cc->ping_pong_ssbos[i], (iz)decoded_data_size, 0, 0); - stream_append_i64(&label, i); - stream_append_byte(&label, ']'); - LABEL_GL_OBJECT(GL_BUFFER, cc->ping_pong_ssbos[i], stream_to_s8(&label)); - stream_reset(&label, s_widx); - } + bl->next_offset += frame_size; - /* TODO(rnp): (25.08.04) cuda lib is heavily broken atm. First there are multiple RF - * buffers and cuda decode shouldn't assume that the data is coming from the rf_buffer - * ssbo. Second each parameter block may need a different hadamard matrix so ideally - * decode should just take the texture as a parameter. Third, none of these dimensions - * need to be pre-known by the library unless its allocating GPU memory which it shouldn't - * need to do. 
For now grab out of parameter block 0 but it is not correct */ - BeamformerParameterBlock *pb = beamformer_parameter_block(ctx->shared_memory, 0); - /* NOTE(rnp): these are stubs when CUDA isn't supported */ - cuda_register_buffers(cc->ping_pong_ssbos, countof(cc->ping_pong_ssbos), cc->rf_buffer.ssbo); - u32 decoded_data_dimension[3] = {pb->parameters.sample_count, pb->parameters.channel_count, pb->parameters.acquisition_count}; - cuda_init(pb->parameters.raw_data_dimensions.E, decoded_data_dimension); + return result; } function void @@ -306,35 +222,69 @@ fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, BeamformerViewPl { b32 result = work != 0; if (result) { - u32 frame_id = atomic_add_u32(&ctx->next_render_frame_index, 1); - u32 frame_index = frame_id % countof(ctx->beamform_frames); - work->kind = indirect? BeamformerWorkKind_ComputeIndirect : BeamformerWorkKind_Compute; - work->lock = BeamformerSharedMemoryLockKind_DispatchCompute; + work->kind = indirect? BeamformerWorkKind_ComputeIndirect : BeamformerWorkKind_Compute; + work->lock = BeamformerSharedMemoryLockKind_DispatchCompute; work->compute_context.parameter_block = parameter_block; - work->compute_context.frame = ctx->beamform_frames + frame_index; - work->compute_context.frame->ready_to_present = 0; - work->compute_context.frame->view_plane_tag = plane; - work->compute_context.frame->id = frame_id; } return result; } -function void -do_sum_shader(BeamformerComputeContext *cc, u32 *in_textures, u32 in_texture_count, - u32 out_texture, iv3 out_data_dim) +function uv3 +layout_for_output(iv3 points) { - /* NOTE: zero output before summing */ - glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0); - glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT); - - glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F); - for (u32 i = 0; i < in_texture_count; i++) { - glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); - 
glDispatchCompute(ORONE((u32)out_data_dim.x / 32u), - ORONE((u32)out_data_dim.y), - ORONE((u32)out_data_dim.z / 32u)); - glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); + uv3 result = {{1, 1, 1}}; + + b32 has_x = points.x > 1; + b32 has_y = points.y > 1; + b32 has_z = points.z > 1; + + u32 subgroup_size = vk_gpu_info()->subgroup_size; + u32 grid_3d_z_size = Max(1, subgroup_size / (4 * 4)); + u32 grid_2d_y_size = Max(1, subgroup_size / 8); + + switch (iv3_dimension(points)) { + case 1:{ + if (has_x) result.x = subgroup_size; + if (has_y) result.y = subgroup_size; + if (has_z) result.z = subgroup_size; + }break; + + case 2:{ + if (has_x && has_y) {result.x = 8; result.y = grid_2d_y_size;} + if (has_x && has_z) {result.x = 8; result.z = grid_2d_y_size;} + if (has_y && has_z) {result.y = 8; result.z = grid_2d_y_size;} + }break; + + case 3:{result = (uv3){{4, 4, grid_3d_z_size}};}break; + + InvalidDefaultCase; } + + return result; +} + +function uv3 +dispatch_for_output(uv3 layout, iv3 points) +{ + uv3 result; + result.x = (u32)ceil_f32((f32)points.x / layout.x); + result.y = (u32)ceil_f32((f32)points.y / layout.y); + result.z = (u32)ceil_f32((f32)points.z / layout.z); + return result; +} + +function b32 +compute_plan_push_shader(BeamformerComputePlan *p, BeamformerShaderKind shader, BeamformerShaderParameters *sp) +{ + b32 result = 0; + if (p->pipeline.shader_count < countof(p->pipeline.shaders)) { + u32 index = p->pipeline.shader_count++; + p->pipeline.shaders[index] = shader; + p->pipeline.parameters[index] = *sp; + zero_struct(p->shader_descriptors + index); + result = 1; + } + return result; } function void @@ -374,355 +324,372 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb) f32 time_offset = pb->parameters.time_offset; - // TODO(rnp): subgroup size - u32 subgroup_size = vk_gpu_info()->vendor == GPUVendor_NVIDIA ? 
32 : 64; + u32 subgroup_size = vk_gpu_info()->subgroup_size; cp->pipeline.shader_count = 0; for (u32 i = 0; i < pb->pipeline.shader_count; i++) { BeamformerShaderParameters *sp = pb->pipeline.parameters + i; u32 slot = cp->pipeline.shader_count; u32 shader = pb->pipeline.shaders[i]; - b32 commit = 0; BeamformerShaderDescriptor *ld = cp->shader_descriptors + slot - 1; BeamformerShaderDescriptor *sd = cp->shader_descriptors + slot; - zero_struct(sd); switch (shader) { - case BeamformerShaderKind_CudaHilbert:{ commit = run_cuda_hilbert; }break; + + case BeamformerShaderKind_CudaHilbert:{ + if (run_cuda_hilbert) + compute_plan_push_shader(cp, shader, sp); + }break; + case BeamformerShaderKind_Decode:{ /* TODO(rnp): rework decode first and demodulate after */ b32 first = slot == 0; - BeamformerDecodeBakeParameters *db = &sd->bake.Decode; - db->data_kind = data_kind; - if (!first) { - if (data_kind == BeamformerDataKind_Int16) { - db->data_kind = BeamformerDataKind_Int16Complex; - } else { - db->data_kind = BeamformerDataKind_Float32Complex; - } - } - BeamformerShaderKind *last_shader = cp->pipeline.shaders + slot - 1; assert(first || ((*last_shader == BeamformerShaderKind_Demodulate || *last_shader == BeamformerShaderKind_Filter))); - db->decode_mode = pb->parameters.decode_mode; - db->transmit_count = pb->parameters.acquisition_count; + if ((first || pb->parameters.decode_mode != BeamformerDecodeMode_None) && + compute_plan_push_shader(cp, shader, sp)) + { + BeamformerDecodeBakeParameters *db = &sd->bake.Decode; - u32 channel_stride = pb->parameters.acquisition_count * pb->parameters.sample_count; - db->input_sample_stride = first? 1 : ld->bake.Filter.output_sample_stride; - db->input_channel_stride = first? channel_stride : ld->bake.Filter.output_channel_stride; - db->input_transmit_stride = first? 
pb->parameters.sample_count : 1; + db->data_kind = data_kind; + if (!first) { + if (data_kind == BeamformerDataKind_Int16) { + db->data_kind = BeamformerDataKind_Int16Complex; + } else { + db->data_kind = BeamformerDataKind_Float32Complex; + } + } - db->output_sample_stride = das_sample_stride; - db->output_channel_stride = das_channel_stride; - db->output_transmit_stride = das_transmit_stride; - if (first) { - db->output_channel_stride *= decimation_rate; - db->output_transmit_stride *= decimation_rate; - } + db->decode_mode = pb->parameters.decode_mode; + db->transmit_count = pb->parameters.acquisition_count; - db->dilate_output = run_cuda_hilbert; + u32 channel_stride = pb->parameters.acquisition_count * pb->parameters.sample_count; + db->input_sample_stride = first? 1 : ld->bake.Filter.output_sample_stride; + db->input_channel_stride = first? channel_stride : ld->bake.Filter.output_channel_stride; + db->input_transmit_stride = first? pb->parameters.sample_count : 1; - if (db->decode_mode == BeamformerDecodeMode_None) { - sd->layout = (uv3){{subgroup_size, 1, 1}}; + db->output_sample_stride = das_sample_stride; + db->output_channel_stride = das_channel_stride; + db->output_transmit_stride = das_transmit_stride; + if (first) { + db->output_channel_stride *= decimation_rate; + db->output_transmit_stride *= decimation_rate; + } - sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); - sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); - sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z); - } else if (db->transmit_count > 40) { - db->use_shared_memory = 1; - db->to_process = 2; + db->dilate_output = run_cuda_hilbert; - if (db->transmit_count == 48) - db->to_process = db->transmit_count / 16; + if (db->decode_mode == BeamformerDecodeMode_None) { + sd->layout = (uv3){{subgroup_size, 1, 1}}; - b32 use_16z = db->transmit_count == 48 || db->transmit_count == 80 || - 
db->transmit_count == 96 || db->transmit_count == 160; - sd->layout = (uv3){{4, 1, use_16z? 16 : 32}}; + sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); + sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); + sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z); + } else if (db->transmit_count > 40) { + db->use_shared_memory = 1; + db->to_process = 2; - sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); - sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); - sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->to_process); - } else { - db->to_process = 1; + if (db->transmit_count == 48) + db->to_process = db->transmit_count / 16; - /* NOTE(rnp): register caching. using more threads will cause the compiler to do - * contortions to avoid spilling registers. using less gives higher performance */ - sd->layout = (uv3){{subgroup_size / 2, 1, 1}}; + b32 use_16z = db->transmit_count == 48 || db->transmit_count == 80 || + db->transmit_count == 96 || db->transmit_count == 160; + sd->layout = (uv3){{4, 1, use_16z? 16 : 32}}; - sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); - sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); - sd->dispatch.z = 1; - } + sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); + sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); + sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->to_process); + } else { + db->to_process = 1; + + /* NOTE(rnp): register caching. using more threads will cause the compiler to do + * contortions to avoid spilling registers. 
using less gives higher performance */ + /* TODO(rnp): may need to be adjusted to 16 on NVIDIA */ + sd->layout = (uv3){{subgroup_size / 2, 1, 1}}; - if (first) sd->dispatch.x *= decimation_rate; + sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); + sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); + sd->dispatch.z = 1; + } - /* NOTE(rnp): decode 2 samples per dispatch when data is i16 */ - if (first && data_kind == BeamformerDataKind_Int16) - sd->dispatch.x = (u32)ceil_f32((f32)sd->dispatch.x / 2); + if (first) sd->dispatch.x *= decimation_rate; - commit = first || db->decode_mode != BeamformerDecodeMode_None; + /* NOTE(rnp): decode 2 samples per dispatch when data is i16 */ + if (first && data_kind == BeamformerDataKind_Int16) + sd->dispatch.x = (u32)ceil_f32((f32)sd->dispatch.x / 2); + } }break; + case BeamformerShaderKind_Demodulate: case BeamformerShaderKind_Filter: { - b32 first = slot == 0; - b32 demod = shader == BeamformerShaderKind_Demodulate; - BeamformerFilter *f = cp->filters + sp->filter_slot; - - time_offset += f->time_delay; - - BeamformerFilterBakeParameters *fb = &sd->bake.Filter; - fb->filter_length = (u32)f->length; - fb->demodulate = demod; - fb->complex_filter = f->parameters.complex; - - fb->data_kind = data_kind; - if (!first) fb->data_kind = BeamformerDataKind_Float32; - - /* NOTE(rnp): when we are demodulating we pretend that the sampler was alternating - * between sampling the I portion and the Q portion of an IQ signal. Therefore there - * is an implicit decimation factor of 2 which must always be included. All code here - * assumes that the signal was sampled in such a way that supports this operation. 
- * To recover IQ[n] from the sampled data (RF[n]) we do the following: - * I[n] = RF[n] - * Q[n] = RF[n + 1] - * IQ[n] = I[n] - j*Q[n] - */ - if (demod) { - fb->demodulation_frequency = pb->parameters.demodulation_frequency; - fb->sampling_frequency = pb->parameters.sampling_frequency / 2; - fb->decimation_rate = decimation_rate; - fb->sample_count = pb->parameters.sample_count; - - fb->output_channel_stride = das_channel_stride; - fb->output_sample_stride = das_sample_stride; - fb->output_transmit_stride = das_transmit_stride; - - if (first) { - fb->input_channel_stride = pb->parameters.sample_count * pb->parameters.acquisition_count / 2; - fb->input_sample_stride = 1; - fb->input_transmit_stride = pb->parameters.sample_count / 2; - - if (pb->parameters.decode_mode == BeamformerDecodeMode_None) { - fb->output_floats = 1; + if (compute_plan_push_shader(cp, shader, sp)) { + b32 first = slot == 0; + b32 demod = shader == BeamformerShaderKind_Demodulate; + BeamformerFilter *f = cp->filters + sp->filter_slot; + + time_offset += f->time_delay; + + BeamformerFilterBakeParameters *fb = &sd->bake.Filter; + fb->filter_length = (u32)f->length; + fb->demodulate = demod; + fb->complex_filter = f->parameters.complex; + + fb->data_kind = data_kind; + if (!first) fb->data_kind = BeamformerDataKind_Float32; + + /* NOTE(rnp): when we are demodulating we pretend that the sampler was alternating + * between sampling the I portion and the Q portion of an IQ signal. Therefore there + * is an implicit decimation factor of 2 which must always be included. All code here + * assumes that the signal was sampled in such a way that supports this operation. 
+ * To recover IQ[n] from the sampled data (RF[n]) we do the following: + * I[n] = RF[n] + * Q[n] = RF[n + 1] + * IQ[n] = I[n] - j*Q[n] + */ + if (demod) { + fb->demodulation_frequency = pb->parameters.demodulation_frequency; + fb->sampling_frequency = pb->parameters.sampling_frequency / 2; + fb->decimation_rate = decimation_rate; + fb->sample_count = pb->parameters.sample_count; + + fb->output_channel_stride = das_channel_stride; + fb->output_sample_stride = das_sample_stride; + fb->output_transmit_stride = das_transmit_stride; + + if (first) { + fb->input_channel_stride = pb->parameters.sample_count * pb->parameters.acquisition_count / 2; + fb->input_sample_stride = 1; + fb->input_transmit_stride = pb->parameters.sample_count / 2; + + if (pb->parameters.decode_mode == BeamformerDecodeMode_None) { + fb->output_floats = 1; + } else { + /* NOTE(rnp): output optimized layout for decoding */ + fb->output_channel_stride = das_channel_stride; + fb->output_sample_stride = pb->parameters.acquisition_count; + fb->output_transmit_stride = 1; + } } else { - /* NOTE(rnp): output optimized layout for decoding */ - fb->output_channel_stride = das_channel_stride; - fb->output_sample_stride = pb->parameters.acquisition_count; - fb->output_transmit_stride = 1; + assert(cp->pipeline.shaders[slot - 1] == BeamformerShaderKind_Decode); + fb->input_channel_stride = ld->bake.Decode.output_channel_stride; + fb->input_sample_stride = ld->bake.Decode.output_sample_stride; + fb->input_transmit_stride = ld->bake.Decode.output_transmit_stride; } } else { - assert(cp->pipeline.shaders[slot - 1] == BeamformerShaderKind_Decode); - fb->input_channel_stride = ld->bake.Decode.output_channel_stride; - fb->input_sample_stride = ld->bake.Decode.output_sample_stride; - fb->input_transmit_stride = ld->bake.Decode.output_transmit_stride; + fb->decimation_rate = 1; + fb->output_channel_stride = sample_count * pb->parameters.acquisition_count; + fb->output_sample_stride = 1; + fb->output_transmit_stride = 
sample_count; + fb->input_channel_stride = sample_count * pb->parameters.acquisition_count; + fb->input_sample_stride = 1; + fb->input_transmit_stride = sample_count; + fb->sample_count = sample_count; } - } else { - fb->decimation_rate = 1; - fb->output_channel_stride = sample_count * pb->parameters.acquisition_count; - fb->output_sample_stride = 1; - fb->output_transmit_stride = sample_count; - fb->input_channel_stride = sample_count * pb->parameters.acquisition_count; - fb->input_sample_stride = 1; - fb->input_transmit_stride = sample_count; - fb->sample_count = sample_count; - } - - /* TODO(rnp): filter may need a different dispatch layout */ - sd->layout = (uv3){{128, 1, 1}}; - sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); - sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); - sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z); - commit = 1; + /* TODO(rnp): filter may need a different dispatch layout */ + sd->layout = (uv3){{128, 1, 1}}; + sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); + sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); + sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z); + } }break; - case BeamformerShaderKind_DAS:{ - BeamformerDASBakeParameters *db = &sd->bake.DAS; - - db->data_kind = BeamformerDataKind_Float32; - if (cp->iq_pipeline) db->data_kind = BeamformerDataKind_Float32Complex; - - BeamformerDASPushConstants *du = &cp->das_ubo_data; - du->xdc_element_pitch = pb->parameters.xdc_element_pitch; - db->sampling_frequency = sampling_frequency; - db->demodulation_frequency = pb->parameters.demodulation_frequency; - db->speed_of_sound = pb->parameters.speed_of_sound; - db->time_offset = time_offset; - db->f_number = pb->parameters.f_number; - db->acquisition_kind = pb->parameters.acquisition_kind; - db->sample_count = sample_count; - 
db->channel_count = pb->parameters.channel_count; - db->acquisition_count = pb->parameters.acquisition_count; - db->interpolation_mode = pb->parameters.interpolation_mode; - db->transmit_angle = pb->parameters.focal_vector.E[0]; - db->focus_depth = pb->parameters.focal_vector.E[1]; - db->transmit_receive_orientation = pb->parameters.transmit_receive_orientation; - - // NOTE(rnp): old gcc will miscompile an assignment - mem_copy(du->voxel_transform.E, pb->parameters.das_voxel_transform.E, sizeof(du->voxel_transform)); - mem_copy(du->xdc_transform.E, pb->parameters.xdc_transform.E, sizeof(du->xdc_transform)); - - du->voxel_transform = m4_mul(cp->ui_voxel_transform, du->voxel_transform); - - u32 id = pb->parameters.acquisition_kind; - - if (id == BeamformerAcquisitionKind_UFORCES || id == BeamformerAcquisitionKind_FORCES) - du->voxel_transform = m4_mul(du->xdc_transform, du->voxel_transform); - - db->sparse = id == BeamformerAcquisitionKind_UFORCES || - id == BeamformerAcquisitionKind_UHERCULES; - - db->single_focus = pb->parameters.single_focus; - db->single_orientation = pb->parameters.single_orientation; - db->coherency_weighting = pb->parameters.coherency_weighting; - db->fast = !pb->parameters.coherency_weighting; - - sd->layout = (uv3){{1, 1, 1}}; - - b32 has_x = cp->output_points.x > 1; - b32 has_y = cp->output_points.y > 1; - b32 has_z = cp->output_points.z > 1; - - u32 grid_3d_z_size = Max(1, subgroup_size / (4 * 4)); - u32 grid_2d_y_size = Max(1, subgroup_size / 8); - - switch (iv3_dimension(cp->output_points)) { - - case 1:{ - if (has_x) sd->layout.x = subgroup_size; - if (has_y) sd->layout.y = subgroup_size; - if (has_z) sd->layout.z = subgroup_size; - }break; - - case 2:{ - if (has_x && has_y) {sd->layout.x = 8; sd->layout.y = grid_2d_y_size;} - if (has_x && has_z) {sd->layout.x = 8; sd->layout.z = grid_2d_y_size;} - if (has_y && has_z) {sd->layout.y = 8; sd->layout.z = grid_2d_y_size;} - }break; - case 3:{sd->layout = (uv3){{4, 4, 
grid_3d_z_size}};}break; - - InvalidDefaultCase; + case BeamformerShaderKind_DAS:{ + if (compute_plan_push_shader(cp, shader, sp)) { + BeamformerDASBakeParameters *db = &sd->bake.DAS; + db->data_kind = BeamformerDataKind_Float32; + if (cp->iq_pipeline) db->data_kind = BeamformerDataKind_Float32Complex; + + cp->voxel_transform = m4_mul(cp->ui_voxel_transform, pb->parameters.das_voxel_transform); + cp->xdc_element_pitch = pb->parameters.xdc_element_pitch; + + db->sampling_frequency = sampling_frequency; + db->demodulation_frequency = pb->parameters.demodulation_frequency; + db->speed_of_sound = pb->parameters.speed_of_sound; + db->time_offset = time_offset; + db->f_number = pb->parameters.f_number; + db->acquisition_kind = pb->parameters.acquisition_kind; + db->sample_count = sample_count; + db->channel_count = pb->parameters.channel_count; + db->acquisition_count = pb->parameters.acquisition_count; + db->interpolation_mode = pb->parameters.interpolation_mode; + db->transmit_angle = pb->parameters.focal_vector.E[0]; + db->focus_depth = pb->parameters.focal_vector.E[1]; + db->transmit_receive_orientation = pb->parameters.transmit_receive_orientation; + + // NOTE(rnp): old gcc will miscompile an assignment + mem_copy(cp->xdc_transform.E, pb->parameters.xdc_transform.E, sizeof(cp->xdc_transform)); + + u32 id = pb->parameters.acquisition_kind; + if (id == BeamformerAcquisitionKind_UFORCES || id == BeamformerAcquisitionKind_FORCES) + cp->voxel_transform = m4_mul(cp->xdc_transform, cp->voxel_transform); + + db->sparse = id == BeamformerAcquisitionKind_UFORCES || id == BeamformerAcquisitionKind_UHERCULES; + db->single_focus = pb->parameters.single_focus; + db->single_orientation = pb->parameters.single_orientation; + db->coherency_weighting = pb->parameters.coherency_weighting; + + sd->layout = layout_for_output(cp->output_points); + sd->dispatch = dispatch_for_output(sd->layout, cp->output_points); + + if (pb->parameters.coherency_weighting && + 
compute_plan_push_shader(cp, BeamformerShaderKind_CoherencyWeighting, sp)) + { + BeamformerShaderDescriptor *shader_descriptor = cp->shader_descriptors + cp->pipeline.shader_count - 1; + shader_descriptor->layout = sd->layout; + shader_descriptor->dispatch = sd->dispatch; + shader_descriptor->bake.CoherencyWeighting.data_kind = db->data_kind; + } } + }break; - sd->dispatch.x = (u32)ceil_f32((f32)cp->output_points.x / sd->layout.x); - sd->dispatch.y = (u32)ceil_f32((f32)cp->output_points.y / sd->layout.y); - sd->dispatch.z = (u32)ceil_f32((f32)cp->output_points.z / sd->layout.z); + #if 0 + case BeamformerShaderKind_Sum:{ + sd->bake.data_kind = BeamformerDataKind_Float32; + if (cp->iq_pipeline) + sd->bake.data_kind = BeamformerDataKind_Float32Complex; + + sd->layout = layout_for_output(cp->output_points); + sd->dispatch = dispatch_for_output(sd->layout, cp->output_points); commit = 1; }break; - default:{ commit = 1; }break; - } + #endif - if (commit) { - u32 index = cp->pipeline.shader_count++; - cp->pipeline.shaders[index] = shader; - cp->pipeline.parameters[index] = *sp; + default:{}break; } } cp->pipeline.data_kind = data_kind; } function void -stream_push_shader_header(Stream *s, BeamformerShaderKind shader_kind, s8 header) +stream_append_shader_header(Stream *s, i32 reloadable_index, BeamformerShaderDescriptor *sd, uv3 layout) { - stream_append_s8s(s, s8("#version 460 core\n\n"), header); - - switch (shader_kind) { - case BeamformerShaderKind_DAS:{ - stream_append_s8(s, s8("" - "layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC) ") uniform uint u_cycle_t;\n" - "layout(location = " str(DAS_FAST_CHANNEL_UNIFORM_LOC) ") uniform int u_channel;\n\n" - )); - }break; - case BeamformerShaderKind_Decode:{ - stream_append_s8s(s, s8("" - "layout(location = " str(DECODE_FIRST_PASS_UNIFORM_LOC) ") uniform bool u_first_pass;\n\n" - )); - }break; - case BeamformerShaderKind_MinMax:{ - stream_append_s8(s, s8("layout(location = " str(MIN_MAX_MIPS_LEVEL_UNIFORM_LOC) - ") uniform 
int u_mip_map;\n\n")); - }break; - case BeamformerShaderKind_Sum:{ - stream_append_s8(s, s8("layout(location = " str(SUM_PRESCALE_UNIFORM_LOC) - ") uniform float u_sum_prescale = 1.0;\n\n")); - }break; - default:{}break; + stream_append_s8s(s, s8("#version 460 core\n\n" + "#extension GL_EXT_buffer_reference : require\n" + "#extension GL_EXT_shader_16bit_storage : require\n" + "#extension GL_EXT_shader_explicit_arithmetic_types : require\n\n")); + + i32 header_vector_length = beamformer_shader_header_vector_lengths[reloadable_index]; + i32 *header_vector = beamformer_shader_header_vectors[reloadable_index]; + for (i32 index = 0; index < header_vector_length; index++) + stream_append_s8(s, beamformer_shader_global_header_strings[header_vector[index]]); + + if (layout.x != 0) { + stream_append_s8(s, s8("layout(local_size_x = ")); + stream_append_u64(s, layout.x); + stream_append_s8(s, s8(", local_size_y = ")); + stream_append_u64(s, layout.y); + stream_append_s8(s, s8(", local_size_z = ")); + stream_append_u64(s, layout.z); + stream_append_s8(s, s8(") in;\n\n")); } -} - -function void -load_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 shader_slot, Arena arena) -{ - BeamformerShaderKind shader = cp->pipeline.shaders[shader_slot]; - - u32 program = 0; - i32 reloadable_index = beamformer_shader_reloadable_index_by_shader[shader]; - if (reloadable_index != -1) { - BeamformerShaderKind base_shader = beamformer_reloadable_shader_kinds[reloadable_index]; - s8 path; - if (!BakeShaders) - path = push_s8_from_parts(&arena, os_path_separator(), s8("shaders"), - beamformer_reloadable_shader_files[reloadable_index]); - - Stream shader_stream = arena_stream(arena); - stream_push_shader_header(&shader_stream, base_shader, s8("")); - - i32 header_vector_length = beamformer_shader_header_vector_lengths[reloadable_index]; - i32 *header_vector = beamformer_shader_header_vectors[reloadable_index]; - for (i32 index = 0; index < header_vector_length; index++) - 
stream_append_s8(&shader_stream, beamformer_shader_global_header_strings[header_vector[index]]); - - BeamformerShaderDescriptor *sd = cp->shader_descriptors + shader_slot; - - if (sd->layout.x != 0) { - stream_append_s8(&shader_stream, s8("layout(local_size_x = ")); - stream_append_u64(&shader_stream, sd->layout.x); - stream_append_s8(&shader_stream, s8(", local_size_y = ")); - stream_append_u64(&shader_stream, sd->layout.y); - stream_append_s8(&shader_stream, s8(", local_size_z = ")); - stream_append_u64(&shader_stream, sd->layout.z); - stream_append_s8(&shader_stream, s8(") in;\n\n")); - } + if (sd) { u32 *parameters = (u32 *)&sd->bake; s8 *names = beamformer_shader_bake_parameter_names[reloadable_index]; u32 float_bits = beamformer_shader_bake_parameter_float_bits[reloadable_index]; i32 count = beamformer_shader_bake_parameter_counts[reloadable_index]; for (i32 index = 0; index < count; index++) { - stream_append_s8s(&shader_stream, s8("#define "), names[index], + stream_append_s8s(s, s8("#define "), names[index], (float_bits & (1 << index))? 
s8(" uintBitsToFloat") : s8(" "), s8("(0x")); - stream_append_hex_u64(&shader_stream, parameters[index]); - stream_append_s8(&shader_stream, s8(")\n")); + stream_append_hex_u64(s, parameters[index]); + stream_append_s8(s, s8(")\n")); } + } - if (!renderdoc_attached()) - stream_append_s8(&shader_stream, s8("\n#line 1\n")); + if (!renderdoc_attached()) + stream_append_s8(s, s8("\n\n#line 1\n")); +} + +function void +beamformer_reload_pipeline(VulkanHandle *pipeline, BeamformerShaderReloadInfo *sris, u32 count, Arena arena) +{ + assume(count <= 2); + s8 paths[2]; + VulkanPipelineCreateInfo infos[2]; + + if (!BakeShaders) { + for (u32 i = 0; i < count; i++) + paths[i] = push_s8_from_parts(&arena, os_path_separator(), s8("shaders"), sris[i].filename_or_data); + } + + u32 push_constants_size = 0; + for (u32 i = 0; i < count; i++) { + Stream shader_stream = arena_stream(arena); + i32 reloadable_index = beamformer_shader_reloadable_index_by_shader[sris[i].shader]; + if (i == 0) push_constants_size = beamformer_shader_push_constant_sizes[reloadable_index]; + else assert(push_constants_size == beamformer_shader_push_constant_sizes[reloadable_index]); + + stream_append_shader_header(&shader_stream, reloadable_index, sris[i].shader_descriptor, sris[i].layout); - s8 shader_text; if (BakeShaders) { - stream_append_s8(&shader_stream, beamformer_shader_data[reloadable_index]); - shader_text = arena_stream_commit(&arena, &shader_stream); + stream_append_s8(&shader_stream, sris[i].filename_or_data); } else { - shader_text = arena_stream_commit(&arena, &shader_stream); - i64 length = os_read_entire_file((c8 *)path.data, arena.beg, arena_capacity(&arena, u8)); - shader_text.len += length; - arena_commit(&arena, length); + shader_stream.widx += os_read_entire_file((c8 *)paths[i].data, + shader_stream.data + shader_stream.widx, + shader_stream.cap - shader_stream.widx); } - /* TODO(rnp): instance name */ - s8 shader_name = beamformer_shader_names[shader]; - program = load_shader(arena, 
&shader_text, (u32 []){GL_COMPUTE_SHADER}, 1, shader_name); + infos[i].kind = sris[i].shader_kind; + infos[i].text = arena_stream_commit_zero(&arena, &shader_stream); + infos[i].name = beamformer_shader_names[sris[i].shader]; + + //s8 line = s8("---------------\n"); + //s8 nl = s8("\n"); + //os_console_log(line.data, line.len); + //os_console_log(infos[i].name.data, infos[i].name.len); + //os_console_log(nl.data, nl.len); + //os_console_log(line.data, line.len); + //os_console_log(infos[i].text.data, infos[i].text.len); + //os_console_log(line.data, line.len); } - glDeleteProgram(cp->programs[shader_slot]); - cp->programs[shader_slot] = program; + vk_pipeline_release(*pipeline); + *pipeline = vk_pipeline(infos, count, push_constants_size); +} + +function void +beamformer_reload_render_pipeline(VulkanHandle *pipeline, BeamformerShaderKind shader, Arena arena) +{ + i32 index = beamformer_shader_reloadable_index_by_shader[shader]; + BeamformerShaderReloadInfo infos[2] = { + { + .shader = shader, + .shader_kind = beamformer_shader_primitive_is_vertex[index] ? VulkanShaderKind_Vertex : VulkanShaderKind_Mesh, + .filename_or_data = BakeShaders ? beamformer_shader_data[index][0] + : beamformer_reloadable_shader_files[index][0], + }, + { + .shader = shader, + .shader_kind = VulkanShaderKind_Fragment, + .filename_or_data = BakeShaders ? beamformer_shader_data[index][1] + : beamformer_reloadable_shader_files[index][1], + }, + }; + beamformer_reload_pipeline(pipeline, infos, countof(infos), arena); +} + +function void +beamformer_reload_compute_pipeline(VulkanHandle *pipeline, BeamformerShaderKind shader, + BeamformerShaderDescriptor *shader_descriptor, Arena arena) +{ + i32 index = beamformer_shader_reloadable_index_by_shader[shader]; + uv3 layout = shader_descriptor ? 
shader_descriptor->layout : (uv3){{vk_gpu_info()->subgroup_size, 1, 1}}; + BeamformerShaderReloadInfo info = { + .shader = shader, + .shader_kind = VulkanShaderKind_Compute, + .shader_descriptor = shader_descriptor, + .filename_or_data = BakeShaders ? beamformer_shader_data[index][0] + : beamformer_reloadable_shader_files[index][0], + .layout = layout, + }; + beamformer_reload_pipeline(pipeline, &info, 1, arena); } function void @@ -755,52 +722,58 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp, cp->shader_hashes[shader_slot] = hash; } - #define X(k, t, v) glNamedBufferSubData(cp->ubos[BeamformerComputeUBOKind_##k], \ - 0, sizeof(t), &cp->v ## _ubo_data); - BEAMFORMER_COMPUTE_UBO_LIST - #undef X - cp->acquisition_count = pb->parameters.acquisition_count; cp->acquisition_kind = pb->parameters.acquisition_kind; - u32 decoded_data_size = cp->rf_size; - if (ctx->compute_context.ping_pong_ssbo_size < decoded_data_size) - alloc_shader_storage(ctx, decoded_data_size, arena); + // NOTE(rnp): buffer size / 2 should be mutiple of 64 + i64 buffer_size = round_up_to(2 * cp->rf_size, 128); + if (ctx->compute_context.ping_pong_buffer.size < buffer_size) { + GPUBufferAllocateInfo allocate_info = {.size = buffer_size, .label = s8("PingPongBuffer")}; + vk_buffer_allocate(&ctx->compute_context.ping_pong_buffer, &allocate_info); + // TODO(rnp): figure out how to share with CUDA + } if (cp->hadamard_order != (i32)cp->acquisition_count) - update_hadamard_texture(cp, (i32)cp->acquisition_count, 0, arena); - - mem_copy(cp->voxel_transform.E, pb->parameters.das_voxel_transform.E, sizeof(cp->voxel_transform)); - - GLenum gl_kind = cp->iq_pipeline ? 
GL_RG32F : GL_R32F; - if (cp->average_frames > 1 && !beamformer_frame_compatible(ctx->averaged_frames + 0, cp->output_points, gl_kind)) { - alloc_beamform_frame(ctx->averaged_frames + 0, cp->output_points, gl_kind, s8("Averaged Frame"), arena); - alloc_beamform_frame(ctx->averaged_frames + 1, cp->output_points, gl_kind, s8("Averaged Frame"), arena); - } + update_hadamard(cp, (i32)cp->acquisition_count, 0, arena); }break; + case BeamformerParameterBlockRegion_ChannelMapping:{ cuda_set_channel_mapping(pb->channel_mapping); }break; + case BeamformerParameterRegionFlag_TransmitReceiveOrientations:{ + GPUBuffer *b = &cp->array_parameters; + u32 kind = BeamformerComputeArrayParameterKind_TransmitReceiveOrientations; + u64 offset = beamformer_compute_array_parameter_offsets[kind]; + u64 size = beamformer_compute_array_parameter_sizes[kind]; + { + Arena scratch = arena; + u16 *u16s = push_array(&scratch, u16, countof(pb->transmit_receive_orientations)); + for (u32 i = 0; i < countof(pb->transmit_receive_orientations); i++) + u16s[i] = pb->transmit_receive_orientations[i]; + + vk_buffer_range_upload(b, u16s, offset, size, 0); + } + }break; case BeamformerParameterRegionFlag_FocalVectors: case BeamformerParameterRegionFlag_SparseElements: - case BeamformerParameterRegionFlag_TransmitReceiveOrientations: { - BeamformerComputeTextureKind texture_kind = 0; - u32 pixel_type = 0, texture_format = 0; + u32 kind = BeamformerComputeArrayParameterKind_Count; switch (region) { - #define X(kind, _gl, tf, pt, ...) 
\ - case BeamformerParameterRegionFlag_##kind:{ \ - texture_kind = BeamformerComputeTextureKind_## kind; \ - texture_format = tf; \ - pixel_type = pt; \ + case BeamformerParameterBlockRegion_FocalVectors:{ + kind = BeamformerComputeArrayParameterKind_FocalVectors; + }break; + case BeamformerParameterBlockRegion_SparseElements:{ + kind = BeamformerComputeArrayParameterKind_SparseElements; }break; - BEAMFORMER_COMPUTE_TEXTURE_LIST - #undef X InvalidDefaultCase; } - glTextureSubImage1D(cp->textures[texture_kind], 0, 0, BeamformerMaxChannelCount, - texture_format, pixel_type, - (u8 *)pb + BeamformerParameterBlockRegionOffsets[region]); + + if (kind != BeamformerComputeArrayParameterKind_Count) { + GPUBuffer *b = &cp->array_parameters; + u64 offset = beamformer_compute_array_parameter_offsets[kind]; + u64 size = beamformer_compute_array_parameter_sizes[kind]; + vk_buffer_range_upload(b, (u8 *)pb + BeamformerParameterBlockRegionOffsets[region], offset, size, 0); + } }break; } } @@ -808,127 +781,205 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp, } function void -do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame *frame, - BeamformerShaderKind shader, u32 shader_slot, BeamformerShaderParameters *sp, Arena arena) +do_compute_shader(BeamformerCtx *ctx, VulkanHandle cmd, BeamformerComputePlan *cp, BeamformerFrame *frame, + u32 shader_slot, Arena arena, u64 rf_pointer) { BeamformerComputeContext *cc = &ctx->compute_context; - u32 program = cp->programs[shader_slot]; - glUseProgram(program); + u32 output_index = !cc->ping_pong_input_index; + u32 input_index = cc->ping_pong_input_index; - u32 output_ssbo_idx = !cc->last_output_ssbo_index; - u32 input_ssbo_idx = cc->last_output_ssbo_index; + u64 pp_size = cc->ping_pong_buffer.size / 2; + u64 pp_input_pointer = cc->ping_pong_buffer.gpu_pointer + input_index * pp_size; + u64 pp_output_pointer = cc->ping_pong_buffer.gpu_pointer + output_index * pp_size; uv3 dispatch 
= cp->shader_descriptors[shader_slot].dispatch; - switch (shader) { - case BeamformerShaderKind_Decode:{ - glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R16F); + vk_command_bind_pipeline(cmd, cp->vulkan_pipelines[shader_slot]); + + switch (cp->pipeline.shaders[shader_slot]) { + + case BeamformerShaderKind_Decode:{ BeamformerDecodeMode mode = cp->shader_descriptors[shader_slot].bake.Decode.decode_mode; - if (shader_slot == 0) { - if (mode != BeamformerDecodeMode_None) { - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[input_ssbo_idx]); - glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1); + BeamformerDecodePushConstants pc = { + .hadamard_buffer = cp->array_parameters.gpu_pointer + offsetof(BeamformerComputeArrayParameters, Hadamard), + .output_buffer = pp_output_pointer, + }; - glDispatchCompute(dispatch.x, dispatch.y, dispatch.z); - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); - } - } + if (shader_slot == 0 && mode != BeamformerDecodeMode_None) { + pc.output_rf_buffer = pp_input_pointer; + pc.rf_buffer = rf_pointer; + pc.first_pass = 1; - if (mode != BeamformerDecodeMode_None) - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]); + GPUMemoryBarrierInfo barrier = { + .gpu_buffer = &cc->ping_pong_buffer, + .offset = pp_input_pointer - cc->ping_pong_buffer.gpu_pointer, + .size = pp_size, + }; - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, cc->ping_pong_ssbos[output_ssbo_idx]); + vk_command_push_constants(cmd, 0, sizeof(pc), &pc); + vk_command_dispatch_compute(cmd, dispatch); + vk_command_buffer_memory_barriers(cmd, &barrier, 1); - glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 0); + pc.output_rf_buffer = 0; + } + + pc.rf_buffer = pp_input_pointer; + pc.first_pass = 0; - glDispatchCompute(dispatch.x, dispatch.y, dispatch.z); - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + GPUMemoryBarrierInfo barrier = { + .gpu_buffer = 
&cc->ping_pong_buffer, + .offset = pp_output_pointer - cc->ping_pong_buffer.gpu_pointer, + .size = pp_size, + }; - cc->last_output_ssbo_index = !cc->last_output_ssbo_index; + vk_command_push_constants(cmd, 0, sizeof(pc), &pc); + vk_command_dispatch_compute(cmd, dispatch); + vk_command_buffer_memory_barriers(cmd, &barrier, 1); + + cc->ping_pong_input_index = !cc->ping_pong_input_index; }break; + case BeamformerShaderKind_CudaDecode:{ - cuda_decode(0, output_ssbo_idx, 0); - cc->last_output_ssbo_index = !cc->last_output_ssbo_index; + cuda_decode(0, output_index, 0); + cc->ping_pong_input_index = !cc->ping_pong_input_index; }break; case BeamformerShaderKind_CudaHilbert:{ - cuda_hilbert(input_ssbo_idx, output_ssbo_idx); - cc->last_output_ssbo_index = !cc->last_output_ssbo_index; + cuda_hilbert(input_index, output_index); + cc->ping_pong_input_index = !cc->ping_pong_input_index; }break; + case BeamformerShaderKind_Filter: case BeamformerShaderKind_Demodulate: { - if (shader_slot != 0) - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[output_ssbo_idx]); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, cp->filters[sp->filter_slot].ssbo); + u32 filter_slot = cp->pipeline.parameters[shader_slot].filter_slot; + BeamformerFilterPushConstants pc = { + .filter_coefficients = cp->filters[filter_slot].buffer.gpu_pointer, + .output_data = pp_output_pointer, + .input_data = shader_slot == 0 ? 
rf_pointer : pp_input_pointer, + }; - glDispatchCompute(dispatch.x, dispatch.y, dispatch.z); - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + GPUMemoryBarrierInfo barrier = { + .gpu_buffer = &cc->ping_pong_buffer, + .offset = pp_output_pointer - cc->ping_pong_buffer.gpu_pointer, + .size = pp_size, + }; - cc->last_output_ssbo_index = !cc->last_output_ssbo_index; - }break; - case BeamformerShaderKind_MinMax:{ - for (i32 i = 1; i < frame->mips; i++) { - glBindImageTexture(0, frame->texture, i - 1, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); - glBindImageTexture(1, frame->texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); - glProgramUniform1i(program, MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i); + vk_command_push_constants(cmd, 0, sizeof(pc), &pc); + vk_command_dispatch_compute(cmd, dispatch); + vk_command_buffer_memory_barriers(cmd, &barrier, 1); - u32 width = (u32)frame->dim.x >> i; - u32 height = (u32)frame->dim.y >> i; - u32 depth = (u32)frame->dim.z >> i; - glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32)); - glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); - } + cc->ping_pong_input_index = !cc->ping_pong_input_index; }break; + case BeamformerShaderKind_DAS:{ local_persist u32 das_cycle_t = 0; - BeamformerDASBakeParameters *db = &cp->shader_descriptors[shader_slot].bake.DAS; - if (db->fast) { - glClearTexImage(frame->texture, 0, GL_RED, GL_FLOAT, 0); - glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT); - glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_READ_WRITE, cp->iq_pipeline ? 
GL_RG32F : GL_R32F); + GPUBuffer *b = cc->backlog.buffer; + + u64 frame_size = beamformer_frame_byte_size(frame->points, frame->data_kind); + u64 incoherent_size = frame_size / beamformer_data_kind_element_count[frame->data_kind]; + + BeamformerDASPushConstants pc = { + .xdc_element_pitch = cp->xdc_element_pitch, + .rf_data = pp_input_pointer, + .output_data = b->gpu_pointer + frame->buffer_offset, + .incoherent_output = b->gpu_pointer + b->size - incoherent_size, + .array_parameters = cp->array_parameters.gpu_pointer + offsetof(BeamformerDASArrayParameters, focal_vectors), + .output_size_x = cp->output_points.x, + .output_size_y = cp->output_points.y, + .output_size_z = cp->output_points.z, + .cycle_t = das_cycle_t++, + }; + mem_copy(pc.voxel_transform.E, cp->voxel_transform.E, sizeof(pc.voxel_transform)); + mem_copy(pc.xdc_transform.E, cp->xdc_transform.E, sizeof(pc.xdc_transform)); + + b32 coherent = cp->shader_descriptors[shader_slot].bake.DAS.coherency_weighting; + + i32 loop_end; + if (cp->acquisition_kind == BeamformerAcquisitionKind_RCA_VLS || + cp->acquisition_kind == BeamformerAcquisitionKind_RCA_TPW) + { + /* NOTE(rnp): to avoid repeatedly sampling the whole focal vectors + * texture we loop over transmits for VLS/TPW */ + loop_end = (i32)cp->acquisition_count; } else { - glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, cp->iq_pipeline ? 
GL_RG32F : GL_R32F); + loop_end = (i32)cp->shader_descriptors[shader_slot].bake.DAS.channel_count; } - u32 sparse_texture = cp->textures[BeamformerComputeTextureKind_SparseElements]; - if (!db->sparse) sparse_texture = 0; - - glBindBufferBase(GL_UNIFORM_BUFFER, 0, cp->ubos[BeamformerComputeUBOKind_DAS]); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx], 0, cp->rf_size); - glBindImageTexture(1, sparse_texture, 0, 0, 0, GL_READ_ONLY, GL_R16I); - glBindImageTexture(2, cp->textures[BeamformerComputeTextureKind_FocalVectors], 0, 0, 0, GL_READ_ONLY, GL_RG32F); - glBindImageTexture(3, cp->textures[BeamformerComputeTextureKind_TransmitReceiveOrientations], 0, 0, 0, GL_READ_ONLY, GL_R8I); + GPUMemoryBarrierInfo memory_barriers[2] = { + { + .gpu_buffer = b, + .offset = frame->buffer_offset, + .size = frame_size, + }, + { + .gpu_buffer = b, + .offset = pc.incoherent_output - b->gpu_pointer, + .size = incoherent_size, + }, + }; - glProgramUniform1ui(program, DAS_CYCLE_T_UNIFORM_LOC, das_cycle_t++); + // NOTE(rnp): barrier to wait for clear pipeline to complete + vk_command_buffer_memory_barriers(cmd, memory_barriers, 1 + coherent); - if (db->fast) { - i32 loop_end; - if (db->acquisition_kind == BeamformerAcquisitionKind_RCA_VLS || - db->acquisition_kind == BeamformerAcquisitionKind_RCA_TPW) - { - /* NOTE(rnp): to avoid repeatedly sampling the whole focal vectors - * texture we loop over transmits for VLS/TPW */ - loop_end = (i32)db->acquisition_count; - } else { - loop_end = (i32)db->channel_count; - } - f32 percent_per_step = 1.0f / (f32)loop_end; - cc->processing_progress = -percent_per_step; - for (i32 index = 0; index < loop_end; index++) { - cc->processing_progress += percent_per_step; - /* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */ - glFinish(); - glProgramUniform1i(program, DAS_FAST_CHANNEL_UNIFORM_LOC, index); - glDispatchCompute(dispatch.x, dispatch.y, dispatch.z); - 
glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); + vk_command_push_constants(cmd, 0, sizeof(pc), &pc); + for (i32 index = 0; index < loop_end; index++) { + if (index != 0) { + pc.channel_t = index; + vk_command_push_constants(cmd, offsetof(BeamformerDASPushConstants, channel_t), + sizeof(pc.channel_t), &pc.channel_t); } - } else { - glDispatchCompute(dispatch.x, dispatch.y, dispatch.z); + vk_command_dispatch_compute(cmd, dispatch); + vk_command_buffer_memory_barriers(cmd, memory_barriers, 1 + coherent); + } + }break; + + case BeamformerShaderKind_CoherencyWeighting:{ + GPUBuffer *b = cc->backlog.buffer; + + u64 frame_size = beamformer_frame_byte_size(frame->points, frame->data_kind); + u64 incoherent_size = frame_size / beamformer_data_kind_element_count[frame->data_kind]; + + GPUMemoryBarrierInfo memory_barrier = { + .gpu_buffer = b, + .offset = frame->buffer_offset, + .size = frame_size, + }; + + BeamformerCoherencyWeightingPushConstants cwpc = { + .left_side_buffer = b->gpu_pointer + frame->buffer_offset, + .right_side_buffer = b->gpu_pointer + b->size - incoherent_size, + .elements = incoherent_size / beamformer_data_kind_element_size[frame->data_kind], + .scale = 1.0f, + .output_size_x = cp->output_points.x, + .output_size_y = cp->output_points.y, + .output_size_z = cp->output_points.z, + }; + + vk_command_push_constants(cmd, 0, sizeof(cwpc), &cwpc); + vk_command_dispatch_compute(cmd, dispatch); + vk_command_buffer_memory_barriers(cmd, &memory_barrier, 1); + }break; + + // NOTE(rnp): invalid stages should be filtered in planning phase + InvalidDefaultCase; + } + + #if 0 + switch (shader) { + case BeamformerShaderKind_MinMax:{ + for (u32 i = 1; i < frame->image.mip_map_levels; i++) { + glBindImageTexture(0, frame->texture, i - 1, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); + glBindImageTexture(1, frame->texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); + glProgramUniform1i(program, MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i); + + u32 width = (u32)frame->dim.x >> i; + 
u32 height = (u32)frame->dim.y >> i; + u32 depth = (u32)frame->dim.z >> i; + glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32)); + glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); } - glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); }break; case BeamformerShaderKind_Sum:{ u32 aframe_index = ctx->averaged_frame_index % countof(ctx->averaged_frames); @@ -950,77 +1001,27 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame assert(to_average == frame_count); glProgramUniform1f(program, SUM_PRESCALE_UNIFORM_LOC, 1 / (f32)frame_count); - do_sum_shader(cc, in_textures, frame_count, aframe->texture, aframe->dim); + /* NOTE: zero output before summing */ + glClearTexImage(aframe->texture, 0, GL_RED, GL_FLOAT, 0); + glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT); + + glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F); + for (u32 i = 0; i < in_texture_count; i++) { + glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); + glDispatchCompute(dispatch.x, dispatch.y, dispatch.z); + glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); + } + mem_copy(aframe->voxel_transform.E, frame->voxel_transform.E, sizeof(frame->voxel_transform)); aframe->compound_count = frame->compound_count; aframe->acquisition_kind = frame->acquisition_kind; }break; - InvalidDefaultCase; } -} - -function s8 -shader_text_with_header(s8 header, s8 filepath, b32 has_file, BeamformerShaderKind shader_kind, Arena *arena) -{ - Stream sb = arena_stream(*arena); - stream_push_shader_header(&sb, shader_kind, header); - stream_append_s8(&sb, s8("\n#line 1\n")); - - s8 result; - if (BakeShaders) { - /* TODO(rnp): better handling of shaders with no backing file */ - if (has_file) { - i32 reloadable_index = beamformer_shader_reloadable_index_by_shader[shader_kind]; - stream_append_s8(&sb, beamformer_shader_data[reloadable_index]); - } - result = arena_stream_commit(arena, &sb); - } 
else { - result = arena_stream_commit(arena, &sb); - if (has_file) { - i64 length = os_read_entire_file((c8 *)filepath.data, arena->beg, arena_capacity(arena, u8)); - result.len += length; - arena_commit(arena, length); - } - } - - return result; -} - -/* NOTE(rnp): currently this function is only handling rendering shaders. - * look at load_compute_shader for compute shaders */ -function void -beamformer_reload_shader(BeamformerCtx *ctx, BeamformerShaderReloadContext *src, Arena arena, s8 shader_name) -{ - BeamformerShaderKind kind = beamformer_reloadable_shader_kinds[src->reloadable_info_index]; - assert(kind == BeamformerShaderKind_Render3D); - - s8 path = push_s8_from_parts(&arena, os_path_separator(), s8("shaders"), - beamformer_reloadable_shader_files[src->reloadable_info_index]); - - i32 shader_count = 1; - BeamformerShaderReloadContext *link = src->link; - while (link != src) { shader_count++; link = link->link; } - - s8 *shader_texts = push_array(&arena, s8, shader_count); - u32 *shader_types = push_array(&arena, u32, shader_count); - - i32 index = 0; - do { - b32 has_file = link->reloadable_info_index >= 0; - shader_texts[index] = shader_text_with_header(link->header, path, has_file, kind, &arena); - shader_types[index] = link->gl_type; - index++; - link = link->link; - } while (link != src); - - u32 *shader = &ctx->frame_view_render_context.shader; - glDeleteProgram(*shader); - *shader = load_shader(arena, shader_texts, shader_types, shader_count, shader_name); - ctx->frame_view_render_context.updated = 1; + #endif } function void -complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_context) +complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena) { BeamformerComputeContext * cs = &ctx->compute_context; BeamformerSharedMemory * sm = ctx->shared_memory; @@ -1029,6 +1030,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c while (work) { b32 can_commit = 1; switch (work->kind) { + 
case BeamformerWorkKind_ExportBuffer:{ /* TODO(rnp): better way of handling DispatchCompute barrier */ post_sync_barrier(ctx->shared_memory, BeamformerSharedMemoryLockKind_DispatchCompute); @@ -1036,15 +1038,15 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c BeamformerExportContext *ec = &work->export_context; switch (ec->kind) { case BeamformerExportKind_BeamformedData:{ - BeamformerFrame *frame = ctx->latest_frame; - if (frame) { - assert(frame->ready_to_present); - u32 texture = frame->texture; - iv3 dim = frame->dim; - u32 out_size = (u32)dim.x * (u32)dim.y * (u32)dim.z * 2 * sizeof(f32); - if (out_size <= ec->size) { - glGetTextureImage(texture, 0, GL_RG, GL_FLOAT, (i32)out_size, - beamformer_shared_memory_scratch_arena(sm, ctx->shared_memory_size).beg); + BeamformerFrame *f = ctx->latest_frame; + if (f) { + u64 frame_size = beamformer_frame_byte_size(f->points, f->data_kind); + assert((frame_size & 63) == 0); + if (frame_size <= ec->size) { + vk_host_wait_timeline(VulkanTimeline_Compute, f->timeline_valid_value, -1ULL); + vk_buffer_range_download(beamformer_shared_memory_scratch_arena(sm, ctx->shared_memory_size).beg, + ctx->compute_context.backlog.buffer, f->buffer_offset, + frame_size, 1); } } }break; @@ -1062,6 +1064,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c beamformer_shared_memory_release_lock(ctx->shared_memory, work->lock); post_sync_barrier(ctx->shared_memory, BeamformerSharedMemoryLockKind_ExportSync); }break; + case BeamformerWorkKind_CreateFilter:{ /* TODO(rnp): this should probably get deleted and moved to lazy loading */ BeamformerCreateFilterContext *fctx = &work->create_filter_context; @@ -1070,20 +1073,18 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c BeamformerComputePlan *cp = beamformer_compute_plan_for_block(cs, block, arena); beamformer_filter_update(cp->filters + slot, fctx->parameters, block, slot, *arena); }break; + case 
BeamformerWorkKind_ComputeIndirect:{ fill_frame_compute_work(ctx, work, work->compute_indirect_context.view_plane, work->compute_indirect_context.parameter_block, 1); } /* FALLTHROUGH */ - case BeamformerWorkKind_Compute:{ - DEBUG_DECL(glClearNamedBufferData(cs->ping_pong_ssbos[0], GL_RG32F, GL_RG, GL_FLOAT, 0);) - DEBUG_DECL(glClearNamedBufferData(cs->ping_pong_ssbos[1], GL_RG32F, GL_RG, GL_FLOAT, 0);) - DEBUG_DECL(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);) + case BeamformerWorkKind_Compute:{ push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameBegin}); BeamformerComputePlan *cp = beamformer_compute_plan_for_block(cs, work->compute_context.parameter_block, arena); - if (beamformer_parameter_block_dirty(sm, work->compute_context.parameter_block)) { + if unlikely(beamformer_parameter_block_dirty(sm, work->compute_context.parameter_block)) { u32 block = work->compute_context.parameter_block; beamformer_commit_parameter_block(ctx, cp, block, *arena); } @@ -1094,91 +1095,134 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c static_assert(ISPOWEROF2(BeamformerMaxComputeShaderStages), "max compute shader stages must be power of 2"); assert((dirty_programs & ~((u32)BeamformerMaxComputeShaderStages - 1)) == 0); - for EachBit(dirty_programs, slot) - load_compute_shader(ctx, cp, (u32)slot, *arena); + for EachBit(dirty_programs, slot) { + beamformer_reload_compute_pipeline(cp->vulkan_pipelines + slot, cp->pipeline.shaders[slot], + cp->shader_descriptors + slot, *arena); + } atomic_store_u32(&cs->processing_compute, 1); - start_renderdoc_capture(gl_context); - BeamformerFrame *frame = work->compute_context.frame; + start_renderdoc_capture(); - GLenum gl_kind = cp->iq_pipeline ? 
GL_RG32F : GL_R32F; - if (!beamformer_frame_compatible(frame, cp->output_points, gl_kind)) - alloc_beamform_frame(frame, cp->output_points, gl_kind, s8("Beamformed_Data"), *arena); + i32 das_index = -1; + b32 has_sum = 0; + for (u32 i = 0; i < cp->pipeline.shader_count; i++) { + has_sum |= cp->pipeline.shaders[i] == BeamformerShaderKind_Sum; + if (cp->pipeline.shaders[i] == BeamformerShaderKind_DAS) + das_index = (i32)i; + } - m4 voxel_transform = m4_mul(cp->ui_voxel_transform, cp->voxel_transform); - mem_copy(frame->voxel_transform.E, voxel_transform.E, sizeof(voxel_transform)); + b32 das_coherent = das_index >= 0 && cp->shader_descriptors[das_index].bake.DAS.coherency_weighting; + u64 reserved_frame_size = 0; + + if (has_sum) + reserved_frame_size += beamformer_frame_byte_size(cp->output_points, cp->iq_pipeline ? + BeamformerDataKind_Float32Complex : + BeamformerDataKind_Float32); + + // TODO(rnp): incoherent sum for different data kinds + if (das_coherent) + reserved_frame_size += beamformer_frame_byte_size(cp->output_points, BeamformerDataKind_Float32); + + BeamformerFrame *frame = beamformer_frame_next(cs, cp->output_points, cp->iq_pipeline, reserved_frame_size); frame->acquisition_kind = cp->acquisition_kind; frame->compound_count = cp->acquisition_count; - - BeamformerComputeContext *cc = &ctx->compute_context; - BeamformerComputePipeline *pipeline = &cp->pipeline; - /* NOTE(rnp): first stage requires access to raw data buffer directly so we break - * it out into a separate step. This way data can get released as soon as possible */ - if (pipeline->shader_count > 0) { - BeamformerRFBuffer *rf = &cs->rf_buffer; - u32 compute_index = rf->compute_index; - u32 slot = compute_index % countof(rf->compute_syncs); - - if (work->kind == BeamformerWorkKind_ComputeIndirect) { - /* NOTE(rnp): compute indirect is used when uploading data. if compute thread - * preempts upload it must wait for slot counter to reach a value it hasn't - * processed yet. 
*/ - spin_wait(atomic_load_u64(rf->uploaded_data_indices + slot) <= compute_index); - - /* NOTE(rnp): if the GPU supports BAR there may be no need to synchronize - * other than the above spin */ - if (vk_buffer_needs_sync(&rf->buffer)) - glWaitSemaphoreEXT(rf->gl_upload_semaphores[slot], 0, 0, 0, 0, 0); - } else { - slot = (rf->compute_index - 1) % countof(rf->compute_syncs); + mem_copy(frame->voxel_transform.E, cp->voxel_transform.E, sizeof(cp->voxel_transform)); + + VulkanHandle cmd = vk_command_begin(VulkanTimeline_Compute); + vk_command_timestamp(cmd); + + if (das_index >= 0) { + GPUBuffer *backlog = cs->backlog.buffer; + u32 subgroup_size = vk_gpu_info()->subgroup_size; + BeamformerBufferClearPushConstants pc = { + .data = backlog->gpu_pointer + frame->buffer_offset, + .clear_word = 0, + .words = beamformer_frame_byte_size(frame->points, frame->data_kind) / sizeof(u32), + }; + + u32 index = BeamformerShaderKind_BufferClear - BeamformerShaderKind_ComputeInternalFirst; + vk_command_bind_pipeline(cmd, cs->compute_internal_pipelines[index]); + vk_command_push_constants(cmd, 0, sizeof(pc), &pc); + vk_command_dispatch_compute(cmd, (uv3){{(u32)ceil_f32((f32)pc.words / subgroup_size), 1, 1}}); + + if (das_coherent) { + pc.words = pc.words / beamformer_data_kind_element_count[frame->data_kind]; + pc.data = backlog->gpu_pointer + backlog->size - sizeof(u32) * pc.words; + vk_command_push_constants(cmd, 0, sizeof(pc), &pc); + vk_command_dispatch_compute(cmd, (uv3){{(u32)ceil_f32((f32)pc.words / subgroup_size), 1, 1}}); } + } - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, rf->ssbo, slot * rf->active_rf_size, rf->active_rf_size); + BeamformerRFBuffer *rf = &cs->rf_buffer; + u32 compute_index = rf->compute_index; + u32 slot = compute_index % countof(rf->upload_complete_values); - glBeginQuery(GL_TIME_ELAPSED, cc->shader_timer_ids[0]); - do_compute_shader(ctx, cp, frame, pipeline->shaders[0], 0, pipeline->parameters + 0, *arena); - glEndQuery(GL_TIME_ELAPSED); + if 
(work->kind == BeamformerWorkKind_ComputeIndirect) { + // TODO(rnp): this shouldn't be necessary, there should be a way of communicating + // what the value will be so that the only the command wait is needed. + spin_wait(atomic_load_u64(rf->upload_complete_values + slot) <= compute_index); - if (work->kind == BeamformerWorkKind_ComputeIndirect) { - atomic_store_u64(rf->compute_syncs + slot, glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0)); - atomic_add_u64(&rf->compute_index, 1); - } + /* NOTE(rnp): if the GPU supports BAR there may be no need to synchronize + * other than the above spin */ + if (vk_buffer_needs_sync(&rf->buffer)) + vk_command_wait_timeline(cmd, VulkanTimeline_Transfer, rf->upload_complete_values[slot]); + } else { + slot = (rf->compute_index - 1) % countof(rf->upload_complete_values); + } + + for (u32 i = 0; i < cp->pipeline.shader_count; i++) { + do_compute_shader(ctx, cmd, cp, frame, i, *arena, + rf->buffer.gpu_pointer + slot * rf->active_rf_size); + vk_command_timestamp(cmd); } - b32 did_sum_shader = 0; - for (u32 i = 1; i < pipeline->shader_count; i++) { - did_sum_shader |= pipeline->shaders[i] == BeamformerShaderKind_Sum; - glBeginQuery(GL_TIME_ELAPSED, cc->shader_timer_ids[i]); - do_compute_shader(ctx, cp, frame, pipeline->shaders[i], i, pipeline->parameters + i, *arena); - glEndQuery(GL_TIME_ELAPSED); + u64 end_timeline_value = vk_command_end(cmd, (VulkanHandle){0}, (VulkanHandle){0}); + if (work->kind == BeamformerWorkKind_ComputeIndirect) { + atomic_store_u64(rf->compute_complete_values + slot, end_timeline_value); + atomic_add_u64(&rf->compute_index, 1); } - /* NOTE(rnp): the first of these blocks until work completes */ - for (u32 i = 0; i < pipeline->shader_count; i++) { - ComputeTimingInfo info = {0}; - info.kind = ComputeTimingInfoKind_Shader; - info.shader = pipeline->shaders[i]; - glGetQueryObjectui64v(cc->shader_timer_ids[i], GL_QUERY_RESULT, &info.timer_count); - push_compute_timing_info(ctx->compute_timing_table, info); + 
atomic_store_u64(&frame->timeline_valid_value, end_timeline_value); + + { + Arena scratch = *arena; + /* NOTE(rnp): this blocks until work completes */ + u64 * timestamps = vk_command_read_timestamps(VulkanTimeline_Compute, &scratch); + + u64 last_time = timestamps[0] > 0 ? timestamps[1] : 0; + u32 shader_index = 0; + for (u64 i = 2; i < timestamps[0] + 1; i++) { + push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){ + .kind = ComputeTimingInfoKind_Shader, + .shader = cp->pipeline.shaders[shader_index], + .shader_slot = shader_index, + .timer_count = timestamps[i] - last_time, + }); + last_time = timestamps[i]; + shader_index++; + } } + cs->processing_progress = 1; - frame->ready_to_present = 1; - if (did_sum_shader) { + if (has_sum) { + #if 0 u32 aframe_index = ((ctx->averaged_frame_index++) % countof(ctx->averaged_frames)); ctx->averaged_frames[aframe_index].view_plane_tag = frame->view_plane_tag; ctx->averaged_frames[aframe_index].ready_to_present = 1; atomic_store_u64((u64 *)&ctx->latest_frame, (u64)(ctx->averaged_frames + aframe_index)); + #endif } else { atomic_store_u64((u64 *)&ctx->latest_frame, (u64)frame); } - cs->processing_compute = 0; + + atomic_store_u32(&cs->processing_compute, 0); push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameEnd}); - end_renderdoc_capture(gl_context); + end_renderdoc_capture(); }break; InvalidDefaultCase; } @@ -1199,95 +1243,69 @@ coalesce_timing_table(ComputeTimingTable *t, ComputeShaderStats *stats) u32 target = atomic_load_u32(&t->write_index); u32 stats_index = (stats->latest_frame_index + 1) % countof(stats->table.times); - static_assert(BeamformerShaderKind_Count + 1 <= 32, "timing coalescence bitfield test"); - u32 seen_info_test = 0; + b32 has_rf = 0; + f32 gpu_clocks_to_nano = 1.0e-9f * vk_gpu_info()->timestamp_period_ns; + // NOTE(rnp): not equal (the index may wrap) while (t->read_index != target) { ComputeTimingInfo info = 
t->buffer[t->read_index % countof(t->buffer)]; switch (info.kind) { + case ComputeTimingInfoKind_ComputeFrameBegin:{ assert(t->compute_frame_active == 0); t->compute_frame_active = 1; /* NOTE(rnp): allow multiple instances of same shader to accumulate */ + t->in_flight_shader_count = 0; + memory_clear(t->in_flight_shader_ids, 0, sizeof(t->in_flight_shader_ids)); memory_clear(stats->table.times[stats_index], 0, sizeof(stats->table.times[stats_index])); }break; + case ComputeTimingInfoKind_ComputeFrameEnd:{ assert(t->compute_frame_active == 1); t->compute_frame_active = 0; stats->latest_frame_index = stats_index; stats_index = (stats_index + 1) % countof(stats->table.times); + stats->table.shader_count = t->in_flight_shader_count; + mem_copy(stats->table.shader_ids, t->in_flight_shader_ids, sizeof(t->in_flight_shader_ids)); }break; + case ComputeTimingInfoKind_Shader:{ - stats->table.times[stats_index][info.shader] += (f32)info.timer_count / 1.0e9f; - seen_info_test |= (1u << info.shader); + t->in_flight_shader_count = Max(t->in_flight_shader_count, info.shader_slot + 1u); + t->in_flight_shader_ids[info.shader_slot] = info.shader; + stats->table.times[stats_index][info.shader_slot] += info.timer_count * gpu_clocks_to_nano; }break; + case ComputeTimingInfoKind_RF_Data:{ stats->latest_rf_index = (stats->latest_rf_index + 1) % countof(stats->table.rf_time_deltas); - f32 delta = (f32)(info.timer_count - stats->last_rf_timer_count) / 1.0e9f; + f32 delta = info.timer_count / (f32)os_system_info()->timer_frequency; stats->table.rf_time_deltas[stats->latest_rf_index] = delta; - stats->last_rf_timer_count = info.timer_count; - seen_info_test |= (1 << BeamformerShaderKind_Count); + has_rf = 1; }break; } /* NOTE(rnp): do this at the end so that stats table is always in a consistent state */ - atomic_add_u32(&t->read_index, 1); + t->read_index++; } - if (seen_info_test) { - for EachEnumValue(BeamformerShaderKind, shader) { - if (seen_info_test & (1 << shader)) { - f32 sum = 0; - 
for EachElement(stats->table.times, i) - sum += stats->table.times[i][shader]; - stats->average_times[shader] = sum / countof(stats->table.times); - } - } + for (u32 i = 0; i < stats->table.shader_count; i++) { + f32 sum = 0; + for EachElement(stats->table.times, it) + sum += stats->table.times[it][i]; + stats->average_times[i] = sum / countof(stats->table.times); + } - if (seen_info_test & (1 << BeamformerShaderKind_Count)) { - f32 sum = 0; - for EachElement(stats->table.rf_time_deltas, i) - sum += stats->table.rf_time_deltas[i]; - stats->rf_time_delta_average = sum / countof(stats->table.rf_time_deltas); - } + if (has_rf) { + f32 sum = 0; + for EachElement(stats->table.rf_time_deltas, i) + sum += stats->table.rf_time_deltas[i]; + stats->rf_time_delta_average = sum / countof(stats->table.rf_time_deltas); } } DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute) { - BeamformerCtx *ctx = (BeamformerCtx *)user_context; BeamformerSharedMemory *sm = ctx->shared_memory; - complete_queue(ctx, &sm->external_work_queue, arena, gl_context); - complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context); -} - -function void -beamformer_rf_buffer_allocate(BeamformerRFBuffer *rf, u32 rf_size) -{ - if ValidHandle(rf->export_handle) - os_release_handle(rf->export_handle); - - OSHandle export = {0}; - vk_buffer_allocate(&rf->buffer, (iz)rf_size, GPUBufferCreateFlags_HostWritable|GPUBufferCreateFlags_MemoryOnly, - &export, s8("")); - - glDeleteBuffers(1, &rf->ssbo); - glCreateBuffers(1, &rf->ssbo); - - glDeleteMemoryObjectsEXT(1, &rf->memory_object); - glCreateMemoryObjectsEXT(1, &rf->memory_object); - - if (OS_WINDOWS) { - glImportMemoryWin32HandleEXT(rf->memory_object, rf->buffer.size, GL_HANDLE_TYPE_OPAQUE_WIN32_EXT, - (void *)export.value[0]); - // NOTE(rnp): w32 does not transfer ownership from handle back to driver - rf->export_handle = export; - } else { - glImportMemoryFdEXT(rf->memory_object, rf->buffer.size, GL_HANDLE_TYPE_OPAQUE_FD_EXT, 
export.value[0]); - } - - glNamedBufferStorageMemEXT(rf->ssbo, rf->buffer.size, rf->memory_object, 0); - - LABEL_GL_OBJECT(GL_BUFFER, rf->ssbo, s8("Raw_RF_SSBO")); + complete_queue(ctx, &sm->external_work_queue, arena); + complete_queue(ctx, ctx->beamform_work_queue, arena); } DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload) @@ -1305,22 +1323,20 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload) BeamformerRFBuffer *rf = ctx->rf_buffer; rf->active_rf_size = vk_round_up_to_sync_size(rf_block_rf_size & 0xFFFFFFFFULL, 64); - if unlikely(rf->buffer.size < countof(rf->compute_syncs) * rf->active_rf_size) - beamformer_rf_buffer_allocate(rf, countof(rf->compute_syncs) * rf->active_rf_size); + if unlikely(rf->buffer.size < countof(rf->upload_complete_values) * rf->active_rf_size) { + GPUBufferAllocateInfo allocate_info = { + .size = countof(rf->upload_complete_values) * rf->active_rf_size, + .flags = VulkanUsageFlag_HostReadWrite, + .label = s8("RawRFBuffer"), + }; + vk_buffer_allocate(&rf->buffer, &allocate_info); + } - u32 slot = rf->insertion_index++ % countof(rf->compute_syncs); + u32 slot = rf->insertion_index % countof(rf->upload_complete_values); /* NOTE(rnp): don't overwrite slot if the compute thread hasn't processed it */ - u64 current_slot_value = rf->uploaded_data_indices[slot]; - spin_wait(atomic_load_u64(&rf->compute_index) < current_slot_value); - - if (atomic_load_u64(rf->compute_syncs + slot)) { - GLenum sync_result = glClientWaitSync(rf->compute_syncs[slot], 0, 1000000000); - if (sync_result == GL_TIMEOUT_EXPIRED || sync_result == GL_WAIT_FAILED) { - // TODO(rnp): what do? 
- } - glDeleteSync(rf->compute_syncs[slot]); - } + spin_wait(atomic_load_u64(&rf->compute_index) < rf->upload_complete_values[slot]); + vk_host_wait_timeline(VulkanTimeline_Compute, rf->compute_complete_values[slot], -1ULL); vk_buffer_range_upload(&rf->buffer, beamformer_shared_memory_scratch_arena(sm, ctx->shared_memory_size).beg, slot * rf->active_rf_size, rf->active_rf_size, 1); @@ -1329,19 +1345,17 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload) beamformer_shared_memory_release_lock(ctx->shared_memory, (i32)scratch_lock); post_sync_barrier(ctx->shared_memory, upload_lock); - if (vk_buffer_needs_sync(&rf->buffer)) { - // TODO(rnp): vk_buffer_sync - } - - atomic_store_u64(rf->uploaded_data_indices + slot, rf->insertion_index); - atomic_store_u64(rf->compute_syncs + slot, 0); + rf->insertion_index++; + atomic_store_u64(rf->upload_complete_values + slot, vk_host_signal_timeline(VulkanTimeline_Transfer)); os_wake_all_waiters(ctx->compute_worker_sync); - ComputeTimingInfo info = {.kind = ComputeTimingInfoKind_RF_Data}; - glGetQueryObjectui64v(rf->data_timestamp_query, GL_QUERY_RESULT, &info.timer_count); - glQueryCounter(rf->data_timestamp_query, GL_TIMESTAMP); - push_compute_timing_info(ctx->compute_timing_table, info); + u64 current_time = os_timer_count(); + push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){ + .kind = ComputeTimingInfoKind_RF_Data, + .timer_count = current_time - rf->timestamp, + }); + rf->timestamp = current_time; } } @@ -1373,33 +1387,49 @@ beamformer_process_input_events(BeamformerCtx *ctx, BeamformerInput *input, case BeamformerInputEventKind_ExecutableReload:{ ui_init(ctx, ctx->ui_backing_store); + if (!vk_pipeline_valid(ctx->compute_context.compute_internal_pipelines[0])) { + for EachElement(ctx->compute_context.compute_internal_pipelines, it) { + beamformer_reload_compute_pipeline(ctx->compute_context.compute_internal_pipelines + it, + BeamformerShaderKind_ComputeInternalFirst + it, 0, + ctx->arena); + } 
+ } + #if BEAMFORMER_RENDERDOC_HOOKS - start_frame_capture = input->renderdoc_start_frame_capture; - end_frame_capture = input->renderdoc_end_frame_capture; + start_frame_capture = input->renderdoc_start_frame_capture; + end_frame_capture = input->renderdoc_end_frame_capture; + set_capture_path_template = input->renderdoc_set_capture_file_path_template; #endif }break; case BeamformerInputEventKind_FileEvent:{ BeamformerFileReloadContext *frc = event->file_watch_user_context; switch (frc->kind) { - case BeamformerFileReloadKind_Shader:{ - BeamformerShaderReloadContext *src = frc->shader_reload_context; - BeamformerShaderKind kind = beamformer_reloadable_shader_kinds[src->reloadable_info_index]; - beamformer_reload_shader(ctx, src, ctx->arena, beamformer_shader_names[kind]); + case BeamformerFileReloadKind_ComputeInternalShader:{ + // TODO(rnp): this could stall, better to push it onto compute once queue is better + beamformer_reload_compute_pipeline(frc->shader_reload.pipeline, frc->shader_reload.shader, 0, ctx->arena); }break; + case BeamformerFileReloadKind_ComputeShader:{ for EachElement(ctx->compute_context.compute_plans, block) { BeamformerComputePlan *cp = ctx->compute_context.compute_plans[block]; for (u32 slot = 0; cp && slot < cp->pipeline.shader_count; slot++) { i32 shader_index = beamformer_shader_reloadable_index_by_shader[cp->pipeline.shaders[slot]]; - if (beamformer_reloadable_shader_kinds[shader_index] == frc->compute_shader_kind) + if (beamformer_reloadable_shader_kinds[shader_index] == frc->shader_reload.shader) atomic_or_u32(&cp->dirty_programs, 1 << slot); } } + // TODO(rnp): track latest parameter block if (ctx->latest_frame) - beamformer_queue_compute(ctx, ctx->latest_frame, ctx->latest_frame->parameter_block); + beamformer_queue_compute(ctx, ctx->latest_frame, 0); + }break; + + case BeamformerFileReloadKind_RenderShader:{ + beamformer_reload_render_pipeline(frc->shader_reload.pipeline, frc->shader_reload.shader, ctx->arena); + 
ctx->render_shader_updated = 1; }break; + InvalidDefaultCase; } }break; @@ -1437,5 +1467,5 @@ beamformer_frame_step(BeamformerInput *input) BeamformerViewPlaneTag tag = frame? frame->view_plane_tag : 0; draw_ui(ctx, input, frame, tag); - ctx->frame_view_render_context.updated = 0; + ctx->render_shader_updated = 0; } diff --git a/beamformer_internal.h b/beamformer_internal.h @@ -10,12 +10,8 @@ #include "generated/beamformer.meta.c" #include "generated/beamformer_shaders.c" -#include <raylib_extended.h> -#include <rlgl.h> - -#include "threads.c" -#include "util_gl.c" -#include "util_os.c" +#include "external/raylib/src/raylib.h" +#include "external/raylib/src/rlgl.h" #define beamformer_info(s) s8("[info] " s "\n") @@ -24,16 +20,63 @@ typedef struct { u64 value[1]; } VulkanHandle; typedef enum { - GPUBufferCreateFlags_HostWritable = 1 << 0, - GPUBufferCreateFlags_MemoryOnly = 1 << 1, -} GPUBufferCreateFlags; + VulkanTimeline_Graphics, + VulkanTimeline_Compute, + VulkanTimeline_Transfer, + VulkanTimeline_Count, +} VulkanTimeline; + +typedef enum { + VulkanShaderKind_Vertex, + VulkanShaderKind_Mesh, + VulkanShaderKind_Fragment, + VulkanShaderKind_Compute, + VulkanShaderKind_Count, +} VulkanShaderKind; + +typedef enum { + VulkanImageUsage_None, + VulkanImageUsage_Colour, + VulkanImageUsage_DepthStencil, + VulkanImageUsage_Count, +} VulkanImageUsage; + +typedef enum { + VulkanUsageFlag_ImageSampling = 1 << 0, + VulkanUsageFlag_HostReadWrite = 1 << 1, // NOTE: not valid on images + /* NOTE: uses: + * - image-image copy operations + * - buffer-buffer copy operations + */ + VulkanUsageFlag_TransferSource = 1 << 2, + VulkanUsageFlag_TransferDestination = 1 << 3, +} VulkanUsageFlags; + +typedef struct { + VulkanShaderKind kind; + s8 text; + s8 name; +} VulkanPipelineCreateInfo; typedef struct { + VulkanHandle buffer; u64 gpu_pointer; i64 size; - VulkanHandle buffer; + + // NOTE: only used for render models + u64 index_count; } GPUBuffer; +typedef struct { + VulkanHandle image; 
+ u32 width; + u32 height; + u32 samples; + u32 mip_map_levels; + // TODO(rnp): this is only here for importing from OpenGL, move it back into handle later + u64 memory_size; +} GPUImage; + typedef enum { GPUVendor_AMD = 0x1002, GPUVendor_NVIDIA = 0x10DE, @@ -59,28 +102,94 @@ typedef struct { u64 gpu_heap_used; } GPUInfo; +typedef struct { + i64 size; + VulkanUsageFlags flags; + + // NOTE(rnp): only required if buffer will be used on multiple timelines + VulkanTimeline *timelines_used; + u32 timeline_count; + + s8 label; +} GPUBufferAllocateInfo; + +typedef struct { + GPUBuffer *gpu_buffer; + u64 offset; + u64 size; +} GPUMemoryBarrierInfo; + +typedef struct { + GPUBuffer model; + u32 vertex_count; + u32 normals_offset; +} RenderModel; + +#include "threads.c" +#include "util_os.c" + /////////////////////////// // NOTE: vulkan layer API DEBUG_IMPORT void vk_load(OSLibrary vulkan, Arena *memory, Stream *error); DEBUG_IMPORT GPUInfo *vk_gpu_info(void); -DEBUG_IMPORT void vk_buffer_allocate(GPUBuffer *, iz size, GPUBufferCreateFlags flags, OSHandle *export, s8 label); +DEBUG_IMPORT void vk_buffer_allocate(GPUBuffer *, GPUBufferAllocateInfo *info); DEBUG_IMPORT void vk_buffer_release(GPUBuffer *); DEBUG_IMPORT void vk_buffer_range_upload(GPUBuffer *, void *data, u64 offset, u64 size, b32 non_temporal); +DEBUG_IMPORT void vk_buffer_range_download(void *output, GPUBuffer *, u64 source_offset, u64 size, b32 non_temporal); DEBUG_IMPORT u64 vk_round_up_to_sync_size(u64, u64 min); -/* NOTE: Compute shaders do not have bindings. Data should be passed using push constants. 
+// NOTE: images are 2D only, any other use case should just use a buffer and index in the shader +DEBUG_IMPORT void vk_image_allocate(GPUImage *, u32 width, u32 height, u32 mips, u32 samples, VulkanImageUsage usage, VulkanUsageFlags flags, OSHandle *export); +DEBUG_IMPORT void vk_image_release(GPUImage *); + +DEBUG_IMPORT void vk_render_model_allocate(GPUBuffer *, void *indices, u64 index_count, u64 model_size, s8 label); +DEBUG_IMPORT void vk_render_model_range_upload(GPUBuffer *, void *data, u64 offset, u64 size, b32 non_temporal); +DEBUG_IMPORT void vk_render_model_release(GPUBuffer *); + +/* NOTE: Pipelines do not have bindings. Data should be passed using push constants. * In particular the push constants should contain pointers to gpu memory using the * BufferDeviceAddress extension. */ // TODO(rnp): change this to accept SPIR-V directly and accept BakeParameters as specialization data -DEBUG_IMPORT VulkanHandle vk_compute_shader(s8 text, s8 name); -DEBUG_IMPORT void vk_compute_shader_release(VulkanHandle); +DEBUG_IMPORT VulkanHandle vk_pipeline(VulkanPipelineCreateInfo *infos, u32 count, u32 push_constants_size); +DEBUG_IMPORT b32 vk_pipeline_valid(VulkanHandle); +DEBUG_IMPORT void vk_pipeline_release(VulkanHandle); -// NOTE: temporary API DEBUG_IMPORT b32 vk_buffer_needs_sync(GPUBuffer *); -DEBUG_IMPORT VulkanHandle vk_semaphore_create(OSHandle *export); +DEBUG_IMPORT VulkanHandle vk_create_semaphore(OSHandle *export); + +DEBUG_IMPORT b32 vk_host_wait_timeline(VulkanTimeline timeline, u64 value, u64 timeout_ns); +DEBUG_IMPORT u64 vk_host_signal_timeline(VulkanTimeline timeline); + +DEBUG_IMPORT VulkanHandle vk_command_begin(VulkanTimeline timeline); +DEBUG_IMPORT void vk_command_bind_pipeline(VulkanHandle command, VulkanHandle pipeline); +DEBUG_IMPORT void vk_command_buffer_memory_barriers(VulkanHandle command, GPUMemoryBarrierInfo *barriers, u64 count); +DEBUG_IMPORT void vk_command_dispatch_compute(VulkanHandle command, uv3 dispatch); +DEBUG_IMPORT void 
vk_command_push_constants(VulkanHandle command, u32 offset, u32 size, void *values); +DEBUG_IMPORT void vk_command_timestamp(VulkanHandle command); +DEBUG_IMPORT void vk_command_wait_timeline(VulkanHandle command, VulkanTimeline timeline, u64 value); +// NOTE: extra semaphores only exist for synchronization with OpenGL and will be removed in the future +DEBUG_IMPORT u64 vk_command_end(VulkanHandle command, VulkanHandle wait_semaphore, VulkanHandle finished_semaphore); + +DEBUG_IMPORT void vk_command_begin_rendering(VulkanHandle command, GPUImage *restrict colour, GPUImage *restrict depth, GPUImage *restrict resolve); +DEBUG_IMPORT void vk_command_draw(VulkanHandle command, GPUBuffer *model); +DEBUG_IMPORT void vk_command_scissor(VulkanHandle command, u32 width, u32 height, u32 x_offset, u32 y_offset); +DEBUG_IMPORT void vk_command_viewport(VulkanHandle command, f32 width, f32 height, f32 x_offset, f32 y_offset, f32 min_depth, f32 max_depth); +DEBUG_IMPORT void vk_command_end_rendering(VulkanHandle command); + +DEBUG_IMPORT void vk_command_copy_buffer(VulkanHandle command, GPUBuffer *restrict destination, GPUBuffer *restrict source, u64 source_offset, i64 size); + +// NOTE: returns array of valid timestamps + 1, first element is the count. +// Calling thread may stall until results available. 
+DEBUG_IMPORT u64 * vk_command_read_timestamps(VulkanTimeline timeline, Arena *arena); + +#if BEAMFORMER_RENDERDOC_HOOKS +DEBUG_IMPORT void * vk_renderdoc_instance_handle(void); +#else +#define vk_renderdoc_instance_handle() ((void *)0) +#endif /////////////////////////////// // NOTE: CUDA Library Bindings @@ -119,73 +228,34 @@ CUDALibraryProcedureList ///////////////////////////////////// // NOTE: Core Beamformer Definitions -/* TODO(rnp): this should be a UBO */ -#define FRAME_VIEW_MODEL_MATRIX_LOC 0 -#define FRAME_VIEW_VIEW_MATRIX_LOC 1 -#define FRAME_VIEW_PROJ_MATRIX_LOC 2 -#define FRAME_VIEW_DYNAMIC_RANGE_LOC 3 -#define FRAME_VIEW_THRESHOLD_LOC 4 -#define FRAME_VIEW_GAMMA_LOC 5 -#define FRAME_VIEW_LOG_SCALE_LOC 6 -#define FRAME_VIEW_BB_COLOUR_LOC 7 -#define FRAME_VIEW_BB_FRACTION_LOC 8 -#define FRAME_VIEW_SOLID_BB_LOC 10 - -#define FRAME_VIEW_BB_COLOUR 0.92, 0.88, 0.78, 1.0 -#define FRAME_VIEW_BB_FRACTION 0.007f - -#define FRAME_VIEW_RENDER_TARGET_SIZE 1024, 1024 - -typedef struct { - u32 shader; - u32 framebuffers[2]; /* [0] -> multisample target, [1] -> normal target for resolving */ - u32 renderbuffers[2]; /* only used for 3D views, size is fixed */ - b32 updated; -} FrameViewRenderContext; - #include "beamformer_parameters.h" #include "beamformer_shared_memory.c" typedef struct { - iptr elements_offset; - i32 elements; - u32 buffer; - u32 vao; -} BeamformerRenderModel; - -typedef struct { BeamformerFilterParameters parameters; - f32 time_delay; - i32 length; - u32 ssbo; + f32 time_delay; + i32 length; + GPUBuffer buffer; } BeamformerFilter; -/* TODO(rnp): need 1 UBO per filter slot */ -#define BEAMFORMER_COMPUTE_UBO_LIST \ - X(DAS, BeamformerDASPushConstants, das) - -#define X(k, ...) 
BeamformerComputeUBOKind_##k, -typedef enum {BEAMFORMER_COMPUTE_UBO_LIST BeamformerComputeUBOKind_Count} BeamformerComputeUBOKind; -#undef X - -// X(kind, gl_kind, texture_format, pixel_type) -#define BEAMFORMER_COMPUTE_TEXTURE_LIST \ - X(FocalVectors, GL_RG32F, GL_RG, GL_FLOAT) \ - X(SparseElements, GL_R16I, GL_RED_INTEGER, GL_SHORT) \ - X(TransmitReceiveOrientations, GL_R8I, GL_RED_INTEGER, GL_BYTE) - -#define BEAMFORMER_COMPUTE_TEXTURE_LIST_FULL \ - BEAMFORMER_COMPUTE_TEXTURE_LIST \ - X(Hadamard, GL_R16F) +// X(kind, format, elements) +#define BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST \ + X(Hadamard, f16, BeamformerMaxChannelCount * BeamformerMaxChannelCount) \ + X(FocalVectors, v2, BeamformerMaxChannelCount) \ + X(SparseElements, i16, BeamformerMaxChannelCount) \ + X(TransmitReceiveOrientations, u16, BeamformerMaxChannelCount) \ typedef enum { - #define X(k, ...) BeamformerComputeTextureKind_##k, - BEAMFORMER_COMPUTE_TEXTURE_LIST_FULL + #define X(k, ...) BeamformerComputeArrayParameterKind_##k, + BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST #undef X - BeamformerComputeTextureKind_Count -} BeamformerComputeTextureKind; -static_assert((BeamformerComputeTextureKind_Count - 1) == BeamformerComputeTextureKind_Hadamard, - "BeamformerComputeTextureKind_Hadamard must be end of TextureKinds"); + BeamformerComputeArrayParameterKind_Count +} BeamformerComputeArrayParameterKind; + +// NOTE(rnp): only used to calculate offsets, never used directly +#define X(name, type, elements) alignas(64) type name[elements]; +typedef struct {BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST} BeamformerComputeArrayParameters; +#undef X typedef struct { uv3 layout; @@ -197,7 +267,7 @@ typedef struct BeamformerComputePlan BeamformerComputePlan; struct BeamformerComputePlan { BeamformerComputePipeline pipeline; - u32 programs[BeamformerMaxComputeShaderStages]; + VulkanHandle vulkan_pipelines[BeamformerMaxComputeShaderStages]; u32 dirty_programs; @@ -214,14 +284,15 @@ struct BeamformerComputePlan { iv3 
output_points; i32 average_frames; - u32 textures[BeamformerComputeTextureKind_Count]; - u32 ubos[BeamformerComputeUBOKind_Count]; + // TODO(rnp): specialization constants + v2 xdc_element_pitch; + m4 xdc_transform; + // TODO(rnp): probably just compute this everytime + m4 das_voxel_transform; - BeamformerFilter filters[BeamformerFilterSlots]; + GPUBuffer array_parameters; - #define X(k, type, name) type name ##_ubo_data; - BEAMFORMER_COMPUTE_UBO_LIST - #undef X + BeamformerFilter filters[BeamformerFilterSlots]; u128 shader_hashes[BeamformerMaxComputeShaderStages]; BeamformerShaderDescriptor shader_descriptors[BeamformerMaxComputeShaderStages]; @@ -230,50 +301,20 @@ struct BeamformerComputePlan { }; typedef struct { - // NOTE(rnp): w32 doesn't transfer ownership of these when they are imported - // into the driver. For now just store them here, this code won't be around for long - OSHandle upload_semaphores_handles[BeamformerMaxRawDataFramesInFlight]; - VulkanHandle vk_upload_semaphores[BeamformerMaxRawDataFramesInFlight]; - u32 gl_upload_semaphores[BeamformerMaxRawDataFramesInFlight]; - - GLsync compute_syncs[BeamformerMaxRawDataFramesInFlight]; - - u64 uploaded_data_indices[BeamformerMaxRawDataFramesInFlight]; + u64 upload_complete_values[BeamformerMaxRawDataFramesInFlight]; + u64 compute_complete_values[BeamformerMaxRawDataFramesInFlight]; GPUBuffer buffer; - OSHandle export_handle; - - u32 ssbo, memory_object; u32 active_rf_size; - u32 data_timestamp_query; + + u64 timestamp; u64 insertion_index; u64 compute_index; } BeamformerRFBuffer; typedef struct { - BeamformerRFBuffer rf_buffer; - - BeamformerComputePlan *compute_plans[BeamformerMaxParameterBlocks]; - BeamformerComputePlan *compute_plan_freelist; - - /* NOTE(rnp): two interstage ssbos are allocated so that they may be used to - * ping pong data between compute stages */ - u32 ping_pong_ssbos[2]; - u32 last_output_ssbo_index; - - u32 ping_pong_ssbo_size; - - f32 processing_progress; - b32 
processing_compute; - - u32 shader_timer_ids[BeamformerMaxComputeShaderStages]; - - BeamformerRenderModel unit_cube_model; -} BeamformerComputeContext; - -typedef struct { BeamformerComputeStatsTable table; f32 average_times[BeamformerShaderKind_Count]; @@ -296,7 +337,11 @@ typedef struct { u64 timer_count; ComputeTimingInfoKind kind; union { - BeamformerShaderKind shader; + struct { + static_assert(BeamformerShaderKind_Count <= U16_MAX, ""); + u16 shader; + u16 shader_slot; + }; }; } ComputeTimingInfo; @@ -304,6 +349,10 @@ typedef struct { u32 write_index; u32 read_index; b32 compute_frame_active; + + u32 in_flight_shader_count; + BeamformerShaderKind in_flight_shader_ids[BeamformerMaxComputeShaderStages]; + ComputeTimingInfo buffer[4096]; } ComputeTimingTable; @@ -315,34 +364,57 @@ typedef struct { i32 *compute_worker_sync; } BeamformerUploadThreadContext; -struct BeamformerFrame { - u32 texture; - b32 ready_to_present; - - iv3 dim; - i32 mips; +typedef struct { + u64 buffer_offset; + u64 timeline_valid_value; /* NOTE: for use when displaying either prebeamformed frames or on the current frame * when we intend to recompute on the next frame */ m4 voxel_transform; - // metadata - GLenum gl_kind; + iv3 points; + u32 id; u32 compound_count; - u32 parameter_block; + BeamformerDataKind data_kind; BeamformerAcquisitionKind acquisition_kind; BeamformerViewPlaneTag view_plane_tag; +} BeamformerFrame; - BeamformerFrame *next; -}; +/* NOTE(rnp): backing storage for beamformed frames. The amount of backlog frames +* is dependant on the currently requested output size. 
*/ +typedef struct { + GPUBuffer buffer[1]; + + u64 next_offset; + u64 counter; + + BeamformerFrame frames[BeamformerMaxBacklogFrames]; +} BeamformerFrameBacklog; + +typedef struct { + BeamformerRFBuffer rf_buffer; + + BeamformerComputePlan *compute_plans[BeamformerMaxParameterBlocks]; + BeamformerComputePlan *compute_plan_freelist; + + VulkanHandle compute_internal_pipelines[BeamformerShaderKind_ComputeInternalCount]; + + /* NOTE(rnp): used to ping pong data between compute stages. + * Half the buffer will be used for reading and the other for writing. */ + GPUBuffer ping_pong_buffer; + u32 ping_pong_input_index; + + f32 processing_progress; + b32 processing_compute; + + BeamformerFrameBacklog backlog; +} BeamformerComputeContext; typedef struct { OSThread handle; Arena arena; - iptr window_handle; - iptr gl_context; iptr user_context; i32 sync_variable; b32 awake; @@ -367,26 +439,15 @@ typedef struct { u64 frame_timestamp; - BeamformerComputeContext compute_context; - - /* TODO(rnp): ideally this would go in the UI but its hard to manage with the UI - * destroying itself on hot-reload */ - FrameViewRenderContext frame_view_render_context; - Stream error_stream; - BeamformWorkQueue *beamform_work_queue; - - ComputeShaderStats *compute_shader_stats; - ComputeTimingTable *compute_timing_table; - BeamformerSharedMemory *shared_memory; i64 shared_memory_size; - BeamformerFrame beamform_frames[BeamformerMaxBacklogFrames]; BeamformerFrame *latest_frame; - u32 next_render_frame_index; - u32 display_frame_index; + + // TODO(rnp): track elsewhere + b32 render_shader_updated; /* NOTE: this will only be used when we are averaging */ u32 averaged_frame_index; @@ -394,31 +455,47 @@ typedef struct { GLWorkerThreadContext upload_worker; GLWorkerThreadContext compute_worker; + + BeamformerComputeContext compute_context; + + ComputeShaderStats compute_shader_stats[1]; + ComputeTimingTable compute_timing_table[1]; + + BeamformWorkQueue beamform_work_queue[1]; } BeamformerCtx; 
#define BeamformerContextMemory(m) (BeamformerCtx *)align_pointer_up((m), alignof(BeamformerCtx)); typedef enum { - BeamformerFileReloadKind_Shader, + BeamformerFileReloadKind_ComputeInternalShader, BeamformerFileReloadKind_ComputeShader, + BeamformerFileReloadKind_RenderShader, } BeamformerFileReloadKind; -typedef struct BeamformerShaderReloadContext BeamformerShaderReloadContext; -struct BeamformerShaderReloadContext { - BeamformerShaderReloadContext * link; - s8 header; - GLenum gl_type; - i32 reloadable_info_index; -}; +typedef struct { + BeamformerShaderKind shader; + VulkanHandle * pipeline; +} BeamformerShaderReloadData; + +typedef struct { + BeamformerShaderKind shader; + VulkanShaderKind shader_kind; + + // NOTE(rnp): based on BakeShaders compile time value + s8 filename_or_data; + + BeamformerShaderDescriptor *shader_descriptor; + + uv3 layout; +} BeamformerShaderReloadInfo; typedef struct { BeamformerFileReloadKind kind; union { - BeamformerShaderReloadContext * shader_reload_context; - BeamformerShaderKind compute_shader_kind; + BeamformerShaderReloadData shader_reload; }; } BeamformerFileReloadContext; -#define BEAMFORMER_COMPLETE_COMPUTE_FN(name) void name(iptr user_context, Arena *arena, iptr gl_context) +#define BEAMFORMER_COMPLETE_COMPUTE_FN(name) void name(BeamformerCtx *ctx, Arena *arena) typedef BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute_fn); #define BEAMFORMER_RF_UPLOAD_FN(name) void name(BeamformerUploadThreadContext *ctx) diff --git a/beamformer_parameters.h b/beamformer_parameters.h @@ -10,10 +10,12 @@ */ typedef struct { + uint64_t shader_count; + uint32_t shader_ids[BeamformerMaxComputeShaderStages]; /* NOTE(rnp): this wants to be iterated on both dimensions. it depends entirely on which * visualization method you want to use. 
the coalescing function wants both directions */ - float times[32][BeamformerMaxComputeShaderStages]; - float rf_time_deltas[32]; + float times[32][BeamformerMaxComputeShaderStages]; + float rf_time_deltas[32]; } BeamformerComputeStatsTable; /* X(type, id, pretty name) */ diff --git a/beamformer_shared_memory.c b/beamformer_shared_memory.c @@ -1,7 +1,5 @@ /* See LICENSE for license details. */ -#define BEAMFORMER_SHARED_MEMORY_VERSION (28UL) - -typedef struct BeamformerFrame BeamformerFrame; +#define BEAMFORMER_SHARED_MEMORY_VERSION (29UL) typedef enum { BeamformerWorkKind_Compute, @@ -39,8 +37,7 @@ typedef enum {BEAMFORMER_SHARED_MEMORY_LOCKS BeamformerSharedMemoryLockKind_Coun #undef X typedef struct { - BeamformerFrame *frame; - u32 parameter_block; + u32 parameter_block; } BeamformerComputeWorkContext; typedef struct { @@ -161,6 +158,8 @@ typedef struct { /* TODO(rnp): this is really sucky. we need a better way to communicate this */ u64 rf_block_rf_size; + u64 max_beamformed_data_size; + BeamformerLiveImagingParameters live_imaging_parameters; BeamformerLiveImagingDirtyFlags live_imaging_dirty_flags; diff --git a/build.c b/build.c @@ -741,9 +741,8 @@ build_raylib(Arena a) { b32 result = 1, shared = config.debug; char *libraylib = shared ? 
OS_SHARED_LINK_LIB("raylib") : OUTPUT_LIB(OS_STATIC_LIB("raylib")); - if (needs_rebuild(libraylib, "external/include/rlgl.h", "external/raylib")) { + if (needs_rebuild(libraylib, "external/raylib")) { git_submodule_update(a, "external/raylib"); - os_copy_file("external/raylib/src/rlgl.h", "external/include/rlgl.h"); CommandList cc = {0}; cmd_base(&a, &cc, 0, config.debug); @@ -752,16 +751,17 @@ build_raylib(Arena a) if (!is_msvc) cmd_append(&a, &cc, "-Wno-unused-but-set-variable"); cmd_append(&a, &cc, "-Iexternal/include", "-Iexternal/raylib/src", "-Iexternal/raylib/src/external/glfw/include"); #define RAYLIB_SOURCES \ + X(rcore) \ X(rglfw) \ X(rshapes) \ X(rtext) \ X(rtextures) \ X(utils) #define X(name) "external/raylib/src/" #name ".c", - char *srcs[] = {"external/rcore_extended.c", RAYLIB_SOURCES}; + char *srcs[] = {RAYLIB_SOURCES}; #undef X #define X(name) OUTPUT(OBJECT(#name)), - char *outs[] = {OUTPUT(OBJECT("rcore_extended")), RAYLIB_SOURCES}; + char *outs[] = {RAYLIB_SOURCES}; #undef X if (shared) { @@ -1045,9 +1045,11 @@ meta_end_and_write_matlab(MetaprogramContext *m, char *path) X(EndScope) \ X(Enumeration) \ X(Expand) \ + X(FragmentShader) \ X(Library) \ X(MATLAB) \ X(PushConstants) \ + X(RenderShader) \ X(Shader) \ X(ShaderAlias) \ X(ShaderGroup) \ @@ -1055,6 +1057,7 @@ meta_end_and_write_matlab(MetaprogramContext *m, char *path) X(Struct) \ X(Table) \ X(Union) \ + X(VertexShader) \ typedef enum { #define X(k, ...) 
MetaEntryKind_## k, @@ -1080,14 +1083,14 @@ typedef enum { } MetaEmitLang; #define META_KIND_LIST \ - X(M4, m4, mat4, float, single, 64, 16) \ - X(V4, v4, vec4, float, single, 16, 4) \ - X(SV4, iv4, ivec4, int32_t, int32, 16, 4) \ - X(UV4, uv4, uvec4, uint32_t, uint32, 16, 4) \ - X(UV2, uv2, uvec2, uint32_t, uint32, 8, 2) \ - X(V3, v3, vec3, float, single, 12, 3) \ - X(V2, v2, vec2, float, single, 8, 2) \ - X(F32, f32, float, float, single, 4, 1) \ + X(M4, m4, f32mat4, float, single, 64, 16) \ + X(V4, v4, f32vec4, float, single, 16, 4) \ + X(SV4, iv4, i32vec4, int32_t, int32, 16, 4) \ + X(UV4, uv4, u32vec4, uint32_t, uint32, 16, 4) \ + X(UV2, uv2, u32vec2, uint32_t, uint32, 8, 2) \ + X(V3, v3, f32vec3, float, single, 12, 3) \ + X(V2, v2, f32vec2, float, single, 8, 2) \ + X(F32, f32, float32_t, float, single, 4, 1) \ X(S32, i32, int32_t, int32_t, int32, 4, 1) \ X(S16, i16, int16_t, int16_t, int16, 2, 1) \ X(S8, i8, int8_t, int8_t, int8, 1, 1) \ @@ -1749,14 +1752,28 @@ typedef struct { typedef enum { MetaShaderKind_Alias, MetaShaderKind_Compute, + MetaShaderKind_Render, MetaShaderKind_Count, } MetaShaderKind; +typedef enum { + MetaShaderPrimitiveKind_Mesh, + MetaShaderPrimitiveKind_Vertex, + MetaShaderPrimitiveKind_Count, +} MetaShaderPrimitiveKind; + +typedef struct { + MetaShaderPrimitiveKind kind; +} MetaRenderShader; + typedef struct { MetaShaderKind kind; MetaIDList entity_reference_ids; - s8 file; - MetaEntityID alias_parent_id; + s8 files[2]; + union { + MetaEntityID alias_parent_id; + MetaRenderShader render; + }; } MetaShader; #define META_STRUCT_FIELDS \ @@ -2320,6 +2337,44 @@ meta_pack_shader_common(MetaContext *ctx, MetaEntityID shader_id, MetaEntry *e, } function i64 +meta_pack_render_shader(MetaContext *ctx, MetaEntry *entries, i64 entry_count, MetaEntityID group_entity_id) +{ + assert(entries[0].kind == MetaEntryKind_RenderShader); + + MetaEntityID entity_id = meta_intern_entity(ctx, entries->name, MetaEntityKind_Shader, + group_entity_id, 
entries->location, 0); + meta_entity(ctx, entity_id)->shader.kind = MetaShaderKind_Render; + + meta_entry_argument_expected(entries); + + MetaEntryScope scope = meta_entry_extract_scope(entries, entry_count); + if (scope.consumed > 1) { + for (MetaEntry *e = scope.start; e < scope.one_past_last; e++) { + switch (e->kind) { + + case MetaEntryKind_VertexShader:{ + if (meta_entity(ctx, entity_id)->shader.files[0].len) + meta_entry_error(e, "primitive shader file redefined\n"); + meta_entity(ctx, entity_id)->shader.files[0] = meta_entry_argument_expect(e, 0, MetaEntryArgumentKind_String).string; + meta_entity(ctx, entity_id)->shader.render.kind = MetaShaderPrimitiveKind_Vertex; + }break; + + case MetaEntryKind_FragmentShader:{ + if (meta_entity(ctx, entity_id)->shader.files[1].len) + meta_entry_error(e, "fragment shader file redefined\n"); + meta_entity(ctx, entity_id)->shader.files[1] = meta_entry_argument_expect(e, 0, MetaEntryArgumentKind_String).string; + }break; + + default:{ + e += meta_pack_shader_common(ctx, entity_id, e, scope.one_past_last - e, group_entity_id); + }break; + } + } + } + return scope.consumed; +} + +function i64 meta_pack_compute_shader(MetaContext *ctx, MetaEntry *entries, i64 entry_count, MetaEntityID group_entity_id) { assert(entries[0].kind == MetaEntryKind_Shader); @@ -2332,7 +2387,7 @@ meta_pack_compute_shader(MetaContext *ctx, MetaEntry *entries, i64 entry_count, meta_entry_argument_expected(entries, s8("[file_name]")); } else if (entries->argument_count == 1) { s8 shader_file = meta_entry_argument_expect(entries, 0, MetaEntryArgumentKind_String).string; - meta_entity(ctx, entity_id)->shader.file = shader_file; + meta_entity(ctx, entity_id)->shader.files[0] = shader_file; } MetaEntryScope scope = meta_entry_extract_scope(entries, entry_count); @@ -2360,6 +2415,9 @@ meta_pack_shader_group(MetaContext *ctx, MetaEntry *entries, i64 entry_count) if (scope.consumed > 1) { for (MetaEntry *e = scope.start; e < scope.one_past_last; e++) { switch 
(e->kind) { + case MetaEntryKind_RenderShader:{ + e += meta_pack_render_shader(ctx, e, scope.one_past_last - e, entity_id); + }break; case MetaEntryKind_Shader:{ e += meta_pack_compute_shader(ctx, e, scope.one_past_last - e, entity_id); }break; @@ -3480,12 +3538,15 @@ meta_push_shader_reload_info(MetaprogramContext *m, MetaContext *ctx) } } meta_end_scope(m, s8("};\n")); - meta_begin_scope(m, s8("read_only global s8 " META_NAMESPACE_LOWER "_reloadable_shader_files[] = {")); + meta_begin_scope(m, s8("read_only global s8 *" META_NAMESPACE_LOWER "_reloadable_shader_files[] = {")); { for (da_count shader = 0; shader < ctx->base_shader_count; shader++) { da_count id = ctx->base_shader_ids[shader]; MetaShader *s = &ctx->entities.data[id].shader; - meta_push_line(m, s8("s8_comp(\""), s->file, s8("\"),")); + meta_begin_line(m, s8("(s8 []){s8_comp(\""), s->files[0], s8("\")")); + if (s->files[1].len) + meta_push(m, s8(", s8_comp(\""), s->files[1], s8("\")")); + meta_end_line(m, s8("},")); } } meta_end_scope(m, s8("};\n")); @@ -3558,7 +3619,7 @@ meta_push_shader_reload_info(MetaprogramContext *m, MetaContext *ctx) .element_count_style = MetaPushStructStyle_C, .base_types = meta_kind_glsl_types, .prefix = str8("\" "), - .suffix = str8("\\n\""), + .suffix = str8(";\\n\""), }); meta_push_line(m, s8("\"};\\n\"")); meta_push_line(m, s8("\"\\n\"),")); @@ -3566,7 +3627,7 @@ meta_push_shader_reload_info(MetaprogramContext *m, MetaContext *ctx) case MetaEntityKind_PushConstants:{ meta_push_line(m, s8("s8_comp(\"\"")); - meta_push_line(m, s8("\"layout(std140, binding = 0) uniform PushConstants {\\n\"")); + meta_push_line(m, s8("\"layout(push_constant, std430) uniform PushConstants {\\n\"")); meta_push_struct_body(ctx, m, e, (MetaPushStructParameters){ .layout_style = MetaPushStructStyle_C, .union_style = MetaPushStructStyle_C, @@ -3593,6 +3654,21 @@ meta_push_shader_reload_info(MetaprogramContext *m, MetaContext *ctx) m->scratch = ctx->scratch; } } meta_end_scope(m, s8("};\n")); + + 
meta_begin_scope(m, s8("read_only global b8 " META_NAMESPACE_LOWER "_shader_has_primitive[] = {")); + for (da_count bs = 0; bs < ctx->base_shader_count; bs++) { + MetaShader *s = &ctx->entities.data[ctx->base_shader_ids[bs]].shader; + meta_push_line(m, s->kind == MetaShaderKind_Render ? s8("1,") : s8("0,")); + } + meta_end_scope(m, s8("};\n")); + + meta_begin_scope(m, s8("read_only global b8 " META_NAMESPACE_LOWER "_shader_primitive_is_vertex[] = {")); + for (da_count bs = 0; bs < ctx->base_shader_count; bs++) { + MetaShader *s = &ctx->entities.data[ctx->base_shader_ids[bs]].shader; + b8 vertex = s->kind == MetaShaderKind_Render && s->render.kind == MetaShaderPrimitiveKind_Vertex; + meta_push_line(m, vertex ? s8("1,") : s8("0,")); + } + meta_end_scope(m, s8("};\n")); } function void @@ -3603,30 +3679,67 @@ meta_push_shader_bake(MetaprogramContext *m, MetaContext *ctx) s8 shader_name = ctx->entity_names.data[ctx->base_shader_ids[bs]]; - meta_begin_line(m, s8("read_only global u8 " META_NAMESPACE_LOWER "_shader_")); - for (i64 i = 0; i < shader_name.len; i++) - stream_append_byte(&m->stream, ToLower(shader_name.data[i])); - - meta_begin_scope(m, s8("_bytes[] = {")); { - Arena scratch = m->scratch; - s8 filename = push_s8_from_parts(&scratch, s8(OS_PATH_SEPARATOR), s8("shaders"), s->file); - s8 file = read_entire_file((c8 *)filename.data, &scratch); - metagen_push_byte_array(m, file); - } meta_end_scope(m, s8("};\n")); + for EachElement(s->files, it) { + if (s->files[it].len > 0) { + meta_begin_line(m, s8("read_only global u8 " META_NAMESPACE_LOWER "_shader_")); + for (i64 i = 0; i < shader_name.len; i++) + stream_append_byte(&m->stream, ToLower(shader_name.data[i])); + + if (s->kind == MetaShaderKind_Render) + meta_push(m, it == 0 ? 
s8("_primitive") : s8("_fragment")); + + meta_begin_scope(m, s8("_bytes[] = {")); { + Arena scratch = m->scratch; + s8 filename = push_s8_from_parts(&scratch, s8(OS_PATH_SEPARATOR), s8("shaders"), s->files[it]); + s8 file = read_entire_file((c8 *)filename.data, &scratch); + metagen_push_byte_array(m, file); + } meta_end_scope(m, s8("};\n")); + } + } } - meta_begin_scope(m, s8("read_only global s8 " META_NAMESPACE_LOWER "_shader_data[] = {")); { + meta_begin_scope(m, s8("read_only global s8 *" META_NAMESPACE_LOWER "_shader_data[] = {")); { for (da_count bs = 0; bs < ctx->base_shader_count; bs++) { + MetaShader *s = &ctx->entities.data[ctx->base_shader_ids[bs]].shader; + s8 shader_name = ctx->entity_names.data[ctx->base_shader_ids[bs]]; - meta_begin_line(m, s8("{.data = " META_NAMESPACE_LOWER "_shader_")); - for (iz i = 0; i < shader_name.len; i++) + if (s->kind == MetaShaderKind_Render) { + meta_begin_scope(m, s8("(s8 []){")); + meta_indent(m); + } else { + meta_begin_line(m, s8("(s8 []){")); + } + + meta_push(m, s8("{.data = " META_NAMESPACE_LOWER "_shader_")); + for (i64 i = 0; i < shader_name.len; i++) stream_append_byte(&m->stream, ToLower(shader_name.data[i])); + if (s->kind == MetaShaderKind_Render) + meta_push(m, s8("_primitive")); + meta_push(m, s8("_bytes, .len = countof(" META_NAMESPACE_LOWER "_shader_")); - for (iz i = 0; i < shader_name.len; i++) + for (i64 i = 0; i < shader_name.len; i++) stream_append_byte(&m->stream, ToLower(shader_name.data[i])); - meta_end_line(m, s8("_bytes)},")); + + if (s->kind == MetaShaderKind_Render) + meta_push(m, s8("_primitive")); + meta_push(m, s8("_bytes)}")); + + if (s->kind == MetaShaderKind_Render) { + meta_end_line(m, s8(",")); + meta_begin_line(m, s8("{.data = " META_NAMESPACE_LOWER "_shader_")); + for (i64 i = 0; i < shader_name.len; i++) + stream_append_byte(&m->stream, ToLower(shader_name.data[i])); + + meta_push(m, s8("_fragment_bytes, .len = countof(" META_NAMESPACE_LOWER "_shader_")); + for (i64 i = 0; i < 
shader_name.len; i++) + stream_append_byte(&m->stream, ToLower(shader_name.data[i])); + meta_end_line(m, s8("_fragment_bytes)}")); + } + + if (s->kind == MetaShaderKind_Render) meta_end_scope(m, s8("},")); + else meta_end_line(m, s8("},")); } } meta_end_scope(m, s8("};\n")); } @@ -3662,7 +3775,9 @@ metagen_emit_c_code(MetaContext *ctx, Arena arena) u32 dep_count = 0; for (da_count bs = 0; bs < ctx->base_shader_count; bs++) { MetaShader *s = &ctx->entities.data[ctx->base_shader_ids[bs]].shader; - deps[dep_count++] = (c8 *)push_s8_from_parts(&m->scratch, s8(OS_PATH_SEPARATOR), s8("shaders"), s->file).data; + deps[dep_count++] = (c8 *)push_s8_from_parts(&m->scratch, s8(OS_PATH_SEPARATOR), s8("shaders"), s->files[0]).data; + if (s->files[1].len > 0) + deps[dep_count++] = (c8 *)push_s8_from_parts(&m->scratch, s8(OS_PATH_SEPARATOR), s8("shaders"), s->files[1]).data; } if (needs_rebuild_(out_shaders, deps, dep_count)) { build_log_generate("Bake Shaders"); @@ -3939,6 +4054,19 @@ metagen_emit_c_code(MetaContext *ctx, Arena arena) } } meta_end_scope(m, s8("};\n")); + meta_begin_scope(m, s8("read_only global u8 " META_NAMESPACE_LOWER "_shader_push_constant_sizes[] = {")); + for (da_count bs = 0; bs < ctx->base_shader_count; bs++) { + da_count id = ctx->base_shader_ids[bs]; + MetaEntity *e = ctx->entities.data + id; + MetaEntityID pc_id = meta_entity_first_child_of_kind(ctx, e, MetaEntityKind_PushConstants); + if (pc_id.value != 0) { + meta_push_line(m, s8("sizeof(" META_NAMESPACE_UPPER), ctx->entity_names.data[id], s8("PushConstants),")); + } else { + meta_push_line(m, s8("0,")); + } + } + meta_end_scope(m, s8("};\n")); + //fprintf(stderr, "%.*s\n", (i32)m.stream.widx, m.stream.data); result = meta_write_and_reset(m, out_meta); @@ -4734,7 +4862,7 @@ metagen_load_context(Arena *arena, char *filename) { for (da_count shader = 0; shader < ctx->entity_kind_counts[MetaEntityKind_Shader]; shader++) { MetaEntity *e = ctx->entities.data + 
ctx->entity_kind_ids[MetaEntityKind_Shader][shader]; - if (e->shader.file.len > 0) + if (e->shader.files[0].len > 0) ctx->base_shader_count++; } @@ -4744,14 +4872,14 @@ metagen_load_context(Arena *arena, char *filename) da_count base_shader_ids_index = 0; for (da_count shader = 0; shader < ctx->entity_kind_counts[MetaEntityKind_Shader]; shader++) { da_count id = ctx->entity_kind_ids[MetaEntityKind_Shader][shader]; - if (ctx->entities.data[id].shader.file.len > 0) + if (ctx->entities.data[id].shader.files[0].len > 0) ctx->base_shader_ids[base_shader_ids_index++] = id; } // NOTE(rnp): first pass to resolve real shaders for (da_count shader = 0; shader < ctx->entity_kind_counts[MetaEntityKind_Shader]; shader++) { da_count id = ctx->entity_kind_ids[MetaEntityKind_Shader][shader]; - if (ctx->entities.data[id].shader.file.len > 0) { + if (ctx->entities.data[id].shader.files[0].len > 0) { ctx->base_shader_id_map[shader] = meta_lookup_id_slow(ctx->base_shader_ids, ctx->base_shader_count, id); diff --git a/external/include/raylib_extended.h b/external/include/raylib_extended.h @@ -1,2 +0,0 @@ -#include "../raylib/src/raylib.h" -RLAPI void *GetPlatformWindowHandle(void); diff --git a/external/rcore_extended.c b/external/rcore_extended.c @@ -1,8 +0,0 @@ -/* NOTE(rnp): hacky stuff to work around broken raylib garbage */ -#include <raylib_extended.h> -#include "raylib/src/rcore.c" - -void *GetPlatformWindowHandle(void) -{ - return (void *)platform.handle; -} diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c @@ -4,7 +4,7 @@ // NOTE: Constants (Integer) #define BeamformerFilterSlots (4) -#define BeamformerMaxBacklogFrames (16) +#define BeamformerMaxBacklogFrames (4096) #define BeamformerMaxChannelCount (256) #define BeamformerMaxEmissionsCount (256) #define BeamformerMaxComputeShaderStages (16) @@ -84,23 +84,31 @@ typedef enum { } BeamformerAcquisitionKind; typedef enum { - BeamformerShaderKind_CudaDecode = 0, - BeamformerShaderKind_CudaHilbert = 1, - 
BeamformerShaderKind_Decode = 2, - BeamformerShaderKind_Filter = 3, - BeamformerShaderKind_Demodulate = 4, - BeamformerShaderKind_DAS = 5, - BeamformerShaderKind_MinMax = 6, - BeamformerShaderKind_Sum = 7, - BeamformerShaderKind_Render3D = 8, + BeamformerShaderKind_CudaDecode = 0, + BeamformerShaderKind_CudaHilbert = 1, + BeamformerShaderKind_Decode = 2, + BeamformerShaderKind_Filter = 3, + BeamformerShaderKind_Demodulate = 4, + BeamformerShaderKind_DAS = 5, + BeamformerShaderKind_Sum = 6, + BeamformerShaderKind_MinMax = 7, + BeamformerShaderKind_CoherencyWeighting = 8, + BeamformerShaderKind_BufferClear = 9, + BeamformerShaderKind_RenderBeamformed = 10, BeamformerShaderKind_Count, - BeamformerShaderKind_ComputeFirst = BeamformerShaderKind_CudaDecode, - BeamformerShaderKind_ComputeLast = BeamformerShaderKind_Sum, - BeamformerShaderKind_ComputeCount = 8, - BeamformerShaderKind_RenderFirst = BeamformerShaderKind_Render3D, - BeamformerShaderKind_RenderLast = BeamformerShaderKind_Render3D, - BeamformerShaderKind_RenderCount = 1, + BeamformerShaderKind_ComputeFirst = BeamformerShaderKind_CudaDecode, + BeamformerShaderKind_ComputeLast = BeamformerShaderKind_MinMax, + BeamformerShaderKind_ComputeCount = 8, + BeamformerShaderKind_ComputeHelpersFirst = BeamformerShaderKind_CoherencyWeighting, + BeamformerShaderKind_ComputeHelpersLast = BeamformerShaderKind_CoherencyWeighting, + BeamformerShaderKind_ComputeHelpersCount = 1, + BeamformerShaderKind_ComputeInternalFirst = BeamformerShaderKind_BufferClear, + BeamformerShaderKind_ComputeInternalLast = BeamformerShaderKind_BufferClear, + BeamformerShaderKind_ComputeInternalCount = 1, + BeamformerShaderKind_RenderFirst = BeamformerShaderKind_RenderBeamformed, + BeamformerShaderKind_RenderLast = BeamformerShaderKind_RenderBeamformed, + BeamformerShaderKind_RenderCount = 1, } BeamformerShaderKind; typedef struct { @@ -141,7 +149,6 @@ typedef struct { u32 coherency_weighting; u32 single_focus; u32 single_orientation; - u32 fast; u32 
sparse; u32 acquisition_count; u32 acquisition_kind; @@ -159,12 +166,78 @@ typedef struct { } BeamformerDASBakeParameters; typedef struct { - m4 xdc_transform; - m4 voxel_transform; - v2 xdc_element_pitch; + u32 data_kind; +} BeamformerCoherencyWeightingBakeParameters; + +typedef struct { + u64 hadamard_buffer; + u64 rf_buffer; + u64 output_buffer; + u64 output_rf_buffer; + b32 first_pass; +} BeamformerDecodePushConstants; + +typedef struct { + u64 input_data; + u64 output_data; + u64 filter_coefficients; +} BeamformerFilterPushConstants; + +typedef struct { + m4 xdc_transform; + m4 voxel_transform; + v2 xdc_element_pitch; + u64 rf_data; + u64 output_data; + u64 incoherent_output; + u64 array_parameters; + u32 output_size_x; + u32 output_size_y; + u32 output_size_z; + u32 cycle_t; + i32 channel_t; } BeamformerDASPushConstants; typedef struct { + u64 output_data; + u64 input_data; + u32 image_elements; + f32 scale; +} BeamformerSumPushConstants; + +typedef struct { + u64 left_side_buffer; + u64 right_side_buffer; + u32 elements; + f32 scale; + u32 output_size_x; + u32 output_size_y; + u32 output_size_z; +} BeamformerCoherencyWeightingPushConstants; + +typedef struct { + u64 data; + u32 clear_word; + u32 words; +} BeamformerBufferClearPushConstants; + +typedef struct { + m4 mvp_matrix; + u64 positions; + u64 normals; + v4 bounding_box_colour; + f32 bounding_box_fraction; + f32 db_cutoff; + f32 threshold; + f32 gamma; + u64 input_data; + u32 input_size_x; + u32 input_size_y; + u32 input_size_z; + u32 data_kind; +} BeamformerRenderBeamformedPushConstants; + +typedef struct { f32 cycles; f32 frequency; } BeamformerSineParameters; @@ -304,10 +377,17 @@ typedef struct { BeamformerDataKind data_kind; } BeamformerSimpleParameters; +typedef struct { + v2 focal_vectors[BeamformerMaxChannelCount]; + i16 sparse_elements[BeamformerMaxChannelCount]; + u16 transmit_receive_orientations[BeamformerMaxChannelCount]; +} BeamformerDASArrayParameters; + typedef union { - 
BeamformerDecodeBakeParameters Decode; - BeamformerFilterBakeParameters Filter; - BeamformerDASBakeParameters DAS; + BeamformerDecodeBakeParameters Decode; + BeamformerFilterBakeParameters Filter; + BeamformerDASBakeParameters DAS; + BeamformerCoherencyWeightingBakeParameters CoherencyWeighting; } BeamformerShaderBakeParameters; read_only global u8 beamformer_data_kind_element_size[] = { @@ -399,27 +479,33 @@ read_only global s8 beamformer_shader_names[] = { s8_comp("Filter"), s8_comp("Demodulate"), s8_comp("DAS"), - s8_comp("MinMax"), s8_comp("Sum"), - s8_comp("Render3D"), + s8_comp("MinMax"), + s8_comp("CoherencyWeighting"), + s8_comp("BufferClear"), + s8_comp("RenderBeamformed"), }; read_only global BeamformerShaderKind beamformer_reloadable_shader_kinds[] = { BeamformerShaderKind_Decode, BeamformerShaderKind_Filter, BeamformerShaderKind_DAS, - BeamformerShaderKind_MinMax, BeamformerShaderKind_Sum, - BeamformerShaderKind_Render3D, + BeamformerShaderKind_MinMax, + BeamformerShaderKind_CoherencyWeighting, + BeamformerShaderKind_BufferClear, + BeamformerShaderKind_RenderBeamformed, }; -read_only global s8 beamformer_reloadable_shader_files[] = { - s8_comp("decode.glsl"), - s8_comp("filter.glsl"), - s8_comp("das.glsl"), - s8_comp("min_max.glsl"), - s8_comp("sum.glsl"), - s8_comp("render_3d.frag.glsl"), +read_only global s8 *beamformer_reloadable_shader_files[] = { + (s8 []){s8_comp("decode.glsl")}, + (s8 []){s8_comp("filter.glsl")}, + (s8 []){s8_comp("das.glsl")}, + (s8 []){s8_comp("sum.glsl")}, + (s8 []){s8_comp("min_max.glsl")}, + (s8 []){s8_comp("coherency_weighting.glsl")}, + (s8 []){s8_comp("buffer_clear.glsl")}, + (s8 []){s8_comp("render_3d.vert.glsl"), s8_comp("render_3d.frag.glsl")}, }; read_only global i32 beamformer_shader_reloadable_index_by_shader[] = { @@ -432,6 +518,8 @@ read_only global i32 beamformer_shader_reloadable_index_by_shader[] = { 3, 4, 5, + 6, + 7, }; read_only global i32 beamformer_reloadable_compute_shader_info_indices[] = { @@ -442,10 
+530,18 @@ read_only global i32 beamformer_reloadable_compute_shader_info_indices[] = { 4, }; -read_only global i32 beamformer_reloadable_render_shader_info_indices[] = { +read_only global i32 beamformer_reloadable_compute_helpers_shader_info_indices[] = { 5, }; +read_only global i32 beamformer_reloadable_compute_internal_shader_info_indices[] = { + 6, +}; + +read_only global i32 beamformer_reloadable_render_shader_info_indices[] = { + 7, +}; + read_only global s8 beamformer_shader_global_header_strings[] = { s8_comp("" "#define DataKind_Int16 0\n" @@ -460,6 +556,23 @@ read_only global s8 beamformer_shader_global_header_strings[] = { "#define DecodeMode_Hadamard 1\n" "\n"), s8_comp("" + "layout(push_constant, std430) uniform PushConstants {\n" + " uint64_t hadamard_buffer;\n" + " uint64_t rf_buffer;\n" + " uint64_t output_buffer;\n" + " uint64_t output_rf_buffer;\n" + " bool first_pass;\n" + "};\n" + "\n"), + s8_comp("" + "layout(push_constant, std430) uniform PushConstants {\n" + " uint64_t input_data;\n" + " uint64_t output_data;\n" + " uint64_t filter_coefficients;\n" + "};\n" + "\n"), + s8_comp("#define MaxChannelCount (256)\n\n"), + s8_comp("" "#define AcquisitionKind_FORCES 0\n" "#define AcquisitionKind_UFORCES 1\n" "#define AcquisitionKind_HERCULES 2\n" @@ -484,30 +597,115 @@ read_only global s8 beamformer_shader_global_header_strings[] = { "#define RCAOrientation_Columns 2\n" "\n"), s8_comp("" - "layout(std140, binding = 0) uniform PushConstants {\n" - " mat4 xdc_transform;\n" - " mat4 voxel_transform;\n" - " vec2 xdc_element_pitch;\n" + "struct DASArrayParameters {\n" + " f32vec2 focal_vectors[MaxChannelCount];\n" + " int16_t sparse_elements[MaxChannelCount];\n" + " uint16_t transmit_receive_orientations[MaxChannelCount];\n" + "};\n" + "\n"), + s8_comp("" + "layout(push_constant, std430) uniform PushConstants {\n" + " f32mat4 xdc_transform;\n" + " f32mat4 voxel_transform;\n" + " f32vec2 xdc_element_pitch;\n" + " uint64_t rf_data;\n" + " uint64_t 
output_data;\n" + " uint64_t incoherent_output;\n" + " uint64_t array_parameters;\n" + " uint32_t output_size_x;\n" + " uint32_t output_size_y;\n" + " uint32_t output_size_z;\n" + " uint32_t cycle_t;\n" + " int32_t channel_t;\n" + "};\n" + "\n"), + s8_comp("" + "layout(push_constant, std430) uniform PushConstants {\n" + " uint64_t output_data;\n" + " uint64_t input_data;\n" + " uint32_t image_elements;\n" + " float32_t scale;\n" + "};\n" + "\n"), + s8_comp("" + "layout(push_constant, std430) uniform PushConstants {\n" + " uint64_t left_side_buffer;\n" + " uint64_t right_side_buffer;\n" + " uint32_t elements;\n" + " float32_t scale;\n" + " uint32_t output_size_x;\n" + " uint32_t output_size_y;\n" + " uint32_t output_size_z;\n" + "};\n" + "\n"), + s8_comp("" + "layout(push_constant, std430) uniform PushConstants {\n" + " uint64_t data;\n" + " uint32_t clear_word;\n" + " uint32_t words;\n" + "};\n" + "\n"), + s8_comp("" + "layout(push_constant, std430) uniform PushConstants {\n" + " f32mat4 mvp_matrix;\n" + " uint64_t positions;\n" + " uint64_t normals;\n" + " f32vec4 bounding_box_colour;\n" + " float32_t bounding_box_fraction;\n" + " float32_t db_cutoff;\n" + " float32_t threshold;\n" + " float32_t gamma;\n" + " uint64_t input_data;\n" + " uint32_t input_size_x;\n" + " uint32_t input_size_y;\n" + " uint32_t input_size_z;\n" + " uint32_t data_kind;\n" "};\n" "\n"), }; -read_only global i32 *beamformer_shader_header_vectors[] = { - (i32 []){0, 1}, - (i32 []){0}, - (i32 []){2, 0, 3, 4, 5}, +read_only global b8 beamformer_shader_has_primitive[] = { + 0, 0, 0, 0, + 0, + 0, + 0, + 1, }; -read_only global i32 beamformer_shader_header_vector_lengths[] = { - 2, - 1, - 5, +read_only global b8 beamformer_shader_primitive_is_vertex[] = { + 0, + 0, + 0, 0, 0, 0, + 0, + 1, +}; + +read_only global i32 *beamformer_shader_header_vectors[] = { + (i32 []){0, 1, 2}, + (i32 []){0, 3}, + (i32 []){4, 5, 0, 6, 7, 8, 9}, + (i32 []){0, 10}, + 0, + (i32 []){0, 11}, + (i32 []){12}, + (i32 
[]){0, 13}, +}; + +read_only global i32 beamformer_shader_header_vector_lengths[] = { + 3, + 2, + 7, + 2, + 0, + 2, + 1, + 2, }; read_only global s8 *beamformer_shader_bake_parameter_names[] = { @@ -547,7 +745,6 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = { s8_comp("CoherencyWeighting"), s8_comp("SingleFocus"), s8_comp("SingleOrientation"), - s8_comp("Fast"), s8_comp("Sparse"), s8_comp("AcquisitionCount"), s8_comp("AcquisitionKind"), @@ -565,13 +762,19 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = { }, 0, 0, + (s8 []){ + s8_comp("DataKind"), + }, + 0, 0, }; read_only global u32 beamformer_shader_bake_parameter_float_bits[] = { 0x00000000UL, 0x00006000UL, - 0x0007f000UL, + 0x0003f800UL, + 0x00000000UL, + 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, @@ -580,9 +783,22 @@ read_only global u32 beamformer_shader_bake_parameter_float_bits[] = { read_only global u8 beamformer_shader_bake_parameter_counts[] = { 12, 15, - 19, + 18, 0, 0, + 1, + 0, + 0, +}; + +read_only global u8 beamformer_shader_push_constant_sizes[] = { + sizeof(BeamformerDecodePushConstants), + sizeof(BeamformerFilterPushConstants), + sizeof(BeamformerDASPushConstants), + sizeof(BeamformerSumPushConstants), 0, + sizeof(BeamformerCoherencyWeightingPushConstants), + sizeof(BeamformerBufferClearPushConstants), + sizeof(BeamformerRenderBeamformedPushConstants), }; diff --git a/lib/ogl_beamformer_lib.c b/lib/ogl_beamformer_lib.c @@ -229,6 +229,15 @@ beamformer_get_last_error_string(void) return beamformer_error_string(beamformer_get_last_error()); } +u64 +beamformer_maximum_frame_size(void) +{ + u64 result = U64_MAX; + if (check_shared_memory()) + result = g_beamformer_library_context.bp->max_beamformed_data_size; + return result; +} + void beamformer_set_global_timeout(u32 timeout_ms) { @@ -650,12 +659,14 @@ beamformer_beamform_data(BeamformerSimpleParameters *bp, void *data, uint32_t da complex |= shader == BeamformerShaderKind_Demodulate || shader == 
BeamformerShaderKind_CudaHilbert; } - iz output_size = output_points.x * output_points.y * output_points.z * (i32)sizeof(f32); + u64 output_size = output_points.x * output_points.y * output_points.z * sizeof(f32); if (complex) output_size *= 2; + result = lib_error_check(output_size <= g_beamformer_library_context.bp->max_beamformed_data_size, FrameSizeOverflow); + Arena scratch = beamformer_shared_memory_scratch_arena(g_beamformer_library_context.bp, g_beamformer_library_context.shared_memory_size); - if (out_data) result &= lib_error_check(output_size <= arena_capacity(&scratch, u8), ExportSpaceOverflow); + if (result && out_data) result &= lib_error_check((iz)output_size <= arena_capacity(&scratch, u8), ExportSpaceOverflow); if (result) { result = beamformer_push_data_with_compute(data, data_size, 0, 0); diff --git a/lib/ogl_beamformer_lib_base.h b/lib/ogl_beamformer_lib_base.h @@ -27,6 +27,7 @@ X(ExportSpaceOverflow, 16, "not enough space for data export") \ X(SharedMemory, 17, "failed to open shared memory region") \ X(SyncVariable, 18, "failed to acquire lock within timeout period") \ + X(FrameSizeOverflow, 19, "maximum frame size exceeded") \ #define X(type, num, string) BeamformerLibErrorKind_##type = num, typedef enum {BEAMFORMER_LIB_ERRORS} BeamformerLibErrorKind; @@ -38,6 +39,9 @@ BEAMFORMER_LIB_EXPORT BeamformerLibErrorKind beamformer_get_last_error(void); BEAMFORMER_LIB_EXPORT const char *beamformer_get_last_error_string(void); BEAMFORMER_LIB_EXPORT const char *beamformer_error_string(BeamformerLibErrorKind kind); +// NOTE: returns U64_MAX if shared memory could not be opened +BEAMFORMER_LIB_EXPORT uint64_t beamformer_maximum_frame_size(void); + /////////////////////////// // NOTE: Simple API /* Usage: diff --git a/main_linux.c b/main_linux.c @@ -252,16 +252,7 @@ load_platform_libraries(BeamformerInput *input) #if BEAMFORMER_RENDERDOC_HOOKS local_persist OSLibrary renderdoc_handle = {OSInvalidHandleValue}; renderdoc_handle = 
load_library(OS_RENDERDOC_SONAME, 0, RTLD_NOW|RTLD_LOCAL|RTLD_NOLOAD); - if ValidHandle(renderdoc_handle) { - renderdoc_get_api_fn *get_api = os_lookup_symbol(renderdoc_handle, "RENDERDOC_GetAPI"); - if (get_api) { - RenderDocAPI *api = 0; - if (get_api(10600, (void **)&api)) { - input->renderdoc_start_frame_capture = RENDERDOC_START_FRAME_CAPTURE(api); - input->renderdoc_end_frame_capture = RENDERDOC_END_FRAME_CAPTURE(api); - } - } - } + load_renderdoc_functions(input, renderdoc_handle); #endif } diff --git a/main_w32.c b/main_w32.c @@ -301,16 +301,7 @@ load_platform_libraries(BeamformerInput *input) #if BEAMFORMER_RENDERDOC_HOOKS local_persist OSLibrary renderdoc_handle = {OSInvalidHandleValue}; renderdoc_handle = get_module(OS_RENDERDOC_SONAME); - if ValidHandle(renderdoc_handle) { - renderdoc_get_api_fn *get_api = os_lookup_symbol(renderdoc_handle, "RENDERDOC_GetAPI"); - if (get_api) { - RenderDocAPI *api = 0; - if (get_api(10600, (void **)&api)) { - input->renderdoc_start_frame_capture = RENDERDOC_START_FRAME_CAPTURE(api); - input->renderdoc_end_frame_capture = RENDERDOC_END_FRAME_CAPTURE(api); - } - } - } + load_renderdoc_functions(input, renderdoc_handle); #endif } diff --git a/math.c b/math.c @@ -153,20 +153,6 @@ subrange_n_from_n_m_count(u64 n, u64 n_count, u64 m) return result; } -function b32 -iv2_equal(iv2 a, iv2 b) -{ - b32 result = a.x == b.x && a.y == b.y; - return result; -} - -function b32 -iv3_equal(iv3 a, iv3 b) -{ - b32 result = a.x == b.x && a.y == b.y && a.z == b.z; - return result; -} - function i32 iv3_dimension(iv3 points) { @@ -574,12 +560,12 @@ function m4 perspective_projection(f32 n, f32 f, f32 fov, f32 aspect) { m4 result; - f32 t = tan_f32(fov / 2.0f); + f32 t = n * tan_f32(fov / 2.0f); f32 r = t * aspect; f32 a = -(f + n) / (f - n); f32 b = -2 * f * n / (f - n); - result.c[0] = (v4){{1 / r, 0, 0, 0}}; - result.c[1] = (v4){{0, 1 / t, 0, 0}}; + result.c[0] = (v4){{n / r, 0, 0, 0}}; + result.c[1] = (v4){{0, n / t, 0, 0}}; result.c[2] = 
(v4){{0, 0, a, -1}}; result.c[3] = (v4){{0, 0, b, 0}}; return result; diff --git a/opengl.h b/opengl.h @@ -11,154 +11,63 @@ #include <GL/gl.h> /* NOTE: do not add extra 0s to these, even at the start -> garbage compilers will complain */ -#define GL_DYNAMIC_STORAGE_BIT 0x0100 #define GL_SHADER_IMAGE_ACCESS_BARRIER_BIT 0x00000020 #define GL_TEXTURE_UPDATE_BARRIER_BIT 0x00000100 -#define GL_SHADER_STORAGE_BARRIER_BIT 0x00002000 -#define GL_UNSIGNED_INT_8_8_8_8 0x8035 -#define GL_TEXTURE_3D 0x806F -#define GL_MAX_3D_TEXTURE_SIZE 0x8073 -#define GL_MULTISAMPLE 0x809D +#define GL_NONE 0 + #define GL_CLAMP_TO_BORDER 0x812D -#define GL_CLAMP_TO_EDGE 0x812F -#define GL_DEPTH_COMPONENT24 0x81A6 -#define GL_MAJOR_VERSION 0x821B -#define GL_MINOR_VERSION 0x821C -#define GL_RG 0x8227 -#define GL_R16F 0x822D -#define GL_R32F 0x822E #define GL_RG32F 0x8230 -#define GL_R8I 0x8231 -#define GL_R16I 0x8233 -#define GL_MAX_COMPUTE_SHARED_MEMORY_SIZE 0x8262 -#define GL_BUFFER 0x82E0 -#define GL_PROGRAM 0x82E2 -#define GL_MIRRORED_REPEAT 0x8370 -#define GL_QUERY_RESULT 0x8866 #define GL_READ_ONLY 0x88B8 #define GL_WRITE_ONLY 0x88B9 #define GL_READ_WRITE 0x88BA -#define GL_TIME_ELAPSED 0x88BF -#define GL_STATIC_DRAW 0x88E4 -#define GL_UNIFORM_BUFFER 0x8A11 -#define GL_MAX_UNIFORM_BLOCK_SIZE 0x8A30 -#define GL_FRAGMENT_SHADER 0x8B30 -#define GL_VERTEX_SHADER 0x8B31 -#define GL_COMPILE_STATUS 0x8B81 -#define GL_LINK_STATUS 0x8B82 -#define GL_INFO_LOG_LENGTH 0x8B84 -#define GL_MAX_TEXTURE_BUFFER_SIZE 0x8C2B -#define GL_COLOR_ATTACHMENT0 0x8CE0 -#define GL_DEPTH_ATTACHMENT 0x8D00 -#define GL_FRAMEBUFFER 0x8D40 -#define GL_RENDERBUFFER 0x8D41 -#define GL_RED_INTEGER 0x8D94 -#define GL_TIMESTAMP 0x8E28 -#define GL_MIN_MAP_BUFFER_ALIGNMENT 0x90BC -#define GL_SHADER_STORAGE_BUFFER 0x90D2 -#define GL_MAX_SHADER_STORAGE_BLOCK_SIZE 0x90DE -#define GL_MAX_SERVER_WAIT_TIMEOUT 0x9111 -#define GL_SYNC_GPU_COMMANDS_COMPLETE 0x9117 -#define GL_TIMEOUT_EXPIRED 0x911B -#define GL_WAIT_FAILED 0x911D 
-#define GL_TEXTURE_BUFFER_OFFSET_ALIGNMENT 0x919F -#define GL_COMPUTE_SHADER 0x91B9 #define GL_DEBUG_OUTPUT 0x92E0 +#define GL_DEDICATED_MEMORY_OBJECT_EXT 0x9581 #define GL_HANDLE_TYPE_OPAQUE_FD_EXT 0x9586 #define GL_HANDLE_TYPE_OPAQUE_WIN32_EXT 0x9587 +#define GL_LAYOUT_COLOR_ATTACHMENT_EXT 0x958E +#define GL_LAYOUT_SHADER_READ_ONLY_EXT 0x9591 typedef char GLchar; typedef i64 GLsizeiptr; typedef i64 GLintptr; typedef u64 GLuint64; -typedef struct __GLsync *GLsync; /* X(name, ret, params) */ #define OGLProcedureList \ - X(glAttachShader, void, (GLuint program, GLuint shader)) \ - X(glBeginQuery, void, (GLenum target, GLuint id)) \ - X(glBindBufferBase, void, (GLenum target, GLuint index, GLuint buffer)) \ - X(glBindBufferRange, void, (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size)) \ - X(glBindFramebuffer, void, (GLenum target, GLuint framebuffer)) \ X(glBindImageTexture, void, (GLuint unit, GLuint texture, GLint level, GLboolean layered, GLint layer, GLenum access, GLenum format)) \ - X(glBindTextureUnit, void, (GLuint unit, GLuint texture)) \ - X(glBindVertexArray, void, (GLuint array)) \ - X(glBlitNamedFramebuffer, void, (GLuint sfb, GLuint dfb, GLint sx0, GLint sy0, GLint sx1, GLint sy1, GLint dx0, GLint dy0, GLint dx1, GLint dy1, GLbitfield mask, GLenum filter)) \ - X(glClearNamedBufferData, void, (GLuint buffer, GLenum internalformat, GLenum format, GLenum type, const void *data)) \ X(glClearNamedFramebufferfv, void, (GLuint framebuffer, GLenum buffer, GLint drawbuffer, const GLfloat *value)) \ X(glClearTexImage, void, (GLuint texture, GLint level, GLenum format, GLenum type, const void *data)) \ - X(glClientWaitSync, GLenum, (GLsync sync, GLbitfield flags, GLuint64 timeout)) \ - X(glCompileShader, void, (GLuint shader)) \ - X(glCopyImageSubData, void, (GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, 
GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth)) \ - X(glCreateBuffers, void, (GLsizei n, GLuint *buffers)) \ - X(glCreateFramebuffers, void, (GLsizei n, GLuint *ids)) \ - X(glCreateProgram, GLuint, (void)) \ - X(glCreateQueries, void, (GLenum target, GLsizei n, GLuint *ids)) \ - X(glCreateRenderbuffers, void, (GLsizei n, GLuint *renderbuffers)) \ - X(glCreateShader, GLuint, (GLenum shaderType)) \ X(glCreateTextures, void, (GLenum target, GLsizei n, GLuint *textures)) \ - X(glCreateVertexArrays, void, (GLsizei n, GLuint *arrays)) \ X(glDebugMessageCallback, void, (void (*)(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar *message, const void *user), void *user)) \ - X(glDeleteBuffers, void, (GLsizei n, const GLuint *buffers)) \ - X(glDeleteProgram, void, (GLuint program)) \ - X(glDeleteShader, void, (GLuint shader)) \ - X(glDeleteSync, void, (GLsync sync)) \ X(glDispatchCompute, void, (GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z)) \ - X(glEndQuery, void, (GLenum target)) \ - X(glEnableVertexArrayAttrib, void, (GLuint vao, GLuint index)) \ - X(glFenceSync, GLsync, (GLenum condition, GLbitfield flags)) \ - X(glGenerateTextureMipmap, void, (GLuint texture)) \ - X(glGetProgramInfoLog, void, (GLuint program, GLsizei maxLength, GLsizei *length, GLchar *infoLog)) \ - X(glGetProgramiv, void, (GLuint program, GLenum pname, GLint *params)) \ - X(glGetQueryObjectui64v, void, (GLuint id, GLenum pname, GLuint64 *params)) \ - X(glGetShaderInfoLog, void, (GLuint shader, GLsizei maxLength, GLsizei *length, GLchar *infoLog)) \ - X(glGetShaderiv, void, (GLuint shader, GLenum pname, GLint *params)) \ - X(glGetTextureImage, void, (GLuint texture, GLint level, GLenum format, GLenum type, GLsizei bufSize, void *pixels)) \ - X(glLinkProgram, void, (GLuint program)) \ X(glMemoryBarrier, void, (GLbitfield barriers)) \ - X(glNamedBufferData, void, (GLuint buffer, GLsizeiptr size, const void *data, GLenum usage)) \ - 
X(glNamedBufferStorage, void, (GLuint buffer, GLsizeiptr size, const void *data, GLbitfield flags)) \ - X(glNamedBufferSubData, void, (GLuint buffer, GLintptr offset, GLsizei size, const void *data)) \ - X(glNamedFramebufferRenderbuffer, void, (GLuint fb, GLenum attachment, GLenum renderbuffertarget, GLuint rb)) \ - X(glNamedFramebufferTexture, void, (GLuint fb, GLenum attachment, GLuint texture, GLint level)) \ - X(glNamedRenderbufferStorageMultisample, void, (GLuint rb, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height)) \ X(glObjectLabel, void, (GLenum identifier, GLuint name, GLsizei length, const char *label)) \ - X(glProgramUniform1f, void, (GLuint program, GLint location, GLfloat v0)) \ - X(glProgramUniform1i, void, (GLuint program, GLint location, GLint v0)) \ - X(glProgramUniform1ui, void, (GLuint program, GLint location, GLuint v0)) \ - X(glProgramUniform3iv, void, (GLuint program, GLint location, GLsizei count, const GLint *value)) \ - X(glProgramUniform4fv, void, (GLuint program, GLint location, GLsizei count, const GLfloat *value)) \ - X(glProgramUniformMatrix4fv, void, (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value)) \ - X(glQueryCounter, void, (GLuint id, GLenum target)) \ - X(glShaderSource, void, (GLuint shader, GLsizei count, const GLchar **strings, const GLint *lengths)) \ X(glTextureParameteri, void, (GLuint texture, GLenum pname, GLint param)) \ X(glTextureParameterfv, void, (GLuint texture, GLenum pname, const GLfloat *param)) \ - X(glTextureStorage1D, void, (GLuint texture, GLsizei levels, GLenum internalformat, GLsizei width)) \ - X(glTextureStorage2D, void, (GLuint texture, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height)) \ - X(glTextureStorage3D, void, (GLuint texture, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth)) \ - X(glTextureSubImage1D, void, (GLuint texture, GLint level, GLint xoff, GLsizei width, GLenum format, 
GLenum type, const void *pix)) \ - X(glTextureSubImage2D, void, (GLuint texture, GLint level, GLint xoff, GLint yoff, GLsizei width, GLsizei height, GLenum format, GLenum type, const void *pix)) \ - X(glTextureSubImage3D, void, (GLuint texture, GLint level, GLint xoff, GLint yoff, GLint zoff, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const void *pix)) \ - X(glUseProgram, void, (GLuint program)) \ - X(glVertexArrayAttribBinding, void, (GLuint vao, GLuint attribindex, GLuint bindingindex)) \ - X(glVertexArrayAttribFormat, void, (GLuint vao, GLuint attribindex, GLint size, GLenum type, GLboolean normalized, GLuint relativeoffset)) \ - X(glVertexArrayElementBuffer, void, (GLuint vao, GLuint buffer)) \ - X(glVertexArrayVertexBuffer, void, (GLuint vao, GLuint bindingindex, GLuint buffer, GLintptr offset, GLsizei stride)) \ - -#define OGLRequiredExtensionProcedureList \ +#define OGLRequiredExtensionProcedureListBase \ X(glCreateMemoryObjectsEXT, void, (GLsizei n, GLuint *memoryObjects)) \ X(glDeleteMemoryObjectsEXT, void, (GLsizei n, GLuint *memoryObjects)) \ X(glGenSemaphoresEXT, void, (GLsizei n, GLuint *semaphores)) \ - X(glImportMemoryFdEXT, void, (GLuint memory, GLuint64 size, GLenum handleType, int fd)) \ + X(glMemoryObjectParameterivEXT, void, (GLuint memoryObject, GLenum pname, const GLint *params)) \ + X(glSignalSemaphoreEXT, void, (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *dstLayouts)) \ + X(glTextureStorageMem2DEXT, void, (GLuint texture, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLuint memory, GLuint64 offset)) \ + X(glWaitSemaphoreEXT, void, (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *srcLayouts)) \ + +#define OGLRequiredExtensionProcedureListW32 \ X(glImportMemoryWin32HandleEXT, void, (GLuint memory, GLuint64 size, GLenum 
handleType, void *handle)) \ - X(glImportSemaphoreFdEXT, void, (GLuint semaphore, GLenum handleType, int fd)) \ X(glImportSemaphoreWin32HandleEXT, void, (GLuint semaphore, GLenum handleType, void *handle)) \ - X(glNamedBufferStorageMemEXT, void, (GLuint buffer, GLsizeiptr size, GLuint memory, GLuint64 offset)) \ - X(glWaitSemaphoreEXT, void, (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *srcLayouts)) \ +#define OGLRequiredExtensionProcedureListLinux \ + X(glImportMemoryFdEXT, void, (GLuint memory, GLuint64 size, GLenum handleType, int fd)) \ + X(glImportSemaphoreFdEXT, void, (GLuint semaphore, GLenum handleType, int fd)) \ + +#define OGLRequiredExtensionProcedureList \ + OGLRequiredExtensionProcedureListBase \ + OGLRequiredExtensionProcedureListW32 \ + OGLRequiredExtensionProcedureListLinux \ #define X(name, ret, params) typedef ret name##_fn params; OGLProcedureList diff --git a/shaders/buffer_clear.glsl b/shaders/buffer_clear.glsl @@ -0,0 +1,11 @@ +/* See LICENSE for license details. */ +layout(std430, buffer_reference, buffer_reference_align = 8) restrict writeonly buffer Buffer { + uint32_t values[]; +}; + +void main() +{ + uint32_t word = gl_GlobalInvocationID.x; + if (word < words) + Buffer(data).values[word] = clear_word; +} diff --git a/shaders/coherency_weighting.glsl b/shaders/coherency_weighting.glsl @@ -0,0 +1,41 @@ +/* See LICENSE for license details. 
*/ +layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Int16 { + int16_t values[]; +}; + +layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Int16Complex { + i16vec2 values[]; +}; + +layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Float32 { + float values[]; +}; + +layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Float32Complex { + vec2 values[]; +}; + +#if DataKind == DataKind_Float32 + #define COHERENT_SAMPLE(index) Float32(left_side_buffer).values[index] + #define INCOHERENT_SAMPLE(index) Float32(right_side_buffer).values[index] +#elif DataKind == DataKind_Float32Complex + #define COHERENT_SAMPLE(index) Float32Complex(left_side_buffer).values[index] + #define INCOHERENT_SAMPLE(index) Float32(right_side_buffer).values[index] +#else + #error DataKind unsupported for CoherencyWeighting +#endif + +uint32_t output_index(uint32_t x, uint32_t y, uint32_t z) +{ + uint32_t result = output_size_x * output_size_y * z + output_size_x * y + x; + return result; +} + +void main() +{ + uvec3 out_voxel = gl_GlobalInvocationID; + if (!all(lessThan(out_voxel, uvec3(output_size_x, output_size_y, output_size_z)))) + return; + uint32_t index = output_index(out_voxel.x, out_voxel.y, out_voxel.z); + COHERENT_SAMPLE(index) *= COHERENT_SAMPLE(index) / INCOHERENT_SAMPLE(index); +} diff --git a/shaders/das.glsl b/shaders/das.glsl @@ -1,48 +1,54 @@ /* See LICENSE for license details. 
*/ #if DataKind == DataKind_Float32 - #define SAMPLE_TYPE float - #define TEXTURE_KIND r32f - #define RESULT_TYPE_CAST(a) (a).x - #define OUTPUT_TYPE_CAST(a) vec4((a).x, 0, 0, 0) - #if !Fast - #define RESULT_TYPE vec2 - #define RESULT_LAST_INDEX 1 + #if CoherencyWeighting + #define RESULT_TYPE vec2 + #define RESULT_COHERENT_CAST(a) (a).x + #define RESULT_INCOHERENT_CAST(a) (a).y #endif + #define SAMPLE_TYPE float #elif DataKind == DataKind_Float32Complex - #define SAMPLE_TYPE vec2 - #define TEXTURE_KIND rg32f - #define RESULT_TYPE_CAST(a) (a).xy - #define OUTPUT_TYPE_CAST(a) vec4((a).xy, 0, 0) - #if !Fast - #define RESULT_TYPE vec3 - #define RESULT_LAST_INDEX 2 + #if CoherencyWeighting + #define RESULT_TYPE vec3 + #define RESULT_COHERENT_CAST(a) (a).xy + #define RESULT_INCOHERENT_CAST(a) (a).z #endif + #define SAMPLE_TYPE vec2 #else #error DataKind unsupported for DAS #endif -layout(std430, binding = 1) readonly restrict buffer buffer_1 { - SAMPLE_TYPE rf_data[]; -}; - #ifndef RESULT_TYPE #define RESULT_TYPE SAMPLE_TYPE #endif -#if Fast - #define RESULT_STORE(a, length_a) RESULT_TYPE(a) - layout(TEXTURE_KIND, binding = 0) restrict uniform image3D u_out_data_tex; +#ifndef RESULT_COHERENT_CAST + #define RESULT_COHERENT_CAST(a) (a) +#endif + +#if CoherencyWeighting + #define RESULT_STORE(a) RESULT_TYPE(RESULT_COHERENT_CAST(a), length(a)) #else - #define RESULT_STORE(a, length_a) RESULT_TYPE(a, length_a) - layout(TEXTURE_KIND, binding = 0) writeonly restrict uniform image3D u_out_data_tex; + #define RESULT_STORE(a) (a) #endif -layout(r16i, binding = 1) readonly restrict uniform iimage1D sparse_elements; -layout(rg32f, binding = 2) readonly restrict uniform image1D focal_vectors; -layout(r8i, binding = 3) readonly restrict uniform iimage1D transmit_receive_orientations; +layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer RF { + SAMPLE_TYPE values[]; +}; + +layout(std430, buffer_reference, buffer_reference_align = 64) restrict buffer 
Output { + SAMPLE_TYPE values[]; +}; -#define RX_ORIENTATION(tx_rx) (((tx_rx) >> 0) & 0x0F) -#define TX_ORIENTATION(tx_rx) (((tx_rx) >> 4) & 0x0F) +layout(std430, buffer_reference, buffer_reference_align = 64) restrict buffer IncoherentOutput { + float values[]; +}; + +layout(std430, buffer_reference) restrict readonly buffer ArrayParameters { + DASArrayParameters data; +}; + +#define RX_ORIENTATION(tx_rx) bitfieldExtract((tx_rx), 0, 4) +#define TX_ORIENTATION(tx_rx) bitfieldExtract((tx_rx), 4, 4) #define C_SPLINE 0.5 @@ -70,10 +76,10 @@ SAMPLE_TYPE cubic(const int base_index, const float t) ); SAMPLE_TYPE samples[4] = { - rf_data[base_index + 0], - rf_data[base_index + 1], - rf_data[base_index + 2], - rf_data[base_index + 3], + RF(rf_data).values[base_index + 0], + RF(rf_data).values[base_index + 1], + RF(rf_data).values[base_index + 2], + RF(rf_data).values[base_index + 3], }; vec4 S = vec4(t * t * t, t * t, t, 1); @@ -98,13 +104,13 @@ SAMPLE_TYPE sample_rf(const int rf_offset, const float index) switch (InterpolationMode) { case InterpolationMode_Nearest:{ if (int(index) >= 0 && int(round(index)) < SampleCount) - result = rotate_iq(rf_data[rf_offset + int(round(index))], index / SamplingFrequency); + result = rotate_iq(RF(rf_data).values[rf_offset + int(round(index))], index / SamplingFrequency); }break; case InterpolationMode_Linear:{ if (int(index) >= 0 && int(index) < SampleCount - 1) { float tk, t = modf(index, tk); int n = rf_offset + int(tk); - result = (1 - t) * rf_data[n] + t * rf_data[n + 1]; + result = (1 - t) * RF(rf_data).values[n] + t * RF(rf_data).values[n + 1]; result = rotate_iq(result, index / SamplingFrequency); } }break; @@ -124,6 +130,12 @@ float sample_index(const float distance) return time * SamplingFrequency; } +uint32_t output_index(uint32_t x, uint32_t y, uint32_t z) +{ + uint32_t result = output_size_x * output_size_y * z + output_size_x * y + x; + return result; +} + float apodize(const float arg) { /* IMPORTANT: do not move 
calculation of arg into this function. It will generate a @@ -158,19 +170,22 @@ float cylindrical_wave_transmit_distance(const vec3 point, const float focal_dep return distance(rca_plane_projection(point, tx_rows), f); } -int tx_rx_orientation_for_acquisition(const int acquisition) +uint16_t tx_rx_orientation_for_acquisition(const int16_t acquisition) { - int result = bool(SingleOrientation) ? TransmitReceiveOrientation : imageLoad(transmit_receive_orientations, acquisition).x; + uint16_t result = uint16_t(TransmitReceiveOrientation); + if (!bool(SingleOrientation)) + result = ArrayParameters(array_parameters).data.transmit_receive_orientations[acquisition]; return result; } -vec2 focal_vector_for_acquisition(const int acquisition) +vec2 focal_vector_for_acquisition(const int16_t acquisition) { - vec2 result = bool(SingleFocus) ? vec2(TransmitAngle, FocusDepth) : imageLoad(focal_vectors, acquisition).xy; + vec2 result = bool(SingleFocus) ? vec2(TransmitAngle, FocusDepth) + : ArrayParameters(array_parameters).data.focal_vectors[acquisition]; return result; } -float rca_transmit_distance(const vec3 world_point, const vec2 focal_vector, const int transmit_receive_orientation) +float rca_transmit_distance(const vec3 world_point, const vec2 focal_vector, const uint16_t transmit_receive_orientation) { float result = 0; if (TX_ORIENTATION(transmit_receive_orientation) != RCAOrientation_None) { @@ -189,13 +204,13 @@ float rca_transmit_distance(const vec3 world_point, const vec2 focal_vector, con RESULT_TYPE RCA(const vec3 world_point) { - const int acquisition_start = bool(Fast)? u_channel : 0; - const int acquisition_end = bool(Fast)? 
u_channel + 1 : AcquisitionCount; + const int16_t acquisition_start = int16_t(channel_t); + const int16_t acquisition_end = int16_t(channel_t + 1); RESULT_TYPE result = RESULT_TYPE(0); - for (int acquisition = acquisition_start; acquisition < acquisition_end; acquisition++) { - const int tx_rx_orientation = tx_rx_orientation_for_acquisition(acquisition); - const bool rx_rows = RX_ORIENTATION(tx_rx_orientation) == RCAOrientation_Rows; - const vec2 focal_vector = focal_vector_for_acquisition(acquisition); + for (int16_t acquisition = acquisition_start; acquisition < acquisition_end; acquisition++) { + const uint16_t tx_rx_orientation = tx_rx_orientation_for_acquisition(acquisition); + const bool rx_rows = RX_ORIENTATION(tx_rx_orientation) == RCAOrientation_Rows; + const vec2 focal_vector = focal_vector_for_acquisition(acquisition); vec2 xdc_world_point = rca_plane_projection((xdc_transform * vec4(world_point, 1)).xyz, rx_rows); float transmit_distance = rca_transmit_distance(world_point, focal_vector, tx_rx_orientation); @@ -209,7 +224,7 @@ RESULT_TYPE RCA(const vec3 world_point) if (a_arg < 0.5f) { float sidx = sample_index(transmit_distance + length(receive_vector)); SAMPLE_TYPE value = apodize(a_arg) * sample_rf(rf_offset, sidx); - result += RESULT_STORE(value, length(value)); + result += RESULT_STORE(value); } rf_offset += SampleCount * AcquisitionCount; } @@ -219,10 +234,10 @@ RESULT_TYPE RCA(const vec3 world_point) RESULT_TYPE HERCULES(const vec3 world_point) { - const int tx_rx_orientation = tx_rx_orientation_for_acquisition(0); - const bool rx_cols = RX_ORIENTATION(tx_rx_orientation) == RCAOrientation_Columns; - const vec2 focal_vector = focal_vector_for_acquisition(0); - const vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz; + const uint16_t tx_rx_orientation = tx_rx_orientation_for_acquisition(int16_t(0)); + const bool rx_cols = RX_ORIENTATION(tx_rx_orientation) == RCAOrientation_Columns; + const vec2 focal_vector = 
focal_vector_for_acquisition(int16_t(0)); + const vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz; const float transmit_index = sample_index(rca_transmit_distance(world_point, focal_vector, tx_rx_orientation)); const float z_delta_squared = xdc_world_point.z * xdc_world_point.z; @@ -231,11 +246,7 @@ RESULT_TYPE HERCULES(const vec3 world_point) const float apodization_test = 0.25f / (f_number_over_z * f_number_over_z); RESULT_TYPE result = RESULT_TYPE(0); - #if Fast - const int rx_channel = u_channel; - #else - for (int rx_channel = 0; rx_channel < ChannelCount; rx_channel++) - #endif + const int rx_channel = channel_t; { int rf_offset = rx_channel * SampleCount * AcquisitionCount + Sparse * SampleCount; rf_offset -= int(InterpolationMode == InterpolationMode_Cubic); @@ -249,7 +260,8 @@ RESULT_TYPE HERCULES(const vec3 world_point) else element_receive_delta_squared.y *= element_receive_delta_squared.y; for (int transmit = Sparse; transmit < AcquisitionCount; transmit++) { - int tx_channel = bool(Sparse) ? imageLoad(sparse_elements, transmit - Sparse).x : transmit; + int tx_channel = bool(Sparse) ? ArrayParameters(array_parameters).data.sparse_elements[transmit - Sparse] + : transmit; if (rx_cols) element_receive_delta_squared.y = xy_world_point.y - tx_channel * xdc_element_pitch.y; else element_receive_delta_squared.x = xy_world_point.x - tx_channel * xdc_element_pitch.x; @@ -265,7 +277,7 @@ RESULT_TYPE HERCULES(const vec3 world_point) float index = transmit_index + sqrt(z_delta_squared + element_delta_squared) * SamplingFrequency / SpeedOfSound; SAMPLE_TYPE value = apodization * sample_rf(rf_offset, index); - result += RESULT_STORE(value, length(value)); + result += RESULT_STORE(value); } rf_offset += SampleCount; @@ -276,8 +288,8 @@ RESULT_TYPE HERCULES(const vec3 world_point) RESULT_TYPE FORCES(const vec3 xdc_world_point) { - const int rx_channel_start = bool(Fast)? u_channel : 0; - const int rx_channel_end = bool(Fast)? 
u_channel + 1 : ChannelCount; + const int16_t rx_channel_start = int16_t(channel_t); + const int16_t rx_channel_end = int16_t(channel_t + 1); RESULT_TYPE result = RESULT_TYPE(0); @@ -285,7 +297,7 @@ RESULT_TYPE FORCES(const vec3 xdc_world_point) float transmit_y_delta = xdc_world_point.y - xdc_element_pitch.y * ChannelCount / 2; float transmit_yz_squared = transmit_y_delta * transmit_y_delta + z_delta_squared; - for (int rx_channel = rx_channel_start; rx_channel < rx_channel_end; rx_channel++) { + for (int16_t rx_channel = rx_channel_start; rx_channel < rx_channel_end; rx_channel++) { float receive_x_delta = xdc_world_point.x - rx_channel * xdc_element_pitch.x; float a_arg = abs(FNumber * receive_x_delta / xdc_world_point.z); @@ -296,12 +308,13 @@ RESULT_TYPE FORCES(const vec3 xdc_world_point) float receive_index = sample_index(sqrt(receive_x_delta * receive_x_delta + z_delta_squared)); float apodization = apodize(a_arg); for (int transmit = Sparse; transmit < AcquisitionCount; transmit++) { - int tx_channel = bool(Sparse) ? imageLoad(sparse_elements, transmit - Sparse).x : transmit; + int tx_channel = bool(Sparse) ? 
ArrayParameters(array_parameters).data.sparse_elements[transmit - Sparse] + : transmit; float transmit_x_delta = xdc_world_point.x - xdc_element_pitch.x * tx_channel; float transmit_index = sqrt(transmit_yz_squared + transmit_x_delta * transmit_x_delta) * SamplingFrequency / SpeedOfSound; SAMPLE_TYPE value = apodization * sample_rf(rf_offset, receive_index + transmit_index); - result += RESULT_STORE(value, length(value)); + result += RESULT_STORE(value); rf_offset += SampleCount; } } @@ -311,15 +324,17 @@ RESULT_TYPE FORCES(const vec3 xdc_world_point) void main() { - ivec3 out_voxel = ivec3(gl_GlobalInvocationID); - vec3 image_points = vec3(imageSize(u_out_data_tex)) - 1.0f; - if (!all(lessThan(out_voxel, imageSize(u_out_data_tex)))) + uvec3 out_voxel = gl_GlobalInvocationID; + if (!all(lessThan(out_voxel, uvec3(output_size_x, output_size_y, output_size_z)))) return; - vec3 point = vec3(out_voxel) / max(vec3(1.0f), image_points); - vec3 world_point = (voxel_transform * vec4(point, 1)).xyz; + vec3 image_points = vec3(output_size_x, output_size_y, output_size_z) - 1.0f; + vec3 point = vec3(out_voxel) / max(vec3(1.0f), image_points); + vec3 world_point = (voxel_transform * vec4(point, 1)).xyz; + + uint32_t out_index = output_index(out_voxel.x, out_voxel.y, out_voxel.z); - RESULT_TYPE sum; + RESULT_TYPE sum = RESULT_TYPE(0); switch (AcquisitionKind) { case AcquisitionKind_FORCES: case AcquisitionKind_UFORCES: @@ -340,15 +355,9 @@ void main() }break; } - #if Fast - sum += RESULT_TYPE_CAST(imageLoad(u_out_data_tex, out_voxel)); - #endif - #if CoherencyWeighting - /* TODO(rnp): scale such that brightness remains ~constant */ - float denominator = sum[RESULT_LAST_INDEX] + float(sum[RESULT_LAST_INDEX] == 0); - RESULT_TYPE_CAST(sum) *= RESULT_TYPE_CAST(sum) / denominator; + IncoherentOutput(incoherent_output).values[out_index] += RESULT_INCOHERENT_CAST(sum); #endif - imageStore(u_out_data_tex, out_voxel, OUTPUT_TYPE_CAST(sum)); + Output(output_data).values[out_index] += 
RESULT_COHERENT_CAST(sum); } diff --git a/shaders/decode.glsl b/shaders/decode.glsl @@ -9,57 +9,42 @@ */ #if DataKind == DataKind_Float32 - #define INPUT_DATA_TYPE float - #define SAMPLE_DATA_TYPE float - #define SAMPLE_TYPE_CAST(x) (x) + #define INPUT_DATA_TYPE float + #define SAMPLE_DATA_TYPE float #elif DataKind == DataKind_Float32Complex - #define INPUT_DATA_TYPE vec2 - #define SAMPLE_DATA_TYPE vec2 - #define SAMPLE_TYPE_CAST(x) (x) + #define INPUT_DATA_TYPE vec2 + #define SAMPLE_DATA_TYPE vec2 #elif DataKind == DataKind_Int16Complex - #define INPUT_DATA_TYPE int - #define SAMPLE_DATA_TYPE vec2 - #define SAMPLE_TYPE_CAST(x) vec2(((x) << 16) >> 16, (x) >> 16) + #define INPUT_DATA_TYPE i16vec2 + #define SAMPLE_DATA_TYPE vec2 #elif DataKind == DataKind_Int16 - #define INPUT_DATA_TYPE int - #define RF_SAMPLES_PER_INDEX 2 - #if DilateOutput - #define SAMPLE_DATA_TYPE vec4 - #define SAMPLE_TYPE_CAST(x) vec4(((x) << 16) >> 16, 0, (x) >> 16, 0) - #else - #define SAMPLE_DATA_TYPE vec2 - #define SAMPLE_TYPE_CAST(x) vec2(((x) << 16) >> 16, (x) >> 16) - #define OUTPUT_SAMPLES_PER_INDEX 2 - #endif + #define INPUT_DATA_TYPE int16_t + #define SAMPLE_DATA_TYPE float #else - #error unsupported data kind for Decode + #error unsupported data kind for Decode #endif -#ifndef OUTPUT_SAMPLES_PER_INDEX - #define OUTPUT_SAMPLES_PER_INDEX 1 -#endif - -#ifndef RF_SAMPLES_PER_INDEX - #define RF_SAMPLES_PER_INDEX 1 -#endif +// TODO(rnp): fix DilateOutput -layout(std430, binding = 1) readonly restrict buffer buffer_1 { - INPUT_DATA_TYPE rf_data[]; +layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer RF { + INPUT_DATA_TYPE values[]; }; -layout(std430, binding = 2) writeonly restrict buffer buffer_2 { - INPUT_DATA_TYPE out_rf_data[]; +layout(std430, buffer_reference, buffer_reference_align = 64) restrict writeonly buffer OutputRF { + INPUT_DATA_TYPE values[]; }; -layout(std430, binding = 3) writeonly restrict buffer buffer_3 { - SAMPLE_DATA_TYPE out_data[]; 
+layout(std430, buffer_reference, buffer_reference_align = 64) restrict writeonly buffer Output { + SAMPLE_DATA_TYPE values[]; }; -layout(r16f, binding = 0) readonly restrict uniform image2D hadamard; +layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer Hadamard { + float16_t values[]; +}; SAMPLE_DATA_TYPE sample_rf_data(uint index) { - SAMPLE_DATA_TYPE result = SAMPLE_TYPE_CAST(rf_data[index]); + SAMPLE_DATA_TYPE result = SAMPLE_DATA_TYPE(RF(rf_buffer).values[index]); return result; } @@ -67,7 +52,7 @@ SAMPLE_DATA_TYPE sample_rf_data(uint index) shared INPUT_DATA_TYPE rf[gl_WorkGroupSize.x * TransmitCount]; void run_decode_large(void) { - uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; + uint time_sample = gl_GlobalInvocationID.x; uint channel = gl_GlobalInvocationID.y; uint transmit = gl_GlobalInvocationID.z * ToProcess; @@ -78,12 +63,11 @@ void run_decode_large(void) uint leftover_samples = rf.length() % thread_count; uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples); - uint rf_offset = (InputChannelStride * channel / RF_SAMPLES_PER_INDEX + - TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x); + uint rf_offset = InputChannelStride * channel + TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x; for (uint i = 0; i < samples_this_thread; i++) { uint index = i * thread_count + thread_index; - rf[index] = rf_data[rf_offset + index]; + rf[index] = RF(rf_buffer).values[rf_offset + index]; } barrier(); @@ -94,9 +78,9 @@ void run_decode_large(void) result[i] = SAMPLE_DATA_TYPE(0); for (int j = 0; j < TransmitCount; j++) { - SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]); + SAMPLE_DATA_TYPE s = SAMPLE_DATA_TYPE(rf[gl_LocalInvocationID.x * TransmitCount + j]); for (uint i = 0; i < ToProcess; i++) - result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s; + result[i] += s * Hadamard(hadamard_buffer).values[TransmitCount * j + (i + 
transmit)]; } for (uint i = 0; i < ToProcess; i++) @@ -112,30 +96,30 @@ void run_decode_large(void) for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride) if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount) - out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i]; + Output(output_buffer).values[out_off] = result[i]; } } #endif void run_decode_small(void) { - uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; + uint time_sample = gl_GlobalInvocationID.x; uint channel = gl_GlobalInvocationID.y; - uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX; + uint rf_offset = InputChannelStride * channel + TransmitCount * time_sample; if (time_sample < OutputTransmitStride) { INPUT_DATA_TYPE rf[TransmitCount]; for (int j = 0; j < TransmitCount; j++) - rf[j] = rf_data[rf_offset + j]; + rf[j] = RF(rf_buffer).values[rf_offset + j]; SAMPLE_DATA_TYPE result[TransmitCount]; for (int j = 0; j < TransmitCount; j++) result[j] = SAMPLE_DATA_TYPE(0); for (int i = 0; i < TransmitCount; i++) { - SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[i]); + SAMPLE_DATA_TYPE s = SAMPLE_DATA_TYPE(rf[i]); for (int j = 0; j < TransmitCount; j++) { - result[j] += imageLoad(hadamard, ivec2(i, j)).x * s; + result[j] += s * Hadamard(hadamard_buffer).values[TransmitCount * i + j]; } } @@ -145,7 +129,7 @@ void run_decode_small(void) uint out_off = OutputChannelStride * channel + OutputSampleStride * time_sample; for (int i = 0; i < TransmitCount; i++, out_off += OutputTransmitStride) - out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i]; + Output(output_buffer).values[out_off] = result[i]; } } @@ -153,40 +137,40 @@ void main() { switch (DecodeMode) { case DecodeMode_None:{ - uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; + uint time_sample = gl_GlobalInvocationID.x; uint channel = gl_GlobalInvocationID.y; uint transmit = gl_GlobalInvocationID.z; if (time_sample < 
OutputTransmitStride) { - uint in_off = (InputChannelStride * channel + - InputTransmitStride * transmit + - InputSampleStride * time_sample) / RF_SAMPLES_PER_INDEX; + uint in_off = InputChannelStride * channel + + InputTransmitStride * transmit + + InputSampleStride * time_sample; - uint out_off = (OutputChannelStride * channel + - OutputTransmitStride * transmit + - OutputSampleStride * time_sample) / OUTPUT_SAMPLES_PER_INDEX; + uint out_off = OutputChannelStride * channel + + OutputTransmitStride * transmit + + OutputSampleStride * time_sample; - out_data[out_off] = sample_rf_data(in_off); + Output(output_buffer).values[out_off] = sample_rf_data(in_off); } }break; case DecodeMode_Hadamard:{ - if (u_first_pass) { - uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; + if (first_pass) { + uint time_sample = gl_GlobalInvocationID.x; uint channel = gl_GlobalInvocationID.y; uint transmit = gl_GlobalInvocationID.z * ToProcess; if (time_sample < InputTransmitStride) { - uint out_off = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX; - uint in_off = (InputChannelStride * channel + InputSampleStride * time_sample); + uint out_off = InputChannelStride * channel + TransmitCount * time_sample; + uint in_off = InputChannelStride * channel + InputSampleStride * time_sample; #if UseSharedMemory in_off += InputTransmitStride * transmit; out_off += transmit; for (uint i = 0; i < ToProcess; i++, in_off += InputTransmitStride) { if (transmit + i < TransmitCount) - out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; + OutputRF(output_rf_buffer).values[out_off + i] = RF(rf_buffer).values[in_off]; } #else for (uint i = 0; i < TransmitCount; i++, in_off += InputTransmitStride) - out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; + OutputRF(output_rf_buffer).values[out_off + i] = RF(rf_buffer).values[in_off]; #endif } } else { diff --git a/shaders/filter.glsl b/shaders/filter.glsl @@ -28,16 +28,16 @@ 
#define apply_filter(iq, h) ((iq) * (h)) #endif -layout(std430, binding = 1) readonly restrict buffer buffer_1 { - DATA_TYPE in_data[]; +layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer Input { + DATA_TYPE values[]; }; -layout(std430, binding = 2) writeonly restrict buffer buffer_2 { - OUT_DATA_TYPE out_data[]; +layout(std430, buffer_reference, buffer_reference_align = 64) restrict writeonly buffer Output { + OUT_DATA_TYPE values[]; }; -layout(std430, binding = 3) readonly restrict buffer buffer_3 { - FILTER_TYPE filter_coefficients[FilterLength]; +layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer Filter { + FILTER_TYPE values[FilterLength]; }; vec2 complex_mul(vec2 a, vec2 b) @@ -58,7 +58,7 @@ vec2 rotate_iq(vec2 iq, uint index) SAMPLE_TYPE sample_rf(uint index) { - SAMPLE_TYPE result = SAMPLE_TYPE_CAST(in_data[index]); + SAMPLE_TYPE result = SAMPLE_TYPE_CAST(Input(input_data).values[index]); return result; } @@ -80,6 +80,8 @@ void main() ///////////////////////// // NOTE: sample caching { + bool offset_wraps = (DecimationRate * gl_WorkGroupID.x * gl_WorkGroupSize.x) < (FilterLength - 1); + in_offset += DecimationRate * gl_WorkGroupID.x * gl_WorkGroupSize.x - (FilterLength - 1); uint total_samples = rf.length(); @@ -87,10 +89,10 @@ void main() uint leftover_count = total_samples % thread_count; uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_count); - const float scale = bool(ComplexFilter) ? 1 : sqrt(2); + const float scale = bool(ComplexFilter) ? 
1 : sqrt(2.0f); for (uint i = 0; i < samples_this_thread; i++) { uint index = thread_count * i + thread_index; - if (gl_WorkGroupID.x == 0 && index < FilterLength - 1) { + if (offset_wraps && index < FilterLength - 1) { rf[index] = SAMPLE_TYPE(0); } else { #if Demodulate @@ -107,7 +109,7 @@ void main() SAMPLE_TYPE result = SAMPLE_TYPE(0); uint offset = DecimationRate * thread_index; for (uint j = 0; j < FilterLength; j++) - result += apply_filter(rf[offset + j], filter_coefficients[j]); - out_data[out_offset] = RESULT_TYPE_CAST(result); + result += apply_filter(rf[offset + j], Filter(filter_coefficients).values[j]); + Output(output_data).values[out_offset] = RESULT_TYPE_CAST(result); } } diff --git a/shaders/render_3d.frag.glsl b/shaders/render_3d.frag.glsl @@ -1,4 +1,15 @@ /* See LICENSE for license details. */ +layout(location = 0) in vec3 normal; +layout(location = 1) in vec3 texture_coordinate; +layout(location = 0) out vec4 out_colour; + +layout(std430, buffer_reference, buffer_reference_align = 64) readonly buffer InputVec2 { + vec2 values[]; +}; + +layout(std430, buffer_reference, buffer_reference_align = 64) readonly buffer InputFloat { + float values[]; +}; /* input: h [0,360] | s,v [0, 1] * * output: rgb [0,1] */ @@ -20,24 +31,41 @@ float sdf_wire_box_outside(vec3 p, vec3 b, float e) return result; } -int texture_dimension(ivec3 points) +uint32_t texture_dimension(uvec3 points) { - points = ivec3(greaterThan(points, ivec3(1))); + points = uvec3(greaterThan(points, uvec3(1))); return points.x + points.y + points.z; } +uint32_t input_index(vec3 uv) +{ + uv *= vec3(input_size_x - 1, input_size_y - 1, input_size_z - 1); + uint32_t result = input_size_y * input_size_x * uint32_t(uv.z) + + input_size_x * uint32_t(uv.y) + + uint32_t(uv.x); + result = min(result, input_size_z * input_size_y * input_size_x - 1); + return result; +} float sample_value(vec3 p) { - float result = length(texture(u_texture, p).xy); - float threshold_val = pow(10.0f, u_threshold / 
20.0f); + float result; + if (input_data != 0) { + uint32_t index = input_index(texture_coordinate); + switch (data_kind) { + case DataKind_Float32:{ result = length(InputFloat(input_data).values[index]); }break; + case DataKind_Float32Complex:{ result = length(InputVec2(input_data).values[index]); }break; + } + } + + float threshold_val = pow(10.0f, threshold / 20.0f); result = clamp(result, 0.0f, threshold_val); result = result / threshold_val; - result = pow(result, u_gamma); + result = pow(result, gamma); - if (u_log_scale) { + if (db_cutoff > 0) { result = 20 * log(result) / log(10); - result = clamp(result, -u_db_cutoff, 0) / -u_db_cutoff; + result = clamp(result, -db_cutoff, 0) / -db_cutoff; result = 1 - result; } @@ -54,40 +82,40 @@ float grad(float x) void main(void) { - int dimension = texture_dimension(textureSize(u_texture, 0)); + uint32_t dimension = texture_dimension(uvec3(input_size_x, input_size_y, input_size_z)); if (dimension == 3) { // TODO(rnp): add slice offset passed in as a uniform } - float smp = sample_value(texture_coordinate); + float data = sample_value(texture_coordinate); //float t = test_texture_coordinate.y; //smp = smp * smoothstep(-0.4, 1.1, t) * u_gain; - vec3 p = 2.0f * test_texture_coordinate - 1.0f; + vec3 p = 2.0f * texture_coordinate - 1.0f; switch (dimension) { case 1:{ - float df = mix(grad(texture_coordinate.x), dFdx(smp), + float df = mix(grad(texture_coordinate.x), dFdx(data), smoothstep(0.0f, 0.55f, abs(texture_coordinate.x - 0.5f))); - float de = abs(smp - texture_coordinate.y) / sqrt(1.0f + df * df); + float de = abs(data - texture_coordinate.y) / sqrt(1.0f + df * df); float eps = length(fwidth(texture_coordinate.xy)); float thickness = 4.f; float alpha = smoothstep((0.5f * thickness + 2.0f) * eps, (0.5f * thickness + 0.0f) * eps, de); - out_colour = vec4(u_bb_colour.xyz, alpha); + out_colour = vec4(bounding_box_colour.xyz, alpha); }break; case 0: // NOTE(rnp): 0 is a special case for X-Plane Rendering case 2: case 3: 
{ - float t = clamp(sdf_wire_box_outside(p, vec3(1.0f), u_bb_fraction) / u_bb_fraction, 0, 1); + float t = clamp(sdf_wire_box_outside(p, vec3(1.0f), bounding_box_fraction) / bounding_box_fraction, 0, 1); - out_colour = vec4(t * vec3(smp) + (1 - t) * u_bb_colour.xyz, 1); - if (u_solid_bb) out_colour = u_bb_colour; + out_colour = vec4(t * vec3(data) + (1 - t) * bounding_box_colour.xyz, 1); + //if (u_solid_bb) out_colour = u_bb_colour; }break; } diff --git a/shaders/render_3d.vert.glsl b/shaders/render_3d.vert.glsl @@ -0,0 +1,19 @@ +layout(location = 0) out vec3 f_normal; +layout(location = 1) out vec3 f_texture_coordinate; + +layout(std430, buffer_reference, buffer_reference_align = 16) readonly buffer Vector4 { + vec4 values[]; +}; + +void main() +{ + vec3 position = Vector4(positions).values[gl_VertexIndex].xyz; + vec3 normal = Vector4(normals).values[gl_VertexIndex].xyz; + vec3 texture_coordinate = (2 * position + 1) / 2; + + f_texture_coordinate = texture_coordinate; + f_normal = normal; + //f_normal = normalize(mat3(mvp_matrix) * normal); + + gl_Position = mvp_matrix * vec4(position, 1); +} diff --git a/ui.c b/ui.c @@ -63,6 +63,10 @@ #define RULER_COLOUR (v4){{1.00f, 0.70f, 0.00f, 1.0f}} #define BORDER_COLOUR v4_lerp(FG_COLOUR, BG_COLOUR, 0.85f) +#define FRAME_VIEW_BB_COLOUR (v4){{0.92f, 0.88f, 0.78f, 1.0f}} +#define FRAME_VIEW_BB_FRACTION 0.007f +#define FRAME_VIEW_RENDER_TARGET_SIZE 1024, 1024 + #define MENU_PLUS_COLOUR (v4){{0.33f, 0.42f, 1.00f, 1.00f}} #define MENU_CLOSE_COLOUR FOCUSED_COLOUR @@ -308,8 +312,7 @@ struct Variable { #define BEAMFORMER_FRAME_VIEW_KIND_LIST \ X(Latest, "Latest") \ X(3DXPlane, "3D X-Plane") \ - X(Indexed, "Indexed") \ - X(Copy, "Copy") + X(Copy, "Copy") \ typedef enum { #define X(kind, ...) 
BeamformerFrameViewKind_##kind, @@ -322,12 +325,16 @@ typedef struct BeamformerFrameView BeamformerFrameView; struct BeamformerFrameView { BeamformerFrameViewKind kind; b32 dirty; - BeamformerFrame *frame; BeamformerFrameView *prev, *next; - u32 texture; - i32 texture_mipmaps; - iv2 texture_dim; + // NOTE(rnp): for FrameViewKindCopy + GPUBuffer copy_buffer; + + GPUImage colour_image; + // NOTE(rnp): temporary, on w32 we must hold onto this when importing vulkan data to OpenGL + OSHandle export_handle; + u32 memory_object; + u32 texture; /* NOTE(rnp): any pointers to variables are added to the menu and will * be put onto the freelist if the view is closed. */ @@ -339,14 +346,13 @@ struct BeamformerFrameView { Variable gamma; union { - /* BeamformerFrameViewKind_Latest/BeamformerFrameViewKind_Indexed */ + /* BeamformerFrameViewKind_Latest/BeamformerFrameViewKind_Copy */ struct { Variable lateral_scale_bar; Variable axial_scale_bar; Variable *lateral_scale_bar_active; Variable *axial_scale_bar_active; - /* NOTE(rnp): if kind is Latest selects which plane to use - * if kind is Indexed selects the index */ + /* NOTE(rnp): selects which plane to use */ Variable *cycler; u32 cycler_state; @@ -354,6 +360,8 @@ struct BeamformerFrameView { v3 min_coordinate; v3 max_coordinate; + + BeamformerFrame frame; }; /* BeamformerFrameViewKind_3DXPlane */ @@ -415,7 +423,6 @@ struct BeamformerUI { BeamformerFrameView *views; BeamformerFrameView *view_freelist; - BeamformerFrame *frame_freelist; Interaction interaction; Interaction hot_interaction; @@ -423,12 +430,20 @@ struct BeamformerUI { InputState text_input_state; - /* TODO(rnp): ideally this isn't copied all over the place */ - BeamformerRenderModel unit_cube_model; + VulkanHandle pipelines[BeamformerShaderKind_RenderCount]; + + OSHandle render_semaphores_export[2]; + VulkanHandle render_semaphores[2]; + u32 render_semaphores_gl[2]; + + GPUImage render_3d_image; + GPUImage render_3d_depth_image; + RenderModel unit_cube_model; 
v2_sll *scale_bar_savepoint_freelist; - BeamformerFrame *latest_plane[BeamformerViewPlaneTag_Count + 1]; + BeamformerFrame latest_plane[BeamformerViewPlaneTag_Count + 1]; + b32 latest_plane_valid[BeamformerViewPlaneTag_Count + 1]; BeamformerUIParameters params; b32 flush_params; @@ -439,8 +454,6 @@ struct BeamformerUI { f32 off_axis_position; f32 beamform_plane; - FrameViewRenderContext *frame_view_render_context; - BeamformerSharedMemory * shared_memory; BeamformerCtx * beamformer_context; }; @@ -640,9 +653,9 @@ make_raylib_texture(BeamformerFrameView *v) { Texture result; result.id = v->texture; - result.width = v->texture_dim.w; - result.height = v->texture_dim.h; - result.mipmaps = v->texture_mipmaps; + result.width = v->colour_image.width; + result.height = v->colour_image.height; + result.mipmaps = v->colour_image.mip_map_levels; result.format = PIXELFORMAT_UNCOMPRESSED_R8G8B8A8; return result; } @@ -743,16 +756,11 @@ push_custom_view_title(Stream *s, Variable *var) #undef X stream_append_s8(s, labels[*bv->cycler->cycler.state % (BeamformerViewPlaneTag_Count + 1)]); }break; - case BeamformerFrameViewKind_Indexed:{ - stream_append_s8(s, s8(": Index {")); - stream_append_u64(s, *bv->cycler->cycler.state % BeamformerMaxBacklogFrames); - stream_append_s8(s, s8("} [")); - }break; case BeamformerFrameViewKind_3DXPlane:{ stream_append_s8(s, s8(": 3D X-Plane")); }break; InvalidDefaultCase; } if (bv->kind != BeamformerFrameViewKind_3DXPlane) { - stream_append_hex_u64(s, bv->frame? 
bv->frame->id : 0); + stream_append_hex_u64(s, bv->frame.id); stream_append_byte(s, ']'); } }break; @@ -954,19 +962,37 @@ table_end_subtable(Table *table) } function void -resize_frame_view(BeamformerFrameView *view, iv2 dim) +resize_frame_view(BeamformerFrameView *view, uv2 dim) { + if ValidHandle(view->export_handle) os_release_handle(view->export_handle); + + glDeleteMemoryObjectsEXT(1, &view->memory_object); + glCreateMemoryObjectsEXT(1, &view->memory_object); + glDeleteTextures(1, &view->texture); glCreateTextures(GL_TEXTURE_2D, 1, &view->texture); - view->texture_dim = dim; - view->texture_mipmaps = (i32)ctz_u64((u64)Max(dim.x, dim.y)) + 1; - glTextureStorage2D(view->texture, view->texture_mipmaps, GL_RGBA8, dim.x, dim.y); + vk_image_allocate(&view->colour_image, dim.w, dim.h, 1, 1, VulkanImageUsage_Colour, + VulkanUsageFlag_ImageSampling, &view->export_handle); - glGenerateTextureMipmap(view->texture); + glMemoryObjectParameterivEXT(view->memory_object, GL_DEDICATED_MEMORY_OBJECT_EXT, (GLint []){1}); + + if (OS_WINDOWS) { + glImportMemoryWin32HandleEXT(view->memory_object, view->colour_image.memory_size, + GL_HANDLE_TYPE_OPAQUE_WIN32_EXT, (void *)view->export_handle.value[0]); + // NOTE(rnp): w32 does not transfer ownership from handle back to driver + } else { + glImportMemoryFdEXT(view->memory_object, view->colour_image.memory_size, + GL_HANDLE_TYPE_OPAQUE_FD_EXT, view->export_handle.value[0]); + view->export_handle.value[0] = OSInvalidHandleValue; + } + + glTextureStorageMem2DEXT(view->texture, view->colour_image.mip_map_levels, GL_RGBA8, + view->colour_image.width, view->colour_image.height, + view->memory_object, 0); /* NOTE(rnp): work around raylib's janky texture sampling */ - v4 border_colour = (v4){{0, 0, 0, 1}}; + v4 border_colour = {{0, 0, 0, 1}}; if (view->kind != BeamformerFrameViewKind_Copy) border_colour = (v4){0}; glTextureParameteri(view->texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER); glTextureParameteri(view->texture, GL_TEXTURE_WRAP_T, 
GL_CLAMP_TO_BORDER); @@ -983,11 +1009,8 @@ resize_frame_view(BeamformerFrameView *view, iv2 dim) function void ui_beamformer_frame_view_release_subresources(BeamformerUI *ui, BeamformerFrameView *bv, BeamformerFrameViewKind kind) { - if (kind == BeamformerFrameViewKind_Copy && bv->frame) { - glDeleteTextures(1, &bv->frame->texture); - bv->frame->texture = 0; - SLLPushFreelist(bv->frame, ui->frame_freelist); - } + if (kind == BeamformerFrameViewKind_Copy) + vk_buffer_release(&bv->copy_buffer); if (kind != BeamformerFrameViewKind_3DXPlane) { if (bv->axial_scale_bar.scale_bar.savepoint_stack) @@ -1289,10 +1312,10 @@ ui_beamformer_frame_view_convert(BeamformerUI *ui, Arena *arena, Variable *view, bv->threshold.real32 = old? old->threshold.real32 : 55.0f; bv->gamma.scaled_real32.val = old? old->gamma.scaled_real32.val : 1.0f; bv->gamma.scaled_real32.scale = old? old->gamma.scaled_real32.scale : 0.05f; - bv->min_coordinate = (old && old->frame) ? m4_mul_v4(old->frame->voxel_transform, (v4){{0.0f, 0.0f, 0.0f, 1.0f}}).xyz - : (v3){0}; - bv->max_coordinate = (old && old->frame) ? m4_mul_v4(old->frame->voxel_transform, (v4){{1.0f, 1.0f, 1.0f, 1.0f}}).xyz - : (v3){0}; + bv->min_coordinate = old ? m4_mul_v4(old->frame.voxel_transform, (v4){{0.0f, 0.0f, 0.0f, 1.0f}}).xyz + : (v3){0}; + bv->max_coordinate = old ? m4_mul_v4(old->frame.voxel_transform, (v4){{1.0f, 1.0f, 1.0f, 1.0f}}).xyz + : (v3){0}; #define X(_t, pretty) s8_comp(pretty), read_only local_persist s8 kind_labels[] = {BEAMFORMER_FRAME_VIEW_KIND_LIST}; @@ -1302,7 +1325,7 @@ ui_beamformer_frame_view_convert(BeamformerUI *ui, Arena *arena, Variable *view, /* TODO(rnp): this is quite dumb. 
what we actually want is to render directly * into the view region with the appropriate size for that region (scissor) */ - resize_frame_view(bv, (iv2){{FRAME_VIEW_RENDER_TARGET_SIZE}}); + resize_frame_view(bv, (uv2){{FRAME_VIEW_RENDER_TARGET_SIZE}}); switch (kind) { case BeamformerFrameViewKind_3DXPlane:{ @@ -1329,11 +1352,10 @@ ui_beamformer_frame_view_convert(BeamformerUI *ui, Arena *arena, Variable *view, axial->zoom_starting_coord = F32_INFINITY; b32 copy = kind == BeamformerFrameViewKind_Copy; - v3 normal = (v3){.y = 1.0f}; - if (old && old->frame) - normal = cross(old->frame->voxel_transform.c[0].xyz, old->frame->voxel_transform.c[1].xyz); + v3 N = (v3){.y = 1.0f}; + if (old) N = cross(old->frame.voxel_transform.c[0].xyz, old->frame.voxel_transform.c[1].xyz); - BeamformerViewPlaneTag plane = ui_plane_layout_from_normal(v3_normalize(normal)); + BeamformerViewPlaneTag plane = ui_plane_layout_from_normal(v3_normalize(N)); switch (plane) { case BeamformerViewPlaneTag_XY:{ lateral->min_value = copy ? 
&bv->min_coordinate.x : &ui->min_coordinate.x; @@ -1395,10 +1417,6 @@ ui_beamformer_frame_view_convert(BeamformerUI *ui, Arena *arena, Variable *view, &bv->cycler_state, labels, countof(labels)); bv->cycler_state = BeamformerViewPlaneTag_Count; }break; - case BeamformerFrameViewKind_Indexed:{ - bv->cycler = add_variable_cycler(ui, menu, arena, 0, ui->small_font, s8("Index:"), - &bv->cycler_state, 0, BeamformerMaxBacklogFrames); - }break; default:{}break; } @@ -1411,6 +1429,7 @@ ui_beamformer_frame_view_new(BeamformerUI *ui, Arena *arena) BeamformerFrameView *result = SLLPopFreelist(ui->view_freelist); if (!result) result = push_struct_no_zero(arena, typeof(*result)); zero_struct(result); + result->export_handle.value[0] = OSInvalidHandleValue; DLLPushDown(result, ui->views); return result; } @@ -1534,19 +1553,29 @@ ui_add_live_frame_view(BeamformerUI *ui, Variable *view, RegionSplitDirection di function void ui_beamformer_frame_view_copy_frame(BeamformerUI *ui, BeamformerFrameView *new, BeamformerFrameView *old) { - assert(old->frame); - new->frame = SLLPopFreelist(ui->frame_freelist); - if (!new->frame) new->frame = push_struct(&ui->arena, typeof(*new->frame)); + mem_copy(&new->frame, &old->frame, sizeof(old->frame)); + + iv3 points = new->frame.points; + i64 frame_size = points.x * points.y * points.z * beamformer_data_kind_byte_size[new->frame.data_kind]; - mem_copy(new->frame, old->frame, sizeof(*new->frame)); - new->frame->texture = 0; - new->frame->next = 0; - alloc_beamform_frame(new->frame, old->frame->dim, old->frame->gl_kind, s8("Frame Copy: "), ui->arena); + Stream sb = arena_stream(ui->arena); + stream_append_s8(&sb, s8("Frame Copy [")); + stream_append_hex_u64(&sb, new->frame.id); + stream_append_s8(&sb, s8("]")); + stream_append_byte(&sb, 0); + + GPUBufferAllocateInfo allocate_info = { + .size = frame_size, + .flags = VulkanUsageFlag_TransferDestination, + .label = stream_to_s8(&sb), + }; + vk_buffer_allocate(&new->copy_buffer, &allocate_info); - 
glCopyImageSubData(old->frame->texture, GL_TEXTURE_3D, 0, 0, 0, 0, - new->frame->texture, GL_TEXTURE_3D, 0, 0, 0, 0, - new->frame->dim.x, new->frame->dim.y, new->frame->dim.z); - glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT); + GPUBuffer *backlog = ui->beamformer_context->compute_context.backlog.buffer; + VulkanHandle cmd = vk_command_begin(VulkanTimeline_Compute); + vk_command_wait_timeline(cmd, VulkanTimeline_Compute, old->frame.timeline_valid_value); + vk_command_copy_buffer(cmd, &new->copy_buffer, backlog, old->frame.buffer_offset, frame_size); + new->frame.timeline_valid_value = vk_command_end(cmd, (VulkanHandle){0}, (VulkanHandle){0}); } function void @@ -1557,10 +1586,6 @@ ui_copy_frame(BeamformerUI *ui, Variable *view, RegionSplitDirection direction) assert(view->type == VT_UI_VIEW); BeamformerFrameView *old = view->view.child->generic; - /* TODO(rnp): hack; it would be better if this was unreachable with a 0 old->frame */ - if (!old->frame) - return; - Variable *new_region = ui_split_region(ui, region, view, direction); new_region->region_split.right = add_beamformer_frame_view(ui, new_region, &ui->arena, BeamformerFrameViewKind_Copy, 1, old); @@ -1641,7 +1666,7 @@ function m4 projection_matrix_for_x_plane_view(BeamformerFrameView *view) { assert(view->kind == BeamformerFrameViewKind_3DXPlane); - f32 aspect = (f32)view->texture_dim.w / (f32)view->texture_dim.h; + f32 aspect = (f32)view->colour_image.width / (f32)view->colour_image.height; m4 result = perspective_projection(10e-3f, 500e-3f, 45.0f * PI / 180.0f, aspect); return result; } @@ -1679,22 +1704,35 @@ view_plane_tag_from_x_plane_shift(BeamformerFrameView *view, Variable *x_plane_s function void render_single_xplane(BeamformerUI *ui, BeamformerFrameView *view, Variable *x_plane_shift, - u32 program, f32 rotation_turns, v3 translate, BeamformerViewPlaneTag tag) -{ - u32 texture = 0; - if (ui->latest_plane[tag]) - texture = ui->latest_plane[tag]->texture; + f32 rotation_turns, v3 translate, 
BeamformerViewPlaneTag tag, + VulkanHandle command, BeamformerRenderBeamformedPushConstants *pc) +{ + GPUBuffer *beamformed_buffer = ui->beamformer_context->compute_context.backlog.buffer; + pc->input_data = 0; + if (ui->latest_plane_valid[tag]) { + BeamformerFrame *f = ui->latest_plane + tag; + pc->input_data = beamformed_buffer->gpu_pointer + f->buffer_offset; + pc->input_size_x = f->points.x; + pc->input_size_y = f->points.y; + pc->input_size_z = f->points.z; + pc->data_kind = f->data_kind; + vk_command_wait_timeline(command, VulkanTimeline_Compute, f->timeline_valid_value); + } + + v3 camera = camera_for_x_plane_view(ui, view); + v3 scale = beamformer_frame_view_plane_size(ui, view); - v3 scale = beamformer_frame_view_plane_size(ui, view); m4 model_transform = y_aligned_volume_transform(scale, translate, rotation_turns); + m4 view_m = view_matrix_for_x_plane_view(ui, view, camera); + m4 projection = projection_matrix_for_x_plane_view(view); + + //pc->mvp_matrix = m4_mul(m4_mul(model_transform, view_m), projection); + pc->mvp_matrix = m4_mul(projection, m4_mul(view_m, model_transform)); + pc->bounding_box_colour = v4_lerp(FG_COLOUR, HOVERED_COLOUR, x_plane_shift->hover_t); + pc->bounding_box_fraction = FRAME_VIEW_BB_FRACTION; - v4 colour = v4_lerp(FG_COLOUR, HOVERED_COLOUR, x_plane_shift->hover_t); - glProgramUniformMatrix4fv(program, FRAME_VIEW_MODEL_MATRIX_LOC, 1, 0, model_transform.E); - glProgramUniform4fv(program, FRAME_VIEW_BB_COLOUR_LOC, 1, colour.E); - glProgramUniform1ui(program, FRAME_VIEW_SOLID_BB_LOC, 0); - glBindTextureUnit(0, texture); - glDrawElements(GL_TRIANGLES, ui->unit_cube_model.elements, GL_UNSIGNED_SHORT, - (void *)ui->unit_cube_model.elements_offset); + vk_command_push_constants(command, 0, sizeof(*pc), pc); + vk_command_draw(command, &ui->unit_cube_model.model); XPlaneShift *xp = &x_plane_shift->x_plane_shift; v3 xp_delta = v3_sub(xp->end_point, xp->start_point); @@ -1706,64 +1744,59 @@ render_single_xplane(BeamformerUI *ui, 
BeamformerFrameView *view, Variable *x_pl /* TODO(rnp): there is no reason to compute the rotation matrix again */ model_transform = y_aligned_volume_transform(scale, v3_add(f, translate), rotation_turns); - glProgramUniformMatrix4fv(program, FRAME_VIEW_MODEL_MATRIX_LOC, 1, 0, model_transform.E); - glProgramUniform1ui(program, FRAME_VIEW_SOLID_BB_LOC, 1); - glProgramUniform4fv(program, FRAME_VIEW_BB_COLOUR_LOC, 1, HOVERED_COLOUR.E); - glDrawElements(GL_TRIANGLES, ui->unit_cube_model.elements, GL_UNSIGNED_SHORT, - (void *)ui->unit_cube_model.elements_offset); + pc->mvp_matrix = m4_mul(projection, m4_mul(view_m, model_transform)); + pc->bounding_box_colour = HOVERED_COLOUR; + pc->bounding_box_fraction = 1.0f; + pc->input_data = 0; + + vk_command_push_constants(command, 0, sizeof(*pc), pc); + vk_command_draw(command, &ui->unit_cube_model.model); } } function void -render_3D_xplane(BeamformerUI *ui, BeamformerFrameView *view, u32 program) +render_3D_xplane(BeamformerUI *ui, BeamformerFrameView *view, VulkanHandle command, BeamformerRenderBeamformedPushConstants *pc) { if (view->demo->bool32) { view->rotation += dt_for_frame * 0.125f; if (view->rotation > 1.0f) view->rotation -= 1.0f; } - v3 camera = camera_for_x_plane_view(ui, view); - m4 view_m = view_matrix_for_x_plane_view(ui, view, camera); - m4 projection = projection_matrix_for_x_plane_view(view); - - glProgramUniformMatrix4fv(program, FRAME_VIEW_VIEW_MATRIX_LOC, 1, 0, view_m.E); - glProgramUniformMatrix4fv(program, FRAME_VIEW_PROJ_MATRIX_LOC, 1, 0, projection.E); - glProgramUniform1f(program, FRAME_VIEW_BB_FRACTION_LOC, FRAME_VIEW_BB_FRACTION); - v3 model_translate = offset_x_plane_position(ui, view, BeamformerViewPlaneTag_XZ); - render_single_xplane(ui, view, view->x_plane_shifts + 0, program, + render_single_xplane(ui, view, view->x_plane_shifts + 0, x_plane_rotation_for_view_plane(view, BeamformerViewPlaneTag_XZ), - model_translate, BeamformerViewPlaneTag_XZ); + model_translate, BeamformerViewPlaneTag_XZ, 
command, pc); model_translate = offset_x_plane_position(ui, view, BeamformerViewPlaneTag_YZ); model_translate.y -= 0.0001f; - render_single_xplane(ui, view, view->x_plane_shifts + 1, program, + render_single_xplane(ui, view, view->x_plane_shifts + 1, x_plane_rotation_for_view_plane(view, BeamformerViewPlaneTag_YZ), - model_translate, BeamformerViewPlaneTag_YZ); + model_translate, BeamformerViewPlaneTag_YZ, command, pc); } function void -render_2D_plane(BeamformerUI *ui, BeamformerFrameView *view, u32 program) +render_2D_plane(BeamformerUI *ui, BeamformerFrameView *view, VulkanHandle command, BeamformerRenderBeamformedPushConstants *pc) { m4 view_m = m4_identity(); m4 model = m4_scale((v3){{2.0f, 2.0f, 0.0f}}); m4 projection = orthographic_projection(0, 1, 1, 1); - glProgramUniformMatrix4fv(program, FRAME_VIEW_MODEL_MATRIX_LOC, 1, 0, model.E); - glProgramUniformMatrix4fv(program, FRAME_VIEW_VIEW_MATRIX_LOC, 1, 0, view_m.E); - glProgramUniformMatrix4fv(program, FRAME_VIEW_PROJ_MATRIX_LOC, 1, 0, projection.E); + GPUBuffer *beamformed_buffer = ui->beamformer_context->compute_context.backlog.buffer; + pc->mvp_matrix = m4_mul(m4_mul(model, view_m), projection); + pc->input_data = beamformed_buffer->gpu_pointer + view->frame.buffer_offset, + pc->input_size_x = view->frame.points.x, + pc->input_size_y = view->frame.points.y, + pc->input_size_z = view->frame.points.z, + pc->data_kind = view->frame.data_kind, - glProgramUniform1f(program, FRAME_VIEW_BB_FRACTION_LOC, 0); - glBindTextureUnit(0, view->frame->texture); - glDrawElements(GL_TRIANGLES, ui->unit_cube_model.elements, GL_UNSIGNED_SHORT, - (void *)ui->unit_cube_model.elements_offset); + vk_command_wait_timeline(command, VulkanTimeline_Compute, view->frame.timeline_valid_value); + vk_command_push_constants(command, 0, sizeof(*pc), pc); + vk_command_draw(command, &ui->unit_cube_model.model); } function b32 frame_view_ready_to_present(BeamformerUI *ui, BeamformerFrameView *view) { - b32 result = !iv2_equal((iv2){0}, 
view->texture_dim) && view->frame; - result |= view->kind == BeamformerFrameViewKind_3DXPlane && - ui->latest_plane[BeamformerViewPlaneTag_Count]; + b32 result = view->colour_image.width > 0 || view->colour_image.height > 0; return result; } @@ -1772,76 +1805,61 @@ view_update(BeamformerUI *ui, BeamformerFrameView *view) { if (view->kind == BeamformerFrameViewKind_Latest) { u32 index = *view->cycler->cycler.state; - view->dirty |= view->frame != ui->latest_plane[index]; - view->frame = ui->latest_plane[index]; - if (view->dirty && view->frame) { - view->min_coordinate = m4_mul_v4(view->frame->voxel_transform, (v4){{0.0f, 0.0f, 0.0f, 1.0f}}).xyz; - view->max_coordinate = m4_mul_v4(view->frame->voxel_transform, (v4){{1.0f, 1.0f, 1.0f, 1.0f}}).xyz; + view->dirty |= view->frame.timeline_valid_value != ui->latest_plane[index].timeline_valid_value; + mem_copy(&view->frame, ui->latest_plane + index, sizeof(view->frame)); + if (view->dirty) { + view->min_coordinate = m4_mul_v4(view->frame.voxel_transform, (v4){{0.0f, 0.0f, 0.0f, 1.0f}}).xyz; + view->max_coordinate = m4_mul_v4(view->frame.voxel_transform, (v4){{1.0f, 1.0f, 1.0f, 1.0f}}).xyz; } } /* TODO(rnp): x-z or y-z */ - view->dirty |= ui->frame_view_render_context->updated; + // TODO(rnp): how to track this now? use pipeline handle value? 
+ view->dirty |= ui->beamformer_context->render_shader_updated; view->dirty |= view->kind == BeamformerFrameViewKind_3DXPlane; - b32 result = frame_view_ready_to_present(ui, view) && view->dirty; + b32 result = view->dirty; return result; } function void update_frame_views(BeamformerUI *ui, Rect window) { - FrameViewRenderContext *ctx = ui->frame_view_render_context; - b32 fbo_bound = 0; for (BeamformerFrameView *view = ui->views; view; view = view->next) { if (view_update(ui, view)) { - //start_renderdoc_capture(0); - - if (!fbo_bound) { - fbo_bound = 1; - glBindFramebuffer(GL_FRAMEBUFFER, ctx->framebuffers[0]); - glUseProgram(ctx->shader); - glBindVertexArray(ui->unit_cube_model.vao); - glEnable(GL_DEPTH_TEST); - } + BeamformerRenderBeamformedPushConstants pc = { + .bounding_box_colour = FRAME_VIEW_BB_COLOUR, + .db_cutoff = view->log_scale->bool32 ? view->dynamic_range.real32 : 0, + .threshold = view->threshold.real32, + .gamma = view->gamma.scaled_real32.val, + .positions = ui->unit_cube_model.model.gpu_pointer, + .normals = ui->unit_cube_model.model.gpu_pointer + ui->unit_cube_model.normals_offset, + }; - u32 fb = ctx->framebuffers[0]; - u32 program = ctx->shader; - glViewport(0, 0, view->texture_dim.w, view->texture_dim.h); - glProgramUniform1f(program, FRAME_VIEW_THRESHOLD_LOC, view->threshold.real32); - glProgramUniform1f(program, FRAME_VIEW_DYNAMIC_RANGE_LOC, view->dynamic_range.real32); - glProgramUniform1f(program, FRAME_VIEW_GAMMA_LOC, view->gamma.scaled_real32.val); - glProgramUniform1ui(program, FRAME_VIEW_LOG_SCALE_LOC, view->log_scale->bool32); + //start_renderdoc_capture(); - glNamedFramebufferRenderbuffer(fb, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, ctx->renderbuffers[0]); - glNamedFramebufferRenderbuffer(fb, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, ctx->renderbuffers[1]); - glClearNamedFramebufferfv(fb, GL_COLOR, 0, (f32 []){0, 0, 0, 0}); - glClearNamedFramebufferfv(fb, GL_DEPTH, 0, (f32 []){1}); + glSignalSemaphoreEXT(ui->render_semaphores_gl[0], 0, 
0, 1, &view->texture, (GLenum []){GL_NONE}); + VulkanHandle cmd = vk_command_begin(VulkanTimeline_Graphics); + vk_command_bind_pipeline(cmd, ui->pipelines[BeamformerShaderKind_RenderBeamformed - BeamformerShaderKind_RenderFirst]); + vk_command_begin_rendering(cmd, &ui->render_3d_image, &ui->render_3d_depth_image, &view->colour_image); + vk_command_viewport(cmd, view->colour_image.width, view->colour_image.height, 0, 0, 0.0f, 1.0f); + vk_command_scissor(cmd, view->colour_image.width, view->colour_image.height, 0, 0); if (view->kind == BeamformerFrameViewKind_3DXPlane) { - render_3D_xplane(ui, view, program); + render_3D_xplane(ui, view, cmd, &pc); } else { - render_2D_plane(ui, view, program); + render_2D_plane(ui, view, cmd, &pc); } + vk_command_end_rendering(cmd); + vk_command_end(cmd, ui->render_semaphores[0], ui->render_semaphores[1]); - /* NOTE(rnp): resolve multisampled scene */ - glNamedFramebufferTexture(ctx->framebuffers[1], GL_COLOR_ATTACHMENT0, view->texture, 0); - glBlitNamedFramebuffer(fb, ctx->framebuffers[1], 0, 0, FRAME_VIEW_RENDER_TARGET_SIZE, - 0, 0, view->texture_dim.w, view->texture_dim.h, GL_COLOR_BUFFER_BIT, GL_NEAREST); + glWaitSemaphoreEXT(ui->render_semaphores_gl[1], 0, 0, 1, &view->texture, (GLenum[]){GL_LAYOUT_COLOR_ATTACHMENT_EXT}); - glGenerateTextureMipmap(view->texture); - view->dirty = 0; + //end_renderdoc_capture(); - //end_renderdoc_capture(0); + view->dirty = 0; } } - if (fbo_bound) { - glBindFramebuffer(GL_FRAMEBUFFER, 0); - glViewport((i32)window.pos.x, (i32)window.pos.y, (i32)window.size.w, (i32)window.size.h); - /* NOTE(rnp): I don't trust raylib to not mess with us */ - glBindVertexArray(0); - glDisable(GL_DEPTH_TEST); - } } function Color @@ -2420,9 +2438,9 @@ draw_view_ruler(BeamformerFrameView *view, Arena a, Rect view_rect, TextSpec ts) { v2 vr_max_p = v2_add(view_rect.pos, view_rect.size); - v3 U = view->frame->voxel_transform.c[0].xyz; - v3 V = view->frame->voxel_transform.c[1].xyz; - v3 min = 
view->frame->voxel_transform.c[3].xyz; + v3 U = view->frame.voxel_transform.c[0].xyz; + v3 V = view->frame.voxel_transform.c[1].xyz; + v3 min = view->frame.voxel_transform.c[3].xyz; v2 start_uv = plane_uv(v3_sub(view->ruler.start, min), U, V); v2 end_uv = plane_uv(v3_sub(view->ruler.end, min), U, V); @@ -2487,7 +2505,7 @@ draw_3D_xplane_frame_view(BeamformerUI *ui, Arena arena, Variable *var, Rect dis assert(var->type == VT_BEAMFORMER_FRAME_VIEW); BeamformerFrameView *view = var->generic; - f32 aspect = (f32)view->texture_dim.w / (f32)view->texture_dim.h; + f32 aspect = (f32)view->colour_image.width / (f32)view->colour_image.height; Rect vr = display_rect; if (aspect > 1.0f) vr.size.w = vr.size.h; else vr.size.h = vr.size.w; @@ -2536,7 +2554,7 @@ draw_3D_xplane_frame_view(BeamformerUI *ui, Arena arena, Variable *var, Rect dis it->hover_t = CLAMP01(it->hover_t); } - Rectangle tex_r = {0, 0, (f32)view->texture_dim.w, (f32)view->texture_dim.h}; + Rectangle tex_r = {0, 0, (f32)view->colour_image.width, (f32)view->colour_image.height}; NPatchInfo tex_np = {tex_r, 0, 0, 0, 0, NPATCH_NINE_PATCH}; DrawTextureNPatch(make_raylib_texture(view), tex_np, rl_rect(vr), (Vector2){0}, 0, WHITE); @@ -2548,9 +2566,9 @@ draw_beamformer_frame_view(BeamformerUI *ui, Arena a, Variable *var, Rect displa { assert(var->type == VT_BEAMFORMER_FRAME_VIEW); BeamformerFrameView *view = var->generic; - BeamformerFrame *frame = view->frame; + BeamformerFrame *frame = &view->frame; - b32 is_1d = iv3_dimension(frame->dim) == 1; + b32 is_1d = iv3_dimension(frame->points) == 1; f32 txt_w = measure_text(ui->small_font, s8("-288.8 mm")).w; f32 scale_bar_size = 1.2f * txt_w + RULER_TICK_LENGTH; @@ -2603,11 +2621,11 @@ draw_beamformer_frame_view(BeamformerUI *ui, Arena a, Variable *var, Rect displa Rectangle tex_r; if (is_1d) { - tex_r = (Rectangle){0, 0, view->texture_dim.x, -view->texture_dim.y}; + tex_r = (Rectangle){0, 0, view->colour_image.width, -view->colour_image.height}; } else { v2 
pixels_per_meter = { - .w = (f32)view->texture_dim.w / output_dim.w, - .h = (f32)view->texture_dim.h / output_dim.h, + .w = (f32)view->colour_image.width / output_dim.w, + .h = (f32)view->colour_image.height / output_dim.h, }; /* NOTE(rnp): math to resize the texture without stretching when the view changes @@ -2742,22 +2760,19 @@ push_compute_time(Arena *arena, s8 prefix, f32 time) function v2 draw_compute_stats_bar_view(BeamformerUI *ui, Arena arena, ComputeShaderStats *stats, - BeamformerShaderKind *stages, u32 stages_count, f32 compute_time_sum, - TextSpec ts, Rect r, v2 mouse) + f32 compute_time_sum, TextSpec ts, Rect r, v2 mouse) { read_only local_persist s8 frame_labels[] = {s8_comp("0:"), s8_comp("-1:"), s8_comp("-2:"), s8_comp("-3:")}; f32 total_times[countof(frame_labels)] = {0}; + + u32 stages = stats->table.shader_count; Table *table = table_new(&arena, countof(frame_labels), TextAlignment_Right, TextAlignment_Left); for (u32 i = 0; i < countof(frame_labels); i++) { TableCell *cells = table_push_row(table, &arena, TRK_CELLS)->data; cells[0].text = frame_labels[i]; u32 frame_index = (stats->latest_frame_index - i) % countof(stats->table.times); - u32 seen_shaders = 0; - for (u32 j = 0; j < stages_count; j++) { - if ((seen_shaders & (1u << stages[j])) == 0) - total_times[i] += stats->table.times[frame_index][stages[j]]; - seen_shaders |= (1u << stages[j]); - } + for (u32 j = 0; j < stages; j++) + total_times[i] += stats->table.times[frame_index][j]; } v2 result = table_extent(table, arena, ts.font); @@ -2784,14 +2799,14 @@ draw_compute_stats_bar_view(BeamformerUI *ui, Arena arena, ComputeShaderStats *s Rect rect; rect.pos = v2_add(cr.pos, (v2){{cr.size.w + table->cell_pad.w , cr.size.h * 0.15f}}); rect.size = (v2){.y = 0.7f * cr.size.h}; - for (u32 i = 0; i < stages_count; i++) { - rect.size.w = total_width * stats->table.times[frame_index][stages[i]] / total_times[row_index]; + for (u32 i = 0; i < stages; i++) { + rect.size.w = total_width * 
stats->table.times[frame_index][i] / total_times[row_index]; Color color = colour_from_normalized(g_colour_palette[i % countof(g_colour_palette)]); DrawRectangleRec(rl_rect(rect), color); if (point_in_rect(mouse, rect)) { text_pos = v2_add(rect.pos, (v2){.x = table->cell_pad.w}); - s8 name = push_s8_from_parts(&arena, s8(""), beamformer_shader_names[stages[i]], s8(": ")); - mouse_text = push_compute_time(&arena, name, stats->table.times[frame_index][stages[i]]); + s8 name = push_s8_from_parts(&arena, s8(""), beamformer_shader_names[stats->table.shader_ids[i]], s8(": ")); + mouse_text = push_compute_time(&arena, name, stats->table.times[frame_index][i]); } rect.pos.x += rect.size.w; } @@ -2865,19 +2880,13 @@ draw_compute_stats_view(BeamformerUI *ui, Arena arena, Variable *view, Rect r, v ComputeStatsView *csv = &view->compute_stats_view; ComputeShaderStats *stats = csv->compute_shader_stats; f32 compute_time_sum = 0; - u32 stages = cp->pipeline.shader_count; + u32 stages = stats->table.shader_count; TextSpec text_spec = {.font = &ui->font, .colour = FG_COLOUR, .flags = TF_LIMITED}; ui_blinker_update(&csv->blink, BLINK_SPEED); - static_assert(BeamformerShaderKind_ComputeCount <= 32, "shader kind bitfield test"); - u32 seen_shaders = 0; - for (u32 i = 0; i < stages; i++) { - BeamformerShaderKind index = cp->pipeline.shaders[i]; - if ((seen_shaders & (1u << index)) == 0) - compute_time_sum += stats->average_times[index]; - seen_shaders |= (1u << index); - } + for (u32 index = 0; index < stages; index++) + compute_time_sum += stats->average_times[index]; v2 result = {0}; @@ -2886,13 +2895,12 @@ draw_compute_stats_view(BeamformerUI *ui, Arena arena, Variable *view, Rect r, v case ComputeStatsViewKind_Average:{ da_reserve(&arena, table, stages); for (u32 i = 0; i < stages; i++) { - push_table_time_row(table, &arena, beamformer_shader_names[cp->pipeline.shaders[i]], - stats->average_times[cp->pipeline.shaders[i]]); + push_table_time_row(table, &arena, 
beamformer_shader_names[stats->table.shader_ids[i]], + stats->average_times[i]); } }break; case ComputeStatsViewKind_Bar:{ - result = draw_compute_stats_bar_view(ui, arena, stats, cp->pipeline.shaders, stages, - compute_time_sum, text_spec, r, mouse); + result = draw_compute_stats_bar_view(ui, arena, stats, compute_time_sum, text_spec, r, mouse); r.pos = v2_add(r.pos, (v2){.y = result.y}); }break; InvalidDefaultCase; @@ -2920,9 +2928,9 @@ draw_compute_stats_view(BeamformerUI *ui, Arena arena, Variable *view, Rect r, v cell_rect.size.w = t->widths[column]; text_spec.limits.size.w = r.size.w - (cell_rect.pos.x - it->start_x); - if (column == 0 && row_index < stages && cp->programs[row_index] == 0 && - cp->pipeline.shaders[row_index] != BeamformerShaderKind_CudaHilbert && - cp->pipeline.shaders[row_index] != BeamformerShaderKind_CudaDecode) + if (column == 0 && row_index < stages && vk_pipeline_valid(cp->vulkan_pipelines[row_index]) == 0 && + stats->table.shader_ids[row_index] != BeamformerShaderKind_CudaHilbert && + stats->table.shader_ids[row_index] != BeamformerShaderKind_CudaDecode) { text_spec.colour = v4_lerp(FG_COLOUR, FOCUSED_COLOUR, ease_in_out_quartic(csv->blink.t)); } else { @@ -3707,7 +3715,7 @@ ui_begin_interact(BeamformerUI *ui, v2 mouse, b32 scroll) switch (++bv->ruler.state) { case RulerState_Start:{ hot.kind = InteractionKind_Ruler; - bv->ruler.start = world_point_from_plane_uv(bv->frame->voxel_transform, + bv->ruler.start = world_point_from_plane_uv(bv->frame.voxel_transform, rect_uv(mouse, hot.rect)); }break; case RulerState_Hold:{}break; @@ -3791,7 +3799,7 @@ ui_extra_actions(BeamformerUI *ui, Variable *var) ui_beamformer_frame_view_release_subresources(ui, old, last_kind); ui_beamformer_frame_view_convert(ui, &ui->arena, view->child, view->menu, old->kind, old, log_scale); - if (new->kind == BeamformerFrameViewKind_Copy && old->frame) + if (new->kind == BeamformerFrameViewKind_Copy) ui_beamformer_frame_view_copy_frame(ui, new, old); 
DLLRemove(old); @@ -3947,7 +3955,7 @@ ui_interact(BeamformerUI *ui, BeamformerInput *input, Rect window_rect) assert(it->var->type == VT_BEAMFORMER_FRAME_VIEW); BeamformerFrameView *bv = it->var->generic; v2 mouse = clamp_v2_rect(input_mouse, it->rect); - bv->ruler.end = world_point_from_plane_uv(bv->frame->voxel_transform, rect_uv(mouse, it->rect)); + bv->ruler.end = world_point_from_plane_uv(bv->frame.voxel_transform, rect_uv(mouse, it->rect)); }break; case InteractionKind_Drag:{ if (!IsMouseButtonDown(MOUSE_BUTTON_LEFT) && !IsMouseButtonDown(MOUSE_BUTTON_RIGHT)) { @@ -4037,8 +4045,6 @@ ui_init(BeamformerCtx *ctx, Arena store) if (!ui) { ui = ctx->ui = push_struct(&store, typeof(*ui)); ui->arena = store; - ui->frame_view_render_context = &ctx->frame_view_render_context; - ui->unit_cube_model = ctx->compute_context.unit_cube_model; ui->shared_memory = ctx->shared_memory; ui->beamformer_context = ctx; @@ -4072,9 +4078,130 @@ ui_init(BeamformerCtx *ctx, Arena store) split->region_split.left = add_compute_progress_bar(split, ctx); split->region_split.right = add_compute_stats_view(ui, split, &ui->arena, ctx); + u32 samples = vk_gpu_info()->max_msaa_samples; + vk_image_allocate(&ui->render_3d_image, FRAME_VIEW_RENDER_TARGET_SIZE, 1, samples, VulkanImageUsage_Colour, 0, 0); + vk_image_allocate(&ui->render_3d_depth_image, FRAME_VIEW_RENDER_TARGET_SIZE, 1, samples, VulkanImageUsage_DepthStencil, 0, 0); + + glGenSemaphoresEXT(countof(ui->render_semaphores_gl), ui->render_semaphores_gl); + for EachElement(ui->render_semaphores, it) + ui->render_semaphores[it] = vk_create_semaphore(ui->render_semaphores_export + it); + + if (OS_WINDOWS) { + glImportSemaphoreWin32HandleEXT(ui->render_semaphores_gl[0], GL_HANDLE_TYPE_OPAQUE_WIN32_EXT, (void *)ui->render_semaphores_export[0].value[0]); + glImportSemaphoreWin32HandleEXT(ui->render_semaphores_gl[1], GL_HANDLE_TYPE_OPAQUE_WIN32_EXT, (void *)ui->render_semaphores_export[1].value[0]); + } else { + 
glImportSemaphoreFdEXT(ui->render_semaphores_gl[0], GL_HANDLE_TYPE_OPAQUE_FD_EXT, ui->render_semaphores_export[0].value[0]); + glImportSemaphoreFdEXT(ui->render_semaphores_gl[1], GL_HANDLE_TYPE_OPAQUE_FD_EXT, ui->render_semaphores_export[1].value[0]); + ui->render_semaphores_export[0].value[0] = OSInvalidHandleValue; + ui->render_semaphores_export[1].value[0] = OSInvalidHandleValue; + } + + if (!BakeShaders) + { + for EachElement(beamformer_reloadable_render_shader_info_indices, it) { + i32 index = beamformer_reloadable_render_shader_info_indices[it]; + for (u32 i = 0; i < 2; i++) { + BeamformerFileReloadContext *frc = push_struct(&ui->arena, typeof(*frc)); + frc->kind = BeamformerFileReloadKind_RenderShader; + frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index]; + frc->shader_reload.pipeline = ui->pipelines + it; + + Arena scratch = ui->arena; + s8 file = push_s8_from_parts(&scratch, os_path_separator(), s8("shaders"), + beamformer_reloadable_shader_files[index][i]); + + os_add_file_watch((char *)file.data, file.len, frc); + } + } + } + + f32 unit_cube_vertices[] = { + 0.5f, 0.5f, -0.5f, 0.0f, + 0.5f, 0.5f, -0.5f, 0.0f, + 0.5f, 0.5f, -0.5f, 0.0f, + 0.5f, -0.5f, -0.5f, 0.0f, + 0.5f, -0.5f, -0.5f, 0.0f, + 0.5f, -0.5f, -0.5f, 0.0f, + 0.5f, 0.5f, 0.5f, 0.0f, + 0.5f, 0.5f, 0.5f, 0.0f, + 0.5f, 0.5f, 0.5f, 0.0f, + 0.5f, -0.5f, 0.5f, 0.0f, + 0.5f, -0.5f, 0.5f, 0.0f, + 0.5f, -0.5f, 0.5f, 0.0f, + -0.5f, 0.5f, -0.5f, 0.0f, + -0.5f, 0.5f, -0.5f, 0.0f, + -0.5f, 0.5f, -0.5f, 0.0f, + -0.5f, -0.5f, -0.5f, 0.0f, + -0.5f, -0.5f, -0.5f, 0.0f, + -0.5f, -0.5f, -0.5f, 0.0f, + -0.5f, 0.5f, 0.5f, 0.0f, + -0.5f, 0.5f, 0.5f, 0.0f, + -0.5f, 0.5f, 0.5f, 0.0f, + -0.5f, -0.5f, 0.5f, 0.0f, + -0.5f, -0.5f, 0.5f, 0.0f, + -0.5f, -0.5f, 0.5f, 0.0f, + }; + f32 unit_cube_normals[] = { + 0.0f, 0.0f, -1.0f, 0.0f, + 0.0f, 1.0f, 0.0f, 0.0f, + 1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, -1.0f, 0.0f, + 0.0f, -1.0f, 0.0f, 0.0f, + 1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 1.0f, 0.0f, + 0.0f, 1.0f, 
0.0f, 0.0f, + 1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 1.0f, 0.0f, + 0.0f, -1.0f, 0.0f, 0.0f, + 1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, -1.0f, 0.0f, + 0.0f, 1.0f, 0.0f, 0.0f, + -1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, -1.0f, 0.0f, + 0.0f, -1.0f, 0.0f, 0.0f, + -1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 1.0f, 0.0f, + 0.0f, 1.0f, 0.0f, 0.0f, + -1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 1.0f, 0.0f, + 0.0f, -1.0f, 0.0f, 0.0f, + -1.0f, 0.0f, 0.0f, 0.0f, + }; + u16 unit_cube_indices[] = { + 1, 13, 19, + 1, 19, 7, + 9, 6, 18, + 9, 18, 21, + 23, 20, 14, + 23, 14, 17, + 16, 4, 10, + 16, 10, 22, + 5, 2, 8, + 5, 8, 11, + 15, 12, 0, + 15, 0, 3 + }; + + static_assert(countof(unit_cube_normals) == countof(unit_cube_vertices), ""); + + RenderModel *rm = &ui->unit_cube_model; + rm->vertex_count = countof(unit_cube_vertices) / 4; + rm->normals_offset = round_up_to(sizeof(unit_cube_vertices), 16); + + u64 model_size = 2 * round_up_to(sizeof(unit_cube_vertices), 16); + vk_render_model_allocate(&rm->model, unit_cube_indices, countof(unit_cube_indices), model_size, s8("unit_cube_model")); + vk_render_model_range_upload(&rm->model, unit_cube_vertices, 0, sizeof(unit_cube_vertices), 0); + vk_render_model_range_upload(&rm->model, unit_cube_normals, rm->normals_offset, sizeof(unit_cube_normals), 0); + /* NOTE(rnp): shrink variable size once this fires */ assert((uz)(ui->arena.beg - (u8 *)ui) < KB(64)); } + + for EachElement(beamformer_reloadable_render_shader_info_indices, it) { + i32 index = beamformer_reloadable_render_shader_info_indices[it]; + BeamformerShaderKind shader = beamformer_reloadable_shader_kinds[index]; + beamformer_reload_render_pipeline(ui->pipelines + it, shader, ui->arena); + } } function void @@ -4091,8 +4218,15 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformerFrame *frame_to_dr { BeamformerUI *ui = ctx->ui; - ui->latest_plane[BeamformerViewPlaneTag_Count] = frame_to_draw; - ui->latest_plane[frame_plane] = frame_to_draw; + if (frame_to_draw) { + 
mem_copy(ui->latest_plane + BeamformerViewPlaneTag_Count, frame_to_draw, sizeof(*frame_to_draw)); + mem_copy(ui->latest_plane + frame_plane, frame_to_draw, sizeof(*frame_to_draw)); + ui->latest_plane_valid[BeamformerViewPlaneTag_Count] = 1; + ui->latest_plane_valid[frame_plane] = 1; + } else { + ui->latest_plane_valid[BeamformerViewPlaneTag_Count] = 0; + ui->latest_plane_valid[frame_plane] = 0; + } asan_poison_region(ui->arena.beg, ui->arena.end - ui->arena.beg); @@ -4151,7 +4285,7 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformerFrame *frame_to_dr if (pb) { ui->flush_params = 0; - iv3 points = ctx->latest_frame->dim; + iv3 points = ctx->latest_frame->points; i32 dimension = iv3_dimension(points); // TODO(rnp): this is immediate mode code that should be in the ui building code diff --git a/util.c b/util.c @@ -607,14 +607,6 @@ s8_scan_backwards(s8 s, u8 byte) } function s8 -s8_trim_trailing(s8 s, u8 byte) -{ - s8 result = s; - while (result.len >= 1 && result.data[result.len - 1] == byte) result.len--; - return result; -} - -function s8 s8_cut_head(s8 s, iz cut) { s8 result = s; diff --git a/util.h b/util.h @@ -377,18 +377,20 @@ typedef OS_WRITE_NEW_FILE_FN(os_write_new_file_fn); #define RENDERDOC_GET_API_FN(name) b32 name(u32 version, void **out_api) typedef RENDERDOC_GET_API_FN(renderdoc_get_api_fn); -#define RENDERDOC_START_FRAME_CAPTURE_FN(name) void name(iptr gl_context, iptr window_handle) +#define RENDERDOC_START_FRAME_CAPTURE_FN(name) void name(void *instance_handle, iptr window_handle) typedef RENDERDOC_START_FRAME_CAPTURE_FN(renderdoc_start_frame_capture_fn); -#define RENDERDOC_END_FRAME_CAPTURE_FN(name) b32 name(iptr gl_context, iptr window_handle) +#define RENDERDOC_END_FRAME_CAPTURE_FN(name) b32 name(void *instance_handle, iptr window_handle) typedef RENDERDOC_END_FRAME_CAPTURE_FN(renderdoc_end_frame_capture_fn); -typedef alignas(16) u8 RenderDocAPI[216]; -#define RENDERDOC_API_FN_ADDR(a, offset) (*(iptr *)((*a) + offset)) -#define 
RENDERDOC_START_FRAME_CAPTURE(a) (renderdoc_start_frame_capture_fn *)RENDERDOC_API_FN_ADDR(a, 152) -#define RENDERDOC_END_FRAME_CAPTURE(a) (renderdoc_end_frame_capture_fn *) RENDERDOC_API_FN_ADDR(a, 168) +#define RENDERDOC_SET_CAPTURE_PATH_TEMPLATE_FN(name) void name(const char *template) +typedef RENDERDOC_SET_CAPTURE_PATH_TEMPLATE_FN(renderdoc_set_capture_path_template_fn); -#define LABEL_GL_OBJECT(type, id, s) {s8 _s = (s); glObjectLabel(type, id, (i32)_s.len, (c8 *)_s.data);} +typedef alignas(16) u8 RenderDocAPI[216]; +#define RENDERDOC_API_FN_ADDR(a, offset) (*(iptr *)((*a) + offset)) +#define RENDERDOC_START_FRAME_CAPTURE(a) (renderdoc_start_frame_capture_fn *) RENDERDOC_API_FN_ADDR(a, 152) +#define RENDERDOC_END_FRAME_CAPTURE(a) (renderdoc_end_frame_capture_fn *) RENDERDOC_API_FN_ADDR(a, 168) +#define RENDERDOC_SET_CAPTURE_PATH_TEMPLATE(a) (renderdoc_set_capture_path_template_fn *) RENDERDOC_API_FN_ADDR(a, 184) #include "util.c" #include "math.c" diff --git a/util_gl.c b/util_gl.c @@ -1,69 +0,0 @@ -/* See LICENSE for license details. 
*/ -function u32 -compile_shader(Arena a, u32 type, s8 shader, s8 name) -{ - u32 sid = glCreateShader(type); - glShaderSource(sid, 1, (const char **)&shader.data, (int *)&shader.len); - glCompileShader(sid); - - i32 res = 0; - glGetShaderiv(sid, GL_COMPILE_STATUS, &res); - - if (res == GL_FALSE) { - Stream buf = arena_stream(a); - stream_append_s8s(&buf, s8("\n"), name, s8(": failed to compile\n")); - - i32 len = 0, out_len = 0; - glGetShaderiv(sid, GL_INFO_LOG_LENGTH, &len); - glGetShaderInfoLog(sid, len, &out_len, (char *)(buf.data + buf.widx)); - stream_commit(&buf, out_len); - glDeleteShader(sid); - os_console_log(buf.data, buf.widx); - - sid = 0; - } - - return sid; -} - -function u32 -link_program(Arena a, u32 *shader_ids, i32 shader_id_count) -{ - i32 success = 0; - u32 result = glCreateProgram(); - for (i32 i = 0; i < shader_id_count; i++) - glAttachShader(result, shader_ids[i]); - glLinkProgram(result); - glGetProgramiv(result, GL_LINK_STATUS, &success); - if (success == GL_FALSE) { - i32 len = 0; - Stream buf = arena_stream(a); - stream_append_s8(&buf, s8("shader link error: ")); - glGetProgramInfoLog(result, buf.cap - buf.widx, &len, (c8 *)(buf.data + buf.widx)); - stream_reset(&buf, len); - stream_append_byte(&buf, '\n'); - os_console_log(buf.data, buf.widx); - glDeleteProgram(result); - result = 0; - } - return result; -} - -function u32 -load_shader(Arena arena, s8 *shader_texts, u32 *shader_types, i32 count, s8 name) -{ - u32 result = 0; - u32 *ids = push_array(&arena, u32, count); - b32 valid = 1; - for (i32 i = 0; i < count; i++) { - ids[i] = compile_shader(arena, shader_types[i], shader_texts[i], name); - valid &= ids[i] != 0; - } - - if (valid) result = link_program(arena, ids, count); - for (i32 i = 0; i < count; i++) glDeleteShader(ids[i]); - - if (result) glObjectLabel(GL_PROGRAM, result, (i32)name.len, (c8 *)name.data); - - return result; -} diff --git a/util_os.c b/util_os.c @@ -24,3 +24,21 @@ release_lock(i32 *lock) atomic_store_u32(lock, 
0); os_wake_all_waiters(lock); } + +#if BEAMFORMER_RENDERDOC_HOOKS +function void +load_renderdoc_functions(BeamformerInput *input, OSLibrary rdoc) +{ + if ValidHandle(rdoc) { + renderdoc_get_api_fn *get_api = os_lookup_symbol(rdoc, "RENDERDOC_GetAPI"); + if (get_api) { + RenderDocAPI *api = 0; + if (get_api(10600, (void **)&api)) { + input->renderdoc_start_frame_capture = RENDERDOC_START_FRAME_CAPTURE(api); + input->renderdoc_end_frame_capture = RENDERDOC_END_FRAME_CAPTURE(api); + input->renderdoc_set_capture_file_path_template = RENDERDOC_SET_CAPTURE_PATH_TEMPLATE(api); + } + } + } +} +#endif diff --git a/vulkan.c b/vulkan.c @@ -1,3 +1,8 @@ +/* See LICENSE for license details. */ +// TODO(rnp) +// [ ]: what is needed for HDR? I think it makes sense to just default to it nowadays +// [ ]: once opengl is removed switch images to SRGB and/or 16 bit Float + #include "beamformer_internal.h" #include "vulkan.h" #include "external/glslang/glslang/Include/glslang_c_interface.h" @@ -7,6 +12,9 @@ #define ValidVulkanHandle(h) ((h).value[0] != 0) +#define MaxCommandBuffersInFlight BeamformerMaxRawDataFramesInFlight +#define MaxCommandBufferTimestamps (64) + typedef enum { VulkanQueueKind_Graphics, VulkanQueueKind_Compute, @@ -28,17 +36,44 @@ typedef struct { void * host_pointer; VulkanMemoryKind memory_kind; + + // NOTE: only used when the buffer is backing a VulkanRenderModel. 
+ VkIndexType index_type; } VulkanBuffer; typedef struct { - VkPipeline pipeline; - VkPipelineLayout layout; -} VulkanShader; + VkDeviceMemory memory; + VkImage image; + VkImageView view; +} VulkanImage; + +typedef struct { + VkPipeline pipeline; + VkPipelineLayout layout; + VkShaderStageFlags stage_flags; +} VulkanPipeline; + +typedef struct { + VkSemaphore semaphore; + u64 value; +} VulkanSemaphore; + +typedef struct { + VulkanQueueKind kind; + u32 command_buffer_index; + + // NOTE(rnp): since there may not be QueueKind_Count queues, when putting values into this + // array you must be careful to map through the queue_indices array in the vulkan_context. + u64 in_flight_wait_values[VulkanQueueKind_Count]; +} VulkanCommandBuffer; typedef enum { VulkanEntityKind_Buffer, + VulkanEntityKind_CommandBuffer, + VulkanEntityKind_Image, + VulkanEntityKind_Pipeline, + VulkanEntityKind_RenderModel, VulkanEntityKind_Semaphore, - VulkanEntityKind_Shader, } VulkanEntityKind; typedef struct VulkanEntity VulkanEntity; @@ -46,9 +81,11 @@ struct VulkanEntity { VulkanEntity * next; VulkanEntityKind kind; union { - VulkanBuffer buffer; - VkSemaphore semaphore; - VulkanShader shader; + VulkanBuffer buffer; + VulkanCommandBuffer command_buffer; + VulkanImage image; + VulkanPipeline pipeline; + VulkanSemaphore semaphore; } as; }; @@ -59,10 +96,21 @@ typedef alignas(64) struct { u16 queue_index; VkQueue queue; - u8 _pad[48]; + VkQueryPool query_pool; + u32 query_pool_occupied[VulkanQueueKind_Count]; + + u32 next_command_buffer_index; + VkCommandPool command_pool; + VkCommandBuffer command_buffers[MaxCommandBuffersInFlight]; + u64 command_buffer_submission_values[MaxCommandBuffersInFlight]; + + VulkanSemaphore timeline_semaphore; + + VkPipelineStageFlags2 pipeline_stage_flags; + + VulkanPipeline *bound_pipeline; } VulkanQueue; -static_assert(sizeof(VulkanQueue) == 64 && alignof(VulkanQueue) == 64, - "VulkanQueue must be placed on its own cacheline"); +static_assert(alignof(VulkanQueue) == 
64, "VulkanQueue must be placed on its own cacheline"); typedef struct { Arena arena; @@ -72,8 +120,9 @@ typedef struct { VkDevice device; VkPhysicalDevice physical_device; - // NOTE(rnp): fallback for when a compute shader fails to compile - VulkanShader default_compute_shader; + // NOTE(rnp): fallback for when a shader fails to compile + VulkanPipeline default_compute_pipeline; + VulkanPipeline default_graphics_pipeline; GPUInfo gpu_info; @@ -88,6 +137,14 @@ typedef struct { } memory_info; VulkanQueue * queues[VulkanQueueKind_Count]; + // NOTE(rnp): there are a few places in the code where simply going through the queues map + // is not sufficient. those places need to know of the unique queues which unique queue + // is being referred to. that code uses this map instead. + u16 queue_indices[VulkanQueueKind_Count]; + u16 unique_queues; + + VkFormat swap_chain_image_format; + VkFormat depth_stencil_format; VulkanEntity * entity_freelist; Arena entity_arena; @@ -110,8 +167,11 @@ read_only global const char *vk_required_instance_extensions[] = { #endif #define VK_REQUIRED_DEVICE_EXTENSIONS_LIST \ + X("VK_KHR_16bit_storage") \ X("VK_KHR_external_memory") \ X("VK_KHR_external_semaphore") \ + X("VK_KHR_storage_buffer_storage_class") \ + X("VK_KHR_timeline_semaphore") \ VK_OS_REQUIRED_DEVICE_EXTENSIONS_LIST #define X(str) str, @@ -126,6 +186,42 @@ VK_REQUIRED_DEVICE_EXTENSIONS_LIST }; #undef X +#define VK_REQUIRED_PHYSICAL_FEATURES \ + X(shaderInt16) \ + X(shaderInt64) \ + +#define VK_REQUIRED_PHYSICAL_11_FEATURES \ + X(storageBuffer16BitAccess) \ + +#define VK_REQUIRED_PHYSICAL_12_FEATURES \ + X(bufferDeviceAddress) \ + X(shaderFloat16) \ + X(timelineSemaphore) \ + +#define VK_REQUIRED_PHYSICAL_13_FEATURES \ + X(dynamicRendering) \ + X(synchronization2) \ + +#define VK_DEBUG_EXTENSIONS \ + X(VK_KHR, shader_non_semantic_info) \ + X(VK_KHR, shader_relaxed_extended_instruction) \ + +#define X(p, s, ...) 
#p "_" #s, +read_only global const char *vk_debug_extensions[] = {VK_DEBUG_EXTENSIONS}; +#undef X +#define X(p, s, ...) sizeof(#p "_" #s) - 1, +read_only global u32 vk_debug_extension_name_lengths[] = {VK_DEBUG_EXTENSIONS}; +#undef X + +global union { + struct { + #define X(_, name, ...) b8 name; + VK_DEBUG_EXTENSIONS + #undef X + }; + b8 E[countof(vk_debug_extensions)]; +} vulkan_debug; + global VulkanContext vulkan_context[1]; /* NOTE(rnp): the idea here is to set reasonable development constraints. @@ -214,6 +310,15 @@ global glslang_resource_t glslc_resource_constraints[1] = {{ }, }}; + +#if BEAMFORMER_RENDERDOC_HOOKS +DEBUG_IMPORT void * +vk_renderdoc_instance_handle(void) +{ + return *((void **)vulkan_context->handle); +} +#endif + function VulkanEntity * vk_entity_allocate(VulkanEntityKind kind) { @@ -246,6 +351,16 @@ vk_entity_data(VulkanHandle h, VulkanEntityKind kind) return &e->as; } +function VkCommandBuffer +vk_command_buffer(VulkanHandle h) +{ + VulkanCommandBuffer *vcb = vk_entity_data(h, VulkanEntityKind_CommandBuffer); + VulkanQueue *vq = vulkan_context->queues[vcb->kind]; + + VkCommandBuffer result = vq->command_buffers[vcb->command_buffer_index]; + return result; +} + #define glslang_log(a, ...) 
glslang_log_(a, arg_list(s8, __VA_ARGS__)) function void glslang_log_(Arena arena, s8 *items, uz count) @@ -253,8 +368,8 @@ glslang_log_(Arena arena, s8 *items, uz count) Stream sb = arena_stream(arena); stream_append_s8(&sb, glslang_info("")); stream_append_s8s_(&sb, items, count); - s8 log = s8_trim_trailing(stream_to_s8(&sb), '\n'); - os_console_log(log.data, log.len); + if (sb.data[sb.widx - 1] != '\n') stream_append_byte(&sb, '\n'); + os_console_log(sb.data, sb.widx); } function s8 @@ -271,7 +386,7 @@ glsl_to_spirv(Arena *arena, u32 kind, s8 shader_text, s8 name) .target_language = GLSLANG_TARGET_SPV, .target_language_version = GLSLANG_TARGET_SPV_1_6, .code = (c8 *)shader_text.data, - .default_version = 100, + .default_version = 460, .default_profile = GLSLANG_NO_PROFILE, .force_default_version_and_profile = 0, .forward_compatible = 0, @@ -302,13 +417,13 @@ glsl_to_spirv(Arena *arena, u32 kind, s8 shader_text, s8 name) glslang_program_add_shader(program, shader); i32 messages = GLSLANG_MSG_DEBUG_INFO_BIT|GLSLANG_MSG_SPV_RULES_BIT|GLSLANG_MSG_VULKAN_RULES_BIT; if (glslang_program_link(program, messages)) { - glslang_spv_options_t options = { - .validate = 1, - .generate_debug_info = 1, - .emit_nonsemantic_shader_debug_info = 1, - .emit_nonsemantic_shader_debug_source = 1, - //.disable_optimizer = 1, - }; + glslang_spv_options_t options = {.validate = 1,}; + + if (vulkan_debug.shader_non_semantic_info) { + options.generate_debug_info = 1; + options.emit_nonsemantic_shader_debug_info = 1; + options.emit_nonsemantic_shader_debug_source = 1; + } glslang_program_add_source_text(program, kind, (c8 *)shader_text.data, shader_text.len); glslang_program_SPIRV_generate_with_options(program, kind, &options); @@ -342,7 +457,7 @@ vk_shader_kind_to_glslang_shader_kind(u32 kind) function VkShaderModule vk_compile_shader_module(Arena arena, u32 kind, s8 text, s8 name) { - VkShaderModule result = 0; + VkShaderModule result = {0}; s8 spirv = glsl_to_spirv(&arena, 
vk_shader_kind_to_glslang_shader_kind(kind), text, name); VkShaderModuleCreateInfo create_info = { .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, @@ -350,19 +465,45 @@ vk_compile_shader_module(Arena arena, u32 kind, s8 text, s8 name) .pCode = (u32 *)spirv.data, }; if (spirv.len > 0) vkCreateShaderModule(vulkan_context->device, &create_info, 0, &result); + + return result; +} + +function VkShaderStageFlags +vk_stage_flags_from_shader_kind(VulkanShaderKind kind) +{ + read_only local_persist VkShaderStageFlags map[VulkanShaderKind_Count + 1] = { + [VulkanShaderKind_Vertex] = VK_SHADER_STAGE_VERTEX_BIT, + [VulkanShaderKind_Mesh] = VK_SHADER_STAGE_MESH_BIT_EXT, + [VulkanShaderKind_Fragment] = VK_SHADER_STAGE_FRAGMENT_BIT, + [VulkanShaderKind_Compute] = VK_SHADER_STAGE_COMPUTE_BIT, + [VulkanShaderKind_Count] = 0, + }; + VkShaderStageFlags result = map[Clamp((u32)kind, 0, VulkanShaderKind_Count)]; return result; } -function VulkanShader -vk_compute_pipeline_from_shader_text(Arena arena, s8 text, s8 name) +function VulkanPipeline +vk_compute_pipeline_from_shader_text(Arena arena, s8 text, s8 name, u32 push_constants_size) { - VulkanShader result = {0}; + VulkanPipeline result = {.stage_flags = VK_SHADER_STAGE_COMPUTE_BIT}; VkShaderModule module = vk_compile_shader_module(arena, VK_SHADER_STAGE_COMPUTE_BIT, text, name); if (module) { - VkPipelineLayoutCreateInfo pli = {.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO}; - vkCreatePipelineLayout(vulkan_context->device, &pli, 0, &result.layout); + VkPushConstantRange push_constant_range = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = push_constants_size, + }; + + VkPipelineLayoutCreateInfo pipeline_layout_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pushConstantRangeCount = push_constants_size ? 1 : 0, + .pPushConstantRanges = push_constants_size ? 
&push_constant_range : 0, + }; + + vkCreatePipelineLayout(vulkan_context->device, &pipeline_layout_create_info, 0, &result.layout); - VkComputePipelineCreateInfo pi = { + VkComputePipelineCreateInfo pipeline_create_info = { .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, .layout = result.layout, .stage = { @@ -373,10 +514,376 @@ vk_compute_pipeline_from_shader_text(Arena arena, s8 text, s8 name) }, }; - vkCreateComputePipelines(vulkan_context->device, 0, 1, &pi, 0, &result.pipeline); + vkCreateComputePipelines(vulkan_context->device, 0, 1, &pipeline_create_info, 0, &result.pipeline); vkDestroyShaderModule(vulkan_context->device, module, 0); } + if (result.pipeline == 0) result = vulkan_context->default_compute_pipeline; + + return result; +} + +function VulkanPipeline +vk_graphics_pipeline_from_infos(Arena arena, VulkanPipelineCreateInfo *infos, u32 count, u32 push_constants_size) +{ + assume(count == 2); + + VulkanPipeline result = {0}; + VkShaderModule modules[2]; + + modules[0] = vk_compile_shader_module(arena, vk_stage_flags_from_shader_kind(infos[0].kind), + infos[0].text, infos[0].name); + modules[1] = vk_compile_shader_module(arena, vk_stage_flags_from_shader_kind(infos[1].kind), + infos[1].text, infos[1].name); + if (modules[0] && modules[1]) { + result.stage_flags = vk_stage_flags_from_shader_kind(infos[0].kind) + | vk_stage_flags_from_shader_kind(infos[1].kind); + + VkPushConstantRange pcr = { + .stageFlags = result.stage_flags, + .offset = 0, + .size = push_constants_size, + }; + + VkPipelineLayoutCreateInfo pipeline_layout_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pushConstantRangeCount = push_constants_size ? 1 : 0, + .pPushConstantRanges = push_constants_size ? 
&pcr : 0, + }; + + vkCreatePipelineLayout(vulkan_context->device, &pipeline_layout_info, 0, &result.layout); + + VkPipelineShaderStageCreateInfo shader_stage_create_infos[2] = { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = vk_stage_flags_from_shader_kind(infos[0].kind), + .module = modules[0], + .pName = "main", + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = vk_stage_flags_from_shader_kind(infos[1].kind), + .module = modules[1], + .pName = "main", + }, + }; + + VkPipelineVertexInputStateCreateInfo vertex_input_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + }; + + VkPipelineInputAssemblyStateCreateInfo input_assembly_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, + }; + + VkPipelineViewportStateCreateInfo viewport_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 1, + .scissorCount = 1, + }; + + VkPipelineRasterizationStateCreateInfo rasterization_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .polygonMode = VK_POLYGON_MODE_FILL, + .lineWidth = 1.0f, + .cullMode = VK_CULL_MODE_BACK_BIT, + .frontFace = VK_FRONT_FACE_CLOCKWISE, + }; + + VkPipelineMultisampleStateCreateInfo multisampling_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = vulkan_context->gpu_info.max_msaa_samples, + }; + + VkPipelineDepthStencilStateCreateInfo depth_test_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = 1, + .depthWriteEnable = 1, + .depthCompareOp = VK_COMPARE_OP_LESS, + .depthBoundsTestEnable = 1, + .stencilTestEnable = 0, + .front = {0}, + .back = {0}, + .minDepthBounds = 0.0f, + .maxDepthBounds = 1.0f, + }; + + u32 colour_mask = 
VK_COLOR_COMPONENT_R_BIT|VK_COLOR_COMPONENT_G_BIT|VK_COLOR_COMPONENT_B_BIT|VK_COLOR_COMPONENT_A_BIT; + VkPipelineColorBlendAttachmentState blend_state = { + .colorWriteMask = colour_mask, + .blendEnable = 1, + .srcColorBlendFactor = VK_BLEND_FACTOR_SRC_ALPHA, + .dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE, + .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .alphaBlendOp = VK_BLEND_OP_ADD, + }; + + VkPipelineColorBlendStateCreateInfo colour_blend_state_create = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .logicOpEnable = 0, + .logicOp = VK_LOGIC_OP_COPY, + .attachmentCount = 1, + .pAttachments = &blend_state, + }; + + VkDynamicState dynamic_states[] = { + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + }; + + VkPipelineDynamicStateCreateInfo dynamic_state_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = countof(dynamic_states), + .pDynamicStates = dynamic_states, + }; + + //VkFormat colour_attachment_format = VK_FORMAT_R8G8B8A8_SRGB; + VkFormat colour_attachment_format = VK_FORMAT_R8G8B8A8_UNORM; + VkPipelineRenderingCreateInfo rendering_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO, + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &colour_attachment_format, + .depthAttachmentFormat = vulkan_context->depth_stencil_format, + .stencilAttachmentFormat = vulkan_context->depth_stencil_format, + }; + + VkGraphicsPipelineCreateInfo pci = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = &rendering_create_info, + .stageCount = countof(shader_stage_create_infos), + .pStages = shader_stage_create_infos, + .pVertexInputState = &vertex_input_info, + .pInputAssemblyState = &input_assembly_info, + .pViewportState = &viewport_info, + .pRasterizationState = &rasterization_info, + .pMultisampleState = &multisampling_info, + .pDepthStencilState = 
&depth_test_create_info, + .pColorBlendState = &colour_blend_state_create, + .pDynamicState = &dynamic_state_info, + .layout = result.layout, + }; + + vkCreateGraphicsPipelines(vulkan_context->device, 0, 1, &pci,0, &result.pipeline); + } + + if (modules[0]) vkDestroyShaderModule(vulkan_context->device, modules[0], 0); + if (modules[1]) vkDestroyShaderModule(vulkan_context->device, modules[1], 0); + + if (result.pipeline == 0) result = vulkan_context->default_graphics_pipeline; + + return result; +} + +function VulkanSemaphore +vk_make_semaphore(OSHandle *export) +{ + VulkanContext *vk = vulkan_context; + + VkSemaphoreCreateInfo sci = {.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO}; + VkExportSemaphoreCreateInfo esci = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, + .handleTypes = OS_WINDOWS ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, + }; + VkSemaphoreTypeCreateInfo stc = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, + .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE, + }; + + if (export) sci.pNext = &esci; + else sci.pNext = &stc; + + VulkanSemaphore result = {0}; + + vkCreateSemaphore(vk->device, &sci, 0, &result.semaphore); + + if (export) { + if (OS_WINDOWS) { + VkSemaphoreGetWin32HandleInfoKHR ghi = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT, + .semaphore = result.semaphore, + }; + void *handle; + vkGetSemaphoreWin32HandleKHR(vk->device, &ghi, &handle); + export->value[0] = (u64)handle; + } else { + VkSemaphoreGetFdInfoKHR ghi = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR, + .handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, + .semaphore = result.semaphore, + }; + i32 handle; + vkGetSemaphoreFdKHR(vk->device, &ghi, &handle); + export->value[0] = (u64)handle; + } + } + + return result; +} + +function void +vk_release_memory(VkDeviceMemory memory, u64 size) +{ 
+ VulkanContext *vk = vulkan_context; + vkFreeMemory(vk->device, memory, 0); + atomic_add_u64(&vk->gpu_info.gpu_heap_used, -size); +} + +function b32 +vk_allocate_memory(VkDeviceMemory *memory, u64 size, VulkanMemoryKind kind, VkMemoryAllocateFlags flags, + VkMemoryDedicatedAllocateInfo *dedicated_allocate_info, OSHandle *export) +{ + VulkanContext *vk = vulkan_context; + + VkExportMemoryAllocateInfo export_info = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, + .handleTypes = OS_WINDOWS ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, + }; + + VkMemoryAllocateFlagsInfo memory_allocate_flags_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, + .flags = flags, + .pNext = dedicated_allocate_info, + }; + + if (export) { + export_info.pNext = dedicated_allocate_info; + memory_allocate_flags_info.pNext = &export_info; + } + + VkMemoryAllocateInfo memory_allocate_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = size, + .memoryTypeIndex = vk->memory_info.memory_type_indices[kind], + .pNext = &memory_allocate_flags_info, + }; + + b32 result = vkAllocateMemory(vk->device, &memory_allocate_info, 0, memory) == VK_SUCCESS; + if (result) { + atomic_add_u64(&vk->gpu_info.gpu_heap_used, memory_allocate_info.allocationSize); + + if (export) { + if (OS_WINDOWS) { + VkMemoryGetWin32HandleInfoKHR handle_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, + .memory = *memory, + .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT, + }; + void *handle; + vkGetMemoryWin32HandleKHR(vk->device, &handle_info, &handle); + export->value[0] = (u64)handle; + } else { + VkMemoryGetFdInfoKHR fd_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, + .memory = *memory, + .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, + }; + i32 fd; + vkGetMemoryFdKHR(vk->device, &fd_info, &fd); + export->value[0] = (u64)fd; + } + } + } + return result; +} 
+ +function u32 +vk_index_size(VkIndexType type) +{ + u32 result = 0; + switch (type) { + case VK_INDEX_TYPE_UINT16:{ result = 2; }break; + case VK_INDEX_TYPE_UINT32:{ result = 4; }break; + InvalidDefaultCase; + } + return result; +} + +typedef struct { + GPUBuffer *gpu_buffer; + u64 size; + VulkanUsageFlags flags; + u32 queue_family_count; + u32 queue_family_indices[VulkanTimeline_Count]; + VkIndexType index_type; + s8 label; +} VulkanBufferAllocateInfo; + +function b32 +vk_buffer_allocate_common(VulkanBuffer *vb, VulkanBufferAllocateInfo *ai) +{ + VulkanContext *vk = vulkan_context; + + // TODO(rnp): this probably should be handled, its usually 4GB. likely + // need to chain multiple allocations and handle it in shader code + u64 size = Min(ai->size, vk->memory_info.max_allocation_size & ~(vk->memory_info.non_coherent_atom_size - 1)); + + VkBufferCreateInfo buffer_create_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .usage = VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + .size = size, + .sharingMode = ai->queue_family_count > 1 ? 
VK_SHARING_MODE_CONCURRENT : VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = ai->queue_family_count, + .pQueueFamilyIndices = ai->queue_family_indices, + }; + + if (ai->flags & VulkanUsageFlag_TransferSource) + buffer_create_info.usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT; + + if (ai->flags & VulkanUsageFlag_TransferDestination) + buffer_create_info.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT; + + if (ai->index_type != VK_INDEX_TYPE_NONE_KHR) + buffer_create_info.usage |= VK_BUFFER_USAGE_INDEX_BUFFER_BIT; + + vkCreateBuffer(vk->device, &buffer_create_info, 0, &vb->buffer); + + VkMemoryRequirements memory_requirements; + vkGetBufferMemoryRequirements(vk->device, vb->buffer, &memory_requirements); + + assert((u64)size <= memory_requirements.size); + size = memory_requirements.size; + + VkMemoryDedicatedAllocateInfo dedicated_allocate_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, + .buffer = vb->buffer, + }; + + /* NOTE(rnp): to create a CPU writable buffer: + * 1. try to allocate and map the entire buffer + * - this may fail if the buffer is bigger than the BAR size + * (unknowable from vulkan), or the memory space has become + * too fragmented (unlikely) + * 2. if allocation or mapping fails we must chain a host buffer + * for staging. If this happens in practice we should add + * the ability to import an existing external allocation + */ + b32 host_read_write = (ai->flags & VulkanUsageFlag_HostReadWrite) != 0; + vb->memory_kind = host_read_write ? 
VulkanMemoryKind_BAR : VulkanMemoryKind_Device; + + b32 result = 0; + // TODO(rnp): this may fail if the allocation is too big for the BAR size + // it needs to handled properly + if (vk_allocate_memory(&vb->memory, size, vb->memory_kind, VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT, &dedicated_allocate_info, 0)) { + result = 1; + ai->gpu_buffer->size = size; + + vb->index_type = ai->index_type; + if (host_read_write) + vkMapMemory(vk->device, vb->memory, 0, size, 0, &vb->host_pointer); + + vkBindBufferMemory(vk->device, vb->buffer, vb->memory, 0); + VkBufferDeviceAddressInfo buffer_device_address_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO, + .buffer = vb->buffer, + }; + ai->gpu_buffer->gpu_pointer = vkGetBufferDeviceAddress(vk->device, &buffer_device_address_info); + } return result; } @@ -398,7 +905,9 @@ vk_load_instance(void) /* TODO(rnp): debug only, and check for these before enabling */ const char *validation_layers[] = { + #if BEAMFORMER_DEBUG "VK_LAYER_KHRONOS_validation", + #endif }; VkInstanceCreateInfo instance_create_info = { @@ -410,6 +919,23 @@ vk_load_instance(void) .enabledLayerCount = countof(validation_layers), }; + #if 0 && BEAMFORMER_DEBUG + VkValidationFeatureEnableEXT validation_feature_enables[] = { + VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT, + VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT, + VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT, + VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT, + }; + + VkValidationFeaturesEXT validation_features = { + .sType = VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT, + .enabledValidationFeatureCount = countof(validation_feature_enables), + .pEnabledValidationFeatures = validation_feature_enables, + }; + + instance_create_info.pNext = &validation_features; + #endif + vkCreateInstance(&instance_create_info, 0, &vulkan_context->handle); #define X(name, ...) 
name = (name##_fn *)vkGetInstanceProcAddr(vulkan_context->handle, #name); @@ -451,7 +977,7 @@ vk_load_physical_device(Arena arena, Stream *err) VkPhysicalDeviceProperties2 dp = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2}; VkPhysicalDeviceVulkan11Properties v11p = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES}; - dp.pNext= &v11p; + dp.pNext = &v11p; vkGetPhysicalDeviceProperties2(vk->physical_device, &dp); @@ -498,6 +1024,89 @@ vk_load_physical_device(Arena arena, Stream *err) } fatal(stream_to_s8(err)); } + + #if BEAMFORMER_DEBUG + for (u32 index = 0; index < extension_count; index++) { + for EachElement(vk_debug_extensions, it) { + s8 test = { + .data = (u8 *)vk_debug_extensions[it], + .len = vk_debug_extension_name_lengths[it], + }; + vulkan_debug.E[it] |= s8_equal(test, ext_str8s[index]); + } + } + #endif + } + + { + VkPhysicalDeviceFeatures2 df = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2}; + VkPhysicalDeviceVulkan11Features v11f = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES}; + VkPhysicalDeviceVulkan12Features v12f = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES}; + VkPhysicalDeviceVulkan13Features v13f = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES}; + df.pNext = &v11f; + v11f.pNext = &v12f; + v12f.pNext = &v13f; + vkGetPhysicalDeviceFeatures2(vk->physical_device, &df); + + { + b32 all_supported = 1; + #define X(name, ...) all_supported &= df.features.name; + VK_REQUIRED_PHYSICAL_FEATURES + #undef X + + if (!all_supported) { + stream_append_s8(err, vulkan_info("fatal error: missing physical device features:\n")); + #define X(name, ...) if (!df.features.name) stream_append_s8(err, s8(" " #name "\n")); + VK_REQUIRED_PHYSICAL_FEATURES + #undef X + fatal(stream_to_s8(err)); + } + } + + { + b32 all_supported = 1; + #define X(name, ...) 
all_supported &= v11f.name; + VK_REQUIRED_PHYSICAL_11_FEATURES + #undef X + + if (!all_supported) { + stream_append_s8(err, vulkan_info("fatal error: missing physical device features:\n")); + #define X(name, ...) if (!v11f.name) stream_append_s8(err, s8(" " #name "\n")); + VK_REQUIRED_PHYSICAL_11_FEATURES + #undef X + fatal(stream_to_s8(err)); + } + } + + { + b32 all_supported = 1; + #define X(name, ...) all_supported &= v12f.name; + VK_REQUIRED_PHYSICAL_12_FEATURES + #undef X + + if (!all_supported) { + stream_append_s8(err, vulkan_info("fatal error: missing physical device features:\n")); + #define X(name, ...) if (!v12f.name) stream_append_s8(err, s8(" " #name "\n")); + VK_REQUIRED_PHYSICAL_12_FEATURES + #undef X + fatal(stream_to_s8(err)); + } + } + + { + b32 all_supported = 1; + #define X(name, ...) all_supported &= v13f.name; + VK_REQUIRED_PHYSICAL_13_FEATURES + #undef X + + if (!all_supported) { + stream_append_s8(err, vulkan_info("fatal error: missing physical device features:\n")); + #define X(name, ...) 
if (!v13f.name) stream_append_s8(err, s8(" " #name "\n")); + VK_REQUIRED_PHYSICAL_13_FEATURES + #undef X + fatal(stream_to_s8(err)); + } + } } VkPhysicalDeviceMemoryProperties2 mp = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2}; @@ -686,27 +1295,30 @@ vk_load_queues(Arena *memory, Stream *err) assigned_subindices[VulkanQueueKind_Transfer] += 1; } - u32 unique_queues = 0; for EachElement(assigned_subindices, it) - unique_queues += assigned_subindices[it]; + vk->unique_queues += assigned_subindices[it]; end_temp_arena(arena_save); ///////////////////////////////////////////// // NOTE(rnp): fill in info and create device - - VulkanQueue *qs = push_array(memory, VulkanQueue, unique_queues); for EachElement(vk->queues, it) { u32 index = queue_subindices[it]; for (i32 i = 0; i < queue_indices[it]; i++) index += assigned_subindices[i]; - - vk->queues[it] = qs + index; - qs[index].queue_family = queue_indices[it]; - qs[index].queue_index = queue_subindices[it]; + vk->queue_indices[it] = index; } - VkDeviceQueueCreateInfo queue_create_infos[VulkanQueueKind_Count]; + for EachElement(vk->queues, it) { + if (vk->queues[vk->queue_indices[it]] == 0) { + vk->queues[vk->queue_indices[it]] = push_struct(memory, VulkanQueue); + vk->queues[vk->queue_indices[it]]->queue_family = queue_indices[it]; + vk->queues[vk->queue_indices[it]]->queue_index = queue_subindices[it]; + } + vk->queues[it] = vk->queues[vk->queue_indices[it]]; + } + + VkDeviceQueueCreateInfo queue_create_infos[VulkanQueueKind_Count]; f32 queue_priorities[VulkanQueueKind_Count][VulkanQueueKind_Count]; for (u32 i = 0; i < VulkanQueueKind_Count; i++) @@ -716,7 +1328,7 @@ vk_load_queues(Arena *memory, Stream *err) u32 queue_create_index = 0; b32 queue_info_filled[VulkanQueueKind_Count] = {0}; - for (u32 q = 0; q < unique_queues; q++) { + for (u32 q = 0; q < vk->unique_queues; q++) { u32 base_q = queue_indices[q]; if (!queue_info_filled[base_q]) { queue_create_infos[queue_create_index++] = 
(VkDeviceQueueCreateInfo){ @@ -729,14 +1341,63 @@ vk_load_queues(Arena *memory, Stream *err) queue_info_filled[base_q] = 1; } - VkPhysicalDeviceFeatures device_features = {0}; + VkPhysicalDeviceVulkan13Features v13f = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES, + #define X(name, ...) .name = 1, + VK_REQUIRED_PHYSICAL_13_FEATURES + #undef X + }; + + VkPhysicalDeviceShaderRelaxedExtendedInstructionFeaturesKHR pdsre = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_RELAXED_EXTENDED_INSTRUCTION_FEATURES_KHR, + .shaderRelaxedExtendedInstruction = 1, + }; + if (vulkan_debug.shader_relaxed_extended_instruction) v13f.pNext = &pdsre; + + VkPhysicalDeviceVulkan12Features v12f = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES, + .pNext = &v13f, + #define X(name, ...) .name = 1, + VK_REQUIRED_PHYSICAL_12_FEATURES + #undef X + }; + + VkPhysicalDeviceVulkan11Features v11f = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES, + .pNext = &v12f, + #define X(name, ...) .name = 1, + VK_REQUIRED_PHYSICAL_11_FEATURES + #undef X + }; + VkPhysicalDeviceFeatures2 device_features = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, + .pNext = &v11f, + .features = { + #define X(name, ...) 
.name = 1, + VK_REQUIRED_PHYSICAL_FEATURES + #undef X + }, + }; + + Arena arena = *memory; + u32 enabled_count = countof(vk_required_device_extensions) + countof(vk_debug_extensions); + const char **enabled_extensions = push_array(&arena, const char *, enabled_count); + + enabled_count = 0; + for EachElement(vk_required_device_extensions, it) + enabled_extensions[enabled_count++] = vk_required_device_extensions[it]; + + for EachElement(vk_debug_extensions, it) + if (vulkan_debug.E[it]) + enabled_extensions[enabled_count++] = vk_debug_extensions[it]; + VkDeviceCreateInfo device_create_info = { .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + .pNext = &device_features, .pQueueCreateInfos = queue_create_infos, .queueCreateInfoCount = queue_create_index, - .pEnabledFeatures = &device_features, - .ppEnabledExtensionNames = vk_required_device_extensions, - .enabledExtensionCount = countof(vk_required_device_extensions), + .ppEnabledExtensionNames = enabled_extensions, + .enabledExtensionCount = enabled_count, }; vkCreateDevice(vk->physical_device, &device_create_info, 0, &vk->device); @@ -744,9 +1405,69 @@ vk_load_queues(Arena *memory, Stream *err) VkDeviceProcedureList #undef X - for (u32 q = 0; q < unique_queues; q++) { + for (u32 q = 0; q < vk->unique_queues; q++) { VulkanQueue *qp = vk->queues[q]; vkGetDeviceQueue(vk->device, qp->queue_family, qp->queue_index, &qp->queue); + + VkCommandPoolCreateInfo command_pool_create_info = { + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = qp->queue_family, + }; + vkCreateCommandPool(vk->device, &command_pool_create_info, 0, &qp->command_pool); + + VkCommandBufferAllocateInfo command_buffer_allocate_info = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = qp->command_pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = countof(qp->command_buffers), + }; + vkAllocateCommandBuffers(vk->device, 
&command_buffer_allocate_info, qp->command_buffers); + + qp->timeline_semaphore = vk_make_semaphore(0); + + VkQueryPoolCreateInfo query_pool_create_info = { + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .queryType = VK_QUERY_TYPE_TIMESTAMP, + .queryCount = countof(qp->command_buffers) * MaxCommandBufferTimestamps, + }; + vkCreateQueryPool(vk->device, &query_pool_create_info, 0, &qp->query_pool); + } + + vk->queues[VulkanQueueKind_Graphics]->pipeline_stage_flags |= VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT; + vk->queues[VulkanQueueKind_Compute]->pipeline_stage_flags |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT; +} + +function void +vk_load_graphics(void) +{ + VulkanContext *vk = vulkan_context; + + // NOTE: swap chain image format + { + } + + // NOTE: depth/stencil format + { + VkFormat depth_formats[] = { + VK_FORMAT_D32_SFLOAT_S8_UINT, + VK_FORMAT_D24_UNORM_S8_UINT, + VK_FORMAT_D16_UNORM_S8_UINT, + }; + + vk->depth_stencil_format = VK_FORMAT_UNDEFINED; + for EachElement(depth_formats, it) { + VkFormatProperties3 format_properties3 = {.sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3}; + VkFormatProperties2 format_properties2 = { + .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2, + .pNext = &format_properties3, + }; + vkGetPhysicalDeviceFormatProperties2(vk->physical_device, depth_formats[it], &format_properties2); + if (format_properties3.optimalTilingFeatures & VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT) { + vk->depth_stencil_format = depth_formats[it]; + break; + } + } } } @@ -772,17 +1493,43 @@ vk_load(OSLibrary vulkan_library_handle, Arena *memory, Stream *err) vk_load_instance(); vk_load_physical_device(vulkan_context->arena, err); vk_load_queues(&vulkan_context->arena, err); + vk_load_graphics(); - // TODO: setup compute pipeline read_only local_persist s8 default_compute_shader = s8("" "#version 430 core\n" + "layout(push_constant) uniform pc { uint data[256 / 4]; };\n" "void main() {}\n" "\n"); + vk->default_compute_pipeline = 
vk_compute_pipeline_from_shader_text(vk->arena, default_compute_shader, + s8("error_compute_shader"), 256); + + read_only local_persist s8 default_vertex_shader = s8("" + "#version 430 core\n" + "layout(push_constant) uniform pc { uint data[256 / 4]; };\n" + "void main() {gl_Position = vec4(0);}\n" + "\n"); + read_only local_persist s8 default_fragment_shader = s8("" + "#version 430 core\n" + "layout(location = 0) out vec4 out_colour;" + "layout(push_constant) uniform pc { uint data[256 / 4]; };\n" + "void main() {out_colour = vec4(0.5f, 0.0f, 0.5f, 1.0f);}\n" + "\n"); - vk->default_compute_shader = vk_compute_pipeline_from_shader_text(vk->arena, default_compute_shader, - s8("error_compute_shader")); + VulkanPipelineCreateInfo pipeline_create_infos[2] = { + { + .kind = VulkanShaderKind_Vertex, + .text = default_vertex_shader, + .name = s8("error_vertex_shader"), + }, + { + .kind = VulkanShaderKind_Fragment, + .text = default_fragment_shader, + .name = s8("error_fragment_shader"), + }, + }; + vk->default_graphics_pipeline = vk_graphics_pipeline_from_infos(vk->arena, pipeline_create_infos, 2, 256); - // TODO: setup render pipeline + // TODO: setup ui render pipeline if (err->widx > 0) { os_console_log(err->data, err->widx); @@ -796,110 +1543,63 @@ vk_gpu_info(void) return &vulkan_context->gpu_info; } -DEBUG_IMPORT void -vk_buffer_release(GPUBuffer *b) +function void +vk_vulkan_buffer_release(VulkanBuffer *vb, u64 size) { VulkanContext *vk = vulkan_context; - if ValidVulkanHandle(b->buffer) { - VulkanBuffer *vb = vk_entity_data(b->buffer, VulkanEntityKind_Buffer); - // TODO(rnp): this happens implicitly, probably just delete this if block - if (vb->host_pointer) - vkUnmapMemory(vk->device, vb->memory); + VulkanEntity *e = (VulkanEntity *)((u8 *)vb - offsetof(VulkanEntity, as)); + // TODO(rnp): this happens implicitly, probably just delete this if block + if (vb->host_pointer) + vkUnmapMemory(vk->device, vb->memory); - if (vb->buffer) - vkDestroyBuffer(vk->device, 
vb->buffer, 0); + if (vb->buffer) + vkDestroyBuffer(vk->device, vb->buffer, 0); - vkFreeMemory(vk->device, vb->memory, 0); - if (vb->memory_kind != VulkanMemoryKind_Host) - vk->gpu_info.gpu_heap_used -= b->size; + vk_release_memory(vb->memory, vb->memory_kind != VulkanMemoryKind_Host ? size : 0); + vk_entity_release(e); +} - vk_entity_release((VulkanEntity *)b->buffer.value[0]); - } +DEBUG_IMPORT void +vk_buffer_release(GPUBuffer *b) +{ + if ValidVulkanHandle(b->buffer) + vk_vulkan_buffer_release(vk_entity_data(b->buffer, VulkanEntityKind_Buffer), b->size); zero_struct(b); } DEBUG_IMPORT void -vk_buffer_allocate(GPUBuffer *b, iz size, GPUBufferCreateFlags flags, OSHandle *export, s8 label) +vk_buffer_allocate(GPUBuffer *b, GPUBufferAllocateInfo *info) { - vk_buffer_release(b); VulkanContext *vk = vulkan_context; - VulkanEntity *e = vk_entity_allocate(VulkanEntityKind_Buffer); - VulkanBuffer *vb = &e->as.buffer; - - b->buffer.value[0] = (u64)e; - - assert(size > 0); - - // TODO(rnp): this probably should be handled, its usually 4GB. likely - // need to chain multiple allocations and handle it in shader code - assert((u64)size <= vk->memory_info.max_allocation_size); - size = (iz)Min((u64)size, vk->memory_info.max_allocation_size); - - u64 remaining = vk->gpu_info.gpu_heap_size - vk->gpu_info.gpu_heap_used; - VkExportMemoryAllocateInfo ei = { - .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, - .handleTypes = OS_WINDOWS ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, - }; + vk_buffer_release(b); - VkMemoryAllocateFlagsInfo mafi = { - .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, - //.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT, - .pNext = (export) ? & ei: 0, - }; + assert(info->size > 0); - /* NOTE(rnp): to create a CPU writable buffer: - * 1. 
try to allocate and map the entire buffer - * - this may fail if the buffer is bigger than the BAR size - * (unknowable from vulkan), or the memory space has become - * too fragmented (unlikely) - * 2. if allocation or mapping fails we must chain a host buffer - * for staging. If this happens in practice we should add - * the ability to import an existing external allocation - */ - vb->memory_kind = flags & GPUBufferCreateFlags_HostWritable ? VulkanMemoryKind_BAR : VulkanMemoryKind_Device; - VkMemoryAllocateInfo mai = { - .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, - .allocationSize = Min((u64)size, remaining), - .memoryTypeIndex = vk->memory_info.memory_type_indices[vb->memory_kind], - .pNext = &mafi, + VulkanEntity *e = vk_entity_allocate(VulkanEntityKind_Buffer); + VulkanBufferAllocateInfo vulkan_buffer_allocate_info = { + .gpu_buffer = b, + .size = (u64)info->size, + .flags = info->flags, + .index_type = VK_INDEX_TYPE_NONE_KHR, + .label = info->label, }; - // TODO(rnp): this may fail if the allocation is too big for the BAR size - // it needs to handled properly - if (vkAllocateMemory(vk->device, &mai, 0, &vb->memory) == VK_SUCCESS) { - vk->gpu_info.gpu_heap_used += mai.allocationSize; - b->size = mai.allocationSize; - - if (flags & GPUBufferCreateFlags_HostWritable) - vkMapMemory(vk->device, vb->memory, 0, b->size, 0, &vb->host_pointer); + u32 queue_index_hit_count[VulkanQueueKind_Count] = {0}; + for (u32 it = 0; it < info->timeline_count; it++) + queue_index_hit_count[vk->queue_indices[info->timelines_used[it]]]++; - if (export) { - if (OS_WINDOWS) { - VkMemoryGetWin32HandleInfoKHR handle_info = { - .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, - .memory = vb->memory, - .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT, - }; - void *handle; - vkGetMemoryWin32HandleKHR(vk->device, &handle_info, &handle); - export->value[0] = (u64)handle; - } else { - VkMemoryGetFdInfoKHR fd_info = { - .sType = 
VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, - .memory = vb->memory, - .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, - }; - i32 fd; - vkGetMemoryFdKHR(vk->device, &fd_info, &fd); - export->value[0] = (u64)fd; - } + for EachElement(queue_index_hit_count, it) { + if (queue_index_hit_count[it] > 0) { + u32 index = vulkan_buffer_allocate_info.queue_family_count++; + vulkan_buffer_allocate_info.queue_family_indices[index] = vk->queues[vk->queue_indices[it]]->queue_family; } } - if ((flags & GPUBufferCreateFlags_MemoryOnly) == 0) { - // TODO(rnp): create and bind memory to buffer + if (vk_buffer_allocate_common(&e->as.buffer, &vulkan_buffer_allocate_info)) { + b->buffer.value[0] = (u64)e; + } else { + vk_entity_release(e); } } @@ -925,108 +1625,854 @@ vk_round_up_to_sync_size(u64 size, u64 min) return result; } -DEBUG_IMPORT void -vk_buffer_range_upload(GPUBuffer *b, void *data, u64 offset, u64 size, b32 non_temporal) +function force_inline void +vk_buffer_buffer_copy(VulkanBuffer *destination, VulkanBuffer *source, u64 destination_offset, u64 source_offset, u64 size, b32 non_temporal) { VulkanContext *vk = vulkan_context; - VulkanBuffer *vb = vk_entity_data(b->buffer, VulkanEntityKind_Buffer); - switch (vb->memory_kind) { - case VulkanMemoryKind_Host: + switch (source->memory_kind) { case VulkanMemoryKind_BAR: { - assert(vb->host_pointer); - void *dest = (u8 *)vb->host_pointer + offset; - // NOTE(rnp): don't trash the CPU cache for large data stores - if (non_temporal) memory_copy_non_temporal(dest, data, size); - else mem_copy(dest, data, size); - - b32 coherent = vk->memory_info.memory_host_coherent[vb->memory_kind]; - if (!coherent) { - u64 nca_size = vk->memory_info.non_coherent_atom_size; - VkMappedMemoryRange mrs[1] = {{ - .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, - .memory = vb->memory, - .offset = offset - (offset % nca_size), - .size = vk_round_up_to_sync_size(size, nca_size), - }}; - vkFlushMappedMemoryRanges(vk->device, countof(mrs), mrs); + 
switch (destination->memory_kind) { + case VulkanMemoryKind_Host:{ + if (destination->memory) { + // TODO(rnp): there is likely a more efficient way of doing this in this case + InvalidCodePath; + } else { + assert(source->host_pointer); + b32 coherent = vk->memory_info.memory_host_coherent[source->memory_kind]; + if (!coherent) { + u64 nca_size = vk->memory_info.non_coherent_atom_size; + VkMappedMemoryRange mrs[1] = {{ + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = source->memory, + .offset = source_offset - (source_offset % nca_size), + .size = vk_round_up_to_sync_size(size, nca_size), + }}; + vkInvalidateMappedMemoryRanges(vk->device, countof(mrs), mrs); + } + + void *dest = (u8 *)destination->host_pointer + destination_offset; + void *src = (u8 *)source->host_pointer + source_offset; + + // NOTE(rnp): don't trash the CPU cache for large data stores + if (non_temporal) memory_copy_non_temporal(dest, src, size); + else mem_copy(dest, src, size); + } + }break; + InvalidDefaultCase; + } + }break; + + case VulkanMemoryKind_Host:{ + switch (destination->memory_kind) { + case VulkanMemoryKind_BAR:{ + assert(destination->host_pointer); + + void *dest = (u8 *)destination->host_pointer + destination_offset; + void *src = (u8 *)source->host_pointer + source_offset; + + // NOTE(rnp): don't trash the CPU cache for large data stores + if (non_temporal) memory_copy_non_temporal(dest, src, size); + else mem_copy(dest, src, size); + + b32 coherent = vk->memory_info.memory_host_coherent[destination->memory_kind]; + if (!coherent) { + u64 nca_size = vk->memory_info.non_coherent_atom_size; + VkMappedMemoryRange mrs[1] = {{ + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = destination->memory, + .offset = destination_offset - (destination_offset % nca_size), + .size = vk_round_up_to_sync_size(size, nca_size), + }}; + vkFlushMappedMemoryRanges(vk->device, countof(mrs), mrs); + } + }break; + InvalidDefaultCase; + } }break; + // TODO(rnp): use transfer queue 
when not mapped InvalidDefaultCase; } } -DEBUG_IMPORT VulkanHandle -vk_semaphore_create(OSHandle *export) +DEBUG_IMPORT void +vk_buffer_range_upload(GPUBuffer *b, void *data, u64 offset, u64 size, b32 non_temporal) { + VulkanBuffer *db = vk_entity_data(b->buffer, VulkanEntityKind_Buffer); + VulkanBuffer sb = { + .host_pointer = data, + .memory_kind = VulkanMemoryKind_Host, + }; + vk_buffer_buffer_copy(db, &sb, offset, 0, size, non_temporal); +} + +DEBUG_IMPORT void +vk_buffer_range_download(void *destination, GPUBuffer *source, u64 offset, u64 size, b32 non_temporal) +{ + VulkanBuffer *sb = vk_entity_data(source->buffer, VulkanEntityKind_Buffer); + VulkanBuffer db = { + .host_pointer = destination, + .memory_kind = VulkanMemoryKind_Host, + }; + vk_buffer_buffer_copy(&db, sb, 0, offset, size, non_temporal); +} + +DEBUG_IMPORT void +vk_render_model_release(GPUBuffer *model) +{ + if ValidVulkanHandle(model->buffer) + vk_vulkan_buffer_release(vk_entity_data(model->buffer, VulkanEntityKind_RenderModel), model->size); + zero_struct(model); +} + +DEBUG_IMPORT void +vk_render_model_allocate(GPUBuffer *model, void *indices, u64 index_count, u64 model_size, s8 label) +{ + vk_render_model_release(model); + + VulkanEntity *e = vk_entity_allocate(VulkanEntityKind_RenderModel); + + assert(index_count <= U32_MAX); + VkIndexType index_type; + if (index_count <= U16_MAX) index_type = VK_INDEX_TYPE_UINT16; + else index_type = VK_INDEX_TYPE_UINT32; + + i64 indices_size = round_up_to(vk_index_size(index_type) * index_count, 64); + + i64 size = round_up_to(model_size + indices_size, 64); + assert(size > 0); + + VulkanBufferAllocateInfo vulkan_buffer_allocate_info = { + .gpu_buffer = model, + .size = (u64)size, + .flags = VulkanUsageFlag_HostReadWrite, + .index_type = index_type, + .label = label, + .queue_family_count = 1, + .queue_family_indices[0] = vulkan_context->queues[VulkanQueueKind_Graphics]->queue_family, + }; + if (vk_buffer_allocate_common(&e->as.buffer, 
&vulkan_buffer_allocate_info)) { + model->buffer.value[0] = (u64)e; + model->index_count = index_count; + model->gpu_pointer += indices_size; + + VulkanBuffer sb = { + .host_pointer = indices, + .memory_kind = VulkanMemoryKind_Host, + }; + + vk_buffer_buffer_copy(&e->as.buffer, &sb, 0, 0, vk_index_size(index_type) * index_count, 0); + } else { + vk_entity_release(e); + } +} + +DEBUG_IMPORT void +vk_render_model_range_upload(GPUBuffer *model, void *data, u64 offset, u64 size, b32 non_temporal) +{ + VulkanBuffer *db = vk_entity_data(model->buffer, VulkanEntityKind_RenderModel); + VulkanBuffer sb = { + .host_pointer = data, + .memory_kind = VulkanMemoryKind_Host, + }; + + offset += round_up_to(vk_index_size(db->index_type) * model->index_count, 64); + + vk_buffer_buffer_copy(db, &sb, offset, 0, size, non_temporal); +} + +DEBUG_IMPORT void +vk_image_release(GPUImage *image) +{ + if ValidVulkanHandle(image->image) { + VulkanContext *vk = vulkan_context; + VulkanImage *vi = vk_entity_data(image->image, VulkanEntityKind_Image); + + vkDestroyImageView(vk->device, vi->view, 0); + vkDestroyImage(vk->device, vi->image, 0); + vk_release_memory(vi->memory, image->memory_size); + + vk_entity_release((VulkanEntity *)image->image.value[0]); + } + zero_struct(image); +} + +DEBUG_IMPORT void +vk_image_allocate(GPUImage *image, u32 width, u32 height, u32 mips, u32 samples, + VulkanImageUsage usage, VulkanUsageFlags flags, OSHandle *export) +{ + assert(IsPowerOfTwo(samples)); + + vk_image_release(image); + VulkanContext *vk = vulkan_context; + VulkanEntity *e = vk_entity_allocate(VulkanEntityKind_Image); + VulkanImage *vi = &e->as.image; + + image->image.value[0] = (u64)e; + image->width = Min(width, vk->gpu_info.max_image_dimension_2D); + image->height = Min(height, vk->gpu_info.max_image_dimension_2D); + image->mip_map_levels = Max(mips, 1); + image->samples = Min(samples, vk->gpu_info.max_msaa_samples); + + VkFormat usage_format_map[VulkanImageUsage_Count + 1] = { + 
[VulkanImageUsage_None] = VK_FORMAT_UNDEFINED, + //[VulkanImageUsage_Colour] = VK_FORMAT_R8G8B8A8_SRGB, + [VulkanImageUsage_Colour] = VK_FORMAT_R8G8B8A8_UNORM, + [VulkanImageUsage_DepthStencil] = vk->depth_stencil_format, + [VulkanImageUsage_Count] = VK_FORMAT_UNDEFINED, + }; - VkSemaphoreCreateInfo sci = {.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO}; - VkExportSemaphoreCreateInfo esci = { - .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, - .handleTypes = OS_WINDOWS ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, + read_only local_persist VkImageUsageFlagBits usage_extra_bit_map[VulkanImageUsage_Count + 1] = { + [VulkanImageUsage_None] = 0, + [VulkanImageUsage_Colour] = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + [VulkanImageUsage_DepthStencil] = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + [VulkanImageUsage_Count] = 0, }; - if (export) sci.pNext = &esci; + read_only local_persist VkImageAspectFlags usage_image_aspect_map[VulkanImageUsage_Count + 1] = { + [VulkanImageUsage_None] = 0, + [VulkanImageUsage_Colour] = VK_IMAGE_ASPECT_COLOR_BIT, + [VulkanImageUsage_DepthStencil] = VK_IMAGE_ASPECT_DEPTH_BIT|VK_IMAGE_ASPECT_STENCIL_BIT, + [VulkanImageUsage_Count] = 0, + }; + + usage = Clamp((u32)usage, 0, VulkanImageUsage_Count); + VkImageUsageFlagBits usage_flags = usage_extra_bit_map[usage]; + + if (flags & VulkanUsageFlag_ImageSampling) usage_flags |= VK_IMAGE_USAGE_SAMPLED_BIT; + if (flags & VulkanUsageFlag_TransferSource) usage_flags |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + if (flags & VulkanUsageFlag_TransferDestination) usage_flags |= VK_IMAGE_USAGE_TRANSFER_DST_BIT; + + u32 queue_family = vk->queues[VulkanQueueKind_Graphics]->queue_family; + VkImageCreateInfo image_create_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .flags = export ? 
VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT : 0, + .imageType = VK_IMAGE_TYPE_2D, + .format = usage_format_map[usage], + .extent = {image->width, image->height, 1}, + .mipLevels = image->mip_map_levels, + .arrayLayers = 1, + .samples = image->samples, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = usage_flags, + // NOTE(rnp): needed if multiple queue families are accessed + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 1, + .pQueueFamilyIndices = &queue_family, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }; + + VkExternalMemoryImageCreateInfo external_memory_image_create_info = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO, + .handleTypes = OS_WINDOWS ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, + }; + + if (export) image_create_info.pNext = &external_memory_image_create_info; + + vkCreateImage(vk->device, &image_create_info, 0, &vi->image); + + VkMemoryRequirements memory_requirements; + vkGetImageMemoryRequirements(vk->device, vi->image, &memory_requirements); + + VkMemoryDedicatedAllocateInfo dedicated_allocate_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, + .image = vi->image, + }; + + if (vk_allocate_memory(&vi->memory, memory_requirements.size, VulkanMemoryKind_Device, 0, &dedicated_allocate_info, export)) { + image->memory_size = memory_requirements.size; + vkBindImageMemory(vk->device, vi->image, vi->memory, 0); + + VkImageViewCreateInfo image_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = vi->image, + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = usage_format_map[usage], + .subresourceRange = { + .aspectMask = usage_image_aspect_map[usage], + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }; + vkCreateImageView(vk->device, &image_view_info, 0, &vi->view); + } else { + vkDestroyImage(vk->device, vi->image, 0); + vk_entity_release(e); + zero_struct(image); + } +} + 
+DEBUG_IMPORT VulkanHandle +vk_create_semaphore(OSHandle *export) +{ VulkanEntity *e = vk_entity_allocate(VulkanEntityKind_Semaphore); + e->as.semaphore = vk_make_semaphore(export); VulkanHandle result = {(u64)e}; + return result; +} - vkCreateSemaphore(vk->device, &sci, 0, &e->as.semaphore); - - if (export) { - if (OS_WINDOWS) { - VkSemaphoreGetWin32HandleInfoKHR ghi = { - .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, - .handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT, - .semaphore = e->as.semaphore, - }; - void *handle; - vkGetSemaphoreWin32HandleKHR(vk->device, &ghi, &handle); - export->value[0] = (u64)handle; - } else { - VkSemaphoreGetFdInfoKHR ghi = { - .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR, - .handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, - .semaphore = e->as.semaphore, - }; - i32 handle; - vkGetSemaphoreFdKHR(vk->device, &ghi, &handle); - export->value[0] = (u64)handle; - } +DEBUG_IMPORT b32 +vk_host_wait_timeline(VulkanTimeline timeline, u64 value, u64 timeout_ns) +{ + b32 result = 0; + if Between(timeline, 0, VulkanTimeline_Count - 1) { + VulkanContext *vk = vulkan_context; + VulkanQueue *vq = vk->queues[timeline]; + VkSemaphoreWaitInfo semaphore_wait_info = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO, + .pSemaphores = &vq->timeline_semaphore.semaphore, + .semaphoreCount = 1, + .pValues = &value, + }; + result = vkWaitSemaphores(vk->device, &semaphore_wait_info, timeout_ns) == VK_SUCCESS; } + return result; +} +DEBUG_IMPORT u64 +vk_host_signal_timeline(VulkanTimeline timeline) +{ + u64 result = -1; + if Between(timeline, 0, VulkanTimeline_Count - 1) { + VulkanContext *vk = vulkan_context; + VulkanQueue *vq = vk->queues[timeline]; + VulkanSemaphore *vs = &vq->timeline_semaphore; + result = ++vs->value; + VkSemaphoreSignalInfo ssi = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO, + .semaphore = vs->semaphore, + .value = result, + }; + vkSignalSemaphore(vk->device, &ssi); + } 
return result; } DEBUG_IMPORT VulkanHandle -vk_compute_shader(s8 text, s8 name) +vk_pipeline(VulkanPipelineCreateInfo *infos, u32 count, u32 push_constants_size) { + assert(Between(count, 1, 2)); + assert(count == 2 || infos[0].kind == VulkanShaderKind_Compute); + VulkanHandle result = {0}; DeferLoop(take_lock(&vulkan_context->arena_lock, -1), release_lock(&vulkan_context->arena_lock)) { Arena arena = vulkan_context->arena; - VulkanEntity *e = vk_entity_allocate(VulkanEntityKind_Shader); + VulkanEntity *e = vk_entity_allocate(VulkanEntityKind_Pipeline); result = (VulkanHandle){(u64)e}; - e->as.shader = vk_compute_pipeline_from_shader_text(arena, text, name); - if (e->as.shader.pipeline == 0) e->as.shader = vulkan_context->default_compute_shader; + if (count == 2) e->as.pipeline = vk_graphics_pipeline_from_infos(arena, infos, count, push_constants_size); + else e->as.pipeline = vk_compute_pipeline_from_shader_text(arena, infos[0].text, infos[0].name, push_constants_size); } return result; } -DEBUG_IMPORT void -vk_compute_shader_release(VulkanHandle h) +DEBUG_IMPORT b32 +vk_pipeline_valid(VulkanHandle h) { + b32 result = 0; if ValidVulkanHandle(h) { - VulkanShader *vs = vk_entity_data(h, VulkanEntityKind_Shader); - if (vs->pipeline != vulkan_context->default_compute_shader.pipeline) { - vkDestroyPipeline(vulkan_context->device, vs->pipeline, 0); - vkDestroyPipelineLayout(vulkan_context->device, vs->layout, 0); + VulkanPipeline *vp = vk_entity_data(h, VulkanEntityKind_Pipeline); + if (vp->stage_flags == VK_SHADER_STAGE_COMPUTE_BIT) + result = vp->pipeline != vulkan_context->default_compute_pipeline.pipeline; + else + result = vp->pipeline != vulkan_context->default_graphics_pipeline.pipeline; + } + return result; +} + +DEBUG_IMPORT void +vk_pipeline_release(VulkanHandle h) +{ + if (vk_pipeline_valid(h)) { + VulkanEntity *e = (VulkanEntity *)h.value[0]; + VulkanTimeline timeline; + if (e->as.pipeline.stage_flags == VK_SHADER_STAGE_COMPUTE_BIT) timeline = 
VulkanTimeline_Compute; + else timeline = VulkanTimeline_Graphics; + + VulkanQueue *vq = vulkan_context->queues[timeline]; + DeferLoop(take_lock(&vq->lock, -1), release_lock(&vq->lock)) + { + u32 index = (vq->next_command_buffer_index - 1) % countof(vq->command_buffers); + vk_host_wait_timeline(timeline, vq->command_buffer_submission_values[index], -1ULL); + + if (&e->as.pipeline == vq->bound_pipeline) + vq->bound_pipeline = 0; + + vkDestroyPipeline(vulkan_context->device, e->as.pipeline.pipeline, 0); + vkDestroyPipelineLayout(vulkan_context->device, e->as.pipeline.layout, 0); + } + vk_entity_release(e); + } +} + +DEBUG_IMPORT VulkanHandle +vk_command_begin(VulkanTimeline timeline) +{ + VulkanHandle result = {0}; + if Between(timeline, 0, VulkanTimeline_Count - 1) { + VulkanContext *vk = vulkan_context; + VulkanQueue *vq = vk->queues[timeline]; + + take_lock(&vq->lock, -1); + + VulkanEntity *e = vk_entity_allocate(VulkanEntityKind_CommandBuffer); + VulkanCommandBuffer *vcb = &e->as.command_buffer; + u32 index = vq->next_command_buffer_index++ % countof(vq->command_buffers); + vcb->kind = (VulkanQueueKind)timeline; + vcb->command_buffer_index = index; + + // TODO(rnp): probably not the best to have this here but it will likely not be hit + b32 wait_result = vk_host_wait_timeline(timeline, vq->command_buffer_submission_values[index], -1ULL); + assert(wait_result); + + VkCommandBufferBeginInfo buffer_begin_info = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + }; + + vq->query_pool_occupied[index] = 0; + + vkBeginCommandBuffer(vq->command_buffers[index], &buffer_begin_info); + vkCmdResetQueryPool(vq->command_buffers[index], vq->query_pool, + index * MaxCommandBufferTimestamps, MaxCommandBufferTimestamps); + + result = (VulkanHandle){(u64)e}; + } + return result; +} + +DEBUG_IMPORT void +vk_command_bind_pipeline(VulkanHandle command, VulkanHandle pipeline) +{ + if ValidVulkanHandle(command) { + 
VulkanContext *vk = vulkan_context; + VulkanCommandBuffer *vcb = vk_entity_data(command, VulkanEntityKind_CommandBuffer); + VulkanQueue *vq = vk->queues[vcb->kind]; + + VulkanPipeline *vp = 0; + if ValidVulkanHandle(pipeline) { + vp = vk_entity_data(pipeline, VulkanEntityKind_Pipeline); + } else if (vcb->kind == VulkanQueueKind_Compute) { + vp = &vk->default_compute_pipeline; + } else if (vcb->kind == VulkanQueueKind_Graphics) { + vp = &vk->default_graphics_pipeline; + } else { + InvalidCodePath; + } + + read_only local_persist VkPipelineBindPoint bind_point_lut[VulkanQueueKind_Count] = { + [VulkanQueueKind_Graphics] = VK_PIPELINE_BIND_POINT_GRAPHICS, + [VulkanQueueKind_Compute] = VK_PIPELINE_BIND_POINT_COMPUTE, + [VulkanQueueKind_Transfer] = -1, + }; + + VkPipelineBindPoint bind_point = bind_point_lut[vcb->kind]; + assert(bind_point != (VkPipelineBindPoint)-1); + + vkCmdBindPipeline(vq->command_buffers[vcb->command_buffer_index], bind_point, vp->pipeline); + vq->bound_pipeline = vp; + } +} + +DEBUG_IMPORT void +vk_command_buffer_memory_barriers(VulkanHandle command, GPUMemoryBarrierInfo *barriers, u64 count) +{ + if ValidVulkanHandle(command) { + VulkanContext *vk = vulkan_context; + VulkanCommandBuffer *vcb = vk_entity_data(command, VulkanEntityKind_CommandBuffer); + VulkanQueue *vq = vk->queues[vcb->kind]; + + DeferLoop(take_lock(&vk->arena_lock, -1), release_lock(&vk->arena_lock)) + { + Arena arena = vk->arena; + u32 valid_count = 0; + VkBufferMemoryBarrier2 *memory_barriers = push_array(&arena, VkBufferMemoryBarrier2, count); + for (u64 it = 0; it < count; it++) { + if ValidVulkanHandle(barriers[it].gpu_buffer->buffer) { + u32 index = valid_count++; + VulkanBuffer *vb = vk_entity_data(barriers[it].gpu_buffer->buffer, VulkanEntityKind_Buffer); + memory_barriers[index].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2; + memory_barriers[index].srcStageMask = vq->pipeline_stage_flags; + memory_barriers[index].srcAccessMask = VK_ACCESS_2_MEMORY_WRITE_BIT; + 
memory_barriers[index].dstStageMask = vq->pipeline_stage_flags; + memory_barriers[index].dstAccessMask = VK_ACCESS_2_MEMORY_READ_BIT; + memory_barriers[index].srcQueueFamilyIndex = vq->queue_family; + memory_barriers[index].dstQueueFamilyIndex = vq->queue_family; + memory_barriers[index].buffer = vb->buffer; + memory_barriers[index].offset = barriers[it].offset; + memory_barriers[index].size = barriers[it].size; + } + } + + VkDependencyInfo dependancy_info = { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .bufferMemoryBarrierCount = valid_count, + .pBufferMemoryBarriers = memory_barriers, + }; + + vkCmdPipelineBarrier2(vq->command_buffers[vcb->command_buffer_index], &dependancy_info); + } + } +} + +DEBUG_IMPORT void +vk_command_dispatch_compute(VulkanHandle command, uv3 dispatch) +{ + assert(dispatch.x <= U16_MAX); + assert(dispatch.y <= U16_MAX); + assert(dispatch.z <= U16_MAX); + if ValidVulkanHandle(command) { + VkCommandBuffer cmd = vk_command_buffer(command); + vkCmdDispatch(cmd, dispatch.x, dispatch.y, dispatch.z); + } +} + +DEBUG_IMPORT void +vk_command_push_constants(VulkanHandle command, u32 offset, u32 size, void *values) +{ + if ValidVulkanHandle(command) { + VulkanCommandBuffer *vcb = vk_entity_data(command, VulkanEntityKind_CommandBuffer); + VulkanQueue *vq = vulkan_context->queues[vcb->kind]; + VulkanPipeline *vp = vq->bound_pipeline; + + assert(vp); + + vkCmdPushConstants(vq->command_buffers[vcb->command_buffer_index], vp->layout, vp->stage_flags, + offset, size, values); + } +} + +DEBUG_IMPORT void +vk_command_timestamp(VulkanHandle command) +{ + if ValidVulkanHandle(command) { + VulkanContext *vk = vulkan_context; + VulkanCommandBuffer *vcb = vk_entity_data(command, VulkanEntityKind_CommandBuffer); + VulkanQueue *vq = vk->queues[vcb->kind]; + + read_only local_persist VkPipelineStageFlags2 stage_lut[VulkanQueueKind_Count] = { + [VulkanQueueKind_Graphics] = VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT, + [VulkanQueueKind_Compute] = 
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + [VulkanQueueKind_Transfer] = -1, + }; + + VkPipelineStageFlags2 stage = stage_lut[vcb->kind]; + assert(stage != (VkPipelineStageFlags2)-1); + + if (vq->query_pool_occupied[vcb->command_buffer_index] < MaxCommandBufferTimestamps) { + u32 query_index = vq->query_pool_occupied[vcb->command_buffer_index]++; + vkCmdWriteTimestamp2(vq->command_buffers[vcb->command_buffer_index], stage, + vq->query_pool, + vcb->command_buffer_index * MaxCommandBufferTimestamps + query_index); } - vk_entity_release((VulkanEntity *)h.value[0]); } } + +DEBUG_IMPORT void +vk_command_wait_timeline(VulkanHandle command, VulkanTimeline timeline, u64 value) +{ + if (ValidVulkanHandle(command) && Between(timeline, 0, VulkanTimeline_Count - 1)) { + VulkanContext *vk = vulkan_context; + VulkanCommandBuffer *vcb = vk_entity_data(command, VulkanEntityKind_CommandBuffer); + + u32 wait_index = vk->queue_indices[timeline]; + vcb->in_flight_wait_values[wait_index] = Max(value, vcb->in_flight_wait_values[wait_index]); + } +} + +DEBUG_IMPORT u64 +vk_command_end(VulkanHandle command, VulkanHandle wait_semaphore, VulkanHandle finished_semaphore) +{ + u64 result = -1; + if ValidVulkanHandle(command) { + VulkanContext *vk = vulkan_context; + VulkanCommandBuffer *vcb = vk_entity_data(command, VulkanEntityKind_CommandBuffer); + VulkanQueue *vq = vk->queues[vcb->kind]; + VulkanSemaphore *vs = &vq->timeline_semaphore; + + vkEndCommandBuffer(vq->command_buffers[vcb->command_buffer_index]); + + VkCommandBufferSubmitInfo command_buffer_submit_info = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = vq->command_buffers[vcb->command_buffer_index], + }; + + result = ++vs->value; + + u32 signal_submit_info_count = 1; + VkSemaphoreSubmitInfo signal_submit_infos[2] = {{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + .semaphore = vs->semaphore, + .value = result, + .stageMask = vq->pipeline_stage_flags, + }}; + + if 
ValidVulkanHandle(finished_semaphore) { + VulkanSemaphore *fs = vk_entity_data(finished_semaphore, VulkanEntityKind_Semaphore); + signal_submit_infos[signal_submit_info_count++] = (VkSemaphoreSubmitInfo){ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + .semaphore = fs->semaphore, + .stageMask = vq->pipeline_stage_flags, + }; + } + + u32 wait_submit_info_count = 0; + VkSemaphoreSubmitInfo wait_submit_infos[VulkanQueueKind_Count + 1]; + for (u32 i = 0; i < vk->unique_queues; i++) { + u32 queue_index = vk->queue_indices[i]; + if (vcb->in_flight_wait_values[queue_index] > 0) { + VulkanQueue *q = vk->queues[queue_index]; + VkSemaphoreSubmitInfo wait_ssi = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + .semaphore = q->timeline_semaphore.semaphore, + .value = vcb->in_flight_wait_values[queue_index], + .stageMask = q->pipeline_stage_flags, + }; + wait_submit_infos[wait_submit_info_count++] = wait_ssi; + } + } + + if ValidVulkanHandle(wait_semaphore) { + VulkanSemaphore *ws = vk_entity_data(wait_semaphore, VulkanEntityKind_Semaphore); + wait_submit_infos[wait_submit_info_count++] = (VkSemaphoreSubmitInfo){ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + .semaphore = ws->semaphore, + .stageMask = vq->pipeline_stage_flags, + }; + } + + VkSubmitInfo2 submit_info = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2, + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &command_buffer_submit_info, + .waitSemaphoreInfoCount = wait_submit_info_count, + .pWaitSemaphoreInfos = wait_submit_infos, + .signalSemaphoreInfoCount = signal_submit_info_count, + .pSignalSemaphoreInfos = signal_submit_infos, + }; + + vkQueueSubmit2(vq->queue, 1, &submit_info, 0); + + vq->bound_pipeline = 0; + + atomic_store_u64(vq->command_buffer_submission_values + vcb->command_buffer_index, result); + + release_lock(&vq->lock); + + vk_entity_release((VulkanEntity *)command.value[0]); + } + return result; +} + +DEBUG_IMPORT void +vk_command_begin_rendering(VulkanHandle command, GPUImage 
*colour, GPUImage *depth, GPUImage *resolve) +{ + if ValidVulkanHandle(command) { + VkCommandBuffer cmd = vk_command_buffer(command); + + assert((colour->width == depth->width) && (colour->height == depth->height)); + + VulkanImage *ci = vk_entity_data(colour->image, VulkanEntityKind_Image); + VulkanImage *di = vk_entity_data(depth->image, VulkanEntityKind_Image); + VulkanImage *ri = 0; + if (resolve) ri = vk_entity_data(resolve->image, VulkanEntityKind_Image); + + // NOTE: Layout Transitions + { + u32 image_memory_barrier_count = 2; + VkImageMemoryBarrier2 image_memory_barriers[3] = { + { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT|VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + .image = ci->image, + .subresourceRange = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }, + { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT|VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT, + .srcAccessMask = 0, + .dstStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT|VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT, + .dstAccessMask = VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .newLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, + .image = di->image, + .subresourceRange = { + .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT|VK_IMAGE_ASPECT_STENCIL_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }, + }; + + if (resolve) image_memory_barriers[image_memory_barrier_count++] = (VkImageMemoryBarrier2){ + .sType = 
VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT|VK_PIPELINE_STAGE_2_RESOLVE_BIT, + .dstAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT|VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + .image = ri->image, + .subresourceRange = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }; + + VkDependencyInfo dependency_info = { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .imageMemoryBarrierCount = image_memory_barrier_count, + .pImageMemoryBarriers = image_memory_barriers, + }; + + vkCmdPipelineBarrier2(cmd, &dependency_info); + } + + VkRenderingAttachmentInfo colour_attachment = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = ci->view, + .imageLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + .resolveMode = ri ? VK_RESOLVE_MODE_AVERAGE_BIT : 0, + .resolveImageView = ri ? ri->view : 0, + .resolveImageLayout = ri ? 
VK_IMAGE_LAYOUT_GENERAL : 0, + .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .clearValue = {.color = {{0.0f, 0.0f, 0.0f, 0.0f}}}, + }; + + VkRenderingAttachmentInfo depth_stencil_attachment = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = di->view, + .imageLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, + .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .clearValue = {.depthStencil = {1.0f, 0}}, + }; + + VkRenderingInfo rendering_info = { + .sType = VK_STRUCTURE_TYPE_RENDERING_INFO, + .renderArea = {.offset = {0}, .extent = {colour->width, colour->height}}, + .layerCount = 1, + .colorAttachmentCount = 1, + .pColorAttachments = &colour_attachment, + .pDepthAttachment = &depth_stencil_attachment, + .pStencilAttachment = &depth_stencil_attachment, + }; + + vkCmdBeginRendering(cmd, &rendering_info); + } +} + +DEBUG_IMPORT void +vk_command_draw(VulkanHandle command, GPUBuffer *model) +{ + if (ValidVulkanHandle(command) && ValidVulkanHandle(model->buffer)) { + VkCommandBuffer cmd = vk_command_buffer(command); + VulkanBuffer *vb = vk_entity_data(model->buffer, VulkanEntityKind_RenderModel); + vkCmdBindIndexBuffer2(cmd, vb->buffer, 0, vk_index_size(vb->index_type) * model->index_count, vb->index_type); + vkCmdDrawIndexed(cmd, model->index_count, 1, 0, 0, 0); + } +} + +DEBUG_IMPORT void +vk_command_scissor(VulkanHandle command, u32 width, u32 height, u32 x_offset, u32 y_offset) +{ + if ValidVulkanHandle(command) { + VkCommandBuffer cmd = vk_command_buffer(command); + VkRect2D scissor = {.offset = {x_offset, y_offset}, .extent = {width, height}}; + vkCmdSetScissor(cmd, 0, 1, &scissor); + } +} + +DEBUG_IMPORT void +vk_command_viewport(VulkanHandle command, f32 width, f32 height, f32 x_offset, f32 y_offset, f32 min_depth, f32 max_depth) +{ + if ValidVulkanHandle(command) { + VkCommandBuffer cmd = vk_command_buffer(command); + VkViewport viewport = {x_offset, y_offset, 
width, height, min_depth, max_depth}; + vkCmdSetViewport(cmd, 0, 1, &viewport); + } +} + +DEBUG_IMPORT void +vk_command_end_rendering(VulkanHandle command) +{ + if ValidVulkanHandle(command) vkCmdEndRendering(vk_command_buffer(command)); +} + +DEBUG_IMPORT void +vk_command_copy_buffer(VulkanHandle command, GPUBuffer *restrict destination, + GPUBuffer *restrict source, u64 source_offset, i64 size) +{ + if (ValidVulkanHandle(command) && ValidVulkanHandle(destination->buffer) && ValidVulkanHandle(source->buffer)) { + VkCommandBuffer cmd = vk_command_buffer(command); + VulkanBuffer *db = vk_entity_data(destination->buffer, VulkanEntityKind_Buffer); + VulkanBuffer *sb = vk_entity_data(source->buffer, VulkanEntityKind_Buffer); + + VkBufferCopy2 buffer_copy = { + .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2, + .srcOffset = source_offset, + .dstOffset = 0, + .size = size, + }; + + VkCopyBufferInfo2 copy_buffer_info = { + .sType = VK_STRUCTURE_TYPE_COPY_BUFFER_INFO_2, + .srcBuffer = sb->buffer, + .dstBuffer = db->buffer, + .regionCount = 1, + .pRegions = &buffer_copy, + }; + + vkCmdCopyBuffer2(cmd, &copy_buffer_info); + } +} + +DEBUG_IMPORT u64 * +vk_command_read_timestamps(VulkanTimeline timeline, Arena *arena) +{ + u64 *result = 0; + if Between(timeline, 0, VulkanTimeline_Count - 1) { + VulkanContext *vk = vulkan_context; + VulkanQueue *vq = vk->queues[timeline]; + DeferLoop(take_lock(&vq->lock, -1), release_lock(&vq->lock)) { + u32 index = (vq->next_command_buffer_index - 1) % countof(vq->command_buffers); + u32 count = vq->query_pool_occupied[index]; + if (count > 0) { + result = push_array(arena, u64, count + 1); + result[0] = count; + + vkGetQueryPoolResults(vk->device, vq->query_pool, index * MaxCommandBufferTimestamps, count, + count * sizeof(u64), result + 1, 8, VK_QUERY_RESULT_WAIT_BIT); + } + } + } else { + result = push_array(arena, u64, 1); + } + return result; +} diff --git a/vulkan.h b/vulkan.h @@ -24,7 +24,9 @@ typedef uint32_t VkBool32; typedef uint32_t 
VkFlags; +typedef uint64_t VkFlags64; typedef uint32_t VkSampleMask; +typedef uint64_t VkDeviceAddress; typedef uint64_t VkDeviceSize; VK_HANDLE(VkBuffer); VK_HANDLE(VkCommandBuffer); @@ -41,6 +43,7 @@ VK_HANDLE(VkPhysicalDevice); VK_HANDLE(VkPipeline); VK_HANDLE(VkPipelineCache); VK_HANDLE(VkPipelineLayout); +VK_HANDLE(VkQueryPool); VK_HANDLE(VkQueue); VK_HANDLE(VkRenderPass); VK_HANDLE(VkSampler); @@ -51,53 +54,85 @@ VK_HANDLE(VkSwapchainKHR); typedef enum { VK_SUCCESS = 0, + VK_TIMEOUT = 2, VK_SUBOPTIMAL_KHR = 1000001003, VK_ERROR_OUT_OF_DATE_KHR = -1000001004, VK_RESULT_MAX_ENUM = 0x7FFFFFFF } VkResult; typedef enum { - VK_STRUCTURE_TYPE_APPLICATION_INFO = 0, - VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO = 1, - VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO = 2, - VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO = 3, - VK_STRUCTURE_TYPE_SUBMIT_INFO = 4, - VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO = 5, - VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE = 6, - VK_STRUCTURE_TYPE_FENCE_CREATE_INFO = 8, - VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO = 9, - VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO = 15, - VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO = 16, - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO = 18, - VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO = 19, - VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO = 20, - VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO = 22, - VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO = 23, - VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO = 24, - VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO = 26, - VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO = 27, - VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO = 28, - VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO = 29, - VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO = 30, - VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO = 37, - VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO = 38, - VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO = 39, - 
VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO = 40, - VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO = 42, - VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO = 43, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES = 50, - VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR = 1000001000, - VK_STRUCTURE_TYPE_PRESENT_INFO_KHR = 1000001001, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2 = 1000059001, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2 = 1000059006, - VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO = 1000060000, - VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO = 1000072002, - VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR = 1000073003, - VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR = 1000074002, - VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO = 1000077000, - VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR = 1000078003, - VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR = 1000079001, - VK_STRUCTURE_TYPE_MAX_ENUM = 0x7FFFFFFF, + VK_STRUCTURE_TYPE_APPLICATION_INFO = 0, + VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO = 1, + VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO = 2, + VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO = 3, + VK_STRUCTURE_TYPE_SUBMIT_INFO = 4, + VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO = 5, + VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE = 6, + VK_STRUCTURE_TYPE_FENCE_CREATE_INFO = 8, + VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO = 9, + VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO = 11, + VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO = 12, + VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO = 14, + VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO = 15, + VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO = 16, + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO = 18, + VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO = 19, + VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO = 20, + VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO = 22, + VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO = 23, + VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO = 24, + 
VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO = 25, + VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO = 26, + VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO = 27, + VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO = 28, + VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO = 29, + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO = 30, + VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO = 37, + VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO = 38, + VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO = 39, + VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO = 40, + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO = 42, + VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO = 43, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES = 49, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES = 50, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES = 51, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES = 52, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES = 53, + VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR = 1000001000, + VK_STRUCTURE_TYPE_PRESENT_INFO_KHR = 1000001001, + VK_STRUCTURE_TYPE_RENDERING_INFO = 1000044000, + VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO = 1000044001, + VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO = 1000044002, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2 = 1000059000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2 = 1000059001, + VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2 = 1000059002, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2 = 1000059006, + VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO = 1000060000, + VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO = 1000072001, + VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO = 1000072002, + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR = 1000073003, + VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR = 1000074002, + VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO = 1000077000, + VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR = 1000078003, + 
VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR = 1000079001, + VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO = 1000127001, + VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO = 1000207002, + VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO = 1000207003, + VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO = 1000207004, + VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO = 1000207005, + VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO = 1000244001, + VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT = 1000247000, + VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2 = 1000314001, + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2 = 1000314002, + VK_STRUCTURE_TYPE_DEPENDENCY_INFO = 1000314003, + VK_STRUCTURE_TYPE_SUBMIT_INFO_2 = 1000314004, + VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO = 1000314005, + VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO = 1000314006, + VK_STRUCTURE_TYPE_COPY_BUFFER_INFO_2 = 1000337000, + VK_STRUCTURE_TYPE_BUFFER_COPY_2 = 1000337006, + VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3 = 1000360000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_RELAXED_EXTENDED_INSTRUCTION_FEATURES_KHR = 1000558000, + VK_STRUCTURE_TYPE_MAX_ENUM = 0x7FFFFFFF, } VkStructureType; typedef enum { @@ -110,6 +145,27 @@ typedef enum { } VkPhysicalDeviceType; typedef enum { + VK_QUERY_TYPE_OCCLUSION = 0, + VK_QUERY_TYPE_PIPELINE_STATISTICS = 1, + VK_QUERY_TYPE_TIMESTAMP = 2, + VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR = 1000023000, + VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT = 1000028004, + VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR = 1000116000, + VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR = 1000150000, + VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR = 1000150001, + VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_NV = 1000165000, + VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL = 1000210000, + VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR = 1000299000, + VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT = 1000328000, + VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT = 1000382000, + 
VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR = 1000386000, + VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR = 1000386001, + VK_QUERY_TYPE_MICROMAP_SERIALIZATION_SIZE_EXT = 1000396000, + VK_QUERY_TYPE_MICROMAP_COMPACTED_SIZE_EXT = 1000396001, + VK_QUERY_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkQueryType; + +typedef enum { VK_SYSTEM_ALLOCATION_SCOPE_COMMAND = 0, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT = 1, VK_SYSTEM_ALLOCATION_SCOPE_CACHE = 2, @@ -180,6 +236,121 @@ typedef enum { } VkPipelineStageFlagBits; typedef VkFlags VkPipelineStageFlags; +typedef enum { + VK_PIPELINE_STAGE_2_NONE = 0ULL, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT = 0x00000001ULL, + VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT = 0x00000002ULL, + VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT = 0x00000004ULL, + VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT = 0x00000008ULL, + VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT = 0x00000010ULL, + VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT = 0x00000020ULL, + VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT = 0x00000040ULL, + VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT = 0x00000080ULL, + VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT = 0x00000100ULL, + VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT = 0x00000200ULL, + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT = 0x00000400ULL, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT = 0x00000800ULL, + VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT = 0x00001000ULL, + VK_PIPELINE_STAGE_2_TRANSFER_BIT = 0x00001000ULL, + VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT = 0x00002000ULL, + VK_PIPELINE_STAGE_2_HOST_BIT = 0x00004000ULL, + VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT = 0x00008000ULL, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT = 0x00010000ULL, + VK_PIPELINE_STAGE_2_COPY_BIT = 0x100000000ULL, + VK_PIPELINE_STAGE_2_RESOLVE_BIT = 0x200000000ULL, + VK_PIPELINE_STAGE_2_BLIT_BIT = 0x400000000ULL, + VK_PIPELINE_STAGE_2_CLEAR_BIT = 0x800000000ULL, + VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT = 0x1000000000ULL, + VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT = 
0x2000000000ULL, + VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT = 0x4000000000ULL, + VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR = 0x04000000ULL, + VK_PIPELINE_STAGE_2_VIDEO_ENCODE_BIT_KHR = 0x08000000ULL, + VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT = 0x01000000ULL, + VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT = 0x00040000ULL, + VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV = 0x00020000ULL, + VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_EXT = 0x00020000ULL, + VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR = 0x00400000ULL, + VK_PIPELINE_STAGE_2_SHADING_RATE_IMAGE_BIT_NV = 0x00400000ULL, + VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR = 0x02000000ULL, + VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR = 0x00200000ULL, + VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_NV = 0x00200000ULL, + VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_NV = 0x02000000ULL, + VK_PIPELINE_STAGE_2_FRAGMENT_DENSITY_PROCESS_BIT_EXT = 0x00800000ULL, + VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV = 0x00080000ULL, + VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV = 0x00100000ULL, + VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT = 0x00080000ULL, + VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT = 0x00100000ULL, + VK_PIPELINE_STAGE_2_SUBPASS_SHADER_BIT_HUAWEI = 0x8000000000ULL, + VK_PIPELINE_STAGE_2_INVOCATION_MASK_BIT_HUAWEI = 0x10000000000ULL, + VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR = 0x10000000ULL, + VK_PIPELINE_STAGE_2_MICROMAP_BUILD_BIT_EXT = 0x40000000ULL, + VK_PIPELINE_STAGE_2_CLUSTER_CULLING_SHADER_BIT_HUAWEI = 0x20000000000ULL, + VK_PIPELINE_STAGE_2_OPTICAL_FLOW_BIT_NV = 0x20000000ULL, + VK_PIPELINE_STAGE_2_CONVERT_COOPERATIVE_VECTOR_MATRIX_BIT_NV = 0x100000000000ULL, + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM = 0x40000000000ULL, + VK_PIPELINE_STAGE_2_COPY_INDIRECT_BIT_KHR = 0x400000000000ULL, + VK_PIPELINE_STAGE_2_MEMORY_DECOMPRESSION_BIT_EXT = 0x200000000000ULL, +} VkPipelineStageFlagBits2; +typedef VkFlags64 VkPipelineStageFlags2; + +typedef enum { + 
VK_ACCESS_2_NONE = 0ULL, + VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT = 0x00000001ULL, + VK_ACCESS_2_INDEX_READ_BIT = 0x00000002ULL, + VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT = 0x00000004ULL, + VK_ACCESS_2_UNIFORM_READ_BIT = 0x00000008ULL, + VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT = 0x00000010ULL, + VK_ACCESS_2_SHADER_READ_BIT = 0x00000020ULL, + VK_ACCESS_2_SHADER_WRITE_BIT = 0x00000040ULL, + VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT = 0x00000080ULL, + VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT = 0x00000100ULL, + VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT = 0x00000200ULL, + VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT = 0x00000400ULL, + VK_ACCESS_2_TRANSFER_READ_BIT = 0x00000800ULL, + VK_ACCESS_2_TRANSFER_WRITE_BIT = 0x00001000ULL, + VK_ACCESS_2_HOST_READ_BIT = 0x00002000ULL, + VK_ACCESS_2_HOST_WRITE_BIT = 0x00004000ULL, + VK_ACCESS_2_MEMORY_READ_BIT = 0x00008000ULL, + VK_ACCESS_2_MEMORY_WRITE_BIT = 0x00010000ULL, + VK_ACCESS_2_SHADER_SAMPLED_READ_BIT = 0x100000000ULL, + VK_ACCESS_2_SHADER_STORAGE_READ_BIT = 0x200000000ULL, + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT = 0x400000000ULL, + VK_ACCESS_2_VIDEO_DECODE_READ_BIT_KHR = 0x800000000ULL, + VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR = 0x1000000000ULL, + VK_ACCESS_2_VIDEO_ENCODE_READ_BIT_KHR = 0x2000000000ULL, + VK_ACCESS_2_VIDEO_ENCODE_WRITE_BIT_KHR = 0x4000000000ULL, + VK_ACCESS_2_SHADER_TILE_ATTACHMENT_READ_BIT_QCOM = 0x8000000000000ULL, + VK_ACCESS_2_SHADER_TILE_ATTACHMENT_WRITE_BIT_QCOM = 0x10000000000000ULL, + VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT = 0x02000000ULL, + VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT = 0x04000000ULL, + VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT = 0x08000000ULL, + VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT = 0x00100000ULL, + VK_ACCESS_2_COMMAND_PREPROCESS_READ_BIT_NV = 0x00020000ULL, + VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV = 0x00040000ULL, + VK_ACCESS_2_COMMAND_PREPROCESS_READ_BIT_EXT = 0x00020000ULL, + VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_EXT = 0x00040000ULL, + 
VK_ACCESS_2_FRAGMENT_SHADING_RATE_ATTACHMENT_READ_BIT_KHR = 0x00800000ULL, + VK_ACCESS_2_SHADING_RATE_IMAGE_READ_BIT_NV = 0x00800000ULL, + VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR = 0x00200000ULL, + VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR = 0x00400000ULL, + VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_NV = 0x00200000ULL, + VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_NV = 0x00400000ULL, + VK_ACCESS_2_FRAGMENT_DENSITY_MAP_READ_BIT_EXT = 0x01000000ULL, + VK_ACCESS_2_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT = 0x00080000ULL, + VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT = 0x20000000000ULL, + VK_ACCESS_2_INVOCATION_MASK_READ_BIT_HUAWEI = 0x8000000000ULL, + VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR = 0x10000000000ULL, + VK_ACCESS_2_MICROMAP_READ_BIT_EXT = 0x100000000000ULL, + VK_ACCESS_2_MICROMAP_WRITE_BIT_EXT = 0x200000000000ULL, + VK_ACCESS_2_OPTICAL_FLOW_READ_BIT_NV = 0x40000000000ULL, + VK_ACCESS_2_OPTICAL_FLOW_WRITE_BIT_NV = 0x80000000000ULL, + VK_ACCESS_2_DATA_GRAPH_READ_BIT_ARM = 0x800000000000ULL, + VK_ACCESS_2_DATA_GRAPH_WRITE_BIT_ARM = 0x1000000000000ULL, + VK_ACCESS_2_MEMORY_DECOMPRESSION_READ_BIT_EXT = 0x80000000000000ULL, + VK_ACCESS_2_MEMORY_DECOMPRESSION_WRITE_BIT_EXT = 0x100000000000000ULL, +} VkAccessFlagBits2; +typedef VkFlags64 VkAccessFlags2; + typedef VkFlags VkDeviceCreateFlags; typedef enum { @@ -189,6 +360,128 @@ typedef enum { } VkPointClippingBehavior; typedef enum { + VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT = 0x00000001, + VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT = 0x00000002, + VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT = 0x00000004, + VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT = 0x00000008, + VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT = 0x00000010, + VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT = 0x00000020, + VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT = 0x00000040, + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT = 0x00000080, + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT = 0x00000100, + VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT = 
0x00000200, + VK_FORMAT_FEATURE_BLIT_SRC_BIT = 0x00000400, + VK_FORMAT_FEATURE_BLIT_DST_BIT = 0x00000800, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT = 0x00001000, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_CUBIC_BIT_EXT = 0x00002000, + VK_FORMAT_FEATURE_TRANSFER_SRC_BIT = 0x00004000, + VK_FORMAT_FEATURE_TRANSFER_DST_BIT = 0x00008000, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT = 0x00010000, + VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT = 0x00020000, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT = 0x00040000, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT = 0x00080000, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_CHROMA_RECONSTRUCTION_EXPLICIT_BIT = 0x00100000, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_CHROMA_RECONSTRUCTION_EXPLICIT_FORCEABLE_BIT = 0x00200000, + VK_FORMAT_FEATURE_DISJOINT_BIT = 0x00400000, + VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT = 0x00800000, + VK_FORMAT_FEATURE_FRAGMENT_DENSITY_MAP_BIT_EXT = 0x01000000, + VK_FORMAT_FEATURE_VIDEO_DECODE_OUTPUT_BIT_KHR = 0x02000000, + VK_FORMAT_FEATURE_VIDEO_DECODE_DPB_BIT_KHR = 0x04000000, + VK_FORMAT_FEATURE_VIDEO_ENCODE_DPB_BIT_KHR = 0x10000000, + VK_FORMAT_FEATURE_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR = 0x20000000, + VK_FORMAT_FEATURE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR = 0x40000000, + VK_FORMAT_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkFormatFeatureFlagBits; +typedef VkFlags VkFormatFeatureFlags; + + +typedef enum { + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT = (1ULL << 0), + VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT = (1ULL << 1), + VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT = (1ULL << 2), + VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT = (1ULL << 3), + VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT = (1ULL << 4), + VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT = (1ULL << 5), + VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT = (1ULL << 6), + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT = (1ULL << 7), + 
VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT = (1ULL << 8), + VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT = (1ULL << 9), + VK_FORMAT_FEATURE_2_BLIT_SRC_BIT = (1ULL << 10), + VK_FORMAT_FEATURE_2_BLIT_DST_BIT = (1ULL << 11), + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT = (1ULL << 12), + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_CUBIC_BIT = (1ULL << 13), + VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT = (1ULL << 14), + VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT = (1ULL << 15), + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT = (1ULL << 16), + VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT = (1ULL << 17), + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT = (1ULL << 18), + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT = (1ULL << 19), + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_CHROMA_RECONSTRUCTION_EXPLICIT_BIT = (1ULL << 20), + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_CHROMA_RECONSTRUCTION_EXPLICIT_FORCEABLE_BIT = (1ULL << 21), + VK_FORMAT_FEATURE_2_DISJOINT_BIT = (1ULL << 22), + VK_FORMAT_FEATURE_2_COSITED_CHROMA_SAMPLES_BIT = (1ULL << 23), + VK_FORMAT_FEATURE_2_FRAGMENT_DENSITY_MAP_BIT_EXT = (1ULL << 24), + VK_FORMAT_FEATURE_2_VIDEO_DECODE_OUTPUT_BIT_KHR = (1ULL << 25), + VK_FORMAT_FEATURE_2_VIDEO_DECODE_DPB_BIT_KHR = (1ULL << 26), + VK_FORMAT_FEATURE_2_VIDEO_ENCODE_INPUT_BIT_KHR = (1ULL << 27), + VK_FORMAT_FEATURE_2_VIDEO_ENCODE_DPB_BIT_KHR = (1ULL << 28), + VK_FORMAT_FEATURE_2_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR = (1ULL << 29), + VK_FORMAT_FEATURE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR = (1ULL << 30), + VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT = (1ULL << 31), + VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT = (1ULL << 32), + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT = (1ULL << 33), + VK_FORMAT_FEATURE_2_WEIGHT_IMAGE_BIT_QCOM = (1ULL << 34), + VK_FORMAT_FEATURE_2_WEIGHT_SAMPLED_IMAGE_BIT_QCOM = (1ULL << 35), + 
VK_FORMAT_FEATURE_2_BLOCK_MATCHING_BIT_QCOM = (1ULL << 36), + VK_FORMAT_FEATURE_2_BOX_FILTER_SAMPLED_BIT_QCOM = (1ULL << 37), + VK_FORMAT_FEATURE_2_LINEAR_COLOR_ATTACHMENT_BIT_NV = (1ULL << 38), + VK_FORMAT_FEATURE_2_TENSOR_SHADER_BIT_ARM = (1ULL << 39), + VK_FORMAT_FEATURE_2_OPTICAL_FLOW_IMAGE_BIT_NV = (1ULL << 40), + VK_FORMAT_FEATURE_2_OPTICAL_FLOW_VECTOR_BIT_NV = (1ULL << 41), + VK_FORMAT_FEATURE_2_OPTICAL_FLOW_COST_BIT_NV = (1ULL << 42), + VK_FORMAT_FEATURE_2_TENSOR_IMAGE_ALIASING_BIT_ARM = (1ULL << 43), + + VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT = (1ULL << 46), + + VK_FORMAT_FEATURE_2_TENSOR_DATA_GRAPH_BIT_ARM = (1ULL << 48), + VK_FORMAT_FEATURE_2_VIDEO_ENCODE_QUANTIZATION_DELTA_MAP_BIT_KHR = (1ULL << 49), + VK_FORMAT_FEATURE_2_VIDEO_ENCODE_EMPHASIS_MAP_BIT_KHR = (1ULL << 50), + VK_FORMAT_FEATURE_2_ACCELERATION_STRUCTURE_RADIUS_BUFFER_BIT_NV = (1ULL << 51), + VK_FORMAT_FEATURE_2_DEPTH_COPY_ON_COMPUTE_QUEUE_BIT_KHR = (1ULL << 52), + VK_FORMAT_FEATURE_2_DEPTH_COPY_ON_TRANSFER_QUEUE_BIT_KHR = (1ULL << 53), + VK_FORMAT_FEATURE_2_STENCIL_COPY_ON_COMPUTE_QUEUE_BIT_KHR = (1ULL << 54), + VK_FORMAT_FEATURE_2_STENCIL_COPY_ON_TRANSFER_QUEUE_BIT_KHR = (1ULL << 55), + + VK_FORMAT_FEATURE_2_COPY_IMAGE_INDIRECT_DST_BIT_KHR = (1ULL << 59), +} VkFormatFeatureFlagBits2; +typedef VkFlags64 VkFormatFeatureFlags2; + +typedef enum { + VK_IMAGE_CREATE_SPARSE_BINDING_BIT = 0x00000001, + VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT = 0x00000002, + VK_IMAGE_CREATE_SPARSE_ALIASED_BIT = 0x00000004, + VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT = 0x00000008, + VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT = 0x00000010, + VK_IMAGE_CREATE_ALIAS_BIT = 0x00000400, + VK_IMAGE_CREATE_SPLIT_INSTANCE_BIND_REGIONS_BIT = 0x00000040, + VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT = 0x00000020, + VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT = 0x00000080, + VK_IMAGE_CREATE_EXTENDED_USAGE_BIT = 0x00000100, + VK_IMAGE_CREATE_PROTECTED_BIT = 0x00000800, + VK_IMAGE_CREATE_DISJOINT_BIT = 0x00000200, + 
VK_IMAGE_CREATE_CORNER_SAMPLED_BIT_NV = 0x00002000, + VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT = 0x00001000, + VK_IMAGE_CREATE_SUBSAMPLED_BIT_EXT = 0x00004000, + VK_IMAGE_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT = 0x00010000, + VK_IMAGE_CREATE_MULTISAMPLED_RENDER_TO_SINGLE_SAMPLED_BIT_EXT = 0x00040000, + VK_IMAGE_CREATE_2D_VIEW_COMPATIBLE_BIT_EXT = 0x00020000, + VK_IMAGE_CREATE_VIDEO_PROFILE_INDEPENDENT_BIT_KHR = 0x00100000, + VK_IMAGE_CREATE_FRAGMENT_DENSITY_MAP_OFFSET_BIT_EXT = 0x00008000, + VK_IMAGE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkImageCreateFlagBits; +typedef VkFlags VkImageCreateFlags; + +typedef enum { VK_SAMPLE_COUNT_1_BIT = 0x00000001, VK_SAMPLE_COUNT_2_BIT = 0x00000002, VK_SAMPLE_COUNT_4_BIT = 0x00000004, @@ -253,6 +546,14 @@ typedef enum { VK_IMAGE_VIEW_TYPE_MAX_ENUM = 0x7FFFFFFF } VkImageViewType; +typedef enum VkIndexType { + VK_INDEX_TYPE_UINT16 = 0, + VK_INDEX_TYPE_UINT32 = 1, + VK_INDEX_TYPE_UINT8 = 1000265000, + VK_INDEX_TYPE_NONE_KHR = 1000165000, + VK_INDEX_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkIndexType; + typedef enum { VK_BLEND_FACTOR_ZERO = 0, VK_BLEND_FACTOR_ONE = 1, @@ -336,9 +637,22 @@ typedef enum { VK_FENCE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkFenceCreateFlagBits; typedef VkFlags VkFenceCreateFlags; + +typedef enum { + VK_QUERY_POOL_CREATE_RESET_BIT_KHR = 0x00000001, + VK_QUERY_POOL_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkQueryPoolCreateFlagBits; +typedef VkFlags VkQueryPoolCreateFlags; + typedef VkFlags VkSemaphoreCreateFlags; typedef enum { + VK_SEMAPHORE_WAIT_ANY_BIT = 0x00000001, + VK_SEMAPHORE_WAIT_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkSemaphoreWaitFlagBits; +typedef VkFlags VkSemaphoreWaitFlags; + +typedef enum { VK_IMAGE_VIEW_CREATE_FRAGMENT_DENSITY_MAP_DYNAMIC_BIT_EXT = 0x00000001, VK_IMAGE_VIEW_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT = 0x00000004, VK_IMAGE_VIEW_CREATE_FRAGMENT_DENSITY_MAP_DEFERRED_BIT_EXT = 0x00000002, @@ -816,6 +1130,20 @@ typedef enum { } VkFormat; typedef enum { + 
VK_IMAGE_TILING_OPTIMAL = 0, + VK_IMAGE_TILING_LINEAR = 1, + VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT = 1000158000, + VK_IMAGE_TILING_MAX_ENUM = 0x7FFFFFFF +} VkImageTiling; + +typedef enum { + VK_IMAGE_TYPE_1D = 0, + VK_IMAGE_TYPE_2D = 1, + VK_IMAGE_TYPE_3D = 2, + VK_IMAGE_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkImageType; + +typedef enum { VK_COLOR_SPACE_SRGB_NONLINEAR_KHR = 0, VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT = 1000104001, VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT = 1000104002, @@ -843,6 +1171,61 @@ typedef enum { } VkSharingMode; typedef enum { + VK_QUERY_RESULT_64_BIT = 0x00000001, + VK_QUERY_RESULT_WAIT_BIT = 0x00000002, + VK_QUERY_RESULT_WITH_AVAILABILITY_BIT = 0x00000004, + VK_QUERY_RESULT_PARTIAL_BIT = 0x00000008, + VK_QUERY_RESULT_WITH_STATUS_BIT_KHR = 0x00000010, + VK_QUERY_RESULT_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkQueryResultFlagBits; +typedef VkFlags VkQueryResultFlags; + +typedef enum { + VK_BUFFER_CREATE_SPARSE_BINDING_BIT = 0x00000001, + VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT = 0x00000002, + VK_BUFFER_CREATE_SPARSE_ALIASED_BIT = 0x00000004, + VK_BUFFER_CREATE_PROTECTED_BIT = 0x00000008, + VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT = 0x00000010, + VK_BUFFER_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT = 0x00000020, + VK_BUFFER_CREATE_VIDEO_PROFILE_INDEPENDENT_BIT_KHR = 0x00000040, + VK_BUFFER_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkBufferCreateFlagBits; +typedef VkFlags VkBufferCreateFlags; + +typedef enum { + VK_BUFFER_USAGE_TRANSFER_SRC_BIT = 0x00000001, + VK_BUFFER_USAGE_TRANSFER_DST_BIT = 0x00000002, + VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT = 0x00000004, + VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT = 0x00000008, + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT = 0x00000010, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT = 0x00000020, + VK_BUFFER_USAGE_INDEX_BUFFER_BIT = 0x00000040, + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT = 0x00000080, + VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT = 0x00000100, + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT = 0x00020000, + 
VK_BUFFER_USAGE_VIDEO_DECODE_SRC_BIT_KHR = 0x00002000, + VK_BUFFER_USAGE_VIDEO_DECODE_DST_BIT_KHR = 0x00004000, + VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT = 0x00000800, + VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_COUNTER_BUFFER_BIT_EXT = 0x00001000, + VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT = 0x00000200, + VK_BUFFER_USAGE_EXECUTION_GRAPH_SCRATCH_BIT_AMDX = 0x02000000, + VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR = 0x00080000, + VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR = 0x00100000, + VK_BUFFER_USAGE_SHADER_BINDING_TABLE_BIT_KHR = 0x00000400, + VK_BUFFER_USAGE_VIDEO_ENCODE_DST_BIT_KHR = 0x00008000, + VK_BUFFER_USAGE_VIDEO_ENCODE_SRC_BIT_KHR = 0x00010000, + VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT = 0x00200000, + VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT = 0x00400000, + VK_BUFFER_USAGE_PUSH_DESCRIPTORS_DESCRIPTOR_BUFFER_BIT_EXT = 0x04000000, + VK_BUFFER_USAGE_MICROMAP_BUILD_INPUT_READ_ONLY_BIT_EXT = 0x00800000, + VK_BUFFER_USAGE_MICROMAP_STORAGE_BIT_EXT = 0x01000000, + VK_BUFFER_USAGE_TILE_MEMORY_BIT_QCOM = 0x08000000, + VK_BUFFER_USAGE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkBufferUsageFlagBits; +typedef VkFlags VkBufferUsageFlags; +typedef VkFlags VkBufferViewCreateFlags; + +typedef enum { VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT = 0x00000001, VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT = 0x00000002, VK_PIPELINE_SHADER_STAGE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF @@ -1079,12 +1462,6 @@ typedef enum { typedef VkFlags VkDescriptorSetLayoutCreateFlags; typedef enum { - VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT = 0x00000001, - VK_ATTACHMENT_DESCRIPTION_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF -} VkAttachmentDescriptionFlagBits; -typedef VkFlags VkAttachmentDescriptionFlags; - -typedef enum { VK_DESCRIPTOR_TYPE_SAMPLER = 0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER = 1, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE = 2, @@ -1138,13 +1515,6 @@ typedef enum { } VkCommandBufferLevel; typedef enum 
{ - VK_SUBPASS_CONTENTS_INLINE = 0, - VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS = 1, - VK_SUBPASS_CONTENTS_INLINE_AND_SECONDARY_COMMAND_BUFFERS_KHR = 1000451000, - VK_SUBPASS_CONTENTS_MAX_ENUM = 0x7FFFFFFF -} VkSubpassContents; - -typedef enum { VK_IMAGE_LAYOUT_UNDEFINED = 0, VK_IMAGE_LAYOUT_GENERAL = 1, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL = 2, @@ -1195,26 +1565,6 @@ typedef enum { typedef VkFlags VkFramebufferCreateFlags; typedef enum { - VK_RENDER_PASS_CREATE_TRANSFORM_BIT_QCOM = 0x00000002, - VK_RENDER_PASS_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF -} VkRenderPassCreateFlagBits; -typedef VkFlags VkRenderPassCreateFlags; - -typedef enum { - VK_SUBPASS_DESCRIPTION_PER_VIEW_ATTRIBUTES_BIT_NVX = 0x00000001, - VK_SUBPASS_DESCRIPTION_PER_VIEW_POSITION_X_ONLY_BIT_NVX = 0x00000002, - VK_SUBPASS_DESCRIPTION_FRAGMENT_REGION_BIT_QCOM = 0x00000004, - VK_SUBPASS_DESCRIPTION_SHADER_RESOLVE_BIT_QCOM = 0x00000008, - VK_SUBPASS_DESCRIPTION_TILE_SHADING_APRON_BIT_QCOM = 0x00000100, - VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_EXT = 0x00000010, - VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_EXT = 0x00000020, - VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_EXT = 0x00000040, - VK_SUBPASS_DESCRIPTION_ENABLE_LEGACY_DITHERING_BIT_EXT = 0x00000080, - VK_SUBPASS_DESCRIPTION_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF -} VkSubpassDescriptionFlagBits; -typedef VkFlags VkSubpassDescriptionFlags; - -typedef enum { VK_COMMAND_POOL_CREATE_TRANSIENT_BIT = 0x00000001, VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT = 0x00000002, VK_COMMAND_POOL_CREATE_PROTECTED_BIT = 0x00000004, @@ -1296,24 +1646,24 @@ typedef enum { typedef VkFlags VkMemoryAllocateFlags; typedef enum { - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT = 0x00000001, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT = 0x00000002, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT = 0x00000004, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_TEXTURE_BIT = 
0x00000008, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_TEXTURE_KMT_BIT = 0x00000010, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP_BIT = 0x00000020, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE_BIT = 0x00000040, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT = 0x00000200, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID = 0x00000400, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT = 0x00000080, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT = 0x00000100, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_ZIRCON_VMO_BIT_FUCHSIA = 0x00000800, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_RDMA_ADDRESS_BIT_NV = 0x00001000, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_SCREEN_BUFFER_BIT_QNX = 0x00004000, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_MTLBUFFER_BIT_EXT = 0x00010000, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_MTLTEXTURE_BIT_EXT = 0x00020000, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_MTLHEAP_BIT_EXT = 0x00040000, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT = 0x00000001, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT = 0x00000002, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT = 0x00000004, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_TEXTURE_BIT = 0x00000008, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_TEXTURE_KMT_BIT = 0x00000010, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP_BIT = 0x00000020, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE_BIT = 0x00000040, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT = 0x00000200, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID = 0x00000400, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT = 0x00000080, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT = 0x00000100, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ZIRCON_VMO_BIT_FUCHSIA = 0x00000800, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_RDMA_ADDRESS_BIT_NV = 0x00001000, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_SCREEN_BUFFER_BIT_QNX = 0x00004000, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_MTLBUFFER_BIT_EXT = 0x00010000, 
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_MTLTEXTURE_BIT_EXT = 0x00020000, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_MTLHEAP_BIT_EXT = 0x00040000, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF, } VkExternalMemoryHandleTypeFlagBits; typedef VkFlags VkExternalMemoryHandleTypeFlags; @@ -1328,6 +1678,44 @@ typedef enum { } VkExternalSemaphoreHandleTypeFlagBits; typedef VkFlags VkExternalSemaphoreHandleTypeFlags; +typedef enum { + VK_SEMAPHORE_TYPE_BINARY = 0, + VK_SEMAPHORE_TYPE_TIMELINE = 1, + VK_SEMAPHORE_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkSemaphoreType; + +typedef enum { + VK_RESOLVE_MODE_NONE = 0, + VK_RESOLVE_MODE_SAMPLE_ZERO_BIT = 0x00000001, + VK_RESOLVE_MODE_AVERAGE_BIT = 0x00000002, + VK_RESOLVE_MODE_MIN_BIT = 0x00000004, + VK_RESOLVE_MODE_MAX_BIT = 0x00000008, + VK_RESOLVE_MODE_EXTERNAL_FORMAT_DOWNSAMPLE_BIT_ANDROID = 0x00000010, + VK_RESOLVE_MODE_CUSTOM_BIT_EXT = 0x00000020, + VK_RESOLVE_MODE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkResolveModeFlagBits; +typedef VkFlags VkResolveModeFlags; + +typedef enum { + VK_RENDERING_CONTENTS_SECONDARY_COMMAND_BUFFERS_BIT = 0x00000001, + VK_RENDERING_SUSPENDING_BIT = 0x00000002, + VK_RENDERING_RESUMING_BIT = 0x00000004, + VK_RENDERING_ENABLE_LEGACY_DITHERING_BIT_EXT = 0x00000008, + VK_RENDERING_CONTENTS_INLINE_BIT_KHR = 0x00000010, + VK_RENDERING_PER_LAYER_FRAGMENT_DENSITY_BIT_VALVE = 0x00000020, + VK_RENDERING_FRAGMENT_REGION_BIT_EXT = 0x00000040, + VK_RENDERING_CUSTOM_RESOLVE_BIT_EXT = 0x00000080, + VK_RENDERING_LOCAL_READ_CONCURRENT_ACCESS_CONTROL_BIT_KHR = 0x00000100, + VK_RENDERING_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkRenderingFlagBits; +typedef VkFlags VkRenderingFlags; + +typedef enum { + VK_SUBMIT_PROTECTED_BIT = 0x00000001, + VK_SUBMIT_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkSubmitFlagBits; +typedef VkFlags VkSubmitFlags; + typedef struct { uint32_t width; uint32_t height; @@ -1371,18 +1759,6 @@ typedef struct { } VkLayerProperties; typedef struct { - VkStructureType sType; - const void * pNext; - uint32_t 
waitSemaphoreCount; - const VkSemaphore * pWaitSemaphores; - const VkPipelineStageFlags * pWaitDstStageMask; - uint32_t commandBufferCount; - const VkCommandBuffer * pCommandBuffers; - uint32_t signalSemaphoreCount; - const VkSemaphore * pSignalSemaphores; -} VkSubmitInfo; - -typedef struct { VkStructureType sType; const void * pNext; const char * pApplicationName; @@ -1393,6 +1769,26 @@ typedef struct { } VkApplicationInfo; typedef struct { + VkFormatFeatureFlags linearTilingFeatures; + VkFormatFeatureFlags optimalTilingFeatures; + VkFormatFeatureFlags bufferFeatures; +} VkFormatProperties; + +typedef struct { + VkStructureType sType; + void * pNext; + VkFormatProperties formatProperties; +} VkFormatProperties2; + +typedef struct { + VkStructureType sType; + void * pNext; + VkFormatFeatureFlags2 linearTilingFeatures; + VkFormatFeatureFlags2 optimalTilingFeatures; + VkFormatFeatureFlags2 bufferFeatures; +} VkFormatProperties3; + +typedef struct { VkStructureType sType; const void * pNext; VkInstanceCreateFlags flags; @@ -1640,6 +2036,107 @@ typedef struct { } VkPhysicalDeviceFeatures; typedef struct { + VkStructureType sType; + void * pNext; + VkBool32 storageBuffer16BitAccess; + VkBool32 uniformAndStorageBuffer16BitAccess; + VkBool32 storagePushConstant16; + VkBool32 storageInputOutput16; + VkBool32 multiview; + VkBool32 multiviewGeometryShader; + VkBool32 multiviewTessellationShader; + VkBool32 variablePointersStorageBuffer; + VkBool32 variablePointers; + VkBool32 protectedMemory; + VkBool32 samplerYcbcrConversion; + VkBool32 shaderDrawParameters; +} VkPhysicalDeviceVulkan11Features; + +typedef struct { + VkStructureType sType; + void * pNext; + VkBool32 samplerMirrorClampToEdge; + VkBool32 drawIndirectCount; + VkBool32 storageBuffer8BitAccess; + VkBool32 uniformAndStorageBuffer8BitAccess; + VkBool32 storagePushConstant8; + VkBool32 shaderBufferInt64Atomics; + VkBool32 shaderSharedInt64Atomics; + VkBool32 shaderFloat16; + VkBool32 shaderInt8; + VkBool32 
descriptorIndexing; + VkBool32 shaderInputAttachmentArrayDynamicIndexing; + VkBool32 shaderUniformTexelBufferArrayDynamicIndexing; + VkBool32 shaderStorageTexelBufferArrayDynamicIndexing; + VkBool32 shaderUniformBufferArrayNonUniformIndexing; + VkBool32 shaderSampledImageArrayNonUniformIndexing; + VkBool32 shaderStorageBufferArrayNonUniformIndexing; + VkBool32 shaderStorageImageArrayNonUniformIndexing; + VkBool32 shaderInputAttachmentArrayNonUniformIndexing; + VkBool32 shaderUniformTexelBufferArrayNonUniformIndexing; + VkBool32 shaderStorageTexelBufferArrayNonUniformIndexing; + VkBool32 descriptorBindingUniformBufferUpdateAfterBind; + VkBool32 descriptorBindingSampledImageUpdateAfterBind; + VkBool32 descriptorBindingStorageImageUpdateAfterBind; + VkBool32 descriptorBindingStorageBufferUpdateAfterBind; + VkBool32 descriptorBindingUniformTexelBufferUpdateAfterBind; + VkBool32 descriptorBindingStorageTexelBufferUpdateAfterBind; + VkBool32 descriptorBindingUpdateUnusedWhilePending; + VkBool32 descriptorBindingPartiallyBound; + VkBool32 descriptorBindingVariableDescriptorCount; + VkBool32 runtimeDescriptorArray; + VkBool32 samplerFilterMinmax; + VkBool32 scalarBlockLayout; + VkBool32 imagelessFramebuffer; + VkBool32 uniformBufferStandardLayout; + VkBool32 shaderSubgroupExtendedTypes; + VkBool32 separateDepthStencilLayouts; + VkBool32 hostQueryReset; + VkBool32 timelineSemaphore; + VkBool32 bufferDeviceAddress; + VkBool32 bufferDeviceAddressCaptureReplay; + VkBool32 bufferDeviceAddressMultiDevice; + VkBool32 vulkanMemoryModel; + VkBool32 vulkanMemoryModelDeviceScope; + VkBool32 vulkanMemoryModelAvailabilityVisibilityChains; + VkBool32 shaderOutputViewportIndex; + VkBool32 shaderOutputLayer; + VkBool32 subgroupBroadcastDynamicId; +} VkPhysicalDeviceVulkan12Features; + +typedef struct { + VkStructureType sType; + void * pNext; + VkBool32 robustImageAccess; + VkBool32 inlineUniformBlock; + VkBool32 descriptorBindingInlineUniformBlockUpdateAfterBind; + VkBool32 
pipelineCreationCacheControl; + VkBool32 privateData; + VkBool32 shaderDemoteToHelperInvocation; + VkBool32 shaderTerminateInvocation; + VkBool32 subgroupSizeControl; + VkBool32 computeFullSubgroups; + VkBool32 synchronization2; + VkBool32 textureCompressionASTC_HDR; + VkBool32 shaderZeroInitializeWorkgroupMemory; + VkBool32 dynamicRendering; + VkBool32 shaderIntegerDotProduct; + VkBool32 maintenance4; +} VkPhysicalDeviceVulkan13Features; + +typedef struct { + VkStructureType sType; + void * pNext; + VkBool32 shaderRelaxedExtendedInstruction; +} VkPhysicalDeviceShaderRelaxedExtendedInstructionFeaturesKHR; + +typedef struct { + VkStructureType sType; + void * pNext; + VkPhysicalDeviceFeatures features; +} VkPhysicalDeviceFeatures2; + +typedef struct { VkQueueFlags queueFlags; uint32_t queueCount; uint32_t timestampValidBits; @@ -1718,7 +2215,6 @@ typedef struct { VkColorSpaceKHR colorSpace; } VkSurfaceFormatKHR; - typedef struct { VkStructureType sType; const void * pNext; @@ -1726,39 +2222,82 @@ typedef struct { } VkFenceCreateInfo; typedef struct { + VkStructureType sType; + const void * pNext; + VkQueryPoolCreateFlags flags; + VkQueryType queryType; + uint32_t queryCount; + VkQueryPipelineStatisticFlags pipelineStatistics; +} VkQueryPoolCreateInfo; + +typedef struct { VkStructureType sType; const void * pNext; VkSemaphoreCreateFlags flags; } VkSemaphoreCreateInfo; typedef struct { - VkStructureType sType; - const void * pNext; - VkExternalSemaphoreHandleTypeFlags handleTypes; + VkStructureType sType; + const void * pNext; + VkSemaphoreType semaphoreType; + uint64_t initialValue; +} VkSemaphoreTypeCreateInfo; + +typedef struct { + VkStructureType sType; + const void * pNext; + uint32_t waitSemaphoreValueCount; + const uint64_t * pWaitSemaphoreValues; + uint32_t signalSemaphoreValueCount; + const uint64_t * pSignalSemaphoreValues; +} VkTimelineSemaphoreSubmitInfo; + +typedef struct { + VkStructureType sType; + const void * pNext; + VkSemaphoreWaitFlags flags; + 
uint32_t semaphoreCount; + const VkSemaphore * pSemaphores; + const uint64_t * pValues; +} VkSemaphoreWaitInfo; + +typedef struct { + VkStructureType sType; + const void * pNext; + VkSemaphore semaphore; + uint64_t value; +} VkSemaphoreSignalInfo; + +typedef struct { + VkStructureType sType; + const void * pNext; + VkSemaphore semaphore; + uint64_t value; + VkPipelineStageFlags2 stageMask; + uint32_t deviceIndex; +} VkSemaphoreSubmitInfo; + +typedef struct { + VkStructureType sType; + const void * pNext; + VkExternalSemaphoreHandleTypeFlags handleTypes; } VkExportSemaphoreCreateInfo; typedef struct { - VkStructureType sType; - const void * pNext; - VkSemaphore semaphore; - VkExternalSemaphoreHandleTypeFlagBits handleType; + VkStructureType sType; + const void * pNext; + VkSemaphore semaphore; + VkExternalSemaphoreHandleTypeFlagBits handleType; } VkSemaphoreGetWin32HandleInfoKHR; typedef struct { - VkStructureType sType; - const void * pNext; - VkSemaphore semaphore; - VkExternalSemaphoreHandleTypeFlagBits handleType; + VkStructureType sType; + const void * pNext; + VkSemaphore semaphore; + VkExternalSemaphoreHandleTypeFlagBits handleType; } VkSemaphoreGetFdInfoKHR; typedef struct { - VkComponentSwizzle r; - VkComponentSwizzle g; - VkComponentSwizzle b; - VkComponentSwizzle a; -} VkComponentMapping; - -typedef struct { VkImageAspectFlags aspectMask; uint32_t baseMipLevel; uint32_t levelCount; @@ -1767,6 +2306,64 @@ typedef struct { } VkImageSubresourceRange; typedef struct { + VkStructureType sType; + const void * pNext; + VkPipelineStageFlags2 srcStageMask; + VkAccessFlags2 srcAccessMask; + VkPipelineStageFlags2 dstStageMask; + VkAccessFlags2 dstAccessMask; +} VkMemoryBarrier2; + +typedef struct { + VkStructureType sType; + const void * pNext; + VkPipelineStageFlags2 srcStageMask; + VkAccessFlags2 srcAccessMask; + VkPipelineStageFlags2 dstStageMask; + VkAccessFlags2 dstAccessMask; + uint32_t srcQueueFamilyIndex; + uint32_t dstQueueFamilyIndex; + VkBuffer buffer; + 
VkDeviceSize offset; + VkDeviceSize size; +} VkBufferMemoryBarrier2; + +typedef struct { + VkStructureType sType; + const void * pNext; + VkPipelineStageFlags2 srcStageMask; + VkAccessFlags2 srcAccessMask; + VkPipelineStageFlags2 dstStageMask; + VkAccessFlags2 dstAccessMask; + VkImageLayout oldLayout; + VkImageLayout newLayout; + uint32_t srcQueueFamilyIndex; + uint32_t dstQueueFamilyIndex; + VkImage image; + VkImageSubresourceRange subresourceRange; +} VkImageMemoryBarrier2; + +typedef struct { + VkStructureType sType; + const void * pNext; + VkDependencyFlags dependencyFlags; + uint32_t memoryBarrierCount; + const VkMemoryBarrier2 * pMemoryBarriers; + uint32_t bufferMemoryBarrierCount; + const VkBufferMemoryBarrier2 * pBufferMemoryBarriers; + uint32_t imageMemoryBarrierCount; + const VkImageMemoryBarrier2 * pImageMemoryBarriers; +} VkDependencyInfo; + + +typedef struct { + VkComponentSwizzle r; + VkComponentSwizzle g; + VkComponentSwizzle b; + VkComponentSwizzle a; +} VkComponentMapping; + +typedef struct { VkStructureType sType; const void * pNext; VkImageViewCreateFlags flags; @@ -1778,6 +2375,35 @@ typedef struct { } VkImageViewCreateInfo; typedef struct { + VkStructureType sType; + const void * pNext; + VkBufferCreateFlags flags; + VkDeviceSize size; + VkBufferUsageFlags usage; + VkSharingMode sharingMode; + uint32_t queueFamilyIndexCount; + const uint32_t * pQueueFamilyIndices; +} VkBufferCreateInfo; + +typedef struct { + VkStructureType sType; + const void * pNext; + VkImageCreateFlags flags; + VkImageType imageType; + VkFormat format; + VkExtent3D extent; + uint32_t mipLevels; + uint32_t arrayLayers; + VkSampleCountFlagBits samples; + VkImageTiling tiling; + VkImageUsageFlags usage; + VkSharingMode sharingMode; + uint32_t queueFamilyIndexCount; + const uint32_t * pQueueFamilyIndices; + VkImageLayout initialLayout; +} VkImageCreateInfo; + +typedef struct { VkStructureType sType; const void * pNext; VkShaderModuleCreateFlags flags; @@ -1965,69 +2591,57 @@ 
typedef struct { const VkPushConstantRange * pPushConstantRanges; } VkPipelineLayoutCreateInfo; -typedef struct { - VkAttachmentDescriptionFlags flags; - VkFormat format; - VkSampleCountFlagBits samples; - VkAttachmentLoadOp loadOp; - VkAttachmentStoreOp storeOp; - VkAttachmentLoadOp stencilLoadOp; - VkAttachmentStoreOp stencilStoreOp; - VkImageLayout initialLayout; - VkImageLayout finalLayout; -} VkAttachmentDescription; +typedef union { + float float32[4]; + int32_t int32[4]; + uint32_t uint32[4]; +} VkClearColorValue; typedef struct { - uint32_t attachment; - VkImageLayout layout; -} VkAttachmentReference; + float depth; + uint32_t stencil; +} VkClearDepthStencilValue; + +typedef union { + VkClearColorValue color; + VkClearDepthStencilValue depthStencil; +} VkClearValue; typedef struct { - VkStructureType sType; - const void * pNext; - VkFramebufferCreateFlags flags; - VkRenderPass renderPass; - uint32_t attachmentCount; - const VkImageView * pAttachments; - uint32_t width; - uint32_t height; - uint32_t layers; -} VkFramebufferCreateInfo; - -typedef struct { - VkSubpassDescriptionFlags flags; - VkPipelineBindPoint pipelineBindPoint; - uint32_t inputAttachmentCount; - const VkAttachmentReference * pInputAttachments; - uint32_t colorAttachmentCount; - const VkAttachmentReference * pColorAttachments; - const VkAttachmentReference * pResolveAttachments; - const VkAttachmentReference * pDepthStencilAttachment; - uint32_t preserveAttachmentCount; - const uint32_t * pPreserveAttachments; -} VkSubpassDescription; - -typedef struct { - uint32_t srcSubpass; - uint32_t dstSubpass; - VkPipelineStageFlags srcStageMask; - VkPipelineStageFlags dstStageMask; - VkAccessFlags srcAccessMask; - VkAccessFlags dstAccessMask; - VkDependencyFlags dependencyFlags; -} VkSubpassDependency; + VkStructureType sType; + const void * pNext; + VkImageView imageView; + VkImageLayout imageLayout; + VkResolveModeFlagBits resolveMode; + VkImageView resolveImageView; + VkImageLayout 
resolveImageLayout; + VkAttachmentLoadOp loadOp; + VkAttachmentStoreOp storeOp; + VkClearValue clearValue; +} VkRenderingAttachmentInfo; typedef struct { - VkStructureType sType; - const void * pNext; - VkRenderPassCreateFlags flags; - uint32_t attachmentCount; - const VkAttachmentDescription * pAttachments; - uint32_t subpassCount; - const VkSubpassDescription * pSubpasses; - uint32_t dependencyCount; - const VkSubpassDependency * pDependencies; -} VkRenderPassCreateInfo; + VkStructureType sType; + const void * pNext; + VkRenderingFlags flags; + VkRect2D renderArea; + uint32_t layerCount; + uint32_t viewMask; + uint32_t colorAttachmentCount; + const VkRenderingAttachmentInfo * pColorAttachments; + const VkRenderingAttachmentInfo * pDepthAttachment; + const VkRenderingAttachmentInfo * pStencilAttachment; +} VkRenderingInfo; + +typedef struct { + VkStructureType sType; + const void * pNext; + uint32_t viewMask; + uint32_t colorAttachmentCount; + const VkFormat * pColorAttachmentFormats; + VkFormat depthAttachmentFormat; + VkFormat stencilAttachmentFormat; +} VkPipelineRenderingCreateInfo; typedef struct { VkStructureType sType; @@ -2062,21 +2676,41 @@ typedef struct { const VkCommandBufferInheritanceInfo * pInheritanceInfo; } VkCommandBufferBeginInfo; -typedef union { - float float32[4]; - int32_t int32[4]; - uint32_t uint32[4]; -} VkClearColorValue; +typedef struct { + VkStructureType sType; + const void * pNext; + VkCommandBuffer commandBuffer; + uint32_t deviceMask; +} VkCommandBufferSubmitInfo; typedef struct { - float depth; - uint32_t stencil; -} VkClearDepthStencilValue; + VkStructureType sType; + const void * pNext; + VkSubmitFlags flags; + uint32_t waitSemaphoreInfoCount; + const VkSemaphoreSubmitInfo * pWaitSemaphoreInfos; + uint32_t commandBufferInfoCount; + const VkCommandBufferSubmitInfo * pCommandBufferInfos; + uint32_t signalSemaphoreInfoCount; + const VkSemaphoreSubmitInfo * pSignalSemaphoreInfos; +} VkSubmitInfo2; -typedef union { - 
VkClearColorValue color; - VkClearDepthStencilValue depthStencil; -} VkClearValue; +typedef struct { + VkStructureType sType; + const void * pNext; + VkDeviceSize srcOffset; + VkDeviceSize dstOffset; + VkDeviceSize size; +} VkBufferCopy2; + +typedef struct { + VkStructureType sType; + const void * pNext; + VkBuffer srcBuffer; + VkBuffer dstBuffer; + uint32_t regionCount; + const VkBufferCopy2 * pRegions; +} VkCopyBufferInfo2; typedef struct { VkStructureType sType; @@ -2126,25 +2760,44 @@ typedef struct { } VkAllocationCallbacks; typedef struct { - VkStructureType sType; - const void * pNext; - VkDeviceMemory memory; - VkDeviceSize offset; - VkDeviceSize size; + VkStructureType sType; + const void * pNext; + VkDeviceMemory memory; + VkDeviceSize offset; + VkDeviceSize size; } VkMappedMemoryRange; typedef struct { VkStructureType sType; const void * pNext; + VkBuffer buffer; +} VkBufferDeviceAddressInfo; + +typedef struct { + VkStructureType sType; + const void * pNext; VkDeviceSize allocationSize; uint32_t memoryTypeIndex; } VkMemoryAllocateInfo; typedef struct { - VkStructureType sType; - const void * pNext; - VkMemoryAllocateFlags flags; - uint32_t deviceMask; + VkDeviceSize size; + VkDeviceSize alignment; + uint32_t memoryTypeBits; +} VkMemoryRequirements; + +typedef struct { + VkStructureType sType; + const void * pNext; + VkImage image; + VkBuffer buffer; +} VkMemoryDedicatedAllocateInfo; + +typedef struct { + VkStructureType sType; + const void * pNext; + VkMemoryAllocateFlags flags; + uint32_t deviceMask; } VkMemoryAllocateFlagsInfo; typedef struct { @@ -2154,17 +2807,23 @@ typedef struct { } VkExportMemoryAllocateInfo; typedef struct { - VkStructureType sType; - const void * pNext; - VkDeviceMemory memory; - VkExternalMemoryHandleTypeFlagBits handleType; + VkStructureType sType; + const void * pNext; + VkExternalMemoryHandleTypeFlags handleTypes; +} VkExternalMemoryImageCreateInfo; + +typedef struct { + VkStructureType sType; + const void * pNext; + 
VkDeviceMemory memory; + VkExternalMemoryHandleTypeFlagBits handleType; } VkMemoryGetWin32HandleInfoKHR; typedef struct { - VkStructureType sType; - const void * pNext; - VkDeviceMemory memory; - VkExternalMemoryHandleTypeFlagBits handleType; + VkStructureType sType; + const void * pNext; + VkDeviceMemory memory; + VkExternalMemoryHandleTypeFlagBits handleType; } VkMemoryGetFdInfoKHR; typedef struct { @@ -2193,6 +2852,36 @@ typedef struct { const VkDescriptorSetLayoutBinding * pBindings; } VkDescriptorSetLayoutCreateInfo; +typedef enum { + VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT = 0, + VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT = 1, + VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT = 2, + VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT = 3, + VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT = 4, + VK_VALIDATION_FEATURE_ENABLE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkValidationFeatureEnableEXT; + +typedef enum { + VK_VALIDATION_FEATURE_DISABLE_ALL_EXT = 0, + VK_VALIDATION_FEATURE_DISABLE_SHADERS_EXT = 1, + VK_VALIDATION_FEATURE_DISABLE_THREAD_SAFETY_EXT = 2, + VK_VALIDATION_FEATURE_DISABLE_API_PARAMETERS_EXT = 3, + VK_VALIDATION_FEATURE_DISABLE_OBJECT_LIFETIMES_EXT = 4, + VK_VALIDATION_FEATURE_DISABLE_CORE_CHECKS_EXT = 5, + VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT = 6, + VK_VALIDATION_FEATURE_DISABLE_SHADER_VALIDATION_CACHE_EXT = 7, + VK_VALIDATION_FEATURE_DISABLE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkValidationFeatureDisableEXT; + +typedef struct { + VkStructureType sType; + const void * pNext; + uint32_t enabledValidationFeatureCount; + const VkValidationFeatureEnableEXT * pEnabledValidationFeatures; + uint32_t disabledValidationFeatureCount; + const VkValidationFeatureDisableEXT * pDisabledValidationFeatures; +} VkValidationFeaturesEXT; + /* X(name, ret, params) */ #define VkLoaderProcedureList \ @@ -2208,6 +2897,8 @@ typedef struct { X(vkEnumerateDeviceExtensionProperties, VkResult, (VkPhysicalDevice physicalDevice, const char 
*pLayerName, uint32_t *pPropertyCount, VkExtensionProperties *pProperties)) \ X(vkEnumeratePhysicalDevices, VkResult, (VkInstance instance, uint32_t *pPhysicalDeviceCount, VkPhysicalDevice *pPhysicalDevices)) \ X(vkGetDeviceProcAddr, void *, (VkDevice device, const char *pName)) \ + X(vkGetPhysicalDeviceFeatures2, void, (VkPhysicalDevice physicalDevice, VkPhysicalDeviceFeatures2 *pFeatures)) \ + X(vkGetPhysicalDeviceFormatProperties2, void, (VkPhysicalDevice physicalDevice, VkFormat format, VkFormatProperties2 *pFormatProperties)) \ X(vkGetPhysicalDeviceMemoryProperties2, void, (VkPhysicalDevice physicalDevice, VkPhysicalDeviceMemoryProperties2 *pMemoryProperties)) \ X(vkGetPhysicalDeviceProperties2, void, (VkPhysicalDevice physicalDevice, VkPhysicalDeviceProperties2 *pProperties)) \ X(vkGetPhysicalDeviceQueueFamilyProperties, void, (VkPhysicalDevice physicalDevice, uint32_t *pQueueFamilyPropertyCount, VkQueueFamilyProperties *pQueueFamilyProperties)) \ @@ -2215,24 +2906,58 @@ typedef struct { /* X(name, ret, params) */ #define VkDeviceProcedureList \ - X(vkAllocateMemory, VkResult, (VkDevice device, const VkMemoryAllocateInfo *pAllocateInfo, const VkAllocationCallbacks *pAllocator, VkDeviceMemory *pMemory)) \ - X(vkCreateComputePipelines, VkResult, (VkDevice device, VkPipelineCache pipelineCache, uint32_t createInfoCount, const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)) \ - X(vkCreatePipelineLayout, VkResult, (VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout)) \ - X(vkCreateSemaphore, VkResult, (VkDevice device, const VkSemaphoreCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkSemaphore *pSemaphore)) \ - X(vkCreateShaderModule, VkResult, (VkDevice device, const VkShaderModuleCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule)) \ - 
X(vkDestroyBuffer, void, (VkDevice device, VkBuffer buffer, const VkAllocationCallbacks *pAllocator)) \ - X(vkDestroyPipeline, void, (VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator)) \ - X(vkDestroyPipelineLayout, void, (VkDevice device, VkPipelineLayout pipelineLayout, const VkAllocationCallbacks *pAllocator)) \ - X(vkDestroyShaderModule, void, (VkDevice device, VkShaderModule shaderModule, const VkAllocationCallbacks *pAllocator)) \ - X(vkFlushMappedMemoryRanges, VkResult, (VkDevice device, uint32_t memoryRangeCount, const VkMappedMemoryRange *pMemoryRanges)) \ - X(vkFreeMemory, void, (VkDevice device, VkDeviceMemory memory, const VkAllocationCallbacks *pAllocator)) \ - X(vkGetDeviceQueue, void, (VkDevice device, uint32_t queueFamilyIndex, uint32_t queueIndex, VkQueue *pQueue)) \ - X(vkGetMemoryFdKHR, VkResult, (VkDevice device, const VkMemoryGetFdInfoKHR *pGetFdInfo, int *pFd)) \ - X(vkGetMemoryWin32HandleKHR, VkResult, (VkDevice device, const VkMemoryGetWin32HandleInfoKHR *pGetWin32HandleInfo, void **pHandle)) \ - X(vkGetSemaphoreFdKHR, VkResult, (VkDevice device, const VkSemaphoreGetFdInfoKHR *pGetFdInfo, int *pFd)) \ - X(vkGetSemaphoreWin32HandleKHR, VkResult, (VkDevice device, const VkSemaphoreGetWin32HandleInfoKHR *pGetWin32HandleInfo, void **pHandle)) \ - X(vkMapMemory, VkResult, (VkDevice device, VkDeviceMemory memory, VkDeviceSize offset, VkDeviceSize size, VkMemoryMapFlags flags, void **ppData)) \ - X(vkUnmapMemory, void, (VkDevice device, VkDeviceMemory memory)) \ + X(vkAllocateCommandBuffers, VkResult, (VkDevice device, const VkCommandBufferAllocateInfo *pAllocateInfo, VkCommandBuffer *pCommandBuffers)) \ + X(vkAllocateMemory, VkResult, (VkDevice device, const VkMemoryAllocateInfo *pAllocateInfo, const VkAllocationCallbacks *pAllocator, VkDeviceMemory *pMemory)) \ + X(vkBindBufferMemory, VkResult, (VkDevice device, VkBuffer buffer, VkDeviceMemory memory, VkDeviceSize memoryOffset)) \ + X(vkBindImageMemory, VkResult, 
(VkDevice device, VkImage image, VkDeviceMemory memory, VkDeviceSize memoryOffset)) \ + X(vkCreateBuffer, VkResult, (VkDevice device, const VkBufferCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkBuffer *pBuffer)) \ + X(vkCreateCommandPool, VkResult, (VkDevice device, const VkCommandPoolCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkCommandPool *pCommandPool)) \ + X(vkCreateComputePipelines, VkResult, (VkDevice device, VkPipelineCache pipelineCache, uint32_t createInfoCount, const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)) \ + X(vkCreateGraphicsPipelines, VkResult, (VkDevice device, VkPipelineCache pipelineCache, uint32_t createInfoCount, const VkGraphicsPipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)) \ + X(vkCreateImage, VkResult, (VkDevice device, const VkImageCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkImage *pImage)) \ + X(vkCreateImageView, VkResult, (VkDevice device, const VkImageViewCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkImageView *pView)) \ + X(vkCreatePipelineLayout, VkResult, (VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout)) \ + X(vkCreateQueryPool, VkResult, (VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)) \ + X(vkCreateSemaphore, VkResult, (VkDevice device, const VkSemaphoreCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkSemaphore *pSemaphore)) \ + X(vkCreateShaderModule, VkResult, (VkDevice device, const VkShaderModuleCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule)) \ + X(vkDestroyBuffer, void, (VkDevice device, VkBuffer buffer, const VkAllocationCallbacks *pAllocator)) \ + X(vkDestroyImage, void, (VkDevice 
device, VkImage image, const VkAllocationCallbacks *pAllocator)) \ + X(vkDestroyImageView, void, (VkDevice device, VkImageView imageView, const VkAllocationCallbacks *pAllocator)) \ + X(vkDestroyPipeline, void, (VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator)) \ + X(vkDestroyPipelineLayout, void, (VkDevice device, VkPipelineLayout pipelineLayout, const VkAllocationCallbacks *pAllocator)) \ + X(vkDestroyShaderModule, void, (VkDevice device, VkShaderModule shaderModule, const VkAllocationCallbacks *pAllocator)) \ + X(vkFlushMappedMemoryRanges, VkResult, (VkDevice device, uint32_t memoryRangeCount, const VkMappedMemoryRange *pMemoryRanges)) \ + X(vkFreeMemory, void, (VkDevice device, VkDeviceMemory memory, const VkAllocationCallbacks *pAllocator)) \ + X(vkGetBufferDeviceAddress, VkDeviceAddress, (VkDevice device, const VkBufferDeviceAddressInfo *pInfo)) \ + X(vkGetBufferMemoryRequirements, void, (VkDevice device, VkBuffer buffer, VkMemoryRequirements *pMemoryRequirements)) \ + X(vkGetDeviceQueue, void, (VkDevice device, uint32_t queueFamilyIndex, uint32_t queueIndex, VkQueue *pQueue)) \ + X(vkGetImageMemoryRequirements, void, (VkDevice device, VkImage image, VkMemoryRequirements *pMemoryRequirements)) \ + X(vkGetMemoryFdKHR, VkResult, (VkDevice device, const VkMemoryGetFdInfoKHR *pGetFdInfo, int *pFd)) \ + X(vkGetMemoryWin32HandleKHR, VkResult, (VkDevice device, const VkMemoryGetWin32HandleInfoKHR *pGetWin32HandleInfo, void **pHandle)) \ + X(vkGetQueryPoolResults, VkResult, (VkDevice device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount, size_t dataSize, void *pData, VkDeviceSize stride, VkQueryResultFlags flags)) \ + X(vkGetSemaphoreFdKHR, VkResult, (VkDevice device, const VkSemaphoreGetFdInfoKHR *pGetFdInfo, int *pFd)) \ + X(vkGetSemaphoreWin32HandleKHR, VkResult, (VkDevice device, const VkSemaphoreGetWin32HandleInfoKHR *pGetWin32HandleInfo, void **pHandle)) \ + X(vkInvalidateMappedMemoryRanges, VkResult, (VkDevice 
device, uint32_t memoryRangeCount, const VkMappedMemoryRange *pMemoryRanges)) \ + X(vkMapMemory, VkResult, (VkDevice device, VkDeviceMemory memory, VkDeviceSize offset, VkDeviceSize size, VkMemoryMapFlags flags, void **ppData)) \ + X(vkSignalSemaphore, VkResult, (VkDevice device, const VkSemaphoreSignalInfo *pSignalInfo)) \ + X(vkUnmapMemory, void, (VkDevice device, VkDeviceMemory memory)) \ + X(vkWaitSemaphores, VkResult, (VkDevice device, const VkSemaphoreWaitInfo *pWaitInfo, uint64_t timeout)) \ + X(vkBeginCommandBuffer, VkResult, (VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)) \ + X(vkCmdBeginRendering, void, (VkCommandBuffer commandBuffer, const VkRenderingInfo *pRenderingInfo)) \ + X(vkCmdBindIndexBuffer2, void, (VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, VkIndexType indexType)) \ + X(vkCmdBindPipeline, void, (VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline pipeline)) \ + X(vkCmdCopyBuffer2, void, (VkCommandBuffer commandBuffer, const VkCopyBufferInfo2 *pCopyBufferInfo)) \ + X(vkCmdDispatch, void, (VkCommandBuffer commandBuffer, uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)) \ + X(vkCmdDrawIndexed, void, (VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance)) \ + X(vkCmdEndRendering, void, (VkCommandBuffer commandBuffer)) \ + X(vkCmdPipelineBarrier2, void, (VkCommandBuffer commandBuffer, const VkDependencyInfo *pDependencyInfo)) \ + X(vkCmdPushConstants, void, (VkCommandBuffer commandBuffer, VkPipelineLayout layout, VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size, const void *pValues)) \ + X(vkCmdResetQueryPool, void, (VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)) \ + X(vkCmdSetScissor, void, (VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount, const VkRect2D 
*pScissors)) \ + X(vkCmdSetViewport, void, (VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount, const VkViewport *pViewports)) \ + X(vkCmdWriteTimestamp2, void, (VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage, VkQueryPool queryPool, uint32_t query)) \ + X(vkEndCommandBuffer, VkResult, (VkCommandBuffer commandBuffer)) \ + X(vkQueueSubmit2, VkResult, (VkQueue queue, uint32_t submitCount, const VkSubmitInfo2 *pSubmits, VkFence fence)) \ #define X(name, ret, params) typedef ret name##_fn params;