core: migrate to vulkan compute - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: f33f8f7270186a95011e8cf201acb3b50733cd4f
Parent: 3e4bea29377e32bee5ef97cc5efef02310587b1b
Author: Randy Palamar
Date:   Thu,  7 May 2026 15:28:11 -0600

core: migrate to vulkan compute

Most things are working here. The only relevant thing that is
currently missing is frame averaging, which no one really uses.
This has a minor performance regression, which I saw before when I
tried switching the images in OpenGL to a large SSBO. It can be
solved by not doing the "DAS Fast" thing (running a single channel
at a time), but this may cause issues on lower-end devices. The
next commit will implement a new optimization which should solve
this universally.

Diffstat:
M beamformer.c  | 338 ++++++++++++++++++-------------------------------------------------------------
M beamformer.h  | 1 +
M beamformer.meta  | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M beamformer_core.c  | 1620 ++++++++++++++++++++++++++++++++++++++++---------------------------------------
M beamformer_internal.h  | 383 +++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
M beamformer_parameters.h  | 6 ++++--
M beamformer_shared_memory.c  | 9 ++++-----
M build.c  | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
D external/include/raylib_extended.h  | 2 --
D external/rcore_extended.c  | 8 --------
M generated/beamformer.meta.c  | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M lib/ogl_beamformer_lib.c  | 15 +++++++++++++--
M lib/ogl_beamformer_lib_base.h  | 4 ++++
M main_linux.c  | 11 +----------
M main_w32.c  | 11 +----------
M math.c  | 20 +++-----------------
M opengl.h  | 131 ++++++++++++-------------------------------------------------------------------
A shaders/buffer_clear.glsl  | 11 +++++++++++
A shaders/coherency_weighting.glsl  | 41 +++++++++++++++++++++++++++++++++++++++++
M shaders/das.glsl  | 161 ++++++++++++++++++++++++++++++++++++++++++-------------------------------------
M shaders/decode.glsl  | 108 ++++++++++++++++++++++++++++++++++---------------------------------------------
M shaders/filter.glsl  | 24 +++++++++++++-----------
M shaders/render_3d.frag.glsl  | 60 ++++++++++++++++++++++++++++++++++++++++++++----------------
A shaders/render_3d.vert.glsl  | 19 +++++++++++++++++++
M ui.c  | 528 +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
M util.c  | 8 --------
M util.h  | 16 +++++++++-------
D util_gl.c  | 69 ---------------------------------------------------------------------
M util_os.c  | 18 ++++++++++++++++++
M vulkan.c  | 1838 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
M vulkan.h  | 1179 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------

31 files changed, 4942 insertions(+), 2341 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -56,30 +56,10 @@ fatal(s8 message)
 
 #include "vulkan.c"
 
-// TODO(rnp): none of this belongs here, but will be removed
+// TODO(rnp): this doesn't belong here, but will be removed
 // once vulkan migration is complete
-#define GLFW_VISIBLE 0x00020004
-void   glfwWindowHint(i32, i32);
-iptr   glfwCreateWindow(i32, i32, char *, iptr, iptr);
-void   glfwMakeContextCurrent(iptr);
-iptr   glfwGetGLXContext(iptr);
-iptr   glfwGetWGLContext(iptr);
 void * glfwGetProcAddress(char *);
 
-#if OS_WINDOWS
-function iptr
-os_get_native_gl_context(iptr window)
-{
-	return glfwGetWGLContext(window);
-}
-#else
-function iptr
-os_get_native_gl_context(iptr window)
-{
-	return glfwGetGLXContext(window);
-}
-#endif
-
 function void
 gl_debug_logger(u32 src, u32 type, u32 id, u32 lvl, i32 len, const char *msg, const void *userctx)
 {
@@ -100,7 +80,12 @@ load_gl(Stream *err)
 	stream_reset(err, 0);
 	#define X(name, ret, params) if (!name) stream_append_s8(err, s8("missing required GL function: " #name "\n"));
 	OGLProcedureList
-	OGLRequiredExtensionProcedureList
+	OGLRequiredExtensionProcedureListBase
+	#if OS_WINDOWS
+	  OGLRequiredExtensionProcedureListW32
+	#else
+	  OGLRequiredExtensionProcedureListLinux
+	#endif
 	#undef X
 
 	if (err->widx) fatal(stream_to_s8(err));
@@ -129,41 +114,6 @@ beamformer_load_cuda_library(BeamformerCtx *ctx, OSLibrary cuda, Arena arena)
 	#undef X
 }
 
-function BeamformerRenderModel
-render_model_from_arrays(f32 *vertices, f32 *normals, i32 vertices_size, u16 *indices, i32 index_count)
-{
-	BeamformerRenderModel result = {0};
-
-	i32 buffer_size    = vertices_size * 2 + index_count * (i32)sizeof(u16);
-	i32 indices_offset = vertices_size * 2;
-	i32 indices_size   = index_count * (i32)sizeof(u16);
-
-	result.elements        = index_count;
-	result.elements_offset = indices_offset;
-
-	glCreateBuffers(1, &result.buffer);
-	glNamedBufferStorage(result.buffer, buffer_size, 0, GL_DYNAMIC_STORAGE_BIT);
-	glNamedBufferSubData(result.buffer, 0,              vertices_size, vertices);
-	glNamedBufferSubData(result.buffer, vertices_size,  vertices_size, normals);
-	glNamedBufferSubData(result.buffer, indices_offset, indices_size,  indices);
-
-	glCreateVertexArrays(1, &result.vao);
-	glVertexArrayVertexBuffer(result.vao, 0, result.buffer, 0,             3 * sizeof(f32));
-	glVertexArrayVertexBuffer(result.vao, 1, result.buffer, vertices_size, 3 * sizeof(f32));
-	glVertexArrayElementBuffer(result.vao, result.buffer);
-
-	glEnableVertexArrayAttrib(result.vao, 0);
-	glEnableVertexArrayAttrib(result.vao, 1);
-
-	glVertexArrayAttribFormat(result.vao, 0, 3, GL_FLOAT, 0, 0);
-	glVertexArrayAttribFormat(result.vao, 1, 3, GL_FLOAT, 0, (u32)vertices_size);
-
-	glVertexArrayAttribBinding(result.vao, 0, 0);
-	glVertexArrayAttribBinding(result.vao, 1, 0);
-
-	return result;
-}
-
 function void
 worker_thread_sleep(GLWorkerThreadContext *ctx, BeamformerSharedMemory *sm)
 {
@@ -186,17 +136,12 @@ function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point)
 {
 	GLWorkerThreadContext *ctx = user_context;
 
-	glfwMakeContextCurrent(ctx->window_handle);
-	ctx->gl_context = os_get_native_gl_context(ctx->window_handle);
-
 	BeamformerCtx *beamformer = (BeamformerCtx *)ctx->user_context;
-	glCreateQueries(GL_TIME_ELAPSED, countof(beamformer->compute_context.shader_timer_ids),
-	                beamformer->compute_context.shader_timer_ids);
 
 	for (;;) {
 		worker_thread_sleep(ctx, beamformer->shared_memory);
 		asan_poison_region(ctx->arena.beg, ctx->arena.end - ctx->arena.beg);
-		beamformer_complete_compute(ctx->user_context, &ctx->arena, ctx->gl_context);
+		beamformer_complete_compute(beamformer, &ctx->arena);
 	}
 
 	unreachable();
@@ -206,31 +151,8 @@ function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point)
 
 function OS_THREAD_ENTRY_POINT_FN(beamformer_upload_entry_point)
 {
-	GLWorkerThreadContext *ctx = user_context;
-	glfwMakeContextCurrent(ctx->window_handle);
-	ctx->gl_context = os_get_native_gl_context(ctx->window_handle);
-
-	BeamformerUploadThreadContext *up = (typeof(up))ctx->user_context;
-	BeamformerRFBuffer            *rf = up->rf_buffer;
-	glCreateQueries(GL_TIMESTAMP, 1, &rf->data_timestamp_query);
-	/* NOTE(rnp): start this here so we don't have to worry about it being started or not */
-	glQueryCounter(rf->data_timestamp_query, GL_TIMESTAMP);
-
-	glGenSemaphoresEXT(countof(rf->gl_upload_semaphores), rf->gl_upload_semaphores);
-	for EachElement(rf->vk_upload_semaphores, it) {
-		OSHandle export = {0};
-		rf->vk_upload_semaphores[it] = vk_semaphore_create(rf->upload_semaphores_handles + it);
-
-		if (OS_WINDOWS) {
-			glImportSemaphoreWin32HandleEXT(rf->gl_upload_semaphores[it], GL_HANDLE_TYPE_OPAQUE_WIN32_EXT,
-			                                 (void *)export.value[0]);
-			// NOTE(rnp): w32 does not transfer ownership from handle back to driver
-			rf->upload_semaphores_handles[it] = export;
-		} else {
-			glImportSemaphoreFdEXT(rf->gl_upload_semaphores[it], GL_HANDLE_TYPE_OPAQUE_FD_EXT, export.value[0]);
-			rf->upload_semaphores_handles[it].value[0] = OSInvalidHandleValue;
-		}
-	}
+	GLWorkerThreadContext         *ctx = user_context;
+	BeamformerUploadThreadContext *up  = (typeof(up))ctx->user_context;
 
 	for (;;) {
 		worker_thread_sleep(ctx, up->shared_memory);
@@ -264,6 +186,45 @@ beamformer_init(BeamformerInput *input)
 
 	vk_load(input->vulkan_library_handle, &memory, &ctx->error_stream);
 
+	BeamformerComputeContext *cs = &ctx->compute_context;
+
+	// NOTE(rnp): allocate beamformed image ring buffer
+	{
+		u64 gpu_heap_size = vk_gpu_info()->gpu_heap_size;
+		u64 trial_sizes[] = {
+			GB(4),
+			GB(2),
+			GB(1) + MB(512),
+			GB(1),
+		};
+
+		u32 base_index = 0;
+		for EachElement(trial_sizes, it) {
+			if (gpu_heap_size >= 2 * trial_sizes[it])
+				break;
+			base_index++;
+		}
+
+		for (u32 i = base_index; i < countof(trial_sizes); i++) {
+			// TODO(rnp): it may be better to download data from this using the transfer queue
+			VulkanTimeline timelines[] = {VulkanTimeline_Compute, VulkanTimeline_Graphics};
+			GPUBufferAllocateInfo allocate_info = {
+				.size            = trial_sizes[i],
+				.flags           = VulkanUsageFlag_TransferSource|VulkanUsageFlag_HostReadWrite,
+				.timeline_count  = countof(timelines),
+				.timelines_used  = timelines,
+				.label           = s8("BeamformedData"),
+			};
+			vk_buffer_allocate(cs->backlog.buffer, &allocate_info);
+			if (cs->backlog.buffer->size > 0)
+				break;
+		}
+		if (cs->backlog.buffer->size == 0) {
+			// NOTE(rnp): if this becomes an issue we may be able to get by in some other way
+			fatal(s8("Failed to allocate space for beamformed data\n"));
+		}
+	}
+
 	beamformer_load_cuda_library(ctx, input->cuda_library_handle, memory);
 
 	SetConfigFlags(FLAG_VSYNC_HINT|FLAG_WINDOW_ALWAYS_RUN);
@@ -272,15 +233,8 @@ beamformer_init(BeamformerInput *input)
 	SetWindowState(FLAG_WINDOW_RESIZABLE);
 	SetWindowMinSize(840, ctx->window_size.h);
 
-	glfwWindowHint(GLFW_VISIBLE, 0);
-	iptr raylib_window_handle = (iptr)GetPlatformWindowHandle();
-
 	load_gl(&ctx->error_stream);
 
-	ctx->beamform_work_queue  = push_struct(&memory, BeamformWorkQueue);
-	ctx->compute_shader_stats = push_struct(&memory, ComputeShaderStats);
-	ctx->compute_timing_table = push_struct(&memory, ComputeTimingTable);
-
 	ctx->shared_memory      = input->shared_memory;
 	ctx->shared_memory_size = input->shared_memory_size;
 	if (ctx->shared_memory_size < (i64)sizeof(*ctx->shared_memory))
@@ -289,6 +243,7 @@ beamformer_init(BeamformerInput *input)
 
 	ctx->shared_memory->version = BEAMFORMER_SHARED_MEMORY_VERSION;
 	ctx->shared_memory->reserved_parameter_blocks = 1;
+	ctx->shared_memory->max_beamformed_data_size = cs->backlog.buffer->size;
 
 	/* TODO(rnp): I'm not sure if its a good idea to pre-reserve a bunch of semaphores
 	 * on w32 but thats what we are doing for now */
@@ -316,14 +271,10 @@ beamformer_init(BeamformerInput *input)
 	}
 	#endif
 
-	BeamformerComputeContext *cs = &ctx->compute_context;
-	cs->rf_buffer.export_handle  = (OSHandle){OSInvalidHandleValue};
-
 	GLWorkerThreadContext *worker = &ctx->compute_worker;
 	/* TODO(rnp): we should lock this down after we have something working */
-	worker->user_context  = (iptr)ctx;
-	worker->window_handle = glfwCreateWindow(1, 1, "", 0, raylib_window_handle);
-	worker->handle        = os_create_thread("[compute]", worker, compute_worker_thread_entry_point);
+	worker->user_context = (iptr)ctx;
+	worker->handle       = os_create_thread("[compute]", worker, compute_worker_thread_entry_point);
 
 	GLWorkerThreadContext         *upload = &ctx->upload_worker;
 	BeamformerUploadThreadContext *upctx  = push_struct(&memory, typeof(*upctx));
@@ -333,10 +284,7 @@ beamformer_init(BeamformerInput *input)
 	upctx->shared_memory_size   = ctx->shared_memory_size;
 	upctx->compute_timing_table = ctx->compute_timing_table;
 	upctx->compute_worker_sync  = &ctx->compute_worker.sync_variable;
-	upload->window_handle       = glfwCreateWindow(1, 1, "", 0, raylib_window_handle);
-	upload->handle              = os_create_thread("[upload]", upload, beamformer_upload_entry_point);
-
-	glfwMakeContextCurrent(raylib_window_handle);
+	upload->handle = os_create_thread("[upload]", upload, beamformer_upload_entry_point);
 
 	/* NOTE: set up OpenGL debug logging */
 	Stream *gl_error_stream = push_struct(&memory, Stream);
@@ -352,171 +300,37 @@ beamformer_init(BeamformerInput *input)
 			i32   index = beamformer_reloadable_compute_shader_info_indices[it];
 			Arena temp  = scratch;
 			s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"),
-			                             beamformer_reloadable_shader_files[index]);
+			                             beamformer_reloadable_shader_files[index][0]);
 			BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
-			frc->kind                = BeamformerFileReloadKind_ComputeShader;
-			frc->compute_shader_kind = beamformer_reloadable_shader_kinds[index];
+			frc->kind                 = BeamformerFileReloadKind_ComputeShader;
+			frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index];
 			os_add_file_watch((char *)file.data, file.len, frc);
 		}
-	}
 
-	FrameViewRenderContext *fvr = &ctx->frame_view_render_context;
-	glCreateFramebuffers(countof(fvr->framebuffers), fvr->framebuffers);
-	LABEL_GL_OBJECT(GL_FRAMEBUFFER, fvr->framebuffers[0], s8("Frame View Framebuffer"));
-	LABEL_GL_OBJECT(GL_FRAMEBUFFER, fvr->framebuffers[1], s8("Frame View Resolving Framebuffer"));
-
-	glCreateRenderbuffers(countof(fvr->renderbuffers), fvr->renderbuffers);
-	u32 msaa_samples = vk_gpu_info()->max_msaa_samples;
-	glNamedRenderbufferStorageMultisample(fvr->renderbuffers[0], msaa_samples, GL_RGBA8,
-	                                      FRAME_VIEW_RENDER_TARGET_SIZE);
-	glNamedRenderbufferStorageMultisample(fvr->renderbuffers[1], msaa_samples, GL_DEPTH_COMPONENT24,
-	                                      FRAME_VIEW_RENDER_TARGET_SIZE);
-
-	static_assert(countof(beamformer_reloadable_render_shader_info_indices) == 1,
-	              "only a single render shader is currently handled");
-	i32 render_rsi_index = beamformer_reloadable_render_shader_info_indices[0];
-
-	// TODO(rnp): leaks when BakeShaders is true
-	Arena *arena = &memory;
-	BeamformerShaderReloadContext *render_3d = push_struct(arena, typeof(*render_3d));
-	render_3d->reloadable_info_index = render_rsi_index;
-	render_3d->gl_type = GL_FRAGMENT_SHADER;
-	render_3d->header  = s8(""
-	"layout(location = 0) in  vec3 normal;\n"
-	"layout(location = 1) in  vec3 texture_coordinate;\n\n"
-	"layout(location = 2) in  vec3 test_texture_coordinate;\n\n"
-	"layout(location = 0) out vec4 out_colour;\n\n"
-	"layout(location = " str(FRAME_VIEW_DYNAMIC_RANGE_LOC) ") uniform float u_db_cutoff = 60;\n"
-	"layout(location = " str(FRAME_VIEW_THRESHOLD_LOC)     ") uniform float u_threshold = 40;\n"
-	"layout(location = " str(FRAME_VIEW_GAMMA_LOC)         ") uniform float u_gamma     = 1;\n"
-	"layout(location = " str(FRAME_VIEW_LOG_SCALE_LOC)     ") uniform bool  u_log_scale;\n"
-	"layout(location = " str(FRAME_VIEW_BB_COLOUR_LOC)     ") uniform vec4  u_bb_colour   = vec4(" str(FRAME_VIEW_BB_COLOUR) ");\n"
-	"layout(location = " str(FRAME_VIEW_BB_FRACTION_LOC)   ") uniform float u_bb_fraction = " str(FRAME_VIEW_BB_FRACTION) ";\n"
-	"layout(location = " str(FRAME_VIEW_SOLID_BB_LOC)      ") uniform bool  u_solid_bb;\n"
-	"\n"
-	"layout(binding = 0) uniform sampler3D u_texture;\n");
-
-	render_3d->link = push_struct(arena, typeof(*render_3d));
-	render_3d->link->reloadable_info_index = -1;
-	render_3d->link->gl_type = GL_VERTEX_SHADER;
-	render_3d->link->link    = render_3d;
-	render_3d->link->header  = s8(""
-	"layout(location = 0) in vec3 v_position;\n"
-	"layout(location = 1) in vec3 v_normal;\n"
-	"\n"
-	"layout(location = 0) out vec3 f_normal;\n"
-	"layout(location = 1) out vec3 f_texture_coordinate;\n"
-	"layout(location = 2) out vec3 f_orig_texture_coordinate;\n"
-	"\n"
-	"layout(location = " str(FRAME_VIEW_MODEL_MATRIX_LOC)  ") uniform mat4  u_model;\n"
-	"layout(location = " str(FRAME_VIEW_VIEW_MATRIX_LOC)   ") uniform mat4  u_view;\n"
-	"layout(location = " str(FRAME_VIEW_PROJ_MATRIX_LOC)   ") uniform mat4  u_projection;\n"
-	"\n"
-	"\n"
-	"void main()\n"
-	"{\n"
-	"\tvec3 pos = v_position;\n"
-	"\tf_orig_texture_coordinate = (2 * v_position + 1) / 2;\n"
-	//"\tif (v_position.y == -1) pos.x = clamp(v_position.x, -u_clip_fraction, u_clip_fraction);\n"
-	"\tvec3 tex_coord = (2 * pos + 1) / 2;\n"
-	"\tf_texture_coordinate = tex_coord;\n"
-	//"\tf_texture_coordinate = u_swizzle? tex_coord.xzy : tex_coord;\n"
-	//"\tf_normal    = normalize(mat3(u_model) * v_normal);\n"
-	"\tf_normal    = v_normal;\n"
-	"\tgl_Position = u_projection * u_view * u_model * vec4(pos, 1);\n"
-	"}\n");
-
-	// TODO(rnp): this is probably not expected by the platform, refactor so that all
-	// needed context (eg. headers) are available outside of here and push initial load
-	// into ui_init
-	{
-		BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
-		frc->kind                  = BeamformerFileReloadKind_Shader;
-		frc->shader_reload_context = render_3d;
-		input->event_queue[input->event_count++] = (BeamformerInputEvent){
-			.kind = BeamformerInputEventKind_FileEvent,
-			.file_watch_user_context = frc,
-		};
+		for EachElement(beamformer_reloadable_compute_helpers_shader_info_indices, it) {
+			i32   index = beamformer_reloadable_compute_helpers_shader_info_indices[it];
+			Arena temp  = scratch;
+			s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"),
+			                             beamformer_reloadable_shader_files[index][0]);
+			BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
+			frc->kind                 = BeamformerFileReloadKind_ComputeShader;
+			frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index];
+			os_add_file_watch((char *)file.data, file.len, frc);
+		}
 
-		s8 render_file = {0};
-		if (!BakeShaders) {
-			render_file = push_s8_from_parts(&scratch, os_path_separator(), s8("shaders"),
-			                                 beamformer_reloadable_shader_files[render_rsi_index]);
-			os_add_file_watch((char *)render_file.data, render_file.len, frc);
+		for EachElement(beamformer_reloadable_compute_internal_shader_info_indices, it) {
+			i32   index = beamformer_reloadable_compute_internal_shader_info_indices[it];
+			Arena temp  = scratch;
+			s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"),
+			                             beamformer_reloadable_shader_files[index][0]);
+			BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
+			frc->kind                   = BeamformerFileReloadKind_ComputeInternalShader;
+			frc->shader_reload.shader   = beamformer_reloadable_shader_kinds[index];
+			frc->shader_reload.pipeline = cs->compute_internal_pipelines + it;
+			os_add_file_watch((char *)file.data, file.len, frc);
 		}
 	}
 
-	f32 unit_cube_vertices[] = {
-		 0.5f,  0.5f, -0.5f,
-		 0.5f,  0.5f, -0.5f,
-		 0.5f,  0.5f, -0.5f,
-		 0.5f, -0.5f, -0.5f,
-		 0.5f, -0.5f, -0.5f,
-		 0.5f, -0.5f, -0.5f,
-		 0.5f,  0.5f,  0.5f,
-		 0.5f,  0.5f,  0.5f,
-		 0.5f,  0.5f,  0.5f,
-		 0.5f, -0.5f,  0.5f,
-		 0.5f, -0.5f,  0.5f,
-		 0.5f, -0.5f,  0.5f,
-		-0.5f,  0.5f, -0.5f,
-		-0.5f,  0.5f, -0.5f,
-		-0.5f,  0.5f, -0.5f,
-		-0.5f, -0.5f, -0.5f,
-		-0.5f, -0.5f, -0.5f,
-		-0.5f, -0.5f, -0.5f,
-		-0.5f,  0.5f,  0.5f,
-		-0.5f,  0.5f,  0.5f,
-		-0.5f,  0.5f,  0.5f,
-		-0.5f, -0.5f,  0.5f,
-		-0.5f, -0.5f,  0.5f,
-		-0.5f, -0.5f,  0.5f
-	};
-	f32 unit_cube_normals[] = {
-		 0.0f,  0.0f, -1.0f,
-		 0.0f,  1.0f,  0.0f,
-		 1.0f,  0.0f,  0.0f,
-		 0.0f,  0.0f, -1.0f,
-		 0.0f, -1.0f,  0.0f,
-		 1.0f,  0.0f,  0.0f,
-		 0.0f,  0.0f,  1.0f,
-		 0.0f,  1.0f,  0.0f,
-		 1.0f,  0.0f,  0.0f,
-		 0.0f,  0.0f,  1.0f,
-		 0.0f, -1.0f,  0.0f,
-		 1.0f,  0.0f,  0.0f,
-		 0.0f,  0.0f, -1.0f,
-		 0.0f,  1.0f,  0.0f,
-		-1.0f,  0.0f,  0.0f,
-		 0.0f,  0.0f, -1.0f,
-		 0.0f, -1.0f,  0.0f,
-		-1.0f,  0.0f,  0.0f,
-		 0.0f,  0.0f,  1.0f,
-		 0.0f,  1.0f,  0.0f,
-		-1.0f,  0.0f,  0.0f,
-		 0.0f,  0.0f,  1.0f,
-		 0.0f, -1.0f,  0.0f,
-		-1.0f,  0.0f,  0.0f
-	};
-	u16 unit_cube_indices[] = {
-		1,  13, 19,
-		1,  19, 7,
-		9,  6,  18,
-		9,  18, 21,
-		23, 20, 14,
-		23, 14, 17,
-		16, 4,  10,
-		16, 10, 22,
-		5,  2,  8,
-		5,  8,  11,
-		15, 12, 0,
-		15, 0,  3
-	};
-
-	cs->unit_cube_model = render_model_from_arrays(unit_cube_vertices, unit_cube_normals,
-	                                               sizeof(unit_cube_vertices),
-	                                               unit_cube_indices, countof(unit_cube_indices));
-
 	memory.end = scratch.end;
 	ctx->arena = memory;
 	ctx->state = BeamformerState_Running;
diff --git a/beamformer.h b/beamformer.h
@@ -182,6 +182,7 @@ typedef struct {
 	#if BEAMFORMER_RENDERDOC_HOOKS
 	void *renderdoc_start_frame_capture;
 	void *renderdoc_end_frame_capture;
+	void *renderdoc_set_capture_file_path_template;
 	#endif
 } BeamformerInput;
 
diff --git a/beamformer.meta b/beamformer.meta
@@ -1,5 +1,5 @@
 @Constant(4)    FilterSlots
-@Constant(16)   MaxBacklogFrames
+@Constant(4096) MaxBacklogFrames
 @Constant(256)  MaxChannelCount
 @Constant(256)  MaxEmissionsCount
 @Constant(16)   MaxComputeShaderStages
@@ -210,6 +210,13 @@
 @Library @Struct SimpleParameters
 @MATLAB  @Struct SimpleParameters
 
+@Struct DASArrayParameters
+{
+	[focal_vectors                 V2  MaxChannelCount]
+	[sparse_elements               S16 MaxChannelCount]
+	[transmit_receive_orientations U16 MaxChannelCount]
+}
+
 @Emit
 {
 	`read_only global u8 beamformer_data_kind_element_size[] = {`
@@ -274,6 +281,15 @@
 			[ToProcess            to_process             U32]
 			[TransmitCount        transmit_count         U32]
 		}
+
+		@PushConstants
+		{
+			[hadamard_buffer  U64]
+			[rf_buffer        U64]
+			[output_buffer    U64]
+			[output_rf_buffer U64]
+			[first_pass       B32]
+		}
 	}
 
 	@Shader(filter.glsl) Filter
@@ -301,22 +317,32 @@
 			[DemodulationFrequency demodulation_frequency F32]
 			[SamplingFrequency     sampling_frequency     F32]
 		}
+
+		@PushConstants
+		{
+			[input_data          U64]
+			[output_data         U64]
+			[filter_coefficients U64]
+		}
 	}
 
 	@Shader(das.glsl) DAS
 	{
+		@Constant    MaxChannelCount
+
 		@Enumeration AcquisitionKind
 		@Enumeration DataKind
 		@Enumeration InterpolationMode
 		@Enumeration RCAOrientation
 
+		@Struct      DASArrayParameters
+
 		@Bake
 		{
 			[DataKind                   data_kind                    U32]
 			[CoherencyWeighting         coherency_weighting          U32]
 			[SingleFocus                single_focus                 U32]
 			[SingleOrientation          single_orientation           U32]
-			[Fast                       fast                         U32]
 			[Sparse                     sparse                       U32]
 			[AcquisitionCount           acquisition_count            U32]
 			[AcquisitionKind            acquisition_kind             U32]
@@ -336,17 +362,101 @@
 
 		@PushConstants
 		{
-			[xdc_transform     M4]
-			[voxel_transform   M4]
-			[xdc_element_pitch V2]
+			[xdc_transform      M4]
+			[voxel_transform    M4]
+			[xdc_element_pitch  V2]
+			[rf_data           U64]
+			[output_data       U64]
+			[incoherent_output U64]
+			[array_parameters  U64]
+			[output_size_x     U32]
+			[output_size_y     U32]
+			[output_size_z     U32]
+			[cycle_t           U32]
+			[channel_t         S32]
 		}
 	}
 
-	@Shader(min_max.glsl) MinMax
 	@Shader(sum.glsl) Sum
+	{
+		@Enumeration DataKind
+		@PushConstants
+		{
+			[output_data    U64]
+			[input_data     U64]
+			[image_elements U32]
+			[scale          F32]
+		}
+	}
+
+	@Shader(min_max.glsl) MinMax
+}
+
+// NOTE: shaders which need to be baked into the beamforming pipeline
+// but should not be visible to the external interface
+@ShaderGroup ComputeHelpers
+{
+	@Shader(coherency_weighting.glsl) CoherencyWeighting
+	{
+		@Enumeration DataKind
+
+		@Bake
+		{
+			[DataKind data_kind U32]
+		}
+
+		@PushConstants
+		{
+			[left_side_buffer  U64]
+			[right_side_buffer U64]
+			[elements          U32]
+			[scale             F32]
+			[output_size_x     U32]
+			[output_size_y     U32]
+			[output_size_z     U32]
+		}
+	}
+}
+
+// NOTE: general compute shaders which do not need baking
+@ShaderGroup ComputeInternal
+{
+	@Shader(buffer_clear.glsl) BufferClear
+	{
+		@PushConstants
+		{
+			[data       U64]
+			[clear_word U32]
+			[words      U32]
+		}
+	}
 }
 
 @ShaderGroup Render
 {
-	@Shader(render_3d.frag.glsl) Render3D
+	@RenderShader RenderBeamformed
+	{
+		@Enumeration DataKind
+
+		@VertexShader(render_3d.vert.glsl)
+		@FragmentShader(render_3d.frag.glsl)
+
+		@PushConstants
+		{
+			[mvp_matrix             M4]
+			[positions             U64]
+			[normals               U64]
+
+			[bounding_box_colour    V4]
+			[bounding_box_fraction F32]
+			[db_cutoff             F32]
+			[threshold             F32]
+			[gamma                 F32]
+			[input_data            U64]
+			[input_size_x          U32]
+			[input_size_y          U32]
+			[input_size_z          U32]
+			[data_kind             U32]
+		}
+	}
 }
diff --git a/beamformer_core.c b/beamformer_core.c
@@ -11,14 +11,9 @@
  *      - the check for first pass reshaping is the last non constant check
  *        in the shader
  *      - this will also remove the need for the channel mapping in the decode shader
- * [X]: refactor: ui: reload only shader which is affected by the interaction
  * [ ]: BeamformWorkQueue -> BeamformerWorkQueue
- * [ ]: need to keep track of gpu memory in some way
- *      - want to be able to store more than 16 2D frames but limit 3D frames
- *      - maybe keep track of how much gpu memory is committed for beamformed images
- *        and use that to determine when to loop back over existing textures
- *      - to do this maybe use a circular linked list instead of a flat array
- *      - then have a way of querying how many frames are available for a specific point count
+ * [ ]: refactor: work queue needs a cleanup, we should only have a single one
+ *      - that queue isn't really considered hot so a lock is probably fine
  * [ ]: bug: reinit cuda on hot-reload
  */
 
@@ -32,33 +27,33 @@
 
 global f32 dt_for_frame;
 
-#define DECODE_FIRST_PASS_UNIFORM_LOC 1
-
-#define DAS_CYCLE_T_UNIFORM_LOC       2
-#define DAS_FAST_CHANNEL_UNIFORM_LOC  3
-
-#define MIN_MAX_MIPS_LEVEL_UNIFORM_LOC 1
-#define SUM_PRESCALE_UNIFORM_LOC       1
-
 #if !BEAMFORMER_RENDERDOC_HOOKS
 #define start_renderdoc_capture(...)
 #define end_renderdoc_capture(...)
 #define renderdoc_attached(...) (0)
 #else
-global renderdoc_start_frame_capture_fn *start_frame_capture;
-global renderdoc_end_frame_capture_fn   *end_frame_capture;
-#define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0)
-#define end_renderdoc_capture(gl)   if (end_frame_capture)   end_frame_capture(gl, 0)
+global renderdoc_start_frame_capture_fn       *start_frame_capture;
+global renderdoc_set_capture_path_template_fn *set_capture_path_template;
+global renderdoc_end_frame_capture_fn         *end_frame_capture;
+#define start_renderdoc_capture()  do { \
+	if (set_capture_path_template) set_capture_path_template("captures/ogl.rdc"); \
+	if (start_frame_capture)       start_frame_capture(vk_renderdoc_instance_handle(), 0); \
+} while(0)
+#define end_renderdoc_capture()   if (end_frame_capture)   end_frame_capture(vk_renderdoc_instance_handle(), 0)
 #define renderdoc_attached(...)   (start_frame_capture != 0)
 #endif
 
-typedef struct {
-	BeamformerFrame *frames;
-	u32 capacity;
-	u32 offset;
-	u32 cursor;
-	u32 needed_frames;
-} ComputeFrameIterator;
+read_only global u32 beamformer_compute_array_parameter_sizes[] = {
+	#define X(k, type, elements) sizeof(type) * elements,
+	BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST
+	#undef X
+};
+
+read_only global u32 beamformer_compute_array_parameter_offsets[] = {
+	#define X(k, ...) offsetof(BeamformerComputeArrayParameters, k),
+	BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST
+	#undef X
+};
 
 function void
 beamformer_compute_plan_release(BeamformerComputeContext *cc, u32 block)
@@ -66,10 +61,9 @@ beamformer_compute_plan_release(BeamformerComputeContext *cc, u32 block)
 	assert(block < countof(cc->compute_plans));
 	BeamformerComputePlan *cp = cc->compute_plans[block];
 	if (cp) {
-		glDeleteBuffers(countof(cp->ubos), cp->ubos);
-		glDeleteTextures(countof(cp->textures), cp->textures);
+		vk_buffer_release(&cp->array_parameters);
 		for (u32 i = 0; i < countof(cp->filters); i++)
-			glDeleteBuffers(1, &cp->filters[i].ssbo);
+			vk_buffer_release(&cp->filters[i].buffer);
 		cc->compute_plans[block] = 0;
 		SLLPushFreelist(cp, cc->compute_plan_freelist);
 	}
@@ -88,39 +82,19 @@ beamformer_compute_plan_for_block(BeamformerComputeContext *cc, u32 block, Arena
 
 		result->ui_voxel_transform = m4_identity();
 
-		glCreateBuffers(countof(result->ubos), result->ubos);
-
 		Stream label = arena_stream(*arena);
-		#define X(k, t, ...) \
-			glNamedBufferStorage(result->ubos[BeamformerComputeUBOKind_##k], sizeof(t), \
-			                     0, GL_DYNAMIC_STORAGE_BIT); \
-			stream_append_s8(&label, s8(#t "[")); \
-			stream_append_u64(&label, block);     \
-			stream_append_byte(&label, ']');      \
-			glObjectLabel(GL_BUFFER, result->ubos[BeamformerComputeUBOKind_##k], \
-			              label.widx, (c8 *)label.data); \
-			label.widx = 0;
-		BEAMFORMER_COMPUTE_UBO_LIST
-		#undef X
-
-		#define X(_k, t, ...) t,
-		GLenum gl_kind[] = {BEAMFORMER_COMPUTE_TEXTURE_LIST_FULL};
-		#undef X
-		read_only local_persist s8 tex_prefix[] = {
-			#define X(k, ...) s8_comp(#k "["),
-			BEAMFORMER_COMPUTE_TEXTURE_LIST_FULL
-			#undef X
+		stream_append_s8(&label, s8("ComputeParameterArray["));
+		stream_append_u64(&label, block);
+		stream_append_s8(&label, s8("]"));
+		stream_append_byte(&label, 0);
+
+		GPUBufferAllocateInfo allocate_info = {
+			.size  = sizeof(BeamformerComputeArrayParameters),
+			.flags = VulkanUsageFlag_HostReadWrite,
+			.label = stream_to_s8(&label),
 		};
-		glCreateTextures(GL_TEXTURE_1D, BeamformerComputeTextureKind_Count - 1, result->textures);
-		for (u32 i = 0; i < BeamformerComputeTextureKind_Count - 1; i++) {
-			/* TODO(rnp): this could be predicated on channel count for this compute plan */
-			glTextureStorage1D(result->textures[i], 1, gl_kind[i], BeamformerMaxChannelCount);
-			stream_append_s8(&label, tex_prefix[i]);
-			stream_append_u64(&label, block);
-			stream_append_byte(&label, ']');
-			glObjectLabel(GL_TEXTURE, result->textures[i], label.widx, (c8 *)label.data);
-			label.widx = 0;
-		}
+		vk_buffer_allocate(&result->array_parameters, &allocate_info);
+		assert((result->array_parameters.gpu_pointer & 63) == 0);
 	}
 	return result;
 }
@@ -165,42 +139,16 @@ beamformer_filter_update(BeamformerFilter *f, BeamformerFilterParameters fp, u32
 
 	f->parameters = fp;
 
-	glDeleteBuffers(1, &f->ssbo);
-	glCreateBuffers(1, &f->ssbo);
-	glNamedBufferStorage(f->ssbo, f->length * (i32)sizeof(f32) * (fp.complex? 2 : 1), filter, 0);
-	glObjectLabel(GL_BUFFER, f->ssbo, (i32)label.len, (c8 *)label.data);
-}
-
-function ComputeFrameIterator
-compute_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames)
-{
-	start_index = start_index % countof(ctx->beamform_frames);
-
-	ComputeFrameIterator result;
-	result.frames        = ctx->beamform_frames;
-	result.offset        = start_index;
-	result.capacity      = countof(ctx->beamform_frames);
-	result.cursor        = 0;
-	result.needed_frames = needed_frames;
-	return result;
-}
-
-function BeamformerFrame *
-frame_next(ComputeFrameIterator *bfi)
-{
-	BeamformerFrame *result = 0;
-	if (bfi->cursor != bfi->needed_frames) {
-		u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity;
-		result    = bfi->frames + index;
+	u32 byte_size = f->length * (i32)sizeof(f32) * (fp.complex? 2 : 1);
+	if (f->buffer.size < byte_size) {
+		GPUBufferAllocateInfo allocate_info = {
+			.size  = byte_size,
+			.flags = VulkanUsageFlag_HostReadWrite,
+			.label = label,
+		};
+		vk_buffer_allocate(&f->buffer, &allocate_info);
 	}
-	return result;
-}
-
-function b32
-beamformer_frame_compatible(BeamformerFrame *f, iv3 dim, GLenum gl_kind)
-{
-	b32 result = gl_kind == f->gl_kind && iv3_equal(dim, f->dim);
-	return result;
+	vk_buffer_range_upload(&f->buffer, filter, 0, byte_size, 0);
 }
 
 function iv3
@@ -214,83 +162,51 @@ das_valid_points(iv3 points)
 }
 
 function void
-alloc_beamform_frame(BeamformerFrame *out, iv3 out_dim, GLenum gl_kind, s8 name, Arena arena)
+update_hadamard(BeamformerComputePlan *cp, i32 order, b32 row_major, Arena arena) // NOTE: uploads an order x order Hadamard matrix into cp->array_parameters
 {
-	out->dim = das_valid_points(out_dim);
+	f16 *hadamard = make_hadamard_transpose(&arena, order, row_major); // NOTE: arena is a by-value copy; the matrix is only used until the upload below
+	if (hadamard) { // NOTE: when no matrix is returned nothing is uploaded and cp->hadamard_order keeps its old value
+		u64 offset = offsetof(BeamformerComputeArrayParameters, Hadamard); // NOTE: byte offset of the Hadamard member
+		u64 size   = sizeof(*((BeamformerComputeArrayParameters *)0)->Hadamard) * order * order; // NOTE: order * order elements of Hadamard's element type
+		vk_buffer_range_upload(&cp->array_parameters, hadamard, offset, size, 0);
+		cp->hadamard_order = order;
+	}
+}
 
-	/* NOTE: allocate storage for beamformed output data;
-	 * this is shared between compute and fragment shaders */
-	u32 max_dim = (u32)Max(out->dim.x, Max(out->dim.y, out->dim.z));
-	out->mips   = (i32)ctz_u64(round_up_power_of_two(max_dim)) + 1;
+// NOTE: bytes for one frame padded to 64; every factor is widened to u64 first so the product cannot overflow a narrower integer type
+function u64
+beamformer_frame_byte_size(iv3 points, BeamformerDataKind kind)
+{
+	u64 result = (u64)points.x * (u64)points.y * (u64)points.z * (u64)beamformer_data_kind_byte_size[kind];
+	return round_up_to(result, 64);
+}
 
-	out->gl_kind = gl_kind;
+function BeamformerFrame *
+beamformer_frame_next(BeamformerComputeContext *cc, iv3 output_points, b32 complex, u64 reserved_size) // NOTE: hands out the next frame slot and assigns it frame_size bytes of the backlog buffer
+{
+	BeamformerFrameBacklog *bl = &cc->backlog;
 
-	Stream label = arena_stream(arena);
-	stream_append_s8(&label, name);
-	stream_append_byte(&label, '[');
-	stream_append_hex_u64(&label, out->id);
-	stream_append_byte(&label, ']');
+	BeamformerDataKind kind = complex ? BeamformerDataKind_Float32Complex : BeamformerDataKind_Float32;
+	u64 frame_size = beamformer_frame_byte_size(output_points, kind);
 
-	glDeleteTextures(1, &out->texture);
-	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
-	glTextureStorage3D(out->texture, out->mips, gl_kind, out->dim.x, out->dim.y, out->dim.z);
+	// TODO(rnp): handle this somewhat gracefully (even if it produces garbled output)
+	assert(frame_size + reserved_size <= (u64)bl->buffer->size);
 
-	glTextureParameteri(out->texture, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-	glTextureParameteri(out->texture, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+	if (bl->next_offset > (u64)bl->buffer->size - frame_size - reserved_size) // NOTE: wrap to the start when frame_size + reserved_size no longer fits after next_offset
+		bl->next_offset = 0;
 
-	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));
-}
+	u64 id = bl->counter++;
 
-function void
-update_hadamard_texture(BeamformerComputePlan *cp, i32 order, b32 row_major, Arena arena)
-{
-	f16 *hadamard = make_hadamard_transpose(&arena, order, row_major);
-	if (hadamard) {
-		cp->hadamard_order = order;
-		u32 *texture = cp->textures + BeamformerComputeTextureKind_Hadamard;
-		glDeleteTextures(1, texture);
-		glCreateTextures(GL_TEXTURE_2D, 1, texture);
-		glTextureStorage2D(*texture, 1, GL_R16F, order, order);
-		glTextureSubImage2D(*texture, 0, 0, 0, order, order, GL_RED, GL_SHORT, hadamard);
-
-		Stream label = arena_stream(arena);
-		stream_append_s8(&label, s8("Hadamard"));
-		stream_append_i64(&label, order);
-		LABEL_GL_OBJECT(GL_TEXTURE, *texture, stream_to_s8(&label));
-	}
-}
+	BeamformerFrame *result = bl->frames + (id % countof(bl->frames)); // NOTE: frame slots are reused round-robin by id
+	atomic_store_u64(&result->timeline_valid_value, -1ULL); // NOTE(review): presumably marks the frame as not yet valid - confirm against the timeline users
+	result->id            = id & U32_MAX; // NOTE(review): presumably keeps only the low 32 bits of the counter - confirm U32_MAX
+	result->buffer_offset = bl->next_offset;
+	result->points        = output_points;
+	result->data_kind     = kind;
 
-function void
-alloc_shader_storage(BeamformerCtx *ctx, u32 decoded_data_size, Arena arena)
-{
-	BeamformerComputeContext *cc = &ctx->compute_context;
-	glDeleteBuffers(countof(cc->ping_pong_ssbos), cc->ping_pong_ssbos);
-	glCreateBuffers(countof(cc->ping_pong_ssbos), cc->ping_pong_ssbos);
-
-	cc->ping_pong_ssbo_size = decoded_data_size;
-
-	Stream label = arena_stream(arena);
-	stream_append_s8(&label, s8("PingPongSSBO["));
-	i32 s_widx = label.widx;
-	for (i32 i = 0; i < countof(cc->ping_pong_ssbos); i++) {
-		glNamedBufferStorage(cc->ping_pong_ssbos[i], (iz)decoded_data_size, 0, 0);
-		stream_append_i64(&label, i);
-		stream_append_byte(&label, ']');
-		LABEL_GL_OBJECT(GL_BUFFER, cc->ping_pong_ssbos[i], stream_to_s8(&label));
-		stream_reset(&label, s_widx);
-	}
+	bl->next_offset += frame_size; // NOTE: the next frame starts directly after this one
 
-	/* TODO(rnp): (25.08.04) cuda lib is heavily broken atm. First there are multiple RF
-	 * buffers and cuda decode shouldn't assume that the data is coming from the rf_buffer
-	 * ssbo. Second each parameter block may need a different hadamard matrix so ideally
-	 * decode should just take the texture as a parameter. Third, none of these dimensions
-	 * need to be pre-known by the library unless its allocating GPU memory which it shouldn't
-	 * need to do. For now grab out of parameter block 0 but it is not correct */
-	BeamformerParameterBlock *pb = beamformer_parameter_block(ctx->shared_memory, 0);
-	/* NOTE(rnp): these are stubs when CUDA isn't supported */
-	cuda_register_buffers(cc->ping_pong_ssbos, countof(cc->ping_pong_ssbos), cc->rf_buffer.ssbo);
-	u32 decoded_data_dimension[3] = {pb->parameters.sample_count, pb->parameters.channel_count, pb->parameters.acquisition_count};
-	cuda_init(pb->parameters.raw_data_dimensions.E, decoded_data_dimension);
+	return result;
 }
 
 function void
@@ -306,35 +222,69 @@ fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, BeamformerViewPl
 {
 	b32 result = work != 0;
 	if (result) {
-		u32 frame_id    = atomic_add_u32(&ctx->next_render_frame_index, 1);
-		u32 frame_index = frame_id % countof(ctx->beamform_frames);
-		work->kind      = indirect? BeamformerWorkKind_ComputeIndirect : BeamformerWorkKind_Compute;
-		work->lock      = BeamformerSharedMemoryLockKind_DispatchCompute;
+		work->kind = indirect? BeamformerWorkKind_ComputeIndirect : BeamformerWorkKind_Compute;
+		work->lock = BeamformerSharedMemoryLockKind_DispatchCompute;
 		work->compute_context.parameter_block = parameter_block;
-		work->compute_context.frame = ctx->beamform_frames + frame_index;
-		work->compute_context.frame->ready_to_present = 0;
-		work->compute_context.frame->view_plane_tag   = plane;
-		work->compute_context.frame->id               = frame_id;
 	}
 	return result;
 }
 
-function void
-do_sum_shader(BeamformerComputeContext *cc, u32 *in_textures, u32 in_texture_count,
-              u32 out_texture, iv3 out_data_dim)
+function uv3
+layout_for_output(iv3 points) // NOTE: picks the workgroup size (emitted as local_size_x/y/z) for an output grid with the given point counts
 {
-	/* NOTE: zero output before summing */
-	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);
-	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
-
-	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
-	for (u32 i = 0; i < in_texture_count; i++) {
-		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
-		glDispatchCompute(ORONE((u32)out_data_dim.x / 32u),
-		                  ORONE((u32)out_data_dim.y),
-		                  ORONE((u32)out_data_dim.z / 32u));
-		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
+	uv3 result = {{1, 1, 1}}; // NOTE: axes that are not assigned below keep a size of 1
+
+	b32 has_x = points.x > 1;
+	b32 has_y = points.y > 1;
+	b32 has_z = points.z > 1;
+
+	u32 subgroup_size  = vk_gpu_info()->subgroup_size;
+	u32 grid_3d_z_size = Max(1, subgroup_size / (4 * 4)); // NOTE: z extent of the 4 x 4 x z layout, never below 1
+	u32 grid_2d_y_size = Max(1, subgroup_size / 8); // NOTE: second extent of the 8 x n layout, never below 1
+
+	switch (iv3_dimension(points)) { // NOTE(review): presumably the number of axes with more than one point - confirm in iv3_dimension
+	case 1:{
+		if (has_x) result.x = subgroup_size;
+		if (has_y) result.y = subgroup_size;
+		if (has_z) result.z = subgroup_size;
+	}break;
+
+	case 2:{
+		if (has_x && has_y) {result.x = 8; result.y = grid_2d_y_size;}
+		if (has_x && has_z) {result.x = 8; result.z = grid_2d_y_size;}
+		if (has_y && has_z) {result.y = 8; result.z = grid_2d_y_size;}
+	}break;
+
+	case 3:{result = (uv3){{4, 4, grid_3d_z_size}};}break;
+
+	InvalidDefaultCase; // NOTE(review): an output with no axis > 1 presumably ends up here - confirm
 	}
+
+	return result;
+}
+
+function uv3
+dispatch_for_output(uv3 layout, iv3 points)
+{
+	/* NOTE: workgroups per axis = points / workgroup size, rounded up */
+	u32 groups_x = (u32)ceil_f32((f32)points.x / (f32)layout.x);
+	u32 groups_y = (u32)ceil_f32((f32)points.y / (f32)layout.y);
+	u32 groups_z = (u32)ceil_f32((f32)points.z / (f32)layout.z);
+	return (uv3){{groups_x, groups_y, groups_z}};
+}
+
+function b32
+compute_plan_push_shader(BeamformerComputePlan *p, BeamformerShaderKind shader, BeamformerShaderParameters *sp)
+{
+	/* NOTE: appends shader with a zeroed descriptor; returns 0 and leaves the plan untouched when it is full */
+	b32 result = p->pipeline.shader_count < countof(p->pipeline.shaders);
+	if (result) {
+		u32 slot = p->pipeline.shader_count++;
+		p->pipeline.shaders[slot]    = shader;
+		p->pipeline.parameters[slot] = *sp;
+		zero_struct(p->shader_descriptors + slot);
+	}
+	return result;
 }
 
 function void
@@ -374,355 +324,372 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 
 	f32 time_offset = pb->parameters.time_offset;
 
-	// TODO(rnp): subgroup size
-	u32 subgroup_size = vk_gpu_info()->vendor == GPUVendor_NVIDIA ? 32 : 64;
+	u32 subgroup_size = vk_gpu_info()->subgroup_size;
 
 	cp->pipeline.shader_count = 0;
 	for (u32 i = 0; i < pb->pipeline.shader_count; i++) {
 		BeamformerShaderParameters *sp = pb->pipeline.parameters + i;
 		u32 slot   = cp->pipeline.shader_count;
 		u32 shader = pb->pipeline.shaders[i];
-		b32 commit = 0;
 
 		BeamformerShaderDescriptor *ld = cp->shader_descriptors + slot - 1;
 		BeamformerShaderDescriptor *sd = cp->shader_descriptors + slot;
-		zero_struct(sd);
 
 		switch (shader) {
-		case BeamformerShaderKind_CudaHilbert:{ commit = run_cuda_hilbert; }break;
+
+		case BeamformerShaderKind_CudaHilbert:{
+			if (run_cuda_hilbert)
+				compute_plan_push_shader(cp, shader, sp);
+		}break;
+
 		case BeamformerShaderKind_Decode:{
 			/* TODO(rnp): rework decode first and demodulate after */
 			b32 first = slot == 0;
 
-			BeamformerDecodeBakeParameters *db = &sd->bake.Decode;
-			db->data_kind = data_kind;
-			if (!first) {
-				if (data_kind == BeamformerDataKind_Int16) {
-					db->data_kind = BeamformerDataKind_Int16Complex;
-				} else {
-					db->data_kind = BeamformerDataKind_Float32Complex;
-				}
-			}
-
 			BeamformerShaderKind *last_shader = cp->pipeline.shaders + slot - 1;
 			assert(first || ((*last_shader == BeamformerShaderKind_Demodulate ||
 			                  *last_shader == BeamformerShaderKind_Filter)));
 
-			db->decode_mode    = pb->parameters.decode_mode;
-			db->transmit_count = pb->parameters.acquisition_count;
+			if ((first || pb->parameters.decode_mode != BeamformerDecodeMode_None) &&
+			    compute_plan_push_shader(cp, shader, sp))
+			{
+				BeamformerDecodeBakeParameters *db = &sd->bake.Decode;
 
-			u32 channel_stride         = pb->parameters.acquisition_count * pb->parameters.sample_count;
-			db->input_sample_stride    = first? 1                           : ld->bake.Filter.output_sample_stride;
-			db->input_channel_stride   = first? channel_stride              : ld->bake.Filter.output_channel_stride;
-			db->input_transmit_stride  = first? pb->parameters.sample_count : 1;
+				db->data_kind = data_kind;
+				if (!first) {
+					if (data_kind == BeamformerDataKind_Int16) {
+						db->data_kind = BeamformerDataKind_Int16Complex;
+					} else {
+						db->data_kind = BeamformerDataKind_Float32Complex;
+					}
+				}
 
-			db->output_sample_stride   = das_sample_stride;
-			db->output_channel_stride  = das_channel_stride;
-			db->output_transmit_stride = das_transmit_stride;
-			if (first) {
-				db->output_channel_stride  *= decimation_rate;
-				db->output_transmit_stride *= decimation_rate;
-			}
+				db->decode_mode    = pb->parameters.decode_mode;
+				db->transmit_count = pb->parameters.acquisition_count;
 
-			db->dilate_output = run_cuda_hilbert;
+				u32 channel_stride         = pb->parameters.acquisition_count * pb->parameters.sample_count;
+				db->input_sample_stride    = first? 1                           : ld->bake.Filter.output_sample_stride;
+				db->input_channel_stride   = first? channel_stride              : ld->bake.Filter.output_channel_stride;
+				db->input_transmit_stride  = first? pb->parameters.sample_count : 1;
 
-			if (db->decode_mode == BeamformerDecodeMode_None) {
-				sd->layout = (uv3){{subgroup_size, 1, 1}};
+				db->output_sample_stride   = das_sample_stride;
+				db->output_channel_stride  = das_channel_stride;
+				db->output_transmit_stride = das_transmit_stride;
+				if (first) {
+					db->output_channel_stride  *= decimation_rate;
+					db->output_transmit_stride *= decimation_rate;
+				}
 
-				sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
-				sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count     / (f32)sd->layout.y);
-				sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
-			} else if (db->transmit_count > 40) {
-				db->use_shared_memory = 1;
-				db->to_process        = 2;
+				db->dilate_output = run_cuda_hilbert;
 
-				if (db->transmit_count == 48)
-					db->to_process = db->transmit_count / 16;
+				if (db->decode_mode == BeamformerDecodeMode_None) {
+					sd->layout = (uv3){{subgroup_size, 1, 1}};
 
-				b32 use_16z  = db->transmit_count == 48 || db->transmit_count == 80 ||
-				               db->transmit_count == 96 || db->transmit_count == 160;
-				sd->layout = (uv3){{4, 1, use_16z? 16 : 32}};
+					sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
+					sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count     / (f32)sd->layout.y);
+					sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
+				} else if (db->transmit_count > 40) {
+					db->use_shared_memory = 1;
+					db->to_process        = 2;
 
-				sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
-				sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count     / (f32)sd->layout.y);
-				sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->to_process);
-			} else {
-				db->to_process = 1;
+					if (db->transmit_count == 48)
+						db->to_process = db->transmit_count / 16;
 
-				/* NOTE(rnp): register caching. using more threads will cause the compiler to do
-				 * contortions to avoid spilling registers. using less gives higher performance */
-				sd->layout = (uv3){{subgroup_size / 2, 1, 1}};
+					b32 use_16z  = db->transmit_count == 48 || db->transmit_count == 80 ||
+					               db->transmit_count == 96 || db->transmit_count == 160;
+					sd->layout = (uv3){{4, 1, use_16z? 16 : 32}};
 
-				sd->dispatch.x = (u32)ceil_f32((f32)sample_count                 / (f32)sd->layout.x);
-				sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
-				sd->dispatch.z = 1;
-			}
+					sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
+					sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count     / (f32)sd->layout.y);
+					sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->to_process);
+				} else {
+					db->to_process = 1;
+
+					/* NOTE(rnp): register caching. using more threads will cause the compiler to do
+					 * contortions to avoid spilling registers. using less gives higher performance */
+					/* TODO(rnp): may need to be adjusted to 16 on NVIDIA */
+					sd->layout = (uv3){{subgroup_size / 2, 1, 1}};
 
-			if (first) sd->dispatch.x *= decimation_rate;
+					sd->dispatch.x = (u32)ceil_f32((f32)sample_count                 / (f32)sd->layout.x);
+					sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
+					sd->dispatch.z = 1;
+				}
 
-			/* NOTE(rnp): decode 2 samples per dispatch when data is i16 */
-			if (first && data_kind == BeamformerDataKind_Int16)
-				sd->dispatch.x = (u32)ceil_f32((f32)sd->dispatch.x / 2);
+				if (first) sd->dispatch.x *= decimation_rate;
 
-			commit = first || db->decode_mode != BeamformerDecodeMode_None;
+				/* NOTE(rnp): decode 2 samples per dispatch when data is i16 */
+				if (first && data_kind == BeamformerDataKind_Int16)
+					sd->dispatch.x = (u32)ceil_f32((f32)sd->dispatch.x / 2);
+			}
 		}break;
+
 		case BeamformerShaderKind_Demodulate:
 		case BeamformerShaderKind_Filter:
 		{
-			b32 first = slot == 0;
-			b32 demod = shader == BeamformerShaderKind_Demodulate;
-			BeamformerFilter *f = cp->filters + sp->filter_slot;
-
-			time_offset += f->time_delay;
-
-			BeamformerFilterBakeParameters *fb = &sd->bake.Filter;
-			fb->filter_length  = (u32)f->length;
-			fb->demodulate     = demod;
-			fb->complex_filter = f->parameters.complex;
-
-			fb->data_kind = data_kind;
-			if (!first) fb->data_kind = BeamformerDataKind_Float32;
-
-			/* NOTE(rnp): when we are demodulating we pretend that the sampler was alternating
-			 * between sampling the I portion and the Q portion of an IQ signal. Therefore there
-			 * is an implicit decimation factor of 2 which must always be included. All code here
-			 * assumes that the signal was sampled in such a way that supports this operation.
-			 * To recover IQ[n] from the sampled data (RF[n]) we do the following:
-			 *   I[n]  = RF[n]
-			 *   Q[n]  = RF[n + 1]
-			 *   IQ[n] = I[n] - j*Q[n]
-			 */
-			if (demod) {
-				fb->demodulation_frequency = pb->parameters.demodulation_frequency;
-				fb->sampling_frequency     = pb->parameters.sampling_frequency / 2;
-				fb->decimation_rate        = decimation_rate;
-				fb->sample_count           = pb->parameters.sample_count;
-
-				fb->output_channel_stride  = das_channel_stride;
-				fb->output_sample_stride   = das_sample_stride;
-				fb->output_transmit_stride = das_transmit_stride;
-
-				if (first) {
-					fb->input_channel_stride  = pb->parameters.sample_count * pb->parameters.acquisition_count / 2;
-					fb->input_sample_stride   = 1;
-					fb->input_transmit_stride = pb->parameters.sample_count / 2;
-
-					if (pb->parameters.decode_mode == BeamformerDecodeMode_None) {
-						fb->output_floats = 1;
+			if (compute_plan_push_shader(cp, shader, sp)) {
+				b32 first = slot == 0;
+				b32 demod = shader == BeamformerShaderKind_Demodulate;
+				BeamformerFilter *f = cp->filters + sp->filter_slot;
+
+				time_offset += f->time_delay;
+
+				BeamformerFilterBakeParameters *fb = &sd->bake.Filter;
+				fb->filter_length  = (u32)f->length;
+				fb->demodulate     = demod;
+				fb->complex_filter = f->parameters.complex;
+
+				fb->data_kind = data_kind;
+				if (!first) fb->data_kind = BeamformerDataKind_Float32;
+
+				/* NOTE(rnp): when we are demodulating we pretend that the sampler was alternating
+				 * between sampling the I portion and the Q portion of an IQ signal. Therefore there
+				 * is an implicit decimation factor of 2 which must always be included. All code here
+				 * assumes that the signal was sampled in such a way that supports this operation.
+				 * To recover IQ[n] from the sampled data (RF[n]) we do the following:
+				 *   I[n]  = RF[n]
+				 *   Q[n]  = RF[n + 1]
+				 *   IQ[n] = I[n] - j*Q[n]
+				 */
+				if (demod) {
+					fb->demodulation_frequency = pb->parameters.demodulation_frequency;
+					fb->sampling_frequency     = pb->parameters.sampling_frequency / 2;
+					fb->decimation_rate        = decimation_rate;
+					fb->sample_count           = pb->parameters.sample_count;
+
+					fb->output_channel_stride  = das_channel_stride;
+					fb->output_sample_stride   = das_sample_stride;
+					fb->output_transmit_stride = das_transmit_stride;
+
+					if (first) {
+						fb->input_channel_stride  = pb->parameters.sample_count * pb->parameters.acquisition_count / 2;
+						fb->input_sample_stride   = 1;
+						fb->input_transmit_stride = pb->parameters.sample_count / 2;
+
+						if (pb->parameters.decode_mode == BeamformerDecodeMode_None) {
+							fb->output_floats = 1;
+						} else {
+							/* NOTE(rnp): output optimized layout for decoding */
+							fb->output_channel_stride  = das_channel_stride;
+							fb->output_sample_stride   = pb->parameters.acquisition_count;
+							fb->output_transmit_stride = 1;
+						}
 					} else {
-						/* NOTE(rnp): output optimized layout for decoding */
-						fb->output_channel_stride  = das_channel_stride;
-						fb->output_sample_stride   = pb->parameters.acquisition_count;
-						fb->output_transmit_stride = 1;
+						assert(cp->pipeline.shaders[slot - 1] == BeamformerShaderKind_Decode);
+						fb->input_channel_stride  = ld->bake.Decode.output_channel_stride;
+						fb->input_sample_stride   = ld->bake.Decode.output_sample_stride;
+						fb->input_transmit_stride = ld->bake.Decode.output_transmit_stride;
 					}
 				} else {
-					assert(cp->pipeline.shaders[slot - 1] == BeamformerShaderKind_Decode);
-					fb->input_channel_stride  = ld->bake.Decode.output_channel_stride;
-					fb->input_sample_stride   = ld->bake.Decode.output_sample_stride;
-					fb->input_transmit_stride = ld->bake.Decode.output_transmit_stride;
+					fb->decimation_rate        = 1;
+					fb->output_channel_stride  = sample_count * pb->parameters.acquisition_count;
+					fb->output_sample_stride   = 1;
+					fb->output_transmit_stride = sample_count;
+					fb->input_channel_stride   = sample_count * pb->parameters.acquisition_count;
+					fb->input_sample_stride    = 1;
+					fb->input_transmit_stride  = sample_count;
+					fb->sample_count           = sample_count;
 				}
-			} else {
-				fb->decimation_rate        = 1;
-				fb->output_channel_stride  = sample_count * pb->parameters.acquisition_count;
-				fb->output_sample_stride   = 1;
-				fb->output_transmit_stride = sample_count;
-				fb->input_channel_stride   = sample_count * pb->parameters.acquisition_count;
-				fb->input_sample_stride    = 1;
-				fb->input_transmit_stride  = sample_count;
-				fb->sample_count           = sample_count;
-			}
-
-			/* TODO(rnp): filter may need a different dispatch layout */
-			sd->layout     = (uv3){{128, 1, 1}};
-			sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
-			sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count     / (f32)sd->layout.y);
-			sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
 
-			commit = 1;
+				/* TODO(rnp): filter may need a different dispatch layout */
+				sd->layout     = (uv3){{128, 1, 1}};
+				sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
+				sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count     / (f32)sd->layout.y);
+				sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
+			}
 		}break;
-		case BeamformerShaderKind_DAS:{
-			BeamformerDASBakeParameters *db = &sd->bake.DAS;
-
-			db->data_kind = BeamformerDataKind_Float32;
-			if (cp->iq_pipeline) db->data_kind = BeamformerDataKind_Float32Complex;
-
-			BeamformerDASPushConstants  *du = &cp->das_ubo_data;
-			du->xdc_element_pitch      = pb->parameters.xdc_element_pitch;
-			db->sampling_frequency     = sampling_frequency;
-			db->demodulation_frequency = pb->parameters.demodulation_frequency;
-			db->speed_of_sound         = pb->parameters.speed_of_sound;
-			db->time_offset            = time_offset;
-			db->f_number               = pb->parameters.f_number;
-			db->acquisition_kind       = pb->parameters.acquisition_kind;
-			db->sample_count           = sample_count;
-			db->channel_count          = pb->parameters.channel_count;
-			db->acquisition_count      = pb->parameters.acquisition_count;
-			db->interpolation_mode     = pb->parameters.interpolation_mode;
-			db->transmit_angle         = pb->parameters.focal_vector.E[0];
-			db->focus_depth            = pb->parameters.focal_vector.E[1];
-			db->transmit_receive_orientation = pb->parameters.transmit_receive_orientation;
-
-			// NOTE(rnp): old gcc will miscompile an assignment
-			mem_copy(du->voxel_transform.E, pb->parameters.das_voxel_transform.E, sizeof(du->voxel_transform));
-			mem_copy(du->xdc_transform.E,   pb->parameters.xdc_transform.E,       sizeof(du->xdc_transform));
-
-			du->voxel_transform = m4_mul(cp->ui_voxel_transform, du->voxel_transform);
-
-			u32 id = pb->parameters.acquisition_kind;
-
-			if (id == BeamformerAcquisitionKind_UFORCES || id == BeamformerAcquisitionKind_FORCES)
-				du->voxel_transform = m4_mul(du->xdc_transform, du->voxel_transform);
-
-			db->sparse = id == BeamformerAcquisitionKind_UFORCES ||
-			             id == BeamformerAcquisitionKind_UHERCULES;
-
-			db->single_focus        = pb->parameters.single_focus;
-			db->single_orientation  = pb->parameters.single_orientation;
-			db->coherency_weighting = pb->parameters.coherency_weighting;
-			db->fast                = !pb->parameters.coherency_weighting;
-
-			sd->layout = (uv3){{1, 1, 1}};
-
-			b32 has_x = cp->output_points.x > 1;
-			b32 has_y = cp->output_points.y > 1;
-			b32 has_z = cp->output_points.z > 1;
-
-			u32 grid_3d_z_size = Max(1, subgroup_size / (4 * 4));
-			u32 grid_2d_y_size = Max(1, subgroup_size / 8);
-
-			switch (iv3_dimension(cp->output_points)) {
-
-			case 1:{
-				if (has_x) sd->layout.x = subgroup_size;
-				if (has_y) sd->layout.y = subgroup_size;
-				if (has_z) sd->layout.z = subgroup_size;
-			}break;
-
-			case 2:{
-				if (has_x && has_y) {sd->layout.x = 8; sd->layout.y = grid_2d_y_size;}
-				if (has_x && has_z) {sd->layout.x = 8; sd->layout.z = grid_2d_y_size;}
-				if (has_y && has_z) {sd->layout.y = 8; sd->layout.z = grid_2d_y_size;}
-			}break;
 
-			case 3:{sd->layout = (uv3){{4, 4, grid_3d_z_size}};}break;
-
-			InvalidDefaultCase;
+		case BeamformerShaderKind_DAS:{
+			if (compute_plan_push_shader(cp, shader, sp)) {
+				BeamformerDASBakeParameters *db = &sd->bake.DAS;
+				db->data_kind = BeamformerDataKind_Float32;
+				if (cp->iq_pipeline) db->data_kind = BeamformerDataKind_Float32Complex;
+
+				cp->voxel_transform        = m4_mul(cp->ui_voxel_transform, pb->parameters.das_voxel_transform);
+				cp->xdc_element_pitch      = pb->parameters.xdc_element_pitch;
+
+				db->sampling_frequency     = sampling_frequency;
+				db->demodulation_frequency = pb->parameters.demodulation_frequency;
+				db->speed_of_sound         = pb->parameters.speed_of_sound;
+				db->time_offset            = time_offset;
+				db->f_number               = pb->parameters.f_number;
+				db->acquisition_kind       = pb->parameters.acquisition_kind;
+				db->sample_count           = sample_count;
+				db->channel_count          = pb->parameters.channel_count;
+				db->acquisition_count      = pb->parameters.acquisition_count;
+				db->interpolation_mode     = pb->parameters.interpolation_mode;
+				db->transmit_angle         = pb->parameters.focal_vector.E[0];
+				db->focus_depth            = pb->parameters.focal_vector.E[1];
+				db->transmit_receive_orientation = pb->parameters.transmit_receive_orientation;
+
+				// NOTE(rnp): old gcc will miscompile an assignment
+				mem_copy(cp->xdc_transform.E, pb->parameters.xdc_transform.E, sizeof(cp->xdc_transform));
+
+				u32 id = pb->parameters.acquisition_kind;
+				if (id == BeamformerAcquisitionKind_UFORCES || id == BeamformerAcquisitionKind_FORCES)
+					cp->voxel_transform = m4_mul(cp->xdc_transform, cp->voxel_transform);
+
+				db->sparse = id == BeamformerAcquisitionKind_UFORCES || id == BeamformerAcquisitionKind_UHERCULES;
+				db->single_focus        = pb->parameters.single_focus;
+				db->single_orientation  = pb->parameters.single_orientation;
+				db->coherency_weighting = pb->parameters.coherency_weighting;
+
+				sd->layout   = layout_for_output(cp->output_points);
+				sd->dispatch = dispatch_for_output(sd->layout, cp->output_points);
+
+				if (pb->parameters.coherency_weighting &&
+				    compute_plan_push_shader(cp, BeamformerShaderKind_CoherencyWeighting, sp))
+				{
+					BeamformerShaderDescriptor *shader_descriptor = cp->shader_descriptors + cp->pipeline.shader_count - 1;
+					shader_descriptor->layout   = sd->layout;
+					shader_descriptor->dispatch = sd->dispatch;
+					shader_descriptor->bake.CoherencyWeighting.data_kind = db->data_kind;
+				}
 			}
+		}break;
 
-			sd->dispatch.x = (u32)ceil_f32((f32)cp->output_points.x / sd->layout.x);
-			sd->dispatch.y = (u32)ceil_f32((f32)cp->output_points.y / sd->layout.y);
-			sd->dispatch.z = (u32)ceil_f32((f32)cp->output_points.z / sd->layout.z);
+		#if 0
+		case BeamformerShaderKind_Sum:{
+			sd->bake.data_kind = BeamformerDataKind_Float32;
+			if (cp->iq_pipeline)
+				sd->bake.data_kind = BeamformerDataKind_Float32Complex;
+
+			sd->layout   = layout_for_output(cp->output_points);
+			sd->dispatch = dispatch_for_output(sd->layout, cp->output_points);
 
 			commit = 1;
 		}break;
-		default:{ commit = 1; }break;
-		}
+		#endif
 
-		if (commit) {
-			u32 index = cp->pipeline.shader_count++;
-			cp->pipeline.shaders[index]    = shader;
-			cp->pipeline.parameters[index] = *sp;
+		default:{}break;
 		}
 	}
 	cp->pipeline.data_kind = data_kind;
 }
 
 function void
-stream_push_shader_header(Stream *s, BeamformerShaderKind shader_kind, s8 header)
+stream_append_shader_header(Stream *s, i32 reloadable_index, BeamformerShaderDescriptor *sd, uv3 layout) // NOTE: appends the generated GLSL preamble (version, extensions, shared headers, layout, bake defines) to s
 {
-	stream_append_s8s(s, s8("#version 460 core\n\n"), header);
-
-	switch (shader_kind) {
-	case BeamformerShaderKind_DAS:{
-		stream_append_s8(s, s8(""
-		"layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC)      ") uniform uint  u_cycle_t;\n"
-		"layout(location = " str(DAS_FAST_CHANNEL_UNIFORM_LOC) ") uniform int   u_channel;\n\n"
-		));
-	}break;
-	case BeamformerShaderKind_Decode:{
-		stream_append_s8s(s, s8(""
-		"layout(location = " str(DECODE_FIRST_PASS_UNIFORM_LOC) ") uniform bool u_first_pass;\n\n"
-		));
-	}break;
-	case BeamformerShaderKind_MinMax:{
-		stream_append_s8(s, s8("layout(location = " str(MIN_MAX_MIPS_LEVEL_UNIFORM_LOC)
-		                       ") uniform int u_mip_map;\n\n"));
-	}break;
-	case BeamformerShaderKind_Sum:{
-		stream_append_s8(s, s8("layout(location = " str(SUM_PRESCALE_UNIFORM_LOC)
-		                       ") uniform float u_sum_prescale = 1.0;\n\n"));
-	}break;
-	default:{}break;
+	stream_append_s8s(s, s8("#version 460 core\n\n"
+	"#extension GL_EXT_buffer_reference : require\n"
+	"#extension GL_EXT_shader_16bit_storage : require\n"
+	"#extension GL_EXT_shader_explicit_arithmetic_types : require\n\n"));
+
+	i32  header_vector_length = beamformer_shader_header_vector_lengths[reloadable_index];
+	i32 *header_vector        = beamformer_shader_header_vectors[reloadable_index];
+	for (i32 index = 0; index < header_vector_length; index++) // NOTE: appends each global header string listed for this reloadable shader
+		stream_append_s8(s, beamformer_shader_global_header_strings[header_vector[index]]);
+
+	if (layout.x != 0) { // NOTE: a layout with x == 0 emits no local_size declaration
+		stream_append_s8(s,  s8("layout(local_size_x = "));
+		stream_append_u64(s, layout.x);
+		stream_append_s8(s,  s8(", local_size_y = "));
+		stream_append_u64(s, layout.y);
+		stream_append_s8(s,  s8(", local_size_z = "));
+		stream_append_u64(s, layout.z);
+		stream_append_s8(s,  s8(") in;\n\n"));
 	}
-}
-
-function void
-load_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 shader_slot, Arena arena)
-{
-	BeamformerShaderKind shader = cp->pipeline.shaders[shader_slot];
-
-	u32 program          = 0;
-	i32 reloadable_index = beamformer_shader_reloadable_index_by_shader[shader];
-	if (reloadable_index != -1) {
-		BeamformerShaderKind base_shader = beamformer_reloadable_shader_kinds[reloadable_index];
-		s8 path;
-		if (!BakeShaders)
-			path = push_s8_from_parts(&arena, os_path_separator(), s8("shaders"),
-		                            beamformer_reloadable_shader_files[reloadable_index]);
-
-		Stream shader_stream = arena_stream(arena);
-		stream_push_shader_header(&shader_stream, base_shader, s8(""));
-
-		i32  header_vector_length = beamformer_shader_header_vector_lengths[reloadable_index];
-		i32 *header_vector        = beamformer_shader_header_vectors[reloadable_index];
-		for (i32 index = 0; index < header_vector_length; index++)
-			stream_append_s8(&shader_stream, beamformer_shader_global_header_strings[header_vector[index]]);
-
-		BeamformerShaderDescriptor *sd = cp->shader_descriptors + shader_slot;
-
-		if (sd->layout.x != 0) {
-			stream_append_s8(&shader_stream,  s8("layout(local_size_x = "));
-			stream_append_u64(&shader_stream, sd->layout.x);
-			stream_append_s8(&shader_stream,  s8(", local_size_y = "));
-			stream_append_u64(&shader_stream, sd->layout.y);
-			stream_append_s8(&shader_stream,  s8(", local_size_z = "));
-			stream_append_u64(&shader_stream, sd->layout.z);
-			stream_append_s8(&shader_stream,  s8(") in;\n\n"));
-		}
 
+	if (sd) { // NOTE: bake parameter defines are only emitted when a descriptor is supplied
 		u32 *parameters = (u32 *)&sd->bake;
 		s8  *names      = beamformer_shader_bake_parameter_names[reloadable_index];
 		u32  float_bits = beamformer_shader_bake_parameter_float_bits[reloadable_index];
 		i32  count      = beamformer_shader_bake_parameter_counts[reloadable_index];
 
 		for (i32 index = 0; index < count; index++) {
-			stream_append_s8s(&shader_stream, s8("#define "), names[index],
+			stream_append_s8s(s, s8("#define "), names[index], // NOTE: emits "#define NAME (0xHEX)" or "#define NAME uintBitsToFloat(0xHEX)" when the parameter's float bit is set
 			                  (float_bits & (1 << index))? s8(" uintBitsToFloat") : s8(" "), s8("(0x"));
-			stream_append_hex_u64(&shader_stream, parameters[index]);
-			stream_append_s8(&shader_stream, s8(")\n"));
+			stream_append_hex_u64(s, parameters[index]);
+			stream_append_s8(s, s8(")\n"));
 		}
+	}
 
-		if (!renderdoc_attached())
-			stream_append_s8(&shader_stream, s8("\n#line 1\n"));
+	if (!renderdoc_attached()) // NOTE: the #line reset is left out when renderdoc is attached
+		stream_append_s8(s, s8("\n\n#line 1\n"));
+}
+
+function void
+beamformer_reload_pipeline(VulkanHandle *pipeline, BeamformerShaderReloadInfo *sris, u32 count, Arena arena)
+{
+	assume(count <= 2);
+	s8 paths[2];
+	VulkanPipelineCreateInfo infos[2];
+
+	if (!BakeShaders) {
+		for (u32 i = 0; i < count; i++)
+			paths[i] = push_s8_from_parts(&arena, os_path_separator(), s8("shaders"), sris[i].filename_or_data);
+	}
+
+	u32 push_constants_size = 0;
+	for (u32 i = 0; i < count; i++) {
+		Stream shader_stream = arena_stream(arena);
+		i32 reloadable_index = beamformer_shader_reloadable_index_by_shader[sris[i].shader];
+		if (i == 0) push_constants_size = beamformer_shader_push_constant_sizes[reloadable_index];
+		else        assert(push_constants_size == beamformer_shader_push_constant_sizes[reloadable_index]);
+
+		stream_append_shader_header(&shader_stream, reloadable_index, sris[i].shader_descriptor, sris[i].layout);
 
-		s8 shader_text;
 		if (BakeShaders) {
-			stream_append_s8(&shader_stream, beamformer_shader_data[reloadable_index]);
-			shader_text = arena_stream_commit(&arena, &shader_stream);
+			stream_append_s8(&shader_stream, sris[i].filename_or_data);
 		} else {
-			shader_text = arena_stream_commit(&arena, &shader_stream);
-			i64 length = os_read_entire_file((c8 *)path.data, arena.beg, arena_capacity(&arena, u8));
-			shader_text.len += length;
-			arena_commit(&arena, length);
+			shader_stream.widx += os_read_entire_file((c8 *)paths[i].data,
+			                                          shader_stream.data + shader_stream.widx,
+			                                          shader_stream.cap  - shader_stream.widx);
 		}
 
-		/* TODO(rnp): instance name */
-		s8 shader_name = beamformer_shader_names[shader];
-		program = load_shader(arena, &shader_text, (u32 []){GL_COMPUTE_SHADER}, 1, shader_name);
+		infos[i].kind = sris[i].shader_kind;
+		infos[i].text = arena_stream_commit_zero(&arena, &shader_stream);
+		infos[i].name = beamformer_shader_names[sris[i].shader];
+
+		//s8 line = s8("---------------\n");
+		//s8 nl   = s8("\n");
+		//os_console_log(line.data, line.len);
+		//os_console_log(infos[i].name.data, infos[i].name.len);
+		//os_console_log(nl.data, nl.len);
+		//os_console_log(line.data, line.len);
+		//os_console_log(infos[i].text.data, infos[i].text.len);
+		//os_console_log(line.data, line.len);
 	}
 
-	glDeleteProgram(cp->programs[shader_slot]);
-	cp->programs[shader_slot] = program;
+	vk_pipeline_release(*pipeline);
+	*pipeline = vk_pipeline(infos, count, push_constants_size);
+}
+
+function void
+beamformer_reload_render_pipeline(VulkanHandle *pipeline, BeamformerShaderKind shader, Arena arena)
+{
+	i32 index = beamformer_shader_reloadable_index_by_shader[shader];
+	BeamformerShaderReloadInfo infos[2] = {
+		{
+			.shader      = shader,
+			.shader_kind = beamformer_shader_primitive_is_vertex[index] ? VulkanShaderKind_Vertex : VulkanShaderKind_Mesh,
+			.filename_or_data = BakeShaders ? beamformer_shader_data[index][0]
+			                                : beamformer_reloadable_shader_files[index][0],
+		},
+		{
+			.shader           = shader,
+			.shader_kind      = VulkanShaderKind_Fragment,
+			.filename_or_data = BakeShaders ? beamformer_shader_data[index][1]
+			                                : beamformer_reloadable_shader_files[index][1],
+		},
+	};
+	beamformer_reload_pipeline(pipeline, infos, countof(infos), arena);
+}
+
+function void
+beamformer_reload_compute_pipeline(VulkanHandle *pipeline, BeamformerShaderKind shader,
+                                   BeamformerShaderDescriptor *shader_descriptor, Arena arena)
+{
+	i32 index  = beamformer_shader_reloadable_index_by_shader[shader];
+	uv3 layout = shader_descriptor ? shader_descriptor->layout : (uv3){{vk_gpu_info()->subgroup_size, 1, 1}};
+	BeamformerShaderReloadInfo info = {
+		.shader            = shader,
+		.shader_kind       = VulkanShaderKind_Compute,
+		.shader_descriptor = shader_descriptor,
+		.filename_or_data  = BakeShaders ? beamformer_shader_data[index][0]
+		                                 : beamformer_reloadable_shader_files[index][0],
+		.layout            = layout,
+	};
+	beamformer_reload_pipeline(pipeline, &info, 1, arena);
 }
 
 function void
@@ -755,52 +722,58 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp,
 				cp->shader_hashes[shader_slot] = hash;
 			}
 
-			#define X(k, t, v) glNamedBufferSubData(cp->ubos[BeamformerComputeUBOKind_##k], \
-			                                        0, sizeof(t), &cp->v ## _ubo_data);
-			BEAMFORMER_COMPUTE_UBO_LIST
-			#undef X
-
 			cp->acquisition_count = pb->parameters.acquisition_count;
 			cp->acquisition_kind  = pb->parameters.acquisition_kind;
 
-			u32 decoded_data_size = cp->rf_size;
-			if (ctx->compute_context.ping_pong_ssbo_size < decoded_data_size)
-				alloc_shader_storage(ctx, decoded_data_size, arena);
+			// NOTE(rnp): buffer size / 2 should be a multiple of 64
+			i64 buffer_size = round_up_to(2 * cp->rf_size, 128);
+			if (ctx->compute_context.ping_pong_buffer.size < buffer_size) {
+				GPUBufferAllocateInfo allocate_info = {.size = buffer_size, .label = s8("PingPongBuffer")};
+				vk_buffer_allocate(&ctx->compute_context.ping_pong_buffer, &allocate_info);
+				// TODO(rnp): figure out how to share with CUDA
+			}
 
 			if (cp->hadamard_order != (i32)cp->acquisition_count)
-				update_hadamard_texture(cp, (i32)cp->acquisition_count, 0, arena);
-
-			mem_copy(cp->voxel_transform.E,  pb->parameters.das_voxel_transform.E, sizeof(cp->voxel_transform));
-
-			GLenum gl_kind = cp->iq_pipeline ? GL_RG32F : GL_R32F;
-			if (cp->average_frames > 1 && !beamformer_frame_compatible(ctx->averaged_frames + 0, cp->output_points, gl_kind)) {
-				alloc_beamform_frame(ctx->averaged_frames + 0, cp->output_points, gl_kind, s8("Averaged Frame"), arena);
-				alloc_beamform_frame(ctx->averaged_frames + 1, cp->output_points, gl_kind, s8("Averaged Frame"), arena);
-			}
+				update_hadamard(cp, (i32)cp->acquisition_count, 0, arena);
 		}break;
+
 		case BeamformerParameterBlockRegion_ChannelMapping:{
 			cuda_set_channel_mapping(pb->channel_mapping);
 		}break;
+		case BeamformerParameterRegionFlag_TransmitReceiveOrientations:{
+			GPUBuffer *b = &cp->array_parameters;
+			u32 kind   = BeamformerComputeArrayParameterKind_TransmitReceiveOrientations;
+			u64 offset = beamformer_compute_array_parameter_offsets[kind];
+			u64 size   = beamformer_compute_array_parameter_sizes[kind];
+			{
+				Arena scratch = arena;
+				u16 *u16s = push_array(&scratch, u16, countof(pb->transmit_receive_orientations));
+				for (u32 i = 0; i < countof(pb->transmit_receive_orientations); i++)
+					u16s[i] = pb->transmit_receive_orientations[i];
+
+				vk_buffer_range_upload(b, u16s, offset, size, 0);
+			}
+		}break;
 		case BeamformerParameterRegionFlag_FocalVectors:
 		case BeamformerParameterRegionFlag_SparseElements:
-		case BeamformerParameterRegionFlag_TransmitReceiveOrientations:
 		{
-			BeamformerComputeTextureKind texture_kind = 0;
-			u32 pixel_type = 0, texture_format = 0;
+			u32 kind = BeamformerComputeArrayParameterKind_Count;
 			switch (region) {
-			#define X(kind, _gl, tf, pt, ...) \
-			case BeamformerParameterRegionFlag_##kind:{ \
-				texture_kind   = BeamformerComputeTextureKind_## kind; \
-				texture_format = tf;                                   \
-				pixel_type     = pt;                                   \
+			case BeamformerParameterBlockRegion_FocalVectors:{
+				kind = BeamformerComputeArrayParameterKind_FocalVectors;
+			}break;
+			case BeamformerParameterBlockRegion_SparseElements:{
+				kind = BeamformerComputeArrayParameterKind_SparseElements;
 			}break;
-			BEAMFORMER_COMPUTE_TEXTURE_LIST
-			#undef X
 			InvalidDefaultCase;
 			}
-			glTextureSubImage1D(cp->textures[texture_kind], 0, 0, BeamformerMaxChannelCount,
-			                    texture_format, pixel_type,
-			                    (u8 *)pb + BeamformerParameterBlockRegionOffsets[region]);
+
+			if (kind != BeamformerComputeArrayParameterKind_Count) {
+				GPUBuffer *b = &cp->array_parameters;
+				u64 offset = beamformer_compute_array_parameter_offsets[kind];
+				u64 size   = beamformer_compute_array_parameter_sizes[kind];
+				vk_buffer_range_upload(b, (u8 *)pb + BeamformerParameterBlockRegionOffsets[region], offset, size, 0);
+			}
 		}break;
 		}
 	}
@@ -808,127 +781,205 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp,
 }
 
 function void
-do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame *frame,
-                  BeamformerShaderKind shader, u32 shader_slot, BeamformerShaderParameters *sp, Arena arena)
+do_compute_shader(BeamformerCtx *ctx, VulkanHandle cmd, BeamformerComputePlan *cp, BeamformerFrame *frame,
+                  u32 shader_slot, Arena arena, u64 rf_pointer)
 {
 	BeamformerComputeContext *cc = &ctx->compute_context;
 
-	u32 program = cp->programs[shader_slot];
-	glUseProgram(program);
+	u32 output_index = !cc->ping_pong_input_index;
+	u32 input_index  =  cc->ping_pong_input_index;
 
-	u32 output_ssbo_idx = !cc->last_output_ssbo_index;
-	u32 input_ssbo_idx  = cc->last_output_ssbo_index;
+	u64 pp_size = cc->ping_pong_buffer.size / 2;
+	u64 pp_input_pointer  = cc->ping_pong_buffer.gpu_pointer + input_index  * pp_size;
+	u64 pp_output_pointer = cc->ping_pong_buffer.gpu_pointer + output_index * pp_size;
 
 	uv3 dispatch = cp->shader_descriptors[shader_slot].dispatch;
-	switch (shader) {
-	case BeamformerShaderKind_Decode:{
-		glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R16F);
 
+	vk_command_bind_pipeline(cmd, cp->vulkan_pipelines[shader_slot]);
+
+	switch (cp->pipeline.shaders[shader_slot]) {
+
+	case BeamformerShaderKind_Decode:{
 		BeamformerDecodeMode mode = cp->shader_descriptors[shader_slot].bake.Decode.decode_mode;
-		if (shader_slot == 0) {
-			if (mode != BeamformerDecodeMode_None) {
-				glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[input_ssbo_idx]);
-				glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1);
+		BeamformerDecodePushConstants pc = {
+			.hadamard_buffer = cp->array_parameters.gpu_pointer + offsetof(BeamformerComputeArrayParameters, Hadamard),
+			.output_buffer   = pp_output_pointer,
+		};
 
-				glDispatchCompute(dispatch.x, dispatch.y, dispatch.z);
-				glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
-			}
-		}
+		if (shader_slot == 0 && mode != BeamformerDecodeMode_None) {
+			pc.output_rf_buffer = pp_input_pointer;
+			pc.rf_buffer        = rf_pointer;
+			pc.first_pass       = 1;
 
-		if (mode != BeamformerDecodeMode_None)
-			glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]);
+			GPUMemoryBarrierInfo barrier = {
+				.gpu_buffer = &cc->ping_pong_buffer,
+				.offset     = pp_input_pointer - cc->ping_pong_buffer.gpu_pointer,
+				.size       = pp_size,
+			};
 
-		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, cc->ping_pong_ssbos[output_ssbo_idx]);
+			vk_command_push_constants(cmd, 0, sizeof(pc), &pc);
+			vk_command_dispatch_compute(cmd, dispatch);
+			vk_command_buffer_memory_barriers(cmd, &barrier, 1);
 
-		glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 0);
+			pc.output_rf_buffer = 0;
+		}
+
+		pc.rf_buffer  = pp_input_pointer;
+		pc.first_pass = 0;
 
-		glDispatchCompute(dispatch.x, dispatch.y, dispatch.z);
-		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+		GPUMemoryBarrierInfo barrier = {
+			.gpu_buffer = &cc->ping_pong_buffer,
+			.offset     = pp_output_pointer - cc->ping_pong_buffer.gpu_pointer,
+			.size       = pp_size,
+		};
 
-		cc->last_output_ssbo_index = !cc->last_output_ssbo_index;
+		vk_command_push_constants(cmd, 0, sizeof(pc), &pc);
+		vk_command_dispatch_compute(cmd, dispatch);
+		vk_command_buffer_memory_barriers(cmd, &barrier, 1);
+
+		cc->ping_pong_input_index = !cc->ping_pong_input_index;
 	}break;
+
 	case BeamformerShaderKind_CudaDecode:{
-		cuda_decode(0, output_ssbo_idx, 0);
-		cc->last_output_ssbo_index = !cc->last_output_ssbo_index;
+		cuda_decode(0, output_index, 0);
+		cc->ping_pong_input_index = !cc->ping_pong_input_index;
 	}break;
 	case BeamformerShaderKind_CudaHilbert:{
-		cuda_hilbert(input_ssbo_idx, output_ssbo_idx);
-		cc->last_output_ssbo_index = !cc->last_output_ssbo_index;
+		cuda_hilbert(input_index, output_index);
+		cc->ping_pong_input_index = !cc->ping_pong_input_index;
 	}break;
+
 	case BeamformerShaderKind_Filter:
 	case BeamformerShaderKind_Demodulate:
 	{
-		if (shader_slot != 0)
-			glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]);
-		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[output_ssbo_idx]);
-		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, cp->filters[sp->filter_slot].ssbo);
+		u32 filter_slot = cp->pipeline.parameters[shader_slot].filter_slot;
+		BeamformerFilterPushConstants pc = {
+			.filter_coefficients = cp->filters[filter_slot].buffer.gpu_pointer,
+			.output_data         = pp_output_pointer,
+			.input_data          = shader_slot == 0 ? rf_pointer : pp_input_pointer,
+		};
 
-		glDispatchCompute(dispatch.x, dispatch.y, dispatch.z);
-		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+		GPUMemoryBarrierInfo barrier = {
+			.gpu_buffer = &cc->ping_pong_buffer,
+			.offset     = pp_output_pointer - cc->ping_pong_buffer.gpu_pointer,
+			.size       = pp_size,
+		};
 
-		cc->last_output_ssbo_index = !cc->last_output_ssbo_index;
-	}break;
-	case BeamformerShaderKind_MinMax:{
-		for (i32 i = 1; i < frame->mips; i++) {
-			glBindImageTexture(0, frame->texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
-			glBindImageTexture(1, frame->texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
-			glProgramUniform1i(program, MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i);
+		vk_command_push_constants(cmd, 0, sizeof(pc), &pc);
+		vk_command_dispatch_compute(cmd, dispatch);
+		vk_command_buffer_memory_barriers(cmd, &barrier, 1);
 
-			u32 width  = (u32)frame->dim.x >> i;
-			u32 height = (u32)frame->dim.y >> i;
-			u32 depth  = (u32)frame->dim.z >> i;
-			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
-			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
-		}
+		cc->ping_pong_input_index = !cc->ping_pong_input_index;
 	}break;
+
 	case BeamformerShaderKind_DAS:{
 		local_persist u32 das_cycle_t = 0;
 
-		BeamformerDASBakeParameters *db = &cp->shader_descriptors[shader_slot].bake.DAS;
-		if (db->fast) {
-			glClearTexImage(frame->texture, 0, GL_RED, GL_FLOAT, 0);
-			glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
-			glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_READ_WRITE, cp->iq_pipeline ? GL_RG32F : GL_R32F);
+		GPUBuffer *b = cc->backlog.buffer;
+
+		u64 frame_size      = beamformer_frame_byte_size(frame->points, frame->data_kind);
+		u64 incoherent_size = frame_size / beamformer_data_kind_element_count[frame->data_kind];
+
+		BeamformerDASPushConstants pc = {
+			.xdc_element_pitch  = cp->xdc_element_pitch,
+			.rf_data            = pp_input_pointer,
+			.output_data        = b->gpu_pointer + frame->buffer_offset,
+			.incoherent_output  = b->gpu_pointer + b->size - incoherent_size,
+			.array_parameters   = cp->array_parameters.gpu_pointer + offsetof(BeamformerDASArrayParameters, focal_vectors),
+			.output_size_x      = cp->output_points.x,
+			.output_size_y      = cp->output_points.y,
+			.output_size_z      = cp->output_points.z,
+			.cycle_t            = das_cycle_t++,
+		};
+		mem_copy(pc.voxel_transform.E, cp->voxel_transform.E, sizeof(pc.voxel_transform));
+		mem_copy(pc.xdc_transform.E,   cp->xdc_transform.E,   sizeof(pc.xdc_transform));
+
+		b32 coherent = cp->shader_descriptors[shader_slot].bake.DAS.coherency_weighting;
+
+		i32 loop_end;
+		if (cp->acquisition_kind == BeamformerAcquisitionKind_RCA_VLS ||
+		    cp->acquisition_kind == BeamformerAcquisitionKind_RCA_TPW)
+		{
+			/* NOTE(rnp): to avoid repeatedly sampling the whole focal vectors
+			 * texture we loop over transmits for VLS/TPW */
+			loop_end = (i32)cp->acquisition_count;
 		} else {
-			glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, cp->iq_pipeline ? GL_RG32F : GL_R32F);
+			loop_end = (i32)cp->shader_descriptors[shader_slot].bake.DAS.channel_count;
 		}
 
-		u32 sparse_texture = cp->textures[BeamformerComputeTextureKind_SparseElements];
-		if (!db->sparse) sparse_texture = 0;
-
-		glBindBufferBase(GL_UNIFORM_BUFFER, 0, cp->ubos[BeamformerComputeUBOKind_DAS]);
-		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx], 0, cp->rf_size);
-		glBindImageTexture(1, sparse_texture, 0, 0, 0, GL_READ_ONLY, GL_R16I);
-		glBindImageTexture(2, cp->textures[BeamformerComputeTextureKind_FocalVectors], 0, 0, 0, GL_READ_ONLY, GL_RG32F);
-		glBindImageTexture(3, cp->textures[BeamformerComputeTextureKind_TransmitReceiveOrientations], 0, 0, 0, GL_READ_ONLY, GL_R8I);
+		GPUMemoryBarrierInfo memory_barriers[2] = {
+			{
+				.gpu_buffer = b,
+				.offset     = frame->buffer_offset,
+				.size       = frame_size,
+			},
+			{
+				.gpu_buffer = b,
+				.offset     = pc.incoherent_output - b->gpu_pointer,
+				.size       = incoherent_size,
+			},
+		};
 
-		glProgramUniform1ui(program, DAS_CYCLE_T_UNIFORM_LOC, das_cycle_t++);
+		// NOTE(rnp): barrier to wait for clear pipeline to complete
+		vk_command_buffer_memory_barriers(cmd, memory_barriers, 1 + coherent);
 
-		if (db->fast) {
-			i32 loop_end;
-			if (db->acquisition_kind == BeamformerAcquisitionKind_RCA_VLS ||
-			    db->acquisition_kind == BeamformerAcquisitionKind_RCA_TPW)
-			{
-				/* NOTE(rnp): to avoid repeatedly sampling the whole focal vectors
-				 * texture we loop over transmits for VLS/TPW */
-				loop_end = (i32)db->acquisition_count;
-			} else {
-				loop_end = (i32)db->channel_count;
-			}
-			f32 percent_per_step = 1.0f / (f32)loop_end;
-			cc->processing_progress = -percent_per_step;
-			for (i32 index = 0; index < loop_end; index++) {
-				cc->processing_progress += percent_per_step;
-				/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
-				glFinish();
-				glProgramUniform1i(program, DAS_FAST_CHANNEL_UNIFORM_LOC, index);
-				glDispatchCompute(dispatch.x, dispatch.y, dispatch.z);
-				glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
+		vk_command_push_constants(cmd, 0, sizeof(pc), &pc);
+		for (i32 index = 0; index < loop_end; index++) {
+			if (index != 0) {
+				pc.channel_t = index;
+				vk_command_push_constants(cmd, offsetof(BeamformerDASPushConstants, channel_t),
+				                          sizeof(pc.channel_t), &pc.channel_t);
 			}
-		} else {
-			glDispatchCompute(dispatch.x, dispatch.y, dispatch.z);
+			vk_command_dispatch_compute(cmd, dispatch);
+			vk_command_buffer_memory_barriers(cmd, memory_barriers, 1 + coherent);
+		}
+	}break;
+
+	case BeamformerShaderKind_CoherencyWeighting:{
+		GPUBuffer *b = cc->backlog.buffer;
+
+		u64 frame_size      = beamformer_frame_byte_size(frame->points, frame->data_kind);
+		u64 incoherent_size = frame_size / beamformer_data_kind_element_count[frame->data_kind];
+
+		GPUMemoryBarrierInfo memory_barrier = {
+			.gpu_buffer = b,
+			.offset     = frame->buffer_offset,
+			.size       = frame_size,
+		};
+
+		BeamformerCoherencyWeightingPushConstants cwpc = {
+			.left_side_buffer  = b->gpu_pointer + frame->buffer_offset,
+			.right_side_buffer = b->gpu_pointer + b->size - incoherent_size,
+			.elements          = incoherent_size / beamformer_data_kind_element_size[frame->data_kind],
+			.scale             = 1.0f,
+			.output_size_x     = cp->output_points.x,
+			.output_size_y     = cp->output_points.y,
+			.output_size_z     = cp->output_points.z,
+		};
+
+		vk_command_push_constants(cmd, 0, sizeof(cwpc), &cwpc);
+		vk_command_dispatch_compute(cmd, dispatch);
+		vk_command_buffer_memory_barriers(cmd, &memory_barrier, 1);
+	}break;
+
+	// NOTE(rnp): invalid stages should be filtered in planning phase
+	InvalidDefaultCase;
+	}
+
+	#if 0
+	switch (shader) {
+	case BeamformerShaderKind_MinMax:{
+		for (u32 i = 1; i < frame->image.mip_map_levels; i++) {
+			glBindImageTexture(0, frame->texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
+			glBindImageTexture(1, frame->texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
+			glProgramUniform1i(program, MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i);
+
+			u32 width  = (u32)frame->dim.x >> i;
+			u32 height = (u32)frame->dim.y >> i;
+			u32 depth  = (u32)frame->dim.z >> i;
+			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
+			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
 		}
-		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
 	}break;
 	case BeamformerShaderKind_Sum:{
 		u32 aframe_index = ctx->averaged_frame_index % countof(ctx->averaged_frames);
@@ -950,77 +1001,27 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame
 		assert(to_average == frame_count);
 
 		glProgramUniform1f(program, SUM_PRESCALE_UNIFORM_LOC, 1 / (f32)frame_count);
-		do_sum_shader(cc, in_textures, frame_count, aframe->texture, aframe->dim);
+		/* NOTE: zero output before summing */
+		glClearTexImage(aframe->texture, 0, GL_RED, GL_FLOAT, 0);
+		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
+
+		glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
+		for (u32 i = 0; i < in_texture_count; i++) {
+			glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
+			glDispatchCompute(dispatch.x, dispatch.y, dispatch.z);
+			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
+		}
+
 		mem_copy(aframe->voxel_transform.E,  frame->voxel_transform.E, sizeof(frame->voxel_transform));
 		aframe->compound_count   = frame->compound_count;
 		aframe->acquisition_kind = frame->acquisition_kind;
 	}break;
-	InvalidDefaultCase;
 	}
-}
-
-function s8
-shader_text_with_header(s8 header, s8 filepath, b32 has_file, BeamformerShaderKind shader_kind, Arena *arena)
-{
-	Stream sb = arena_stream(*arena);
-	stream_push_shader_header(&sb, shader_kind, header);
-	stream_append_s8(&sb, s8("\n#line 1\n"));
-
-	s8 result;
-	if (BakeShaders) {
-		/* TODO(rnp): better handling of shaders with no backing file */
-		if (has_file) {
-			i32 reloadable_index = beamformer_shader_reloadable_index_by_shader[shader_kind];
-			stream_append_s8(&sb, beamformer_shader_data[reloadable_index]);
-		}
-		result = arena_stream_commit(arena, &sb);
-	} else {
-		result = arena_stream_commit(arena, &sb);
-		if (has_file) {
-			i64 length = os_read_entire_file((c8 *)filepath.data, arena->beg, arena_capacity(arena, u8));
-			result.len += length;
-			arena_commit(arena, length);
-		}
-	}
-
-	return result;
-}
-
-/* NOTE(rnp): currently this function is only handling rendering shaders.
- * look at load_compute_shader for compute shaders */
-function void
-beamformer_reload_shader(BeamformerCtx *ctx, BeamformerShaderReloadContext *src, Arena arena, s8 shader_name)
-{
-	BeamformerShaderKind kind = beamformer_reloadable_shader_kinds[src->reloadable_info_index];
-	assert(kind == BeamformerShaderKind_Render3D);
-
-	s8 path = push_s8_from_parts(&arena, os_path_separator(), s8("shaders"),
-	                             beamformer_reloadable_shader_files[src->reloadable_info_index]);
-
-	i32 shader_count = 1;
-	BeamformerShaderReloadContext *link = src->link;
-	while (link != src) { shader_count++; link = link->link; }
-
-	s8  *shader_texts = push_array(&arena, s8,  shader_count);
-	u32 *shader_types = push_array(&arena, u32, shader_count);
-
-	i32 index = 0;
-	do {
-		b32 has_file = link->reloadable_info_index >= 0;
-		shader_texts[index] = shader_text_with_header(link->header, path, has_file, kind, &arena);
-		shader_types[index] = link->gl_type;
-		index++;
-		link = link->link;
-	} while (link != src);
-
-	u32 *shader = &ctx->frame_view_render_context.shader;
-	glDeleteProgram(*shader);
-	*shader = load_shader(arena, shader_texts, shader_types, shader_count, shader_name);
-	ctx->frame_view_render_context.updated = 1;
+	#endif
 }
 
 function void
-complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_context)
+complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena)
 {
 	BeamformerComputeContext * cs = &ctx->compute_context;
 	BeamformerSharedMemory *   sm = ctx->shared_memory;
@@ -1029,6 +1030,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
 	while (work) {
 		b32 can_commit = 1;
 		switch (work->kind) {
+
 		case BeamformerWorkKind_ExportBuffer:{
 			/* TODO(rnp): better way of handling DispatchCompute barrier */
 			post_sync_barrier(ctx->shared_memory, BeamformerSharedMemoryLockKind_DispatchCompute);
@@ -1036,15 +1038,15 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
 			BeamformerExportContext *ec = &work->export_context;
 			switch (ec->kind) {
 			case BeamformerExportKind_BeamformedData:{
-				BeamformerFrame *frame = ctx->latest_frame;
-				if (frame) {
-					assert(frame->ready_to_present);
-					u32 texture  = frame->texture;
-					iv3 dim      = frame->dim;
-					u32 out_size = (u32)dim.x * (u32)dim.y * (u32)dim.z * 2 * sizeof(f32);
-					if (out_size <= ec->size) {
-						glGetTextureImage(texture, 0, GL_RG, GL_FLOAT, (i32)out_size,
-						                  beamformer_shared_memory_scratch_arena(sm, ctx->shared_memory_size).beg);
+				BeamformerFrame *f = ctx->latest_frame;
+				if (f) {
+					u64 frame_size = beamformer_frame_byte_size(f->points, f->data_kind);
+					assert((frame_size & 63) == 0);
+					if (frame_size <= ec->size) {
+						vk_host_wait_timeline(VulkanTimeline_Compute, f->timeline_valid_value, -1ULL);
+						vk_buffer_range_download(beamformer_shared_memory_scratch_arena(sm, ctx->shared_memory_size).beg,
+						                         ctx->compute_context.backlog.buffer, f->buffer_offset,
+						                         frame_size, 1);
 					}
 				}
 			}break;
@@ -1062,6 +1064,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
 			beamformer_shared_memory_release_lock(ctx->shared_memory, work->lock);
 			post_sync_barrier(ctx->shared_memory, BeamformerSharedMemoryLockKind_ExportSync);
 		}break;
+
 		case BeamformerWorkKind_CreateFilter:{
 			/* TODO(rnp): this should probably get deleted and moved to lazy loading */
 			BeamformerCreateFilterContext *fctx = &work->create_filter_context;
@@ -1070,20 +1073,18 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
 			BeamformerComputePlan *cp = beamformer_compute_plan_for_block(cs, block, arena);
 			beamformer_filter_update(cp->filters + slot, fctx->parameters, block, slot, *arena);
 		}break;
+
 		case BeamformerWorkKind_ComputeIndirect:{
 			fill_frame_compute_work(ctx, work, work->compute_indirect_context.view_plane,
 			                        work->compute_indirect_context.parameter_block, 1);
 		} /* FALLTHROUGH */
-		case BeamformerWorkKind_Compute:{
-			DEBUG_DECL(glClearNamedBufferData(cs->ping_pong_ssbos[0], GL_RG32F, GL_RG, GL_FLOAT, 0);)
-			DEBUG_DECL(glClearNamedBufferData(cs->ping_pong_ssbos[1], GL_RG32F, GL_RG, GL_FLOAT, 0);)
-			DEBUG_DECL(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);)
 
+		case BeamformerWorkKind_Compute:{
 			push_compute_timing_info(ctx->compute_timing_table,
 			                         (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameBegin});
 
 			BeamformerComputePlan *cp = beamformer_compute_plan_for_block(cs, work->compute_context.parameter_block, arena);
-			if (beamformer_parameter_block_dirty(sm, work->compute_context.parameter_block)) {
+			if unlikely(beamformer_parameter_block_dirty(sm, work->compute_context.parameter_block)) {
 				u32 block = work->compute_context.parameter_block;
 				beamformer_commit_parameter_block(ctx, cp, block, *arena);
 			}
@@ -1094,91 +1095,134 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
 			static_assert(ISPOWEROF2(BeamformerMaxComputeShaderStages),
 			              "max compute shader stages must be power of 2");
 			assert((dirty_programs & ~((u32)BeamformerMaxComputeShaderStages - 1)) == 0);
-			for EachBit(dirty_programs, slot)
-				load_compute_shader(ctx, cp, (u32)slot, *arena);
+			for EachBit(dirty_programs, slot) {
+				beamformer_reload_compute_pipeline(cp->vulkan_pipelines + slot, cp->pipeline.shaders[slot],
+				                                   cp->shader_descriptors + slot, *arena);
+			}
 
 			atomic_store_u32(&cs->processing_compute, 1);
-			start_renderdoc_capture(gl_context);
 
-			BeamformerFrame *frame = work->compute_context.frame;
+			start_renderdoc_capture();
 
-			GLenum gl_kind = cp->iq_pipeline ? GL_RG32F : GL_R32F;
-			if (!beamformer_frame_compatible(frame, cp->output_points, gl_kind))
-				alloc_beamform_frame(frame, cp->output_points, gl_kind, s8("Beamformed_Data"), *arena);
+			i32 das_index = -1;
+			b32 has_sum   = 0;
+			for (u32 i = 0; i < cp->pipeline.shader_count; i++) {
+				has_sum |= cp->pipeline.shaders[i] == BeamformerShaderKind_Sum;
+				if (cp->pipeline.shaders[i] == BeamformerShaderKind_DAS)
+					das_index = (i32)i;
+			}
 
-			m4 voxel_transform = m4_mul(cp->ui_voxel_transform, cp->voxel_transform);
-			mem_copy(frame->voxel_transform.E, voxel_transform.E, sizeof(voxel_transform));
+			b32 das_coherent = das_index >= 0 && cp->shader_descriptors[das_index].bake.DAS.coherency_weighting;
+			u64 reserved_frame_size = 0;
+
+			if (has_sum)
+				reserved_frame_size += beamformer_frame_byte_size(cp->output_points, cp->iq_pipeline ?
+				                                                  BeamformerDataKind_Float32Complex :
+				                                                  BeamformerDataKind_Float32);
+
+			// TODO(rnp): incoherent sum for different data kinds
+			if (das_coherent)
+				reserved_frame_size += beamformer_frame_byte_size(cp->output_points, BeamformerDataKind_Float32);
+
+			BeamformerFrame *frame  = beamformer_frame_next(cs, cp->output_points, cp->iq_pipeline, reserved_frame_size);
 			frame->acquisition_kind = cp->acquisition_kind;
 			frame->compound_count   = cp->acquisition_count;
-
-			BeamformerComputeContext  *cc       = &ctx->compute_context;
-			BeamformerComputePipeline *pipeline = &cp->pipeline;
-			/* NOTE(rnp): first stage requires access to raw data buffer directly so we break
-			 * it out into a separate step. This way data can get released as soon as possible */
-			if (pipeline->shader_count > 0) {
-				BeamformerRFBuffer *rf = &cs->rf_buffer;
-				u32 compute_index = rf->compute_index;
-				u32 slot = compute_index % countof(rf->compute_syncs);
-
-				if (work->kind == BeamformerWorkKind_ComputeIndirect) {
-					/* NOTE(rnp): compute indirect is used when uploading data. if compute thread
-					 * preempts upload it must wait for slot counter to reach a value it hasn't
-					 * processed yet. */
-					spin_wait(atomic_load_u64(rf->uploaded_data_indices + slot) <= compute_index);
-
-					/* NOTE(rnp): if the GPU supports BAR there may be no need to synchronize
-					 * other than the above spin */
-					if (vk_buffer_needs_sync(&rf->buffer))
-						glWaitSemaphoreEXT(rf->gl_upload_semaphores[slot], 0, 0, 0, 0, 0);
-				} else {
-					slot = (rf->compute_index - 1) % countof(rf->compute_syncs);
+			mem_copy(frame->voxel_transform.E, cp->voxel_transform.E, sizeof(cp->voxel_transform));
+
+			VulkanHandle cmd = vk_command_begin(VulkanTimeline_Compute);
+			vk_command_timestamp(cmd);
+
+			if (das_index >= 0) {
+				GPUBuffer *backlog = cs->backlog.buffer;
+				u32 subgroup_size = vk_gpu_info()->subgroup_size;
+				BeamformerBufferClearPushConstants pc = {
+					.data       = backlog->gpu_pointer + frame->buffer_offset,
+					.clear_word = 0,
+					.words      = beamformer_frame_byte_size(frame->points, frame->data_kind) / sizeof(u32),
+				};
+
+				u32 index = BeamformerShaderKind_BufferClear - BeamformerShaderKind_ComputeInternalFirst;
+				vk_command_bind_pipeline(cmd, cs->compute_internal_pipelines[index]);
+				vk_command_push_constants(cmd, 0, sizeof(pc), &pc);
+				vk_command_dispatch_compute(cmd, (uv3){{(u32)ceil_f32((f32)pc.words / subgroup_size), 1, 1}});
+
+				if (das_coherent) {
+					pc.words = pc.words / beamformer_data_kind_element_count[frame->data_kind];
+					pc.data  = backlog->gpu_pointer + backlog->size - sizeof(u32) * pc.words;
+					vk_command_push_constants(cmd, 0, sizeof(pc), &pc);
+					vk_command_dispatch_compute(cmd, (uv3){{(u32)ceil_f32((f32)pc.words / subgroup_size), 1, 1}});
 				}
+			}
 
-				glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, rf->ssbo, slot * rf->active_rf_size, rf->active_rf_size);
+			BeamformerRFBuffer *rf = &cs->rf_buffer;
+			u32 compute_index = rf->compute_index;
+			u32 slot = compute_index % countof(rf->upload_complete_values);
 
-				glBeginQuery(GL_TIME_ELAPSED, cc->shader_timer_ids[0]);
-				do_compute_shader(ctx, cp, frame, pipeline->shaders[0], 0, pipeline->parameters + 0, *arena);
-				glEndQuery(GL_TIME_ELAPSED);
+			if (work->kind == BeamformerWorkKind_ComputeIndirect) {
+				// TODO(rnp): this shouldn't be necessary, there should be a way of communicating
+				// what the value will be so that only the command wait is needed.
+				spin_wait(atomic_load_u64(rf->upload_complete_values + slot) <= compute_index);
 
-				if (work->kind == BeamformerWorkKind_ComputeIndirect) {
-					atomic_store_u64(rf->compute_syncs + slot, glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0));
-					atomic_add_u64(&rf->compute_index, 1);
-				}
+				/* NOTE(rnp): if the GPU supports BAR there may be no need to synchronize
+				 * other than the above spin */
+				if (vk_buffer_needs_sync(&rf->buffer))
+					vk_command_wait_timeline(cmd, VulkanTimeline_Transfer, rf->upload_complete_values[slot]);
+			} else {
+				slot = (rf->compute_index - 1) % countof(rf->upload_complete_values);
+			}
+
+			for (u32 i = 0; i < cp->pipeline.shader_count; i++) {
+				do_compute_shader(ctx, cmd, cp, frame, i, *arena,
+				                  rf->buffer.gpu_pointer + slot * rf->active_rf_size);
+				vk_command_timestamp(cmd);
 			}
 
-			b32 did_sum_shader = 0;
-			for (u32 i = 1; i < pipeline->shader_count; i++) {
-				did_sum_shader |= pipeline->shaders[i] == BeamformerShaderKind_Sum;
-				glBeginQuery(GL_TIME_ELAPSED, cc->shader_timer_ids[i]);
-				do_compute_shader(ctx, cp, frame, pipeline->shaders[i], i, pipeline->parameters + i, *arena);
-				glEndQuery(GL_TIME_ELAPSED);
+			u64 end_timeline_value = vk_command_end(cmd, (VulkanHandle){0}, (VulkanHandle){0});
+			if (work->kind == BeamformerWorkKind_ComputeIndirect) {
+				atomic_store_u64(rf->compute_complete_values + slot, end_timeline_value);
+				atomic_add_u64(&rf->compute_index, 1);
 			}
 
-			/* NOTE(rnp): the first of these blocks until work completes */
-			for (u32 i = 0; i < pipeline->shader_count; i++) {
-				ComputeTimingInfo info = {0};
-				info.kind   = ComputeTimingInfoKind_Shader;
-				info.shader = pipeline->shaders[i];
-				glGetQueryObjectui64v(cc->shader_timer_ids[i], GL_QUERY_RESULT, &info.timer_count);
-				push_compute_timing_info(ctx->compute_timing_table, info);
+			atomic_store_u64(&frame->timeline_valid_value, end_timeline_value);
+
+			{
+				Arena scratch    = *arena;
+				/* NOTE(rnp): this blocks until work completes */
+				u64 * timestamps = vk_command_read_timestamps(VulkanTimeline_Compute, &scratch);
+
+				u64 last_time    = timestamps[0] > 0 ? timestamps[1] : 0;
+				u32 shader_index = 0;
+				for (u64 i = 2; i < timestamps[0] + 1; i++) {
+					push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){
+						.kind        = ComputeTimingInfoKind_Shader,
+						.shader      = cp->pipeline.shaders[shader_index],
+						.shader_slot = shader_index,
+						.timer_count = timestamps[i] - last_time,
+					});
+					last_time = timestamps[i];
+					shader_index++;
+				}
 			}
+
 			cs->processing_progress = 1;
 
-			frame->ready_to_present = 1;
-			if (did_sum_shader) {
+			if (has_sum) {
+				#if 0
 				u32 aframe_index = ((ctx->averaged_frame_index++) % countof(ctx->averaged_frames));
 				ctx->averaged_frames[aframe_index].view_plane_tag  = frame->view_plane_tag;
 				ctx->averaged_frames[aframe_index].ready_to_present = 1;
 				atomic_store_u64((u64 *)&ctx->latest_frame, (u64)(ctx->averaged_frames + aframe_index));
+				#endif
 			} else {
 				atomic_store_u64((u64 *)&ctx->latest_frame, (u64)frame);
 			}
-			cs->processing_compute  = 0;
+
+			atomic_store_u32(&cs->processing_compute, 0);
 
 			push_compute_timing_info(ctx->compute_timing_table,
 			                         (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameEnd});
 
-			end_renderdoc_capture(gl_context);
+			end_renderdoc_capture();
 		}break;
 		InvalidDefaultCase;
 		}
@@ -1199,95 +1243,69 @@ coalesce_timing_table(ComputeTimingTable *t, ComputeShaderStats *stats)
 	u32 target = atomic_load_u32(&t->write_index);
 	u32 stats_index = (stats->latest_frame_index + 1) % countof(stats->table.times);
 
-	static_assert(BeamformerShaderKind_Count + 1 <= 32, "timing coalescence bitfield test");
-	u32 seen_info_test = 0;
+	b32 has_rf = 0;
+	f32 gpu_clocks_to_nano = 1.0e-9f * vk_gpu_info()->timestamp_period_ns;
 
+	// NOTE(rnp): not equal (the index may wrap)
 	while (t->read_index != target) {
 		ComputeTimingInfo info = t->buffer[t->read_index % countof(t->buffer)];
 		switch (info.kind) {
+
 		case ComputeTimingInfoKind_ComputeFrameBegin:{
 			assert(t->compute_frame_active == 0);
 			t->compute_frame_active = 1;
 			/* NOTE(rnp): allow multiple instances of same shader to accumulate */
+			t->in_flight_shader_count = 0;
+			memory_clear(t->in_flight_shader_ids, 0, sizeof(t->in_flight_shader_ids));
 			memory_clear(stats->table.times[stats_index], 0, sizeof(stats->table.times[stats_index]));
 		}break;
+
 		case ComputeTimingInfoKind_ComputeFrameEnd:{
 			assert(t->compute_frame_active == 1);
 			t->compute_frame_active = 0;
 			stats->latest_frame_index = stats_index;
 			stats_index = (stats_index + 1) % countof(stats->table.times);
+			stats->table.shader_count = t->in_flight_shader_count;
+			mem_copy(stats->table.shader_ids, t->in_flight_shader_ids, sizeof(t->in_flight_shader_ids));
 		}break;
+
 		case ComputeTimingInfoKind_Shader:{
-			stats->table.times[stats_index][info.shader] += (f32)info.timer_count / 1.0e9f;
-			seen_info_test |= (1u << info.shader);
+			t->in_flight_shader_count = Max(t->in_flight_shader_count, info.shader_slot + 1u);
+			t->in_flight_shader_ids[info.shader_slot] = info.shader;
+			stats->table.times[stats_index][info.shader_slot] += info.timer_count * gpu_clocks_to_nano;
 		}break;
+
 		case ComputeTimingInfoKind_RF_Data:{
 			stats->latest_rf_index = (stats->latest_rf_index + 1) % countof(stats->table.rf_time_deltas);
-			f32 delta = (f32)(info.timer_count - stats->last_rf_timer_count) / 1.0e9f;
+			f32 delta = info.timer_count / (f32)os_system_info()->timer_frequency;
 			stats->table.rf_time_deltas[stats->latest_rf_index] = delta;
-			stats->last_rf_timer_count = info.timer_count;
-			seen_info_test |= (1 << BeamformerShaderKind_Count);
+			has_rf = 1;
 		}break;
 		}
 		/* NOTE(rnp): do this at the end so that stats table is always in a consistent state */
-		atomic_add_u32(&t->read_index, 1);
+		t->read_index++;
 	}
 
-	if (seen_info_test) {
-		for EachEnumValue(BeamformerShaderKind, shader) {
-			if (seen_info_test & (1 << shader)) {
-				f32 sum = 0;
-				for EachElement(stats->table.times, i)
-					sum += stats->table.times[i][shader];
-				stats->average_times[shader] = sum / countof(stats->table.times);
-			}
-		}
+	for (u32 i = 0; i < stats->table.shader_count; i++) {
+		f32 sum = 0;
+		for EachElement(stats->table.times, it)
+			sum += stats->table.times[it][i];
+		stats->average_times[i] = sum / countof(stats->table.times);
+	}
 
-		if (seen_info_test & (1 << BeamformerShaderKind_Count)) {
-			f32 sum = 0;
-			for EachElement(stats->table.rf_time_deltas, i)
-				sum += stats->table.rf_time_deltas[i];
-			stats->rf_time_delta_average = sum / countof(stats->table.rf_time_deltas);
-		}
+	if (has_rf) {
+		f32 sum = 0;
+		for EachElement(stats->table.rf_time_deltas, i)
+			sum += stats->table.rf_time_deltas[i];
+		stats->rf_time_delta_average = sum / countof(stats->table.rf_time_deltas);
 	}
 }
 
 DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
 {
-	BeamformerCtx *ctx         = (BeamformerCtx *)user_context;
 	BeamformerSharedMemory *sm = ctx->shared_memory;
-	complete_queue(ctx, &sm->external_work_queue, arena, gl_context);
-	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context);
-}
-
-function void
-beamformer_rf_buffer_allocate(BeamformerRFBuffer *rf, u32 rf_size)
-{
-	if ValidHandle(rf->export_handle)
-		os_release_handle(rf->export_handle);
-
-	OSHandle export = {0};
-	vk_buffer_allocate(&rf->buffer, (iz)rf_size, GPUBufferCreateFlags_HostWritable|GPUBufferCreateFlags_MemoryOnly,
-	                   &export, s8(""));
-
-	glDeleteBuffers(1, &rf->ssbo);
-	glCreateBuffers(1, &rf->ssbo);
-
-	glDeleteMemoryObjectsEXT(1, &rf->memory_object);
-	glCreateMemoryObjectsEXT(1, &rf->memory_object);
-
-	if (OS_WINDOWS) {
-		glImportMemoryWin32HandleEXT(rf->memory_object, rf->buffer.size, GL_HANDLE_TYPE_OPAQUE_WIN32_EXT,
-		                             (void *)export.value[0]);
-		// NOTE(rnp): w32 does not transfer ownership from handle back to driver
-		rf->export_handle = export;
-	} else {
-		glImportMemoryFdEXT(rf->memory_object, rf->buffer.size, GL_HANDLE_TYPE_OPAQUE_FD_EXT, export.value[0]);
-	}
-
-	glNamedBufferStorageMemEXT(rf->ssbo, rf->buffer.size, rf->memory_object, 0);
-
-	LABEL_GL_OBJECT(GL_BUFFER, rf->ssbo, s8("Raw_RF_SSBO"));
+	complete_queue(ctx, &sm->external_work_queue, arena);
+	complete_queue(ctx, ctx->beamform_work_queue, arena);
 }
 
 DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
@@ -1305,22 +1323,20 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
 		BeamformerRFBuffer *rf = ctx->rf_buffer;
 
 		rf->active_rf_size = vk_round_up_to_sync_size(rf_block_rf_size & 0xFFFFFFFFULL, 64);
-		if unlikely(rf->buffer.size < countof(rf->compute_syncs) * rf->active_rf_size)
-			beamformer_rf_buffer_allocate(rf, countof(rf->compute_syncs) * rf->active_rf_size);
+		if unlikely(rf->buffer.size < countof(rf->upload_complete_values) * rf->active_rf_size) {
+			GPUBufferAllocateInfo allocate_info = {
+				.size  = countof(rf->upload_complete_values) * rf->active_rf_size,
+				.flags = VulkanUsageFlag_HostReadWrite,
+				.label = s8("RawRFBuffer"),
+			};
+			vk_buffer_allocate(&rf->buffer, &allocate_info);
+		}
 
-		u32 slot = rf->insertion_index++ % countof(rf->compute_syncs);
+		u32 slot = rf->insertion_index % countof(rf->upload_complete_values);
 
 		/* NOTE(rnp): don't overwrite slot if the compute thread hasn't processed it */
-		u64 current_slot_value = rf->uploaded_data_indices[slot];
-		spin_wait(atomic_load_u64(&rf->compute_index) < current_slot_value);
-
-		if (atomic_load_u64(rf->compute_syncs + slot)) {
-			GLenum sync_result = glClientWaitSync(rf->compute_syncs[slot], 0, 1000000000);
-			if (sync_result == GL_TIMEOUT_EXPIRED || sync_result == GL_WAIT_FAILED) {
-				// TODO(rnp): what do?
-			}
-			glDeleteSync(rf->compute_syncs[slot]);
-		}
+		spin_wait(atomic_load_u64(&rf->compute_index) < rf->upload_complete_values[slot]);
+		vk_host_wait_timeline(VulkanTimeline_Compute, rf->compute_complete_values[slot], -1ULL);
 
 		vk_buffer_range_upload(&rf->buffer, beamformer_shared_memory_scratch_arena(sm, ctx->shared_memory_size).beg,
 		                       slot * rf->active_rf_size, rf->active_rf_size, 1);
@@ -1329,19 +1345,17 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
 		beamformer_shared_memory_release_lock(ctx->shared_memory, (i32)scratch_lock);
 		post_sync_barrier(ctx->shared_memory, upload_lock);
 
-		if (vk_buffer_needs_sync(&rf->buffer)) {
-			// TODO(rnp): vk_buffer_sync
-		}
-
-		atomic_store_u64(rf->uploaded_data_indices + slot, rf->insertion_index);
-		atomic_store_u64(rf->compute_syncs + slot, 0);
+		rf->insertion_index++;
+		atomic_store_u64(rf->upload_complete_values + slot, vk_host_signal_timeline(VulkanTimeline_Transfer));
 
 		os_wake_all_waiters(ctx->compute_worker_sync);
 
-		ComputeTimingInfo info = {.kind = ComputeTimingInfoKind_RF_Data};
-		glGetQueryObjectui64v(rf->data_timestamp_query, GL_QUERY_RESULT, &info.timer_count);
-		glQueryCounter(rf->data_timestamp_query, GL_TIMESTAMP);
-		push_compute_timing_info(ctx->compute_timing_table, info);
+		u64 current_time = os_timer_count();
+		push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){
+			.kind        = ComputeTimingInfoKind_RF_Data,
+			.timer_count = current_time - rf->timestamp,
+		});
+		rf->timestamp = current_time;
 	}
 }
 
@@ -1373,33 +1387,49 @@ beamformer_process_input_events(BeamformerCtx *ctx, BeamformerInput *input,
 		case BeamformerInputEventKind_ExecutableReload:{
 			ui_init(ctx, ctx->ui_backing_store);
 
+			if (!vk_pipeline_valid(ctx->compute_context.compute_internal_pipelines[0])) {
+				for EachElement(ctx->compute_context.compute_internal_pipelines, it) {
+					beamformer_reload_compute_pipeline(ctx->compute_context.compute_internal_pipelines + it,
+					                                   BeamformerShaderKind_ComputeInternalFirst + it, 0,
+					                                   ctx->arena);
+				}
+			}
+
 			#if BEAMFORMER_RENDERDOC_HOOKS
-			start_frame_capture = input->renderdoc_start_frame_capture;
-			end_frame_capture   = input->renderdoc_end_frame_capture;
+			start_frame_capture       = input->renderdoc_start_frame_capture;
+			end_frame_capture         = input->renderdoc_end_frame_capture;
+			set_capture_path_template = input->renderdoc_set_capture_file_path_template;
 			#endif
 		}break;
 
 		case BeamformerInputEventKind_FileEvent:{
 			BeamformerFileReloadContext *frc = event->file_watch_user_context;
 			switch (frc->kind) {
-			case BeamformerFileReloadKind_Shader:{
-				BeamformerShaderReloadContext *src = frc->shader_reload_context;
-				BeamformerShaderKind kind = beamformer_reloadable_shader_kinds[src->reloadable_info_index];
-				beamformer_reload_shader(ctx, src, ctx->arena, beamformer_shader_names[kind]);
+			case BeamformerFileReloadKind_ComputeInternalShader:{
+				// TODO(rnp): this could stall, better to push it onto compute once queue is better
+				beamformer_reload_compute_pipeline(frc->shader_reload.pipeline, frc->shader_reload.shader, 0, ctx->arena);
 			}break;
+
 			case BeamformerFileReloadKind_ComputeShader:{
 				for EachElement(ctx->compute_context.compute_plans, block) {
 					BeamformerComputePlan *cp = ctx->compute_context.compute_plans[block];
 					for (u32 slot = 0; cp && slot < cp->pipeline.shader_count; slot++) {
 						i32 shader_index = beamformer_shader_reloadable_index_by_shader[cp->pipeline.shaders[slot]];
-						if (beamformer_reloadable_shader_kinds[shader_index] == frc->compute_shader_kind)
+						if (beamformer_reloadable_shader_kinds[shader_index] == frc->shader_reload.shader)
 							atomic_or_u32(&cp->dirty_programs, 1 << slot);
 					}
 				}
 
+				// TODO(rnp): track latest parameter block
 				if (ctx->latest_frame)
-					beamformer_queue_compute(ctx, ctx->latest_frame, ctx->latest_frame->parameter_block);
+					beamformer_queue_compute(ctx, ctx->latest_frame, 0);
+			}break;
+
+			case BeamformerFileReloadKind_RenderShader:{
+				beamformer_reload_render_pipeline(frc->shader_reload.pipeline, frc->shader_reload.shader, ctx->arena);
+				ctx->render_shader_updated = 1;
 			}break;
+
 			InvalidDefaultCase;
 			}
 		}break;
@@ -1437,5 +1467,5 @@ beamformer_frame_step(BeamformerInput *input)
 	BeamformerViewPlaneTag  tag   = frame? frame->view_plane_tag : 0;
 	draw_ui(ctx, input, frame, tag);
 
-	ctx->frame_view_render_context.updated = 0;
+	ctx->render_shader_updated = 0;
 }
diff --git a/beamformer_internal.h b/beamformer_internal.h
@@ -10,12 +10,8 @@
 #include "generated/beamformer.meta.c"
 #include "generated/beamformer_shaders.c"
 
-#include <raylib_extended.h>
-#include <rlgl.h>
-
-#include "threads.c"
-#include "util_gl.c"
-#include "util_os.c"
+#include "external/raylib/src/raylib.h"
+#include "external/raylib/src/rlgl.h"
 
 #define beamformer_info(s) s8("[info] " s "\n")
 
@@ -24,16 +20,63 @@
 typedef struct { u64 value[1]; } VulkanHandle;
 
 typedef enum {
-	GPUBufferCreateFlags_HostWritable = 1 << 0,
-	GPUBufferCreateFlags_MemoryOnly   = 1 << 1,
-} GPUBufferCreateFlags;
+	VulkanTimeline_Graphics,
+	VulkanTimeline_Compute,
+	VulkanTimeline_Transfer,
+	VulkanTimeline_Count,
+} VulkanTimeline;
+
+typedef enum {
+	VulkanShaderKind_Vertex,
+	VulkanShaderKind_Mesh,
+	VulkanShaderKind_Fragment,
+	VulkanShaderKind_Compute,
+	VulkanShaderKind_Count,
+} VulkanShaderKind;
+
+typedef enum {
+	VulkanImageUsage_None,
+	VulkanImageUsage_Colour,
+	VulkanImageUsage_DepthStencil,
+	VulkanImageUsage_Count,
+} VulkanImageUsage;
+
+typedef enum {
+	VulkanUsageFlag_ImageSampling       = 1 << 0,
+	VulkanUsageFlag_HostReadWrite       = 1 << 1, // NOTE: not valid on images
+	/* NOTE: uses:
+	 * - image-image copy operations
+	 * - buffer-buffer copy operations
+	 */
+	VulkanUsageFlag_TransferSource      = 1 << 2,
+	VulkanUsageFlag_TransferDestination = 1 << 3,
+} VulkanUsageFlags;
+
+typedef struct {
+	VulkanShaderKind kind;
+	s8               text;
+	s8               name;
+} VulkanPipelineCreateInfo;
 
 typedef struct {
+	VulkanHandle buffer;
 	u64          gpu_pointer;
 	i64          size;
-	VulkanHandle buffer;
+
+	// NOTE: only used for render models
+	u64          index_count;
 } GPUBuffer;
 
+typedef struct {
+	VulkanHandle image;
+	u32          width;
+	u32          height;
+	u32          samples;
+	u32          mip_map_levels;
+	// TODO(rnp): this is only here for importing from OpenGL, move it back into handle later
+	u64          memory_size;
+} GPUImage;
+
 typedef enum {
 	GPUVendor_AMD      = 0x1002,
 	GPUVendor_NVIDIA   = 0x10DE,
@@ -59,28 +102,94 @@ typedef struct {
 	u64 gpu_heap_used;
 } GPUInfo;
 
+typedef struct {
+	i64               size;
+	VulkanUsageFlags  flags;
+
+	// NOTE(rnp): only required if buffer will be used on multiple timelines
+	VulkanTimeline   *timelines_used;
+	u32               timeline_count;
+
+	s8                label;
+} GPUBufferAllocateInfo;
+
+typedef struct {
+	GPUBuffer *gpu_buffer;
+	u64        offset;
+	u64        size;
+} GPUMemoryBarrierInfo;
+
+typedef struct {
+	GPUBuffer model;
+	u32       vertex_count;
+	u32       normals_offset;
+} RenderModel;
+
+#include "threads.c"
+#include "util_os.c"
+
 ///////////////////////////
 // NOTE: vulkan layer API
 DEBUG_IMPORT void vk_load(OSLibrary vulkan, Arena *memory, Stream *error);
 
 DEBUG_IMPORT GPUInfo *vk_gpu_info(void);
 
-DEBUG_IMPORT void vk_buffer_allocate(GPUBuffer *, iz size, GPUBufferCreateFlags flags, OSHandle *export, s8 label);
+DEBUG_IMPORT void vk_buffer_allocate(GPUBuffer *, GPUBufferAllocateInfo *info);
 DEBUG_IMPORT void vk_buffer_release(GPUBuffer *);
 DEBUG_IMPORT void vk_buffer_range_upload(GPUBuffer *, void *data, u64 offset, u64 size, b32 non_temporal);
+DEBUG_IMPORT void vk_buffer_range_download(void *output, GPUBuffer *, u64 source_offset, u64 size, b32 non_temporal);
 DEBUG_IMPORT u64  vk_round_up_to_sync_size(u64, u64 min);
 
-/* NOTE: Compute shaders do not have bindings. Data should be passed using push constants.
+// NOTE: images are 2D only, any other use case should just use a buffer and index in the shader
+DEBUG_IMPORT void vk_image_allocate(GPUImage *, u32 width, u32 height, u32 mips, u32 samples, VulkanImageUsage usage, VulkanUsageFlags flags, OSHandle *export);
+DEBUG_IMPORT void vk_image_release(GPUImage *);
+
+DEBUG_IMPORT void vk_render_model_allocate(GPUBuffer *, void *indices, u64 index_count, u64 model_size, s8 label);
+DEBUG_IMPORT void vk_render_model_range_upload(GPUBuffer *, void *data, u64 offset, u64 size, b32 non_temporal);
+DEBUG_IMPORT void vk_render_model_release(GPUBuffer *);
+
+/* NOTE: Pipelines do not have bindings. Data should be passed using push constants.
  * In particular the push constants should contain pointers to gpu memory using the
  * BufferDeviceAddress extension. */
 // TODO(rnp): change this to accept SPIR-V directly and accept BakeParameters as specialization data
-DEBUG_IMPORT VulkanHandle vk_compute_shader(s8 text, s8 name);
-DEBUG_IMPORT void         vk_compute_shader_release(VulkanHandle);
+DEBUG_IMPORT VulkanHandle vk_pipeline(VulkanPipelineCreateInfo *infos, u32 count, u32 push_constants_size);
+DEBUG_IMPORT b32          vk_pipeline_valid(VulkanHandle);
+DEBUG_IMPORT void         vk_pipeline_release(VulkanHandle);
 
-// NOTE: temporary API
 DEBUG_IMPORT b32 vk_buffer_needs_sync(GPUBuffer *);
 
-DEBUG_IMPORT VulkanHandle vk_semaphore_create(OSHandle *export);
+DEBUG_IMPORT VulkanHandle vk_create_semaphore(OSHandle *export);
+
+DEBUG_IMPORT b32          vk_host_wait_timeline(VulkanTimeline timeline, u64 value, u64 timeout_ns);
+DEBUG_IMPORT u64          vk_host_signal_timeline(VulkanTimeline timeline);
+
+DEBUG_IMPORT VulkanHandle vk_command_begin(VulkanTimeline timeline);
+DEBUG_IMPORT void         vk_command_bind_pipeline(VulkanHandle command, VulkanHandle pipeline);
+DEBUG_IMPORT void         vk_command_buffer_memory_barriers(VulkanHandle command, GPUMemoryBarrierInfo *barriers, u64 count);
+DEBUG_IMPORT void         vk_command_dispatch_compute(VulkanHandle command, uv3 dispatch);
+DEBUG_IMPORT void         vk_command_push_constants(VulkanHandle command, u32 offset, u32 size, void *values);
+DEBUG_IMPORT void         vk_command_timestamp(VulkanHandle command);
+DEBUG_IMPORT void         vk_command_wait_timeline(VulkanHandle command, VulkanTimeline timeline, u64 value);
+// NOTE: extra semaphores only exist for synchronization with OpenGL and will be removed in the future
+DEBUG_IMPORT u64          vk_command_end(VulkanHandle command, VulkanHandle wait_semaphore, VulkanHandle finished_semaphore);
+
+DEBUG_IMPORT void         vk_command_begin_rendering(VulkanHandle command, GPUImage *restrict colour, GPUImage *restrict depth, GPUImage *restrict resolve);
+DEBUG_IMPORT void         vk_command_draw(VulkanHandle command, GPUBuffer *model);
+DEBUG_IMPORT void         vk_command_scissor(VulkanHandle command, u32 width, u32 height, u32 x_offset, u32 y_offset);
+DEBUG_IMPORT void         vk_command_viewport(VulkanHandle command, f32 width, f32 height, f32 x_offset, f32 y_offset, f32 min_depth, f32 max_depth);
+DEBUG_IMPORT void         vk_command_end_rendering(VulkanHandle command);
+
+DEBUG_IMPORT void         vk_command_copy_buffer(VulkanHandle command, GPUBuffer *restrict destination, GPUBuffer *restrict source, u64 source_offset, i64 size);
+
+// NOTE: returns an array of (timestamp count + 1) elements; the first element is the count.
+//       Calling thread may stall until results are available.
+DEBUG_IMPORT u64 *        vk_command_read_timestamps(VulkanTimeline timeline, Arena *arena);
+
+#if BEAMFORMER_RENDERDOC_HOOKS
+DEBUG_IMPORT void *       vk_renderdoc_instance_handle(void);
+#else
+#define vk_renderdoc_instance_handle() ((void *)0)
+#endif
 
 ///////////////////////////////
 // NOTE: CUDA Library Bindings
@@ -119,73 +228,34 @@ CUDALibraryProcedureList
 /////////////////////////////////////
 // NOTE: Core Beamformer Definitions
 
-/* TODO(rnp): this should be a UBO */
-#define FRAME_VIEW_MODEL_MATRIX_LOC   0
-#define FRAME_VIEW_VIEW_MATRIX_LOC    1
-#define FRAME_VIEW_PROJ_MATRIX_LOC    2
-#define FRAME_VIEW_DYNAMIC_RANGE_LOC  3
-#define FRAME_VIEW_THRESHOLD_LOC      4
-#define FRAME_VIEW_GAMMA_LOC          5
-#define FRAME_VIEW_LOG_SCALE_LOC      6
-#define FRAME_VIEW_BB_COLOUR_LOC      7
-#define FRAME_VIEW_BB_FRACTION_LOC    8
-#define FRAME_VIEW_SOLID_BB_LOC      10
-
-#define FRAME_VIEW_BB_COLOUR   0.92, 0.88, 0.78, 1.0
-#define FRAME_VIEW_BB_FRACTION 0.007f
-
-#define FRAME_VIEW_RENDER_TARGET_SIZE 1024, 1024
-
-typedef struct {
-	u32 shader;
-	u32 framebuffers[2];  /* [0] -> multisample target, [1] -> normal target for resolving */
-	u32 renderbuffers[2]; /* only used for 3D views, size is fixed */
-	b32 updated;
-} FrameViewRenderContext;
-
 #include "beamformer_parameters.h"
 #include "beamformer_shared_memory.c"
 
 typedef struct {
-	iptr elements_offset;
-	i32  elements;
-	u32  buffer;
-	u32  vao;
-} BeamformerRenderModel;
-
-typedef struct {
 	BeamformerFilterParameters parameters;
-	f32 time_delay;
-	i32 length;
-	u32 ssbo;
+	f32                        time_delay;
+	i32                        length;
+	GPUBuffer                  buffer;
 } BeamformerFilter;
 
-/* TODO(rnp): need 1 UBO per filter slot */
-#define BEAMFORMER_COMPUTE_UBO_LIST \
-	X(DAS,        BeamformerDASPushConstants,    das)
-
-#define X(k, ...) BeamformerComputeUBOKind_##k,
-typedef enum {BEAMFORMER_COMPUTE_UBO_LIST BeamformerComputeUBOKind_Count} BeamformerComputeUBOKind;
-#undef X
-
-// X(kind, gl_kind, texture_format, pixel_type)
-#define BEAMFORMER_COMPUTE_TEXTURE_LIST \
-	X(FocalVectors,                GL_RG32F, GL_RG,          GL_FLOAT) \
-	X(SparseElements,              GL_R16I,  GL_RED_INTEGER, GL_SHORT) \
-	X(TransmitReceiveOrientations, GL_R8I,   GL_RED_INTEGER, GL_BYTE)
-
-#define BEAMFORMER_COMPUTE_TEXTURE_LIST_FULL \
-	BEAMFORMER_COMPUTE_TEXTURE_LIST \
-	X(Hadamard,       GL_R16F)
+// X(kind, format, elements)
+#define BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST \
+	X(Hadamard,                    f16, BeamformerMaxChannelCount * BeamformerMaxChannelCount) \
+	X(FocalVectors,                v2,  BeamformerMaxChannelCount) \
+	X(SparseElements,              i16, BeamformerMaxChannelCount) \
+	X(TransmitReceiveOrientations, u16, BeamformerMaxChannelCount) \
 
 typedef enum {
-	#define X(k, ...) BeamformerComputeTextureKind_##k,
-	BEAMFORMER_COMPUTE_TEXTURE_LIST_FULL
+	#define X(k, ...) BeamformerComputeArrayParameterKind_##k,
+	BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST
 	#undef X
-	BeamformerComputeTextureKind_Count
-} BeamformerComputeTextureKind;
-static_assert((BeamformerComputeTextureKind_Count - 1) == BeamformerComputeTextureKind_Hadamard,
-              "BeamformerComputeTextureKind_Hadamard must be end of TextureKinds");
+	BeamformerComputeArrayParameterKind_Count
+} BeamformerComputeArrayParameterKind;
+
+// NOTE(rnp): only used to calculate offsets, never used directly
+#define X(name, type, elements) alignas(64) type name[elements];
+typedef struct {BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST} BeamformerComputeArrayParameters;
+#undef X
 
 typedef struct {
 	uv3 layout;
@@ -197,7 +267,7 @@ typedef struct BeamformerComputePlan BeamformerComputePlan;
 struct BeamformerComputePlan {
 	BeamformerComputePipeline pipeline;
 
-	u32 programs[BeamformerMaxComputeShaderStages];
+	VulkanHandle vulkan_pipelines[BeamformerMaxComputeShaderStages];
 
 	u32 dirty_programs;
 
@@ -214,14 +284,15 @@ struct BeamformerComputePlan {
 	iv3 output_points;
 	i32 average_frames;
 
-	u32 textures[BeamformerComputeTextureKind_Count];
-	u32 ubos[BeamformerComputeUBOKind_Count];
+	// TODO(rnp): specialization constants
+	v2  xdc_element_pitch;
+	m4  xdc_transform;
+	// TODO(rnp): probably just compute this every time
+	m4  das_voxel_transform;
 
-	BeamformerFilter filters[BeamformerFilterSlots];
+	GPUBuffer array_parameters;
 
-	#define X(k, type, name) type name ##_ubo_data;
-	BEAMFORMER_COMPUTE_UBO_LIST
-	#undef X
+	BeamformerFilter filters[BeamformerFilterSlots];
 
 	u128 shader_hashes[BeamformerMaxComputeShaderStages];
 	BeamformerShaderDescriptor shader_descriptors[BeamformerMaxComputeShaderStages];
@@ -230,50 +301,20 @@ struct BeamformerComputePlan {
 };
 
 typedef struct {
-	// NOTE(rnp): w32 doesn't transfer ownership of these when they are imported
-	// into the driver. For now just store them here, this code won't be around for long
-	OSHandle     upload_semaphores_handles[BeamformerMaxRawDataFramesInFlight];
-	VulkanHandle vk_upload_semaphores[BeamformerMaxRawDataFramesInFlight];
-	u32          gl_upload_semaphores[BeamformerMaxRawDataFramesInFlight];
-
-	GLsync       compute_syncs[BeamformerMaxRawDataFramesInFlight];
-
-	u64          uploaded_data_indices[BeamformerMaxRawDataFramesInFlight];
+	u64 upload_complete_values[BeamformerMaxRawDataFramesInFlight];
+	u64 compute_complete_values[BeamformerMaxRawDataFramesInFlight];
 
 	GPUBuffer buffer;
-	OSHandle  export_handle;
-
-	u32 ssbo, memory_object;
 
 	u32 active_rf_size;
-	u32 data_timestamp_query;
+
+	u64 timestamp;
 
 	u64 insertion_index;
 	u64 compute_index;
 } BeamformerRFBuffer;
 
 typedef struct {
-	BeamformerRFBuffer rf_buffer;
-
-	BeamformerComputePlan *compute_plans[BeamformerMaxParameterBlocks];
-	BeamformerComputePlan *compute_plan_freelist;
-
-	/* NOTE(rnp): two interstage ssbos are allocated so that they may be used to
-	 * ping pong data between compute stages */
-	u32 ping_pong_ssbos[2];
-	u32 last_output_ssbo_index;
-
-	u32 ping_pong_ssbo_size;
-
-	f32 processing_progress;
-	b32 processing_compute;
-
-	u32 shader_timer_ids[BeamformerMaxComputeShaderStages];
-
-	BeamformerRenderModel unit_cube_model;
-} BeamformerComputeContext;
-
-typedef struct {
 	BeamformerComputeStatsTable table;
 	f32 average_times[BeamformerShaderKind_Count];
 
@@ -296,7 +337,11 @@ typedef struct {
 	u64 timer_count;
 	ComputeTimingInfoKind kind;
 	union {
-		BeamformerShaderKind shader;
+		struct {
+			static_assert(BeamformerShaderKind_Count <= U16_MAX, "");
+			u16 shader;
+			u16 shader_slot;
+		};
 	};
 } ComputeTimingInfo;
 
@@ -304,6 +349,10 @@ typedef struct {
 	u32 write_index;
 	u32 read_index;
 	b32 compute_frame_active;
+
+	u32                  in_flight_shader_count;
+	BeamformerShaderKind in_flight_shader_ids[BeamformerMaxComputeShaderStages];
+
 	ComputeTimingInfo buffer[4096];
 } ComputeTimingTable;
 
@@ -315,34 +364,57 @@ typedef struct {
 	i32                     *compute_worker_sync;
 } BeamformerUploadThreadContext;
 
-struct BeamformerFrame {
-	u32 texture;
-	b32 ready_to_present;
-
-	iv3 dim;
-	i32 mips;
+typedef struct {
+	u64 buffer_offset;
+	u64 timeline_valid_value;
 
 	/* NOTE: for use when displaying either prebeamformed frames or on the current frame
 	 * when we intend to recompute on the next frame */
 	m4  voxel_transform;
 
-	// metadata
-	GLenum                    gl_kind;
+	iv3 points;
+
 	u32                       id;
 	u32                       compound_count;
-	u32                       parameter_block;
+	BeamformerDataKind        data_kind;
 	BeamformerAcquisitionKind acquisition_kind;
 	BeamformerViewPlaneTag    view_plane_tag;
+} BeamformerFrame;
 
-	BeamformerFrame *next;
-};
+/* NOTE(rnp): backing storage for beamformed frames. The number of backlog frames
+ * is dependent on the currently requested output size. */
+typedef struct {
+	GPUBuffer   buffer[1];
+
+	u64         next_offset;
+	u64         counter;
+
+	BeamformerFrame frames[BeamformerMaxBacklogFrames];
+} BeamformerFrameBacklog;
+
+typedef struct {
+	BeamformerRFBuffer rf_buffer;
+
+	BeamformerComputePlan *compute_plans[BeamformerMaxParameterBlocks];
+	BeamformerComputePlan *compute_plan_freelist;
+
+	VulkanHandle compute_internal_pipelines[BeamformerShaderKind_ComputeInternalCount];
+
+	/* NOTE(rnp): used to ping pong data between compute stages.
+	 * Half the buffer will be used for reading and the other for writing. */
+	GPUBuffer ping_pong_buffer;
+	u32 ping_pong_input_index;
+
+	f32 processing_progress;
+	b32 processing_compute;
+
+	BeamformerFrameBacklog backlog;
+} BeamformerComputeContext;
 
 typedef struct {
 	OSThread handle;
 
 	Arena arena;
-	iptr  window_handle;
-	iptr  gl_context;
 	iptr  user_context;
 	i32   sync_variable;
 	b32   awake;
@@ -367,26 +439,15 @@ typedef struct {
 
 	u64    frame_timestamp;
 
-	BeamformerComputeContext compute_context;
-
-	/* TODO(rnp): ideally this would go in the UI but its hard to manage with the UI
-	 * destroying itself on hot-reload */
-	FrameViewRenderContext frame_view_render_context;
-
 	Stream error_stream;
 
-	BeamformWorkQueue *beamform_work_queue;
-
-	ComputeShaderStats *compute_shader_stats;
-	ComputeTimingTable *compute_timing_table;
-
 	BeamformerSharedMemory *shared_memory;
 	i64                     shared_memory_size;
 
-	BeamformerFrame beamform_frames[BeamformerMaxBacklogFrames];
 	BeamformerFrame *latest_frame;
-	u32 next_render_frame_index;
-	u32 display_frame_index;
+
+	// TODO(rnp): track elsewhere
+	b32 render_shader_updated;
 
 	/* NOTE: this will only be used when we are averaging */
 	u32             averaged_frame_index;
@@ -394,31 +455,47 @@ typedef struct {
 
 	GLWorkerThreadContext  upload_worker;
 	GLWorkerThreadContext  compute_worker;
+
+	BeamformerComputeContext compute_context;
+
+	ComputeShaderStats compute_shader_stats[1];
+	ComputeTimingTable compute_timing_table[1];
+
+	BeamformWorkQueue  beamform_work_queue[1];
 } BeamformerCtx;
 #define BeamformerContextMemory(m) (BeamformerCtx *)align_pointer_up((m), alignof(BeamformerCtx));
 
 typedef enum {
-	BeamformerFileReloadKind_Shader,
+	BeamformerFileReloadKind_ComputeInternalShader,
 	BeamformerFileReloadKind_ComputeShader,
+	BeamformerFileReloadKind_RenderShader,
 } BeamformerFileReloadKind;
 
-typedef struct BeamformerShaderReloadContext BeamformerShaderReloadContext;
-struct BeamformerShaderReloadContext {
-	BeamformerShaderReloadContext * link;
-	s8     header;
-	GLenum gl_type;
-	i32    reloadable_info_index;
-};
+typedef struct {
+	BeamformerShaderKind shader;
+	VulkanHandle *       pipeline;
+} BeamformerShaderReloadData;
+
+typedef struct {
+	BeamformerShaderKind  shader;
+	VulkanShaderKind      shader_kind;
+
+	// NOTE(rnp): based on BakeShaders compile time value
+	s8                    filename_or_data;
+
+	BeamformerShaderDescriptor *shader_descriptor;
+
+	uv3 layout;
+} BeamformerShaderReloadInfo;
 
 typedef struct {
 	BeamformerFileReloadKind kind;
 	union {
-		BeamformerShaderReloadContext * shader_reload_context;
-		BeamformerShaderKind            compute_shader_kind;
+		BeamformerShaderReloadData shader_reload;
 	};
 } BeamformerFileReloadContext;
 
-#define BEAMFORMER_COMPLETE_COMPUTE_FN(name) void name(iptr user_context, Arena *arena, iptr gl_context)
+#define BEAMFORMER_COMPLETE_COMPUTE_FN(name) void name(BeamformerCtx *ctx, Arena *arena)
 typedef BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute_fn);
 
 #define BEAMFORMER_RF_UPLOAD_FN(name) void name(BeamformerUploadThreadContext *ctx)
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -10,10 +10,12 @@
  */
 
 typedef struct {
+	uint64_t shader_count;
+	uint32_t shader_ids[BeamformerMaxComputeShaderStages];
 	/* NOTE(rnp): this wants to be iterated on both dimensions. it depends entirely on which
 	 * visualization method you want to use. the coalescing function wants both directions */
-	float times[32][BeamformerMaxComputeShaderStages];
-	float rf_time_deltas[32];
+	float    times[32][BeamformerMaxComputeShaderStages];
+	float    rf_time_deltas[32];
 } BeamformerComputeStatsTable;
 
 /* X(type, id, pretty name) */
diff --git a/beamformer_shared_memory.c b/beamformer_shared_memory.c
@@ -1,7 +1,5 @@
 /* See LICENSE for license details. */
-#define BEAMFORMER_SHARED_MEMORY_VERSION (28UL)
-
-typedef struct BeamformerFrame BeamformerFrame;
+#define BEAMFORMER_SHARED_MEMORY_VERSION (29UL)
 
 typedef enum {
 	BeamformerWorkKind_Compute,
@@ -39,8 +37,7 @@ typedef enum {BEAMFORMER_SHARED_MEMORY_LOCKS BeamformerSharedMemoryLockKind_Coun
 #undef X
 
 typedef struct {
-	BeamformerFrame *frame;
-	u32              parameter_block;
+	u32 parameter_block;
 } BeamformerComputeWorkContext;
 
 typedef struct {
@@ -161,6 +158,8 @@ typedef struct {
 	/* TODO(rnp): this is really sucky. we need a better way to communicate this */
 	u64 rf_block_rf_size;
 
+	u64 max_beamformed_data_size;
+
 	BeamformerLiveImagingParameters live_imaging_parameters;
 	BeamformerLiveImagingDirtyFlags live_imaging_dirty_flags;
 
diff --git a/build.c b/build.c
@@ -741,9 +741,8 @@ build_raylib(Arena a)
 {
 	b32 result = 1, shared = config.debug;
 	char *libraylib = shared ? OS_SHARED_LINK_LIB("raylib") : OUTPUT_LIB(OS_STATIC_LIB("raylib"));
-	if (needs_rebuild(libraylib, "external/include/rlgl.h", "external/raylib")) {
+	if (needs_rebuild(libraylib, "external/raylib")) {
 		git_submodule_update(a, "external/raylib");
-		os_copy_file("external/raylib/src/rlgl.h", "external/include/rlgl.h");
 
 		CommandList cc = {0};
 		cmd_base(&a, &cc, 0, config.debug);
@@ -752,16 +751,17 @@ build_raylib(Arena a)
 		if (!is_msvc) cmd_append(&a, &cc, "-Wno-unused-but-set-variable");
 		cmd_append(&a, &cc, "-Iexternal/include", "-Iexternal/raylib/src", "-Iexternal/raylib/src/external/glfw/include");
 		#define RAYLIB_SOURCES \
+			X(rcore)     \
 			X(rglfw)     \
 			X(rshapes)   \
 			X(rtext)     \
 			X(rtextures) \
 			X(utils)
 		#define X(name) "external/raylib/src/" #name ".c",
-		char *srcs[] = {"external/rcore_extended.c", RAYLIB_SOURCES};
+		char *srcs[] = {RAYLIB_SOURCES};
 		#undef X
 		#define X(name) OUTPUT(OBJECT(#name)),
-		char *outs[] = {OUTPUT(OBJECT("rcore_extended")), RAYLIB_SOURCES};
+		char *outs[] = {RAYLIB_SOURCES};
 		#undef X
 
 		if (shared) {
@@ -1045,9 +1045,11 @@ meta_end_and_write_matlab(MetaprogramContext *m, char *path)
 	X(EndScope) \
 	X(Enumeration) \
 	X(Expand) \
+	X(FragmentShader) \
 	X(Library) \
 	X(MATLAB) \
 	X(PushConstants) \
+	X(RenderShader) \
 	X(Shader) \
 	X(ShaderAlias) \
 	X(ShaderGroup) \
@@ -1055,6 +1057,7 @@ meta_end_and_write_matlab(MetaprogramContext *m, char *path)
 	X(Struct) \
 	X(Table) \
 	X(Union) \
+	X(VertexShader) \
 
 typedef enum {
 	#define X(k, ...) MetaEntryKind_## k,
@@ -1080,14 +1083,14 @@ typedef enum {
 } MetaEmitLang;
 
 #define META_KIND_LIST \
-	X(M4,  m4,  mat4,      float,    single, 64, 16) \
-	X(V4,  v4,  vec4,      float,    single, 16,  4) \
-	X(SV4, iv4, ivec4,     int32_t,  int32,  16,  4) \
-	X(UV4, uv4, uvec4,     uint32_t, uint32, 16,  4) \
-	X(UV2, uv2, uvec2,     uint32_t, uint32,  8,  2) \
-	X(V3,  v3,  vec3,      float,    single, 12,  3) \
-	X(V2,  v2,  vec2,      float,    single,  8,  2) \
-	X(F32, f32, float,     float,    single,  4,  1) \
+	X(M4,  m4,  f32mat4,   float,    single, 64, 16) \
+	X(V4,  v4,  f32vec4,   float,    single, 16,  4) \
+	X(SV4, iv4, i32vec4,   int32_t,  int32,  16,  4) \
+	X(UV4, uv4, u32vec4,   uint32_t, uint32, 16,  4) \
+	X(UV2, uv2, u32vec2,   uint32_t, uint32,  8,  2) \
+	X(V3,  v3,  f32vec3,   float,    single, 12,  3) \
+	X(V2,  v2,  f32vec2,   float,    single,  8,  2) \
+	X(F32, f32, float32_t, float,    single,  4,  1) \
 	X(S32, i32, int32_t,   int32_t,  int32,   4,  1) \
 	X(S16, i16, int16_t,   int16_t,  int16,   2,  1) \
 	X(S8,  i8,  int8_t,    int8_t,   int8,    1,  1) \
@@ -1749,14 +1752,28 @@ typedef struct {
 typedef enum {
 	MetaShaderKind_Alias,
 	MetaShaderKind_Compute,
+	MetaShaderKind_Render,
 	MetaShaderKind_Count,
 } MetaShaderKind;
 
+// NOTE: primitive stage kind recorded for a render shader. The render shader parser
+// assigns `Vertex` when it reads a VertexShader entry; `Mesh` is the zero value.
+typedef enum {
+	MetaShaderPrimitiveKind_Mesh,
+	MetaShaderPrimitiveKind_Vertex,
+	MetaShaderPrimitiveKind_Count,
+} MetaShaderPrimitiveKind;
+
+// NOTE: data specific to shaders of kind MetaShaderKind_Render
+typedef struct {
+	MetaShaderPrimitiveKind kind;
+} MetaRenderShader;
+
 typedef struct {
 	MetaShaderKind kind;
 	MetaIDList     entity_reference_ids;
-	s8             file;
-	MetaEntityID   alias_parent_id;
+	s8             files[2];
+	union {
+		MetaEntityID      alias_parent_id;
+		MetaRenderShader  render;
+	};
 } MetaShader;
 
 #define META_STRUCT_FIELDS \
@@ -2320,6 +2337,44 @@ meta_pack_shader_common(MetaContext *ctx, MetaEntityID shader_id, MetaEntry *e, 
 }
 
 function i64
+meta_pack_render_shader(MetaContext *ctx, MetaEntry *entries, i64 entry_count, MetaEntityID group_entity_id)
+{
+	assert(entries[0].kind == MetaEntryKind_RenderShader);
+
+	MetaEntityID entity_id = meta_intern_entity(ctx, entries->name, MetaEntityKind_Shader,
+	                                            group_entity_id, entries->location, 0);
+	meta_entity(ctx, entity_id)->shader.kind = MetaShaderKind_Render;
+
+	meta_entry_argument_expected(entries);
+
+	MetaEntryScope scope = meta_entry_extract_scope(entries, entry_count);
+	if (scope.consumed > 1) {
+		for (MetaEntry *e = scope.start; e < scope.one_past_last; e++) {
+			switch (e->kind) {
+
+			case MetaEntryKind_VertexShader:{
+				if (meta_entity(ctx, entity_id)->shader.files[0].len)
+					meta_entry_error(e, "primitive shader file redefined\n");
+				meta_entity(ctx, entity_id)->shader.files[0] = meta_entry_argument_expect(e, 0, MetaEntryArgumentKind_String).string;
+				meta_entity(ctx, entity_id)->shader.render.kind = MetaShaderPrimitiveKind_Vertex;
+			}break;
+
+			case MetaEntryKind_FragmentShader:{
+				if (meta_entity(ctx, entity_id)->shader.files[1].len)
+					meta_entry_error(e, "fragment shader file redefined\n");
+				meta_entity(ctx, entity_id)->shader.files[1] = meta_entry_argument_expect(e, 0, MetaEntryArgumentKind_String).string;
+			}break;
+
+			default:{
+				e += meta_pack_shader_common(ctx, entity_id, e, scope.one_past_last - e, group_entity_id);
+			}break;
+			}
+		}
+	}
+	return scope.consumed;
+}
+
+function i64
 meta_pack_compute_shader(MetaContext *ctx, MetaEntry *entries, i64 entry_count, MetaEntityID group_entity_id)
 {
 	assert(entries[0].kind == MetaEntryKind_Shader);
@@ -2332,7 +2387,7 @@ meta_pack_compute_shader(MetaContext *ctx, MetaEntry *entries, i64 entry_count, 
 		meta_entry_argument_expected(entries, s8("[file_name]"));
 	} else if (entries->argument_count == 1) {
 		s8 shader_file = meta_entry_argument_expect(entries, 0, MetaEntryArgumentKind_String).string;
-		meta_entity(ctx, entity_id)->shader.file = shader_file;
+		meta_entity(ctx, entity_id)->shader.files[0] = shader_file;
 	}
 
 	MetaEntryScope scope = meta_entry_extract_scope(entries, entry_count);
@@ -2360,6 +2415,9 @@ meta_pack_shader_group(MetaContext *ctx, MetaEntry *entries, i64 entry_count)
 	if (scope.consumed > 1) {
 		for (MetaEntry *e = scope.start; e < scope.one_past_last; e++) {
 			switch (e->kind) {
+			case MetaEntryKind_RenderShader:{
+				e += meta_pack_render_shader(ctx, e, scope.one_past_last - e, entity_id);
+			}break;
 			case MetaEntryKind_Shader:{
 				e += meta_pack_compute_shader(ctx, e, scope.one_past_last - e, entity_id);
 			}break;
@@ -3480,12 +3538,15 @@ meta_push_shader_reload_info(MetaprogramContext *m, MetaContext *ctx)
 		}
 	} meta_end_scope(m, s8("};\n"));
 
-	meta_begin_scope(m, s8("read_only global s8 " META_NAMESPACE_LOWER "_reloadable_shader_files[] = {"));
+	meta_begin_scope(m, s8("read_only global s8 *" META_NAMESPACE_LOWER "_reloadable_shader_files[] = {"));
 	{
 		for (da_count shader = 0; shader < ctx->base_shader_count; shader++) {
 			da_count    id = ctx->base_shader_ids[shader];
 			MetaShader *s  = &ctx->entities.data[id].shader;
-			meta_push_line(m, s8("s8_comp(\""), s->file, s8("\"),"));
+			meta_begin_line(m, s8("(s8 []){s8_comp(\""), s->files[0], s8("\")"));
+			if (s->files[1].len)
+				meta_push(m, s8(", s8_comp(\""), s->files[1], s8("\")"));
+			meta_end_line(m, s8("},"));
 		}
 	} meta_end_scope(m, s8("};\n"));
 
@@ -3558,7 +3619,7 @@ meta_push_shader_reload_info(MetaprogramContext *m, MetaContext *ctx)
 					.element_count_style = MetaPushStructStyle_C,
 					.base_types          = meta_kind_glsl_types,
 					.prefix              = str8("\"  "),
-					.suffix              = str8("\\n\""),
+					.suffix              = str8(";\\n\""),
 				});
 				meta_push_line(m, s8("\"};\\n\""));
 				meta_push_line(m, s8("\"\\n\"),"));
@@ -3566,7 +3627,7 @@ meta_push_shader_reload_info(MetaprogramContext *m, MetaContext *ctx)
 
 			case MetaEntityKind_PushConstants:{
 				meta_push_line(m, s8("s8_comp(\"\""));
-				meta_push_line(m, s8("\"layout(std140, binding = 0) uniform PushConstants {\\n\""));
+				meta_push_line(m, s8("\"layout(push_constant, std430) uniform PushConstants {\\n\""));
 				meta_push_struct_body(ctx, m, e, (MetaPushStructParameters){
 					.layout_style        = MetaPushStructStyle_C,
 					.union_style         = MetaPushStructStyle_C,
@@ -3593,6 +3654,21 @@ meta_push_shader_reload_info(MetaprogramContext *m, MetaContext *ctx)
 			m->scratch = ctx->scratch;
 		}
 	} meta_end_scope(m, s8("};\n"));
+
+	meta_begin_scope(m, s8("read_only global b8 " META_NAMESPACE_LOWER "_shader_has_primitive[] = {"));
+	for (da_count bs = 0; bs < ctx->base_shader_count; bs++) {
+		MetaShader *s = &ctx->entities.data[ctx->base_shader_ids[bs]].shader;
+		meta_push_line(m, s->kind == MetaShaderKind_Render ? s8("1,") : s8("0,"));
+	}
+	meta_end_scope(m, s8("};\n"));
+
+	meta_begin_scope(m, s8("read_only global b8 " META_NAMESPACE_LOWER "_shader_primitive_is_vertex[] = {"));
+	for (da_count bs = 0; bs < ctx->base_shader_count; bs++) {
+		MetaShader *s = &ctx->entities.data[ctx->base_shader_ids[bs]].shader;
+		b8 vertex = s->kind == MetaShaderKind_Render && s->render.kind == MetaShaderPrimitiveKind_Vertex;
+		meta_push_line(m, vertex ? s8("1,") : s8("0,"));
+	}
+	meta_end_scope(m, s8("};\n"));
 }
 
 function void
@@ -3603,30 +3679,67 @@ meta_push_shader_bake(MetaprogramContext *m, MetaContext *ctx)
 
 		s8 shader_name = ctx->entity_names.data[ctx->base_shader_ids[bs]];
 
-		meta_begin_line(m, s8("read_only global u8 " META_NAMESPACE_LOWER  "_shader_"));
-		for (i64 i = 0; i < shader_name.len; i++)
-			stream_append_byte(&m->stream, ToLower(shader_name.data[i]));
-
-		meta_begin_scope(m, s8("_bytes[] = {")); {
-			Arena scratch = m->scratch;
-			s8 filename = push_s8_from_parts(&scratch, s8(OS_PATH_SEPARATOR), s8("shaders"), s->file);
-			s8 file     = read_entire_file((c8 *)filename.data, &scratch);
-			metagen_push_byte_array(m, file);
-		} meta_end_scope(m, s8("};\n"));
+		for EachElement(s->files, it) {
+			if (s->files[it].len > 0) {
+				meta_begin_line(m, s8("read_only global u8 " META_NAMESPACE_LOWER  "_shader_"));
+				for (i64 i = 0; i < shader_name.len; i++)
+					stream_append_byte(&m->stream, ToLower(shader_name.data[i]));
+
+				if (s->kind == MetaShaderKind_Render)
+					meta_push(m, it == 0 ? s8("_primitive") : s8("_fragment"));
+
+				meta_begin_scope(m, s8("_bytes[] = {")); {
+					Arena scratch = m->scratch;
+					s8 filename = push_s8_from_parts(&scratch, s8(OS_PATH_SEPARATOR), s8("shaders"), s->files[it]);
+					s8 file     = read_entire_file((c8 *)filename.data, &scratch);
+					metagen_push_byte_array(m, file);
+				} meta_end_scope(m, s8("};\n"));
+			}
+		}
 	}
 
-	meta_begin_scope(m, s8("read_only global s8 " META_NAMESPACE_LOWER "_shader_data[] = {")); {
+	meta_begin_scope(m, s8("read_only global s8 *" META_NAMESPACE_LOWER "_shader_data[] = {")); {
 		for (da_count bs = 0; bs < ctx->base_shader_count; bs++) {
+			MetaShader *s = &ctx->entities.data[ctx->base_shader_ids[bs]].shader;
+
 			s8 shader_name = ctx->entity_names.data[ctx->base_shader_ids[bs]];
 
-			meta_begin_line(m, s8("{.data = " META_NAMESPACE_LOWER "_shader_"));
-			for (iz i = 0; i < shader_name.len; i++)
+			if (s->kind == MetaShaderKind_Render) {
+				meta_begin_scope(m, s8("(s8 []){"));
+				meta_indent(m);
+			} else {
+				meta_begin_line(m,  s8("(s8 []){"));
+			}
+
+			meta_push(m, s8("{.data = " META_NAMESPACE_LOWER "_shader_"));
+			for (i64 i = 0; i < shader_name.len; i++)
 				stream_append_byte(&m->stream, ToLower(shader_name.data[i]));
 
+			if (s->kind == MetaShaderKind_Render)
+				meta_push(m, s8("_primitive"));
+
 			meta_push(m, s8("_bytes, .len = countof(" META_NAMESPACE_LOWER "_shader_"));
-			for (iz i = 0; i < shader_name.len; i++)
+			for (i64 i = 0; i < shader_name.len; i++)
 				stream_append_byte(&m->stream, ToLower(shader_name.data[i]));
-			meta_end_line(m,  s8("_bytes)},"));
+
+			if (s->kind == MetaShaderKind_Render)
+				meta_push(m, s8("_primitive"));
+			meta_push(m, s8("_bytes)}"));
+
+			if (s->kind == MetaShaderKind_Render) {
+				meta_end_line(m, s8(","));
+				meta_begin_line(m, s8("{.data = " META_NAMESPACE_LOWER "_shader_"));
+				for (i64 i = 0; i < shader_name.len; i++)
+					stream_append_byte(&m->stream, ToLower(shader_name.data[i]));
+
+				meta_push(m, s8("_fragment_bytes, .len = countof(" META_NAMESPACE_LOWER "_shader_"));
+				for (i64 i = 0; i < shader_name.len; i++)
+					stream_append_byte(&m->stream, ToLower(shader_name.data[i]));
+				meta_end_line(m, s8("_fragment_bytes)}"));
+			}
+
+			if (s->kind == MetaShaderKind_Render) meta_end_scope(m, s8("},"));
+			else                                  meta_end_line(m,  s8("},"));
 		}
 	} meta_end_scope(m, s8("};\n"));
 }
@@ -3662,7 +3775,9 @@ metagen_emit_c_code(MetaContext *ctx, Arena arena)
 		u32 dep_count = 0;
 		for (da_count bs = 0; bs < ctx->base_shader_count; bs++) {
 			MetaShader *s = &ctx->entities.data[ctx->base_shader_ids[bs]].shader;
-			deps[dep_count++] = (c8 *)push_s8_from_parts(&m->scratch, s8(OS_PATH_SEPARATOR), s8("shaders"), s->file).data;
+			deps[dep_count++] = (c8 *)push_s8_from_parts(&m->scratch, s8(OS_PATH_SEPARATOR), s8("shaders"), s->files[0]).data;
+			if (s->files[1].len > 0)
+				deps[dep_count++] = (c8 *)push_s8_from_parts(&m->scratch, s8(OS_PATH_SEPARATOR), s8("shaders"), s->files[1]).data;
 		}
 		if (needs_rebuild_(out_shaders, deps, dep_count)) {
 			build_log_generate("Bake Shaders");
@@ -3939,6 +4054,19 @@ metagen_emit_c_code(MetaContext *ctx, Arena arena)
 		}
 	} meta_end_scope(m, s8("};\n"));
 
+	// NOTE: emits one table entry per base shader: sizeof() of the shader's generated
+	// PushConstants struct when the shader entity has a PushConstants child, otherwise 0.
+	// NOTE(review): the emitted table uses u8 elements, so a PushConstants struct of
+	// 256 bytes or more would not fit - confirm sizes stay below that or widen the type.
+	meta_begin_scope(m, s8("read_only global u8 " META_NAMESPACE_LOWER "_shader_push_constant_sizes[] = {"));
+	for (da_count bs = 0; bs < ctx->base_shader_count; bs++) {
+		da_count    id = ctx->base_shader_ids[bs];
+		MetaEntity *e  = ctx->entities.data + id;
+		MetaEntityID pc_id = meta_entity_first_child_of_kind(ctx, e, MetaEntityKind_PushConstants);
+		if (pc_id.value != 0) {
+			meta_push_line(m, s8("sizeof(" META_NAMESPACE_UPPER), ctx->entity_names.data[id], s8("PushConstants),"));
+		} else {
+			meta_push_line(m, s8("0,"));
+		}
+	}
+	meta_end_scope(m, s8("};\n"));
+
 	//fprintf(stderr, "%.*s\n", (i32)m.stream.widx, m.stream.data);
 
 	result = meta_write_and_reset(m, out_meta);
@@ -4734,7 +4862,7 @@ metagen_load_context(Arena *arena, char *filename)
 	{
 		for (da_count shader = 0; shader < ctx->entity_kind_counts[MetaEntityKind_Shader]; shader++) {
 			MetaEntity *e = ctx->entities.data + ctx->entity_kind_ids[MetaEntityKind_Shader][shader];
-			if (e->shader.file.len > 0)
+			if (e->shader.files[0].len > 0)
 				ctx->base_shader_count++;
 		}
 
@@ -4744,14 +4872,14 @@ metagen_load_context(Arena *arena, char *filename)
 		da_count base_shader_ids_index = 0;
 		for (da_count shader = 0; shader < ctx->entity_kind_counts[MetaEntityKind_Shader]; shader++) {
 			da_count id = ctx->entity_kind_ids[MetaEntityKind_Shader][shader];
-			if (ctx->entities.data[id].shader.file.len > 0)
+			if (ctx->entities.data[id].shader.files[0].len > 0)
 				ctx->base_shader_ids[base_shader_ids_index++] = id;
 		}
 
 		// NOTE(rnp): first pass to resolve real shaders
 		for (da_count shader = 0; shader < ctx->entity_kind_counts[MetaEntityKind_Shader]; shader++) {
 			da_count id = ctx->entity_kind_ids[MetaEntityKind_Shader][shader];
-			if (ctx->entities.data[id].shader.file.len > 0) {
+			if (ctx->entities.data[id].shader.files[0].len > 0) {
 				ctx->base_shader_id_map[shader] = meta_lookup_id_slow(ctx->base_shader_ids,
 				                                                      ctx->base_shader_count,
 				                                                      id);
diff --git a/external/include/raylib_extended.h b/external/include/raylib_extended.h
@@ -1,2 +0,0 @@
-#include "../raylib/src/raylib.h"
-RLAPI void *GetPlatformWindowHandle(void);
diff --git a/external/rcore_extended.c b/external/rcore_extended.c
@@ -1,8 +0,0 @@
-/* NOTE(rnp): hacky stuff to work around broken raylib garbage */
-#include <raylib_extended.h>
-#include "raylib/src/rcore.c"
-
-void *GetPlatformWindowHandle(void)
-{
-	return (void *)platform.handle;
-}
diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c
@@ -4,7 +4,7 @@
 
 // NOTE: Constants (Integer)
 #define BeamformerFilterSlots              (4)
-#define BeamformerMaxBacklogFrames         (16)
+#define BeamformerMaxBacklogFrames         (4096)
 #define BeamformerMaxChannelCount          (256)
 #define BeamformerMaxEmissionsCount        (256)
 #define BeamformerMaxComputeShaderStages   (16)
@@ -84,23 +84,31 @@ typedef enum {
 } BeamformerAcquisitionKind;
 
 typedef enum {
-	BeamformerShaderKind_CudaDecode  = 0,
-	BeamformerShaderKind_CudaHilbert = 1,
-	BeamformerShaderKind_Decode      = 2,
-	BeamformerShaderKind_Filter      = 3,
-	BeamformerShaderKind_Demodulate  = 4,
-	BeamformerShaderKind_DAS         = 5,
-	BeamformerShaderKind_MinMax      = 6,
-	BeamformerShaderKind_Sum         = 7,
-	BeamformerShaderKind_Render3D    = 8,
+	BeamformerShaderKind_CudaDecode         = 0,
+	BeamformerShaderKind_CudaHilbert        = 1,
+	BeamformerShaderKind_Decode             = 2,
+	BeamformerShaderKind_Filter             = 3,
+	BeamformerShaderKind_Demodulate         = 4,
+	BeamformerShaderKind_DAS                = 5,
+	BeamformerShaderKind_Sum                = 6,
+	BeamformerShaderKind_MinMax             = 7,
+	BeamformerShaderKind_CoherencyWeighting = 8,
+	BeamformerShaderKind_BufferClear        = 9,
+	BeamformerShaderKind_RenderBeamformed   = 10,
 	BeamformerShaderKind_Count,
 
-	BeamformerShaderKind_ComputeFirst = BeamformerShaderKind_CudaDecode,
-	BeamformerShaderKind_ComputeLast  = BeamformerShaderKind_Sum,
-	BeamformerShaderKind_ComputeCount = 8,
-	BeamformerShaderKind_RenderFirst  = BeamformerShaderKind_Render3D,
-	BeamformerShaderKind_RenderLast   = BeamformerShaderKind_Render3D,
-	BeamformerShaderKind_RenderCount  = 1,
+	BeamformerShaderKind_ComputeFirst         = BeamformerShaderKind_CudaDecode,
+	BeamformerShaderKind_ComputeLast          = BeamformerShaderKind_MinMax,
+	BeamformerShaderKind_ComputeCount         = 8,
+	BeamformerShaderKind_ComputeHelpersFirst  = BeamformerShaderKind_CoherencyWeighting,
+	BeamformerShaderKind_ComputeHelpersLast   = BeamformerShaderKind_CoherencyWeighting,
+	BeamformerShaderKind_ComputeHelpersCount  = 1,
+	BeamformerShaderKind_ComputeInternalFirst = BeamformerShaderKind_BufferClear,
+	BeamformerShaderKind_ComputeInternalLast  = BeamformerShaderKind_BufferClear,
+	BeamformerShaderKind_ComputeInternalCount = 1,
+	BeamformerShaderKind_RenderFirst          = BeamformerShaderKind_RenderBeamformed,
+	BeamformerShaderKind_RenderLast           = BeamformerShaderKind_RenderBeamformed,
+	BeamformerShaderKind_RenderCount          = 1,
 } BeamformerShaderKind;
 
 typedef struct {
@@ -141,7 +149,6 @@ typedef struct {
 	u32 coherency_weighting;
 	u32 single_focus;
 	u32 single_orientation;
-	u32 fast;
 	u32 sparse;
 	u32 acquisition_count;
 	u32 acquisition_kind;
@@ -159,12 +166,78 @@ typedef struct {
 } BeamformerDASBakeParameters;
 
 typedef struct {
-	m4 xdc_transform;
-	m4 voxel_transform;
-	v2 xdc_element_pitch;
+	u32 data_kind;
+} BeamformerCoherencyWeightingBakeParameters;
+
+typedef struct {
+	u64 hadamard_buffer;
+	u64 rf_buffer;
+	u64 output_buffer;
+	u64 output_rf_buffer;
+	b32 first_pass;
+} BeamformerDecodePushConstants;
+
+typedef struct {
+	u64 input_data;
+	u64 output_data;
+	u64 filter_coefficients;
+} BeamformerFilterPushConstants;
+
+typedef struct {
+	m4  xdc_transform;
+	m4  voxel_transform;
+	v2  xdc_element_pitch;
+	u64 rf_data;
+	u64 output_data;
+	u64 incoherent_output;
+	u64 array_parameters;
+	u32 output_size_x;
+	u32 output_size_y;
+	u32 output_size_z;
+	u32 cycle_t;
+	i32 channel_t;
 } BeamformerDASPushConstants;
 
 typedef struct {
+	u64 output_data;
+	u64 input_data;
+	u32 image_elements;
+	f32 scale;
+} BeamformerSumPushConstants;
+
+typedef struct {
+	u64 left_side_buffer;
+	u64 right_side_buffer;
+	u32 elements;
+	f32 scale;
+	u32 output_size_x;
+	u32 output_size_y;
+	u32 output_size_z;
+} BeamformerCoherencyWeightingPushConstants;
+
+typedef struct {
+	u64 data;
+	u32 clear_word;
+	u32 words;
+} BeamformerBufferClearPushConstants;
+
+typedef struct {
+	m4  mvp_matrix;
+	u64 positions;
+	u64 normals;
+	v4  bounding_box_colour;
+	f32 bounding_box_fraction;
+	f32 db_cutoff;
+	f32 threshold;
+	f32 gamma;
+	u64 input_data;
+	u32 input_size_x;
+	u32 input_size_y;
+	u32 input_size_z;
+	u32 data_kind;
+} BeamformerRenderBeamformedPushConstants;
+
+typedef struct {
 	f32 cycles;
 	f32 frequency;
 } BeamformerSineParameters;
@@ -304,10 +377,17 @@ typedef struct {
 	BeamformerDataKind           data_kind;
 } BeamformerSimpleParameters;
 
+typedef struct {
+	v2  focal_vectors[BeamformerMaxChannelCount];
+	i16 sparse_elements[BeamformerMaxChannelCount];
+	u16 transmit_receive_orientations[BeamformerMaxChannelCount];
+} BeamformerDASArrayParameters;
+
 typedef union {
-	BeamformerDecodeBakeParameters Decode;
-	BeamformerFilterBakeParameters Filter;
-	BeamformerDASBakeParameters    DAS;
+	BeamformerDecodeBakeParameters             Decode;
+	BeamformerFilterBakeParameters             Filter;
+	BeamformerDASBakeParameters                DAS;
+	BeamformerCoherencyWeightingBakeParameters CoherencyWeighting;
 } BeamformerShaderBakeParameters;
 
 read_only global u8 beamformer_data_kind_element_size[] = {
@@ -399,27 +479,33 @@ read_only global s8 beamformer_shader_names[] = {
 	s8_comp("Filter"),
 	s8_comp("Demodulate"),
 	s8_comp("DAS"),
-	s8_comp("MinMax"),
 	s8_comp("Sum"),
-	s8_comp("Render3D"),
+	s8_comp("MinMax"),
+	s8_comp("CoherencyWeighting"),
+	s8_comp("BufferClear"),
+	s8_comp("RenderBeamformed"),
 };
 
 read_only global BeamformerShaderKind beamformer_reloadable_shader_kinds[] = {
 	BeamformerShaderKind_Decode,
 	BeamformerShaderKind_Filter,
 	BeamformerShaderKind_DAS,
-	BeamformerShaderKind_MinMax,
 	BeamformerShaderKind_Sum,
-	BeamformerShaderKind_Render3D,
+	BeamformerShaderKind_MinMax,
+	BeamformerShaderKind_CoherencyWeighting,
+	BeamformerShaderKind_BufferClear,
+	BeamformerShaderKind_RenderBeamformed,
 };
 
-read_only global s8 beamformer_reloadable_shader_files[] = {
-	s8_comp("decode.glsl"),
-	s8_comp("filter.glsl"),
-	s8_comp("das.glsl"),
-	s8_comp("min_max.glsl"),
-	s8_comp("sum.glsl"),
-	s8_comp("render_3d.frag.glsl"),
+read_only global s8 *beamformer_reloadable_shader_files[] = {
+	(s8 []){s8_comp("decode.glsl")},
+	(s8 []){s8_comp("filter.glsl")},
+	(s8 []){s8_comp("das.glsl")},
+	(s8 []){s8_comp("sum.glsl")},
+	(s8 []){s8_comp("min_max.glsl")},
+	(s8 []){s8_comp("coherency_weighting.glsl")},
+	(s8 []){s8_comp("buffer_clear.glsl")},
+	(s8 []){s8_comp("render_3d.vert.glsl"), s8_comp("render_3d.frag.glsl")},
 };
 
 read_only global i32 beamformer_shader_reloadable_index_by_shader[] = {
@@ -432,6 +518,8 @@ read_only global i32 beamformer_shader_reloadable_index_by_shader[] = {
 	3,
 	4,
 	5,
+	6,
+	7,
 };
 
 read_only global i32 beamformer_reloadable_compute_shader_info_indices[] = {
@@ -442,10 +530,18 @@ read_only global i32 beamformer_reloadable_compute_shader_info_indices[] = {
 	4,
 };
 
-read_only global i32 beamformer_reloadable_render_shader_info_indices[] = {
+read_only global i32 beamformer_reloadable_compute_helpers_shader_info_indices[] = {
 	5,
 };
 
+read_only global i32 beamformer_reloadable_compute_internal_shader_info_indices[] = {
+	6,
+};
+
+read_only global i32 beamformer_reloadable_render_shader_info_indices[] = {
+	7,
+};
+
 read_only global s8 beamformer_shader_global_header_strings[] = {
 	s8_comp(""
 	"#define DataKind_Int16          0\n"
@@ -460,6 +556,23 @@ read_only global s8 beamformer_shader_global_header_strings[] = {
 	"#define DecodeMode_Hadamard 1\n"
 	"\n"),
 	s8_comp(""
+	"layout(push_constant, std430) uniform PushConstants {\n"
+	"  uint64_t hadamard_buffer;\n"
+	"  uint64_t rf_buffer;\n"
+	"  uint64_t output_buffer;\n"
+	"  uint64_t output_rf_buffer;\n"
+	"  bool     first_pass;\n"
+	"};\n"
+	"\n"),
+	s8_comp(""
+	"layout(push_constant, std430) uniform PushConstants {\n"
+	"  uint64_t input_data;\n"
+	"  uint64_t output_data;\n"
+	"  uint64_t filter_coefficients;\n"
+	"};\n"
+	"\n"),
+	s8_comp("#define MaxChannelCount (256)\n\n"),
+	s8_comp(""
 	"#define AcquisitionKind_FORCES         0\n"
 	"#define AcquisitionKind_UFORCES        1\n"
 	"#define AcquisitionKind_HERCULES       2\n"
@@ -484,30 +597,115 @@ read_only global s8 beamformer_shader_global_header_strings[] = {
 	"#define RCAOrientation_Columns 2\n"
 	"\n"),
 	s8_comp(""
-	"layout(std140, binding = 0) uniform PushConstants {\n"
-	"  mat4 xdc_transform;\n"
-	"  mat4 voxel_transform;\n"
-	"  vec2 xdc_element_pitch;\n"
+	"struct DASArrayParameters {\n"
+	"  f32vec2  focal_vectors[MaxChannelCount];\n"
+	"  int16_t  sparse_elements[MaxChannelCount];\n"
+	"  uint16_t transmit_receive_orientations[MaxChannelCount];\n"
+	"};\n"
+	"\n"),
+	s8_comp(""
+	"layout(push_constant, std430) uniform PushConstants {\n"
+	"  f32mat4  xdc_transform;\n"
+	"  f32mat4  voxel_transform;\n"
+	"  f32vec2  xdc_element_pitch;\n"
+	"  uint64_t rf_data;\n"
+	"  uint64_t output_data;\n"
+	"  uint64_t incoherent_output;\n"
+	"  uint64_t array_parameters;\n"
+	"  uint32_t output_size_x;\n"
+	"  uint32_t output_size_y;\n"
+	"  uint32_t output_size_z;\n"
+	"  uint32_t cycle_t;\n"
+	"  int32_t  channel_t;\n"
+	"};\n"
+	"\n"),
+	s8_comp(""
+	"layout(push_constant, std430) uniform PushConstants {\n"
+	"  uint64_t  output_data;\n"
+	"  uint64_t  input_data;\n"
+	"  uint32_t  image_elements;\n"
+	"  float32_t scale;\n"
+	"};\n"
+	"\n"),
+	s8_comp(""
+	"layout(push_constant, std430) uniform PushConstants {\n"
+	"  uint64_t  left_side_buffer;\n"
+	"  uint64_t  right_side_buffer;\n"
+	"  uint32_t  elements;\n"
+	"  float32_t scale;\n"
+	"  uint32_t  output_size_x;\n"
+	"  uint32_t  output_size_y;\n"
+	"  uint32_t  output_size_z;\n"
+	"};\n"
+	"\n"),
+	s8_comp(""
+	"layout(push_constant, std430) uniform PushConstants {\n"
+	"  uint64_t data;\n"
+	"  uint32_t clear_word;\n"
+	"  uint32_t words;\n"
+	"};\n"
+	"\n"),
+	s8_comp(""
+	"layout(push_constant, std430) uniform PushConstants {\n"
+	"  f32mat4   mvp_matrix;\n"
+	"  uint64_t  positions;\n"
+	"  uint64_t  normals;\n"
+	"  f32vec4   bounding_box_colour;\n"
+	"  float32_t bounding_box_fraction;\n"
+	"  float32_t db_cutoff;\n"
+	"  float32_t threshold;\n"
+	"  float32_t gamma;\n"
+	"  uint64_t  input_data;\n"
+	"  uint32_t  input_size_x;\n"
+	"  uint32_t  input_size_y;\n"
+	"  uint32_t  input_size_z;\n"
+	"  uint32_t  data_kind;\n"
 	"};\n"
 	"\n"),
 };
 
-read_only global i32 *beamformer_shader_header_vectors[] = {
-	(i32 []){0, 1},
-	(i32 []){0},
-	(i32 []){2, 0, 3, 4, 5},
+read_only global b8 beamformer_shader_has_primitive[] = {
+	0,
 	0,
 	0,
 	0,
+	0,
+	0,
+	0,
+	1,
 };
 
-read_only global i32 beamformer_shader_header_vector_lengths[] = {
-	2,
-	1,
-	5,
+read_only global b8 beamformer_shader_primitive_is_vertex[] = {
+	0,
+	0,
+	0,
 	0,
 	0,
 	0,
+	0,
+	1,
+};
+
+read_only global i32 *beamformer_shader_header_vectors[] = {
+	(i32 []){0, 1, 2},
+	(i32 []){0, 3},
+	(i32 []){4, 5, 0, 6, 7, 8, 9},
+	(i32 []){0, 10},
+	0,
+	(i32 []){0, 11},
+	(i32 []){12},
+	(i32 []){0, 13},
+};
+
+read_only global i32 beamformer_shader_header_vector_lengths[] = {
+	3,
+	2,
+	7,
+	2,
+	0,
+	2,
+	1,
+	2,
 };
 
 read_only global s8 *beamformer_shader_bake_parameter_names[] = {
@@ -547,7 +745,6 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
 		s8_comp("CoherencyWeighting"),
 		s8_comp("SingleFocus"),
 		s8_comp("SingleOrientation"),
-		s8_comp("Fast"),
 		s8_comp("Sparse"),
 		s8_comp("AcquisitionCount"),
 		s8_comp("AcquisitionKind"),
@@ -565,13 +762,19 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
 	},
 	0,
 	0,
+	(s8 []){
+		s8_comp("DataKind"),
+	},
+	0,
 	0,
 };
 
 read_only global u32 beamformer_shader_bake_parameter_float_bits[] = {
 	0x00000000UL,
 	0x00006000UL,
-	0x0007f000UL,
+	0x0003f800UL,
+	0x00000000UL,
+	0x00000000UL,
 	0x00000000UL,
 	0x00000000UL,
 	0x00000000UL,
@@ -580,9 +783,22 @@ read_only global u32 beamformer_shader_bake_parameter_float_bits[] = {
 read_only global u8 beamformer_shader_bake_parameter_counts[] = {
 	12,
 	15,
-	19,
+	18,
 	0,
 	0,
+	1,
+	0,
+	0,
+};
+
+read_only global u8 beamformer_shader_push_constant_sizes[] = {
+	sizeof(BeamformerDecodePushConstants),
+	sizeof(BeamformerFilterPushConstants),
+	sizeof(BeamformerDASPushConstants),
+	sizeof(BeamformerSumPushConstants),
 	0,
+	sizeof(BeamformerCoherencyWeightingPushConstants),
+	sizeof(BeamformerBufferClearPushConstants),
+	sizeof(BeamformerRenderBeamformedPushConstants),
 };
 
diff --git a/lib/ogl_beamformer_lib.c b/lib/ogl_beamformer_lib.c
@@ -229,6 +229,15 @@ beamformer_get_last_error_string(void)
 	return beamformer_error_string(beamformer_get_last_error());
 }
 
+u64
+beamformer_maximum_frame_size(void)
+{
+	/* NOTE: U64_MAX is reported when the shared memory check fails */
+	if (!check_shared_memory())
+		return U64_MAX;
+	return g_beamformer_library_context.bp->max_beamformed_data_size;
+}
+
 void
 beamformer_set_global_timeout(u32 timeout_ms)
 {
@@ -650,12 +659,14 @@ beamformer_beamform_data(BeamformerSimpleParameters *bp, void *data, uint32_t da
 			complex |= shader == BeamformerShaderKind_Demodulate || shader == BeamformerShaderKind_CudaHilbert;
 		}
 
-		iz output_size = output_points.x * output_points.y * output_points.z * (i32)sizeof(f32);
+		u64 output_size = output_points.x * output_points.y * output_points.z * sizeof(f32);
 		if (complex) output_size *= 2;
 
+		result = lib_error_check(output_size <= g_beamformer_library_context.bp->max_beamformed_data_size, FrameSizeOverflow);
+
 		Arena scratch = beamformer_shared_memory_scratch_arena(g_beamformer_library_context.bp,
 		                                                       g_beamformer_library_context.shared_memory_size);
-		if (out_data) result &= lib_error_check(output_size <= arena_capacity(&scratch, u8), ExportSpaceOverflow);
+		if (result && out_data) result &= lib_error_check((iz)output_size <= arena_capacity(&scratch, u8), ExportSpaceOverflow);
 
 		if (result) {
 			result = beamformer_push_data_with_compute(data, data_size, 0, 0);
diff --git a/lib/ogl_beamformer_lib_base.h b/lib/ogl_beamformer_lib_base.h
@@ -27,6 +27,7 @@
 	X(ExportSpaceOverflow,          16, "not enough space for data export")                  \
 	X(SharedMemory,                 17, "failed to open shared memory region")               \
 	X(SyncVariable,                 18, "failed to acquire lock within timeout period")      \
+	X(FrameSizeOverflow,            19, "maximum frame size exceeded")                       \
 
 #define X(type, num, string) BeamformerLibErrorKind_##type = num,
 typedef enum {BEAMFORMER_LIB_ERRORS} BeamformerLibErrorKind;
@@ -38,6 +39,9 @@ BEAMFORMER_LIB_EXPORT BeamformerLibErrorKind beamformer_get_last_error(void);
 BEAMFORMER_LIB_EXPORT const char *beamformer_get_last_error_string(void);
 BEAMFORMER_LIB_EXPORT const char *beamformer_error_string(BeamformerLibErrorKind kind);
 
+// NOTE: returns U64_MAX if shared memory could not be opened
+BEAMFORMER_LIB_EXPORT uint64_t beamformer_maximum_frame_size(void);
+
 ///////////////////////////
 // NOTE: Simple API
 /* Usage:
diff --git a/main_linux.c b/main_linux.c
@@ -252,16 +252,7 @@ load_platform_libraries(BeamformerInput *input)
 	#if BEAMFORMER_RENDERDOC_HOOKS
 	local_persist OSLibrary renderdoc_handle = {OSInvalidHandleValue};
 	renderdoc_handle = load_library(OS_RENDERDOC_SONAME, 0, RTLD_NOW|RTLD_LOCAL|RTLD_NOLOAD);
-	if ValidHandle(renderdoc_handle) {
-		renderdoc_get_api_fn *get_api = os_lookup_symbol(renderdoc_handle, "RENDERDOC_GetAPI");
-		if (get_api) {
-			RenderDocAPI *api = 0;
-			if (get_api(10600, (void **)&api)) {
-				input->renderdoc_start_frame_capture = RENDERDOC_START_FRAME_CAPTURE(api);
-				input->renderdoc_end_frame_capture   = RENDERDOC_END_FRAME_CAPTURE(api);
-			}
-		}
-	}
+	load_renderdoc_functions(input, renderdoc_handle);
 	#endif
 }
 
diff --git a/main_w32.c b/main_w32.c
@@ -301,16 +301,7 @@ load_platform_libraries(BeamformerInput *input)
 	#if BEAMFORMER_RENDERDOC_HOOKS
 	local_persist OSLibrary renderdoc_handle = {OSInvalidHandleValue};
 	renderdoc_handle = get_module(OS_RENDERDOC_SONAME);
-	if ValidHandle(renderdoc_handle) {
-		renderdoc_get_api_fn *get_api = os_lookup_symbol(renderdoc_handle, "RENDERDOC_GetAPI");
-		if (get_api) {
-			RenderDocAPI *api = 0;
-			if (get_api(10600, (void **)&api)) {
-				input->renderdoc_start_frame_capture = RENDERDOC_START_FRAME_CAPTURE(api);
-				input->renderdoc_end_frame_capture   = RENDERDOC_END_FRAME_CAPTURE(api);
-			}
-		}
-	}
+	load_renderdoc_functions(input, renderdoc_handle);
 	#endif
 }
 
diff --git a/math.c b/math.c
@@ -153,20 +153,6 @@ subrange_n_from_n_m_count(u64 n, u64 n_count, u64 m)
 	return result;
 }
 
-function b32
-iv2_equal(iv2 a, iv2 b)
-{
-	b32 result = a.x == b.x && a.y == b.y;
-	return result;
-}
-
-function b32
-iv3_equal(iv3 a, iv3 b)
-{
-	b32 result = a.x == b.x && a.y == b.y && a.z == b.z;
-	return result;
-}
-
 function i32
 iv3_dimension(iv3 points)
 {
@@ -574,12 +560,12 @@ function m4
 perspective_projection(f32 n, f32 f, f32 fov, f32 aspect)
 {
 	m4 result;
-	f32 t = tan_f32(fov / 2.0f);
+	f32 t = n * tan_f32(fov / 2.0f);
 	f32 r = t * aspect;
 	f32 a = -(f + n) / (f - n);
 	f32 b = -2 * f * n / (f - n);
-	result.c[0] = (v4){{1 / r, 0,     0,  0}};
-	result.c[1] = (v4){{0,     1 / t, 0,  0}};
+	result.c[0] = (v4){{n / r, 0,     0,  0}};
+	result.c[1] = (v4){{0,     n / t, 0,  0}};
 	result.c[2] = (v4){{0,     0,     a, -1}};
 	result.c[3] = (v4){{0,     0,     b,  0}};
 	return result;
diff --git a/opengl.h b/opengl.h
@@ -11,154 +11,63 @@
 #include <GL/gl.h>
 
 /* NOTE: do not add extra 0s to these, even at the start -> garbage compilers will complain */
-#define GL_DYNAMIC_STORAGE_BIT             0x0100
 #define GL_SHADER_IMAGE_ACCESS_BARRIER_BIT 0x00000020
 #define GL_TEXTURE_UPDATE_BARRIER_BIT      0x00000100
-#define GL_SHADER_STORAGE_BARRIER_BIT      0x00002000
 
-#define GL_UNSIGNED_INT_8_8_8_8            0x8035
-#define GL_TEXTURE_3D                      0x806F
-#define GL_MAX_3D_TEXTURE_SIZE             0x8073
-#define GL_MULTISAMPLE                     0x809D
+#define GL_NONE                            0
+
 #define GL_CLAMP_TO_BORDER                 0x812D
-#define GL_CLAMP_TO_EDGE                   0x812F
-#define GL_DEPTH_COMPONENT24               0x81A6
-#define GL_MAJOR_VERSION                   0x821B
-#define GL_MINOR_VERSION                   0x821C
-#define GL_RG                              0x8227
-#define GL_R16F                            0x822D
-#define GL_R32F                            0x822E
 #define GL_RG32F                           0x8230
-#define GL_R8I                             0x8231
-#define GL_R16I                            0x8233
-#define GL_MAX_COMPUTE_SHARED_MEMORY_SIZE  0x8262
-#define GL_BUFFER                          0x82E0
-#define GL_PROGRAM                         0x82E2
-#define GL_MIRRORED_REPEAT                 0x8370
-#define GL_QUERY_RESULT                    0x8866
 #define GL_READ_ONLY                       0x88B8
 #define GL_WRITE_ONLY                      0x88B9
 #define GL_READ_WRITE                      0x88BA
-#define GL_TIME_ELAPSED                    0x88BF
-#define GL_STATIC_DRAW                     0x88E4
-#define GL_UNIFORM_BUFFER                  0x8A11
-#define GL_MAX_UNIFORM_BLOCK_SIZE          0x8A30
-#define GL_FRAGMENT_SHADER                 0x8B30
-#define GL_VERTEX_SHADER                   0x8B31
-#define GL_COMPILE_STATUS                  0x8B81
-#define GL_LINK_STATUS                     0x8B82
-#define GL_INFO_LOG_LENGTH                 0x8B84
-#define GL_MAX_TEXTURE_BUFFER_SIZE         0x8C2B
-#define GL_COLOR_ATTACHMENT0               0x8CE0
-#define GL_DEPTH_ATTACHMENT                0x8D00
-#define GL_FRAMEBUFFER                     0x8D40
-#define GL_RENDERBUFFER                    0x8D41
-#define GL_RED_INTEGER                     0x8D94
-#define GL_TIMESTAMP                       0x8E28
-#define GL_MIN_MAP_BUFFER_ALIGNMENT        0x90BC
-#define GL_SHADER_STORAGE_BUFFER           0x90D2
-#define GL_MAX_SHADER_STORAGE_BLOCK_SIZE   0x90DE
-#define GL_MAX_SERVER_WAIT_TIMEOUT         0x9111
-#define GL_SYNC_GPU_COMMANDS_COMPLETE      0x9117
-#define GL_TIMEOUT_EXPIRED                 0x911B
-#define GL_WAIT_FAILED                     0x911D
-#define GL_TEXTURE_BUFFER_OFFSET_ALIGNMENT 0x919F
-#define GL_COMPUTE_SHADER                  0x91B9
 #define GL_DEBUG_OUTPUT                    0x92E0
 
+#define GL_DEDICATED_MEMORY_OBJECT_EXT     0x9581
 #define GL_HANDLE_TYPE_OPAQUE_FD_EXT       0x9586
 #define GL_HANDLE_TYPE_OPAQUE_WIN32_EXT    0x9587
+#define GL_LAYOUT_COLOR_ATTACHMENT_EXT     0x958E
+#define GL_LAYOUT_SHADER_READ_ONLY_EXT     0x9591
 
 typedef char GLchar;
 typedef i64  GLsizeiptr;
 typedef i64  GLintptr;
 typedef u64  GLuint64;
-typedef struct __GLsync *GLsync;
 
 /* X(name, ret, params) */
 #define OGLProcedureList \
-	X(glAttachShader,                        void,   (GLuint program, GLuint shader)) \
-	X(glBeginQuery,                          void,   (GLenum target, GLuint id)) \
-	X(glBindBufferBase,                      void,   (GLenum target, GLuint index, GLuint buffer)) \
-	X(glBindBufferRange,                     void,   (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size)) \
-	X(glBindFramebuffer,                     void,   (GLenum target, GLuint framebuffer)) \
 	X(glBindImageTexture,                    void,   (GLuint unit, GLuint texture, GLint level, GLboolean layered, GLint layer, GLenum access, GLenum format)) \
-	X(glBindTextureUnit,                     void,   (GLuint unit, GLuint texture)) \
-	X(glBindVertexArray,                     void,   (GLuint array)) \
-	X(glBlitNamedFramebuffer,                void,   (GLuint sfb, GLuint dfb, GLint sx0, GLint sy0, GLint sx1, GLint sy1, GLint dx0, GLint dy0, GLint dx1, GLint dy1, GLbitfield mask, GLenum filter)) \
-	X(glClearNamedBufferData,                void,   (GLuint buffer, GLenum internalformat, GLenum format, GLenum type, const void *data)) \
 	X(glClearNamedFramebufferfv,             void,   (GLuint framebuffer, GLenum buffer, GLint drawbuffer, const GLfloat *value)) \
 	X(glClearTexImage,                       void,   (GLuint texture, GLint level, GLenum format, GLenum type, const void *data)) \
-	X(glClientWaitSync,                      GLenum, (GLsync sync, GLbitfield flags, GLuint64 timeout)) \
-	X(glCompileShader,                       void,   (GLuint shader)) \
-	X(glCopyImageSubData,                    void,   (GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth)) \
-	X(glCreateBuffers,                       void,   (GLsizei n, GLuint *buffers)) \
-	X(glCreateFramebuffers,                  void,   (GLsizei n, GLuint *ids)) \
-	X(glCreateProgram,                       GLuint, (void)) \
-	X(glCreateQueries,                       void,   (GLenum target, GLsizei n, GLuint *ids)) \
-	X(glCreateRenderbuffers,                 void,   (GLsizei n, GLuint *renderbuffers)) \
-	X(glCreateShader,                        GLuint, (GLenum shaderType)) \
 	X(glCreateTextures,                      void,   (GLenum target, GLsizei n, GLuint *textures)) \
-	X(glCreateVertexArrays,                  void,   (GLsizei n, GLuint *arrays)) \
 	X(glDebugMessageCallback,                void,   (void (*)(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar *message, const void *user), void *user)) \
-	X(glDeleteBuffers,                       void,   (GLsizei n, const GLuint *buffers)) \
-	X(glDeleteProgram,                       void,   (GLuint program)) \
-	X(glDeleteShader,                        void,   (GLuint shader)) \
-	X(glDeleteSync,                          void,   (GLsync sync)) \
 	X(glDispatchCompute,                     void,   (GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z)) \
-	X(glEndQuery,                            void,   (GLenum target)) \
-	X(glEnableVertexArrayAttrib,             void,   (GLuint vao, GLuint index)) \
-	X(glFenceSync,                           GLsync, (GLenum condition, GLbitfield flags)) \
-	X(glGenerateTextureMipmap,               void,   (GLuint texture)) \
-	X(glGetProgramInfoLog,                   void,   (GLuint program, GLsizei maxLength, GLsizei *length, GLchar *infoLog)) \
-	X(glGetProgramiv,                        void,   (GLuint program, GLenum pname, GLint *params)) \
-	X(glGetQueryObjectui64v,                 void,   (GLuint id, GLenum pname, GLuint64 *params)) \
-	X(glGetShaderInfoLog,                    void,   (GLuint shader, GLsizei maxLength, GLsizei *length, GLchar *infoLog)) \
-	X(glGetShaderiv,                         void,   (GLuint shader, GLenum pname, GLint *params)) \
-	X(glGetTextureImage,                     void,   (GLuint texture, GLint level, GLenum format, GLenum type, GLsizei bufSize, void *pixels)) \
-	X(glLinkProgram,                         void,   (GLuint program)) \
 	X(glMemoryBarrier,                       void,   (GLbitfield barriers)) \
-	X(glNamedBufferData,                     void,   (GLuint buffer, GLsizeiptr size, const void *data, GLenum usage)) \
-	X(glNamedBufferStorage,                  void,   (GLuint buffer, GLsizeiptr size, const void *data, GLbitfield flags)) \
-	X(glNamedBufferSubData,                  void,   (GLuint buffer, GLintptr offset, GLsizei size, const void *data)) \
-	X(glNamedFramebufferRenderbuffer,        void,   (GLuint fb, GLenum attachment, GLenum renderbuffertarget, GLuint rb)) \
-	X(glNamedFramebufferTexture,             void,   (GLuint fb, GLenum attachment, GLuint texture, GLint level)) \
-	X(glNamedRenderbufferStorageMultisample, void,   (GLuint rb, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height)) \
 	X(glObjectLabel,                         void,   (GLenum identifier, GLuint name, GLsizei length, const char *label)) \
-	X(glProgramUniform1f,                    void,   (GLuint program, GLint location, GLfloat v0)) \
-	X(glProgramUniform1i,                    void,   (GLuint program, GLint location, GLint v0)) \
-	X(glProgramUniform1ui,                   void,   (GLuint program, GLint location, GLuint v0)) \
-	X(glProgramUniform3iv,                   void,   (GLuint program, GLint location, GLsizei count, const GLint *value)) \
-	X(glProgramUniform4fv,                   void,   (GLuint program, GLint location, GLsizei count, const GLfloat *value)) \
-	X(glProgramUniformMatrix4fv,             void,   (GLuint program, GLint location, GLsizei count, GLboolean transpose, const GLfloat *value)) \
-	X(glQueryCounter,                        void,   (GLuint id, GLenum target)) \
-	X(glShaderSource,                        void,   (GLuint shader, GLsizei count, const GLchar **strings, const GLint *lengths)) \
 	X(glTextureParameteri,                   void,   (GLuint texture, GLenum pname, GLint param)) \
 	X(glTextureParameterfv,                  void,   (GLuint texture, GLenum pname, const GLfloat *param)) \
-	X(glTextureStorage1D,                    void,   (GLuint texture, GLsizei levels, GLenum internalformat, GLsizei width)) \
-	X(glTextureStorage2D,                    void,   (GLuint texture, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height)) \
-	X(glTextureStorage3D,                    void,   (GLuint texture, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth)) \
-	X(glTextureSubImage1D,                   void,   (GLuint texture, GLint level, GLint xoff, GLsizei width, GLenum format, GLenum type, const void *pix)) \
-	X(glTextureSubImage2D,                   void,   (GLuint texture, GLint level, GLint xoff, GLint yoff, GLsizei width, GLsizei height, GLenum format, GLenum type, const void *pix)) \
-	X(glTextureSubImage3D,                   void,   (GLuint texture, GLint level, GLint xoff, GLint yoff, GLint zoff, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const void *pix)) \
-	X(glUseProgram,                          void,   (GLuint program)) \
-	X(glVertexArrayAttribBinding,            void,   (GLuint vao, GLuint attribindex, GLuint bindingindex)) \
-	X(glVertexArrayAttribFormat,             void,   (GLuint vao, GLuint attribindex, GLint size, GLenum type, GLboolean normalized, GLuint relativeoffset)) \
-	X(glVertexArrayElementBuffer,            void,   (GLuint vao, GLuint buffer)) \
-	X(glVertexArrayVertexBuffer,             void,   (GLuint vao, GLuint bindingindex, GLuint buffer, GLintptr offset, GLsizei stride)) \
-
 
-#define OGLRequiredExtensionProcedureList \
+#define OGLRequiredExtensionProcedureListBase \
 	X(glCreateMemoryObjectsEXT,              void,   (GLsizei n, GLuint *memoryObjects)) \
 	X(glDeleteMemoryObjectsEXT,              void,   (GLsizei n, GLuint *memoryObjects)) \
 	X(glGenSemaphoresEXT,                    void,   (GLsizei n, GLuint *semaphores)) \
-	X(glImportMemoryFdEXT,                   void,   (GLuint memory, GLuint64 size, GLenum handleType, int fd)) \
+	X(glMemoryObjectParameterivEXT,          void,   (GLuint memoryObject, GLenum pname, const GLint *params)) \
+	X(glSignalSemaphoreEXT,                  void,   (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *dstLayouts)) \
+	X(glTextureStorageMem2DEXT,              void,   (GLuint texture, GLsizei levels, GLenum internalFormat, GLsizei width, GLsizei height, GLuint memory, GLuint64 offset)) \
+	X(glWaitSemaphoreEXT,                    void,   (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *srcLayouts)) \
+
+#define OGLRequiredExtensionProcedureListW32 \
 	X(glImportMemoryWin32HandleEXT,          void,   (GLuint memory, GLuint64 size, GLenum handleType, void *handle)) \
-	X(glImportSemaphoreFdEXT,                void,   (GLuint semaphore, GLenum handleType, int fd)) \
 	X(glImportSemaphoreWin32HandleEXT,       void,   (GLuint semaphore, GLenum handleType, void *handle)) \
-	X(glNamedBufferStorageMemEXT,            void,   (GLuint buffer, GLsizeiptr size, GLuint memory, GLuint64 offset)) \
-	X(glWaitSemaphoreEXT,                    void,   (GLuint semaphore, GLuint numBufferBarriers, const GLuint *buffers, GLuint numTextureBarriers, const GLuint *textures, const GLenum *srcLayouts)) \
 
+#define OGLRequiredExtensionProcedureListLinux \
+	X(glImportMemoryFdEXT,                   void,   (GLuint memory, GLuint64 size, GLenum handleType, int fd)) \
+	X(glImportSemaphoreFdEXT,                void,   (GLuint semaphore, GLenum handleType, int fd)) \
+
+#define OGLRequiredExtensionProcedureList \
+	OGLRequiredExtensionProcedureListBase \
+	OGLRequiredExtensionProcedureListW32 \
+	OGLRequiredExtensionProcedureListLinux \
 
 #define X(name, ret, params) typedef ret name##_fn params;
 OGLProcedureList
diff --git a/shaders/buffer_clear.glsl b/shaders/buffer_clear.glsl
@@ -0,0 +1,15 @@
+/* See LICENSE for license details. */
+/* NOTE: the memory at the `data` address viewed as an array of 32 bit words */
+layout(std430, buffer_reference, buffer_reference_align = 8) restrict writeonly buffer Buffer {
+	uint32_t values[];
+};
+
+/* NOTE: each invocation stores `clear_word` into the word selected by its global
+ * x index; invocations at or beyond `words` do nothing */
+void main()
+{
+	uint32_t index = gl_GlobalInvocationID.x;
+	if (index >= words)
+		return;
+	Buffer(data).values[index] = clear_word;
+}
diff --git a/shaders/coherency_weighting.glsl b/shaders/coherency_weighting.glsl
@@ -0,0 +1,50 @@
+/* See LICENSE for license details. */
+layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Int16 {
+	int16_t values[];
+};
+
+layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Int16Complex {
+	i16vec2 values[];
+};
+
+layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Float32 {
+	float values[];
+};
+
+layout(std430, buffer_reference, buffer_reference_align = 8) restrict buffer Float32Complex {
+	vec2 values[];
+};
+
+/* NOTE: coherent samples are read from and written back to left_side_buffer;
+ * incoherent samples are only read from right_side_buffer and are real for both kinds */
+#if   DataKind == DataKind_Float32
+  #define COHERENT_SAMPLE(index)    Float32(left_side_buffer).values[index]
+  #define INCOHERENT_SAMPLE(index)  Float32(right_side_buffer).values[index]
+#elif DataKind == DataKind_Float32Complex
+  #define COHERENT_SAMPLE(index)    Float32Complex(left_side_buffer).values[index]
+  #define INCOHERENT_SAMPLE(index)  Float32(right_side_buffer).values[index]
+#else
+  #error DataKind unsupported for CoherencyWeighting
+#endif
+
+/* NOTE: flat element index of voxel (x, y, z); x varies fastest, then y, then z */
+uint32_t output_index(uint32_t x, uint32_t y, uint32_t z)
+{
+	uint32_t result = output_size_x * output_size_y * z + output_size_x * y + x;
+	return result;
+}
+
+/* NOTE: one invocation per voxel; in bounds voxels are weighted in place:
+ * coherent = coherent * coherent / incoherent (component-wise for complex data) */
+void main()
+{
+	uvec3 out_voxel = gl_GlobalInvocationID;
+	if (!all(lessThan(out_voxel, uvec3(output_size_x, output_size_y, output_size_z))))
+		return;
+	uint32_t index = output_index(out_voxel.x, out_voxel.y, out_voxel.z);
+	/* NOTE: when the incoherent sample is 0 divide by 1 instead; otherwise the
+	 * division by zero would write Inf/NaN into the output */
+	float incoherent  = INCOHERENT_SAMPLE(index);
+	float denominator = incoherent + float(incoherent == 0);
+	COHERENT_SAMPLE(index) *= COHERENT_SAMPLE(index) / denominator;
+}
diff --git a/shaders/das.glsl b/shaders/das.glsl
@@ -1,48 +1,54 @@
 /* See LICENSE for license details. */
 #if   DataKind == DataKind_Float32
-  #define SAMPLE_TYPE           float
-  #define TEXTURE_KIND          r32f
-  #define RESULT_TYPE_CAST(a)   (a).x
-  #define OUTPUT_TYPE_CAST(a)   vec4((a).x, 0, 0, 0)
-  #if !Fast
-    #define RESULT_TYPE         vec2
-    #define RESULT_LAST_INDEX   1
+  #if CoherencyWeighting
+    #define RESULT_TYPE               vec2
+    #define RESULT_COHERENT_CAST(a)   (a).x
+    #define RESULT_INCOHERENT_CAST(a) (a).y
   #endif
+  #define SAMPLE_TYPE float
 #elif DataKind == DataKind_Float32Complex
-  #define SAMPLE_TYPE           vec2
-  #define TEXTURE_KIND          rg32f
-  #define RESULT_TYPE_CAST(a)   (a).xy
-  #define OUTPUT_TYPE_CAST(a)   vec4((a).xy, 0, 0)
-  #if !Fast
-    #define RESULT_TYPE         vec3
-    #define RESULT_LAST_INDEX   2
+  #if CoherencyWeighting
+    #define RESULT_TYPE               vec3
+    #define RESULT_COHERENT_CAST(a)   (a).xy
+    #define RESULT_INCOHERENT_CAST(a) (a).z
   #endif
+  #define SAMPLE_TYPE vec2
 #else
   #error DataKind unsupported for DAS
 #endif
 
-layout(std430, binding = 1) readonly restrict buffer buffer_1 {
-	SAMPLE_TYPE rf_data[];
-};
-
 #ifndef RESULT_TYPE
   #define RESULT_TYPE SAMPLE_TYPE
 #endif
 
-#if Fast
-  #define RESULT_STORE(a, length_a) RESULT_TYPE(a)
-	layout(TEXTURE_KIND, binding = 0)           restrict uniform image3D  u_out_data_tex;
+#ifndef RESULT_COHERENT_CAST
+  #define RESULT_COHERENT_CAST(a) (a)
+#endif
+
+#if CoherencyWeighting
+  #define RESULT_STORE(a) RESULT_TYPE(RESULT_COHERENT_CAST(a), length(a))
 #else
-  #define RESULT_STORE(a, length_a) RESULT_TYPE(a, length_a)
-	layout(TEXTURE_KIND, binding = 0) writeonly restrict uniform image3D  u_out_data_tex;
+  #define RESULT_STORE(a) (a)
 #endif
 
-layout(r16i,  binding = 1) readonly  restrict uniform iimage1D sparse_elements;
-layout(rg32f, binding = 2) readonly  restrict uniform image1D  focal_vectors;
-layout(r8i,   binding = 3) readonly  restrict uniform iimage1D transmit_receive_orientations;
+layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer RF {
+	SAMPLE_TYPE values[];
+};
+
+layout(std430, buffer_reference, buffer_reference_align = 64) restrict buffer Output {
+	SAMPLE_TYPE values[];
+};
 
-#define RX_ORIENTATION(tx_rx) (((tx_rx) >> 0) & 0x0F)
-#define TX_ORIENTATION(tx_rx) (((tx_rx) >> 4) & 0x0F)
+layout(std430, buffer_reference, buffer_reference_align = 64) restrict buffer IncoherentOutput {
+	float values[];
+};
+
+layout(std430, buffer_reference) restrict readonly buffer ArrayParameters {
+	DASArrayParameters data;
+};
+
+#define RX_ORIENTATION(tx_rx) bitfieldExtract((tx_rx), 0, 4)
+#define TX_ORIENTATION(tx_rx) bitfieldExtract((tx_rx), 4, 4)
 
 #define C_SPLINE 0.5
 
@@ -70,10 +76,10 @@ SAMPLE_TYPE cubic(const int base_index, const float t)
 	);
 
 	SAMPLE_TYPE samples[4] = {
-		rf_data[base_index + 0],
-		rf_data[base_index + 1],
-		rf_data[base_index + 2],
-		rf_data[base_index + 3],
+		RF(rf_data).values[base_index + 0],
+		RF(rf_data).values[base_index + 1],
+		RF(rf_data).values[base_index + 2],
+		RF(rf_data).values[base_index + 3],
 	};
 
 	vec4        S  = vec4(t * t * t, t * t, t, 1);
@@ -98,13 +104,13 @@ SAMPLE_TYPE sample_rf(const int rf_offset, const float index)
 	switch (InterpolationMode) {
 	case InterpolationMode_Nearest:{
 		if (int(index) >= 0 && int(round(index)) < SampleCount)
-			result = rotate_iq(rf_data[rf_offset + int(round(index))], index / SamplingFrequency);
+			result = rotate_iq(RF(rf_data).values[rf_offset + int(round(index))], index / SamplingFrequency);
 	}break;
 	case InterpolationMode_Linear:{
 		if (int(index) >= 0 && int(index) < SampleCount - 1) {
 			float tk, t = modf(index, tk);
 			int n = rf_offset + int(tk);
-			result = (1 - t) * rf_data[n] + t * rf_data[n + 1];
+			result = (1 - t) * RF(rf_data).values[n] + t * RF(rf_data).values[n + 1];
 			result = rotate_iq(result, index / SamplingFrequency);
 		}
 	}break;
@@ -124,6 +130,12 @@ float sample_index(const float distance)
 	return time * SamplingFrequency;
 }
 
+uint32_t output_index(uint32_t x, uint32_t y, uint32_t z)
+{
+	uint32_t result = output_size_x * output_size_y * z + output_size_x * y + x;
+	return result;
+}
+
 float apodize(const float arg)
 {
 	/* IMPORTANT: do not move calculation of arg into this function. It will generate a
@@ -158,19 +170,22 @@ float cylindrical_wave_transmit_distance(const vec3 point, const float focal_dep
 	return distance(rca_plane_projection(point, tx_rows), f);
 }
 
-int tx_rx_orientation_for_acquisition(const int acquisition)
+uint16_t tx_rx_orientation_for_acquisition(const int16_t acquisition)
 {
-	int result = bool(SingleOrientation) ? TransmitReceiveOrientation : imageLoad(transmit_receive_orientations, acquisition).x;
+	uint16_t result = uint16_t(TransmitReceiveOrientation);
+	if (!bool(SingleOrientation))
+		result = ArrayParameters(array_parameters).data.transmit_receive_orientations[acquisition];
 	return result;
 }
 
-vec2 focal_vector_for_acquisition(const int acquisition)
+vec2 focal_vector_for_acquisition(const int16_t acquisition)
 {
-	vec2 result = bool(SingleFocus) ? vec2(TransmitAngle, FocusDepth) : imageLoad(focal_vectors, acquisition).xy;
+	vec2 result = bool(SingleFocus) ? vec2(TransmitAngle, FocusDepth)
+	                                : ArrayParameters(array_parameters).data.focal_vectors[acquisition];
 	return result;
 }
 
-float rca_transmit_distance(const vec3 world_point, const vec2 focal_vector, const int transmit_receive_orientation)
+float rca_transmit_distance(const vec3 world_point, const vec2 focal_vector, const uint16_t transmit_receive_orientation)
 {
 	float result = 0;
 	if (TX_ORIENTATION(transmit_receive_orientation) != RCAOrientation_None) {
@@ -189,13 +204,13 @@ float rca_transmit_distance(const vec3 world_point, const vec2 focal_vector, con
 
 RESULT_TYPE RCA(const vec3 world_point)
 {
-	const int acquisition_start = bool(Fast)? u_channel     : 0;
-	const int acquisition_end   = bool(Fast)? u_channel + 1 : AcquisitionCount;
+	const int16_t acquisition_start = int16_t(channel_t);
+	const int16_t acquisition_end   = int16_t(channel_t + 1);
 	RESULT_TYPE result = RESULT_TYPE(0);
-	for (int acquisition = acquisition_start; acquisition < acquisition_end; acquisition++) {
-		const int  tx_rx_orientation = tx_rx_orientation_for_acquisition(acquisition);
-		const bool rx_rows           = RX_ORIENTATION(tx_rx_orientation) == RCAOrientation_Rows;
-		const vec2 focal_vector      = focal_vector_for_acquisition(acquisition);
+	for (int16_t acquisition = acquisition_start; acquisition < acquisition_end; acquisition++) {
+		const uint16_t tx_rx_orientation = tx_rx_orientation_for_acquisition(acquisition);
+		const bool     rx_rows           = RX_ORIENTATION(tx_rx_orientation) == RCAOrientation_Rows;
+		const vec2     focal_vector      = focal_vector_for_acquisition(acquisition);
 		vec2  xdc_world_point   = rca_plane_projection((xdc_transform * vec4(world_point, 1)).xyz, rx_rows);
 		float transmit_distance = rca_transmit_distance(world_point, focal_vector, tx_rx_orientation);
 
@@ -209,7 +224,7 @@ RESULT_TYPE RCA(const vec3 world_point)
 			if (a_arg < 0.5f) {
 				float       sidx  = sample_index(transmit_distance + length(receive_vector));
 				SAMPLE_TYPE value = apodize(a_arg) * sample_rf(rf_offset, sidx);
-				result += RESULT_STORE(value, length(value));
+				result += RESULT_STORE(value);
 			}
 			rf_offset += SampleCount * AcquisitionCount;
 		}
@@ -219,10 +234,10 @@ RESULT_TYPE RCA(const vec3 world_point)
 
 RESULT_TYPE HERCULES(const vec3 world_point)
 {
-	const int tx_rx_orientation  = tx_rx_orientation_for_acquisition(0);
-	const bool rx_cols           = RX_ORIENTATION(tx_rx_orientation) == RCAOrientation_Columns;
-	const vec2 focal_vector      = focal_vector_for_acquisition(0);
-	const vec3 xdc_world_point   = (xdc_transform * vec4(world_point, 1)).xyz;
+	const uint16_t tx_rx_orientation = tx_rx_orientation_for_acquisition(int16_t(0));
+	const bool     rx_cols           = RX_ORIENTATION(tx_rx_orientation) == RCAOrientation_Columns;
+	const vec2     focal_vector      = focal_vector_for_acquisition(int16_t(0));
+	const vec3     xdc_world_point   = (xdc_transform * vec4(world_point, 1)).xyz;
 
 	const float transmit_index   = sample_index(rca_transmit_distance(world_point, focal_vector, tx_rx_orientation));
 	const float z_delta_squared  = xdc_world_point.z * xdc_world_point.z;
@@ -231,11 +246,7 @@ RESULT_TYPE HERCULES(const vec3 world_point)
 	const float apodization_test = 0.25f / (f_number_over_z * f_number_over_z);
 
 	RESULT_TYPE result = RESULT_TYPE(0);
-	#if Fast
-	const int rx_channel = u_channel;
-	#else
-	for (int rx_channel = 0; rx_channel < ChannelCount; rx_channel++)
-	#endif
+	const int rx_channel = channel_t;
 	{
 		int rf_offset   = rx_channel * SampleCount * AcquisitionCount + Sparse * SampleCount;
 		rf_offset      -= int(InterpolationMode == InterpolationMode_Cubic);
@@ -249,7 +260,8 @@ RESULT_TYPE HERCULES(const vec3 world_point)
 		else         element_receive_delta_squared.y *= element_receive_delta_squared.y;
 
 		for (int transmit = Sparse; transmit < AcquisitionCount; transmit++) {
-			int tx_channel = bool(Sparse) ? imageLoad(sparse_elements, transmit - Sparse).x : transmit;
+			int tx_channel = bool(Sparse) ? ArrayParameters(array_parameters).data.sparse_elements[transmit - Sparse]
+			                              : transmit;
 
 			if (rx_cols) element_receive_delta_squared.y  = xy_world_point.y - tx_channel * xdc_element_pitch.y;
 			else         element_receive_delta_squared.x  = xy_world_point.x - tx_channel * xdc_element_pitch.x;
@@ -265,7 +277,7 @@ RESULT_TYPE HERCULES(const vec3 world_point)
 
 				float index = transmit_index + sqrt(z_delta_squared + element_delta_squared) * SamplingFrequency / SpeedOfSound;
 				SAMPLE_TYPE value = apodization * sample_rf(rf_offset, index);
-				result += RESULT_STORE(value, length(value));
+				result += RESULT_STORE(value);
 			}
 
 			rf_offset += SampleCount;
@@ -276,8 +288,8 @@ RESULT_TYPE HERCULES(const vec3 world_point)
 
 RESULT_TYPE FORCES(const vec3 xdc_world_point)
 {
-	const int rx_channel_start = bool(Fast)? u_channel     : 0;
-	const int rx_channel_end   = bool(Fast)? u_channel + 1 : ChannelCount;
+	const int16_t rx_channel_start = int16_t(channel_t);
+	const int16_t rx_channel_end   = int16_t(channel_t + 1);
 
 	RESULT_TYPE result = RESULT_TYPE(0);
 
@@ -285,7 +297,7 @@ RESULT_TYPE FORCES(const vec3 xdc_world_point)
 	float transmit_y_delta    = xdc_world_point.y - xdc_element_pitch.y * ChannelCount / 2;
 	float transmit_yz_squared = transmit_y_delta * transmit_y_delta + z_delta_squared;
 
-	for (int rx_channel = rx_channel_start; rx_channel < rx_channel_end; rx_channel++) {
+	for (int16_t rx_channel = rx_channel_start; rx_channel < rx_channel_end; rx_channel++) {
 		float receive_x_delta = xdc_world_point.x - rx_channel * xdc_element_pitch.x;
 		float a_arg           = abs(FNumber * receive_x_delta / xdc_world_point.z);
 
@@ -296,12 +308,13 @@ RESULT_TYPE FORCES(const vec3 xdc_world_point)
 			float receive_index = sample_index(sqrt(receive_x_delta * receive_x_delta + z_delta_squared));
 			float apodization   = apodize(a_arg);
 			for (int transmit = Sparse; transmit < AcquisitionCount; transmit++) {
-				int   tx_channel       = bool(Sparse) ? imageLoad(sparse_elements, transmit - Sparse).x : transmit;
+				int tx_channel = bool(Sparse) ? ArrayParameters(array_parameters).data.sparse_elements[transmit - Sparse]
+				                               : transmit;
 				float transmit_x_delta = xdc_world_point.x - xdc_element_pitch.x * tx_channel;
 				float transmit_index   = sqrt(transmit_yz_squared + transmit_x_delta * transmit_x_delta) * SamplingFrequency / SpeedOfSound;
 
 				SAMPLE_TYPE value = apodization * sample_rf(rf_offset, receive_index + transmit_index);
-				result    += RESULT_STORE(value, length(value));
+				result    += RESULT_STORE(value);
 				rf_offset += SampleCount;
 			}
 		}
@@ -311,15 +324,17 @@ RESULT_TYPE FORCES(const vec3 xdc_world_point)
 
 void main()
 {
-	ivec3 out_voxel    = ivec3(gl_GlobalInvocationID);
-	vec3  image_points = vec3(imageSize(u_out_data_tex)) - 1.0f;
-	if (!all(lessThan(out_voxel, imageSize(u_out_data_tex))))
+	uvec3 out_voxel = gl_GlobalInvocationID;
+	if (!all(lessThan(out_voxel, uvec3(output_size_x, output_size_y, output_size_z))))
 		return;
 
-	vec3 point       = vec3(out_voxel) / max(vec3(1.0f), image_points);
-	vec3 world_point = (voxel_transform * vec4(point, 1)).xyz;
+	vec3 image_points = vec3(output_size_x, output_size_y, output_size_z) - 1.0f;
+	vec3 point        = vec3(out_voxel) / max(vec3(1.0f), image_points);
+	vec3 world_point  = (voxel_transform * vec4(point, 1)).xyz;
+
+	uint32_t out_index = output_index(out_voxel.x, out_voxel.y, out_voxel.z);
 
-	RESULT_TYPE sum;
+	RESULT_TYPE sum = RESULT_TYPE(0);
 	switch (AcquisitionKind) {
 	case AcquisitionKind_FORCES:
 	case AcquisitionKind_UFORCES:
@@ -340,15 +355,9 @@ void main()
 	}break;
 	}
 
-	#if Fast
-		sum += RESULT_TYPE_CAST(imageLoad(u_out_data_tex, out_voxel));
-	#endif
-
 	#if CoherencyWeighting
-		/* TODO(rnp): scale such that brightness remains ~constant */
-		float denominator = sum[RESULT_LAST_INDEX] + float(sum[RESULT_LAST_INDEX] == 0);
-		RESULT_TYPE_CAST(sum) *= RESULT_TYPE_CAST(sum) / denominator;
+	IncoherentOutput(incoherent_output).values[out_index] += RESULT_INCOHERENT_CAST(sum);
 	#endif
 
-	imageStore(u_out_data_tex, out_voxel, OUTPUT_TYPE_CAST(sum));
+	Output(output_data).values[out_index] += RESULT_COHERENT_CAST(sum);
 }
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -9,57 +9,42 @@
  */
 
 #if   DataKind == DataKind_Float32
-	#define INPUT_DATA_TYPE      float
-	#define SAMPLE_DATA_TYPE     float
-	#define SAMPLE_TYPE_CAST(x)  (x)
+  #define INPUT_DATA_TYPE  float
+  #define SAMPLE_DATA_TYPE float
 #elif DataKind == DataKind_Float32Complex
-	#define INPUT_DATA_TYPE      vec2
-	#define SAMPLE_DATA_TYPE     vec2
-	#define SAMPLE_TYPE_CAST(x)  (x)
+  #define INPUT_DATA_TYPE  vec2
+  #define SAMPLE_DATA_TYPE vec2
 #elif DataKind == DataKind_Int16Complex
-	#define INPUT_DATA_TYPE      int
-	#define SAMPLE_DATA_TYPE     vec2
-	#define SAMPLE_TYPE_CAST(x)  vec2(((x) << 16) >> 16, (x) >> 16)
+  #define INPUT_DATA_TYPE  i16vec2
+  #define SAMPLE_DATA_TYPE vec2
 #elif DataKind == DataKind_Int16
-	#define INPUT_DATA_TYPE      int
-	#define RF_SAMPLES_PER_INDEX 2
-	#if DilateOutput
-		#define SAMPLE_DATA_TYPE    vec4
-		#define SAMPLE_TYPE_CAST(x) vec4(((x) << 16) >> 16, 0, (x) >> 16, 0)
-	#else
-		#define SAMPLE_DATA_TYPE    vec2
-		#define SAMPLE_TYPE_CAST(x) vec2(((x) << 16) >> 16, (x) >> 16)
-		#define OUTPUT_SAMPLES_PER_INDEX 2
-	#endif
+  #define INPUT_DATA_TYPE  int16_t
+  #define SAMPLE_DATA_TYPE float
 #else
-	#error unsupported data kind for Decode
+  #error unsupported data kind for Decode
 #endif
 
-#ifndef OUTPUT_SAMPLES_PER_INDEX
-	#define OUTPUT_SAMPLES_PER_INDEX 1
-#endif
-
-#ifndef RF_SAMPLES_PER_INDEX
-	#define RF_SAMPLES_PER_INDEX 1
-#endif
+// TODO(rnp): fix DilateOutput
 
-layout(std430, binding = 1) readonly restrict buffer buffer_1 {
-	INPUT_DATA_TYPE rf_data[];
+layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer RF {
+	INPUT_DATA_TYPE values[];
 };
 
-layout(std430, binding = 2) writeonly restrict buffer buffer_2 {
-	INPUT_DATA_TYPE out_rf_data[];
+layout(std430, buffer_reference, buffer_reference_align = 64) restrict writeonly buffer OutputRF {
+	INPUT_DATA_TYPE values[];
 };
 
-layout(std430, binding = 3) writeonly restrict buffer buffer_3 {
-	SAMPLE_DATA_TYPE out_data[];
+layout(std430, buffer_reference, buffer_reference_align = 64) restrict writeonly buffer Output {
+	SAMPLE_DATA_TYPE values[];
 };
 
-layout(r16f, binding = 0) readonly restrict uniform image2D  hadamard;
+layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer Hadamard {
+	float16_t values[];
+};
 
 SAMPLE_DATA_TYPE sample_rf_data(uint index)
 {
-	SAMPLE_DATA_TYPE result = SAMPLE_TYPE_CAST(rf_data[index]);
+	SAMPLE_DATA_TYPE result = SAMPLE_DATA_TYPE(RF(rf_buffer).values[index]); // read one INPUT_DATA_TYPE element through the RF buffer reference; the constructor converts it to SAMPLE_DATA_TYPE
 	return result;
 }
 
@@ -67,7 +52,7 @@ SAMPLE_DATA_TYPE sample_rf_data(uint index)
 shared INPUT_DATA_TYPE rf[gl_WorkGroupSize.x * TransmitCount];
 void run_decode_large(void)
 {
-	uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
+	uint time_sample = gl_GlobalInvocationID.x;
 	uint channel     = gl_GlobalInvocationID.y;
 	uint transmit    = gl_GlobalInvocationID.z * ToProcess;
 
@@ -78,12 +63,11 @@ void run_decode_large(void)
 	uint leftover_samples    = rf.length() % thread_count;
 	uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples);
 
-	uint rf_offset = (InputChannelStride * channel / RF_SAMPLES_PER_INDEX +
-	                  TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x);
+	uint rf_offset = InputChannelStride * channel + TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x;
 
 	for (uint i = 0; i < samples_this_thread; i++) {
 		uint index = i * thread_count + thread_index;
-		rf[index] = rf_data[rf_offset + index];
+		rf[index] = RF(rf_buffer).values[rf_offset + index];
 	}
 
 	barrier();
@@ -94,9 +78,9 @@ void run_decode_large(void)
 			result[i] = SAMPLE_DATA_TYPE(0);
 
 		for (int j = 0; j < TransmitCount; j++) {
-			SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]);
+			SAMPLE_DATA_TYPE s = SAMPLE_DATA_TYPE(rf[gl_LocalInvocationID.x * TransmitCount + j]);
 			for (uint i = 0; i < ToProcess; i++)
-				result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s;
+				result[i] += s * Hadamard(hadamard_buffer).values[TransmitCount * j + (i + transmit)];
 		}
 
 		for (uint i = 0; i < ToProcess; i++)
@@ -112,30 +96,30 @@ void run_decode_large(void)
 
 		for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride)
 			if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount)
-				out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
+				Output(output_buffer).values[out_off] = result[i];
 	}
 }
 #endif
 
 void run_decode_small(void)
 {
-	uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
+	uint time_sample = gl_GlobalInvocationID.x;
 	uint channel     = gl_GlobalInvocationID.y;
-	uint rf_offset   = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
+	uint rf_offset   = InputChannelStride * channel + TransmitCount * time_sample;
 
 	if (time_sample < OutputTransmitStride) {
 		INPUT_DATA_TYPE rf[TransmitCount];
 		for (int j = 0; j < TransmitCount; j++)
-			rf[j] = rf_data[rf_offset + j];
+			rf[j] = RF(rf_buffer).values[rf_offset + j];
 
 		SAMPLE_DATA_TYPE result[TransmitCount];
 		for (int j = 0; j < TransmitCount; j++)
 			result[j] = SAMPLE_DATA_TYPE(0);
 
 		for (int i = 0; i < TransmitCount; i++) {
-			SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[i]);
+			SAMPLE_DATA_TYPE s = SAMPLE_DATA_TYPE(rf[i]);
 			for (int j = 0; j < TransmitCount; j++) {
-				result[j] += imageLoad(hadamard, ivec2(i, j)).x * s;
+				result[j] += s * Hadamard(hadamard_buffer).values[TransmitCount * i + j];
 			}
 		}
 
@@ -145,7 +129,7 @@ void run_decode_small(void)
 		uint out_off = OutputChannelStride  * channel +
 		               OutputSampleStride   * time_sample;
 		for (int i = 0; i < TransmitCount; i++, out_off += OutputTransmitStride)
-			out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
+			Output(output_buffer).values[out_off] = result[i];
 	}
 }
 
@@ -153,40 +137,40 @@ void main()
 {
 	switch (DecodeMode) {
 	case DecodeMode_None:{
-		uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
+		uint time_sample = gl_GlobalInvocationID.x;
 		uint channel     = gl_GlobalInvocationID.y;
 		uint transmit    = gl_GlobalInvocationID.z;
 
 		if (time_sample < OutputTransmitStride) {
-			uint in_off = (InputChannelStride  * channel +
-			               InputTransmitStride * transmit +
-			               InputSampleStride   * time_sample) / RF_SAMPLES_PER_INDEX;
+			uint in_off = InputChannelStride  * channel +
+			              InputTransmitStride * transmit +
+			              InputSampleStride   * time_sample;
 
-			uint out_off = (OutputChannelStride  * channel +
-			                OutputTransmitStride * transmit +
-			                OutputSampleStride   * time_sample) / OUTPUT_SAMPLES_PER_INDEX;
+			uint out_off = OutputChannelStride  * channel +
+			               OutputTransmitStride * transmit +
+			               OutputSampleStride   * time_sample;
 
-			out_data[out_off] = sample_rf_data(in_off);
+			Output(output_buffer).values[out_off] = sample_rf_data(in_off);
 		}
 	}break;
 	case DecodeMode_Hadamard:{
-		if (u_first_pass) {
-			uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
+		if (first_pass) {
+			uint time_sample = gl_GlobalInvocationID.x;
 			uint channel     = gl_GlobalInvocationID.y;
 			uint transmit    = gl_GlobalInvocationID.z * ToProcess;
 			if (time_sample < InputTransmitStride) {
-				uint out_off = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
-				uint in_off  = (InputChannelStride * channel + InputSampleStride  * time_sample);
+				uint out_off = InputChannelStride * channel + TransmitCount     * time_sample;
+				uint in_off  = InputChannelStride * channel + InputSampleStride * time_sample;
 				#if UseSharedMemory
 					in_off  += InputTransmitStride * transmit;
 					out_off += transmit;
 					for (uint i = 0; i < ToProcess; i++, in_off += InputTransmitStride) {
 						if (transmit + i < TransmitCount)
-							out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+							OutputRF(output_rf_buffer).values[out_off + i] = RF(rf_buffer).values[in_off];
 					}
 				#else
 					for (uint i = 0; i < TransmitCount; i++, in_off += InputTransmitStride)
-						out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+						OutputRF(output_rf_buffer).values[out_off + i] = RF(rf_buffer).values[in_off];
 				#endif
 			}
 		} else {
diff --git a/shaders/filter.glsl b/shaders/filter.glsl
@@ -28,16 +28,16 @@
 	#define apply_filter(iq, h) ((iq) * (h))
 #endif
 
-layout(std430, binding = 1) readonly restrict buffer buffer_1 {
-	DATA_TYPE in_data[];
+layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer Input {
+	DATA_TYPE values[];
 };
 
-layout(std430, binding = 2) writeonly restrict buffer buffer_2 {
-	OUT_DATA_TYPE out_data[];
+layout(std430, buffer_reference, buffer_reference_align = 64) restrict writeonly buffer Output {
+	OUT_DATA_TYPE values[];
 };
 
-layout(std430, binding = 3) readonly restrict buffer buffer_3 {
-	FILTER_TYPE filter_coefficients[FilterLength];
+layout(std430, buffer_reference, buffer_reference_align = 64) restrict readonly buffer Filter {
+	FILTER_TYPE values[FilterLength];
 };
 
 vec2 complex_mul(vec2 a, vec2 b)
@@ -58,7 +58,7 @@ vec2 rotate_iq(vec2 iq, uint index)
 
 SAMPLE_TYPE sample_rf(uint index)
 {
-	SAMPLE_TYPE result = SAMPLE_TYPE_CAST(in_data[index]);
+	SAMPLE_TYPE result = SAMPLE_TYPE_CAST(Input(input_data).values[index]);
 	return result;
 }
 
@@ -80,6 +80,8 @@ void main()
 	/////////////////////////
 	// NOTE: sample caching
 	{
+		bool offset_wraps = (DecimationRate * gl_WorkGroupID.x * gl_WorkGroupSize.x) < (FilterLength - 1);
+
 		in_offset += DecimationRate * gl_WorkGroupID.x * gl_WorkGroupSize.x - (FilterLength - 1);
 
 		uint total_samples       = rf.length();
@@ -87,10 +89,10 @@ void main()
 		uint leftover_count      = total_samples % thread_count;
 		uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_count);
 
-		const float scale = bool(ComplexFilter) ? 1 : sqrt(2);
+		const float scale = bool(ComplexFilter) ? 1 : sqrt(2.0f);
 		for (uint i = 0; i < samples_this_thread; i++) {
 			uint index = thread_count * i + thread_index;
-			if (gl_WorkGroupID.x == 0 && index < FilterLength - 1) {
+			if (offset_wraps && index < FilterLength - 1) {
 				rf[index] = SAMPLE_TYPE(0);
 			} else {
 				#if Demodulate
@@ -107,7 +109,7 @@ void main()
 		SAMPLE_TYPE result = SAMPLE_TYPE(0);
 		uint offset = DecimationRate * thread_index;
 		for (uint j = 0; j < FilterLength; j++)
-			result += apply_filter(rf[offset + j], filter_coefficients[j]);
-		out_data[out_offset] = RESULT_TYPE_CAST(result);
+			result += apply_filter(rf[offset + j], Filter(filter_coefficients).values[j]);
+		Output(output_data).values[out_offset] = RESULT_TYPE_CAST(result);
 	}
 }
diff --git a/shaders/render_3d.frag.glsl b/shaders/render_3d.frag.glsl
@@ -1,4 +1,15 @@
 /* See LICENSE for license details. */
+layout(location = 0) in  vec3 normal;
+layout(location = 1) in  vec3 texture_coordinate;
+layout(location = 0) out vec4 out_colour;
+
+layout(std430, buffer_reference, buffer_reference_align = 64) readonly buffer InputVec2 {
+	vec2 values[];
+};
+
+layout(std430, buffer_reference, buffer_reference_align = 64) readonly buffer InputFloat {
+	float values[];
+};
 
 /* input:  h [0,360] | s,v [0, 1] *
  * output: rgb [0,1]              */
@@ -20,24 +31,41 @@ float sdf_wire_box_outside(vec3 p, vec3 b, float e)
 	return result;
 }
 
-int texture_dimension(ivec3 points)
+uint32_t texture_dimension(uvec3 points) // returns how many components of points are greater than 1
 {
-	points = ivec3(greaterThan(points, ivec3(1)));
+	points = uvec3(greaterThan(points, uvec3(1))); // per-axis comparison result converted to 0 or 1
 	return points.x + points.y + points.z;
 }
 
+uint32_t input_index(vec3 uv) // flat element index for a coordinate; NOTE(review): presumably uv is normalised to [0, 1] - confirm against callers
+{
+	uv *= vec3(input_size_x - 1, input_size_y - 1, input_size_z - 1); // scale each axis by (size - 1)
+	uint32_t result = input_size_y * input_size_x * uint32_t(uv.z) + // the uint32_t() conversions drop the fractional part (no interpolation)
+	                                 input_size_x * uint32_t(uv.y) +
+	                                                uint32_t(uv.x); // x varies fastest, then y, then z
+	result = min(result, input_size_z * input_size_y * input_size_x - 1); // clamp to the last element; only the upper end is bounded here
+	return result;
+}
 
 float sample_value(vec3 p)
 {
-	float result = length(texture(u_texture, p).xy);
-	float threshold_val = pow(10.0f, u_threshold / 20.0f);
+	float result;
+	if (input_data != 0) {
+		uint32_t index = input_index(texture_coordinate);
+		switch (data_kind) {
+		case DataKind_Float32:{        result = length(InputFloat(input_data).values[index]); }break;
+		case DataKind_Float32Complex:{ result = length(InputVec2(input_data).values[index]);  }break;
+		}
+	}
+
+	float threshold_val = pow(10.0f, threshold / 20.0f);
 	result = clamp(result, 0.0f, threshold_val);
 	result = result / threshold_val;
-	result = pow(result, u_gamma);
+	result = pow(result, gamma);
 
-	if (u_log_scale) {
+	if (db_cutoff > 0) {
 		result = 20 * log(result) / log(10);
-		result = clamp(result, -u_db_cutoff, 0) / -u_db_cutoff;
+		result = clamp(result, -db_cutoff, 0) / -db_cutoff;
 		result = 1 - result;
 	}
 
@@ -54,40 +82,40 @@ float grad(float x)
 
 void main(void)
 {
-	int dimension = texture_dimension(textureSize(u_texture, 0));
+	uint32_t dimension = texture_dimension(uvec3(input_size_x, input_size_y, input_size_z));
 
 	if (dimension == 3) {
 		// TODO(rnp): add slice offset passed in as a uniform
 	}
 
-	float smp = sample_value(texture_coordinate);
+	float data = sample_value(texture_coordinate);
 	//float t = test_texture_coordinate.y;
 	//smp = smp * smoothstep(-0.4, 1.1, t) * u_gain;
 
-	vec3  p = 2.0f * test_texture_coordinate - 1.0f;
+	vec3  p = 2.0f * texture_coordinate - 1.0f;
 
 	switch (dimension) {
 	case 1:{
 
-		float df = mix(grad(texture_coordinate.x), dFdx(smp),
+		float df = mix(grad(texture_coordinate.x), dFdx(data),
 		               smoothstep(0.0f, 0.55f, abs(texture_coordinate.x - 0.5f)));
-		float de = abs(smp - texture_coordinate.y) / sqrt(1.0f + df * df);
+		float de = abs(data - texture_coordinate.y) / sqrt(1.0f + df * df);
 
 		float eps       = length(fwidth(texture_coordinate.xy));
 		float thickness = 4.f;
 
 		float alpha = smoothstep((0.5f * thickness + 2.0f) * eps, (0.5f * thickness + 0.0f) * eps, de);
-		out_colour = vec4(u_bb_colour.xyz, alpha);
+		out_colour = vec4(bounding_box_colour.xyz, alpha);
 	}break;
 
 	case 0: // NOTE(rnp): 0 is a special case for X-Plane Rendering
 	case 2:
 	case 3:
 	{
-		float t = clamp(sdf_wire_box_outside(p, vec3(1.0f), u_bb_fraction) / u_bb_fraction, 0, 1);
+		float t = clamp(sdf_wire_box_outside(p, vec3(1.0f), bounding_box_fraction) /  bounding_box_fraction, 0, 1);
 
-		out_colour = vec4(t * vec3(smp) + (1 - t) * u_bb_colour.xyz, 1);
-		if (u_solid_bb) out_colour = u_bb_colour;
+		out_colour = vec4(t * vec3(data) + (1 - t) * bounding_box_colour.xyz, 1);
+		//if (u_solid_bb) out_colour = u_bb_colour;
 	}break;
 	}
 
diff --git a/shaders/render_3d.vert.glsl b/shaders/render_3d.vert.glsl
@@ -0,0 +1,19 @@
+layout(location = 0) out vec3 f_normal; // locations 0 and 1 pair with the 'in' declarations of render_3d.frag.glsl
+layout(location = 1) out vec3 f_texture_coordinate;
+
+layout(std430, buffer_reference, buffer_reference_align = 16) readonly buffer Vector4 { // read-only vec4 array accessed through a buffer reference
+	vec4 values[];
+};
+
+void main()
+{
+	vec3 position = Vector4(positions).values[gl_VertexIndex].xyz; // per-vertex fetch; the w component is discarded
+	vec3 normal   = Vector4(normals).values[gl_VertexIndex].xyz;
+	vec3 texture_coordinate = (2 * position + 1) / 2; // equals position + 0.5; NOTE(review): presumably maps a [-0.5, 0.5] model to [0, 1] - confirm
+
+	f_texture_coordinate = texture_coordinate;
+	f_normal             = normal; // passed through untransformed; the transformed variant is disabled on the next line
+	//f_normal             = normalize(mat3(mvp_matrix) * normal);
+
+	gl_Position = mvp_matrix * vec4(position, 1);
+}
diff --git a/ui.c b/ui.c
@@ -63,6 +63,10 @@
 #define RULER_COLOUR           (v4){{1.00f, 0.70f, 0.00f, 1.0f}}
 #define BORDER_COLOUR          v4_lerp(FG_COLOUR, BG_COLOUR, 0.85f)
 
+#define FRAME_VIEW_BB_COLOUR          (v4){{0.92f, 0.88f, 0.78f, 1.0f}}
+#define FRAME_VIEW_BB_FRACTION        0.007f
+#define FRAME_VIEW_RENDER_TARGET_SIZE 1024, 1024 /* two comma-separated values; expanded inside a uv2 initialiser when a frame view is resized */
+
 #define MENU_PLUS_COLOUR       (v4){{0.33f, 0.42f, 1.00f, 1.00f}}
 #define MENU_CLOSE_COLOUR      FOCUSED_COLOUR
 
@@ -308,8 +312,7 @@ struct Variable {
 #define BEAMFORMER_FRAME_VIEW_KIND_LIST \
 	X(Latest,   "Latest")     \
 	X(3DXPlane, "3D X-Plane") \
-	X(Indexed,  "Indexed")    \
-	X(Copy,     "Copy")
+	X(Copy,     "Copy")       \
 
 typedef enum {
 	#define X(kind, ...) BeamformerFrameViewKind_##kind,
@@ -322,12 +325,16 @@ typedef struct BeamformerFrameView BeamformerFrameView;
 struct BeamformerFrameView {
 	BeamformerFrameViewKind kind;
 	b32 dirty;
-	BeamformerFrame     *frame;
 	BeamformerFrameView *prev, *next;
 
-	u32 texture;
-	i32 texture_mipmaps;
-	iv2 texture_dim;
+	// NOTE(rnp): for FrameViewKindCopy
+	GPUBuffer copy_buffer;
+
+	GPUImage colour_image;
+	// NOTE(rnp): temporary, on w32 we must hold onto this when importing vulkan data to OpenGL
+	OSHandle export_handle;
+	u32      memory_object;
+	u32      texture;
 
 	/* NOTE(rnp): any pointers to variables are added to the menu and will
 	 * be put onto the freelist if the view is closed. */
@@ -339,14 +346,13 @@ struct BeamformerFrameView {
 	Variable gamma;
 
 	union {
-		/* BeamformerFrameViewKind_Latest/BeamformerFrameViewKind_Indexed */
+		/* BeamformerFrameViewKind_Latest/BeamformerFrameViewKind_Copy */
 		struct {
 			Variable lateral_scale_bar;
 			Variable axial_scale_bar;
 			Variable *lateral_scale_bar_active;
 			Variable *axial_scale_bar_active;
-			/* NOTE(rnp): if kind is Latest  selects which plane to use
-			 *            if kind is Indexed selects the index */
+			/* NOTE(rnp): selects which plane to use */
 			Variable *cycler;
 			u32 cycler_state;
 
@@ -354,6 +360,8 @@ struct BeamformerFrameView {
 
 			v3 min_coordinate;
 			v3 max_coordinate;
+
+			BeamformerFrame frame;
 		};
 
 		/* BeamformerFrameViewKind_3DXPlane */
@@ -415,7 +423,6 @@ struct BeamformerUI {
 
 	BeamformerFrameView *views;
 	BeamformerFrameView *view_freelist;
-	BeamformerFrame     *frame_freelist;
 
 	Interaction interaction;
 	Interaction hot_interaction;
@@ -423,12 +430,20 @@ struct BeamformerUI {
 
 	InputState  text_input_state;
 
-	/* TODO(rnp): ideally this isn't copied all over the place */
-	BeamformerRenderModel unit_cube_model;
+	VulkanHandle    pipelines[BeamformerShaderKind_RenderCount];
+
+	OSHandle        render_semaphores_export[2];
+	VulkanHandle    render_semaphores[2];
+	u32             render_semaphores_gl[2];
+
+	GPUImage        render_3d_image;
+	GPUImage        render_3d_depth_image;
+	RenderModel     unit_cube_model;
 
 	v2_sll *scale_bar_savepoint_freelist;
 
-	BeamformerFrame *latest_plane[BeamformerViewPlaneTag_Count + 1];
+	BeamformerFrame latest_plane[BeamformerViewPlaneTag_Count + 1];
+	b32             latest_plane_valid[BeamformerViewPlaneTag_Count + 1];
 
 	BeamformerUIParameters params;
 	b32                    flush_params;
@@ -439,8 +454,6 @@ struct BeamformerUI {
 	f32 off_axis_position;
 	f32 beamform_plane;
 
-	FrameViewRenderContext *frame_view_render_context;
-
 	BeamformerSharedMemory * shared_memory;
 	BeamformerCtx *          beamformer_context;
 };
@@ -640,9 +653,9 @@ make_raylib_texture(BeamformerFrameView *v)
 {
 	Texture result;
 	result.id      = v->texture;
-	result.width   = v->texture_dim.w;
-	result.height  = v->texture_dim.h;
-	result.mipmaps = v->texture_mipmaps;
+	result.width   = v->colour_image.width;
+	result.height  = v->colour_image.height;
+	result.mipmaps = v->colour_image.mip_map_levels;
 	result.format  = PIXELFORMAT_UNCOMPRESSED_R8G8B8A8;
 	return result;
 }
@@ -743,16 +756,11 @@ push_custom_view_title(Stream *s, Variable *var)
 			#undef X
 			stream_append_s8(s, labels[*bv->cycler->cycler.state % (BeamformerViewPlaneTag_Count + 1)]);
 		}break;
-		case BeamformerFrameViewKind_Indexed:{
-			stream_append_s8(s, s8(": Index {"));
-			stream_append_u64(s, *bv->cycler->cycler.state % BeamformerMaxBacklogFrames);
-			stream_append_s8(s, s8("} ["));
-		}break;
 		case BeamformerFrameViewKind_3DXPlane:{ stream_append_s8(s, s8(": 3D X-Plane")); }break;
 		InvalidDefaultCase;
 		}
 		if (bv->kind != BeamformerFrameViewKind_3DXPlane) {
-			stream_append_hex_u64(s, bv->frame? bv->frame->id : 0);
+			stream_append_hex_u64(s, bv->frame.id);
 			stream_append_byte(s, ']');
 		}
 	}break;
@@ -954,19 +962,37 @@ table_end_subtable(Table *table)
 }
 
 function void
-resize_frame_view(BeamformerFrameView *view, iv2 dim)
+resize_frame_view(BeamformerFrameView *view, uv2 dim)
 {
+	if ValidHandle(view->export_handle) os_release_handle(view->export_handle);
+
+	glDeleteMemoryObjectsEXT(1, &view->memory_object);
+	glCreateMemoryObjectsEXT(1, &view->memory_object);
+
 	glDeleteTextures(1, &view->texture);
 	glCreateTextures(GL_TEXTURE_2D, 1, &view->texture);
 
-	view->texture_dim     = dim;
-	view->texture_mipmaps = (i32)ctz_u64((u64)Max(dim.x, dim.y)) + 1;
-	glTextureStorage2D(view->texture, view->texture_mipmaps, GL_RGBA8, dim.x, dim.y);
+	vk_image_allocate(&view->colour_image, dim.w, dim.h, 1, 1, VulkanImageUsage_Colour,
+	                  VulkanUsageFlag_ImageSampling, &view->export_handle);
 
-	glGenerateTextureMipmap(view->texture);
+	glMemoryObjectParameterivEXT(view->memory_object, GL_DEDICATED_MEMORY_OBJECT_EXT, (GLint []){1});
+
+	if (OS_WINDOWS) {
+		glImportMemoryWin32HandleEXT(view->memory_object, view->colour_image.memory_size,
+		                             GL_HANDLE_TYPE_OPAQUE_WIN32_EXT, (void *)view->export_handle.value[0]);
+		// NOTE(rnp): w32 does not transfer ownership from handle back to driver
+	} else {
+		glImportMemoryFdEXT(view->memory_object, view->colour_image.memory_size,
+		                    GL_HANDLE_TYPE_OPAQUE_FD_EXT, view->export_handle.value[0]);
+		view->export_handle.value[0] = OSInvalidHandleValue;
+	}
+
+	glTextureStorageMem2DEXT(view->texture, view->colour_image.mip_map_levels, GL_RGBA8,
+	                         view->colour_image.width, view->colour_image.height,
+	                         view->memory_object, 0);
 
 	/* NOTE(rnp): work around raylib's janky texture sampling */
-	v4 border_colour = (v4){{0, 0, 0, 1}};
+	v4 border_colour = {{0, 0, 0, 1}};
 	if (view->kind != BeamformerFrameViewKind_Copy) border_colour = (v4){0};
 	glTextureParameteri(view->texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER);
 	glTextureParameteri(view->texture, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER);
@@ -983,11 +1009,8 @@ resize_frame_view(BeamformerFrameView *view, iv2 dim)
 function void
 ui_beamformer_frame_view_release_subresources(BeamformerUI *ui, BeamformerFrameView *bv, BeamformerFrameViewKind kind)
 {
-	if (kind == BeamformerFrameViewKind_Copy && bv->frame) {
-		glDeleteTextures(1, &bv->frame->texture);
-		bv->frame->texture = 0;
-		SLLPushFreelist(bv->frame, ui->frame_freelist);
-	}
+	if (kind == BeamformerFrameViewKind_Copy)
+		vk_buffer_release(&bv->copy_buffer);
 
 	if (kind != BeamformerFrameViewKind_3DXPlane) {
 		if (bv->axial_scale_bar.scale_bar.savepoint_stack)
@@ -1289,10 +1312,10 @@ ui_beamformer_frame_view_convert(BeamformerUI *ui, Arena *arena, Variable *view,
 	bv->threshold.real32          = old? old->threshold.real32          : 55.0f;
 	bv->gamma.scaled_real32.val   = old? old->gamma.scaled_real32.val   : 1.0f;
 	bv->gamma.scaled_real32.scale = old? old->gamma.scaled_real32.scale : 0.05f;
-	bv->min_coordinate = (old && old->frame) ? m4_mul_v4(old->frame->voxel_transform, (v4){{0.0f, 0.0f, 0.0f, 1.0f}}).xyz
-	                                         : (v3){0};
-	bv->max_coordinate = (old && old->frame) ? m4_mul_v4(old->frame->voxel_transform, (v4){{1.0f, 1.0f, 1.0f, 1.0f}}).xyz
-	                                         : (v3){0};
+	bv->min_coordinate = old ? m4_mul_v4(old->frame.voxel_transform, (v4){{0.0f, 0.0f, 0.0f, 1.0f}}).xyz
+	                         : (v3){0};
+	bv->max_coordinate = old ? m4_mul_v4(old->frame.voxel_transform, (v4){{1.0f, 1.0f, 1.0f, 1.0f}}).xyz
+	                         : (v3){0};
 
 	#define X(_t, pretty) s8_comp(pretty),
 	read_only local_persist s8 kind_labels[] = {BEAMFORMER_FRAME_VIEW_KIND_LIST};
@@ -1302,7 +1325,7 @@ ui_beamformer_frame_view_convert(BeamformerUI *ui, Arena *arena, Variable *view,
 
 	/* TODO(rnp): this is quite dumb. what we actually want is to render directly
 	 * into the view region with the appropriate size for that region (scissor) */
-	resize_frame_view(bv, (iv2){{FRAME_VIEW_RENDER_TARGET_SIZE}});
+	resize_frame_view(bv, (uv2){{FRAME_VIEW_RENDER_TARGET_SIZE}});
 
 	switch (kind) {
 	case BeamformerFrameViewKind_3DXPlane:{
@@ -1329,11 +1352,10 @@ ui_beamformer_frame_view_convert(BeamformerUI *ui, Arena *arena, Variable *view,
 		axial->zoom_starting_coord   = F32_INFINITY;
 
 		b32 copy = kind == BeamformerFrameViewKind_Copy;
-		v3 normal = (v3){.y = 1.0f};
-		if (old && old->frame)
-			normal = cross(old->frame->voxel_transform.c[0].xyz, old->frame->voxel_transform.c[1].xyz);
+		v3 N = (v3){.y = 1.0f};
+		if (old) N = cross(old->frame.voxel_transform.c[0].xyz, old->frame.voxel_transform.c[1].xyz);
 
-		BeamformerViewPlaneTag plane = ui_plane_layout_from_normal(v3_normalize(normal));
+		BeamformerViewPlaneTag plane = ui_plane_layout_from_normal(v3_normalize(N));
 		switch (plane) {
 		case BeamformerViewPlaneTag_XY:{
 			lateral->min_value = copy ? &bv->min_coordinate.x : &ui->min_coordinate.x;
@@ -1395,10 +1417,6 @@ ui_beamformer_frame_view_convert(BeamformerUI *ui, Arena *arena, Variable *view,
 		                                 &bv->cycler_state, labels, countof(labels));
 		bv->cycler_state = BeamformerViewPlaneTag_Count;
 	}break;
-	case BeamformerFrameViewKind_Indexed:{
-		bv->cycler = add_variable_cycler(ui, menu, arena, 0, ui->small_font, s8("Index:"),
-		                                 &bv->cycler_state, 0, BeamformerMaxBacklogFrames);
-	}break;
 	default:{}break;
 	}
 
@@ -1411,6 +1429,7 @@ ui_beamformer_frame_view_new(BeamformerUI *ui, Arena *arena)
 	BeamformerFrameView *result = SLLPopFreelist(ui->view_freelist);
 	if (!result) result = push_struct_no_zero(arena, typeof(*result));
 	zero_struct(result);
+	result->export_handle.value[0] = OSInvalidHandleValue;
 	DLLPushDown(result, ui->views);
 	return result;
 }
@@ -1534,19 +1553,29 @@ ui_add_live_frame_view(BeamformerUI *ui, Variable *view, RegionSplitDirection di
 function void
 ui_beamformer_frame_view_copy_frame(BeamformerUI *ui, BeamformerFrameView *new, BeamformerFrameView *old)
 {
-	assert(old->frame);
-	new->frame = SLLPopFreelist(ui->frame_freelist);
-	if (!new->frame) new->frame = push_struct(&ui->arena, typeof(*new->frame));
+	mem_copy(&new->frame, &old->frame, sizeof(old->frame));
+	/* NOTE: widen to i64 before multiplying; otherwise the product can overflow before assignment */
+	iv3 points     = new->frame.points;
+	i64 frame_size = (i64)points.x * points.y * points.z * beamformer_data_kind_byte_size[new->frame.data_kind];
 
-	mem_copy(new->frame, old->frame, sizeof(*new->frame));
-	new->frame->texture = 0;
-	new->frame->next    = 0;
-	alloc_beamform_frame(new->frame, old->frame->dim, old->frame->gl_kind, s8("Frame Copy: "), ui->arena);
+	Stream sb = arena_stream(ui->arena);
+	stream_append_s8(&sb, s8("Frame Copy ["));
+	stream_append_hex_u64(&sb, new->frame.id);
+	stream_append_s8(&sb, s8("]"));
+	stream_append_byte(&sb, 0);
+
+	GPUBufferAllocateInfo allocate_info = {
+		.size  = frame_size,
+		.flags = VulkanUsageFlag_TransferDestination,
+		.label = stream_to_s8(&sb),
+	};
+	vk_buffer_allocate(&new->copy_buffer, &allocate_info);
 
-	glCopyImageSubData(old->frame->texture, GL_TEXTURE_3D, 0, 0, 0, 0,
-	                   new->frame->texture, GL_TEXTURE_3D, 0, 0, 0, 0,
-	                   new->frame->dim.x, new->frame->dim.y, new->frame->dim.z);
-	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
+	GPUBuffer *backlog = ui->beamformer_context->compute_context.backlog.buffer;
+	VulkanHandle cmd = vk_command_begin(VulkanTimeline_Compute);
+	vk_command_wait_timeline(cmd, VulkanTimeline_Compute, old->frame.timeline_valid_value);
+	vk_command_copy_buffer(cmd, &new->copy_buffer, backlog, old->frame.buffer_offset, frame_size);
+	new->frame.timeline_valid_value = vk_command_end(cmd, (VulkanHandle){0}, (VulkanHandle){0});
 }
 
 function void
@@ -1557,10 +1586,6 @@ ui_copy_frame(BeamformerUI *ui, Variable *view, RegionSplitDirection direction)
 	assert(view->type   == VT_UI_VIEW);
 
 	BeamformerFrameView *old = view->view.child->generic;
-	/* TODO(rnp): hack; it would be better if this was unreachable with a 0 old->frame */
-	if (!old->frame)
-		return;
-
 	Variable *new_region = ui_split_region(ui, region, view, direction);
 	new_region->region_split.right = add_beamformer_frame_view(ui, new_region, &ui->arena,
 	                                                           BeamformerFrameViewKind_Copy, 1, old);
@@ -1641,7 +1666,7 @@ function m4
 projection_matrix_for_x_plane_view(BeamformerFrameView *view)
 {
 	assert(view->kind == BeamformerFrameViewKind_3DXPlane);
-	f32 aspect = (f32)view->texture_dim.w / (f32)view->texture_dim.h;
+	f32 aspect = (f32)view->colour_image.width / (f32)view->colour_image.height;
 	m4 result = perspective_projection(10e-3f, 500e-3f, 45.0f * PI / 180.0f, aspect);
 	return result;
 }
@@ -1679,22 +1704,35 @@ view_plane_tag_from_x_plane_shift(BeamformerFrameView *view, Variable *x_plane_s
 
 function void
 render_single_xplane(BeamformerUI *ui, BeamformerFrameView *view, Variable *x_plane_shift,
-                     u32 program, f32 rotation_turns, v3 translate, BeamformerViewPlaneTag tag)
-{
-	u32 texture = 0;
-	if (ui->latest_plane[tag])
-		texture = ui->latest_plane[tag]->texture;
+                     f32 rotation_turns, v3 translate, BeamformerViewPlaneTag tag,
+                     VulkanHandle command, BeamformerRenderBeamformedPushConstants *pc)
+{
+	GPUBuffer *beamformed_buffer = ui->beamformer_context->compute_context.backlog.buffer;
+	pc->input_data = 0;
+	if (ui->latest_plane_valid[tag]) {
+		BeamformerFrame *f = ui->latest_plane + tag;
+		pc->input_data   = beamformed_buffer->gpu_pointer + f->buffer_offset;
+		pc->input_size_x = f->points.x;
+		pc->input_size_y = f->points.y;
+		pc->input_size_z = f->points.z;
+		pc->data_kind    = f->data_kind;
+		vk_command_wait_timeline(command, VulkanTimeline_Compute, f->timeline_valid_value);
+	}
+
+	v3 camera = camera_for_x_plane_view(ui, view);
+	v3 scale  = beamformer_frame_view_plane_size(ui, view);
 
-	v3 scale = beamformer_frame_view_plane_size(ui, view);
 	m4 model_transform = y_aligned_volume_transform(scale, translate, rotation_turns);
+	m4 view_m          = view_matrix_for_x_plane_view(ui, view, camera);
+	m4 projection      = projection_matrix_for_x_plane_view(view);
+
+	//pc->mvp_matrix          = m4_mul(m4_mul(model_transform, view_m), projection);
+	pc->mvp_matrix            = m4_mul(projection, m4_mul(view_m, model_transform));
+	pc->bounding_box_colour   = v4_lerp(FG_COLOUR, HOVERED_COLOUR, x_plane_shift->hover_t);
+	pc->bounding_box_fraction = FRAME_VIEW_BB_FRACTION;
 
-	v4 colour = v4_lerp(FG_COLOUR, HOVERED_COLOUR, x_plane_shift->hover_t);
-	glProgramUniformMatrix4fv(program, FRAME_VIEW_MODEL_MATRIX_LOC, 1, 0, model_transform.E);
-	glProgramUniform4fv(program, FRAME_VIEW_BB_COLOUR_LOC, 1, colour.E);
-	glProgramUniform1ui(program, FRAME_VIEW_SOLID_BB_LOC, 0);
-	glBindTextureUnit(0, texture);
-	glDrawElements(GL_TRIANGLES, ui->unit_cube_model.elements, GL_UNSIGNED_SHORT,
-	               (void *)ui->unit_cube_model.elements_offset);
+	vk_command_push_constants(command, 0, sizeof(*pc), pc);
+	vk_command_draw(command, &ui->unit_cube_model.model);
 
 	XPlaneShift *xp = &x_plane_shift->x_plane_shift;
 	v3 xp_delta = v3_sub(xp->end_point, xp->start_point);
@@ -1706,64 +1744,59 @@ render_single_xplane(BeamformerUI *ui, BeamformerFrameView *view, Variable *x_pl
 		/* TODO(rnp): there is no reason to compute the rotation matrix again */
 		model_transform = y_aligned_volume_transform(scale, v3_add(f, translate), rotation_turns);
 
-		glProgramUniformMatrix4fv(program, FRAME_VIEW_MODEL_MATRIX_LOC, 1, 0, model_transform.E);
-		glProgramUniform1ui(program, FRAME_VIEW_SOLID_BB_LOC, 1);
-		glProgramUniform4fv(program, FRAME_VIEW_BB_COLOUR_LOC, 1, HOVERED_COLOUR.E);
-		glDrawElements(GL_TRIANGLES, ui->unit_cube_model.elements, GL_UNSIGNED_SHORT,
-		               (void *)ui->unit_cube_model.elements_offset);
+		pc->mvp_matrix            = m4_mul(projection, m4_mul(view_m, model_transform));
+		pc->bounding_box_colour   = HOVERED_COLOUR;
+		pc->bounding_box_fraction = 1.0f;
+		pc->input_data            = 0;
+
+		vk_command_push_constants(command, 0, sizeof(*pc), pc);
+		vk_command_draw(command, &ui->unit_cube_model.model);
 	}
 }
 
 function void
-render_3D_xplane(BeamformerUI *ui, BeamformerFrameView *view, u32 program)
+render_3D_xplane(BeamformerUI *ui, BeamformerFrameView *view, VulkanHandle command, BeamformerRenderBeamformedPushConstants *pc)
 {
 	if (view->demo->bool32) {
 		view->rotation += dt_for_frame * 0.125f;
 		if (view->rotation > 1.0f) view->rotation -= 1.0f;
 	}
 
-	v3 camera     = camera_for_x_plane_view(ui, view);
-	m4 view_m     = view_matrix_for_x_plane_view(ui, view, camera);
-	m4 projection = projection_matrix_for_x_plane_view(view);
-
-	glProgramUniformMatrix4fv(program, FRAME_VIEW_VIEW_MATRIX_LOC,  1, 0, view_m.E);
-	glProgramUniformMatrix4fv(program, FRAME_VIEW_PROJ_MATRIX_LOC,  1, 0, projection.E);
-	glProgramUniform1f(program, FRAME_VIEW_BB_FRACTION_LOC, FRAME_VIEW_BB_FRACTION);
-
 	v3 model_translate = offset_x_plane_position(ui, view, BeamformerViewPlaneTag_XZ);
-	render_single_xplane(ui, view, view->x_plane_shifts + 0, program,
+	render_single_xplane(ui, view, view->x_plane_shifts + 0,
 	                     x_plane_rotation_for_view_plane(view, BeamformerViewPlaneTag_XZ),
-	                     model_translate, BeamformerViewPlaneTag_XZ);
+	                     model_translate, BeamformerViewPlaneTag_XZ, command, pc);
 	model_translate = offset_x_plane_position(ui, view, BeamformerViewPlaneTag_YZ);
 	model_translate.y -= 0.0001f;
-	render_single_xplane(ui, view, view->x_plane_shifts + 1, program,
+	render_single_xplane(ui, view, view->x_plane_shifts + 1,
 	                     x_plane_rotation_for_view_plane(view, BeamformerViewPlaneTag_YZ),
-	                     model_translate, BeamformerViewPlaneTag_YZ);
+	                     model_translate, BeamformerViewPlaneTag_YZ, command, pc);
 }
 
 function void
-render_2D_plane(BeamformerUI *ui, BeamformerFrameView *view, u32 program)
+render_2D_plane(BeamformerUI *ui, BeamformerFrameView *view, VulkanHandle command, BeamformerRenderBeamformedPushConstants *pc)
 {
 	m4 view_m     = m4_identity();
 	m4 model      = m4_scale((v3){{2.0f, 2.0f, 0.0f}});
 	m4 projection = orthographic_projection(0, 1, 1, 1);
 
-	glProgramUniformMatrix4fv(program, FRAME_VIEW_MODEL_MATRIX_LOC, 1, 0, model.E);
-	glProgramUniformMatrix4fv(program, FRAME_VIEW_VIEW_MATRIX_LOC,  1, 0, view_m.E);
-	glProgramUniformMatrix4fv(program, FRAME_VIEW_PROJ_MATRIX_LOC,  1, 0, projection.E);
+	GPUBuffer *beamformed_buffer = ui->beamformer_context->compute_context.backlog.buffer;
+	pc->mvp_matrix   = m4_mul(m4_mul(model, view_m), projection);
+	pc->input_data   = beamformed_buffer->gpu_pointer + view->frame.buffer_offset;
+	pc->input_size_x = view->frame.points.x;
+	pc->input_size_y = view->frame.points.y;
+	pc->input_size_z = view->frame.points.z;
+	pc->data_kind    = view->frame.data_kind;
 
-	glProgramUniform1f(program, FRAME_VIEW_BB_FRACTION_LOC, 0);
-	glBindTextureUnit(0, view->frame->texture);
-	glDrawElements(GL_TRIANGLES, ui->unit_cube_model.elements, GL_UNSIGNED_SHORT,
-	               (void *)ui->unit_cube_model.elements_offset);
+	vk_command_wait_timeline(command, VulkanTimeline_Compute, view->frame.timeline_valid_value);
+	vk_command_push_constants(command, 0, sizeof(*pc), pc);
+	vk_command_draw(command, &ui->unit_cube_model.model);
 }
 
 function b32
 frame_view_ready_to_present(BeamformerUI *ui, BeamformerFrameView *view)
 {
-	b32 result  = !iv2_equal((iv2){0}, view->texture_dim) && view->frame;
-	result     |= view->kind == BeamformerFrameViewKind_3DXPlane &&
-	              ui->latest_plane[BeamformerViewPlaneTag_Count];
+	b32 result = view->colour_image.width > 0 || view->colour_image.height > 0;
 	return result;
 }
 
@@ -1772,76 +1805,61 @@ view_update(BeamformerUI *ui, BeamformerFrameView *view)
 {
 	if (view->kind == BeamformerFrameViewKind_Latest) {
 		u32 index = *view->cycler->cycler.state;
-		view->dirty |= view->frame != ui->latest_plane[index];
-		view->frame  = ui->latest_plane[index];
-		if (view->dirty && view->frame) {
-			view->min_coordinate = m4_mul_v4(view->frame->voxel_transform, (v4){{0.0f, 0.0f, 0.0f, 1.0f}}).xyz;
-			view->max_coordinate = m4_mul_v4(view->frame->voxel_transform, (v4){{1.0f, 1.0f, 1.0f, 1.0f}}).xyz;
+		view->dirty |= view->frame.timeline_valid_value != ui->latest_plane[index].timeline_valid_value;
+		mem_copy(&view->frame, ui->latest_plane + index, sizeof(view->frame));
+		if (view->dirty) {
+			view->min_coordinate = m4_mul_v4(view->frame.voxel_transform, (v4){{0.0f, 0.0f, 0.0f, 1.0f}}).xyz;
+			view->max_coordinate = m4_mul_v4(view->frame.voxel_transform, (v4){{1.0f, 1.0f, 1.0f, 1.0f}}).xyz;
 		}
 	}
 
 	/* TODO(rnp): x-z or y-z */
-	view->dirty |= ui->frame_view_render_context->updated;
+	// TODO(rnp): how to track this now? use pipeline handle value?
+	view->dirty |= ui->beamformer_context->render_shader_updated;
 	view->dirty |= view->kind == BeamformerFrameViewKind_3DXPlane;
 
-	b32 result = frame_view_ready_to_present(ui, view) && view->dirty;
+	b32 result = view->dirty;
 	return result;
 }
 
 function void
 update_frame_views(BeamformerUI *ui, Rect window)
 {
-	FrameViewRenderContext *ctx = ui->frame_view_render_context;
-	b32 fbo_bound = 0;
 	for (BeamformerFrameView *view = ui->views; view; view = view->next) {
 		if (view_update(ui, view)) {
-			//start_renderdoc_capture(0);
-
-			if (!fbo_bound) {
-				fbo_bound = 1;
-				glBindFramebuffer(GL_FRAMEBUFFER, ctx->framebuffers[0]);
-				glUseProgram(ctx->shader);
-				glBindVertexArray(ui->unit_cube_model.vao);
-				glEnable(GL_DEPTH_TEST);
-			}
+			BeamformerRenderBeamformedPushConstants pc = {
+				.bounding_box_colour = FRAME_VIEW_BB_COLOUR,
+				.db_cutoff           = view->log_scale->bool32 ? view->dynamic_range.real32 : 0,
+				.threshold           = view->threshold.real32,
+				.gamma               = view->gamma.scaled_real32.val,
+				.positions           = ui->unit_cube_model.model.gpu_pointer,
+				.normals             = ui->unit_cube_model.model.gpu_pointer + ui->unit_cube_model.normals_offset,
+			};
 
-			u32 fb      = ctx->framebuffers[0];
-			u32 program = ctx->shader;
-			glViewport(0, 0, view->texture_dim.w, view->texture_dim.h);
-			glProgramUniform1f(program,  FRAME_VIEW_THRESHOLD_LOC,     view->threshold.real32);
-			glProgramUniform1f(program,  FRAME_VIEW_DYNAMIC_RANGE_LOC, view->dynamic_range.real32);
-			glProgramUniform1f(program,  FRAME_VIEW_GAMMA_LOC,         view->gamma.scaled_real32.val);
-			glProgramUniform1ui(program, FRAME_VIEW_LOG_SCALE_LOC,     view->log_scale->bool32);
+			//start_renderdoc_capture();
 
-			glNamedFramebufferRenderbuffer(fb, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, ctx->renderbuffers[0]);
-			glNamedFramebufferRenderbuffer(fb, GL_DEPTH_ATTACHMENT,  GL_RENDERBUFFER, ctx->renderbuffers[1]);
-			glClearNamedFramebufferfv(fb, GL_COLOR, 0, (f32 []){0, 0, 0, 0});
-			glClearNamedFramebufferfv(fb, GL_DEPTH, 0, (f32 []){1});
+			glSignalSemaphoreEXT(ui->render_semaphores_gl[0], 0, 0, 1, &view->texture, (GLenum []){GL_NONE});
 
+			VulkanHandle cmd = vk_command_begin(VulkanTimeline_Graphics);
+			vk_command_bind_pipeline(cmd, ui->pipelines[BeamformerShaderKind_RenderBeamformed - BeamformerShaderKind_RenderFirst]);
+			vk_command_begin_rendering(cmd, &ui->render_3d_image, &ui->render_3d_depth_image, &view->colour_image);
+			vk_command_viewport(cmd, view->colour_image.width, view->colour_image.height, 0, 0, 0.0f, 1.0f);
+			vk_command_scissor(cmd, view->colour_image.width, view->colour_image.height, 0, 0);
 			if (view->kind == BeamformerFrameViewKind_3DXPlane) {
-				render_3D_xplane(ui, view, program);
+				render_3D_xplane(ui, view, cmd, &pc);
 			} else {
-				render_2D_plane(ui, view, program);
+				render_2D_plane(ui, view, cmd, &pc);
 			}
+			vk_command_end_rendering(cmd);
+			vk_command_end(cmd, ui->render_semaphores[0], ui->render_semaphores[1]);
 
-			/* NOTE(rnp): resolve multisampled scene */
-			glNamedFramebufferTexture(ctx->framebuffers[1], GL_COLOR_ATTACHMENT0, view->texture, 0);
-			glBlitNamedFramebuffer(fb, ctx->framebuffers[1], 0, 0, FRAME_VIEW_RENDER_TARGET_SIZE,
-			                       0, 0, view->texture_dim.w, view->texture_dim.h, GL_COLOR_BUFFER_BIT, GL_NEAREST);
+			glWaitSemaphoreEXT(ui->render_semaphores_gl[1], 0, 0, 1, &view->texture, (GLenum[]){GL_LAYOUT_COLOR_ATTACHMENT_EXT});
 
-			glGenerateTextureMipmap(view->texture);
-			view->dirty = 0;
+			//end_renderdoc_capture();
 
-			//end_renderdoc_capture(0);
+			view->dirty = 0;
 		}
 	}
-	if (fbo_bound) {
-		glBindFramebuffer(GL_FRAMEBUFFER, 0);
-		glViewport((i32)window.pos.x, (i32)window.pos.y, (i32)window.size.w, (i32)window.size.h);
-		/* NOTE(rnp): I don't trust raylib to not mess with us */
-		glBindVertexArray(0);
-		glDisable(GL_DEPTH_TEST);
-	}
 }
 
 function Color
@@ -2420,9 +2438,9 @@ draw_view_ruler(BeamformerFrameView *view, Arena a, Rect view_rect, TextSpec ts)
 {
 	v2 vr_max_p = v2_add(view_rect.pos, view_rect.size);
 
-	v3 U   = view->frame->voxel_transform.c[0].xyz;
-	v3 V   = view->frame->voxel_transform.c[1].xyz;
-	v3 min = view->frame->voxel_transform.c[3].xyz;
+	v3 U   = view->frame.voxel_transform.c[0].xyz;
+	v3 V   = view->frame.voxel_transform.c[1].xyz;
+	v3 min = view->frame.voxel_transform.c[3].xyz;
 
 	v2 start_uv = plane_uv(v3_sub(view->ruler.start, min), U, V);
 	v2 end_uv   = plane_uv(v3_sub(view->ruler.end,   min), U, V);
@@ -2487,7 +2505,7 @@ draw_3D_xplane_frame_view(BeamformerUI *ui, Arena arena, Variable *var, Rect dis
 	assert(var->type == VT_BEAMFORMER_FRAME_VIEW);
 	BeamformerFrameView *view  = var->generic;
 
-	f32 aspect = (f32)view->texture_dim.w / (f32)view->texture_dim.h;
+	f32 aspect = (f32)view->colour_image.width / (f32)view->colour_image.height;
 	Rect vr = display_rect;
 	if (aspect > 1.0f) vr.size.w = vr.size.h;
 	else               vr.size.h = vr.size.w;
@@ -2536,7 +2554,7 @@ draw_3D_xplane_frame_view(BeamformerUI *ui, Arena arena, Variable *var, Rect dis
 		it->hover_t = CLAMP01(it->hover_t);
 	}
 
-	Rectangle  tex_r  = {0, 0, (f32)view->texture_dim.w, (f32)view->texture_dim.h};
+	Rectangle  tex_r  = {0, 0, (f32)view->colour_image.width, (f32)view->colour_image.height};
 	NPatchInfo tex_np = {tex_r, 0, 0, 0, 0, NPATCH_NINE_PATCH};
 	DrawTextureNPatch(make_raylib_texture(view), tex_np, rl_rect(vr), (Vector2){0}, 0, WHITE);
 
@@ -2548,9 +2566,9 @@ draw_beamformer_frame_view(BeamformerUI *ui, Arena a, Variable *var, Rect displa
 {
 	assert(var->type == VT_BEAMFORMER_FRAME_VIEW);
 	BeamformerFrameView *view  = var->generic;
-	BeamformerFrame     *frame = view->frame;
+	BeamformerFrame     *frame = &view->frame;
 
-	b32 is_1d = iv3_dimension(frame->dim) == 1;
+	b32 is_1d = iv3_dimension(frame->points) == 1;
 
 	f32 txt_w = measure_text(ui->small_font, s8("-288.8 mm")).w;
 	f32 scale_bar_size = 1.2f * txt_w + RULER_TICK_LENGTH;
@@ -2603,11 +2621,11 @@ draw_beamformer_frame_view(BeamformerUI *ui, Arena a, Variable *var, Rect displa
 
 	Rectangle tex_r;
 	if (is_1d) {
-		tex_r  = (Rectangle){0, 0, view->texture_dim.x, -view->texture_dim.y};
+		tex_r  = (Rectangle){0, 0, view->colour_image.width, -view->colour_image.height};
 	} else {
 		v2 pixels_per_meter = {
-			.w = (f32)view->texture_dim.w / output_dim.w,
-			.h = (f32)view->texture_dim.h / output_dim.h,
+			.w = (f32)view->colour_image.width  / output_dim.w,
+			.h = (f32)view->colour_image.height / output_dim.h,
 		};
 
 		/* NOTE(rnp): math to resize the texture without stretching when the view changes
@@ -2742,22 +2760,19 @@ push_compute_time(Arena *arena, s8 prefix, f32 time)
 
 function v2
 draw_compute_stats_bar_view(BeamformerUI *ui, Arena arena, ComputeShaderStats *stats,
-                            BeamformerShaderKind *stages, u32 stages_count, f32 compute_time_sum,
-                            TextSpec ts, Rect r, v2 mouse)
+                            f32 compute_time_sum, TextSpec ts, Rect r, v2 mouse)
 {
 	read_only local_persist s8 frame_labels[] = {s8_comp("0:"), s8_comp("-1:"), s8_comp("-2:"), s8_comp("-3:")};
 	f32 total_times[countof(frame_labels)] = {0};
+
+	u32 stages = stats->table.shader_count;
 	Table *table = table_new(&arena, countof(frame_labels), TextAlignment_Right, TextAlignment_Left);
 	for (u32 i = 0; i < countof(frame_labels); i++) {
 		TableCell *cells = table_push_row(table, &arena, TRK_CELLS)->data;
 		cells[0].text = frame_labels[i];
 		u32 frame_index = (stats->latest_frame_index - i) % countof(stats->table.times);
-		u32 seen_shaders = 0;
-		for (u32 j = 0; j < stages_count; j++) {
-			if ((seen_shaders & (1u << stages[j])) == 0)
-				total_times[i] += stats->table.times[frame_index][stages[j]];
-			seen_shaders |= (1u << stages[j]);
-		}
+		for (u32 j = 0; j < stages; j++)
+			total_times[i] += stats->table.times[frame_index][j];
 	}
 
 	v2 result = table_extent(table, arena, ts.font);
@@ -2784,14 +2799,14 @@ draw_compute_stats_bar_view(BeamformerUI *ui, Arena arena, ComputeShaderStats *s
 		Rect rect;
 		rect.pos  = v2_add(cr.pos, (v2){{cr.size.w + table->cell_pad.w , cr.size.h * 0.15f}});
 		rect.size = (v2){.y = 0.7f * cr.size.h};
-		for (u32 i = 0; i < stages_count; i++) {
-			rect.size.w = total_width * stats->table.times[frame_index][stages[i]] / total_times[row_index];
+		for (u32 i = 0; i < stages; i++) {
+			rect.size.w = total_width * stats->table.times[frame_index][i] / total_times[row_index];
 			Color color = colour_from_normalized(g_colour_palette[i % countof(g_colour_palette)]);
 			DrawRectangleRec(rl_rect(rect), color);
 			if (point_in_rect(mouse, rect)) {
 				text_pos   = v2_add(rect.pos, (v2){.x = table->cell_pad.w});
-				s8 name    = push_s8_from_parts(&arena, s8(""), beamformer_shader_names[stages[i]], s8(": "));
-				mouse_text = push_compute_time(&arena, name, stats->table.times[frame_index][stages[i]]);
+				s8 name    = push_s8_from_parts(&arena, s8(""), beamformer_shader_names[stats->table.shader_ids[i]], s8(": "));
+				mouse_text = push_compute_time(&arena, name, stats->table.times[frame_index][i]);
 			}
 			rect.pos.x += rect.size.w;
 		}
@@ -2865,19 +2880,13 @@ draw_compute_stats_view(BeamformerUI *ui, Arena arena, Variable *view, Rect r, v
 	ComputeStatsView   *csv   = &view->compute_stats_view;
 	ComputeShaderStats *stats = csv->compute_shader_stats;
 	f32 compute_time_sum = 0;
-	u32 stages           = cp->pipeline.shader_count;
+	u32 stages           = stats->table.shader_count;
 	TextSpec text_spec   = {.font = &ui->font, .colour = FG_COLOUR, .flags = TF_LIMITED};
 
 	ui_blinker_update(&csv->blink, BLINK_SPEED);
 
-	static_assert(BeamformerShaderKind_ComputeCount <= 32, "shader kind bitfield test");
-	u32 seen_shaders = 0;
-	for (u32 i = 0; i < stages; i++) {
-		BeamformerShaderKind index = cp->pipeline.shaders[i];
-		if ((seen_shaders & (1u << index)) == 0)
-			compute_time_sum += stats->average_times[index];
-		seen_shaders |= (1u << index);
-	}
+	for (u32 index = 0; index < stages; index++)
+		compute_time_sum += stats->average_times[index];
 
 	v2 result = {0};
 
@@ -2886,13 +2895,12 @@ draw_compute_stats_view(BeamformerUI *ui, Arena arena, Variable *view, Rect r, v
 	case ComputeStatsViewKind_Average:{
 		da_reserve(&arena, table, stages);
 		for (u32 i = 0; i < stages; i++) {
-			push_table_time_row(table, &arena, beamformer_shader_names[cp->pipeline.shaders[i]],
-			                    stats->average_times[cp->pipeline.shaders[i]]);
+			push_table_time_row(table, &arena, beamformer_shader_names[stats->table.shader_ids[i]],
+			                    stats->average_times[i]);
 		}
 	}break;
 	case ComputeStatsViewKind_Bar:{
-		result = draw_compute_stats_bar_view(ui, arena, stats, cp->pipeline.shaders, stages,
-		                                     compute_time_sum, text_spec, r, mouse);
+		result = draw_compute_stats_bar_view(ui, arena, stats, compute_time_sum, text_spec, r, mouse);
 		r.pos = v2_add(r.pos, (v2){.y = result.y});
 	}break;
 	InvalidDefaultCase;
@@ -2920,9 +2928,9 @@ draw_compute_stats_view(BeamformerUI *ui, Arena arena, Variable *view, Rect r, v
 			cell_rect.size.w = t->widths[column];
 			text_spec.limits.size.w = r.size.w - (cell_rect.pos.x - it->start_x);
 
-			if (column == 0 && row_index < stages && cp->programs[row_index] == 0 &&
-			    cp->pipeline.shaders[row_index] != BeamformerShaderKind_CudaHilbert &&
-			    cp->pipeline.shaders[row_index] != BeamformerShaderKind_CudaDecode)
+			if (column == 0 && row_index < stages && vk_pipeline_valid(cp->vulkan_pipelines[row_index]) == 0 &&
+			    stats->table.shader_ids[row_index] != BeamformerShaderKind_CudaHilbert &&
+			    stats->table.shader_ids[row_index] != BeamformerShaderKind_CudaDecode)
 			{
 				text_spec.colour = v4_lerp(FG_COLOUR, FOCUSED_COLOUR, ease_in_out_quartic(csv->blink.t));
 			} else {
@@ -3707,7 +3715,7 @@ ui_begin_interact(BeamformerUI *ui, v2 mouse, b32 scroll)
 						switch (++bv->ruler.state) {
 						case RulerState_Start:{
 							hot.kind = InteractionKind_Ruler;
-							bv->ruler.start = world_point_from_plane_uv(bv->frame->voxel_transform,
+							bv->ruler.start = world_point_from_plane_uv(bv->frame.voxel_transform,
 							                                            rect_uv(mouse, hot.rect));
 						}break;
 						case RulerState_Hold:{}break;
@@ -3791,7 +3799,7 @@ ui_extra_actions(BeamformerUI *ui, Variable *var)
 
 			ui_beamformer_frame_view_release_subresources(ui, old, last_kind);
 			ui_beamformer_frame_view_convert(ui, &ui->arena, view->child, view->menu, old->kind, old, log_scale);
-			if (new->kind == BeamformerFrameViewKind_Copy && old->frame)
+			if (new->kind == BeamformerFrameViewKind_Copy)
 				ui_beamformer_frame_view_copy_frame(ui, new, old);
 
 			DLLRemove(old);
@@ -3947,7 +3955,7 @@ ui_interact(BeamformerUI *ui, BeamformerInput *input, Rect window_rect)
 		assert(it->var->type == VT_BEAMFORMER_FRAME_VIEW);
 		BeamformerFrameView *bv = it->var->generic;
 		v2 mouse = clamp_v2_rect(input_mouse, it->rect);
-		bv->ruler.end = world_point_from_plane_uv(bv->frame->voxel_transform, rect_uv(mouse, it->rect));
+		bv->ruler.end = world_point_from_plane_uv(bv->frame.voxel_transform, rect_uv(mouse, it->rect));
 	}break;
 	case InteractionKind_Drag:{
 		if (!IsMouseButtonDown(MOUSE_BUTTON_LEFT) && !IsMouseButtonDown(MOUSE_BUTTON_RIGHT)) {
@@ -4037,8 +4045,6 @@ ui_init(BeamformerCtx *ctx, Arena store)
 	if (!ui) {
 		ui = ctx->ui = push_struct(&store, typeof(*ui));
 		ui->arena = store;
-		ui->frame_view_render_context = &ctx->frame_view_render_context;
-		ui->unit_cube_model = ctx->compute_context.unit_cube_model;
 		ui->shared_memory   = ctx->shared_memory;
 		ui->beamformer_context = ctx;
 
@@ -4072,9 +4078,130 @@ ui_init(BeamformerCtx *ctx, Arena store)
 		split->region_split.left  = add_compute_progress_bar(split, ctx);
 		split->region_split.right = add_compute_stats_view(ui, split, &ui->arena, ctx);
 
+		u32 samples = vk_gpu_info()->max_msaa_samples;
+		vk_image_allocate(&ui->render_3d_image,       FRAME_VIEW_RENDER_TARGET_SIZE, 1, samples, VulkanImageUsage_Colour,       0, 0);
+		vk_image_allocate(&ui->render_3d_depth_image, FRAME_VIEW_RENDER_TARGET_SIZE, 1, samples, VulkanImageUsage_DepthStencil, 0, 0);
+
+		glGenSemaphoresEXT(countof(ui->render_semaphores_gl), ui->render_semaphores_gl);
+		for EachElement(ui->render_semaphores, it)
+			ui->render_semaphores[it] = vk_create_semaphore(ui->render_semaphores_export + it);
+
+		if (OS_WINDOWS) {
+			glImportSemaphoreWin32HandleEXT(ui->render_semaphores_gl[0], GL_HANDLE_TYPE_OPAQUE_WIN32_EXT, (void *)ui->render_semaphores_export[0].value[0]);
+			glImportSemaphoreWin32HandleEXT(ui->render_semaphores_gl[1], GL_HANDLE_TYPE_OPAQUE_WIN32_EXT, (void *)ui->render_semaphores_export[1].value[0]);
+		} else {
+			glImportSemaphoreFdEXT(ui->render_semaphores_gl[0], GL_HANDLE_TYPE_OPAQUE_FD_EXT, ui->render_semaphores_export[0].value[0]);
+			glImportSemaphoreFdEXT(ui->render_semaphores_gl[1], GL_HANDLE_TYPE_OPAQUE_FD_EXT, ui->render_semaphores_export[1].value[0]);
+			ui->render_semaphores_export[0].value[0] = OSInvalidHandleValue;
+			ui->render_semaphores_export[1].value[0] = OSInvalidHandleValue;
+		}
+
+		if (!BakeShaders)
+		{
+			for EachElement(beamformer_reloadable_render_shader_info_indices, it) {
+				i32 index = beamformer_reloadable_render_shader_info_indices[it];
+				for (u32 i = 0; i < 2; i++) {
+					BeamformerFileReloadContext *frc = push_struct(&ui->arena, typeof(*frc));
+					frc->kind                   = BeamformerFileReloadKind_RenderShader;
+					frc->shader_reload.shader   = beamformer_reloadable_shader_kinds[index];
+					frc->shader_reload.pipeline = ui->pipelines + it;
+
+					Arena scratch = ui->arena;
+					s8 file = push_s8_from_parts(&scratch, os_path_separator(), s8("shaders"),
+					                             beamformer_reloadable_shader_files[index][i]);
+
+					os_add_file_watch((char *)file.data, file.len, frc);
+				}
+			}
+		}
+
+		f32 unit_cube_vertices[] = {
+			 0.5f,  0.5f, -0.5f, 0.0f,
+			 0.5f,  0.5f, -0.5f, 0.0f,
+			 0.5f,  0.5f, -0.5f, 0.0f,
+			 0.5f, -0.5f, -0.5f, 0.0f,
+			 0.5f, -0.5f, -0.5f, 0.0f,
+			 0.5f, -0.5f, -0.5f, 0.0f,
+			 0.5f,  0.5f,  0.5f, 0.0f,
+			 0.5f,  0.5f,  0.5f, 0.0f,
+			 0.5f,  0.5f,  0.5f, 0.0f,
+			 0.5f, -0.5f,  0.5f, 0.0f,
+			 0.5f, -0.5f,  0.5f, 0.0f,
+			 0.5f, -0.5f,  0.5f, 0.0f,
+			-0.5f,  0.5f, -0.5f, 0.0f,
+			-0.5f,  0.5f, -0.5f, 0.0f,
+			-0.5f,  0.5f, -0.5f, 0.0f,
+			-0.5f, -0.5f, -0.5f, 0.0f,
+			-0.5f, -0.5f, -0.5f, 0.0f,
+			-0.5f, -0.5f, -0.5f, 0.0f,
+			-0.5f,  0.5f,  0.5f, 0.0f,
+			-0.5f,  0.5f,  0.5f, 0.0f,
+			-0.5f,  0.5f,  0.5f, 0.0f,
+			-0.5f, -0.5f,  0.5f, 0.0f,
+			-0.5f, -0.5f,  0.5f, 0.0f,
+			-0.5f, -0.5f,  0.5f, 0.0f,
+		};
+		f32 unit_cube_normals[] = {
+			 0.0f,  0.0f, -1.0f, 0.0f,
+			 0.0f,  1.0f,  0.0f, 0.0f,
+			 1.0f,  0.0f,  0.0f, 0.0f,
+			 0.0f,  0.0f, -1.0f, 0.0f,
+			 0.0f, -1.0f,  0.0f, 0.0f,
+			 1.0f,  0.0f,  0.0f, 0.0f,
+			 0.0f,  0.0f,  1.0f, 0.0f,
+			 0.0f,  1.0f,  0.0f, 0.0f,
+			 1.0f,  0.0f,  0.0f, 0.0f,
+			 0.0f,  0.0f,  1.0f, 0.0f,
+			 0.0f, -1.0f,  0.0f, 0.0f,
+			 1.0f,  0.0f,  0.0f, 0.0f,
+			 0.0f,  0.0f, -1.0f, 0.0f,
+			 0.0f,  1.0f,  0.0f, 0.0f,
+			-1.0f,  0.0f,  0.0f, 0.0f,
+			 0.0f,  0.0f, -1.0f, 0.0f,
+			 0.0f, -1.0f,  0.0f, 0.0f,
+			-1.0f,  0.0f,  0.0f, 0.0f,
+			 0.0f,  0.0f,  1.0f, 0.0f,
+			 0.0f,  1.0f,  0.0f, 0.0f,
+			-1.0f,  0.0f,  0.0f, 0.0f,
+			 0.0f,  0.0f,  1.0f, 0.0f,
+			 0.0f, -1.0f,  0.0f, 0.0f,
+			-1.0f,  0.0f,  0.0f, 0.0f,
+		};
+		u16 unit_cube_indices[] = {
+			1,  13, 19,
+			1,  19, 7,
+			9,  6,  18,
+			9,  18, 21,
+			23, 20, 14,
+			23, 14, 17,
+			16, 4,  10,
+			16, 10, 22,
+			5,  2,  8,
+			5,  8,  11,
+			15, 12, 0,
+			15, 0,  3
+		};
+
+		static_assert(countof(unit_cube_normals) == countof(unit_cube_vertices), "");
+
+		RenderModel *rm = &ui->unit_cube_model;
+		rm->vertex_count   = countof(unit_cube_vertices) / 4;
+		rm->normals_offset = round_up_to(sizeof(unit_cube_vertices), 16);
+
+		u64 model_size = 2 * round_up_to(sizeof(unit_cube_vertices), 16);
+		vk_render_model_allocate(&rm->model, unit_cube_indices, countof(unit_cube_indices), model_size, s8("unit_cube_model"));
+		vk_render_model_range_upload(&rm->model, unit_cube_vertices, 0,                  sizeof(unit_cube_vertices), 0);
+		vk_render_model_range_upload(&rm->model, unit_cube_normals,  rm->normals_offset, sizeof(unit_cube_normals),  0);
+
 		/* NOTE(rnp): shrink variable size once this fires */
 		assert((uz)(ui->arena.beg - (u8 *)ui) < KB(64));
 	}
+
+	for EachElement(beamformer_reloadable_render_shader_info_indices, it) {
+		i32 index = beamformer_reloadable_render_shader_info_indices[it];
+		BeamformerShaderKind shader = beamformer_reloadable_shader_kinds[index];
+		beamformer_reload_render_pipeline(ui->pipelines + it, shader, ui->arena);
+	}
 }
 
 function void
@@ -4091,8 +4218,15 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformerFrame *frame_to_dr
 {
 	BeamformerUI *ui = ctx->ui;
 
-	ui->latest_plane[BeamformerViewPlaneTag_Count] = frame_to_draw;
-	ui->latest_plane[frame_plane]                  = frame_to_draw;
+	if (frame_to_draw) {
+		mem_copy(ui->latest_plane + BeamformerViewPlaneTag_Count, frame_to_draw, sizeof(*frame_to_draw));
+		mem_copy(ui->latest_plane + frame_plane,                  frame_to_draw, sizeof(*frame_to_draw));
+		ui->latest_plane_valid[BeamformerViewPlaneTag_Count] = 1;
+		ui->latest_plane_valid[frame_plane]                  = 1;
+	} else {
+		ui->latest_plane_valid[BeamformerViewPlaneTag_Count] = 0;
+		ui->latest_plane_valid[frame_plane]                  = 0;
+	}
 
 	asan_poison_region(ui->arena.beg, ui->arena.end - ui->arena.beg);
 
@@ -4151,7 +4285,7 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformerFrame *frame_to_dr
 			if (pb) {
 				ui->flush_params = 0;
 
-				iv3 points    = ctx->latest_frame->dim;
+				iv3 points    = ctx->latest_frame->points;
 				i32 dimension = iv3_dimension(points);
 
 				// TODO(rnp): this is immediate mode code that should be in the ui building code
diff --git a/util.c b/util.c
@@ -607,14 +607,6 @@ s8_scan_backwards(s8 s, u8 byte)
 }
 
 function s8
-s8_trim_trailing(s8 s, u8 byte)
-{
-	s8 result = s;
-	while (result.len >= 1 && result.data[result.len - 1] == byte) result.len--;
-	return result;
-}
-
-function s8
 s8_cut_head(s8 s, iz cut)
 {
 	s8 result = s;
diff --git a/util.h b/util.h
@@ -377,18 +377,20 @@ typedef OS_WRITE_NEW_FILE_FN(os_write_new_file_fn);
 #define RENDERDOC_GET_API_FN(name) b32 name(u32 version, void **out_api)
 typedef RENDERDOC_GET_API_FN(renderdoc_get_api_fn);
 
-#define RENDERDOC_START_FRAME_CAPTURE_FN(name) void name(iptr gl_context, iptr window_handle)
+#define RENDERDOC_START_FRAME_CAPTURE_FN(name) void name(void *instance_handle, iptr window_handle)
 typedef RENDERDOC_START_FRAME_CAPTURE_FN(renderdoc_start_frame_capture_fn);
 
-#define RENDERDOC_END_FRAME_CAPTURE_FN(name) b32 name(iptr gl_context, iptr window_handle)
+#define RENDERDOC_END_FRAME_CAPTURE_FN(name) b32 name(void *instance_handle, iptr window_handle)
 typedef RENDERDOC_END_FRAME_CAPTURE_FN(renderdoc_end_frame_capture_fn);
 
-typedef alignas(16) u8 RenderDocAPI[216];
-#define RENDERDOC_API_FN_ADDR(a, offset) (*(iptr *)((*a) + offset))
-#define RENDERDOC_START_FRAME_CAPTURE(a) (renderdoc_start_frame_capture_fn *)RENDERDOC_API_FN_ADDR(a, 152)
-#define RENDERDOC_END_FRAME_CAPTURE(a)   (renderdoc_end_frame_capture_fn *)  RENDERDOC_API_FN_ADDR(a, 168)
+#define RENDERDOC_SET_CAPTURE_PATH_TEMPLATE_FN(name) void name(const char *template)
+typedef RENDERDOC_SET_CAPTURE_PATH_TEMPLATE_FN(renderdoc_set_capture_path_template_fn);
 
-#define LABEL_GL_OBJECT(type, id, s) {s8 _s = (s); glObjectLabel(type, id, (i32)_s.len, (c8 *)_s.data);}
+typedef alignas(16) u8 RenderDocAPI[216]; /* opaque view of RENDERDOC_API_1_6_0: 27 function pointers (assumes 8 byte pointers) */
+#define RENDERDOC_API_FN_ADDR(a, offset)       (*(iptr *)((*a) + offset)) /* offset is in bytes from the start of the table */
+#define RENDERDOC_START_FRAME_CAPTURE(a)       (renderdoc_start_frame_capture_fn *)       RENDERDOC_API_FN_ADDR(a, 152)
+#define RENDERDOC_END_FRAME_CAPTURE(a)         (renderdoc_end_frame_capture_fn *)         RENDERDOC_API_FN_ADDR(a, 168)
+#define RENDERDOC_SET_CAPTURE_PATH_TEMPLATE(a) (renderdoc_set_capture_path_template_fn *) RENDERDOC_API_FN_ADDR(a, 88) /* 12th entry; offset 184 is SetCaptureFileComments */
 
 #include "util.c"
 #include "math.c"
diff --git a/util_gl.c b/util_gl.c
@@ -1,69 +0,0 @@
-/* See LICENSE for license details. */
-function u32
-compile_shader(Arena a, u32 type, s8 shader, s8 name)
-{
-	u32 sid = glCreateShader(type);
-	glShaderSource(sid, 1, (const char **)&shader.data, (int *)&shader.len);
-	glCompileShader(sid);
-
-	i32 res = 0;
-	glGetShaderiv(sid, GL_COMPILE_STATUS, &res);
-
-	if (res == GL_FALSE) {
-		Stream buf = arena_stream(a);
-		stream_append_s8s(&buf, s8("\n"), name, s8(": failed to compile\n"));
-
-		i32 len = 0, out_len = 0;
-		glGetShaderiv(sid, GL_INFO_LOG_LENGTH, &len);
-		glGetShaderInfoLog(sid, len, &out_len, (char *)(buf.data + buf.widx));
-		stream_commit(&buf, out_len);
-		glDeleteShader(sid);
-		os_console_log(buf.data, buf.widx);
-
-		sid = 0;
-	}
-
-	return sid;
-}
-
-function u32
-link_program(Arena a, u32 *shader_ids, i32 shader_id_count)
-{
-	i32 success = 0;
-	u32 result  = glCreateProgram();
-	for (i32 i = 0; i < shader_id_count; i++)
-		glAttachShader(result, shader_ids[i]);
-	glLinkProgram(result);
-	glGetProgramiv(result, GL_LINK_STATUS, &success);
-	if (success == GL_FALSE) {
-		i32 len    = 0;
-		Stream buf = arena_stream(a);
-		stream_append_s8(&buf, s8("shader link error: "));
-		glGetProgramInfoLog(result, buf.cap - buf.widx, &len, (c8 *)(buf.data + buf.widx));
-		stream_reset(&buf, len);
-		stream_append_byte(&buf, '\n');
-		os_console_log(buf.data, buf.widx);
-		glDeleteProgram(result);
-		result = 0;
-	}
-	return result;
-}
-
-function u32
-load_shader(Arena arena, s8 *shader_texts, u32 *shader_types, i32 count, s8 name)
-{
-	u32 result = 0;
-	u32 *ids   = push_array(&arena, u32, count);
-	b32 valid  = 1;
-	for (i32 i = 0; i < count; i++) {
-		ids[i]  = compile_shader(arena, shader_types[i], shader_texts[i], name);
-		valid  &= ids[i] != 0;
-	}
-
-	if (valid) result = link_program(arena, ids, count);
-	for (i32 i = 0; i < count; i++) glDeleteShader(ids[i]);
-
-	if (result) glObjectLabel(GL_PROGRAM, result, (i32)name.len, (c8 *)name.data);
-
-	return result;
-}
diff --git a/util_os.c b/util_os.c
@@ -24,3 +24,21 @@ release_lock(i32 *lock)
 	atomic_store_u32(lock, 0);
 	os_wake_all_waiters(lock);
 }
+
+#if BEAMFORMER_RENDERDOC_HOOKS
+function void
+load_renderdoc_functions(BeamformerInput *input, OSLibrary rdoc)
+{
+	// NOTE: fills the renderdoc hooks in input from the library rdoc. when the handle is invalid,
+	// the symbol is missing, or the API request fails the hooks in input are left untouched.
+	renderdoc_get_api_fn *query_api = 0;
+	if ValidHandle(rdoc) query_api = os_lookup_symbol(rdoc, "RENDERDOC_GetAPI");
+
+	RenderDocAPI *table = 0; // written by RENDERDOC_GetAPI; 10600 requests API version 1.6.0
+	if (query_api && query_api(10600, (void **)&table)) {
+		input->renderdoc_start_frame_capture            = RENDERDOC_START_FRAME_CAPTURE(table);
+		input->renderdoc_end_frame_capture              = RENDERDOC_END_FRAME_CAPTURE(table);
+		input->renderdoc_set_capture_file_path_template = RENDERDOC_SET_CAPTURE_PATH_TEMPLATE(table);
+	}
+}
+#endif
diff --git a/vulkan.c b/vulkan.c
@@ -1,3 +1,8 @@
+/* See LICENSE for license details. */
+// TODO(rnp)
+// [ ]: what is needed for HDR? I think it makes sense to just default to it nowadays
+// [ ]: once opengl is removed switch images to SRGB and/or 16 bit Float
+
 #include "beamformer_internal.h"
 #include "vulkan.h"
 #include "external/glslang/glslang/Include/glslang_c_interface.h"
@@ -7,6 +12,9 @@
 
 #define ValidVulkanHandle(h) ((h).value[0] != 0)
 
+#define MaxCommandBuffersInFlight  BeamformerMaxRawDataFramesInFlight
+#define MaxCommandBufferTimestamps (64)
+
 typedef enum {
 	VulkanQueueKind_Graphics,
 	VulkanQueueKind_Compute,
@@ -28,17 +36,44 @@ typedef struct {
 	void *            host_pointer;
 
 	VulkanMemoryKind  memory_kind;
+
+	// NOTE: only used when the buffer is backing a VulkanRenderModel.
+	VkIndexType       index_type;
 } VulkanBuffer;
 
 typedef struct {
-	VkPipeline       pipeline;
-	VkPipelineLayout layout;
-} VulkanShader;
+	VkDeviceMemory    memory;
+	VkImage           image;
+	VkImageView       view;
+} VulkanImage;
+
+typedef struct {
+	VkPipeline         pipeline;
+	VkPipelineLayout   layout;
+	VkShaderStageFlags stage_flags;
+} VulkanPipeline;
+
+typedef struct {
+	VkSemaphore semaphore;
+	u64         value;
+} VulkanSemaphore;
+
+typedef struct {
+	VulkanQueueKind kind;
+	u32             command_buffer_index;
+
+	// NOTE(rnp): since there may not be QueueKind_Count queues, when putting values into this
+	// array you must be careful to map through the queue_indices array in the vulkan_context.
+	u64 in_flight_wait_values[VulkanQueueKind_Count];
+} VulkanCommandBuffer;
 
 typedef enum {
 	VulkanEntityKind_Buffer,
+	VulkanEntityKind_CommandBuffer,
+	VulkanEntityKind_Image,
+	VulkanEntityKind_Pipeline,
+	VulkanEntityKind_RenderModel,
 	VulkanEntityKind_Semaphore,
-	VulkanEntityKind_Shader,
 } VulkanEntityKind;
 
 typedef struct VulkanEntity VulkanEntity;
@@ -46,9 +81,11 @@ struct VulkanEntity {
 	VulkanEntity *   next;
 	VulkanEntityKind kind;
 	union {
-		VulkanBuffer buffer;
-		VkSemaphore  semaphore;
-		VulkanShader shader;
+		VulkanBuffer        buffer;
+		VulkanCommandBuffer command_buffer;
+		VulkanImage         image;
+		VulkanPipeline      pipeline;
+		VulkanSemaphore     semaphore;
 	} as;
 };
 
@@ -59,10 +96,21 @@ typedef alignas(64) struct {
 	u16     queue_index;
 	VkQueue queue;
 
-	u8      _pad[48];
+	VkQueryPool     query_pool;
+	u32             query_pool_occupied[VulkanQueueKind_Count];
+
+	u32             next_command_buffer_index;
+	VkCommandPool   command_pool;
+	VkCommandBuffer command_buffers[MaxCommandBuffersInFlight];
+	u64             command_buffer_submission_values[MaxCommandBuffersInFlight];
+
+	VulkanSemaphore timeline_semaphore;
+
+	VkPipelineStageFlags2 pipeline_stage_flags;
+
+	VulkanPipeline *bound_pipeline;
 } VulkanQueue;
-static_assert(sizeof(VulkanQueue) == 64 && alignof(VulkanQueue) == 64,
-              "VulkanQueue must be placed on its own cacheline");
+static_assert(alignof(VulkanQueue) == 64, "VulkanQueue must be placed on its own cacheline");
 
 typedef struct {
 	Arena             arena;
@@ -72,8 +120,9 @@ typedef struct {
 	VkDevice          device;
 	VkPhysicalDevice  physical_device;
 
-	// NOTE(rnp): fallback for when a compute shader fails to compile
-	VulkanShader      default_compute_shader;
+	// NOTE(rnp): fallback for when a shader fails to compile
+	VulkanPipeline    default_compute_pipeline;
+	VulkanPipeline    default_graphics_pipeline;
 
 	GPUInfo           gpu_info;
 
@@ -88,6 +137,14 @@ typedef struct {
 	} memory_info;
 
 	VulkanQueue *     queues[VulkanQueueKind_Count];
+	// NOTE(rnp): there are a few places in the code where simply going through the queues map
+	// is not sufficient. those places need to know of the unique queues which unique queue
+	// is being referred to. that code uses this map instead.
+	u16               queue_indices[VulkanQueueKind_Count];
+	u16               unique_queues;
+
+	VkFormat          swap_chain_image_format;
+	VkFormat          depth_stencil_format;
 
 	VulkanEntity *    entity_freelist;
 	Arena             entity_arena;
@@ -110,8 +167,11 @@ read_only global const char *vk_required_instance_extensions[] = {
 #endif
 
 #define VK_REQUIRED_DEVICE_EXTENSIONS_LIST \
+	X("VK_KHR_16bit_storage") \
 	X("VK_KHR_external_memory") \
 	X("VK_KHR_external_semaphore") \
+	X("VK_KHR_storage_buffer_storage_class") \
+	X("VK_KHR_timeline_semaphore") \
 	VK_OS_REQUIRED_DEVICE_EXTENSIONS_LIST
 
 #define X(str) str,
@@ -126,6 +186,42 @@ VK_REQUIRED_DEVICE_EXTENSIONS_LIST
 };
 #undef X
 
+#define VK_REQUIRED_PHYSICAL_FEATURES \
+	X(shaderInt16) \
+	X(shaderInt64) \
+
+#define VK_REQUIRED_PHYSICAL_11_FEATURES \
+	X(storageBuffer16BitAccess) \
+
+#define VK_REQUIRED_PHYSICAL_12_FEATURES \
+	X(bufferDeviceAddress) \
+	X(shaderFloat16) \
+	X(timelineSemaphore) \
+
+#define VK_REQUIRED_PHYSICAL_13_FEATURES \
+	X(dynamicRendering) \
+	X(synchronization2) \
+
+#define VK_DEBUG_EXTENSIONS \
+	X(VK_KHR, shader_non_semantic_info) \
+	X(VK_KHR, shader_relaxed_extended_instruction) \
+
+#define X(p, s, ...) #p "_" #s, /* full extension name, e.g. "VK_KHR_shader_non_semantic_info" */
+read_only global const char *vk_debug_extensions[] = {VK_DEBUG_EXTENSIONS};
+#undef X
+#define X(p, s, ...) sizeof(#p "_" #s) - 1, /* name length without the terminating NUL */
+read_only global u32 vk_debug_extension_name_lengths[] = {VK_DEBUG_EXTENSIONS};
+#undef X
+/* NOTE: one b8 flag per entry of VK_DEBUG_EXTENSIONS; reachable by name or, through E, by list index */
+global union {
+	struct {
+		#define X(_, name, ...) b8 name;
+		VK_DEBUG_EXTENSIONS
+		#undef X
+	};
+	b8 E[countof(vk_debug_extensions)]; // same order as vk_debug_extensions
+} vulkan_debug;
+
 global VulkanContext vulkan_context[1];
 
 /* NOTE(rnp): the idea here is to set reasonable development constraints.
@@ -214,6 +310,15 @@ global glslang_resource_t glslc_resource_constraints[1] = {{
 	},
 }};
 
+
+#if BEAMFORMER_RENDERDOC_HOOKS
+DEBUG_IMPORT void * // returns the first pointer sized word stored behind the VkInstance handle
+vk_renderdoc_instance_handle(void)
+{
+	return ((void **)vulkan_context->handle)[0]; // presumably RenderDoc's device pointer convention for a VkInstance - TODO confirm
+}
+#endif
+
 function VulkanEntity *
 vk_entity_allocate(VulkanEntityKind kind)
 {
@@ -246,6 +351,16 @@ vk_entity_data(VulkanHandle h, VulkanEntityKind kind)
 	return &e->as;
 }
 
+function VkCommandBuffer
+vk_command_buffer(VulkanHandle h)
+{
+	// NOTE: the entity only records a queue kind and a slot; the VkCommandBuffer lives in the queue
+	VulkanCommandBuffer *entry = vk_entity_data(h, VulkanEntityKind_CommandBuffer);
+	VulkanQueue         *queue = vulkan_context->queues[entry->kind];
+
+	return queue->command_buffers[entry->command_buffer_index];
+}
+
 #define glslang_log(a, ...) glslang_log_(a, arg_list(s8, __VA_ARGS__))
 function void
 glslang_log_(Arena arena, s8 *items, uz count)
@@ -253,8 +368,8 @@ glslang_log_(Arena arena, s8 *items, uz count)
 	Stream sb = arena_stream(arena);
 	stream_append_s8(&sb, glslang_info(""));
 	stream_append_s8s_(&sb, items, count);
-	s8 log = s8_trim_trailing(stream_to_s8(&sb), '\n');
-	os_console_log(log.data, log.len);
+	if (sb.data[sb.widx - 1] != '\n') stream_append_byte(&sb, '\n');
+	os_console_log(sb.data, sb.widx);
 }
 
 function s8
@@ -271,7 +386,7 @@ glsl_to_spirv(Arena *arena, u32 kind, s8 shader_text, s8 name)
 		.target_language                   = GLSLANG_TARGET_SPV,
 		.target_language_version           = GLSLANG_TARGET_SPV_1_6,
 		.code                              = (c8 *)shader_text.data,
-		.default_version                   = 100,
+		.default_version                   = 460,
 		.default_profile                   = GLSLANG_NO_PROFILE,
 		.force_default_version_and_profile = 0,
 		.forward_compatible                = 0,
@@ -302,13 +417,13 @@ glsl_to_spirv(Arena *arena, u32 kind, s8 shader_text, s8 name)
 		glslang_program_add_shader(program, shader);
 		i32 messages = GLSLANG_MSG_DEBUG_INFO_BIT|GLSLANG_MSG_SPV_RULES_BIT|GLSLANG_MSG_VULKAN_RULES_BIT;
 		if (glslang_program_link(program, messages)) {
-			glslang_spv_options_t options = {
-				.validate            = 1,
-				.generate_debug_info = 1,
-				.emit_nonsemantic_shader_debug_info = 1,
-				.emit_nonsemantic_shader_debug_source = 1,
-				//.disable_optimizer   = 1,
-			};
+			glslang_spv_options_t options = {.validate = 1,};
+
+			if (vulkan_debug.shader_non_semantic_info) {
+				options.generate_debug_info                  = 1;
+				options.emit_nonsemantic_shader_debug_info   = 1;
+				options.emit_nonsemantic_shader_debug_source = 1;
+			}
 
 			glslang_program_add_source_text(program, kind, (c8 *)shader_text.data, shader_text.len);
 			glslang_program_SPIRV_generate_with_options(program, kind, &options);
@@ -342,7 +457,7 @@ vk_shader_kind_to_glslang_shader_kind(u32 kind)
 function VkShaderModule
 vk_compile_shader_module(Arena arena, u32 kind, s8 text, s8 name)
 {
-	VkShaderModule result = 0;
+	VkShaderModule result = {0};
 	s8 spirv = glsl_to_spirv(&arena, vk_shader_kind_to_glslang_shader_kind(kind), text, name);
 	VkShaderModuleCreateInfo create_info = {
 		.sType    = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
@@ -350,19 +465,45 @@ vk_compile_shader_module(Arena arena, u32 kind, s8 text, s8 name)
 		.pCode    = (u32 *)spirv.data,
 	};
 	if (spirv.len > 0) vkCreateShaderModule(vulkan_context->device, &create_info, 0, &result);
+
+	return result;
+}
+
+function VkShaderStageFlags // maps a VulkanShaderKind to its VK_SHADER_STAGE_* bit; 0 when there is no mapping
+vk_stage_flags_from_shader_kind(VulkanShaderKind kind)
+{
+	read_only local_persist VkShaderStageFlags map[VulkanShaderKind_Count + 1] = {
+		[VulkanShaderKind_Vertex]   = VK_SHADER_STAGE_VERTEX_BIT,
+		[VulkanShaderKind_Mesh]     = VK_SHADER_STAGE_MESH_BIT_EXT,
+		[VulkanShaderKind_Fragment] = VK_SHADER_STAGE_FRAGMENT_BIT,
+		[VulkanShaderKind_Compute]  = VK_SHADER_STAGE_COMPUTE_BIT,
+		[VulkanShaderKind_Count]    = 0, // extra slot: out of range kinds land here
+	};
+	VkShaderStageFlags result = map[Clamp((u32)kind, 0, VulkanShaderKind_Count)]; // the clamp keeps the index inside map
 	return result;
 }
 
-function VulkanShader
-vk_compute_pipeline_from_shader_text(Arena arena, s8 text, s8 name)
+function VulkanPipeline
+vk_compute_pipeline_from_shader_text(Arena arena, s8 text, s8 name, u32 push_constants_size)
 {
-	VulkanShader result = {0};
+	VulkanPipeline result = {.stage_flags = VK_SHADER_STAGE_COMPUTE_BIT};
 	VkShaderModule module = vk_compile_shader_module(arena, VK_SHADER_STAGE_COMPUTE_BIT, text, name);
 	if (module) {
-		VkPipelineLayoutCreateInfo pli = {.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO};
-		vkCreatePipelineLayout(vulkan_context->device, &pli, 0, &result.layout);
+		VkPushConstantRange push_constant_range = {
+			.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+			.offset     = 0,
+			.size       = push_constants_size,
+		};
+
+		VkPipelineLayoutCreateInfo pipeline_layout_create_info = {
+			.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+			.pushConstantRangeCount = push_constants_size ? 1 : 0,
+			.pPushConstantRanges    = push_constants_size ? &push_constant_range : 0,
+		};
+
+		vkCreatePipelineLayout(vulkan_context->device, &pipeline_layout_create_info, 0, &result.layout);
 
-		VkComputePipelineCreateInfo pi = {
+		VkComputePipelineCreateInfo pipeline_create_info = {
 			.sType  = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
 			.layout = result.layout,
 			.stage  = {
@@ -373,10 +514,376 @@ vk_compute_pipeline_from_shader_text(Arena arena, s8 text, s8 name)
 			},
 		};
 
-		vkCreateComputePipelines(vulkan_context->device, 0, 1, &pi, 0, &result.pipeline);
+		vkCreateComputePipelines(vulkan_context->device, 0, 1, &pipeline_create_info, 0, &result.pipeline);
 		vkDestroyShaderModule(vulkan_context->device, module, 0);
 	}
+	if (result.pipeline == 0) result = vulkan_context->default_compute_pipeline;
+
+	return result;
+}
+
+function VulkanPipeline // compiles two shader stages into a graphics pipeline; yields default_graphics_pipeline when no pipeline was created
+vk_graphics_pipeline_from_infos(Arena arena, VulkanPipelineCreateInfo *infos, u32 count, u32 push_constants_size)
+{
+	assume(count == 2); // modules[] and shader_stage_create_infos[] below are hard coded for two stages
+
+	VulkanPipeline result = {0};
+	VkShaderModule modules[2];
+
+	modules[0] = vk_compile_shader_module(arena, vk_stage_flags_from_shader_kind(infos[0].kind),
+	                                      infos[0].text, infos[0].name);
+	modules[1] = vk_compile_shader_module(arena, vk_stage_flags_from_shader_kind(infos[1].kind),
+	                                      infos[1].text, infos[1].name);
+	if (modules[0] && modules[1]) { // only build the pipeline when both stages produced a module
+		result.stage_flags = vk_stage_flags_from_shader_kind(infos[0].kind)
+		                     | vk_stage_flags_from_shader_kind(infos[1].kind);
+
+		VkPushConstantRange pcr = { // one range from offset 0 shared by both stages
+			.stageFlags = result.stage_flags,
+			.offset     = 0,
+			.size       = push_constants_size,
+		};
+
+		VkPipelineLayoutCreateInfo pipeline_layout_info = { // no descriptor set layouts; push constants only when a size was given
+			.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+			.pushConstantRangeCount = push_constants_size ? 1    : 0,
+			.pPushConstantRanges    = push_constants_size ? &pcr : 0,
+		};
+
+		vkCreatePipelineLayout(vulkan_context->device, &pipeline_layout_info, 0, &result.layout);
+
+		VkPipelineShaderStageCreateInfo shader_stage_create_infos[2] = {
+			{
+				.sType  = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+				.stage  = vk_stage_flags_from_shader_kind(infos[0].kind),
+				.module = modules[0],
+				.pName  = "main",
+			},
+			{
+				.sType  = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+				.stage  = vk_stage_flags_from_shader_kind(infos[1].kind),
+				.module = modules[1],
+				.pName  = "main",
+			},
+		};
+
+		VkPipelineVertexInputStateCreateInfo vertex_input_info = { // no vertex bindings or attributes are declared
+			.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+		};
+
+		VkPipelineInputAssemblyStateCreateInfo input_assembly_info = {
+			.sType    = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+			.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
+		};
+
+		VkPipelineViewportStateCreateInfo viewport_info = { // counts only; the rectangles are dynamic state (see dynamic_states)
+			.sType         = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+			.viewportCount = 1,
+			.scissorCount  = 1,
+		};
+
+		VkPipelineRasterizationStateCreateInfo rasterization_info = {
+			.sType       = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+			.polygonMode = VK_POLYGON_MODE_FILL,
+			.lineWidth   = 1.0f,
+			.cullMode    = VK_CULL_MODE_BACK_BIT,
+			.frontFace   = VK_FRONT_FACE_CLOCKWISE,
+		};
+
+		VkPipelineMultisampleStateCreateInfo multisampling_info = {
+			.sType                = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+			.rasterizationSamples = vulkan_context->gpu_info.max_msaa_samples,
+		};
+
+		VkPipelineDepthStencilStateCreateInfo depth_test_create_info = {
+			.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+			.depthTestEnable       = 1,
+			.depthWriteEnable      = 1,
+			.depthCompareOp        = VK_COMPARE_OP_LESS,
+			.depthBoundsTestEnable = 1, // NOTE(review): Vulkan requires the depthBounds device feature for this; it is not in the VK_REQUIRED_PHYSICAL_* lists - confirm it is enabled
+			.stencilTestEnable     = 0,
+			.front                 = {0},
+			.back                  = {0},
+			.minDepthBounds        = 0.0f,
+			.maxDepthBounds        = 1.0f,
+		};
+
+		u32 colour_mask = VK_COLOR_COMPONENT_R_BIT|VK_COLOR_COMPONENT_G_BIT|VK_COLOR_COMPONENT_B_BIT|VK_COLOR_COMPONENT_A_BIT;
+		VkPipelineColorBlendAttachmentState blend_state = { // source alpha blending for colour; the source alpha replaces the destination alpha
+			.colorWriteMask      = colour_mask,
+			.blendEnable         = 1,
+			.srcColorBlendFactor = VK_BLEND_FACTOR_SRC_ALPHA,
+			.dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA,
+			.colorBlendOp        = VK_BLEND_OP_ADD,
+			.srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE,
+			.dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO,
+			.alphaBlendOp        = VK_BLEND_OP_ADD,
+		};
+
+		VkPipelineColorBlendStateCreateInfo colour_blend_state_create = {
+			.sType           = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+			.logicOpEnable   = 0,
+			.logicOp         = VK_LOGIC_OP_COPY,
+			.attachmentCount = 1,
+			.pAttachments    = &blend_state,
+		};
+
+		VkDynamicState dynamic_states[] = { // must be set on the command buffer before drawing
+			VK_DYNAMIC_STATE_VIEWPORT,
+			VK_DYNAMIC_STATE_SCISSOR,
+		};
+
+		VkPipelineDynamicStateCreateInfo dynamic_state_info = {
+			.sType             = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+			.dynamicStateCount = countof(dynamic_states),
+			.pDynamicStates    = dynamic_states,
+		};
+
+		//VkFormat colour_attachment_format = VK_FORMAT_R8G8B8A8_SRGB;
+		VkFormat colour_attachment_format = VK_FORMAT_R8G8B8A8_UNORM;
+		VkPipelineRenderingCreateInfo rendering_create_info = { // attachment formats are given here; no render pass is set on the pipeline
+			.sType                   = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO,
+			.colorAttachmentCount    = 1,
+			.pColorAttachmentFormats = &colour_attachment_format,
+			.depthAttachmentFormat   = vulkan_context->depth_stencil_format,
+			.stencilAttachmentFormat = vulkan_context->depth_stencil_format, // NOTE(review): only valid when the format has a stencil aspect - confirm
+		};
+
+		VkGraphicsPipelineCreateInfo pci = {
+			.sType               = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+			.pNext               = &rendering_create_info,
+			.stageCount          = countof(shader_stage_create_infos),
+			.pStages             = shader_stage_create_infos,
+			.pVertexInputState   = &vertex_input_info,
+			.pInputAssemblyState = &input_assembly_info,
+			.pViewportState      = &viewport_info,
+			.pRasterizationState = &rasterization_info,
+			.pMultisampleState   = &multisampling_info,
+			.pDepthStencilState  = &depth_test_create_info,
+			.pColorBlendState    = &colour_blend_state_create,
+			.pDynamicState       = &dynamic_state_info,
+			.layout              = result.layout,
+		};
+
+		vkCreateGraphicsPipelines(vulkan_context->device, 0, 1, &pci,0, &result.pipeline);
+	}
+
+	if (modules[0]) vkDestroyShaderModule(vulkan_context->device, modules[0], 0); // a module may exist even when the other stage failed
+	if (modules[1]) vkDestroyShaderModule(vulkan_context->device, modules[1], 0);
+
+	if (result.pipeline == 0) result = vulkan_context->default_graphics_pipeline; // NOTE(review): a layout created above is overwritten here without being destroyed
+
+	return result;
+}
+
+function VulkanSemaphore // creates a semaphore; when export is non-null its OS handle is written to export->value[0]
+vk_make_semaphore(OSHandle *export)
+{
+	VulkanContext *vk = vulkan_context;
+
+	VkSemaphoreCreateInfo       sci  = {.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO};
+	VkExportSemaphoreCreateInfo esci = {
+		.sType       = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO,
+		.handleTypes = OS_WINDOWS ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT
+		                          : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT,
+	};
+	VkSemaphoreTypeCreateInfo stc = {
+		.sType         = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
+		.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
+	};
+	// NOTE(review): only one struct is chained, so an exported semaphore is created without the timeline type info - confirm this is intended
+	if (export) sci.pNext = &esci;
+	else        sci.pNext = &stc;
+
+	VulkanSemaphore result = {0}; // value starts at 0
+
+	vkCreateSemaphore(vk->device, &sci, 0, &result.semaphore);
+
+	if (export) {
+		if (OS_WINDOWS) {
+			VkSemaphoreGetWin32HandleInfoKHR ghi = {
+				.sType      = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR,
+				.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT,
+				.semaphore  = result.semaphore,
+			};
+			void *handle; // NOTE(review): not initialized and the call below is not checked
+			vkGetSemaphoreWin32HandleKHR(vk->device, &ghi, &handle);
+			export->value[0] = (u64)handle;
+		} else {
+			VkSemaphoreGetFdInfoKHR ghi = {
+				.sType      = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR,
+				.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT,
+				.semaphore  = result.semaphore,
+			};
+			i32 handle; // NOTE(review): not initialized and the call below is not checked
+			vkGetSemaphoreFdKHR(vk->device, &ghi, &handle);
+			export->value[0] = (u64)handle;
+		}
+	}
+
+	return result;
+}
+
+function void // frees a device allocation and removes size bytes from the tracked heap usage
+vk_release_memory(VkDeviceMemory memory, u64 size)
+{
+	u64 delta = -size; // unsigned negation: adding delta subtracts size modulo 2^64
+	vkFreeMemory(vulkan_context->device, memory, 0);
+	atomic_add_u64(&vulkan_context->gpu_info.gpu_heap_used, delta);
+}
+
+function b32 // allocates size bytes of the given memory kind into *memory; returns 1 on VK_SUCCESS, 0 otherwise
+vk_allocate_memory(VkDeviceMemory *memory, u64 size, VulkanMemoryKind kind, VkMemoryAllocateFlags flags,
+                   VkMemoryDedicatedAllocateInfo *dedicated_allocate_info, OSHandle *export)
+{
+	VulkanContext *vk = vulkan_context;
+
+	VkExportMemoryAllocateInfo export_info = { // only chained when export is non-null
+		.sType       = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO,
+		.handleTypes = OS_WINDOWS ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT
+		                          : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
+	};
+
+	VkMemoryAllocateFlagsInfo memory_allocate_flags_info = {
+		.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,
+		.flags = flags,
+		.pNext = dedicated_allocate_info, // default chain: allocate info -> flags info -> dedicated info (may be null)
+	};
+
+	if (export) { // exporting splices export_info between the flags info and the dedicated info
+		export_info.pNext = dedicated_allocate_info;
+		memory_allocate_flags_info.pNext = &export_info;
+	}
+
+	VkMemoryAllocateInfo memory_allocate_info = {
+		.sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+		.allocationSize  = size,
+		.memoryTypeIndex = vk->memory_info.memory_type_indices[kind],
+		.pNext           = &memory_allocate_flags_info,
+	};
+
+	b32 result = vkAllocateMemory(vk->device, &memory_allocate_info, 0, memory) == VK_SUCCESS;
+	if (result) {
+		atomic_add_u64(&vk->gpu_info.gpu_heap_used, memory_allocate_info.allocationSize); // usage is only tracked for successful allocations
+
+		if (export) {
+			if (OS_WINDOWS) {
+				VkMemoryGetWin32HandleInfoKHR handle_info = {
+					.sType      = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR,
+					.memory     = *memory,
+					.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT,
+				};
+				void *handle; // NOTE(review): not initialized and the call below is not checked
+				vkGetMemoryWin32HandleKHR(vk->device, &handle_info, &handle);
+				export->value[0] = (u64)handle;
+			} else {
+				VkMemoryGetFdInfoKHR fd_info = {
+					.sType      = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR,
+					.memory     = *memory,
+					.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
+				};
+				i32 fd; // NOTE(review): not initialized and the call below is not checked
+				vkGetMemoryFdKHR(vk->device, &fd_info, &fd);
+				export->value[0] = (u64)fd;
+			}
+		}
+	}
+	return result;
+}
+
+function u32 // size in bytes of one index of the given VkIndexType
+vk_index_size(VkIndexType type)
+{
+	u32 result = 0; // value returned for any type not listed below
+	switch (type) {
+	case VK_INDEX_TYPE_UINT16:{ result = 2; }break;
+	case VK_INDEX_TYPE_UINT32:{ result = 4; }break;
+	InvalidDefaultCase; // project macro; presumably flags other types as a programming error - TODO confirm
+	}
+	return result;
+}
+
+typedef struct {
+	GPUBuffer        *gpu_buffer;
+	u64               size;
+	VulkanUsageFlags  flags;
+	u32               queue_family_count;
+	u32               queue_family_indices[VulkanTimeline_Count];
+	VkIndexType       index_type;
+	s8                label;
+} VulkanBufferAllocateInfo;
+
+function b32 // creates vb->buffer, backs it with a dedicated allocation, and publishes size + device address in ai->gpu_buffer; returns 1 on success
+vk_buffer_allocate_common(VulkanBuffer *vb, VulkanBufferAllocateInfo *ai)
+{
+	VulkanContext *vk = vulkan_context;
+
+	// TODO(rnp): this probably should be handled, its usually 4GB. likely
+	// need to chain multiple allocations and handle it in shader code
+	u64 size = Min(ai->size, vk->memory_info.max_allocation_size & ~(vk->memory_info.non_coherent_atom_size - 1)); // the mask rounds the limit down; assumes non_coherent_atom_size is a power of two
+
+	VkBufferCreateInfo buffer_create_info = {
+		.sType       = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+		.usage       = VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, // every buffer can be addressed by pointer; other usages are added below
+		.size        = size,
+		.sharingMode = ai->queue_family_count > 1 ? VK_SHARING_MODE_CONCURRENT : VK_SHARING_MODE_EXCLUSIVE,
+		.queueFamilyIndexCount = ai->queue_family_count,
+		.pQueueFamilyIndices   = ai->queue_family_indices,
+	};
+
+	if (ai->flags & VulkanUsageFlag_TransferSource)
+		buffer_create_info.usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
+
+	if (ai->flags & VulkanUsageFlag_TransferDestination)
+		buffer_create_info.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+
+	if (ai->index_type != VK_INDEX_TYPE_NONE_KHR) // NONE marks a buffer that holds no indices
+		buffer_create_info.usage |= VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
+
+	vkCreateBuffer(vk->device, &buffer_create_info, 0, &vb->buffer); // NOTE(review): result is not checked
+
+	VkMemoryRequirements memory_requirements;
+	vkGetBufferMemoryRequirements(vk->device, vb->buffer, &memory_requirements);
+
+	assert((u64)size <= memory_requirements.size);
+	size = memory_requirements.size; // from here on size is what the driver asked for, not the requested size
+
+	VkMemoryDedicatedAllocateInfo dedicated_allocate_info = {
+		.sType  = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
+		.buffer = vb->buffer,
+	};
+
+	/* NOTE(rnp): to create a CPU writable buffer:
+	 * 1. try to allocate and map the entire buffer
+	 *    - this may fail if the buffer is bigger than the BAR size
+	 *      (unknowable from vulkan), or the memory space has become
+	 *      too fragmented (unlikely)
+	 * 2. if allocation or mapping fails we must chain a host buffer
+	 *    for staging. If this happens in practice we should add
+	 *    the ability to import an existing external allocation
+	 */
+	b32 host_read_write = (ai->flags & VulkanUsageFlag_HostReadWrite) != 0;
+	vb->memory_kind = host_read_write ? VulkanMemoryKind_BAR : VulkanMemoryKind_Device;
+
+	b32 result = 0; // NOTE(review): on failure vb->buffer created above is not destroyed here - confirm the caller cleans up
+	// TODO(rnp): this may fail if the allocation is too big for the BAR size
+	// it needs to handled properly
+	if (vk_allocate_memory(&vb->memory, size, vb->memory_kind, VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT, &dedicated_allocate_info, 0)) {
+		result  = 1;
+		ai->gpu_buffer->size = size;
+
+		vb->index_type = ai->index_type;
 
+		if (host_read_write)
+			vkMapMemory(vk->device, vb->memory, 0, size, 0, &vb->host_pointer); // maps the whole allocation; NOTE(review): result is not checked
+
+		vkBindBufferMemory(vk->device, vb->buffer, vb->memory, 0);
+		VkBufferDeviceAddressInfo buffer_device_address_info = {
+			.sType  = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO,
+			.buffer = vb->buffer,
+		};
+		ai->gpu_buffer->gpu_pointer = vkGetBufferDeviceAddress(vk->device, &buffer_device_address_info);
+	}
 	return result;
 }
 
@@ -398,7 +905,9 @@ vk_load_instance(void)
 
 	/* TODO(rnp): debug only, and check for these before enabling */
 	const char *validation_layers[] = {
+		#if BEAMFORMER_DEBUG
 		"VK_LAYER_KHRONOS_validation",
+		#endif
 	};
 
 	VkInstanceCreateInfo instance_create_info = {
@@ -410,6 +919,23 @@ vk_load_instance(void)
 		.enabledLayerCount       = countof(validation_layers),
 	};
 
+	#if 0 && BEAMFORMER_DEBUG
+	VkValidationFeatureEnableEXT validation_feature_enables[] = {
+		VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT,
+		VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT,
+		VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT,
+		VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT,
+	};
+
+	VkValidationFeaturesEXT validation_features = {
+		.sType                         = VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT,
+		.enabledValidationFeatureCount = countof(validation_feature_enables),
+		.pEnabledValidationFeatures    = validation_feature_enables,
+	};
+
+	instance_create_info.pNext = &validation_features;
+	#endif
+
 	vkCreateInstance(&instance_create_info, 0, &vulkan_context->handle);
 
 	#define X(name, ...) name = (name##_fn *)vkGetInstanceProcAddr(vulkan_context->handle, #name);
@@ -451,7 +977,7 @@ vk_load_physical_device(Arena arena, Stream *err)
 
 	VkPhysicalDeviceProperties2        dp   = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2};
 	VkPhysicalDeviceVulkan11Properties v11p = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES};
-	dp.pNext= &v11p;
+	dp.pNext = &v11p;
 
 	vkGetPhysicalDeviceProperties2(vk->physical_device, &dp);
 
@@ -498,6 +1024,89 @@ vk_load_physical_device(Arena arena, Stream *err)
 			}
 			fatal(stream_to_s8(err));
 		}
+
+		#if BEAMFORMER_DEBUG
+		for (u32 index = 0; index < extension_count; index++) {
+			for EachElement(vk_debug_extensions, it) {
+				s8 test = {
+					.data = (u8 *)vk_debug_extensions[it],
+					.len  = vk_debug_extension_name_lengths[it],
+				};
+				vulkan_debug.E[it] |= s8_equal(test, ext_str8s[index]);
+			}
+		}
+		#endif
+	}
+
+	{
+		VkPhysicalDeviceFeatures2        df   = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2};
+		VkPhysicalDeviceVulkan11Features v11f = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES};
+		VkPhysicalDeviceVulkan12Features v12f = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES};
+		VkPhysicalDeviceVulkan13Features v13f = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES};
+		df.pNext   = &v11f;
+		v11f.pNext = &v12f;
+		v12f.pNext = &v13f;
+		vkGetPhysicalDeviceFeatures2(vk->physical_device, &df);
+
+		{
+			b32 all_supported = 1;
+			#define X(name, ...) all_supported &= df.features.name;
+			VK_REQUIRED_PHYSICAL_FEATURES
+			#undef X
+
+			if (!all_supported) {
+				stream_append_s8(err, vulkan_info("fatal error: missing physical device features:\n"));
+				#define X(name, ...) if (!df.features.name) stream_append_s8(err, s8("    " #name "\n"));
+				VK_REQUIRED_PHYSICAL_FEATURES
+				#undef X
+				fatal(stream_to_s8(err));
+			}
+		}
+
+		{
+			b32 all_supported = 1;
+			#define X(name, ...) all_supported &= v11f.name;
+			VK_REQUIRED_PHYSICAL_11_FEATURES
+			#undef X
+
+			if (!all_supported) {
+				stream_append_s8(err, vulkan_info("fatal error: missing physical device features:\n"));
+				#define X(name, ...) if (!v11f.name) stream_append_s8(err, s8("    " #name "\n"));
+				VK_REQUIRED_PHYSICAL_11_FEATURES
+				#undef X
+				fatal(stream_to_s8(err));
+			}
+		}
+
+		{
+			b32 all_supported = 1;
+			#define X(name, ...) all_supported &= v12f.name;
+			VK_REQUIRED_PHYSICAL_12_FEATURES
+			#undef X
+
+			if (!all_supported) {
+				stream_append_s8(err, vulkan_info("fatal error: missing physical device features:\n"));
+				#define X(name, ...) if (!v12f.name) stream_append_s8(err, s8("    " #name "\n"));
+				VK_REQUIRED_PHYSICAL_12_FEATURES
+				#undef X
+				fatal(stream_to_s8(err));
+			}
+		}
+
+		{
+			b32 all_supported = 1;
+			#define X(name, ...) all_supported &= v13f.name;
+			VK_REQUIRED_PHYSICAL_13_FEATURES
+			#undef X
+
+			if (!all_supported) {
+				stream_append_s8(err, vulkan_info("fatal error: missing physical device features:\n"));
+				#define X(name, ...) if (!v13f.name) stream_append_s8(err, s8("    " #name "\n"));
+				VK_REQUIRED_PHYSICAL_13_FEATURES
+				#undef X
+				fatal(stream_to_s8(err));
+			}
+		}
 	}
 
 	VkPhysicalDeviceMemoryProperties2 mp = {.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2};
@@ -686,27 +1295,30 @@ vk_load_queues(Arena *memory, Stream *err)
 		assigned_subindices[VulkanQueueKind_Transfer] += 1;
 	}
 
-	u32 unique_queues = 0;
 	for EachElement(assigned_subindices, it)
-		unique_queues += assigned_subindices[it];
+		vk->unique_queues += assigned_subindices[it];
 
 	end_temp_arena(arena_save);
 
 	/////////////////////////////////////////////
 	// NOTE(rnp): fill in info and create device
-
-	VulkanQueue *qs = push_array(memory, VulkanQueue, unique_queues);
 	for EachElement(vk->queues, it) {
 		u32 index = queue_subindices[it];
 		for (i32 i = 0; i < queue_indices[it]; i++)
 			index += assigned_subindices[i];
-
-		vk->queues[it]         = qs + index;
-		qs[index].queue_family = queue_indices[it];
-		qs[index].queue_index  = queue_subindices[it];
+		vk->queue_indices[it] = index;
 	}
 
-	VkDeviceQueueCreateInfo queue_create_infos[VulkanQueueKind_Count];
+	for EachElement(vk->queues, it) {
+		if (vk->queues[vk->queue_indices[it]] == 0) {
+			vk->queues[vk->queue_indices[it]] = push_struct(memory, VulkanQueue);
+			vk->queues[vk->queue_indices[it]]->queue_family = queue_indices[it];
+			vk->queues[vk->queue_indices[it]]->queue_index  = queue_subindices[it];
+		}
+		vk->queues[it] = vk->queues[vk->queue_indices[it]];
+	}
+
+	VkDeviceQueueCreateInfo queue_create_infos[VulkanQueueKind_Count];
 
 	f32 queue_priorities[VulkanQueueKind_Count][VulkanQueueKind_Count];
 	for (u32 i = 0; i < VulkanQueueKind_Count; i++)
@@ -716,7 +1328,7 @@ vk_load_queues(Arena *memory, Stream *err)
 
 	u32 queue_create_index = 0;
 	b32 queue_info_filled[VulkanQueueKind_Count] = {0};
-	for (u32 q = 0; q < unique_queues; q++) {
+	for (u32 q = 0; q < vk->unique_queues; q++) {
 		u32 base_q = queue_indices[q];
 		if (!queue_info_filled[base_q]) {
 			queue_create_infos[queue_create_index++] = (VkDeviceQueueCreateInfo){
@@ -729,14 +1341,63 @@ vk_load_queues(Arena *memory, Stream *err)
 		queue_info_filled[base_q] = 1;
 	}
 
-	VkPhysicalDeviceFeatures device_features = {0};
+	VkPhysicalDeviceVulkan13Features v13f = {
+		.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES,
+		#define X(name, ...) .name = 1,
+		VK_REQUIRED_PHYSICAL_13_FEATURES
+		#undef X
+	};
+
+	VkPhysicalDeviceShaderRelaxedExtendedInstructionFeaturesKHR pdsre = {
+		.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_RELAXED_EXTENDED_INSTRUCTION_FEATURES_KHR,
+		.shaderRelaxedExtendedInstruction = 1,
+	};
+	if (vulkan_debug.shader_relaxed_extended_instruction) v13f.pNext = &pdsre;
+
+	VkPhysicalDeviceVulkan12Features v12f = {
+		.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,
+		.pNext = &v13f,
+		#define X(name, ...) .name = 1,
+		VK_REQUIRED_PHYSICAL_12_FEATURES
+		#undef X
+	};
+
+	VkPhysicalDeviceVulkan11Features v11f = {
+		.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES,
+		.pNext = &v12f,
+		#define X(name, ...) .name = 1,
+		VK_REQUIRED_PHYSICAL_11_FEATURES
+		#undef X
+	};
+	VkPhysicalDeviceFeatures2 device_features = {
+		.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
+		.pNext = &v11f,
+		.features = {
+			#define X(name, ...) .name = 1,
+			VK_REQUIRED_PHYSICAL_FEATURES
+			#undef X
+		},
+	};
+
+	Arena arena = *memory;
+	u32   enabled_count = countof(vk_required_device_extensions) + countof(vk_debug_extensions);
+	const char **enabled_extensions = push_array(&arena, const char *, enabled_count);
+
+	enabled_count = 0;
+	for EachElement(vk_required_device_extensions, it)
+		enabled_extensions[enabled_count++] = vk_required_device_extensions[it];
+
+	for EachElement(vk_debug_extensions, it)
+		if (vulkan_debug.E[it])
+			enabled_extensions[enabled_count++] = vk_debug_extensions[it];
+
 	VkDeviceCreateInfo device_create_info = {
 		.sType                   = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
+		.pNext                   = &device_features,
 		.pQueueCreateInfos       = queue_create_infos,
 		.queueCreateInfoCount    = queue_create_index,
-		.pEnabledFeatures        = &device_features,
-		.ppEnabledExtensionNames = vk_required_device_extensions,
-		.enabledExtensionCount   = countof(vk_required_device_extensions),
+		.ppEnabledExtensionNames = enabled_extensions,
+		.enabledExtensionCount   = enabled_count,
 	};
 	vkCreateDevice(vk->physical_device, &device_create_info, 0, &vk->device);
 
@@ -744,9 +1405,69 @@ vk_load_queues(Arena *memory, Stream *err)
 	VkDeviceProcedureList
 	#undef X
 
-	for (u32 q = 0; q < unique_queues; q++) {
+	for (u32 q = 0; q < vk->unique_queues; q++) {
 		VulkanQueue *qp = vk->queues[q];
 		vkGetDeviceQueue(vk->device, qp->queue_family, qp->queue_index, &qp->queue);
+
+		VkCommandPoolCreateInfo command_pool_create_info = {
+			.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+			.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
+			.queueFamilyIndex = qp->queue_family,
+		};
+		vkCreateCommandPool(vk->device, &command_pool_create_info, 0, &qp->command_pool);
+
+		VkCommandBufferAllocateInfo command_buffer_allocate_info = {
+			.sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+			.commandPool        = qp->command_pool,
+			.level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+			.commandBufferCount = countof(qp->command_buffers),
+		};
+		vkAllocateCommandBuffers(vk->device, &command_buffer_allocate_info, qp->command_buffers);
+
+		qp->timeline_semaphore = vk_make_semaphore(0);
+
+		VkQueryPoolCreateInfo query_pool_create_info = {
+			.sType      = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
+			.queryType  = VK_QUERY_TYPE_TIMESTAMP,
+			.queryCount = countof(qp->command_buffers) * MaxCommandBufferTimestamps,
+		};
+		vkCreateQueryPool(vk->device, &query_pool_create_info, 0, &qp->query_pool);
+	}
+
+	vk->queues[VulkanQueueKind_Graphics]->pipeline_stage_flags |= VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT;
+	vk->queues[VulkanQueueKind_Compute]->pipeline_stage_flags  |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+}
+
+/* Picks the formats used by the graphics path. Only the depth/stencil format is
+ * selected so far: the candidates are tried in the order listed and the first one
+ * whose optimal-tiling features include depth/stencil attachment support is stored
+ * in vk->depth_stencil_format. If no candidate qualifies the field is left as
+ * VK_FORMAT_UNDEFINED. */
+function void
+vk_load_graphics(void)
+{
+	VulkanContext *vk = vulkan_context;
+
+	// NOTE: swap chain image format
+	// (placeholder: no selection is performed here)
+	{
+	}
+
+	// NOTE: depth/stencil format
+	{
+		VkFormat depth_formats[] = {
+			VK_FORMAT_D32_SFLOAT_S8_UINT,
+			VK_FORMAT_D24_UNORM_S8_UINT,
+			VK_FORMAT_D16_UNORM_S8_UINT,
+		};
+
+		vk->depth_stencil_format = VK_FORMAT_UNDEFINED;
+		for EachElement(depth_formats, it) {
+			// NOTE: format_properties3 is chained through pNext so that the query
+			// below also fills in the VK_FORMAT_FEATURE_2_* flags tested afterwards
+			VkFormatProperties3 format_properties3 = {.sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3};
+			VkFormatProperties2 format_properties2 = {
+				.sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2,
+				.pNext = &format_properties3,
+			};
+			vkGetPhysicalDeviceFormatProperties2(vk->physical_device, depth_formats[it], &format_properties2);
+			if (format_properties3.optimalTilingFeatures & VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT) {
+				vk->depth_stencil_format = depth_formats[it];
+				break;
+			}
+		}
 	}
 }
 
@@ -772,17 +1493,43 @@ vk_load(OSLibrary vulkan_library_handle, Arena *memory, Stream *err)
 	vk_load_instance();
 	vk_load_physical_device(vulkan_context->arena, err);
 	vk_load_queues(&vulkan_context->arena, err);
+	vk_load_graphics();
 
-	// TODO: setup compute pipeline
 	read_only local_persist s8 default_compute_shader = s8(""
 		"#version 430 core\n"
+		"layout(push_constant) uniform pc { uint data[256 / 4]; };\n"
 		"void main() {}\n"
 		"\n");
+	vk->default_compute_pipeline = vk_compute_pipeline_from_shader_text(vk->arena, default_compute_shader,
+	                                                                    s8("error_compute_shader"), 256);
+
+	read_only local_persist s8 default_vertex_shader = s8(""
+		"#version 430 core\n"
+		"layout(push_constant) uniform pc { uint data[256 / 4]; };\n"
+		"void main() {gl_Position = vec4(0);}\n"
+		"\n");
+	read_only local_persist s8 default_fragment_shader = s8(""
+		"#version 430 core\n"
+		"layout(location = 0) out vec4 out_colour;"
+		"layout(push_constant) uniform pc { uint data[256 / 4]; };\n"
+		"void main() {out_colour = vec4(0.5f, 0.0f, 0.5f, 1.0f);}\n"
+		"\n");
 
-	vk->default_compute_shader = vk_compute_pipeline_from_shader_text(vk->arena, default_compute_shader,
-	                                                                  s8("error_compute_shader"));
+	VulkanPipelineCreateInfo pipeline_create_infos[2] = {
+		{
+			.kind = VulkanShaderKind_Vertex,
+			.text = default_vertex_shader,
+			.name = s8("error_vertex_shader"),
+		},
+		{
+			.kind = VulkanShaderKind_Fragment,
+			.text = default_fragment_shader,
+			.name = s8("error_fragment_shader"),
+		},
+	};
+	vk->default_graphics_pipeline = vk_graphics_pipeline_from_infos(vk->arena, pipeline_create_infos, 2, 256);
 
-	// TODO: setup render pipeline
+	// TODO: setup ui render pipeline
 
 	if (err->widx > 0) {
 		os_console_log(err->data, err->widx);
@@ -796,110 +1543,63 @@ vk_gpu_info(void)
 	return &vulkan_context->gpu_info;
 }
 
-DEBUG_IMPORT void
-vk_buffer_release(GPUBuffer *b)
+function void
+vk_vulkan_buffer_release(VulkanBuffer *vb, u64 size)
 {
 	VulkanContext *vk = vulkan_context;
-	if ValidVulkanHandle(b->buffer) {
-		VulkanBuffer *vb = vk_entity_data(b->buffer, VulkanEntityKind_Buffer);
-		// TODO(rnp): this happens implicitly, probably just delete this if block
-		if (vb->host_pointer)
-			vkUnmapMemory(vk->device, vb->memory);
+	VulkanEntity  *e  = (VulkanEntity *)((u8 *)vb - offsetof(VulkanEntity, as));
+	// TODO(rnp): this happens implicitly, probably just delete this if block
+	if (vb->host_pointer)
+		vkUnmapMemory(vk->device, vb->memory);
 
-		if (vb->buffer)
-			vkDestroyBuffer(vk->device, vb->buffer, 0);
+	if (vb->buffer)
+		vkDestroyBuffer(vk->device, vb->buffer, 0);
 
-		vkFreeMemory(vk->device, vb->memory, 0);
-		if (vb->memory_kind != VulkanMemoryKind_Host)
-			vk->gpu_info.gpu_heap_used -= b->size;
+	vk_release_memory(vb->memory, vb->memory_kind != VulkanMemoryKind_Host ? size : 0);
+	vk_entity_release(e);
+}
 
-		vk_entity_release((VulkanEntity *)b->buffer.value[0]);
-	}
+/* Releases the Vulkan buffer entity referenced by `b`, if its handle is valid, and
+ * then zeroes `*b` unconditionally. `b->size` is forwarded to
+ * vk_vulkan_buffer_release() as the size of the allocation being released. */
+DEBUG_IMPORT void
+vk_buffer_release(GPUBuffer *b)
+{
+	if ValidVulkanHandle(b->buffer)
+		vk_vulkan_buffer_release(vk_entity_data(b->buffer, VulkanEntityKind_Buffer), b->size);
 	zero_struct(b);
 }
 
 DEBUG_IMPORT void
-vk_buffer_allocate(GPUBuffer *b, iz size, GPUBufferCreateFlags flags, OSHandle *export, s8 label)
+vk_buffer_allocate(GPUBuffer *b, GPUBufferAllocateInfo *info)
 {
-	vk_buffer_release(b);
 	VulkanContext *vk = vulkan_context;
-	VulkanEntity  *e  = vk_entity_allocate(VulkanEntityKind_Buffer);
-	VulkanBuffer  *vb = &e->as.buffer;
-
-	b->buffer.value[0] = (u64)e;
-
-	assert(size > 0);
-
-	// TODO(rnp): this probably should be handled, its usually 4GB. likely
-	// need to chain multiple allocations and handle it in shader code
-	assert((u64)size <= vk->memory_info.max_allocation_size);
-	size = (iz)Min((u64)size, vk->memory_info.max_allocation_size);
-
-	u64 remaining = vk->gpu_info.gpu_heap_size - vk->gpu_info.gpu_heap_used;
 
-	VkExportMemoryAllocateInfo ei = {
-		.sType       = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO,
-		.handleTypes = OS_WINDOWS ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT
-		                          : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
-	};
+	vk_buffer_release(b);
 
-	VkMemoryAllocateFlagsInfo mafi = {
-		.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,
-		//.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT,
-		.pNext = (export) ? & ei: 0,
-	};
+	assert(info->size > 0);
 
-	/* NOTE(rnp): to create a CPU writable buffer:
-	 * 1. try to allocate and map the entire buffer
-	 *    - this may fail if the buffer is bigger than the BAR size
-	 *      (unknowable from vulkan), or the memory space has become
-	 *      too fragmented (unlikely)
-	 * 2. if allocation or mapping fails we must chain a host buffer
-	 *    for staging. If this happens in practice we should add
-	 *    the ability to import an existing external allocation
-	 */
-	vb->memory_kind = flags & GPUBufferCreateFlags_HostWritable ? VulkanMemoryKind_BAR : VulkanMemoryKind_Device;
-	VkMemoryAllocateInfo mai = {
-		.sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
-		.allocationSize  = Min((u64)size, remaining),
-		.memoryTypeIndex = vk->memory_info.memory_type_indices[vb->memory_kind],
-		.pNext           = &mafi,
+	VulkanEntity *e = vk_entity_allocate(VulkanEntityKind_Buffer);
+	VulkanBufferAllocateInfo vulkan_buffer_allocate_info = {
+		.gpu_buffer = b,
+		.size       = (u64)info->size,
+		.flags      = info->flags,
+		.index_type = VK_INDEX_TYPE_NONE_KHR,
+		.label      = info->label,
 	};
 
-	// TODO(rnp): this may fail if the allocation is too big for the BAR size
-	// it needs to handled properly
-	if (vkAllocateMemory(vk->device, &mai, 0, &vb->memory) == VK_SUCCESS) {
-		vk->gpu_info.gpu_heap_used += mai.allocationSize;
-		b->size = mai.allocationSize;
-
-		if (flags & GPUBufferCreateFlags_HostWritable)
-			vkMapMemory(vk->device, vb->memory, 0, b->size, 0, &vb->host_pointer);
+	u32 queue_index_hit_count[VulkanQueueKind_Count] = {0};
+	for (u32 it = 0; it < info->timeline_count; it++)
+		queue_index_hit_count[vk->queue_indices[info->timelines_used[it]]]++;
 
-		if (export) {
-			if (OS_WINDOWS) {
-				VkMemoryGetWin32HandleInfoKHR handle_info = {
-					.sType      = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR,
-					.memory     = vb->memory,
-					.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT,
-				};
-				void *handle;
-				vkGetMemoryWin32HandleKHR(vk->device, &handle_info, &handle);
-				export->value[0] = (u64)handle;
-			} else {
-				VkMemoryGetFdInfoKHR fd_info = {
-					.sType      = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR,
-					.memory     = vb->memory,
-					.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
-				};
-				i32 fd;
-				vkGetMemoryFdKHR(vk->device, &fd_info, &fd);
-				export->value[0] = (u64)fd;
-			}
+	for EachElement(queue_index_hit_count, it) {
+		if (queue_index_hit_count[it] > 0) {
+			u32 index = vulkan_buffer_allocate_info.queue_family_count++;
+			vulkan_buffer_allocate_info.queue_family_indices[index] = vk->queues[vk->queue_indices[it]]->queue_family;
 		}
 	}
 
-	if ((flags & GPUBufferCreateFlags_MemoryOnly) == 0) {
-		// TODO(rnp): create and bind memory to buffer
+	if (vk_buffer_allocate_common(&e->as.buffer, &vulkan_buffer_allocate_info)) {
+		b->buffer.value[0] = (u64)e;
+	} else {
+		vk_entity_release(e);
 	}
 }
 
@@ -925,108 +1625,854 @@ vk_round_up_to_sync_size(u64 size, u64 min)
 	return result;
 }
 
-DEBUG_IMPORT void
-vk_buffer_range_upload(GPUBuffer *b, void *data, u64 offset, u64 size, b32 non_temporal)
+function force_inline void
+vk_buffer_buffer_copy(VulkanBuffer *destination, VulkanBuffer *source, u64 destination_offset, u64 source_offset, u64 size, b32 non_temporal)
 {
 	VulkanContext *vk = vulkan_context;
-	VulkanBuffer  *vb = vk_entity_data(b->buffer, VulkanEntityKind_Buffer);
 
-	switch (vb->memory_kind) {
-	case VulkanMemoryKind_Host:
+	switch (source->memory_kind) {
 	case VulkanMemoryKind_BAR:
 	{
-		assert(vb->host_pointer);
-		void *dest = (u8 *)vb->host_pointer + offset;
-		// NOTE(rnp): don't trash the CPU cache for large data stores
-		if (non_temporal) memory_copy_non_temporal(dest, data, size);
-		else              mem_copy(dest, data, size);
-
-		b32 coherent = vk->memory_info.memory_host_coherent[vb->memory_kind];
-		if (!coherent) {
-			u64 nca_size = vk->memory_info.non_coherent_atom_size;
-			VkMappedMemoryRange mrs[1] = {{
-				.sType  = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
-				.memory = vb->memory,
-				.offset = offset - (offset % nca_size),
-				.size   = vk_round_up_to_sync_size(size, nca_size),
-			}};
-			vkFlushMappedMemoryRanges(vk->device, countof(mrs), mrs);
+		switch (destination->memory_kind) {
+		case VulkanMemoryKind_Host:{
+			if (destination->memory) {
+				// TODO(rnp): there is likely a more efficient way of doing this in this case
+				InvalidCodePath;
+			} else {
+				assert(source->host_pointer);
+				b32 coherent = vk->memory_info.memory_host_coherent[source->memory_kind];
+				if (!coherent) {
+					u64 nca_size = vk->memory_info.non_coherent_atom_size;
+					VkMappedMemoryRange mrs[1] = {{
+						.sType  = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+						.memory = source->memory,
+						.offset = source_offset - (source_offset % nca_size),
+						.size   = vk_round_up_to_sync_size(size, nca_size),
+					}};
+					vkInvalidateMappedMemoryRanges(vk->device, countof(mrs), mrs);
+				}
+
+				void *dest = (u8 *)destination->host_pointer + destination_offset;
+				void *src  = (u8 *)source->host_pointer + source_offset;
+
+				// NOTE(rnp): don't trash the CPU cache for large data stores
+				if (non_temporal) memory_copy_non_temporal(dest, src, size);
+				else              mem_copy(dest, src, size);
+			}
+		}break;
+		InvalidDefaultCase;
+		}
+	}break;
+
+	case VulkanMemoryKind_Host:{
+		switch (destination->memory_kind) {
+		case VulkanMemoryKind_BAR:{
+			assert(destination->host_pointer);
+
+			void *dest = (u8 *)destination->host_pointer + destination_offset;
+			void *src  = (u8 *)source->host_pointer + source_offset;
+
+			// NOTE(rnp): don't trash the CPU cache for large data stores
+			if (non_temporal) memory_copy_non_temporal(dest, src, size);
+			else              mem_copy(dest, src, size);
+
+			b32 coherent = vk->memory_info.memory_host_coherent[destination->memory_kind];
+			if (!coherent) {
+				u64 nca_size = vk->memory_info.non_coherent_atom_size;
+				VkMappedMemoryRange mrs[1] = {{
+					.sType  = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+					.memory = destination->memory,
+					.offset = destination_offset - (destination_offset % nca_size),
+					.size   = vk_round_up_to_sync_size(size, nca_size),
+				}};
+				vkFlushMappedMemoryRanges(vk->device, countof(mrs), mrs);
+			}
+		}break;
+		InvalidDefaultCase;
+
 		}
 	}break;
+
 	// TODO(rnp): use transfer queue when not mapped
 	InvalidDefaultCase;
 	}
 }
 
-DEBUG_IMPORT VulkanHandle
-vk_semaphore_create(OSHandle *export)
+/* Copies `size` bytes from host memory `data` into buffer `b`, starting `offset`
+ * bytes into the buffer. `data` is wrapped in a temporary VulkanBuffer of kind
+ * VulkanMemoryKind_Host so the copy can go through vk_buffer_buffer_copy();
+ * `non_temporal` is forwarded to that helper unchanged.
+ * NOTE(review): `b->buffer` is not checked with ValidVulkanHandle here - presumably
+ * callers only pass allocated buffers; confirm. */
+DEBUG_IMPORT void
+vk_buffer_range_upload(GPUBuffer *b, void *data, u64 offset, u64 size, b32 non_temporal)
 {
+	VulkanBuffer *db = vk_entity_data(b->buffer, VulkanEntityKind_Buffer);
+	// NOTE: stack-only stand-in for the caller's memory; only host_pointer and
+	// memory_kind are set, everything else stays zero
+	VulkanBuffer  sb = {
+		.host_pointer = data,
+		.memory_kind  = VulkanMemoryKind_Host,
+	};
+	vk_buffer_buffer_copy(db, &sb, offset, 0, size, non_temporal);
+}
+
+/* Reads `size` bytes out of `source`, beginning `offset` bytes into the buffer, and
+ * stores them at `destination`. The caller's memory is presented to
+ * vk_buffer_buffer_copy() as a temporary VulkanMemoryKind_Host buffer;
+ * `non_temporal` is passed through to that helper. */
+DEBUG_IMPORT void
+vk_buffer_range_download(void *destination, GPUBuffer *source, u64 offset, u64 size, b32 non_temporal)
+{
+	// NOTE: describes the caller's memory only; no device memory is attached
+	VulkanBuffer host_view = {
+		.memory_kind  = VulkanMemoryKind_Host,
+		.host_pointer = destination,
+	};
+
+	VulkanBuffer *gpu_side = vk_entity_data(source->buffer, VulkanEntityKind_Buffer);
+	vk_buffer_buffer_copy(&host_view, gpu_side, 0, offset, size, non_temporal);
+}
+
+/* Releases the render model's backing buffer entity when `model` holds a valid
+ * handle, then zeroes `*model` regardless of whether anything was released. */
+DEBUG_IMPORT void
+vk_render_model_release(GPUBuffer *model)
+{
+	if ValidVulkanHandle(model->buffer) {
+		VulkanBuffer *backing = vk_entity_data(model->buffer, VulkanEntityKind_RenderModel);
+		vk_vulkan_buffer_release(backing, model->size);
+	}
+	zero_struct(model);
+}
+
+/* (Re)creates `model` as a single buffer laid out as [indices | model data].
+ *
+ * Any previous contents of `model` are released first. The index type is 16 bit
+ * when `index_count` fits in a u16 and 32 bit otherwise, and the index region is
+ * padded to a multiple of 64 bytes. On success the indices are copied to the start
+ * of the buffer, `model->index_count` is recorded and `model->gpu_pointer` is moved
+ * past the index region. On allocation failure the entity is returned and `model`
+ * is left as vk_render_model_release() zeroed it. */
+DEBUG_IMPORT void
+vk_render_model_allocate(GPUBuffer *model, void *indices, u64 index_count, u64 model_size, s8 label)
+{
+	vk_render_model_release(model);
+
+	VulkanEntity *entity = vk_entity_allocate(VulkanEntityKind_RenderModel);
+
+	assert(index_count <= U32_MAX);
+	VkIndexType index_type = index_count <= U16_MAX ? VK_INDEX_TYPE_UINT16 : VK_INDEX_TYPE_UINT32;
+
+	// NOTE: unpadded byte count is what gets copied; the padded one sizes the region
+	u64 index_bytes  = vk_index_size(index_type) * index_count;
+	i64 index_region = round_up_to(index_bytes, 64);
+
+	i64 total_size = round_up_to(model_size + index_region, 64);
+	assert(total_size > 0);
+
+	VulkanBufferAllocateInfo allocate_info = {
+		.gpu_buffer              = model,
+		.size                    = (u64)total_size,
+		.flags                   = VulkanUsageFlag_HostReadWrite,
+		.index_type              = index_type,
+		.label                   = label,
+		.queue_family_count      = 1,
+		.queue_family_indices[0] = vulkan_context->queues[VulkanQueueKind_Graphics]->queue_family,
+	};
+
+	if (!vk_buffer_allocate_common(&entity->as.buffer, &allocate_info)) {
+		vk_entity_release(entity);
+		return;
+	}
+
+	model->buffer.value[0] = (u64)entity;
+	model->index_count     = index_count;
+	model->gpu_pointer    += index_region;
+
+	VulkanBuffer index_source = {
+		.host_pointer = indices,
+		.memory_kind  = VulkanMemoryKind_Host,
+	};
+	vk_buffer_buffer_copy(&entity->as.buffer, &index_source, 0, 0, index_bytes, 0);
+}
+
+/* Uploads `size` bytes from `data` into the model-data part of a render model.
+ * `offset` is relative to the start of the model data: the 64-byte padded size of
+ * the index region (derived from the buffer's index type and `model->index_count`)
+ * is added before the copy. `non_temporal` is forwarded to vk_buffer_buffer_copy(). */
+DEBUG_IMPORT void
+vk_render_model_range_upload(GPUBuffer *model, void *data, u64 offset, u64 size, b32 non_temporal)
+{
+	VulkanBuffer *target = vk_entity_data(model->buffer, VulkanEntityKind_RenderModel);
+
+	// NOTE: model data lives directly after the padded index region
+	u64 model_data_base = round_up_to(vk_index_size(target->index_type) * model->index_count, 64);
+
+	VulkanBuffer host_view = {
+		.memory_kind  = VulkanMemoryKind_Host,
+		.host_pointer = data,
+	};
+	vk_buffer_buffer_copy(target, &host_view, offset + model_data_base, 0, size, non_temporal);
+}
+
+/* Destroys the image view and image behind `image`, releases its memory (accounted
+ * with `image->memory_size`) and returns the entity, when the handle is valid.
+ * `*image` is zeroed in every case. */
+DEBUG_IMPORT void
+vk_image_release(GPUImage *image)
+{
+	if ValidVulkanHandle(image->image) {
+		VulkanEntity *entity = (VulkanEntity *)image->image.value[0];
+		VulkanImage  *vi     = vk_entity_data(image->image, VulkanEntityKind_Image);
+		VkDevice      device = vulkan_context->device;
+
+		// NOTE: view first, then the image it refers to, then the backing memory
+		vkDestroyImageView(device, vi->view, 0);
+		vkDestroyImage(device, vi->image, 0);
+		vk_release_memory(vi->memory, image->memory_size);
+
+		vk_entity_release(entity);
+	}
+	zero_struct(image);
+}
+
+DEBUG_IMPORT void
+vk_image_allocate(GPUImage *image, u32 width, u32 height, u32 mips, u32 samples,
+                  VulkanImageUsage usage, VulkanUsageFlags flags, OSHandle *export)
+{
+	assert(IsPowerOfTwo(samples));
+
+	vk_image_release(image);
+
 	VulkanContext *vk = vulkan_context;
+	VulkanEntity  *e  = vk_entity_allocate(VulkanEntityKind_Image);
+	VulkanImage   *vi = &e->as.image;
+
+	image->image.value[0] = (u64)e;
+	image->width          = Min(width,   vk->gpu_info.max_image_dimension_2D);
+	image->height         = Min(height,  vk->gpu_info.max_image_dimension_2D);
+	image->mip_map_levels = Max(mips,    1);
+	image->samples        = Min(samples, vk->gpu_info.max_msaa_samples);
+
+	VkFormat usage_format_map[VulkanImageUsage_Count + 1] = {
+		[VulkanImageUsage_None]         = VK_FORMAT_UNDEFINED,
+		//[VulkanImageUsage_Colour]       = VK_FORMAT_R8G8B8A8_SRGB,
+		[VulkanImageUsage_Colour]       = VK_FORMAT_R8G8B8A8_UNORM,
+		[VulkanImageUsage_DepthStencil] = vk->depth_stencil_format,
+		[VulkanImageUsage_Count]        = VK_FORMAT_UNDEFINED,
+	};
 
-	VkSemaphoreCreateInfo       sci  = {.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO};
-	VkExportSemaphoreCreateInfo esci = {
-		.sType       = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO,
-		.handleTypes = OS_WINDOWS ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT
-		                          : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT,
+	read_only local_persist VkImageUsageFlagBits usage_extra_bit_map[VulkanImageUsage_Count + 1] = {
+		[VulkanImageUsage_None]         = 0,
+		[VulkanImageUsage_Colour]       = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+		[VulkanImageUsage_DepthStencil] = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
+		[VulkanImageUsage_Count]        = 0,
 	};
-	if (export) sci.pNext = &esci;
 
+	read_only local_persist VkImageAspectFlags usage_image_aspect_map[VulkanImageUsage_Count + 1] = {
+		[VulkanImageUsage_None]         = 0,
+		[VulkanImageUsage_Colour]       = VK_IMAGE_ASPECT_COLOR_BIT,
+		[VulkanImageUsage_DepthStencil] = VK_IMAGE_ASPECT_DEPTH_BIT|VK_IMAGE_ASPECT_STENCIL_BIT,
+		[VulkanImageUsage_Count]        = 0,
+	};
+
+	usage = Clamp((u32)usage, 0, VulkanImageUsage_Count);
+	VkImageUsageFlagBits usage_flags = usage_extra_bit_map[usage];
+
+	if (flags & VulkanUsageFlag_ImageSampling)       usage_flags |= VK_IMAGE_USAGE_SAMPLED_BIT;
+	if (flags & VulkanUsageFlag_TransferSource)      usage_flags |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
+	if (flags & VulkanUsageFlag_TransferDestination) usage_flags |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+
+	u32 queue_family = vk->queues[VulkanQueueKind_Graphics]->queue_family;
+	VkImageCreateInfo image_create_info = {
+		.sType                 = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+		.flags                 = export ? VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT : 0,
+		.imageType             = VK_IMAGE_TYPE_2D,
+		.format                = usage_format_map[usage],
+		.extent                = {image->width, image->height, 1},
+		.mipLevels             = image->mip_map_levels,
+		.arrayLayers           = 1,
+		.samples               = image->samples,
+		.tiling                = VK_IMAGE_TILING_OPTIMAL,
+		.usage                 = usage_flags,
+		// NOTE(rnp): needed if multiple queue families are accessed
+		.sharingMode           = VK_SHARING_MODE_EXCLUSIVE,
+		.queueFamilyIndexCount = 1,
+		.pQueueFamilyIndices   = &queue_family,
+		.initialLayout         = VK_IMAGE_LAYOUT_UNDEFINED,
+	};
+
+	VkExternalMemoryImageCreateInfo external_memory_image_create_info = {
+		.sType       = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO,
+		.handleTypes = OS_WINDOWS ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT
+		                          : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
+	};
+
+	if (export) image_create_info.pNext = &external_memory_image_create_info;
+
+	vkCreateImage(vk->device, &image_create_info, 0, &vi->image);
+
+	VkMemoryRequirements memory_requirements;
+	vkGetImageMemoryRequirements(vk->device, vi->image, &memory_requirements);
+
+	VkMemoryDedicatedAllocateInfo dedicated_allocate_info = {
+		.sType  = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
+		.image  = vi->image,
+	};
+
+	if (vk_allocate_memory(&vi->memory, memory_requirements.size, VulkanMemoryKind_Device, 0, &dedicated_allocate_info, export)) {
+		image->memory_size = memory_requirements.size;
+		vkBindImageMemory(vk->device, vi->image, vi->memory, 0);
+
+		VkImageViewCreateInfo image_view_info = {
+			.sType      = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+			.image      = vi->image,
+			.viewType   = VK_IMAGE_VIEW_TYPE_2D,
+			.format     = usage_format_map[usage],
+			.subresourceRange = {
+				.aspectMask     = usage_image_aspect_map[usage],
+				.baseMipLevel   = 0,
+				.levelCount     = 1,
+				.baseArrayLayer = 0,
+				.layerCount     = 1,
+			},
+		};
+		vkCreateImageView(vk->device, &image_view_info, 0, &vi->view);
+	} else {
+		vkDestroyImage(vk->device, vi->image, 0);
+		vk_entity_release(e);
+		zero_struct(image);
+	}
+}
+
+/* Allocates a VulkanEntityKind_Semaphore entity, fills it with the result of
+ * vk_make_semaphore(export) and returns a handle whose first value is the entity
+ * pointer.
+ * NOTE(review): `export` is only forwarded here; presumably a non-null pointer
+ * requests an OS-exportable semaphore and receives the exported handle, as the
+ * previous implementation did - confirm against vk_make_semaphore(). */
+DEBUG_IMPORT VulkanHandle
+vk_create_semaphore(OSHandle *export)
+{
 	VulkanEntity *e = vk_entity_allocate(VulkanEntityKind_Semaphore);
+	e->as.semaphore = vk_make_semaphore(export);
 	VulkanHandle result = {(u64)e};
+	return result;
+}
 
-	vkCreateSemaphore(vk->device, &sci, 0, &e->as.semaphore);
-
-	if (export) {
-		if (OS_WINDOWS) {
-			VkSemaphoreGetWin32HandleInfoKHR ghi = {
-				.sType      = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR,
-				.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT,
-				.semaphore  = e->as.semaphore,
-			};
-			void *handle;
-			vkGetSemaphoreWin32HandleKHR(vk->device, &ghi, &handle);
-			export->value[0] = (u64)handle;
-		} else {
-			VkSemaphoreGetFdInfoKHR ghi = {
-				.sType      = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR,
-				.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT,
-				.semaphore  = e->as.semaphore,
-			};
-			i32 handle;
-			vkGetSemaphoreFdKHR(vk->device, &ghi, &handle);
-			export->value[0] = (u64)handle;
-		}
+/* Waits on the host for the timeline semaphore of the queue selected by `timeline`,
+ * using `value` as the wait value and `timeout_ns` as the timeout.
+ *
+ * Returns 1 only when vkWaitSemaphores() reports VK_SUCCESS. Returns 0 when
+ * `timeline` is outside [0, VulkanTimeline_Count - 1] or when the wait returns
+ * anything else (e.g. a timeout).
+ * NOTE(review): `timeline` is used directly as an index into vk->queues - this
+ * assumes VulkanTimeline values line up with the queue slots; confirm. */
+DEBUG_IMPORT b32
+vk_host_wait_timeline(VulkanTimeline timeline, u64 value, u64 timeout_ns)
+{
+	b32 result = 0;
+	if Between(timeline, 0, VulkanTimeline_Count - 1) {
+		VulkanContext *vk = vulkan_context;
+		VulkanQueue   *vq = vk->queues[timeline];
+		VkSemaphoreWaitInfo semaphore_wait_info = {
+			.sType          = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+			.pSemaphores    = &vq->timeline_semaphore.semaphore,
+			.semaphoreCount = 1,
+			.pValues        = &value,
+		};
+		result = vkWaitSemaphores(vk->device, &semaphore_wait_info, timeout_ns) == VK_SUCCESS;
 	}
+	return result;
+}
 
+/* Advances the timeline semaphore of the queue selected by `timeline` from the
+ * host: the tracked counter `value` is pre-incremented and the semaphore is
+ * signalled to that new value with vkSignalSemaphore().
+ *
+ * Returns the value that was signalled, or (u64)-1 when `timeline` is outside
+ * [0, VulkanTimeline_Count - 1] (nothing is signalled in that case).
+ * NOTE(review): the counter is incremented without taking vq->lock in this
+ * function - presumably callers serialise access; confirm. */
+DEBUG_IMPORT u64
+vk_host_signal_timeline(VulkanTimeline timeline)
+{
+	u64 result = -1;
+	if Between(timeline, 0, VulkanTimeline_Count - 1) {
+		VulkanContext   *vk = vulkan_context;
+		VulkanQueue     *vq = vk->queues[timeline];
+		VulkanSemaphore *vs = &vq->timeline_semaphore;
+		result = ++vs->value;
+		VkSemaphoreSignalInfo ssi = {
+			.sType     = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO,
+			.semaphore = vs->semaphore,
+			.value     = result,
+		};
+		vkSignalSemaphore(vk->device, &ssi);
+	}
 	return result;
 }
 
 DEBUG_IMPORT VulkanHandle
-vk_compute_shader(s8 text, s8 name)
+vk_pipeline(VulkanPipelineCreateInfo *infos, u32 count, u32 push_constants_size) // NOTE: count == 1 builds a compute pipeline from infos[0], count == 2 a graphics pipeline
 {
+	assert(Between(count, 1, 2));
+	assert(count == 2 || infos[0].kind == VulkanShaderKind_Compute); // a lone shader must be a compute shader
+
 	VulkanHandle result = {0};
 	DeferLoop(take_lock(&vulkan_context->arena_lock, -1), release_lock(&vulkan_context->arena_lock))
 	{
 		Arena arena = vulkan_context->arena;
 
-		VulkanEntity *e = vk_entity_allocate(VulkanEntityKind_Shader);
+		VulkanEntity *e = vk_entity_allocate(VulkanEntityKind_Pipeline); // the returned handle wraps this entity pointer
 		result = (VulkanHandle){(u64)e};
 
-		e->as.shader = vk_compute_pipeline_from_shader_text(arena, text, name);
-		if (e->as.shader.pipeline == 0) e->as.shader = vulkan_context->default_compute_shader;
+		if (count == 2) e->as.pipeline = vk_graphics_pipeline_from_infos(arena, infos, count, push_constants_size);
+		else            e->as.pipeline = vk_compute_pipeline_from_shader_text(arena, infos[0].text, infos[0].name, push_constants_size);
 	}
 	return result;
 }
 
-DEBUG_IMPORT void
-vk_compute_shader_release(VulkanHandle h)
+DEBUG_IMPORT b32 // NOTE: non-zero only for a valid handle whose pipeline is not the context's default pipeline for its stage
+vk_pipeline_valid(VulkanHandle h)
 {
+	b32 result = 0;
 	if ValidVulkanHandle(h) {
-		VulkanShader *vs = vk_entity_data(h, VulkanEntityKind_Shader);
-		if (vs->pipeline != vulkan_context->default_compute_shader.pipeline) {
-			vkDestroyPipeline(vulkan_context->device, vs->pipeline, 0);
-			vkDestroyPipelineLayout(vulkan_context->device, vs->layout, 0);
+		VulkanPipeline *vp = vk_entity_data(h, VulkanEntityKind_Pipeline);
+		if (vp->stage_flags == VK_SHADER_STAGE_COMPUTE_BIT) // compute pipelines compare against the default compute pipeline
+			result = vp->pipeline != vulkan_context->default_compute_pipeline.pipeline;
+		else // everything else is compared against the default graphics pipeline
+			result = vp->pipeline != vulkan_context->default_graphics_pipeline.pipeline;
+	}
+	return result;
+}
+
+DEBUG_IMPORT void // NOTE: destroys a pipeline and its layout once the queue's most recently begun command buffer has completed
+vk_pipeline_release(VulkanHandle h)
+{
+	if (vk_pipeline_valid(h)) { // invalid handles and the default pipelines are left untouched
+		VulkanEntity *e = (VulkanEntity *)h.value[0];
+		VulkanTimeline timeline;
+		if (e->as.pipeline.stage_flags == VK_SHADER_STAGE_COMPUTE_BIT) timeline = VulkanTimeline_Compute;
+		else                                                           timeline = VulkanTimeline_Graphics;
+
+		VulkanQueue  *vq = vulkan_context->queues[timeline];
+		DeferLoop(take_lock(&vq->lock, -1), release_lock(&vq->lock))
+		{
+			u32 index = (vq->next_command_buffer_index - 1) % countof(vq->command_buffers); // slot handed out by the latest vk_command_begin
+			vk_host_wait_timeline(timeline, vq->command_buffer_submission_values[index], -1ULL); // no timeout; result is not checked
+
+			if (&e->as.pipeline == vq->bound_pipeline) // clear the queue's cached pointer before the pipeline goes away
+				vq->bound_pipeline = 0;
+
+			vkDestroyPipeline(vulkan_context->device, e->as.pipeline.pipeline, 0);
+			vkDestroyPipelineLayout(vulkan_context->device, e->as.pipeline.layout, 0);
+		}
+		vk_entity_release(e);
+	}
+}
+
+DEBUG_IMPORT VulkanHandle // NOTE: begins recording on the queue's next command buffer; returns a zero handle for an out of range timeline
+vk_command_begin(VulkanTimeline timeline)
+{
+	VulkanHandle result = {0};
+	if Between(timeline, 0, VulkanTimeline_Count - 1) {
+		VulkanContext *vk = vulkan_context;
+		VulkanQueue   *vq = vk->queues[timeline];
+
+		take_lock(&vq->lock, -1); // not released here: vk_command_end releases it after submitting
+
+		VulkanEntity        *e   = vk_entity_allocate(VulkanEntityKind_CommandBuffer);
+		VulkanCommandBuffer *vcb = &e->as.command_buffer;
+		u32 index = vq->next_command_buffer_index++ % countof(vq->command_buffers); // command buffers are handed out round-robin
+		vcb->kind                 = (VulkanQueueKind)timeline;
+		vcb->command_buffer_index = index;
+
+		// TODO(rnp): probably not the best to have this here but it will likely not be hit
+		b32 wait_result = vk_host_wait_timeline(timeline, vq->command_buffer_submission_values[index], -1ULL); // wait for the value stored by the last vk_command_end that used this slot
+		assert(wait_result);
+
+		VkCommandBufferBeginInfo buffer_begin_info = {
+			.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+			.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
+		};
+
+		vq->query_pool_occupied[index] = 0; // no timestamps written yet for this slot
+
+		vkBeginCommandBuffer(vq->command_buffers[index], &buffer_begin_info);
+		vkCmdResetQueryPool(vq->command_buffers[index], vq->query_pool, // resets only this slot's range of queries
+		                    index * MaxCommandBufferTimestamps, MaxCommandBufferTimestamps);
+
+		result = (VulkanHandle){(u64)e};
+	}
+	return result;
+}
+
+DEBUG_IMPORT void // NOTE: binds pipeline on the command buffer; an invalid pipeline handle selects the default pipeline for the queue kind
+vk_command_bind_pipeline(VulkanHandle command, VulkanHandle pipeline)
+{
+	if ValidVulkanHandle(command) {
+		VulkanContext       *vk  = vulkan_context;
+		VulkanCommandBuffer *vcb = vk_entity_data(command, VulkanEntityKind_CommandBuffer);
+		VulkanQueue         *vq  = vk->queues[vcb->kind];
+
+		VulkanPipeline *vp = 0;
+		if ValidVulkanHandle(pipeline) {
+			vp = vk_entity_data(pipeline, VulkanEntityKind_Pipeline);
+		} else if (vcb->kind == VulkanQueueKind_Compute) {
+			vp = &vk->default_compute_pipeline;
+		} else if (vcb->kind == VulkanQueueKind_Graphics) {
+			vp = &vk->default_graphics_pipeline;
+		} else {
+			InvalidCodePath; // no default pipeline exists for other queue kinds
+		}
+
+		read_only local_persist VkPipelineBindPoint bind_point_lut[VulkanQueueKind_Count] = {
+			[VulkanQueueKind_Graphics] = VK_PIPELINE_BIND_POINT_GRAPHICS,
+			[VulkanQueueKind_Compute]  = VK_PIPELINE_BIND_POINT_COMPUTE,
+			[VulkanQueueKind_Transfer] = -1, // sentinel rejected by the assert below
+		};
+
+		VkPipelineBindPoint bind_point = bind_point_lut[vcb->kind];
+		assert(bind_point != (VkPipelineBindPoint)-1);
+
+		vkCmdBindPipeline(vq->command_buffers[vcb->command_buffer_index], bind_point, vp->pipeline);
+		vq->bound_pipeline = vp; // read back by vk_command_push_constants
+	}
+}
+
+DEBUG_IMPORT void // NOTE: records one write -> read buffer barrier for every entry of barriers that refers to a valid buffer
+vk_command_buffer_memory_barriers(VulkanHandle command, GPUMemoryBarrierInfo *barriers, u64 count)
+{
+	if ValidVulkanHandle(command) {
+		VulkanContext       *vk  = vulkan_context;
+		VulkanCommandBuffer *vcb = vk_entity_data(command, VulkanEntityKind_CommandBuffer);
+		VulkanQueue         *vq  = vk->queues[vcb->kind];
+
+		DeferLoop(take_lock(&vk->arena_lock, -1), release_lock(&vk->arena_lock))
+		{
+			Arena arena = vk->arena; // by value copy: allocations below advance only this local
+			u32 valid_count = 0;
+			VkBufferMemoryBarrier2 *memory_barriers = push_array(&arena, VkBufferMemoryBarrier2, count); // NOTE(review): pNext is never written below; assumes push_array returns zeroed memory - confirm
+			for (u64 it = 0; it < count; it++) {
+				if ValidVulkanHandle(barriers[it].gpu_buffer->buffer) { // entries with an invalid buffer handle are skipped
+					u32           index = valid_count++;
+					VulkanBuffer *vb    = vk_entity_data(barriers[it].gpu_buffer->buffer, VulkanEntityKind_Buffer);
+					memory_barriers[index].sType               = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2;
+					memory_barriers[index].srcStageMask        = vq->pipeline_stage_flags;
+					memory_barriers[index].srcAccessMask       = VK_ACCESS_2_MEMORY_WRITE_BIT;
+					memory_barriers[index].dstStageMask        = vq->pipeline_stage_flags;
+					memory_barriers[index].dstAccessMask       = VK_ACCESS_2_MEMORY_READ_BIT;
+					memory_barriers[index].srcQueueFamilyIndex = vq->queue_family; // same family on both sides
+					memory_barriers[index].dstQueueFamilyIndex = vq->queue_family;
+					memory_barriers[index].buffer              = vb->buffer;
+					memory_barriers[index].offset              = barriers[it].offset;
+					memory_barriers[index].size                = barriers[it].size;
+				}
+			}
+
+			VkDependencyInfo dependancy_info = {
+				.sType                    = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+				.bufferMemoryBarrierCount = valid_count, // only the entries filled in above
+				.pBufferMemoryBarriers    = memory_barriers,
+			};
+
+			vkCmdPipelineBarrier2(vq->command_buffers[vcb->command_buffer_index], &dependancy_info);
+		}
+	}
+}
+
+DEBUG_IMPORT void
+vk_command_dispatch_compute(VulkanHandle command, uv3 dispatch)
+{
+	// NOTE: every dimension of the dispatch is expected to fit in 16 bits
+	assert(dispatch.x <= U16_MAX && dispatch.y <= U16_MAX && dispatch.z <= U16_MAX);
+
+	// NOTE: an invalid command handle turns the call into a no-op
+	if ValidVulkanHandle(command)
+		vkCmdDispatch(vk_command_buffer(command), dispatch.x, dispatch.y, dispatch.z);
+}
+
+DEBUG_IMPORT void
+vk_command_push_constants(VulkanHandle command, u32 offset, u32 size, void *values)
+{
+	// NOTE: layout and stage flags come from the pipeline last bound on the command's queue
+	if ValidVulkanHandle(command) {
+		VulkanCommandBuffer *command_buffer = vk_entity_data(command, VulkanEntityKind_CommandBuffer);
+		VulkanQueue         *queue          = vulkan_context->queues[command_buffer->kind];
+		VulkanPipeline      *bound          = queue->bound_pipeline;
+
+		// NOTE: a pipeline must have been bound with vk_command_bind_pipeline first
+		assert(bound);
+
+		VkCommandBuffer cmd = queue->command_buffers[command_buffer->command_buffer_index];
+		vkCmdPushConstants(cmd, bound->layout, bound->stage_flags, offset, size, values);
+	}
+}
+
+DEBUG_IMPORT void // NOTE: writes a timestamp into the command buffer's next free query; does nothing once all of its queries are used
+vk_command_timestamp(VulkanHandle command)
+{
+	if ValidVulkanHandle(command) {
+		VulkanContext       *vk  = vulkan_context;
+		VulkanCommandBuffer *vcb = vk_entity_data(command, VulkanEntityKind_CommandBuffer);
+		VulkanQueue         *vq  = vk->queues[vcb->kind];
+
+		read_only local_persist VkPipelineStageFlags2 stage_lut[VulkanQueueKind_Count] = {
+			[VulkanQueueKind_Graphics] = VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT,
+			[VulkanQueueKind_Compute]  = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+			[VulkanQueueKind_Transfer] = -1, // sentinel rejected by the assert below
+		};
+
+		VkPipelineStageFlags2 stage = stage_lut[vcb->kind];
+		assert(stage != (VkPipelineStageFlags2)-1);
+
+		if (vq->query_pool_occupied[vcb->command_buffer_index] < MaxCommandBufferTimestamps) {
+			u32 query_index = vq->query_pool_occupied[vcb->command_buffer_index]++;
+			vkCmdWriteTimestamp2(vq->command_buffers[vcb->command_buffer_index], stage,
+			                     vq->query_pool,
+			                     vcb->command_buffer_index * MaxCommandBufferTimestamps + query_index); // each command buffer owns MaxCommandBufferTimestamps consecutive queries
 		}
-		vk_entity_release((VulkanEntity *)h.value[0]);
 	}
 }
+
+DEBUG_IMPORT void
+vk_command_wait_timeline(VulkanHandle command, VulkanTimeline timeline, u64 value)
+{
+	// NOTE: only records the requested value on the command buffer; vk_command_end reads
+	// in_flight_wait_values when it builds the submission's semaphore waits
+	b32 valid = ValidVulkanHandle(command) && Between(timeline, 0, VulkanTimeline_Count - 1);
+	if (valid) {
+		VulkanCommandBuffer *command_buffer = vk_entity_data(command, VulkanEntityKind_CommandBuffer);
+		u32 slot = vulkan_context->queue_indices[timeline];
+
+		// NOTE: keep the largest value requested so far for this slot
+		if (value > command_buffer->in_flight_wait_values[slot])
+			command_buffer->in_flight_wait_values[slot] = value;
+	}
+}
+
+DEBUG_IMPORT u64 // NOTE: ends recording, submits the command buffer and returns the timeline value the submission signals
+vk_command_end(VulkanHandle command, VulkanHandle wait_semaphore, VulkanHandle finished_semaphore)
+{
+	u64 result = -1; // returned as is (all bits set) for an invalid command handle
+	if ValidVulkanHandle(command) {
+		VulkanContext       *vk  = vulkan_context;
+		VulkanCommandBuffer *vcb = vk_entity_data(command, VulkanEntityKind_CommandBuffer);
+		VulkanQueue         *vq  = vk->queues[vcb->kind];
+		VulkanSemaphore     *vs  = &vq->timeline_semaphore;
+
+		vkEndCommandBuffer(vq->command_buffers[vcb->command_buffer_index]);
+
+		VkCommandBufferSubmitInfo command_buffer_submit_info = {
+			.sType         = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+			.commandBuffer = vq->command_buffers[vcb->command_buffer_index],
+		};
+
+		result = ++vs->value; // next value of this queue's timeline semaphore
+
+		u32 signal_submit_info_count = 1;
+		VkSemaphoreSubmitInfo signal_submit_infos[2] = {{ // second entry is reserved for finished_semaphore
+			.sType     = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
+			.semaphore = vs->semaphore,
+			.value     = result,
+			.stageMask = vq->pipeline_stage_flags,
+		}};
+
+		if ValidVulkanHandle(finished_semaphore) {
+			VulkanSemaphore *fs = vk_entity_data(finished_semaphore, VulkanEntityKind_Semaphore);
+			signal_submit_infos[signal_submit_info_count++] = (VkSemaphoreSubmitInfo){ // value left at 0: presumably a binary semaphore - TODO confirm
+				.sType     = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
+				.semaphore = fs->semaphore,
+				.stageMask = vq->pipeline_stage_flags,
+			};
+		}
+
+		u32 wait_submit_info_count = 0;
+		VkSemaphoreSubmitInfo wait_submit_infos[VulkanQueueKind_Count + 1]; // one timeline wait per queue kind plus wait_semaphore
+		for (u32 i = 0; i < vk->unique_queues; i++) {
+			u32 queue_index = vk->queue_indices[i];
+			if (vcb->in_flight_wait_values[queue_index] > 0) { // 0 is treated as "no wait requested"
+				VulkanQueue *q = vk->queues[queue_index];
+				VkSemaphoreSubmitInfo wait_ssi = {
+					.sType     = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
+					.semaphore = q->timeline_semaphore.semaphore,
+					.value     = vcb->in_flight_wait_values[queue_index],
+					.stageMask = q->pipeline_stage_flags,
+				};
+				wait_submit_infos[wait_submit_info_count++] = wait_ssi;
+			}
+		}
+
+		if ValidVulkanHandle(wait_semaphore) {
+			VulkanSemaphore *ws = vk_entity_data(wait_semaphore, VulkanEntityKind_Semaphore);
+			wait_submit_infos[wait_submit_info_count++] = (VkSemaphoreSubmitInfo){ // value left at 0: presumably a binary semaphore - TODO confirm
+				.sType     = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
+				.semaphore = ws->semaphore,
+				.stageMask = vq->pipeline_stage_flags,
+			};
+		}
+
+		VkSubmitInfo2 submit_info = {
+			.sType                    = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
+			.commandBufferInfoCount   = 1,
+			.pCommandBufferInfos      = &command_buffer_submit_info,
+			.waitSemaphoreInfoCount   = wait_submit_info_count,
+			.pWaitSemaphoreInfos      = wait_submit_infos,
+			.signalSemaphoreInfoCount = signal_submit_info_count,
+			.pSignalSemaphoreInfos    = signal_submit_infos,
+		};
+
+		vkQueueSubmit2(vq->queue, 1, &submit_info, 0); // no fence; return value is not checked
+
+		vq->bound_pipeline = 0; // the next command buffer has to bind its own pipeline
+
+		atomic_store_u64(vq->command_buffer_submission_values + vcb->command_buffer_index, result); // waited on before this slot is reused or its pipeline destroyed
+
+		release_lock(&vq->lock); // pairs with the take_lock in vk_command_begin
+
+		vk_entity_release((VulkanEntity *)command.value[0]); // the command handle must not be used after this
+	}
+	return result;
+}
+
+// NOTE: transitions the attachments into their rendering layouts and begins a dynamic
+// rendering pass covering the full colour image. Colour is cleared to 0 and depth/stencil
+// to 1.0/0. When resolve is non-zero the colour attachment is resolved (averaged) into it.
+// colour and depth are dereferenced unconditionally; only resolve may be 0.
+DEBUG_IMPORT void
+vk_command_begin_rendering(VulkanHandle command, GPUImage *colour, GPUImage *depth, GPUImage *resolve)
+{
+	if ValidVulkanHandle(command) {
+		VkCommandBuffer cmd = vk_command_buffer(command);
+
+		assert((colour->width == depth->width) && (colour->height == depth->height));
+
+		VulkanImage *ci = vk_entity_data(colour->image, VulkanEntityKind_Image);
+		VulkanImage *di = vk_entity_data(depth->image,  VulkanEntityKind_Image);
+		VulkanImage *ri = 0;
+		if (resolve) ri = vk_entity_data(resolve->image, VulkanEntityKind_Image);
+
+		// NOTE: Layout Transitions
+		// oldLayout is UNDEFINED for every image: previous contents are not preserved
+		{
+			u32 image_memory_barrier_count = 2;
+			VkImageMemoryBarrier2 image_memory_barriers[3] = {
+				{
+					.sType            = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
+					.srcStageMask     = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
+					.srcAccessMask    = 0,
+					.dstStageMask     = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
+					.dstAccessMask    = VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT|VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
+					.oldLayout        = VK_IMAGE_LAYOUT_UNDEFINED,
+					.newLayout        = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+					.image            = ci->image,
+					.subresourceRange = {
+						.aspectMask     = VK_IMAGE_ASPECT_COLOR_BIT,
+						.baseMipLevel   = 0,
+						.levelCount     = 1,
+						.baseArrayLayer = 0,
+						.layerCount     = 1,
+					},
+				},
+				{
+					.sType            = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
+					.srcStageMask     = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT|VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT,
+					.srcAccessMask    = 0,
+					.dstStageMask     = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT|VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT,
+					.dstAccessMask    = VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
+					.oldLayout        = VK_IMAGE_LAYOUT_UNDEFINED,
+					.newLayout        = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
+					.image            = di->image,
+					.subresourceRange = {
+						.aspectMask     = VK_IMAGE_ASPECT_DEPTH_BIT|VK_IMAGE_ASPECT_STENCIL_BIT,
+						.baseMipLevel   = 0,
+						.levelCount     = 1,
+						.baseArrayLayer = 0,
+						.layerCount     = 1,
+					},
+				},
+			};
+
+			// NOTE: the resolve target must end up in the layout passed as resolveImageLayout
+			// below (VK_IMAGE_LAYOUT_GENERAL); transitioning it to a different layout here
+			// makes the layout declared at vkCmdBeginRendering not match the actual one
+			if (resolve) image_memory_barriers[image_memory_barrier_count++] = (VkImageMemoryBarrier2){
+				.sType            = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
+				.srcStageMask     = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
+				.srcAccessMask    = 0,
+				.dstStageMask     = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT|VK_PIPELINE_STAGE_2_RESOLVE_BIT,
+				.dstAccessMask    = VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT|VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
+				.oldLayout        = VK_IMAGE_LAYOUT_UNDEFINED,
+				.newLayout        = VK_IMAGE_LAYOUT_GENERAL,
+				.image            = ri->image,
+				.subresourceRange = {
+					.aspectMask     = VK_IMAGE_ASPECT_COLOR_BIT,
+					.baseMipLevel   = 0,
+					.levelCount     = 1,
+					.baseArrayLayer = 0,
+					.layerCount     = 1,
+				},
+			};
+
+			VkDependencyInfo dependency_info = {
+				.sType                   = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+				.imageMemoryBarrierCount = image_memory_barrier_count,
+				.pImageMemoryBarriers    = image_memory_barriers,
+			};
+
+			vkCmdPipelineBarrier2(cmd, &dependency_info);
+		}
+
+		// NOTE: the resolve fields are only filled in when a resolve image was provided
+		VkRenderingAttachmentInfo colour_attachment = {
+			.sType              = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
+			.imageView          = ci->view,
+			.imageLayout        = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+			.resolveMode        = ri ? VK_RESOLVE_MODE_AVERAGE_BIT : 0,
+			.resolveImageView   = ri ? ri->view : 0,
+			.resolveImageLayout = ri ? VK_IMAGE_LAYOUT_GENERAL : 0,
+			.loadOp             = VK_ATTACHMENT_LOAD_OP_CLEAR,
+			.storeOp            = VK_ATTACHMENT_STORE_OP_STORE,
+			.clearValue         = {.color = {{0.0f, 0.0f, 0.0f, 0.0f}}},
+		};
+
+		// NOTE: the same image is used for both the depth and the stencil attachment
+		VkRenderingAttachmentInfo depth_stencil_attachment = {
+			.sType       = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
+			.imageView   = di->view,
+			.imageLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
+			.loadOp      = VK_ATTACHMENT_LOAD_OP_CLEAR,
+			.storeOp     = VK_ATTACHMENT_STORE_OP_STORE,
+			.clearValue  = {.depthStencil = {1.0f, 0}},
+		};
+
+		VkRenderingInfo rendering_info = {
+			.sType                = VK_STRUCTURE_TYPE_RENDERING_INFO,
+			.renderArea           = {.offset = {0}, .extent = {colour->width, colour->height}},
+			.layerCount           = 1,
+			.colorAttachmentCount = 1,
+			.pColorAttachments    = &colour_attachment,
+			.pDepthAttachment     = &depth_stencil_attachment,
+			.pStencilAttachment   = &depth_stencil_attachment,
+		};
+
+		vkCmdBeginRendering(cmd, &rendering_info);
+	}
+}
+
+DEBUG_IMPORT void
+vk_command_draw(VulkanHandle command, GPUBuffer *model)
+{
+	// NOTE: binds the model's buffer as the index buffer (from offset 0) and records a
+	// single instance indexed draw of model->index_count indices
+	b32 valid = ValidVulkanHandle(command) && ValidVulkanHandle(model->buffer);
+	if (valid) {
+		VkCommandBuffer cmd          = vk_command_buffer(command);
+		VulkanBuffer   *render_model = vk_entity_data(model->buffer, VulkanEntityKind_RenderModel);
+		VkDeviceSize    index_bytes  = vk_index_size(render_model->index_type) * model->index_count;
+
+		vkCmdBindIndexBuffer2(cmd, render_model->buffer, 0, index_bytes, render_model->index_type);
+		vkCmdDrawIndexed(cmd, model->index_count, 1, 0, 0, 0);
+	}
+}
+
+DEBUG_IMPORT void
+vk_command_scissor(VulkanHandle command, u32 width, u32 height, u32 x_offset, u32 y_offset)
+{
+	// NOTE: sets scissor rectangle 0 of the command buffer; no-op for an invalid handle
+	if ValidVulkanHandle(command) {
+		VkRect2D rect = {0};
+		rect.offset.x      = x_offset;
+		rect.offset.y      = y_offset;
+		rect.extent.width  = width;
+		rect.extent.height = height;
+		vkCmdSetScissor(vk_command_buffer(command), 0, 1, &rect);
+	}
+}
+
+DEBUG_IMPORT void
+vk_command_viewport(VulkanHandle command, f32 width, f32 height, f32 x_offset, f32 y_offset, f32 min_depth, f32 max_depth)
+{
+	// NOTE: sets viewport 0 of the command buffer; no-op for an invalid handle
+	if ValidVulkanHandle(command) {
+		VkViewport view = {
+			.x        = x_offset,
+			.y        = y_offset,
+			.width    = width,
+			.height   = height,
+			.minDepth = min_depth,
+			.maxDepth = max_depth,
+		};
+		vkCmdSetViewport(vk_command_buffer(command), 0, 1, &view);
+	}
+}
+
+DEBUG_IMPORT void
+vk_command_end_rendering(VulkanHandle command)
+{
+	// NOTE: ends the pass started by vk_command_begin_rendering; no-op for an invalid handle
+	if ValidVulkanHandle(command) {
+		VkCommandBuffer cmd = vk_command_buffer(command);
+		vkCmdEndRendering(cmd);
+	}
+}
+
+DEBUG_IMPORT void
+vk_command_copy_buffer(VulkanHandle command, GPUBuffer *restrict destination,
+                       GPUBuffer *restrict source, u64 source_offset, i64 size)
+{
+	// NOTE: records a copy of size bytes starting at source_offset in source to the start
+	// of destination; nothing is recorded if any of the three handles is invalid
+	b32 valid = ValidVulkanHandle(command) && ValidVulkanHandle(destination->buffer) && ValidVulkanHandle(source->buffer);
+	if (valid) {
+		VkCommandBuffer cmd = vk_command_buffer(command);
+		VulkanBuffer   *dst = vk_entity_data(destination->buffer, VulkanEntityKind_Buffer);
+		VulkanBuffer   *src = vk_entity_data(source->buffer,      VulkanEntityKind_Buffer);
+
+		VkBufferCopy2 region = {
+			.sType     = VK_STRUCTURE_TYPE_BUFFER_COPY_2,
+			.srcOffset = source_offset,
+			.dstOffset = 0,
+			.size      = size,
+		};
+
+		VkCopyBufferInfo2 info = {
+			.sType       = VK_STRUCTURE_TYPE_COPY_BUFFER_INFO_2,
+			.srcBuffer   = src->buffer,
+			.dstBuffer   = dst->buffer,
+			.regionCount = 1,
+			.pRegions    = &region,
+		};
+
+		vkCmdCopyBuffer2(cmd, &info);
+	}
+}
+
+// NOTE: reads back the timestamps written into the most recently begun command buffer of
+// timeline, waiting for them to become available. The array is allocated from arena:
+// element 0 holds the number of timestamps and the raw 64 bit values follow it.
+// Returns 0 when a valid timeline has no timestamps recorded. An out of range timeline
+// returns a single element array which is not written here (NOTE(review): presumably
+// zeroed by push_array, i.e. a count of 0 - confirm).
+DEBUG_IMPORT u64 *
+vk_command_read_timestamps(VulkanTimeline timeline, Arena *arena)
+{
+	u64 *result = 0;
+	if Between(timeline, 0, VulkanTimeline_Count - 1) {
+		VulkanContext *vk = vulkan_context;
+		VulkanQueue   *vq = vk->queues[timeline];
+		DeferLoop(take_lock(&vq->lock, -1), release_lock(&vq->lock)) {
+			// NOTE: slot handed out by the latest vk_command_begin on this queue
+			u32 index = (vq->next_command_buffer_index - 1) % countof(vq->command_buffers);
+			u32 count = vq->query_pool_occupied[index];
+			if (count > 0) {
+				result = push_array(arena, u64, count + 1);
+				result[0] = count;
+
+				// NOTE: VK_QUERY_RESULT_64_BIT is required here: without it the results are
+				// written as 32 bit values, truncating every timestamp stored in the u64 array
+				vkGetQueryPoolResults(vk->device, vq->query_pool, index * MaxCommandBufferTimestamps, count,
+				                      count * sizeof(u64), result + 1, sizeof(u64),
+				                      VK_QUERY_RESULT_64_BIT|VK_QUERY_RESULT_WAIT_BIT);
+			}
+		}
+	} else {
+		result = push_array(arena, u64, 1);
+	}
+	return result;
+}
diff --git a/vulkan.h b/vulkan.h
@@ -24,7 +24,9 @@
 
 typedef uint32_t VkBool32;
 typedef uint32_t VkFlags;
+typedef uint64_t VkFlags64; // 64 bit counterpart of VkFlags
 typedef uint32_t VkSampleMask;
+typedef uint64_t VkDeviceAddress; // NOTE(review): presumably a GPU buffer address (see VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO) - confirm
 typedef uint64_t VkDeviceSize;
 VK_HANDLE(VkBuffer);
 VK_HANDLE(VkCommandBuffer);
@@ -41,6 +43,7 @@ VK_HANDLE(VkPhysicalDevice);
 VK_HANDLE(VkPipeline);
 VK_HANDLE(VkPipelineCache);
 VK_HANDLE(VkPipelineLayout);
+VK_HANDLE(VkQueryPool);
 VK_HANDLE(VkQueue);
 VK_HANDLE(VkRenderPass);
 VK_HANDLE(VkSampler);
@@ -51,53 +54,85 @@ VK_HANDLE(VkSwapchainKHR);
 
 typedef enum {
 	VK_SUCCESS               = 0,
+	VK_TIMEOUT               = 2, // a wait did not complete within its timeout
 	VK_SUBOPTIMAL_KHR        = 1000001003,
 	VK_ERROR_OUT_OF_DATE_KHR = -1000001004,
 	VK_RESULT_MAX_ENUM       = 0x7FFFFFFF
 } VkResult;
 
 typedef enum {
-	VK_STRUCTURE_TYPE_APPLICATION_INFO                          = 0,
-	VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO                      = 1,
-	VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO                  = 2,
-	VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO                        = 3,
-	VK_STRUCTURE_TYPE_SUBMIT_INFO                               = 4,
-	VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO                      = 5,
-	VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE                       = 6,
-	VK_STRUCTURE_TYPE_FENCE_CREATE_INFO                         = 8,
-	VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO                     = 9,
-	VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO                    = 15,
-	VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO                 = 16,
-	VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO         = 18,
-	VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO   = 19,
-	VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO = 20,
-	VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO       = 22,
-	VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO  = 23,
-	VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO    = 24,
-	VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO    = 26,
-	VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO        = 27,
-	VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO             = 28,
-	VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO              = 29,
-	VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO               = 30,
-	VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO                   = 37,
-	VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO                   = 38,
-	VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO                  = 39,
-	VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO              = 40,
-	VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO                 = 42,
-	VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO                    = 43,
-	VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES     = 50,
-	VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR                 = 1000001000,
-	VK_STRUCTURE_TYPE_PRESENT_INFO_KHR                          = 1000001001,
-	VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2              = 1000059001,
-	VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2       = 1000059006,
-	VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO                = 1000060000,
-	VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO               = 1000072002,
-	VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR          = 1000073003,
-	VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR                    = 1000074002,
-	VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO              = 1000077000,
-	VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR       = 1000078003,
-	VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR                 = 1000079001,
-	VK_STRUCTURE_TYPE_MAX_ENUM                                  = 0x7FFFFFFF,
+	VK_STRUCTURE_TYPE_APPLICATION_INFO                                                 = 0, // NOTE: only a subset of values is listed, in ascending order (e.g. 7, 10 and 13 are absent)
+	VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO                                             = 1,
+	VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO                                         = 2,
+	VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO                                               = 3,
+	VK_STRUCTURE_TYPE_SUBMIT_INFO                                                      = 4,
+	VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO                                             = 5,
+	VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE                                              = 6,
+	VK_STRUCTURE_TYPE_FENCE_CREATE_INFO                                                = 8,
+	VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO                                            = 9,
+	VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO                                           = 11,
+	VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO                                               = 12,
+	VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO                                                = 14,
+	VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO                                           = 15,
+	VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO                                        = 16,
+	VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO                                = 18,
+	VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO                          = 19,
+	VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO                        = 20,
+	VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO                              = 22,
+	VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO                         = 23,
+	VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO                           = 24,
+	VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO                         = 25,
+	VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO                           = 26,
+	VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO                               = 27,
+	VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO                                    = 28,
+	VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO                                     = 29,
+	VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO                                      = 30,
+	VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO                                          = 37,
+	VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO                                          = 38,
+	VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO                                         = 39,
+	VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO                                     = 40,
+	VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO                                        = 42,
+	VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO                                           = 43,
+	VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES                              = 49,
+	VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES                            = 50,
+	VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES                              = 51,
+	VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES                            = 52,
+	VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES                              = 53,
+	VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR                                        = 1000001000, // values from here on are in the 1000000000+ range rather than small consecutive numbers
+	VK_STRUCTURE_TYPE_PRESENT_INFO_KHR                                                 = 1000001001,
+	VK_STRUCTURE_TYPE_RENDERING_INFO                                                   = 1000044000,
+	VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO                                        = 1000044001,
+	VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO                                   = 1000044002,
+	VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2                                       = 1000059000,
+	VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2                                     = 1000059001,
+	VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2                                              = 1000059002,
+	VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2                              = 1000059006,
+	VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO                                       = 1000060000,
+	VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO                                = 1000072001,
+	VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO                                      = 1000072002,
+	VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR                                 = 1000073003,
+	VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR                                           = 1000074002,
+	VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO                                     = 1000077000,
+	VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR                              = 1000078003,
+	VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR                                        = 1000079001,
+	VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO                                   = 1000127001,
+	VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO                                       = 1000207002,
+	VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO                                   = 1000207003,
+	VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO                                              = 1000207004,
+	VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO                                            = 1000207005,
+	VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO                                       = 1000244001,
+	VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT                                          = 1000247000,
+	VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2                                          = 1000314001,
+	VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2                                           = 1000314002,
+	VK_STRUCTURE_TYPE_DEPENDENCY_INFO                                                  = 1000314003,
+	VK_STRUCTURE_TYPE_SUBMIT_INFO_2                                                    = 1000314004,
+	VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO                                            = 1000314005,
+	VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO                                       = 1000314006,
+	VK_STRUCTURE_TYPE_COPY_BUFFER_INFO_2                                               = 1000337000,
+	VK_STRUCTURE_TYPE_BUFFER_COPY_2                                                    = 1000337006,
+	VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3                                              = 1000360000,
+	VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_RELAXED_EXTENDED_INSTRUCTION_FEATURES_KHR = 1000558000,
+	VK_STRUCTURE_TYPE_MAX_ENUM                                                         = 0x7FFFFFFF,
 } VkStructureType;
 
 typedef enum {
@@ -110,6 +145,27 @@ typedef enum {
 } VkPhysicalDeviceType;
 
 typedef enum {
+	VK_QUERY_TYPE_OCCLUSION                                                      = 0, // NOTE: 0-2 carry no vendor suffix; the larger values are suffixed KHR/EXT/NV/INTEL
+	VK_QUERY_TYPE_PIPELINE_STATISTICS                                            = 1,
+	VK_QUERY_TYPE_TIMESTAMP                                                      = 2,
+	VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR                                         = 1000023000,
+	VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT                                  = 1000028004,
+	VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR                                          = 1000116000,
+	VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR                      = 1000150000,
+	VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR                  = 1000150001,
+	VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_NV                       = 1000165000,
+	VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL                                        = 1000210000,
+	VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR                                      = 1000299000,
+	VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT                                  = 1000328000,
+	VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT                                       = 1000382000,
+	VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR = 1000386000,
+	VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR                                = 1000386001,
+	VK_QUERY_TYPE_MICROMAP_SERIALIZATION_SIZE_EXT                                = 1000396000,
+	VK_QUERY_TYPE_MICROMAP_COMPACTED_SIZE_EXT                                    = 1000396001,
+	VK_QUERY_TYPE_MAX_ENUM                                                       = 0x7FFFFFFF // same sentinel as the other *_MAX_ENUM entries in this header
+} VkQueryType;
+
+typedef enum {
 	VK_SYSTEM_ALLOCATION_SCOPE_COMMAND  = 0,
 	VK_SYSTEM_ALLOCATION_SCOPE_OBJECT   = 1,
 	VK_SYSTEM_ALLOCATION_SCOPE_CACHE    = 2,
@@ -180,6 +236,121 @@ typedef enum {
 } VkPipelineStageFlagBits;
 typedef VkFlags VkPipelineStageFlags;
 
+typedef enum { /* 64-bit pipeline stage mask bits (ULL constants); several names below deliberately share one bit value. */
+	VK_PIPELINE_STAGE_2_NONE                                     = 0ULL,
+	VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT                          = 0x00000001ULL,
+	VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT                        = 0x00000002ULL,
+	VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT                         = 0x00000004ULL,
+	VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT                        = 0x00000008ULL,
+	VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT          = 0x00000010ULL,
+	VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT       = 0x00000020ULL,
+	VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT                      = 0x00000040ULL,
+	VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT                      = 0x00000080ULL,
+	VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT                 = 0x00000100ULL,
+	VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT                  = 0x00000200ULL,
+	VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT              = 0x00000400ULL,
+	VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT                       = 0x00000800ULL,
+	VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT                         = 0x00001000ULL,
+	VK_PIPELINE_STAGE_2_TRANSFER_BIT                             = 0x00001000ULL, /* same value as ALL_TRANSFER_BIT */
+	VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT                       = 0x00002000ULL,
+	VK_PIPELINE_STAGE_2_HOST_BIT                                 = 0x00004000ULL,
+	VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT                         = 0x00008000ULL,
+	VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT                         = 0x00010000ULL,
+	VK_PIPELINE_STAGE_2_COPY_BIT                                 = 0x100000000ULL, /* NOTE(review): first value above 32 bits; ISO C before C23 requires enumerators to fit in int, so this relies on a compiler extension - confirm for every supported toolchain */
+	VK_PIPELINE_STAGE_2_RESOLVE_BIT                              = 0x200000000ULL,
+	VK_PIPELINE_STAGE_2_BLIT_BIT                                 = 0x400000000ULL,
+	VK_PIPELINE_STAGE_2_CLEAR_BIT                                = 0x800000000ULL,
+	VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT                          = 0x1000000000ULL,
+	VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT               = 0x2000000000ULL,
+	VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT            = 0x4000000000ULL,
+	VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR                     = 0x04000000ULL,
+	VK_PIPELINE_STAGE_2_VIDEO_ENCODE_BIT_KHR                     = 0x08000000ULL,
+	VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT               = 0x01000000ULL,
+	VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT            = 0x00040000ULL,
+	VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV                = 0x00020000ULL,
+	VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_EXT               = 0x00020000ULL, /* same value as the _NV name */
+	VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR = 0x00400000ULL,
+	VK_PIPELINE_STAGE_2_SHADING_RATE_IMAGE_BIT_NV                = 0x00400000ULL, /* same value as FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR */
+	VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR     = 0x02000000ULL,
+	VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR               = 0x00200000ULL,
+	VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_NV                = 0x00200000ULL, /* same value as the _KHR name */
+	VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_NV      = 0x02000000ULL, /* same value as the _KHR name */
+	VK_PIPELINE_STAGE_2_FRAGMENT_DENSITY_PROCESS_BIT_EXT         = 0x00800000ULL,
+	VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV                       = 0x00080000ULL,
+	VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV                       = 0x00100000ULL,
+	VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT                      = 0x00080000ULL, /* same value as the _NV name */
+	VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT                      = 0x00100000ULL, /* same value as the _NV name */
+	VK_PIPELINE_STAGE_2_SUBPASS_SHADER_BIT_HUAWEI                = 0x8000000000ULL,
+	VK_PIPELINE_STAGE_2_INVOCATION_MASK_BIT_HUAWEI               = 0x10000000000ULL,
+	VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR      = 0x10000000ULL,
+	VK_PIPELINE_STAGE_2_MICROMAP_BUILD_BIT_EXT                   = 0x40000000ULL,
+	VK_PIPELINE_STAGE_2_CLUSTER_CULLING_SHADER_BIT_HUAWEI        = 0x20000000000ULL,
+	VK_PIPELINE_STAGE_2_OPTICAL_FLOW_BIT_NV                      = 0x20000000ULL,
+	VK_PIPELINE_STAGE_2_CONVERT_COOPERATIVE_VECTOR_MATRIX_BIT_NV = 0x100000000000ULL,
+	VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM                       = 0x40000000000ULL,
+	VK_PIPELINE_STAGE_2_COPY_INDIRECT_BIT_KHR                    = 0x400000000000ULL,
+	VK_PIPELINE_STAGE_2_MEMORY_DECOMPRESSION_BIT_EXT             = 0x200000000000ULL,
+} VkPipelineStageFlagBits2; /* unlike the 32-bit flag enums in this header, no *_MAX_ENUM sentinel is declared */
+typedef VkFlags64 VkPipelineStageFlags2; /* mask type for combinations of the bits above */
+
+typedef enum { /* 64-bit access mask bits (ULL constants); the _NV/_EXT/_KHR variants below reuse the value of their counterpart. */
+	VK_ACCESS_2_NONE                                          = 0ULL,
+	VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT                     = 0x00000001ULL,
+	VK_ACCESS_2_INDEX_READ_BIT                                = 0x00000002ULL,
+	VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT                     = 0x00000004ULL,
+	VK_ACCESS_2_UNIFORM_READ_BIT                              = 0x00000008ULL,
+	VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT                     = 0x00000010ULL,
+	VK_ACCESS_2_SHADER_READ_BIT                               = 0x00000020ULL,
+	VK_ACCESS_2_SHADER_WRITE_BIT                              = 0x00000040ULL,
+	VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT                     = 0x00000080ULL,
+	VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT                    = 0x00000100ULL,
+	VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT             = 0x00000200ULL,
+	VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT            = 0x00000400ULL,
+	VK_ACCESS_2_TRANSFER_READ_BIT                             = 0x00000800ULL,
+	VK_ACCESS_2_TRANSFER_WRITE_BIT                            = 0x00001000ULL,
+	VK_ACCESS_2_HOST_READ_BIT                                 = 0x00002000ULL,
+	VK_ACCESS_2_HOST_WRITE_BIT                                = 0x00004000ULL,
+	VK_ACCESS_2_MEMORY_READ_BIT                               = 0x00008000ULL,
+	VK_ACCESS_2_MEMORY_WRITE_BIT                              = 0x00010000ULL,
+	VK_ACCESS_2_SHADER_SAMPLED_READ_BIT                       = 0x100000000ULL, /* NOTE(review): first value above 32 bits; ISO C before C23 requires enumerators to fit in int, so this relies on a compiler extension - confirm for every supported toolchain */
+	VK_ACCESS_2_SHADER_STORAGE_READ_BIT                       = 0x200000000ULL,
+	VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT                      = 0x400000000ULL,
+	VK_ACCESS_2_VIDEO_DECODE_READ_BIT_KHR                     = 0x800000000ULL,
+	VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR                    = 0x1000000000ULL,
+	VK_ACCESS_2_VIDEO_ENCODE_READ_BIT_KHR                     = 0x2000000000ULL,
+	VK_ACCESS_2_VIDEO_ENCODE_WRITE_BIT_KHR                    = 0x4000000000ULL,
+	VK_ACCESS_2_SHADER_TILE_ATTACHMENT_READ_BIT_QCOM          = 0x8000000000000ULL,
+	VK_ACCESS_2_SHADER_TILE_ATTACHMENT_WRITE_BIT_QCOM         = 0x10000000000000ULL,
+	VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT              = 0x02000000ULL,
+	VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT       = 0x04000000ULL,
+	VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT      = 0x08000000ULL,
+	VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT            = 0x00100000ULL,
+	VK_ACCESS_2_COMMAND_PREPROCESS_READ_BIT_NV                = 0x00020000ULL,
+	VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV               = 0x00040000ULL,
+	VK_ACCESS_2_COMMAND_PREPROCESS_READ_BIT_EXT               = 0x00020000ULL, /* same value as the _NV name */
+	VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_EXT              = 0x00040000ULL, /* same value as the _NV name */
+	VK_ACCESS_2_FRAGMENT_SHADING_RATE_ATTACHMENT_READ_BIT_KHR = 0x00800000ULL,
+	VK_ACCESS_2_SHADING_RATE_IMAGE_READ_BIT_NV                = 0x00800000ULL, /* same value as FRAGMENT_SHADING_RATE_ATTACHMENT_READ_BIT_KHR */
+	VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR           = 0x00200000ULL,
+	VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR          = 0x00400000ULL,
+	VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_NV            = 0x00200000ULL, /* same value as the _KHR name */
+	VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_NV           = 0x00400000ULL, /* same value as the _KHR name */
+	VK_ACCESS_2_FRAGMENT_DENSITY_MAP_READ_BIT_EXT             = 0x01000000ULL,
+	VK_ACCESS_2_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT     = 0x00080000ULL,
+	VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT                = 0x20000000000ULL,
+	VK_ACCESS_2_INVOCATION_MASK_READ_BIT_HUAWEI               = 0x8000000000ULL,
+	VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR             = 0x10000000000ULL,
+	VK_ACCESS_2_MICROMAP_READ_BIT_EXT                         = 0x100000000000ULL,
+	VK_ACCESS_2_MICROMAP_WRITE_BIT_EXT                        = 0x200000000000ULL,
+	VK_ACCESS_2_OPTICAL_FLOW_READ_BIT_NV                      = 0x40000000000ULL,
+	VK_ACCESS_2_OPTICAL_FLOW_WRITE_BIT_NV                     = 0x80000000000ULL,
+	VK_ACCESS_2_DATA_GRAPH_READ_BIT_ARM                       = 0x800000000000ULL,
+	VK_ACCESS_2_DATA_GRAPH_WRITE_BIT_ARM                      = 0x1000000000000ULL,
+	VK_ACCESS_2_MEMORY_DECOMPRESSION_READ_BIT_EXT             = 0x80000000000000ULL,
+	VK_ACCESS_2_MEMORY_DECOMPRESSION_WRITE_BIT_EXT            = 0x100000000000000ULL,
+} VkAccessFlagBits2; /* unlike the 32-bit flag enums in this header, no *_MAX_ENUM sentinel is declared */
+typedef VkFlags64 VkAccessFlags2; /* mask type for combinations of the bits above */
+
 typedef VkFlags VkDeviceCreateFlags;
 
 typedef enum {
@@ -189,6 +360,128 @@ typedef enum {
 } VkPointClippingBehavior;
 
 typedef enum {
+	VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT                                                           = 0x00000001,
+	VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT                                                           = 0x00000002,
+	VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT                                                    = 0x00000004,
+	VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT                                                    = 0x00000008,
+	VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT                                                    = 0x00000010,
+	VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT                                             = 0x00000020,
+	VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT                                                           = 0x00000040,
+	VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT                                                        = 0x00000080,
+	VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT                                                  = 0x00000100,
+	VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT                                                = 0x00000200,
+	VK_FORMAT_FEATURE_BLIT_SRC_BIT                                                                = 0x00000400,
+	VK_FORMAT_FEATURE_BLIT_DST_BIT                                                                = 0x00000800,
+	VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT                                             = 0x00001000,
+	VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_CUBIC_BIT_EXT                                          = 0x00002000,
+	VK_FORMAT_FEATURE_TRANSFER_SRC_BIT                                                            = 0x00004000,
+	VK_FORMAT_FEATURE_TRANSFER_DST_BIT                                                            = 0x00008000,
+	VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT                                             = 0x00010000,
+	VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT                                                 = 0x00020000,
+	VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT                            = 0x00040000,
+	VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT           = 0x00080000,
+	VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_CHROMA_RECONSTRUCTION_EXPLICIT_BIT           = 0x00100000,
+	VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_CHROMA_RECONSTRUCTION_EXPLICIT_FORCEABLE_BIT = 0x00200000,
+	VK_FORMAT_FEATURE_DISJOINT_BIT                                                                = 0x00400000,
+	VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT                                                  = 0x00800000,
+	VK_FORMAT_FEATURE_FRAGMENT_DENSITY_MAP_BIT_EXT                                                = 0x01000000,
+	VK_FORMAT_FEATURE_VIDEO_DECODE_OUTPUT_BIT_KHR                                                 = 0x02000000,
+	VK_FORMAT_FEATURE_VIDEO_DECODE_DPB_BIT_KHR                                                    = 0x04000000,
+	VK_FORMAT_FEATURE_VIDEO_ENCODE_DPB_BIT_KHR                                                    = 0x10000000, /* NOTE(review): 0x08000000 has no entry here, while the 64-bit enum names bit 27 VIDEO_ENCODE_INPUT_BIT_KHR - confirm the omission is intended */
+	VK_FORMAT_FEATURE_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR                                = 0x20000000,
+	VK_FORMAT_FEATURE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR                                    = 0x40000000,
+	VK_FORMAT_FEATURE_FLAG_BITS_MAX_ENUM                                                          = 0x7FFFFFFF /* sentinel: largest positive 32-bit signed value */
+} VkFormatFeatureFlagBits; /* 32-bit format feature bits; every entry except the sentinel is a single bit */
+typedef VkFlags VkFormatFeatureFlags; /* mask type for combinations of the bits above */
+
+
+typedef enum { /* 64-bit format feature bits written as (1ULL << n); bits 0..30 reuse the positions of the 32-bit VK_FORMAT_FEATURE_* values. */
+	VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT                                                           = (1ULL <<  0),
+	VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT                                                           = (1ULL <<  1),
+	VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT                                                    = (1ULL <<  2),
+	VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT                                                    = (1ULL <<  3),
+	VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT                                                    = (1ULL <<  4),
+	VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT                                             = (1ULL <<  5),
+	VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT                                                           = (1ULL <<  6),
+	VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT                                                        = (1ULL <<  7),
+	VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT                                                  = (1ULL <<  8),
+	VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT                                                = (1ULL <<  9),
+	VK_FORMAT_FEATURE_2_BLIT_SRC_BIT                                                                = (1ULL << 10),
+	VK_FORMAT_FEATURE_2_BLIT_DST_BIT                                                                = (1ULL << 11),
+	VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT                                             = (1ULL << 12),
+	VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_CUBIC_BIT                                              = (1ULL << 13),
+	VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT                                                            = (1ULL << 14),
+	VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT                                                            = (1ULL << 15),
+	VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT                                             = (1ULL << 16),
+	VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT                                                 = (1ULL << 17),
+	VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT                            = (1ULL << 18),
+	VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT           = (1ULL << 19),
+	VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_CHROMA_RECONSTRUCTION_EXPLICIT_BIT           = (1ULL << 20),
+	VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_CHROMA_RECONSTRUCTION_EXPLICIT_FORCEABLE_BIT = (1ULL << 21),
+	VK_FORMAT_FEATURE_2_DISJOINT_BIT                                                                = (1ULL << 22),
+	VK_FORMAT_FEATURE_2_COSITED_CHROMA_SAMPLES_BIT                                                  = (1ULL << 23),
+	VK_FORMAT_FEATURE_2_FRAGMENT_DENSITY_MAP_BIT_EXT                                                = (1ULL << 24),
+	VK_FORMAT_FEATURE_2_VIDEO_DECODE_OUTPUT_BIT_KHR                                                 = (1ULL << 25),
+	VK_FORMAT_FEATURE_2_VIDEO_DECODE_DPB_BIT_KHR                                                    = (1ULL << 26),
+	VK_FORMAT_FEATURE_2_VIDEO_ENCODE_INPUT_BIT_KHR                                                  = (1ULL << 27),
+	VK_FORMAT_FEATURE_2_VIDEO_ENCODE_DPB_BIT_KHR                                                    = (1ULL << 28),
+	VK_FORMAT_FEATURE_2_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR                                = (1ULL << 29),
+	VK_FORMAT_FEATURE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR                                    = (1ULL << 30),
+	VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT                                             = (1ULL << 31), /* NOTE(review): from this bit upward the value no longer fits in int; ISO C before C23 requires enumerators to fit in int, so this relies on a compiler extension - confirm for every supported toolchain */
+	VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT                                            = (1ULL << 32),
+	VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT                                          = (1ULL << 33),
+	VK_FORMAT_FEATURE_2_WEIGHT_IMAGE_BIT_QCOM                                                       = (1ULL << 34),
+	VK_FORMAT_FEATURE_2_WEIGHT_SAMPLED_IMAGE_BIT_QCOM                                               = (1ULL << 35),
+	VK_FORMAT_FEATURE_2_BLOCK_MATCHING_BIT_QCOM                                                     = (1ULL << 36),
+	VK_FORMAT_FEATURE_2_BOX_FILTER_SAMPLED_BIT_QCOM                                                 = (1ULL << 37),
+	VK_FORMAT_FEATURE_2_LINEAR_COLOR_ATTACHMENT_BIT_NV                                              = (1ULL << 38),
+	VK_FORMAT_FEATURE_2_TENSOR_SHADER_BIT_ARM                                                       = (1ULL << 39),
+	VK_FORMAT_FEATURE_2_OPTICAL_FLOW_IMAGE_BIT_NV                                                   = (1ULL << 40),
+	VK_FORMAT_FEATURE_2_OPTICAL_FLOW_VECTOR_BIT_NV                                                  = (1ULL << 41),
+	VK_FORMAT_FEATURE_2_OPTICAL_FLOW_COST_BIT_NV                                                    = (1ULL << 42),
+	VK_FORMAT_FEATURE_2_TENSOR_IMAGE_ALIASING_BIT_ARM                                               = (1ULL << 43),
+
+	VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT                                                     = (1ULL << 46), /* bits 44, 45 and 47 have no entry in this list */
+
+	VK_FORMAT_FEATURE_2_TENSOR_DATA_GRAPH_BIT_ARM                                                   = (1ULL << 48),
+	VK_FORMAT_FEATURE_2_VIDEO_ENCODE_QUANTIZATION_DELTA_MAP_BIT_KHR                                 = (1ULL << 49),
+	VK_FORMAT_FEATURE_2_VIDEO_ENCODE_EMPHASIS_MAP_BIT_KHR                                           = (1ULL << 50),
+	VK_FORMAT_FEATURE_2_ACCELERATION_STRUCTURE_RADIUS_BUFFER_BIT_NV                                 = (1ULL << 51),
+	VK_FORMAT_FEATURE_2_DEPTH_COPY_ON_COMPUTE_QUEUE_BIT_KHR                                         = (1ULL << 52),
+	VK_FORMAT_FEATURE_2_DEPTH_COPY_ON_TRANSFER_QUEUE_BIT_KHR                                        = (1ULL << 53),
+	VK_FORMAT_FEATURE_2_STENCIL_COPY_ON_COMPUTE_QUEUE_BIT_KHR                                       = (1ULL << 54),
+	VK_FORMAT_FEATURE_2_STENCIL_COPY_ON_TRANSFER_QUEUE_BIT_KHR                                      = (1ULL << 55),
+
+	VK_FORMAT_FEATURE_2_COPY_IMAGE_INDIRECT_DST_BIT_KHR                                             = (1ULL << 59), /* bits 56..58 have no entry in this list */
+} VkFormatFeatureFlagBits2; /* no *_MAX_ENUM sentinel is declared for this 64-bit enum */
+typedef VkFlags64 VkFormatFeatureFlags2; /* mask type for combinations of the bits above */
+
+typedef enum { /* Image creation flag bits; entries are single bits listed out of numeric order. */
+	VK_IMAGE_CREATE_SPARSE_BINDING_BIT                            = 0x00000001,
+	VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT                          = 0x00000002,
+	VK_IMAGE_CREATE_SPARSE_ALIASED_BIT                            = 0x00000004,
+	VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT                            = 0x00000008,
+	VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT                           = 0x00000010,
+	VK_IMAGE_CREATE_ALIAS_BIT                                     = 0x00000400,
+	VK_IMAGE_CREATE_SPLIT_INSTANCE_BIND_REGIONS_BIT               = 0x00000040,
+	VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT                       = 0x00000020,
+	VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT               = 0x00000080,
+	VK_IMAGE_CREATE_EXTENDED_USAGE_BIT                            = 0x00000100,
+	VK_IMAGE_CREATE_PROTECTED_BIT                                 = 0x00000800,
+	VK_IMAGE_CREATE_DISJOINT_BIT                                  = 0x00000200,
+	VK_IMAGE_CREATE_CORNER_SAMPLED_BIT_NV                         = 0x00002000,
+	VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT     = 0x00001000,
+	VK_IMAGE_CREATE_SUBSAMPLED_BIT_EXT                            = 0x00004000,
+	VK_IMAGE_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT      = 0x00010000,
+	VK_IMAGE_CREATE_MULTISAMPLED_RENDER_TO_SINGLE_SAMPLED_BIT_EXT = 0x00040000,
+	VK_IMAGE_CREATE_2D_VIEW_COMPATIBLE_BIT_EXT                    = 0x00020000,
+	VK_IMAGE_CREATE_VIDEO_PROFILE_INDEPENDENT_BIT_KHR             = 0x00100000,
+	VK_IMAGE_CREATE_FRAGMENT_DENSITY_MAP_OFFSET_BIT_EXT           = 0x00008000,
+	VK_IMAGE_CREATE_FLAG_BITS_MAX_ENUM                            = 0x7FFFFFFF /* sentinel: largest positive 32-bit signed value */
+} VkImageCreateFlagBits;
+typedef VkFlags VkImageCreateFlags; /* mask type for combinations of the bits above */
+
+typedef enum {
 	VK_SAMPLE_COUNT_1_BIT              = 0x00000001,
 	VK_SAMPLE_COUNT_2_BIT              = 0x00000002,
 	VK_SAMPLE_COUNT_4_BIT              = 0x00000004,
@@ -253,6 +546,14 @@ typedef enum {
 	VK_IMAGE_VIEW_TYPE_MAX_ENUM   = 0x7FFFFFFF
 } VkImageViewType;
 
+typedef enum VkIndexType { /* Index element type selector. NOTE(review): the only enum in this part of the header declared with a tag; the others use anonymous `typedef enum {` - confirm whether the tag is needed. */
+	VK_INDEX_TYPE_UINT16   = 0,
+	VK_INDEX_TYPE_UINT32   = 1,
+	VK_INDEX_TYPE_UINT8    = 1000265000,
+	VK_INDEX_TYPE_NONE_KHR = 1000165000,
+	VK_INDEX_TYPE_MAX_ENUM = 0x7FFFFFFF /* sentinel: largest positive 32-bit signed value */
+} VkIndexType;
+
 typedef enum {
 	VK_BLEND_FACTOR_ZERO                     = 0,
 	VK_BLEND_FACTOR_ONE                      = 1,
@@ -336,9 +637,22 @@ typedef enum {
 	VK_FENCE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
 } VkFenceCreateFlagBits;
 typedef VkFlags VkFenceCreateFlags;
+
+typedef enum { /* Query pool creation flag bits; only one real bit is defined. */
+	VK_QUERY_POOL_CREATE_RESET_BIT_KHR      = 0x00000001,
+	VK_QUERY_POOL_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF /* sentinel: largest positive 32-bit signed value */
+} VkQueryPoolCreateFlagBits;
+typedef VkFlags VkQueryPoolCreateFlags; /* mask type for combinations of the bits above */
+
 typedef VkFlags VkSemaphoreCreateFlags;
 
 typedef enum {
+	VK_SEMAPHORE_WAIT_ANY_BIT            = 0x00000001, /* the only real bit defined in this enum */
+	VK_SEMAPHORE_WAIT_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF /* sentinel: largest positive 32-bit signed value */
+} VkSemaphoreWaitFlagBits; /* semaphore wait flag bits */
+typedef VkFlags VkSemaphoreWaitFlags; /* mask type for combinations of the bits above */
+
+typedef enum {
 	VK_IMAGE_VIEW_CREATE_FRAGMENT_DENSITY_MAP_DYNAMIC_BIT_EXT     = 0x00000001,
 	VK_IMAGE_VIEW_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT = 0x00000004,
 	VK_IMAGE_VIEW_CREATE_FRAGMENT_DENSITY_MAP_DEFERRED_BIT_EXT    = 0x00000002,
@@ -816,6 +1130,20 @@ typedef enum {
 } VkFormat;
 
 typedef enum {
+	VK_IMAGE_TILING_OPTIMAL                 = 0,
+	VK_IMAGE_TILING_LINEAR                  = 1,
+	VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT = 1000158000,
+	VK_IMAGE_TILING_MAX_ENUM                = 0x7FFFFFFF /* sentinel: largest positive 32-bit signed value */
+} VkImageTiling; /* image tiling selector: two small values plus one extension-numbered value */
+
+typedef enum { /* Image dimensionality selector (1D/2D/3D). */
+	VK_IMAGE_TYPE_1D       = 0,
+	VK_IMAGE_TYPE_2D       = 1,
+	VK_IMAGE_TYPE_3D       = 2,
+	VK_IMAGE_TYPE_MAX_ENUM = 0x7FFFFFFF /* sentinel: largest positive 32-bit signed value */
+} VkImageType;
+
+typedef enum {
 	VK_COLOR_SPACE_SRGB_NONLINEAR_KHR          = 0,
 	VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT    = 1000104001,
 	VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT    = 1000104002,
@@ -843,6 +1171,61 @@ typedef enum {
 } VkSharingMode;
 
 typedef enum {
+	VK_QUERY_RESULT_64_BIT                = 0x00000001,
+	VK_QUERY_RESULT_WAIT_BIT              = 0x00000002,
+	VK_QUERY_RESULT_WITH_AVAILABILITY_BIT = 0x00000004,
+	VK_QUERY_RESULT_PARTIAL_BIT           = 0x00000008,
+	VK_QUERY_RESULT_WITH_STATUS_BIT_KHR   = 0x00000010,
+	VK_QUERY_RESULT_FLAG_BITS_MAX_ENUM    = 0x7FFFFFFF /* sentinel: largest positive 32-bit signed value */
+} VkQueryResultFlagBits; /* query result flag bits; consecutive single bits 0x01..0x10 */
+typedef VkFlags VkQueryResultFlags; /* mask type for combinations of the bits above */
+
+typedef enum { /* Buffer creation flag bits; consecutive single bits 0x01..0x40. */
+	VK_BUFFER_CREATE_SPARSE_BINDING_BIT                       = 0x00000001,
+	VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT                     = 0x00000002,
+	VK_BUFFER_CREATE_SPARSE_ALIASED_BIT                       = 0x00000004,
+	VK_BUFFER_CREATE_PROTECTED_BIT                            = 0x00000008,
+	VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT        = 0x00000010,
+	VK_BUFFER_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT = 0x00000020,
+	VK_BUFFER_CREATE_VIDEO_PROFILE_INDEPENDENT_BIT_KHR        = 0x00000040,
+	VK_BUFFER_CREATE_FLAG_BITS_MAX_ENUM                       = 0x7FFFFFFF /* sentinel: largest positive 32-bit signed value */
+} VkBufferCreateFlagBits;
+typedef VkFlags VkBufferCreateFlags; /* mask type for combinations of the bits above */
+
+typedef enum { /* Buffer usage flag bits; entries are single bits listed out of numeric order. */
+	VK_BUFFER_USAGE_TRANSFER_SRC_BIT                                     = 0x00000001,
+	VK_BUFFER_USAGE_TRANSFER_DST_BIT                                     = 0x00000002,
+	VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT                             = 0x00000004,
+	VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT                             = 0x00000008,
+	VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT                                   = 0x00000010,
+	VK_BUFFER_USAGE_STORAGE_BUFFER_BIT                                   = 0x00000020,
+	VK_BUFFER_USAGE_INDEX_BUFFER_BIT                                     = 0x00000040,
+	VK_BUFFER_USAGE_VERTEX_BUFFER_BIT                                    = 0x00000080,
+	VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT                                  = 0x00000100,
+	VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT                            = 0x00020000,
+	VK_BUFFER_USAGE_VIDEO_DECODE_SRC_BIT_KHR                             = 0x00002000,
+	VK_BUFFER_USAGE_VIDEO_DECODE_DST_BIT_KHR                             = 0x00004000,
+	VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT                    = 0x00000800,
+	VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_COUNTER_BUFFER_BIT_EXT            = 0x00001000,
+	VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT                        = 0x00000200,
+	VK_BUFFER_USAGE_EXECUTION_GRAPH_SCRATCH_BIT_AMDX                     = 0x02000000,
+	VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR = 0x00080000,
+	VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR               = 0x00100000,
+	VK_BUFFER_USAGE_SHADER_BINDING_TABLE_BIT_KHR                         = 0x00000400,
+	VK_BUFFER_USAGE_VIDEO_ENCODE_DST_BIT_KHR                             = 0x00008000,
+	VK_BUFFER_USAGE_VIDEO_ENCODE_SRC_BIT_KHR                             = 0x00010000,
+	VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT                    = 0x00200000,
+	VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT                   = 0x00400000,
+	VK_BUFFER_USAGE_PUSH_DESCRIPTORS_DESCRIPTOR_BUFFER_BIT_EXT           = 0x04000000,
+	VK_BUFFER_USAGE_MICROMAP_BUILD_INPUT_READ_ONLY_BIT_EXT               = 0x00800000,
+	VK_BUFFER_USAGE_MICROMAP_STORAGE_BIT_EXT                             = 0x01000000,
+	VK_BUFFER_USAGE_TILE_MEMORY_BIT_QCOM                                 = 0x08000000,
+	VK_BUFFER_USAGE_FLAG_BITS_MAX_ENUM                                   = 0x7FFFFFFF /* sentinel: largest positive 32-bit signed value */
+} VkBufferUsageFlagBits;
+typedef VkFlags VkBufferUsageFlags; /* mask type for combinations of the bits above */
+typedef VkFlags VkBufferViewCreateFlags; /* plain VkFlags alias; no flag-bits enum accompanies it in this hunk */
+
+typedef enum {
 	VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT = 0x00000001,
 	VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT      = 0x00000002,
 	VK_PIPELINE_SHADER_STAGE_CREATE_FLAG_BITS_MAX_ENUM              = 0x7FFFFFFF
@@ -1079,12 +1462,6 @@ typedef enum {
 typedef VkFlags VkDescriptorSetLayoutCreateFlags;
 
 typedef enum {
-	VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT      = 0x00000001,
-	VK_ATTACHMENT_DESCRIPTION_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
-} VkAttachmentDescriptionFlagBits;
-typedef VkFlags VkAttachmentDescriptionFlags;
-
-typedef enum {
 	VK_DESCRIPTOR_TYPE_SAMPLER                               = 0,
 	VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER                = 1,
 	VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE                         = 2,
@@ -1138,13 +1515,6 @@ typedef enum {
 } VkCommandBufferLevel;
 
 typedef enum {
-	VK_SUBPASS_CONTENTS_INLINE                                   = 0,
-	VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS                = 1,
-	VK_SUBPASS_CONTENTS_INLINE_AND_SECONDARY_COMMAND_BUFFERS_KHR = 1000451000,
-	VK_SUBPASS_CONTENTS_MAX_ENUM                                 = 0x7FFFFFFF
-} VkSubpassContents;
-
-typedef enum {
 	VK_IMAGE_LAYOUT_UNDEFINED                                    = 0,
 	VK_IMAGE_LAYOUT_GENERAL                                      = 1,
 	VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL                     = 2,
@@ -1195,26 +1565,6 @@ typedef enum {
 typedef VkFlags VkFramebufferCreateFlags;
 
 typedef enum {
-	VK_RENDER_PASS_CREATE_TRANSFORM_BIT_QCOM = 0x00000002,
-	VK_RENDER_PASS_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
-} VkRenderPassCreateFlagBits;
-typedef VkFlags VkRenderPassCreateFlags;
-
-typedef enum {
-	VK_SUBPASS_DESCRIPTION_PER_VIEW_ATTRIBUTES_BIT_NVX                           = 0x00000001,
-	VK_SUBPASS_DESCRIPTION_PER_VIEW_POSITION_X_ONLY_BIT_NVX                      = 0x00000002,
-	VK_SUBPASS_DESCRIPTION_FRAGMENT_REGION_BIT_QCOM                              = 0x00000004,
-	VK_SUBPASS_DESCRIPTION_SHADER_RESOLVE_BIT_QCOM                               = 0x00000008,
-	VK_SUBPASS_DESCRIPTION_TILE_SHADING_APRON_BIT_QCOM                           = 0x00000100,
-	VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_EXT   = 0x00000010,
-	VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_EXT   = 0x00000020,
-	VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_EXT = 0x00000040,
-	VK_SUBPASS_DESCRIPTION_ENABLE_LEGACY_DITHERING_BIT_EXT                       = 0x00000080,
-	VK_SUBPASS_DESCRIPTION_FLAG_BITS_MAX_ENUM                                    = 0x7FFFFFFF
-} VkSubpassDescriptionFlagBits;
-typedef VkFlags VkSubpassDescriptionFlags;
-
-typedef enum {
 	VK_COMMAND_POOL_CREATE_TRANSIENT_BIT            = 0x00000001,
 	VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT = 0x00000002,
 	VK_COMMAND_POOL_CREATE_PROTECTED_BIT            = 0x00000004,
@@ -1296,24 +1646,24 @@ typedef enum {
 typedef VkFlags VkMemoryAllocateFlags;
 
 typedef enum {
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT                       = 0x00000001,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT                    = 0x00000002,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT                = 0x00000004,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_TEXTURE_BIT                   = 0x00000008,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_TEXTURE_KMT_BIT               = 0x00000010,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP_BIT                      = 0x00000020,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE_BIT                  = 0x00000040,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT                     = 0x00000200,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID = 0x00000400,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT             = 0x00000080,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT  = 0x00000100,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_ZIRCON_VMO_BIT_FUCHSIA              = 0x00000800,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_RDMA_ADDRESS_BIT_NV                 = 0x00001000,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_SCREEN_BUFFER_BIT_QNX               = 0x00004000,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_MTLBUFFER_BIT_EXT                   = 0x00010000,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_MTLTEXTURE_BIT_EXT                  = 0x00020000,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_MTLHEAP_BIT_EXT                     = 0x00040000,
-    VK_EXTERNAL_MEMORY_HANDLE_TYPE_FLAG_BITS_MAX_ENUM                  = 0x7FFFFFFF,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT                       = 0x00000001, /* external memory handle kinds as single-bit flags; this hunk only swaps space indentation for tabs, values are unchanged */
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT                    = 0x00000002,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT                = 0x00000004,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_TEXTURE_BIT                   = 0x00000008,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_TEXTURE_KMT_BIT               = 0x00000010,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP_BIT                      = 0x00000020,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE_BIT                  = 0x00000040,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT                     = 0x00000200,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID = 0x00000400,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT             = 0x00000080,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT  = 0x00000100,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_ZIRCON_VMO_BIT_FUCHSIA              = 0x00000800,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_RDMA_ADDRESS_BIT_NV                 = 0x00001000,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_SCREEN_BUFFER_BIT_QNX               = 0x00004000,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_MTLBUFFER_BIT_EXT                   = 0x00010000,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_MTLTEXTURE_BIT_EXT                  = 0x00020000,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_MTLHEAP_BIT_EXT                     = 0x00040000,
+	VK_EXTERNAL_MEMORY_HANDLE_TYPE_FLAG_BITS_MAX_ENUM                  = 0x7FFFFFFF, /* sentinel: largest positive 32-bit signed value */
 } VkExternalMemoryHandleTypeFlagBits;
 typedef VkFlags VkExternalMemoryHandleTypeFlags;
 
@@ -1328,6 +1678,44 @@ typedef enum {
 } VkExternalSemaphoreHandleTypeFlagBits;
 typedef VkFlags VkExternalSemaphoreHandleTypeFlags;
 
+typedef enum {
+	VK_SEMAPHORE_TYPE_BINARY   = 0,
+	VK_SEMAPHORE_TYPE_TIMELINE = 1,
+	VK_SEMAPHORE_TYPE_MAX_ENUM = 0x7FFFFFFF
+} VkSemaphoreType;
+
+typedef enum {
+	VK_RESOLVE_MODE_NONE                                   = 0,
+	VK_RESOLVE_MODE_SAMPLE_ZERO_BIT                        = 0x00000001,
+	VK_RESOLVE_MODE_AVERAGE_BIT                            = 0x00000002,
+	VK_RESOLVE_MODE_MIN_BIT                                = 0x00000004,
+	VK_RESOLVE_MODE_MAX_BIT                                = 0x00000008,
+	VK_RESOLVE_MODE_EXTERNAL_FORMAT_DOWNSAMPLE_BIT_ANDROID = 0x00000010,
+	VK_RESOLVE_MODE_CUSTOM_BIT_EXT                         = 0x00000020,
+	VK_RESOLVE_MODE_FLAG_BITS_MAX_ENUM                     = 0x7FFFFFFF
+} VkResolveModeFlagBits;
+typedef VkFlags VkResolveModeFlags;
+
+typedef enum {
+	VK_RENDERING_CONTENTS_SECONDARY_COMMAND_BUFFERS_BIT       = 0x00000001,
+	VK_RENDERING_SUSPENDING_BIT                               = 0x00000002,
+	VK_RENDERING_RESUMING_BIT                                 = 0x00000004,
+	VK_RENDERING_ENABLE_LEGACY_DITHERING_BIT_EXT              = 0x00000008,
+	VK_RENDERING_CONTENTS_INLINE_BIT_KHR                      = 0x00000010,
+	VK_RENDERING_PER_LAYER_FRAGMENT_DENSITY_BIT_VALVE         = 0x00000020,
+	VK_RENDERING_FRAGMENT_REGION_BIT_EXT                      = 0x00000040,
+	VK_RENDERING_CUSTOM_RESOLVE_BIT_EXT                       = 0x00000080,
+	VK_RENDERING_LOCAL_READ_CONCURRENT_ACCESS_CONTROL_BIT_KHR = 0x00000100,
+	VK_RENDERING_FLAG_BITS_MAX_ENUM                           = 0x7FFFFFFF
+} VkRenderingFlagBits;
+typedef VkFlags VkRenderingFlags;
+
+typedef enum {
+	VK_SUBMIT_PROTECTED_BIT      = 0x00000001,
+	VK_SUBMIT_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkSubmitFlagBits;
+typedef VkFlags VkSubmitFlags;
+
 typedef struct {
 	uint32_t width;
 	uint32_t height;
@@ -1371,18 +1759,6 @@ typedef struct {
 } VkLayerProperties;
 
 typedef struct {
-	VkStructureType              sType;
-	const void *                 pNext;
-	uint32_t                     waitSemaphoreCount;
-	const VkSemaphore *          pWaitSemaphores;
-	const VkPipelineStageFlags * pWaitDstStageMask;
-	uint32_t                     commandBufferCount;
-	const VkCommandBuffer *      pCommandBuffers;
-	uint32_t                     signalSemaphoreCount;
-	const VkSemaphore *          pSignalSemaphores;
-} VkSubmitInfo;
-
-typedef struct {
 	VkStructureType sType;
 	const void *    pNext;
 	const char *    pApplicationName;
@@ -1393,6 +1769,26 @@ typedef struct {
 } VkApplicationInfo;
 
 typedef struct {
+	VkFormatFeatureFlags linearTilingFeatures;
+	VkFormatFeatureFlags optimalTilingFeatures;
+	VkFormatFeatureFlags bufferFeatures;
+} VkFormatProperties;
+
+typedef struct {
+	VkStructureType    sType;
+	void *             pNext;
+	VkFormatProperties formatProperties;
+} VkFormatProperties2;
+
+typedef struct {
+	VkStructureType       sType;
+	void *                pNext;
+	VkFormatFeatureFlags2 linearTilingFeatures;
+	VkFormatFeatureFlags2 optimalTilingFeatures;
+	VkFormatFeatureFlags2 bufferFeatures;
+} VkFormatProperties3;
+
+typedef struct {
 	VkStructureType           sType;
 	const void *              pNext;
 	VkInstanceCreateFlags     flags;
@@ -1640,6 +2036,107 @@ typedef struct {
 } VkPhysicalDeviceFeatures;
 
 typedef struct {
+	VkStructureType sType;
+	void *          pNext;
+	VkBool32        storageBuffer16BitAccess;
+	VkBool32        uniformAndStorageBuffer16BitAccess;
+	VkBool32        storagePushConstant16;
+	VkBool32        storageInputOutput16;
+	VkBool32        multiview;
+	VkBool32        multiviewGeometryShader;
+	VkBool32        multiviewTessellationShader;
+	VkBool32        variablePointersStorageBuffer;
+	VkBool32        variablePointers;
+	VkBool32        protectedMemory;
+	VkBool32        samplerYcbcrConversion;
+	VkBool32        shaderDrawParameters;
+} VkPhysicalDeviceVulkan11Features;
+
+typedef struct {
+	VkStructureType sType;
+	void *          pNext;
+	VkBool32        samplerMirrorClampToEdge;
+	VkBool32        drawIndirectCount;
+	VkBool32        storageBuffer8BitAccess;
+	VkBool32        uniformAndStorageBuffer8BitAccess;
+	VkBool32        storagePushConstant8;
+	VkBool32        shaderBufferInt64Atomics;
+	VkBool32        shaderSharedInt64Atomics;
+	VkBool32        shaderFloat16;
+	VkBool32        shaderInt8;
+	VkBool32        descriptorIndexing;
+	VkBool32        shaderInputAttachmentArrayDynamicIndexing;
+	VkBool32        shaderUniformTexelBufferArrayDynamicIndexing;
+	VkBool32        shaderStorageTexelBufferArrayDynamicIndexing;
+	VkBool32        shaderUniformBufferArrayNonUniformIndexing;
+	VkBool32        shaderSampledImageArrayNonUniformIndexing;
+	VkBool32        shaderStorageBufferArrayNonUniformIndexing;
+	VkBool32        shaderStorageImageArrayNonUniformIndexing;
+	VkBool32        shaderInputAttachmentArrayNonUniformIndexing;
+	VkBool32        shaderUniformTexelBufferArrayNonUniformIndexing;
+	VkBool32        shaderStorageTexelBufferArrayNonUniformIndexing;
+	VkBool32        descriptorBindingUniformBufferUpdateAfterBind;
+	VkBool32        descriptorBindingSampledImageUpdateAfterBind;
+	VkBool32        descriptorBindingStorageImageUpdateAfterBind;
+	VkBool32        descriptorBindingStorageBufferUpdateAfterBind;
+	VkBool32        descriptorBindingUniformTexelBufferUpdateAfterBind;
+	VkBool32        descriptorBindingStorageTexelBufferUpdateAfterBind;
+	VkBool32        descriptorBindingUpdateUnusedWhilePending;
+	VkBool32        descriptorBindingPartiallyBound;
+	VkBool32        descriptorBindingVariableDescriptorCount;
+	VkBool32        runtimeDescriptorArray;
+	VkBool32        samplerFilterMinmax;
+	VkBool32        scalarBlockLayout;
+	VkBool32        imagelessFramebuffer;
+	VkBool32        uniformBufferStandardLayout;
+	VkBool32        shaderSubgroupExtendedTypes;
+	VkBool32        separateDepthStencilLayouts;
+	VkBool32        hostQueryReset;
+	VkBool32        timelineSemaphore;
+	VkBool32        bufferDeviceAddress;
+	VkBool32        bufferDeviceAddressCaptureReplay;
+	VkBool32        bufferDeviceAddressMultiDevice;
+	VkBool32        vulkanMemoryModel;
+	VkBool32        vulkanMemoryModelDeviceScope;
+	VkBool32        vulkanMemoryModelAvailabilityVisibilityChains;
+	VkBool32        shaderOutputViewportIndex;
+	VkBool32        shaderOutputLayer;
+	VkBool32        subgroupBroadcastDynamicId;
+} VkPhysicalDeviceVulkan12Features;
+
+typedef struct {
+	VkStructureType sType;
+	void *          pNext;
+	VkBool32        robustImageAccess;
+	VkBool32        inlineUniformBlock;
+	VkBool32        descriptorBindingInlineUniformBlockUpdateAfterBind;
+	VkBool32        pipelineCreationCacheControl;
+	VkBool32        privateData;
+	VkBool32        shaderDemoteToHelperInvocation;
+	VkBool32        shaderTerminateInvocation;
+	VkBool32        subgroupSizeControl;
+	VkBool32        computeFullSubgroups;
+	VkBool32        synchronization2;
+	VkBool32        textureCompressionASTC_HDR;
+	VkBool32        shaderZeroInitializeWorkgroupMemory;
+	VkBool32        dynamicRendering;
+	VkBool32        shaderIntegerDotProduct;
+	VkBool32        maintenance4;
+} VkPhysicalDeviceVulkan13Features;
+
+typedef struct {
+	VkStructureType sType;
+	void *          pNext;
+	VkBool32        shaderRelaxedExtendedInstruction;
+} VkPhysicalDeviceShaderRelaxedExtendedInstructionFeaturesKHR;
+
+typedef struct {
+	VkStructureType          sType;
+	void *                   pNext;
+	VkPhysicalDeviceFeatures features;
+} VkPhysicalDeviceFeatures2;
+
+typedef struct {
 	VkQueueFlags queueFlags;
 	uint32_t     queueCount;
 	uint32_t     timestampValidBits;
@@ -1718,7 +2215,6 @@ typedef struct {
 	VkColorSpaceKHR colorSpace;
 } VkSurfaceFormatKHR;
 
-
 typedef struct {
 	VkStructureType    sType;
 	const void *       pNext;
@@ -1726,39 +2222,82 @@ typedef struct {
 } VkFenceCreateInfo;
 
 typedef struct {
+	VkStructureType               sType;
+	const void *                  pNext;
+	VkQueryPoolCreateFlags        flags;
+	VkQueryType                   queryType;
+	uint32_t                      queryCount;
+	VkQueryPipelineStatisticFlags pipelineStatistics;
+} VkQueryPoolCreateInfo;
+
+typedef struct {
 	VkStructureType        sType;
 	const void *           pNext;
 	VkSemaphoreCreateFlags flags;
 } VkSemaphoreCreateInfo;
 
 typedef struct {
-    VkStructureType                    sType;
-    const void *                       pNext;
-    VkExternalSemaphoreHandleTypeFlags handleTypes;
+	VkStructureType sType;
+	const void *    pNext;
+	VkSemaphoreType semaphoreType;
+	uint64_t        initialValue;
+} VkSemaphoreTypeCreateInfo;
+
+typedef struct {
+	VkStructureType  sType;
+	const void *     pNext;
+	uint32_t         waitSemaphoreValueCount;
+	const uint64_t * pWaitSemaphoreValues;
+	uint32_t         signalSemaphoreValueCount;
+	const uint64_t * pSignalSemaphoreValues;
+} VkTimelineSemaphoreSubmitInfo;
+
+typedef struct {
+	VkStructureType      sType;
+	const void *         pNext;
+	VkSemaphoreWaitFlags flags;
+	uint32_t             semaphoreCount;
+	const VkSemaphore *  pSemaphores;
+	const uint64_t *     pValues;
+} VkSemaphoreWaitInfo;
+
+typedef struct {
+	VkStructureType sType;
+	const void *    pNext;
+	VkSemaphore     semaphore;
+	uint64_t        value;
+} VkSemaphoreSignalInfo;
+
+typedef struct {
+	VkStructureType       sType;
+	const void *          pNext;
+	VkSemaphore           semaphore;
+	uint64_t              value;
+	VkPipelineStageFlags2 stageMask;
+	uint32_t              deviceIndex;
+} VkSemaphoreSubmitInfo;
+
+typedef struct {
+	VkStructureType                    sType;
+	const void *                       pNext;
+	VkExternalSemaphoreHandleTypeFlags handleTypes;
 } VkExportSemaphoreCreateInfo;
 
 typedef struct {
-    VkStructureType                       sType;
-    const void *                          pNext;
-    VkSemaphore                           semaphore;
-    VkExternalSemaphoreHandleTypeFlagBits handleType;
+	VkStructureType                       sType;
+	const void *                          pNext;
+	VkSemaphore                           semaphore;
+	VkExternalSemaphoreHandleTypeFlagBits handleType;
 } VkSemaphoreGetWin32HandleInfoKHR;
 
 typedef struct {
-    VkStructureType                       sType;
-    const void *                          pNext;
-    VkSemaphore                           semaphore;
-    VkExternalSemaphoreHandleTypeFlagBits handleType;
+	VkStructureType                       sType;
+	const void *                          pNext;
+	VkSemaphore                           semaphore;
+	VkExternalSemaphoreHandleTypeFlagBits handleType;
 } VkSemaphoreGetFdInfoKHR;
 
 typedef struct {
-	VkComponentSwizzle r;
-	VkComponentSwizzle g;
-	VkComponentSwizzle b;
-	VkComponentSwizzle a;
-} VkComponentMapping;
-
-typedef struct {
 	VkImageAspectFlags aspectMask;
 	uint32_t           baseMipLevel;
 	uint32_t           levelCount;
@@ -1767,6 +2306,64 @@ typedef struct {
 } VkImageSubresourceRange;
 
 typedef struct {
+	VkStructureType       sType;
+	const void *          pNext;
+	VkPipelineStageFlags2 srcStageMask;
+	VkAccessFlags2        srcAccessMask;
+	VkPipelineStageFlags2 dstStageMask;
+	VkAccessFlags2        dstAccessMask;
+} VkMemoryBarrier2;
+
+typedef struct {
+	VkStructureType       sType;
+	const void *          pNext;
+	VkPipelineStageFlags2 srcStageMask;
+	VkAccessFlags2        srcAccessMask;
+	VkPipelineStageFlags2 dstStageMask;
+	VkAccessFlags2        dstAccessMask;
+	uint32_t              srcQueueFamilyIndex;
+	uint32_t              dstQueueFamilyIndex;
+	VkBuffer              buffer;
+	VkDeviceSize          offset;
+	VkDeviceSize          size;
+} VkBufferMemoryBarrier2;
+
+typedef struct {
+	VkStructureType         sType;
+	const void *            pNext;
+	VkPipelineStageFlags2   srcStageMask;
+	VkAccessFlags2          srcAccessMask;
+	VkPipelineStageFlags2   dstStageMask;
+	VkAccessFlags2          dstAccessMask;
+	VkImageLayout           oldLayout;
+	VkImageLayout           newLayout;
+	uint32_t                srcQueueFamilyIndex;
+	uint32_t                dstQueueFamilyIndex;
+	VkImage                 image;
+	VkImageSubresourceRange subresourceRange;
+} VkImageMemoryBarrier2;
+
+typedef struct {
+	VkStructureType                sType;
+	const void *                   pNext;
+	VkDependencyFlags              dependencyFlags;
+	uint32_t                       memoryBarrierCount;
+	const VkMemoryBarrier2 *       pMemoryBarriers;
+	uint32_t                       bufferMemoryBarrierCount;
+	const VkBufferMemoryBarrier2 * pBufferMemoryBarriers;
+	uint32_t                       imageMemoryBarrierCount;
+	const VkImageMemoryBarrier2 *  pImageMemoryBarriers;
+} VkDependencyInfo;
+
+
+typedef struct {
+	VkComponentSwizzle r;
+	VkComponentSwizzle g;
+	VkComponentSwizzle b;
+	VkComponentSwizzle a;
+} VkComponentMapping;
+
+typedef struct {
 	VkStructureType         sType;
 	const void *            pNext;
 	VkImageViewCreateFlags  flags;
@@ -1778,6 +2375,35 @@ typedef struct {
 } VkImageViewCreateInfo;
 
 typedef struct {
+	VkStructureType     sType;
+	const void *        pNext;
+	VkBufferCreateFlags flags;
+	VkDeviceSize        size;
+	VkBufferUsageFlags  usage;
+	VkSharingMode       sharingMode;
+	uint32_t            queueFamilyIndexCount;
+	const uint32_t *    pQueueFamilyIndices;
+} VkBufferCreateInfo;
+
+typedef struct {
+	VkStructureType       sType;
+	const void *          pNext;
+	VkImageCreateFlags    flags;
+	VkImageType           imageType;
+	VkFormat              format;
+	VkExtent3D            extent;
+	uint32_t              mipLevels;
+	uint32_t              arrayLayers;
+	VkSampleCountFlagBits samples;
+	VkImageTiling         tiling;
+	VkImageUsageFlags     usage;
+	VkSharingMode         sharingMode;
+	uint32_t              queueFamilyIndexCount;
+	const uint32_t *      pQueueFamilyIndices;
+	VkImageLayout         initialLayout;
+} VkImageCreateInfo;
+
+typedef struct {
 	VkStructureType           sType;
 	const void *              pNext;
 	VkShaderModuleCreateFlags flags;
@@ -1965,69 +2591,57 @@ typedef struct {
 	const VkPushConstantRange *   pPushConstantRanges;
 } VkPipelineLayoutCreateInfo;
 
-typedef struct {
-	VkAttachmentDescriptionFlags flags;
-	VkFormat                     format;
-	VkSampleCountFlagBits        samples;
-	VkAttachmentLoadOp           loadOp;
-	VkAttachmentStoreOp          storeOp;
-	VkAttachmentLoadOp           stencilLoadOp;
-	VkAttachmentStoreOp          stencilStoreOp;
-	VkImageLayout                initialLayout;
-	VkImageLayout                finalLayout;
-} VkAttachmentDescription;
+typedef union {
+	float    float32[4];
+	int32_t  int32[4];
+	uint32_t uint32[4];
+} VkClearColorValue;
 
 typedef struct {
-	uint32_t      attachment;
-	VkImageLayout layout;
-} VkAttachmentReference;
+	float    depth;
+	uint32_t stencil;
+} VkClearDepthStencilValue;
+
+typedef union {
+	VkClearColorValue        color;
+	VkClearDepthStencilValue depthStencil;
+} VkClearValue;
 
 typedef struct {
-	VkStructureType          sType;
-	const void *             pNext;
-	VkFramebufferCreateFlags flags;
-	VkRenderPass             renderPass;
-	uint32_t                 attachmentCount;
-	const VkImageView *      pAttachments;
-	uint32_t                 width;
-	uint32_t                 height;
-	uint32_t                 layers;
-} VkFramebufferCreateInfo;
-
-typedef struct {
-	VkSubpassDescriptionFlags     flags;
-	VkPipelineBindPoint           pipelineBindPoint;
-	uint32_t                      inputAttachmentCount;
-	const VkAttachmentReference * pInputAttachments;
-	uint32_t                      colorAttachmentCount;
-	const VkAttachmentReference * pColorAttachments;
-	const VkAttachmentReference * pResolveAttachments;
-	const VkAttachmentReference * pDepthStencilAttachment;
-	uint32_t                      preserveAttachmentCount;
-	const uint32_t *              pPreserveAttachments;
-} VkSubpassDescription;
-
-typedef struct {
-	uint32_t             srcSubpass;
-	uint32_t             dstSubpass;
-	VkPipelineStageFlags srcStageMask;
-	VkPipelineStageFlags dstStageMask;
-	VkAccessFlags        srcAccessMask;
-	VkAccessFlags        dstAccessMask;
-	VkDependencyFlags    dependencyFlags;
-} VkSubpassDependency;
+	VkStructureType       sType;
+	const void *          pNext;
+	VkImageView           imageView;
+	VkImageLayout         imageLayout;
+	VkResolveModeFlagBits resolveMode;
+	VkImageView           resolveImageView;
+	VkImageLayout         resolveImageLayout;
+	VkAttachmentLoadOp    loadOp;
+	VkAttachmentStoreOp   storeOp;
+	VkClearValue          clearValue;
+} VkRenderingAttachmentInfo;
 
 typedef struct {
-	VkStructureType                 sType;
-	const void *                    pNext;
-	VkRenderPassCreateFlags         flags;
-	uint32_t                        attachmentCount;
-	const VkAttachmentDescription * pAttachments;
-	uint32_t                        subpassCount;
-	const VkSubpassDescription *    pSubpasses;
-	uint32_t                        dependencyCount;
-	const VkSubpassDependency *     pDependencies;
-} VkRenderPassCreateInfo;
+	VkStructureType                   sType;
+	const void *                      pNext;
+	VkRenderingFlags                  flags;
+	VkRect2D                          renderArea;
+	uint32_t                          layerCount;
+	uint32_t                          viewMask;
+	uint32_t                          colorAttachmentCount;
+	const VkRenderingAttachmentInfo * pColorAttachments;
+	const VkRenderingAttachmentInfo * pDepthAttachment;
+	const VkRenderingAttachmentInfo * pStencilAttachment;
+} VkRenderingInfo;
+
+typedef struct {
+	VkStructureType  sType;
+	const void *     pNext;
+	uint32_t         viewMask;
+	uint32_t         colorAttachmentCount;
+	const VkFormat * pColorAttachmentFormats;
+	VkFormat         depthAttachmentFormat;
+	VkFormat         stencilAttachmentFormat;
+} VkPipelineRenderingCreateInfo;
 
 typedef struct {
 	VkStructureType          sType;
@@ -2062,21 +2676,41 @@ typedef struct {
 	const VkCommandBufferInheritanceInfo * pInheritanceInfo;
 } VkCommandBufferBeginInfo;
 
-typedef union {
-	float    float32[4];
-	int32_t  int32[4];
-	uint32_t uint32[4];
-} VkClearColorValue;
+typedef struct {
+	VkStructureType sType;
+	const void *    pNext;
+	VkCommandBuffer commandBuffer;
+	uint32_t        deviceMask;
+} VkCommandBufferSubmitInfo;
 
 typedef struct {
-	float    depth;
-	uint32_t stencil;
-} VkClearDepthStencilValue;
+	VkStructureType                   sType;
+	const void *                      pNext;
+	VkSubmitFlags                     flags;
+	uint32_t                          waitSemaphoreInfoCount;
+	const VkSemaphoreSubmitInfo *     pWaitSemaphoreInfos;
+	uint32_t                          commandBufferInfoCount;
+	const VkCommandBufferSubmitInfo * pCommandBufferInfos;
+	uint32_t                          signalSemaphoreInfoCount;
+	const VkSemaphoreSubmitInfo *     pSignalSemaphoreInfos;
+} VkSubmitInfo2;
 
-typedef union {
-	VkClearColorValue        color;
-	VkClearDepthStencilValue depthStencil;
-} VkClearValue;
+typedef struct {
+	VkStructureType sType;
+	const void *    pNext;
+	VkDeviceSize    srcOffset;
+	VkDeviceSize    dstOffset;
+	VkDeviceSize    size;
+} VkBufferCopy2;
+
+typedef struct {
+	VkStructureType       sType;
+	const void *          pNext;
+	VkBuffer              srcBuffer;
+	VkBuffer              dstBuffer;
+	uint32_t              regionCount;
+	const VkBufferCopy2 * pRegions;
+} VkCopyBufferInfo2;
 
 typedef struct {
 	VkStructureType                                sType;
@@ -2126,25 +2760,44 @@ typedef struct {
 } VkAllocationCallbacks;
 
 typedef struct {
-    VkStructureType sType;
-    const void *    pNext;
-    VkDeviceMemory  memory;
-    VkDeviceSize    offset;
-    VkDeviceSize    size;
+	VkStructureType sType;
+	const void *    pNext;
+	VkDeviceMemory  memory;
+	VkDeviceSize    offset;
+	VkDeviceSize    size;
 } VkMappedMemoryRange;
 
 typedef struct {
 	VkStructureType sType;
 	const void *    pNext;
+	VkBuffer        buffer;
+} VkBufferDeviceAddressInfo;
+
+typedef struct {
+	VkStructureType sType;
+	const void *    pNext;
 	VkDeviceSize    allocationSize;
 	uint32_t        memoryTypeIndex;
 } VkMemoryAllocateInfo;
 
 typedef struct {
-    VkStructureType       sType;
-    const void *          pNext;
-    VkMemoryAllocateFlags flags;
-    uint32_t              deviceMask;
+	VkDeviceSize size;
+	VkDeviceSize alignment;
+	uint32_t     memoryTypeBits;
+} VkMemoryRequirements;
+
+typedef struct {
+	VkStructureType sType;
+	const void *    pNext;
+	VkImage         image;
+	VkBuffer        buffer;
+} VkMemoryDedicatedAllocateInfo;
+
+typedef struct {
+	VkStructureType       sType;
+	const void *          pNext;
+	VkMemoryAllocateFlags flags;
+	uint32_t              deviceMask;
 } VkMemoryAllocateFlagsInfo;
 
 typedef struct {
@@ -2154,17 +2807,23 @@ typedef struct {
 } VkExportMemoryAllocateInfo;
 
 typedef struct {
-    VkStructureType                    sType;
-    const void *                       pNext;
-    VkDeviceMemory                     memory;
-    VkExternalMemoryHandleTypeFlagBits handleType;
+	VkStructureType                 sType;
+	const void *                    pNext;
+	VkExternalMemoryHandleTypeFlags handleTypes;
+} VkExternalMemoryImageCreateInfo;
+
+typedef struct {
+	VkStructureType                    sType;
+	const void *                       pNext;
+	VkDeviceMemory                     memory;
+	VkExternalMemoryHandleTypeFlagBits handleType;
 } VkMemoryGetWin32HandleInfoKHR;
 
 typedef struct {
-    VkStructureType                    sType;
-    const void *                       pNext;
-    VkDeviceMemory                     memory;
-    VkExternalMemoryHandleTypeFlagBits handleType;
+	VkStructureType                    sType;
+	const void *                       pNext;
+	VkDeviceMemory                     memory;
+	VkExternalMemoryHandleTypeFlagBits handleType;
 } VkMemoryGetFdInfoKHR;
 
 typedef struct {
@@ -2193,6 +2852,36 @@ typedef struct {
 	const VkDescriptorSetLayoutBinding * pBindings;
 } VkDescriptorSetLayoutCreateInfo;
 
+typedef enum {
+	VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT                      = 0,
+	VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT = 1,
+	VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT                    = 2,
+	VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT                      = 3,
+	VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT        = 4,
+	VK_VALIDATION_FEATURE_ENABLE_MAX_ENUM_EXT                          = 0x7FFFFFFF
+} VkValidationFeatureEnableEXT;
+
+typedef enum {
+	VK_VALIDATION_FEATURE_DISABLE_ALL_EXT                     = 0,
+	VK_VALIDATION_FEATURE_DISABLE_SHADERS_EXT                 = 1,
+	VK_VALIDATION_FEATURE_DISABLE_THREAD_SAFETY_EXT           = 2,
+	VK_VALIDATION_FEATURE_DISABLE_API_PARAMETERS_EXT          = 3,
+	VK_VALIDATION_FEATURE_DISABLE_OBJECT_LIFETIMES_EXT        = 4,
+	VK_VALIDATION_FEATURE_DISABLE_CORE_CHECKS_EXT             = 5,
+	VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT          = 6,
+	VK_VALIDATION_FEATURE_DISABLE_SHADER_VALIDATION_CACHE_EXT = 7,
+	VK_VALIDATION_FEATURE_DISABLE_MAX_ENUM_EXT                = 0x7FFFFFFF
+} VkValidationFeatureDisableEXT;
+
+typedef struct {
+	VkStructureType                       sType;
+	const void *                          pNext;
+	uint32_t                              enabledValidationFeatureCount;
+	const VkValidationFeatureEnableEXT *  pEnabledValidationFeatures;
+	uint32_t                              disabledValidationFeatureCount;
+	const VkValidationFeatureDisableEXT * pDisabledValidationFeatures;
+} VkValidationFeaturesEXT;
+
 
 /* X(name, ret, params) */
 #define VkLoaderProcedureList \
@@ -2208,6 +2897,8 @@ typedef struct {
 	X(vkEnumerateDeviceExtensionProperties,     VkResult, (VkPhysicalDevice physicalDevice, const char *pLayerName, uint32_t *pPropertyCount, VkExtensionProperties *pProperties)) \
 	X(vkEnumeratePhysicalDevices,               VkResult, (VkInstance instance, uint32_t *pPhysicalDeviceCount, VkPhysicalDevice *pPhysicalDevices)) \
 	X(vkGetDeviceProcAddr,                      void *,   (VkDevice device, const char *pName)) \
+	X(vkGetPhysicalDeviceFeatures2,             void,     (VkPhysicalDevice physicalDevice, VkPhysicalDeviceFeatures2 *pFeatures)) \
+	X(vkGetPhysicalDeviceFormatProperties2,     void,     (VkPhysicalDevice physicalDevice, VkFormat format, VkFormatProperties2 *pFormatProperties)) \
 	X(vkGetPhysicalDeviceMemoryProperties2,     void,     (VkPhysicalDevice physicalDevice, VkPhysicalDeviceMemoryProperties2 *pMemoryProperties)) \
 	X(vkGetPhysicalDeviceProperties2,           void,     (VkPhysicalDevice physicalDevice, VkPhysicalDeviceProperties2 *pProperties)) \
 	X(vkGetPhysicalDeviceQueueFamilyProperties, void,     (VkPhysicalDevice physicalDevice, uint32_t *pQueueFamilyPropertyCount, VkQueueFamilyProperties *pQueueFamilyProperties)) \
@@ -2215,24 +2906,58 @@ typedef struct {
 
 /* X(name, ret, params): one entry per procedure. Entries whose first parameter is a VkDevice come first; the added entries taking a VkCommandBuffer or VkQueue follow. */
 #define VkDeviceProcedureList \
-	X(vkAllocateMemory,             VkResult, (VkDevice device, const VkMemoryAllocateInfo *pAllocateInfo, const VkAllocationCallbacks *pAllocator, VkDeviceMemory *pMemory)) \
-	X(vkCreateComputePipelines,     VkResult, (VkDevice device, VkPipelineCache pipelineCache, uint32_t createInfoCount, const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)) \
-	X(vkCreatePipelineLayout,       VkResult, (VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout)) \
-	X(vkCreateSemaphore,            VkResult, (VkDevice device, const VkSemaphoreCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkSemaphore *pSemaphore)) \
-	X(vkCreateShaderModule,         VkResult, (VkDevice device, const VkShaderModuleCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule)) \
-	X(vkDestroyBuffer,              void,     (VkDevice device, VkBuffer buffer, const VkAllocationCallbacks *pAllocator)) \
-	X(vkDestroyPipeline,            void,     (VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator)) \
-	X(vkDestroyPipelineLayout,      void,     (VkDevice device, VkPipelineLayout pipelineLayout, const VkAllocationCallbacks *pAllocator)) \
-	X(vkDestroyShaderModule,        void,     (VkDevice device, VkShaderModule shaderModule, const VkAllocationCallbacks *pAllocator)) \
-	X(vkFlushMappedMemoryRanges,    VkResult, (VkDevice device, uint32_t memoryRangeCount, const VkMappedMemoryRange *pMemoryRanges)) \
-	X(vkFreeMemory,                 void,     (VkDevice device, VkDeviceMemory memory, const VkAllocationCallbacks *pAllocator)) \
-	X(vkGetDeviceQueue,             void,     (VkDevice device, uint32_t queueFamilyIndex, uint32_t queueIndex, VkQueue *pQueue)) \
-	X(vkGetMemoryFdKHR,             VkResult, (VkDevice device, const VkMemoryGetFdInfoKHR *pGetFdInfo, int *pFd)) \
-	X(vkGetMemoryWin32HandleKHR,    VkResult, (VkDevice device, const VkMemoryGetWin32HandleInfoKHR *pGetWin32HandleInfo, void **pHandle)) \
-	X(vkGetSemaphoreFdKHR,          VkResult, (VkDevice device, const VkSemaphoreGetFdInfoKHR *pGetFdInfo, int *pFd)) \
-	X(vkGetSemaphoreWin32HandleKHR, VkResult, (VkDevice device, const VkSemaphoreGetWin32HandleInfoKHR *pGetWin32HandleInfo, void **pHandle)) \
-	X(vkMapMemory,                  VkResult, (VkDevice device, VkDeviceMemory memory, VkDeviceSize offset, VkDeviceSize size, VkMemoryMapFlags flags, void **ppData)) \
-	X(vkUnmapMemory,                void,     (VkDevice device, VkDeviceMemory memory)) \
+	X(vkAllocateCommandBuffers,        VkResult, (VkDevice device, const VkCommandBufferAllocateInfo *pAllocateInfo, VkCommandBuffer *pCommandBuffers)) \
+	X(vkAllocateMemory,                VkResult, (VkDevice device, const VkMemoryAllocateInfo *pAllocateInfo, const VkAllocationCallbacks *pAllocator, VkDeviceMemory *pMemory)) \
+	X(vkBindBufferMemory,              VkResult, (VkDevice device, VkBuffer buffer, VkDeviceMemory memory, VkDeviceSize memoryOffset)) \
+	X(vkBindImageMemory,               VkResult, (VkDevice device, VkImage image, VkDeviceMemory memory, VkDeviceSize memoryOffset)) \
+	X(vkCreateBuffer,                  VkResult, (VkDevice device, const VkBufferCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkBuffer *pBuffer)) \
+	X(vkCreateCommandPool,             VkResult, (VkDevice device, const VkCommandPoolCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkCommandPool *pCommandPool)) \
+	X(vkCreateComputePipelines,        VkResult, (VkDevice device, VkPipelineCache pipelineCache, uint32_t createInfoCount, const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)) \
+	X(vkCreateGraphicsPipelines,       VkResult, (VkDevice device, VkPipelineCache pipelineCache, uint32_t createInfoCount, const VkGraphicsPipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)) \
+	X(vkCreateImage,                   VkResult, (VkDevice device, const VkImageCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkImage *pImage)) \
+	X(vkCreateImageView,               VkResult, (VkDevice device, const VkImageViewCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkImageView *pView)) \
+	X(vkCreatePipelineLayout,          VkResult, (VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout)) \
+	X(vkCreateQueryPool,               VkResult, (VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)) \
+	X(vkCreateSemaphore,               VkResult, (VkDevice device, const VkSemaphoreCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkSemaphore *pSemaphore)) \
+	X(vkCreateShaderModule,            VkResult, (VkDevice device, const VkShaderModuleCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule)) \
+	X(vkDestroyBuffer,                 void,     (VkDevice device, VkBuffer buffer, const VkAllocationCallbacks *pAllocator)) \
+	X(vkDestroyImage,                  void,     (VkDevice device, VkImage image, const VkAllocationCallbacks *pAllocator)) \
+	X(vkDestroyImageView,              void,     (VkDevice device, VkImageView imageView, const VkAllocationCallbacks *pAllocator)) \
+	X(vkDestroyPipeline,               void,     (VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator)) \
+	X(vkDestroyPipelineLayout,         void,     (VkDevice device, VkPipelineLayout pipelineLayout, const VkAllocationCallbacks *pAllocator)) \
+	X(vkDestroyShaderModule,           void,     (VkDevice device, VkShaderModule shaderModule, const VkAllocationCallbacks *pAllocator)) \
+	X(vkFlushMappedMemoryRanges,       VkResult, (VkDevice device, uint32_t memoryRangeCount, const VkMappedMemoryRange *pMemoryRanges)) \
+	X(vkFreeMemory,                    void,     (VkDevice device, VkDeviceMemory memory, const VkAllocationCallbacks *pAllocator)) \
+	X(vkGetBufferDeviceAddress,        VkDeviceAddress, (VkDevice device, const VkBufferDeviceAddressInfo *pInfo)) \
+	X(vkGetBufferMemoryRequirements,   void,     (VkDevice device, VkBuffer buffer, VkMemoryRequirements *pMemoryRequirements)) \
+	X(vkGetDeviceQueue,                void,     (VkDevice device, uint32_t queueFamilyIndex, uint32_t queueIndex, VkQueue *pQueue)) \
+	X(vkGetImageMemoryRequirements,    void,     (VkDevice device, VkImage image, VkMemoryRequirements *pMemoryRequirements)) \
+	X(vkGetMemoryFdKHR,                VkResult, (VkDevice device, const VkMemoryGetFdInfoKHR *pGetFdInfo, int *pFd)) \
+	X(vkGetMemoryWin32HandleKHR,       VkResult, (VkDevice device, const VkMemoryGetWin32HandleInfoKHR *pGetWin32HandleInfo, void **pHandle)) \
+	X(vkGetQueryPoolResults,           VkResult, (VkDevice device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount, size_t dataSize, void *pData, VkDeviceSize stride, VkQueryResultFlags flags)) \
+	X(vkGetSemaphoreFdKHR,             VkResult, (VkDevice device, const VkSemaphoreGetFdInfoKHR *pGetFdInfo, int *pFd)) \
+	X(vkGetSemaphoreWin32HandleKHR,    VkResult, (VkDevice device, const VkSemaphoreGetWin32HandleInfoKHR *pGetWin32HandleInfo, void **pHandle)) \
+	X(vkInvalidateMappedMemoryRanges,  VkResult, (VkDevice device, uint32_t memoryRangeCount, const VkMappedMemoryRange *pMemoryRanges)) \
+	X(vkMapMemory,                     VkResult, (VkDevice device, VkDeviceMemory memory, VkDeviceSize offset, VkDeviceSize size, VkMemoryMapFlags flags, void **ppData)) \
+	X(vkSignalSemaphore,               VkResult, (VkDevice device, const VkSemaphoreSignalInfo *pSignalInfo)) \
+	X(vkUnmapMemory,                   void,     (VkDevice device, VkDeviceMemory memory)) \
+	X(vkWaitSemaphores,                VkResult, (VkDevice device, const VkSemaphoreWaitInfo *pWaitInfo, uint64_t timeout)) \
+	X(vkBeginCommandBuffer,            VkResult, (VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)) \
+	X(vkCmdBeginRendering,             void,     (VkCommandBuffer commandBuffer, const VkRenderingInfo *pRenderingInfo)) \
+	X(vkCmdBindIndexBuffer2,           void,     (VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, VkIndexType indexType)) \
+	X(vkCmdBindPipeline,               void,     (VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline pipeline)) \
+	X(vkCmdCopyBuffer2,                void,     (VkCommandBuffer commandBuffer, const VkCopyBufferInfo2 *pCopyBufferInfo)) \
+	X(vkCmdDispatch,                   void,     (VkCommandBuffer commandBuffer, uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)) \
+	X(vkCmdDrawIndexed,                void,     (VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance)) \
+	X(vkCmdEndRendering,               void,     (VkCommandBuffer commandBuffer)) \
+	X(vkCmdPipelineBarrier2,           void,     (VkCommandBuffer commandBuffer, const VkDependencyInfo *pDependencyInfo)) \
+	X(vkCmdPushConstants,              void,     (VkCommandBuffer commandBuffer, VkPipelineLayout layout, VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size, const void *pValues)) \
+	X(vkCmdResetQueryPool,             void,     (VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)) \
+	X(vkCmdSetScissor,                 void,     (VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount, const VkRect2D *pScissors)) \
+	X(vkCmdSetViewport,                void,     (VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount, const VkViewport *pViewports)) \
+	X(vkCmdWriteTimestamp2,            void,     (VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage, VkQueryPool queryPool, uint32_t query)) \
+	X(vkEndCommandBuffer,              VkResult, (VkCommandBuffer commandBuffer)) \
+	X(vkQueueSubmit2,                  VkResult, (VkQueue queue, uint32_t submitCount, const VkSubmitInfo2 *pSubmits, VkFence fence)) \
 
 
 #define X(name, ret, params) typedef ret name##_fn params; /* for each list entry, declares the function type `<name>_fn` returning `ret` and taking `params` */

M	beamformer.c	\|	338	++++++++++++++++++-------------------------------------------------------------
M	beamformer.h	\|	1	+
M	beamformer.meta	\|	124	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M	beamformer_core.c	\|	1620	++++++++++++++++++++++++++++++++++++++++---------------------------------------
M	beamformer_internal.h	\|	383	+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
M	beamformer_parameters.h	\|	6	++++--
M	beamformer_shared_memory.c	\|	9	++++-----
M	build.c	\|	204	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
D	external/include/raylib_extended.h	\|	2	--
D	external/rcore_extended.c	\|	8	--------
M	generated/beamformer.meta.c	\|	316	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M	lib/ogl_beamformer_lib.c	\|	15	+++++++++++++--
M	lib/ogl_beamformer_lib_base.h	\|	4	++++
M	main_linux.c	\|	11	+----------
M	main_w32.c	\|	11	+----------
M	math.c	\|	20	+++-----------------
M	opengl.h	\|	131	++++++++++++-------------------------------------------------------------------
A	shaders/buffer_clear.glsl	\|	11	+++++++++++
A	shaders/coherency_weighting.glsl	\|	41	+++++++++++++++++++++++++++++++++++++++++
M	shaders/das.glsl	\|	161	++++++++++++++++++++++++++++++++++++++++++-------------------------------------
M	shaders/decode.glsl	\|	108	++++++++++++++++++++++++++++++++++---------------------------------------------
M	shaders/filter.glsl	\|	24	+++++++++++++-----------
M	shaders/render_3d.frag.glsl	\|	60	++++++++++++++++++++++++++++++++++++++++++++----------------
A	shaders/render_3d.vert.glsl	\|	19	+++++++++++++++++++
M	ui.c	\|	528	+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
M	util.c	\|	8	--------
M	util.h	\|	16	+++++++++-------
D	util_gl.c	\|	69	---------------------------------------------------------------------
M	util_os.c	\|	18	++++++++++++++++++
M	vulkan.c	\|	1838	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
M	vulkan.h	\|	1179	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------