Commit: 30238107bb1ebd803a7fedfa4369bbc89d4a566e
Parent: e3fdaace41c864d779e9c5ebc7405afe43bc4734
Author: Randy Palamar
Date: Thu, 12 Mar 2026 07:07:31 -0600
buffer_clear: clear 16 bytes at a time
There isn't much reason to go wider than this since it's not
commonly supported. On AMD, scalar instructions can go wider, but
vector instructions are limited to 16 bytes.
Diffstat:
4 files changed, 20 insertions(+), 19 deletions(-)
diff --git a/beamformer.meta b/beamformer.meta
@@ -450,9 +450,9 @@
{
@PushConstants
{
- [data U64]
- [clear_word U32]
- [words U32]
+ [clear_v4 UV4]
+ [data U64]
+ [bins U32]
}
}
}
diff --git a/beamformer_core.c b/beamformer_core.c
@@ -1137,21 +1137,22 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena)
GPUBuffer *backlog = cs->backlog.buffer;
u32 subgroup_size = vk_gpu_info()->subgroup_size;
BeamformerBufferClearPushConstants pc = {
- .data = backlog->gpu_pointer + frame->buffer_offset,
- .clear_word = 0,
- .words = beamformer_frame_byte_size(frame->points, frame->data_kind) / sizeof(u32),
+ .data = backlog->gpu_pointer + frame->buffer_offset,
+ .clear_v4 = (uv4){{0}},
+ .bins = beamformer_frame_byte_size(frame->points, frame->data_kind) / sizeof(uv4),
};
u32 index = BeamformerShaderKind_BufferClear - BeamformerShaderKind_ComputeInternalFirst;
vk_command_bind_pipeline(cmd, cs->compute_internal_pipelines[index]);
vk_command_push_constants(cmd, 0, sizeof(pc), &pc);
- vk_command_dispatch_compute(cmd, (uv3){{(u32)ceil_f32((f32)pc.words / subgroup_size), 1, 1}});
+ vk_command_dispatch_compute(cmd, (uv3){{(u32)ceil_f32((f32)pc.bins / subgroup_size), 1, 1}});
if (das_coherent) {
- pc.words = pc.words / beamformer_data_kind_element_count[frame->data_kind];
- pc.data = backlog->gpu_pointer + backlog->size - sizeof(u32) * pc.words;
+ assert((pc.bins % beamformer_data_kind_element_count[frame->data_kind]) == 0);
+ pc.bins = pc.bins / beamformer_data_kind_element_count[frame->data_kind];
+ pc.data = backlog->gpu_pointer + backlog->size - sizeof(uv4) * pc.bins;
vk_command_push_constants(cmd, 0, sizeof(pc), &pc);
- vk_command_dispatch_compute(cmd, (uv3){{(u32)ceil_f32((f32)pc.words / subgroup_size), 1, 1}});
+ vk_command_dispatch_compute(cmd, (uv3){{(u32)ceil_f32((f32)pc.bins / subgroup_size), 1, 1}});
}
}
diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c
@@ -229,9 +229,9 @@ typedef struct {
} BeamformerCoherencyWeightingPushConstants;
typedef struct {
+ uv4 clear_v4;
u64 data;
- u32 clear_word;
- u32 words;
+ u32 bins;
} BeamformerBufferClearPushConstants;
typedef struct {
@@ -669,9 +669,9 @@ read_only global s8 beamformer_shader_global_header_strings[] = {
"\n"),
s8_comp(""
"layout(push_constant, std430) uniform PushConstants {\n"
+ " u32vec4 clear_v4;\n"
" uint64_t data;\n"
- " uint32_t clear_word;\n"
- " uint32_t words;\n"
+ " uint32_t bins;\n"
"};\n"
"\n"),
s8_comp(""
diff --git a/shaders/buffer_clear.glsl b/shaders/buffer_clear.glsl
@@ -1,11 +1,11 @@
/* See LICENSE for license details. */
-layout(std430, buffer_reference, buffer_reference_align = 8) restrict writeonly buffer Buffer {
- uint32_t values[];
+layout(std430, buffer_reference, buffer_reference_align = 32) restrict writeonly buffer Buffer {
+ u32vec4 x[];
};
void main()
{
- uint32_t word = gl_GlobalInvocationID.x;
- if (word < words)
- Buffer(data).values[word] = clear_word;
+ u32 index = gl_GlobalInvocationID.x;
+ if (index < bins)
+ Buffer(data).x[index] = clear_v4;
}