math.c - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

math.c (26596B)
      1 /* See LICENSE for license details. */
      2 #include "external/cephes.c"
      3 
      4 function void
      5 fill_kronecker_sub_matrix_f16(f16 *out, i32 out_stride, f16 scale, f16 *b, iv2 b_dim)
      6 {
      7 	for (i32 i = 0; i < b_dim.y; i++) {
      8 		for (i32 j = 0; j < b_dim.x; j += 4, b += 4) {
      9 			out[j + 0] = scale * b[0];
     10 			out[j + 1] = scale * b[1];
     11 			out[j + 2] = scale * b[2];
     12 			out[j + 3] = scale * b[3];
     13 		}
     14 		out += out_stride;
     15 	}
     16 }
     17 
     18 /* NOTE: this won't check for valid space/etc and assumes row major order */
     19 function void
     20 kronecker_product_f16(f16 *out, f16 *a, iv2 a_dim, f16 *b, iv2 b_dim)
     21 {
     22 	iv2 out_dim = {{a_dim.x * b_dim.x, a_dim.y * b_dim.y}};
     23 	assert(out_dim.y % 4 == 0);
     24 	for (i32 i = 0; i < a_dim.y; i++) {
     25 		f16 *vout = out;
     26 		for (i32 j = 0; j < a_dim.x; j++, a++) {
     27 			fill_kronecker_sub_matrix_f16(vout, out_dim.y, *a, b, b_dim);
     28 			vout += b_dim.y;
     29 		}
     30 		out += out_dim.y * b_dim.x;
     31 	}
     32 }
     33 
     34 /* NOTE/TODO: to support even more hadamard sizes use the Paley construction */
     35 function f16 *
     36 make_hadamard_transpose(Arena *a, i32 dim, b32 row_major)
     37 {
     38 	read_only local_persist	f16 hadamard_12_12_transpose[] = {
     39 		1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     40 		1, -1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1,
     41 		1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1, -1,
     42 		1, -1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,
     43 		1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1,  1,
     44 		1,  1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1,
     45 		1,  1,  1,  1, -1,  1, -1, -1,  1, -1, -1, -1,
     46 		1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1, -1,
     47 		1, -1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,
     48 		1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1,  1,
     49 		1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1,
     50 		1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1,
     51 	};
     52 
     53 	read_only local_persist f16 hadamard_20_20_transpose[] = {
     54 		1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     55 		1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1,
     56 		1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,
     57 		1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1, -1,
     58 		1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1,
     59 		1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,
     60 		1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1,
     61 		1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1,
     62 		1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,
     63 		1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,
     64 		1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1,
     65 		1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,
     66 		1, -1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,
     67 		1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,
     68 		1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,
     69 		1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
     70 		1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,
     71 		1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1,
     72 		1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1,
     73 		1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,
     74 	};
     75 
     76 
     77 	f16 *result = 0;
     78 
     79 	i32 order          = dim;
     80 	b32 power_of_2     = IsPowerOfTwo(dim);
     81 	b32 multiple_of_12 = dim % 12 == 0;
     82 	b32 multiple_of_20 = dim % 20 == 0;
     83 	i64 elements       = dim * dim;
     84 
     85 	i32 base_dim = 0;
     86 	if (power_of_2) {
     87 		base_dim  = dim;
     88 	} else if (multiple_of_20 && IsPowerOfTwo(dim / 20)) {
     89 		base_dim  = 20;
     90 		dim      /= 20;
     91 	} else if (multiple_of_12 && IsPowerOfTwo(dim / 12)) {
     92 		base_dim  = 12;
     93 		dim      /= 12;
     94 	}
     95 
     96 	if (power_of_2 && base_dim && arena_capacity(a, f16) >= elements * (1 + (dim != base_dim))) {
     97 		result = push_array(a, f16, elements);
     98 
     99 		Arena tmp = *a;
    100 		f16 *m = dim == base_dim ? result : push_array(&tmp, f16, elements);
    101 
    102 		#define IND(i, j) ((i) * dim + (j))
    103 		m[0] = 1;
    104 		for (i32 k = 1; k < dim; k *= 2) {
    105 			for (i32 i = 0; i < k; i++) {
    106 				for (i32 j = 0; j < k; j++) {
    107 					f16 val = m[IND(i, j)];
    108 					m[IND(i + k, j)]     =  val;
    109 					m[IND(i, j + k)]     =  val;
    110 					m[IND(i + k, j + k)] = -val;
    111 				}
    112 			}
    113 		}
    114 		#undef IND
    115 
    116 		f16 *m2 = 0;
    117 		iv2 m2_dim;
    118 		switch (base_dim) {
    119 		case 12:{ m2 = hadamard_12_12_transpose; m2_dim = (iv2){{12, 12}}; }break;
    120 		case 20:{ m2 = hadamard_20_20_transpose; m2_dim = (iv2){{20, 20}}; }break;
    121 		}
    122 		if (m2) kronecker_product_f16(result, m, (iv2){{dim, dim}}, m2, m2_dim);
    123 	}
    124 
    125 	if (row_major) {
    126 		for (i32 r = 0; r < order; r++)
    127 			for (i32 c = 0; c < order; c++)
    128 				swap(result[r * order + c], result[c * order + r]);
    129 	}
    130 
    131 	return result;
    132 }
    133 
    134 function b32
    135 u128_equal(u128 a, u128 b)
    136 {
    137 	b32 result = a.U64[0] == b.U64[0] && a.U64[1] == b.U64[1];
    138 	return result;
    139 }
    140 
    141 function RangeU64
    142 subrange_n_from_n_m_count(u64 n, u64 n_count, u64 m)
    143 {
    144 	assert(n < n_count);
    145 
    146 	u64 per_lane            = m / n_count;
    147 	u64 leftover            = m - per_lane * n_count;
    148 	u64 leftovers_before_n  = Min(leftover, n);
    149 	u64 base_index          = n * per_lane + leftovers_before_n;
    150 	u64 one_past_last_index = base_index + per_lane + ((n < leftover) ? 1 : 0);
    151 
    152 	RangeU64 result = {base_index, one_past_last_index};
    153 	return result;
    154 }
    155 
    156 function i32
    157 iv3_dimension(iv3 points)
    158 {
    159 	i32 result = (points.x > 1) + (points.y > 1) + (points.z > 1);
    160 	return result;
    161 }
    162 
    163 function bv3
    164 iv3_equal(iv3 a, iv3 b)
    165 {
    166 	bv3 result;
    167 	result.x = a.x == b.x;
    168 	result.y = a.y == b.y;
    169 	result.z = a.z == b.z;
    170 	return result;
    171 }
    172 
    173 function b32
    174 bv3_all(bv3 a)
    175 {
    176 	b32 result = a.x != 0 && a.y != 0 && a.z != 0;
    177 	return result;
    178 }
    179 
    180 function b32
    181 bv3_any(bv3 a)
    182 {
    183 	b32 result = a.x != 0 || a.y != 0 || a.z != 0;
    184 	return result;
    185 }
    186 
    187 function v2
    188 clamp_v2_rect(v2 v, Rect r)
    189 {
    190 	v2 result = v;
    191 	result.x = Clamp(v.x, r.pos.x, r.pos.x + r.size.x);
    192 	result.y = Clamp(v.y, r.pos.y, r.pos.y + r.size.y);
    193 	return result;
    194 }
    195 
    196 function v2
    197 v2_from_iv2(iv2 v)
    198 {
    199 	v2 result;
    200 	result.E[0] = (f32)v.E[0];
    201 	result.E[1] = (f32)v.E[1];
    202 	return result;
    203 }
    204 
    205 function v2
    206 v2_abs(v2 a)
    207 {
    208 	v2 result;
    209 	result.x = Abs(a.x);
    210 	result.y = Abs(a.y);
    211 	return result;
    212 }
    213 
    214 function v2
    215 v2_scale(v2 a, f32 scale)
    216 {
    217 	v2 result;
    218 	result.x = a.x * scale;
    219 	result.y = a.y * scale;
    220 	return result;
    221 }
    222 
    223 function v2
    224 v2_add(v2 a, v2 b)
    225 {
    226 	v2 result;
    227 	result.x = a.x + b.x;
    228 	result.y = a.y + b.y;
    229 	return result;
    230 }
    231 
    232 function v2
    233 v2_sub(v2 a, v2 b)
    234 {
    235 	v2 result = v2_add(a, v2_scale(b, -1.0f));
    236 	return result;
    237 }
    238 
    239 function v2
    240 v2_mul(v2 a, v2 b)
    241 {
    242 	v2 result;
    243 	result.x = a.x * b.x;
    244 	result.y = a.y * b.y;
    245 	return result;
    246 }
    247 
    248 function v2
    249 v2_div(v2 a, v2 b)
    250 {
    251 	v2 result;
    252 	result.x = a.x / b.x;
    253 	result.y = a.y / b.y;
    254 	return result;
    255 }
    256 
    257 function v2
    258 v2_floor(v2 a)
    259 {
    260 	v2 result;
    261 	result.x = (f32)((i32)a.x);
    262 	result.y = (f32)((i32)a.y);
    263 	return result;
    264 }
    265 
    266 function f32
    267 v2_magnitude_squared(v2 a)
    268 {
    269 	f32 result = a.x * a.x + a.y * a.y;
    270 	return result;
    271 }
    272 
    273 function f32
    274 v2_magnitude(v2 a)
    275 {
    276 	f32 result = sqrt_f32(a.x * a.x + a.y * a.y);
    277 	return result;
    278 }
    279 
    280 function v3
    281 cross(v3 a, v3 b)
    282 {
    283 	v3 result;
    284 	result.x = a.y * b.z - a.z * b.y;
    285 	result.y = a.z * b.x - a.x * b.z;
    286 	result.z = a.x * b.y - a.y * b.x;
    287 	return result;
    288 }
    289 
    290 function v3
    291 v3_from_iv3(iv3 v)
    292 {
    293 	v3 result;
    294 	result.E[0] = (f32)v.E[0];
    295 	result.E[1] = (f32)v.E[1];
    296 	result.E[2] = (f32)v.E[2];
    297 	return result;
    298 }
    299 
    300 function v3
    301 v3_abs(v3 a)
    302 {
    303 	v3 result;
    304 	result.x = Abs(a.x);
    305 	result.y = Abs(a.y);
    306 	result.z = Abs(a.z);
    307 	return result;
    308 }
    309 
    310 function v3
    311 v3_scale(v3 a, f32 scale)
    312 {
    313 	v3 result;
    314 	result.x = scale * a.x;
    315 	result.y = scale * a.y;
    316 	result.z = scale * a.z;
    317 	return result;
    318 }
    319 
    320 function v3
    321 v3_add(v3 a, v3 b)
    322 {
    323 	v3 result;
    324 	result.x = a.x + b.x;
    325 	result.y = a.y + b.y;
    326 	result.z = a.z + b.z;
    327 	return result;
    328 }
    329 
    330 function v3
    331 v3_sub(v3 a, v3 b)
    332 {
    333 	v3 result = v3_add(a, v3_scale(b, -1.0f));
    334 	return result;
    335 }
    336 
    337 function v3
    338 v3_div(v3 a, v3 b)
    339 {
    340 	v3 result;
    341 	result.x = a.x / b.x;
    342 	result.y = a.y / b.y;
    343 	result.z = a.z / b.z;
    344 	return result;
    345 }
    346 
    347 function f32
    348 v3_dot(v3 a, v3 b)
    349 {
    350 	f32 result = a.x * b.x + a.y * b.y + a.z * b.z;
    351 	return result;
    352 }
    353 
    354 function f32
    355 v3_magnitude_squared(v3 a)
    356 {
    357 	f32 result = v3_dot(a, a);
    358 	return result;
    359 }
    360 
    361 function f32
    362 v3_magnitude(v3 a)
    363 {
    364 	f32 result = sqrt_f32(v3_dot(a, a));
    365 	return result;
    366 }
    367 
    368 function v3
    369 v3_normalize(v3 a)
    370 {
    371 	v3 result = v3_scale(a, 1.0f / v3_magnitude(a));
    372 	return result;
    373 }
    374 
    375 function v4
    376 v4_scale(v4 a, f32 scale)
    377 {
    378 	v4 result;
    379 	result.x = scale * a.x;
    380 	result.y = scale * a.y;
    381 	result.z = scale * a.z;
    382 	result.w = scale * a.w;
    383 	return result;
    384 }
    385 
    386 function v4
    387 v4_add(v4 a, v4 b)
    388 {
    389 	v4 result;
    390 	result.x = a.x + b.x;
    391 	result.y = a.y + b.y;
    392 	result.z = a.z + b.z;
    393 	result.w = a.w + b.w;
    394 	return result;
    395 }
    396 
    397 function v4
    398 v4_sub(v4 a, v4 b)
    399 {
    400 	v4 result = v4_add(a, v4_scale(b, -1));
    401 	return result;
    402 }
    403 
    404 function f32
    405 v4_dot(v4 a, v4 b)
    406 {
    407 	f32 result = a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
    408 	return result;
    409 }
    410 
    411 function v4
    412 v4_lerp(v4 a, v4 b, f32 t)
    413 {
    414 	v4 result = v4_add(a, v4_scale(v4_sub(b, a), t));
    415 	return result;
    416 }
    417 
    418 function b32
    419 m4_equal(m4 a, m4 b)
    420 {
    421 	b32 result = 1;
    422 	for EachElement(a.E, it)
    423 		result &= f32_equal(a.E[it], b.E[it]);
    424 	return result;
    425 }
    426 
    427 #define m4_identity() \
    428 	(m4){.E = { \
    429 		1, 0, 0, 0, \
    430 		0, 1, 0, 0, \
    431 		0, 0, 1, 0, \
    432 		0, 0, 0, 1, \
    433 	}}
    434 
    435 function v4
    436 m4_row(m4 a, u32 row)
    437 {
    438 	v4 result;
    439 	result.E[0] = a.c[0].E[row];
    440 	result.E[1] = a.c[1].E[row];
    441 	result.E[2] = a.c[2].E[row];
    442 	result.E[3] = a.c[3].E[row];
    443 	return result;
    444 }
    445 
    446 function m4
    447 m4_mul(m4 a, m4 b)
    448 {
    449 	m4 result;
    450 	for (u32 i = 0; i < 4; i++) {
    451 		for (u32 j = 0; j < 4; j++) {
    452 			result.c[i].E[j] = v4_dot(m4_row(a, j), b.c[i]);
    453 		}
    454 	}
    455 	return result;
    456 }
    457 
    458 /* NOTE(rnp): based on:
    459  * https://web.archive.org/web/20131215123403/ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
    460  * TODO(rnp): redo with SIMD as given in the link (but need to rewrite for column-major)
    461  */
    462 function m4
    463 m4_inverse(m4 m)
    464 {
    465 	m4 result;
    466 	result.E[ 0] =  m.E[5] * m.E[10] * m.E[15] - m.E[5] * m.E[11] * m.E[14] - m.E[9] * m.E[6] * m.E[15] + m.E[9] * m.E[7] * m.E[14] + m.E[13] * m.E[6] * m.E[11] - m.E[13] * m.E[7] * m.E[10];
    467 	result.E[ 4] = -m.E[4] * m.E[10] * m.E[15] + m.E[4] * m.E[11] * m.E[14] + m.E[8] * m.E[6] * m.E[15] - m.E[8] * m.E[7] * m.E[14] - m.E[12] * m.E[6] * m.E[11] + m.E[12] * m.E[7] * m.E[10];
    468 	result.E[ 8] =  m.E[4] * m.E[ 9] * m.E[15] - m.E[4] * m.E[11] * m.E[13] - m.E[8] * m.E[5] * m.E[15] + m.E[8] * m.E[7] * m.E[13] + m.E[12] * m.E[5] * m.E[11] - m.E[12] * m.E[7] * m.E[ 9];
    469 	result.E[12] = -m.E[4] * m.E[ 9] * m.E[14] + m.E[4] * m.E[10] * m.E[13] + m.E[8] * m.E[5] * m.E[14] - m.E[8] * m.E[6] * m.E[13] - m.E[12] * m.E[5] * m.E[10] + m.E[12] * m.E[6] * m.E[ 9];
    470 	result.E[ 1] = -m.E[1] * m.E[10] * m.E[15] + m.E[1] * m.E[11] * m.E[14] + m.E[9] * m.E[2] * m.E[15] - m.E[9] * m.E[3] * m.E[14] - m.E[13] * m.E[2] * m.E[11] + m.E[13] * m.E[3] * m.E[10];
    471 	result.E[ 5] =  m.E[0] * m.E[10] * m.E[15] - m.E[0] * m.E[11] * m.E[14] - m.E[8] * m.E[2] * m.E[15] + m.E[8] * m.E[3] * m.E[14] + m.E[12] * m.E[2] * m.E[11] - m.E[12] * m.E[3] * m.E[10];
    472 	result.E[ 9] = -m.E[0] * m.E[ 9] * m.E[15] + m.E[0] * m.E[11] * m.E[13] + m.E[8] * m.E[1] * m.E[15] - m.E[8] * m.E[3] * m.E[13] - m.E[12] * m.E[1] * m.E[11] + m.E[12] * m.E[3] * m.E[ 9];
    473 	result.E[13] =  m.E[0] * m.E[ 9] * m.E[14] - m.E[0] * m.E[10] * m.E[13] - m.E[8] * m.E[1] * m.E[14] + m.E[8] * m.E[2] * m.E[13] + m.E[12] * m.E[1] * m.E[10] - m.E[12] * m.E[2] * m.E[ 9];
    474 	result.E[ 2] =  m.E[1] * m.E[ 6] * m.E[15] - m.E[1] * m.E[ 7] * m.E[14] - m.E[5] * m.E[2] * m.E[15] + m.E[5] * m.E[3] * m.E[14] + m.E[13] * m.E[2] * m.E[ 7] - m.E[13] * m.E[3] * m.E[ 6];
    475 	result.E[ 6] = -m.E[0] * m.E[ 6] * m.E[15] + m.E[0] * m.E[ 7] * m.E[14] + m.E[4] * m.E[2] * m.E[15] - m.E[4] * m.E[3] * m.E[14] - m.E[12] * m.E[2] * m.E[ 7] + m.E[12] * m.E[3] * m.E[ 6];
    476 	result.E[10] =  m.E[0] * m.E[ 5] * m.E[15] - m.E[0] * m.E[ 7] * m.E[13] - m.E[4] * m.E[1] * m.E[15] + m.E[4] * m.E[3] * m.E[13] + m.E[12] * m.E[1] * m.E[ 7] - m.E[12] * m.E[3] * m.E[ 5];
    477 	result.E[14] = -m.E[0] * m.E[ 5] * m.E[14] + m.E[0] * m.E[ 6] * m.E[13] + m.E[4] * m.E[1] * m.E[14] - m.E[4] * m.E[2] * m.E[13] - m.E[12] * m.E[1] * m.E[ 6] + m.E[12] * m.E[2] * m.E[ 5];
    478 	result.E[ 3] = -m.E[1] * m.E[ 6] * m.E[11] + m.E[1] * m.E[ 7] * m.E[10] + m.E[5] * m.E[2] * m.E[11] - m.E[5] * m.E[3] * m.E[10] - m.E[ 9] * m.E[2] * m.E[ 7] + m.E[ 9] * m.E[3] * m.E[ 6];
    479 	result.E[ 7] =  m.E[0] * m.E[ 6] * m.E[11] - m.E[0] * m.E[ 7] * m.E[10] - m.E[4] * m.E[2] * m.E[11] + m.E[4] * m.E[3] * m.E[10] + m.E[ 8] * m.E[2] * m.E[ 7] - m.E[ 8] * m.E[3] * m.E[ 6];
    480 	result.E[11] = -m.E[0] * m.E[ 5] * m.E[11] + m.E[0] * m.E[ 7] * m.E[ 9] + m.E[4] * m.E[1] * m.E[11] - m.E[4] * m.E[3] * m.E[ 9] - m.E[ 8] * m.E[1] * m.E[ 7] + m.E[ 8] * m.E[3] * m.E[ 5];
    481 	result.E[15] =  m.E[0] * m.E[ 5] * m.E[10] - m.E[0] * m.E[ 6] * m.E[ 9] - m.E[4] * m.E[1] * m.E[10] + m.E[4] * m.E[2] * m.E[ 9] + m.E[ 8] * m.E[1] * m.E[ 6] - m.E[ 8] * m.E[2] * m.E[ 5];
    482 
    483 	f32 determinant = m.E[0] * result.E[0] + m.E[1] * result.E[4] + m.E[2] * result.E[8] + m.E[3] * result.E[12];
    484 	determinant = 1.0f / determinant;
    485 	for(i32 i = 0; i < 16; i++)
    486 		result.E[i] *= determinant;
    487 	return result;
    488 }
    489 
    490 function m4
    491 m4_translation(v3 delta)
    492 {
    493 	m4 result;
    494 	result.c[0] = (v4){{1, 0, 0, 0}};
    495 	result.c[1] = (v4){{0, 1, 0, 0}};
    496 	result.c[2] = (v4){{0, 0, 1, 0}};
    497 	result.c[3] = (v4){{delta.x, delta.y, delta.z, 1}};
    498 	return result;
    499 }
    500 
    501 function m4
    502 m4_scale(v3 scale)
    503 {
    504 	m4 result;
    505 	result.c[0] = (v4){{scale.x, 0,       0,       0}};
    506 	result.c[1] = (v4){{0,       scale.y, 0,       0}};
    507 	result.c[2] = (v4){{0,       0,       scale.z, 0}};
    508 	result.c[3] = (v4){{0,       0,       0,       1}};
    509 	return result;
    510 }
    511 
    512 function m4
    513 m4_rotation_about_axis(v3 axis, f32 turns)
    514 {
    515 	assert(f32_equal(v3_magnitude_squared(axis), 1.0f));
    516 	f32 sa  = sin_f32(turns * 2 * PI);
    517 	f32 ca  = cos_f32(turns * 2 * PI);
    518 	f32 mca = 1.0f - ca;
    519 
    520 	f32 x = axis.x, x2 = x * x;
    521 	f32 y = axis.y, y2 = y * y;
    522 	f32 z = axis.z, z2 = z * z;
    523 
    524 	m4 result;
    525 	result.c[0] = (v4){{ca + mca * x2,        mca * x * y - sa * z, mca * x * z + sa * y, 0}};
    526 	result.c[1] = (v4){{mca * x * y + sa * z, ca + mca * y2,        mca * y * z - sa * x, 0}};
    527 	result.c[2] = (v4){{mca * x * z - sa * y, mca * y * z + sa * x, ca + mca * z2,        0}};
    528 	result.c[3] = (v4){{0, 0, 0, 1}};
    529 	return result;
    530 }
    531 
    532 function m4
    533 m4_rotation_about_y(f32 turns)
    534 {
    535 	m4 result = m4_rotation_about_axis((v3){.y = 1.0f}, turns);
    536 	return result;
    537 }
    538 
    539 function m4
    540 y_aligned_volume_transform(v3 extent, v3 translation, f32 rotation_turns)
    541 {
    542 	m4 T = m4_translation(translation);
    543 	m4 R = m4_rotation_about_axis((v3){.y = 1.0f}, rotation_turns);
    544 	m4 S = m4_scale(extent);
    545 	m4 result = m4_mul(T, m4_mul(R, S));
    546 	return result;
    547 }
    548 
    549 function v4
    550 m4_mul_v4(m4 a, v4 v)
    551 {
    552 	v4 result;
    553 	result.x = v4_dot(m4_row(a, 0), v);
    554 	result.y = v4_dot(m4_row(a, 1), v);
    555 	result.z = v4_dot(m4_row(a, 2), v);
    556 	result.w = v4_dot(m4_row(a, 3), v);
    557 	return result;
    558 }
    559 
    560 function v3
    561 m4_mul_v3(m4 a, v3 v)
    562 {
    563 	v3 result = m4_mul_v4(a, (v4){{v.x, v.y, v.z, 1.0f}}).xyz;
    564 	return result;
    565 }
    566 
    567 function v2
    568 rect_uv(v2 p, Rect r)
    569 {
    570 	v2 result = v2_div(v2_sub(p, r.pos), r.size);
    571 	return result;
    572 }
    573 
    574 function v2
    575 rect_uv_ndc(v2 p, Rect r)
    576 {
    577 	v2 uv     = rect_uv(p, r);
    578 	v2 result = v2_sub(v2_scale(uv, 2.f), (v2){{1.f, 1.f}});
    579 	return result;
    580 }
    581 
    582 function Rect
    583 rect_intersect(Rect a, Rect b)
    584 {
    585 	v2 ae = v2_add(a.pos, a.size);
    586 	v2 be = v2_add(b.pos, b.size);
    587 
    588 	Rect result   = {0};
    589 	result.pos.x  = Max(a.pos.x, b.pos.x);
    590 	result.pos.y  = Max(a.pos.y, b.pos.y);
    591 	result.size.x = Min(ae.x, be.x) - result.pos.x;
    592 	result.size.y = Min(ae.y, be.y) - result.pos.y;
    593 	return result;
    594 }
    595 
    596 function Rect
    597 rect_squish_centered(Rect a, v2 pct)
    598 {
    599 	v2 delta_size = v2_mul(a.size, pct);
    600 	Rect result;
    601 	result.pos  = v2_add(a.pos,  v2_scale(delta_size, 0.5f));
    602 	result.size = v2_add(a.size, v2_scale(delta_size, -1.f));
    603 	return result;
    604 }
    605 
    606 function Rect
    607 rect_shrink_centered(Rect a, v2 px)
    608 {
    609 	Rect result;
    610 	result.pos  = v2_add(a.pos,  v2_scale(px, 0.5f));
    611 	result.size = v2_add(a.size, v2_scale(px, -1.f));
    612 	return result;
    613 }
    614 
    615 function m4
    616 orthographic_projection(f32 n, f32 f, f32 t, f32 r)
    617 {
    618 	m4 result;
    619 	f32 a = -2 / (f - n);
    620 	f32 b = - (f + n) / (f - n);
    621 	result.c[0] = (v4){{1 / r, 0,     0,  0}};
    622 	result.c[1] = (v4){{0,     1 / t, 0,  0}};
    623 	result.c[2] = (v4){{0,     0,     a,  0}};
    624 	result.c[3] = (v4){{0,     0,     b,  1}};
    625 	return result;
    626 }
    627 
    628 function m4
    629 perspective_projection(f32 n, f32 f, f32 fov, f32 aspect)
    630 {
    631 	m4 result;
    632 	f32 t = n * tan_f32(fov / 2.0f);
    633 	f32 r = t * aspect;
    634 	f32 a = -(f + n) / (f - n);
    635 	f32 b = -2 * f * n / (f - n);
    636 	result.c[0] = (v4){{n / r, 0,     0,  0}};
    637 	result.c[1] = (v4){{0,     n / t, 0,  0}};
    638 	result.c[2] = (v4){{0,     0,     a, -1}};
    639 	result.c[3] = (v4){{0,     0,     b,  0}};
    640 	return result;
    641 }
    642 
    643 function m4
    644 camera_look_at(v3 camera, v3 point)
    645 {
    646 	v3 orthogonal = {{0, 1.0f, 0}};
    647 	v3 normal     = v3_normalize(v3_sub(camera, point));
    648 	v3 right      = cross(orthogonal, normal);
    649 	v3 up         = cross(normal,     right);
    650 
    651 	v3 translate;
    652 	camera      = v3_sub((v3){0}, camera);
    653 	translate.x = v3_dot(camera, right);
    654 	translate.y = v3_dot(camera, up);
    655 	translate.z = v3_dot(camera, normal);
    656 
    657 	m4 result;
    658 	result.c[0] = (v4){{right.x,     up.x,        normal.x,    0}};
    659 	result.c[1] = (v4){{right.y,     up.y,        normal.y,    0}};
    660 	result.c[2] = (v4){{right.z,     up.z,        normal.z,    0}};
    661 	result.c[3] = (v4){{translate.x, translate.y, translate.z, 1}};
    662 	return result;
    663 }
    664 
    665 /* NOTE(rnp): adapted from "Essential Mathematics for Games and Interactive Applications" (Verth, Bishop) */
    666 function f32
    667 obb_raycast(m4 obb_orientation, v3 obb_size, v3 obb_center, ray r)
    668 {
    669 	v3 p = v3_sub(obb_center, r.origin);
    670 	v3 X = obb_orientation.c[0].xyz;
    671 	v3 Y = obb_orientation.c[1].xyz;
    672 	v3 Z = obb_orientation.c[2].xyz;
    673 
    674 	/* NOTE(rnp): projects direction vector onto OBB axis */
    675 	v3 f;
    676 	f.x = v3_dot(X, r.direction);
    677 	f.y = v3_dot(Y, r.direction);
    678 	f.z = v3_dot(Z, r.direction);
    679 
    680 	/* NOTE(rnp): projects relative vector onto OBB axis */
    681 	v3 e;
    682 	e.x = v3_dot(X, p);
    683 	e.y = v3_dot(Y, p);
    684 	e.z = v3_dot(Z, p);
    685 
    686 	f32 result = 0;
    687 	f32 t[6] = {0};
    688 	for (i32 i = 0; i < 3; i++) {
    689 		if (f32_equal(f.E[i], 0)) {
    690 			if (-e.E[i] - obb_size.E[i] > 0 || -e.E[i] + obb_size.E[i] < 0)
    691 				result = -1.0f;
    692 			f.E[i] = F32_EPSILON;
    693 		}
    694 		t[i * 2 + 0] = (e.E[i] + obb_size.E[i]) / f.E[i];
    695 		t[i * 2 + 1] = (e.E[i] - obb_size.E[i]) / f.E[i];
    696 	}
    697 
    698 	if (result != -1) {
    699 		f32 tmin = Max(Max(Min(t[0], t[1]), Min(t[2], t[3])), Min(t[4], t[5]));
    700 		f32 tmax = Min(Min(Max(t[0], t[1]), Max(t[2], t[3])), Max(t[4], t[5]));
    701 		if (tmax >= 0 && tmin <= tmax) {
    702 			result = tmin > 0 ? tmin : tmax;
    703 		} else {
    704 			result = -1;
    705 		}
    706 	}
    707 
    708 	return result;
    709 }
    710 
    711 function f32
    712 complex_filter_first_moment(v2 *filter, i32 length, f32 sampling_frequency)
    713 {
    714 	f32 n = 0, d = 0;
    715 	for (i32 i = 0; i < length; i++) {
    716 		f32 t = v2_magnitude_squared(filter[i]);
    717 		n += (f32)i * t;
    718 		d += t;
    719 	}
    720 	f32 result = n / d / sampling_frequency;
    721 	return result;
    722 }
    723 
    724 function f32
    725 real_filter_first_moment(f32 *filter, i32 length, f32 sampling_frequency)
    726 {
    727 	f32 n = 0, d = 0;
    728 	for (i32 i = 0; i < length; i++) {
    729 		f32 t = filter[i] * filter[i];
    730 		n += (f32)i * t;
    731 		d += t;
    732 	}
    733 	f32 result = n / d / sampling_frequency;
    734 	return result;
    735 }
    736 
    737 function f32
    738 tukey_window(f32 t, f32 tapering)
    739 {
    740 	f32 r = tapering;
    741 	f32 result = 1;
    742 	if (t < r / 2)      result = 0.5f * (1 + cos_f32(2 * PI * (t - r / 2)     / r));
    743 	if (t >= 1 - r / 2) result = 0.5f * (1 + cos_f32(2 * PI * (t - 1 + r / 2) / r));
    744 	return result;
    745 }
    746 
    747 /* NOTE(rnp): adapted from "Discrete Time Signal Processing" (Oppenheim) */
    748 function f32 *
    749 kaiser_low_pass_filter(Arena *arena, f32 cutoff_frequency, f32 sampling_frequency, f32 beta, i32 length)
    750 {
    751 	f32 *result = push_array(arena, f32, length);
    752 	f32 wc      = 2 * PI * cutoff_frequency / sampling_frequency;
    753 	f32 a       = (f32)length / 2.0f;
    754 	f32 pi_i0_b = PI * (f32)cephes_i0(beta);
    755 
    756 	for (i32 n = 0; n < length; n++) {
    757 		f32 t       = (f32)n - a;
    758 		f32 impulse = !f32_equal(t, 0) ? sin_f32(wc * t) / t : wc;
    759 		t           = t / a;
    760 		f32 window  = (f32)cephes_i0(beta * sqrt_f32(1 - t * t)) / pi_i0_b;
    761 		result[n]   = impulse * window;
    762 	}
    763 
    764 	return result;
    765 }
    766 
    767 function f32 *
    768 rf_chirp(Arena *arena, f32 min_frequency, f32 max_frequency, f32 sampling_frequency,
    769          i32 length, b32 reverse)
    770 {
    771 	f32 *result = push_array(arena, f32, length);
    772 	for (i32 i = 0; i < length; i++) {
    773 		i32 index = reverse? length - 1 - i : i;
    774 		f32 fc    = min_frequency + (f32)i * (max_frequency - min_frequency) / (2 * (f32)length);
    775 		f32 arg   = 2 * PI * fc * (f32)i / sampling_frequency;
    776 		result[index] = sin_f32(arg) * tukey_window((f32)i / (f32)length, 0.2f);
    777 	}
    778 	return result;
    779 }
    780 
    781 function v2 *
    782 baseband_chirp(Arena *arena, f32 min_frequency, f32 max_frequency, f32 sampling_frequency,
    783                i32 length, b32 reverse, f32 scale)
    784 {
    785 	v2 *result    = push_array(arena, v2, length);
    786 	f32 conjugate = reverse ? -1 : 1;
    787 	for (i32 i = 0; i < length; i++) {
    788 		i32 index = reverse? length - 1 - i : i;
    789 		f32 fc    = min_frequency + (f32)i * (max_frequency - min_frequency) / (2 * (f32)length);
    790 		f32 arg   = 2 * PI * fc * (f32)i / sampling_frequency;
    791 		v2 sample = {{scale * cos_f32(arg), conjugate * scale * sin_f32(arg)}};
    792 		result[index] = v2_scale(sample, tukey_window((f32)i / (f32)length, 0.2f));
    793 	}
    794 	return result;
    795 }
    796 
    797 function iv3
    798 das_output_dimension(iv3 points)
    799 {
    800 	iv3 result;
    801 	result.x = Max(points.x, 1);
    802 	result.y = Max(points.y, 1);
    803 	result.z = Max(points.z, 1);
    804 
    805 	switch (iv3_dimension(result)) {
    806 	case 1:{
    807 		if (result.y > 1) result.x = result.y;
    808 		if (result.z > 1) result.x = result.z;
    809 		result.y = result.z = 1;
    810 	}break;
    811 
    812 	case 2:{
    813 		if (result.x > 1) {
    814 			if (result.z > 1) result.y = result.z;
    815 		} else {
    816 			result.x = result.z;
    817 		}
    818 		result.z = 1;
    819 	}break;
    820 
    821 	case 3:{}break;
    822 
    823 	InvalidDefaultCase;
    824 	}
    825 
    826 	return result;
    827 }
    828 
    829 function m4
    830 das_transform_1d(v3 p1, v3 p2)
    831 {
    832 	v3 extent = v3_sub(p2, p1);
    833 	m4 result = {
    834 		.c[0] = (v4){{extent.x, extent.y, extent.z, 0.0f}},
    835 		.c[1] = (v4){{0.0f, 0.0f, 0.0f, 0.0f}},
    836 		.c[2] = (v4){{0.0f, 0.0f, 0.0f, 0.0f}},
    837 		.c[3] = (v4){{p1.x, p1.y, p1.z, 1.0f}},
    838 	};
    839 	return result;
    840 }
    841 
    842 function m4
    843 das_transform_2d_with_normal(v3 normal, v2 min_coordinate, v2 max_coordinate, f32 offset)
    844 {
    845 	v3 U = {{0, 1.0f, 0}};
    846 	if (f32_equal(v3_dot(U, normal), 1.0f))
    847 		U = (v3){{1.0f, 0, 0}};
    848 
    849 	v3 N = normal;
    850 	v3 V = cross(U, N);
    851 
    852 	v3 min = v3_add(v3_scale(U, min_coordinate.x), v3_scale(V, min_coordinate.y));
    853 	v3 max = v3_add(v3_scale(U, max_coordinate.x), v3_scale(V, max_coordinate.y));
    854 
    855 	v3 extent = v3_sub(max, min);
    856 	U = v3_scale(U, v3_dot(U, extent));
    857 	V = v3_scale(V, v3_dot(V, extent));
    858 
    859 	v3 t = v3_add(v3_scale(N, offset), min);
    860 
    861 	m4 result;
    862 	result.c[0] = (v4){{U.x,  U.y,  U.z,  0.0f}};
    863 	result.c[1] = (v4){{V.x,  V.y,  V.z,  0.0f}};
    864 	result.c[2] = (v4){{N.x,  N.y,  N.z,  0.0f}};
    865 	result.c[3] = (v4){{t.x,  t.y,  t.z,  1.0f}};
    866 
    867 	return result;
    868 }
    869 
    870 function m4
    871 das_transform_2d_xz(v2 min_coordinate, v2 max_coordinate, f32 y_off)
    872 {
    873 	m4 result = das_transform_2d_with_normal((v3){.y = 1.0f}, min_coordinate, max_coordinate, y_off);
    874 	return result;
    875 }
    876 
    877 function m4
    878 das_transform_2d_yz(v2 min_coordinate, v2 max_coordinate, f32 x_off)
    879 {
    880 	// NOTE(rnp): flip so that region extends in correct direction
    881 	m4 result = das_transform_2d_with_normal((v3){.x = -1.0f}, min_coordinate, max_coordinate, x_off);
    882 	return result;
    883 }
    884 
    885 function m4
    886 das_transform_2d_xy(v2 min_coordinate, v2 max_coordinate, f32 z_off)
    887 {
    888 	m4 result = das_transform_2d_with_normal((v3){.z = 1.0f}, min_coordinate, max_coordinate, z_off);
    889 	return result;
    890 }
    891 
    892 function m4
    893 das_transform_3d(v3 min_coordinate, v3 max_coordinate)
    894 {
    895 	v3 extent = v3_sub(max_coordinate, min_coordinate);
    896 	m4 result;
    897 	result.c[0] = (v4){{extent.x,         0.0f,             0.0f,             0.0f}};
    898 	result.c[1] = (v4){{0.0f,             extent.y,         0.0f,             0.0f}};
    899 	result.c[2] = (v4){{0.0f,             0.0f,             extent.z,         0.0f}};
    900 	result.c[3] = (v4){{min_coordinate.x, min_coordinate.y, min_coordinate.z, 1.0f}};
    901 	return result;
    902 }
    903 
    904 function m4
    905 das_transform(v3 min_coordinate, v3 max_coordinate, iv3 *points)
    906 {
    907 	m4 result;
    908 
    909 	*points = das_output_dimension(*points);
    910 
    911 	switch (iv3_dimension(*points)) {
    912 	case 1:{result = das_transform_1d(      min_coordinate,     max_coordinate);    }break;
    913 	case 2:{result = das_transform_2d_xz(XY(min_coordinate), XY(max_coordinate), 0);}break;
    914 	case 3:{result = das_transform_3d(      min_coordinate,     max_coordinate);    }break;
    915 	}
    916 
    917 	return result;
    918 }
    919 
    920 function v3
    921 plane_normal_from_transform(m4 transform)
    922 {
    923 	v3 U = v3_normalize(transform.c[0].xyz);
    924 	v3 V = v3_normalize(transform.c[1].xyz);
    925 	v3 result  = cross(V, U);
    926 	return result;
    927 }
    928 
    929 function f32
    930 plane_offset_from_transform(m4 transform)
    931 {
    932 	f32 result = v3_dot(plane_normal_from_transform(transform), transform.c[3].xyz);
    933 	return result;
    934 }
    935 
    936 function void
    937 plane_corners_from_transform(m4 transform, v2 *min, v2 *max)
    938 {
    939 	v3 U = v3_normalize(transform.c[0].xyz);
    940 	v3 V = v3_normalize(transform.c[1].xyz);
    941 
    942 	v3 min_3d = m4_mul_v3(transform, (v3){{0.f, 0.f, 0.f}});
    943 	v3 max_3d = m4_mul_v3(transform, (v3){{1.f, 1.f, 1.f}});
    944 
    945 	if (min) *min = (v2){{v3_dot(U, min_3d), v3_dot(V, min_3d)}};
    946 	if (max) *max = (v2){{v3_dot(U, max_3d), v3_dot(V, max_3d)}};
    947 }
    948 
    949 function v2
    950 plane_uv(v3 point, v3 U, v3 V)
    951 {
    952 	v2 result;
    953 	result.x = v3_dot(U, point) / v3_dot(U, U);
    954 	result.y = v3_dot(V, point) / v3_dot(V, V);
    955 	return result;
    956 }
    957 
    958 function v4
    959 hsv_to_rgb(v4 hsv)
    960 {
    961 	/* f(k(n))   = V - V*S*max(0, min(k, min(4 - k, 1)))
    962 	 * k(n)      = fmod((n + H * 6), 6)
    963 	 * (R, G, B) = (f(n = 5), f(n = 3), f(n = 1))
    964 	 */
    965 	alignas(16) f32 nval[4] = {5.0f, 3.0f, 1.0f, 0.0f};
    966 	f32x4 n   = load_f32x4(nval);
    967 	f32x4 H   = dup_f32x4(hsv.x);
    968 	f32x4 S   = dup_f32x4(hsv.y);
    969 	f32x4 V   = dup_f32x4(hsv.z);
    970 	f32x4 six = dup_f32x4(6);
    971 
    972 	f32x4 t   = add_f32x4(n, mul_f32x4(six, H));
    973 	f32x4 rem = floor_f32x4(div_f32x4(t, six));
    974 	f32x4 k   = sub_f32x4(t, mul_f32x4(rem, six));
    975 
    976 	t = min_f32x4(sub_f32x4(dup_f32x4(4), k), dup_f32x4(1));
    977 	t = max_f32x4(dup_f32x4(0), min_f32x4(k, t));
    978 	t = mul_f32x4(t, mul_f32x4(S, V));
    979 
    980 	v4 rgba;
    981 	store_f32x4(rgba.E, sub_f32x4(V, t));
    982 	rgba.a = hsv.a;
    983 	return rgba;
    984 }
    985 
    986 function f32
    987 ease_in_out_cubic(f32 t)
    988 {
    989 	f32 result;
    990 	if (t < 0.5f) {
    991 		result = 4.0f * t * t * t;
    992 	} else {
    993 		t      = -2.0f * t + 2.0f;
    994 		result =  1.0f - t * t * t / 2.0f;
    995 	}
    996 	return result;
    997 }
    998 
    999 function f32
   1000 ease_in_out_quartic(f32 t)
   1001 {
   1002 	f32 result;
   1003 	if (t < 0.5f) {
   1004 		result = 8.0f * t * t * t * t;
   1005 	} else {
   1006 		t      = -2.0f * t + 2.0f;
   1007 		result =  1.0f - t * t * t * t / 2.0f;
   1008 	}
   1009 	return result;
   1010 }