Skip to content

Commit 38df49c

Browse files
committed
Break: Shorter symbol names
1 parent 75993e7 commit 38df49c

File tree

9 files changed

+707
-603
lines changed

9 files changed

+707
-603
lines changed

README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -637,9 +637,9 @@ int main() {
637637
simsimd_f32_t vector_a[1536];
638638
simsimd_f32_t vector_b[1536];
639639
simsimd_kernel_punned_t distance_function = simsimd_metric_punned(
640-
simsimd_cos_k, // Metric kind, like the angular cosine distance
641-
simsimd_datatype_f32_k, // Data type, like: f16, f32, f64, i8, b8, and complex variants
642-
simsimd_cap_any_k); // Which CPU capabilities are we allowed to use
640+
simsimd_cos_k, // Metric kind, like the angular cosine distance
641+
simsimd_f32_k, // Data type, like: f16, f32, f64, i8, b8, complex variants, etc.
642+
simsimd_cap_any_k); // Which CPU capabilities are we allowed to use
643643
simsimd_distance_t distance;
644644
distance_function(vector_a, vector_b, 1536, &distance);
645645
return 0;

c/lib.c

+94-94
Large diffs are not rendered by default.

golang/simsimd.go

+6-6
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@ package simsimd
88
#include "../include/simsimd/simsimd.h"
99
#include <stdlib.h>
1010
11-
inline static simsimd_f32_t cosine_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_cosine_k, simsimd_datatype_i8_k, simsimd_cap_any_k)(a, b, d, d); }
12-
inline static simsimd_f32_t cosine_f32(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_cosine_k, simsimd_datatype_f32_k, simsimd_cap_any_k)(a, b, d, d); }
13-
inline static simsimd_f32_t inner_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_inner_k, simsimd_datatype_i8_k, simsimd_cap_any_k)(a, b, d, d); }
14-
inline static simsimd_f32_t inner_f32(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_inner_k, simsimd_datatype_f32_k, simsimd_cap_any_k)(a, b, d, d); }
15-
inline static simsimd_f32_t sqeuclidean_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_sqeuclidean_k, simsimd_datatype_i8_k, simsimd_cap_any_k)(a, b, d, d); }
16-
inline static simsimd_f32_t sqeuclidean_f32(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_sqeuclidean_k, simsimd_datatype_f32_k, simsimd_cap_any_k)(a, b, d, d); }
11+
inline static simsimd_f32_t cosine_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_cosine_k, simsimd_i8_k, simsimd_cap_any_k)(a, b, d, d); }
12+
inline static simsimd_f32_t cosine_f32(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_cosine_k, simsimd_f32_k, simsimd_cap_any_k)(a, b, d, d); }
13+
inline static simsimd_f32_t inner_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_inner_k, simsimd_i8_k, simsimd_cap_any_k)(a, b, d, d); }
14+
inline static simsimd_f32_t inner_f32(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_inner_k, simsimd_f32_k, simsimd_cap_any_k)(a, b, d, d); }
15+
inline static simsimd_f32_t sqeuclidean_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_sqeuclidean_k, simsimd_i8_k, simsimd_cap_any_k)(a, b, d, d); }
16+
inline static simsimd_f32_t sqeuclidean_f32(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_sqeuclidean_k, simsimd_f32_k, simsimd_cap_any_k)(a, b, d, d); }
1717
*/
1818
import "C"
1919

include/simsimd/elementwise.h

+105-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* @author Ash Vardanian
55
* @date October 16, 2024
66
*
7-
* Contains following element-wise operations:
7+
* Contains following @b Unary/Binary/Ternary element-wise operations:
88
* - Scale (Multiply) with Shift: R[i] = Alpha * A[i] + Beta
99
* - Sum (Add): R[i] = A[i] + B[i]
1010
* - WSum or Weighted-Sum: R[i] = Alpha * A[i] + Beta * B[i]
@@ -1211,6 +1211,110 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_haswell(
12111211
}
12121212
}
12131213

1214+
SIMSIMD_PUBLIC void simsimd_sum_i16_haswell(simsimd_i16_t const *a, simsimd_i16_t const *b, simsimd_size_t n,
                                            simsimd_i16_t *result) {
    // Saturating element-wise sum: result[i] = a[i] + b[i], clipped to the i16 range.
    simsimd_size_t idx = 0;

    // Vectorized body: 16 lanes of 16-bit integers per 256-bit register.
    // `_mm256_adds_epi16` saturates on overflow instead of wrapping.
    while (idx + 16 <= n) {
        __m256i lhs_vec = _mm256_lddqu_si256((__m256i *)(a + idx));
        __m256i rhs_vec = _mm256_lddqu_si256((__m256i *)(b + idx));
        _mm256_storeu_si256((__m256i *)(result + idx), _mm256_adds_epi16(lhs_vec, rhs_vec));
        idx += 16;
    }

    // Scalar tail: widen to 64 bits so the addition cannot overflow,
    // then let the helper saturate the value back down to i16.
    while (idx < n) {
        simsimd_i64_t wide_sum = (simsimd_i64_t)a[idx] + (simsimd_i64_t)b[idx];
        _simsimd_i64_to_i16(&wide_sum, result + idx);
        ++idx;
    }
}
1232+
1233+
SIMSIMD_PUBLIC void simsimd_scale_i16_haswell(simsimd_i16_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
                                              simsimd_distance_t beta, simsimd_i16_t *result) {
    // Affine scaling: result[i] = alpha * a[i] + beta, saturated to the i16 range.
    // Fix: dropped a dead duplicate load (a second `b_vec` read from `a + i` that was
    // never used — a copy-paste leftover from the binary kernels).
    simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
    simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
    __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
    __m256 beta_vec = _mm256_set1_ps(beta_f32);

    // The main loop: widen 8 x i16 to f32, apply a fused multiply-add,
    // then clamp to the i16 range and pack back down.
    simsimd_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        __m256 a_vec = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i *)(a + i))));
        __m256 sum_vec = _mm256_fmadd_ps(a_vec, alpha_vec, beta_vec);
        // `_mm_packs_epi32` saturates, but clamping first keeps the rounding of
        // `_mm256_cvtps_epi32` from producing surprising wrap-around on huge inputs.
        __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
        sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-32768));
        sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(32767));
        __m128i sum_i16_vec =
            _mm_packs_epi32(_mm256_castsi256_si128(sum_i32_vec), _mm256_extracti128_si256(sum_i32_vec, 1));
        _mm_storeu_si128((__m128i *)(result + i), sum_i16_vec);
    }

    // The tail: scalar fallback with the same saturating conversion.
    for (; i < n; ++i) {
        simsimd_f32_t ai = a[i];
        simsimd_f32_t sum = alpha_f32 * ai + beta_f32;
        _simsimd_f32_to_i16(&sum, result + i);
    }
}
1262+
1263+
SIMSIMD_PUBLIC void simsimd_fma_i16_haswell( //
    simsimd_i16_t const *a, simsimd_i16_t const *b, simsimd_i16_t const *c, simsimd_size_t n, //
    simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i16_t *result) {
    // Fused multiply-add: result[i] = alpha * a[i] * b[i] + beta * c[i],
    // saturated to the i16 range.
    //
    // Fix: the entire body was wrapped in `#if 0`, making this public symbol a silent
    // no-op that left `result` uninitialized. A serial implementation (identical to the
    // disabled draft's own tail loop) is now active; the AVX2 draft is kept below for
    // a future revisit, with its clamp bounds corrected from the i8 range [-128, 127]
    // (a copy-paste leftover from the i8/u8 kernels) to the i16 range.
    simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
    simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
    for (simsimd_size_t i = 0; i < n; ++i) {
        simsimd_f32_t ai = a[i], bi = b[i], ci = c[i];
        simsimd_f32_t sum = alpha_f32 * ai * bi + beta_f32 * ci;
        _simsimd_f32_to_i16(&sum, result + i);
    }

#if 0
    //? Handling loads and stores with SIMD is tricky: not because of the upcasting,
    //? but the downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day.
    __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
    __m256 beta_vec = _mm256_set1_ps(beta_f32);
    int sum_i32s[8], a_i32s[8], b_i32s[8], c_i32s[8];

    // The main loop:
    simsimd_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        //! This can be done at least 50% faster if we convert 16-bit integers to floats
        //! directly instead of relying on the slow `_mm256_cvtepi32_ps` instruction.
        for (int j = 0; j < 8; ++j) a_i32s[j] = a[i + j], b_i32s[j] = b[i + j], c_i32s[j] = c[i + j];
        __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
        __m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s));
        __m256 c_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)c_i32s));
        // The normal part.
        __m256 ab_vec = _mm256_mul_ps(a_vec, b_vec);
        __m256 ab_scaled_vec = _mm256_mul_ps(ab_vec, alpha_vec);
        __m256 sum_vec = _mm256_fmadd_ps(c_vec, beta_vec, ab_scaled_vec);
        // Instead of serial calls to expensive `_simsimd_f32_to_i16`, convert and clip with SIMD.
        __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
        sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-32768));
        sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(32767));
        // Export into a serial buffer.
        _mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec);
        for (int j = 0; j < 8; ++j) result[i + j] = (simsimd_i16_t)sum_i32s[j];
    }

    // The tail:
    for (; i < n; ++i) {
        simsimd_f32_t ai = a[i], bi = b[i], ci = c[i];
        simsimd_f32_t sum = alpha_f32 * ai * bi + beta_f32 * ci;
        _simsimd_f32_to_i16(&sum, result + i);
    }
#endif
}
1317+
12141318
#pragma clang attribute pop
12151319
#pragma GCC pop_options
12161320
#endif // SIMSIMD_TARGET_HASWELL

0 commit comments

Comments
 (0)