|
4 | 4 | * @author Ash Vardanian
|
5 | 5 | * @date October 16, 2024
|
6 | 6 | *
|
7 |
| - * Contains following element-wise operations: |
| 7 | + * Contains the following @b Unary/Binary/Ternary element-wise operations: |
8 | 8 | * - Scale (Multiply) with Shift: R[i] = Alpha * A[i] + Beta
|
9 | 9 | * - Sum (Add): R[i] = A[i] + B[i]
|
10 | 10 | * - WSum or Weighted-Sum: R[i] = Alpha * A[i] + Beta * B[i]
|
@@ -1211,6 +1211,110 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_haswell(
|
1211 | 1211 | }
|
1212 | 1212 | }
|
1213 | 1213 |
|
| 1214 | +SIMSIMD_PUBLIC void simsimd_sum_i16_haswell(simsimd_i16_t const *a, simsimd_i16_t const *b, simsimd_size_t n, |
| 1215 | + simsimd_i16_t *result) { |
| 1216 | + // The main loop: |
| 1217 | + simsimd_size_t i = 0; |
| 1218 | + for (; i + 16 <= n; i += 16) { |
| 1219 | + __m256i a_vec = _mm256_lddqu_si256((__m256i *)(a + i)); |
| 1220 | + __m256i b_vec = _mm256_lddqu_si256((__m256i *)(b + i)); |
| 1221 | + __m256i sum_vec = _mm256_adds_epi16(a_vec, b_vec); |
| 1222 | + _mm256_storeu_si256((__m256i *)(result + i), sum_vec); |
| 1223 | + } |
| 1224 | + |
| 1225 | + // The tail: |
| 1226 | + for (; i < n; ++i) { |
| 1227 | + simsimd_i64_t ai = a[i], bi = b[i]; |
| 1228 | + simsimd_i64_t sum = ai + bi; |
| 1229 | + _simsimd_i64_to_i16(&sum, result + i); |
| 1230 | + } |
| 1231 | +} |
| 1232 | + |
| 1233 | +SIMSIMD_PUBLIC void simsimd_scale_i16_haswell(simsimd_i16_t const *a, simsimd_size_t n, simsimd_distance_t alpha, |
| 1234 | + simsimd_distance_t beta, simsimd_i16_t *result) { |
| 1235 | + |
| 1236 | + simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha; |
| 1237 | + simsimd_f32_t beta_f32 = (simsimd_f32_t)beta; |
| 1238 | + __m256 alpha_vec = _mm256_set1_ps(alpha_f32); |
| 1239 | + __m256 beta_vec = _mm256_set1_ps(beta_f32); |
| 1240 | + |
| 1241 | + // The main loop: |
| 1242 | + simsimd_size_t i = 0; |
| 1243 | + for (; i + 8 <= n; i += 8) { |
| 1244 | + __m256 a_vec = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i *)(a + i)))); |
| 1245 | + __m256 b_vec = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i *)(a + i)))); |
| 1246 | + __m256 sum_vec = _mm256_fmadd_ps(a_vec, alpha_vec, beta_vec); |
| 1247 | + __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec); |
| 1248 | + sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-32768)); |
| 1249 | + sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(32767)); |
| 1250 | + __m128i sum_i16_vec = |
| 1251 | + _mm_packs_epi32(_mm256_castsi256_si128(sum_i32_vec), _mm256_extracti128_si256(sum_i32_vec, 1)); |
| 1252 | + _mm_storeu_si128((__m128i *)(result + i), sum_i16_vec); |
| 1253 | + } |
| 1254 | + |
| 1255 | + // The tail: |
| 1256 | + for (; i < n; ++i) { |
| 1257 | + simsimd_f32_t ai = a[i]; |
| 1258 | + simsimd_f32_t sum = alpha_f32 * ai + beta_f32; |
| 1259 | + _simsimd_f32_to_i16(&sum, result + i); |
| 1260 | + } |
| 1261 | +} |
| 1262 | + |
| 1263 | +SIMSIMD_PUBLIC void simsimd_fma_i16_haswell( // |
| 1264 | + simsimd_i16_t const *a, simsimd_i16_t const *b, simsimd_i16_t const *c, simsimd_size_t n, // |
| 1265 | + simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i16_t *result) { |
| 1266 | +#if 0 |
| 1267 | + simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha; |
| 1268 | + simsimd_f32_t beta_f32 = (simsimd_f32_t)beta; |
| 1269 | + __m256 alpha_vec = _mm256_set1_ps(alpha_f32); |
| 1270 | + __m256 beta_vec = _mm256_set1_ps(beta_f32); |
| 1271 | + int sum_i32s[8], a_i32s[8], b_i32s[8], c_i32s[8]; |
| 1272 | + |
| 1273 | + // The main loop: |
| 1274 | + simsimd_size_t i = 0; |
| 1275 | + for (; i + 8 <= n; i += 8) { |
| 1276 | + //? Handling loads and stores with SIMD is tricky. Not because of upcasting, but the |
| 1277 | + //? downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day. |
| 1278 | + a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], // |
| 1279 | + a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7]; |
| 1280 | + b_i32s[0] = b[i + 0], b_i32s[1] = b[i + 1], b_i32s[2] = b[i + 2], b_i32s[3] = b[i + 3], // |
| 1281 | + b_i32s[4] = b[i + 4], b_i32s[5] = b[i + 5], b_i32s[6] = b[i + 6], b_i32s[7] = b[i + 7]; |
| 1282 | + c_i32s[0] = c[i + 0], c_i32s[1] = c[i + 1], c_i32s[2] = c[i + 2], c_i32s[3] = c[i + 3], // |
| 1283 | + c_i32s[4] = c[i + 4], c_i32s[5] = c[i + 5], c_i32s[6] = c[i + 6], c_i32s[7] = c[i + 7]; |
| 1284 | + //! This can be done at least 50% faster if we convert 8-bit integers to floats instead |
| 1285 | + //! of relying on the slow `_mm256_cvtepi32_ps` instruction. |
| 1286 | + __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s)); |
| 1287 | + __m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s)); |
| 1288 | + __m256 c_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)c_i32s)); |
| 1289 | + // The normal part. |
| 1290 | + __m256 ab_vec = _mm256_mul_ps(a_vec, b_vec); |
| 1291 | + __m256 ab_scaled_vec = _mm256_mul_ps(ab_vec, alpha_vec); |
| 1292 | + __m256 sum_vec = _mm256_fmadd_ps(c_vec, beta_vec, ab_scaled_vec); |
| 1293 | + // Instead of serial calls to expensive `_simsimd_f32_to_u8`, convert and clip with SIMD. |
| 1294 | + __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec); |
| 1295 | + sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-128)); |
| 1296 | + sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(127)); |
| 1297 | + // Export into a serial buffer. |
| 1298 | + _mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec); |
| 1299 | + result[i + 0] = (simsimd_i16_t)sum_i32s[0]; |
| 1300 | + result[i + 1] = (simsimd_i16_t)sum_i32s[1]; |
| 1301 | + result[i + 2] = (simsimd_i16_t)sum_i32s[2]; |
| 1302 | + result[i + 3] = (simsimd_i16_t)sum_i32s[3]; |
| 1303 | + result[i + 4] = (simsimd_i16_t)sum_i32s[4]; |
| 1304 | + result[i + 5] = (simsimd_i16_t)sum_i32s[5]; |
| 1305 | + result[i + 6] = (simsimd_i16_t)sum_i32s[6]; |
| 1306 | + result[i + 7] = (simsimd_i16_t)sum_i32s[7]; |
| 1307 | + } |
| 1308 | + |
| 1309 | + // The tail: |
| 1310 | + for (; i < n; ++i) { |
| 1311 | + simsimd_f32_t ai = a[i], bi = b[i], ci = c[i]; |
| 1312 | + simsimd_f32_t sum = alpha_f32 * ai * bi + beta_f32 * ci; |
| 1313 | + _simsimd_f32_to_i16(&sum, result + i); |
| 1314 | + } |
| 1315 | +#endif |
| 1316 | +} |
| 1317 | + |
1214 | 1318 | #pragma clang attribute pop
|
1215 | 1319 | #pragma GCC pop_options
|
1216 | 1320 | #endif // SIMSIMD_TARGET_HASWELL
|
|
0 commit comments