#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H

#include <volk/volk_complex.h>
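/*!
 * \b Overview
 *
 * Calculates the square distance between a single complex symbol and each
 * point in a complex vector, scaling each result by a scalar value:
 *
 *   target[i] = scalar * |src0[0] - points[i]|^2
 *
 * A typical use is computing scaled distances from a received sample to
 * every point of a constellation. A minimal sketch (the QPSK table and
 * values here are illustrative only):
 *
 * \code
 *   lv_32fc_t constellation[4] = { lv_cmake(1.f, 1.f),
 *                                  lv_cmake(-1.f, 1.f),
 *                                  lv_cmake(-1.f, -1.f),
 *                                  lv_cmake(1.f, -1.f) };
 *   lv_32fc_t sample = lv_cmake(0.75f, -1.2f);
 *   float dists[4];
 *   volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(
 *       dists, &sample, constellation, 2.f, 4);
 * \endcode
 */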
static inline void calculate_scaled_distances(float* target,
                                              const lv_32fc_t symbol,
                                              const lv_32fc_t* points,
                                              const float scalar,
                                              const unsigned int num_points)
{
    lv_32fc_t diff;
    for (unsigned int i = 0; i < num_points; ++i) {
        // Calculate: |y - x|^2 * scalar
        diff = symbol - *points++;
        *target++ = scalar *
                    (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
    }
}
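/* Each SIMD protokernel below processes points in blocks and falls back on
 * calculate_scaled_distances() for whatever is left at the tail. */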
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

    const unsigned int bound = num_bytes >> 6;

    // Load the complex symbol into all parts of the register.
    const __m256 xmm_symbol =
        _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Load the scalar into all 8 parts of the register.
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Set the permutation constant used to reorder the hadd results.
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
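    /* Broadcasting the symbol as a double copies its (re, im) float pair
     * into every 64-bit lane. After _mm256_hadd_ps sums adjacent pairs
     * within each 128-bit half, the four norms sit in lanes 0, 1, 4, 5
     * (duplicated in lanes 2, 3, 6, 7); idx gathers a contiguous copy of
     * them into each half, so either half can be stored. */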
    for (unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm_points1 = _mm256_load_ps((float*)(points + 4));
        points += 8;

        // Calculate the scaled distances for 8 points at once.
        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_store_ps(target, xmm_result);
        target += 8;
    }
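    /* The tail is handled in progressively smaller chunks: with num_bytes
     * equal to num_points * 8, bit 5 flags 4 leftover points, bit 4 flags
     * 2, and bit 3 flags a final single point. */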
    if (num_bytes >> 5 & 1) {
        // Calculate distances for 4 points.
        xmm_points0 = _mm256_load_ps((float*)points);

        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_store_ps(target, xmm9);

        target += 4;
    }
    if (num_bytes >> 4 & 1) {
        // Calculate distances for 2 points.
        xmm9 = _mm_load_ps((float*)points);

        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        // The two results sit in the high half after the hadd.
        _mm_storeh_pi((__m64*)target, xmm10);

        target += 2;
    }

    // A final odd point falls back on the generic helper.
    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}

#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const int eightsPoints = num_points / 8;
    const int remainder = num_points - 8 * eightsPoints;
    __m256 xmm_points0, xmm_points1, xmm_result;

    // Load the complex symbol into all parts of the register.
    const __m256 xmm_symbol =
        _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));

    // Load the scalar into all 8 parts of the register.
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    for (int i = 0; i < eightsPoints; ++i) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm_points1 = _mm256_load_ps((float*)(points + 4));
        points += 8;

        // Calculate the scaled distances for 8 points at once.
        xmm_result = _mm256_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_store_ps(target, xmm_result);
        target += 8;
    }

    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, remainder);
}

#endif // LV_HAVE_AVX
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    __m128 xmm_points0, xmm_points1, xmm_result;

    // The main loop consumes 4 points per iteration; leftovers0 flags a
    // remaining pair of points and leftovers1 a final odd point.
    const int quarterPoints = num_points / 4;
    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
    const int leftovers1 = num_points % 2;
    // Load the complex symbol into both halves of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    // Load the scalar into all 4 parts of the register.
    const __m128 xmm_scalar = _mm_load1_ps(&scalar);
    for (int i = 0; i < quarterPoints; ++i) {
        xmm_points0 = _mm_load_ps((float*)points);
        xmm_points1 = _mm_load_ps((float*)(points + 2));
        points += 4;

        // Calculate the scaled distances for 4 points at once.
        xmm_result = _mm_scaled_norm_dist_ps_sse3(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm_store_ps(target, xmm_result);
        target += 4;
    }
    for (int i = 0; i < leftovers0; ++i) {
        // Calculate the scaled distances for a pair of points.
        xmm_points0 = _mm_load_ps((float*)points);
        points += 2;

        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);

        _mm_storeh_pi((__m64*)target, xmm_result);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}

#endif // LV_HAVE_SSE3
#ifdef LV_HAVE_SSE
#include <volk/volk_sse_intrinsics.h>
#include <xmmintrin.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const __m128 xmm_scalar = _mm_set1_ps(scalar);
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
    for (unsigned i = 0; i < num_points / 4; ++i) {
        __m128 xmm_points0 = _mm_load_ps((float*)points);
        __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
        points += 4;
        __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
        _mm_store_ps((float*)target, xmm_result);
        target += 4;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE
#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target,
                                                      lv_32fc_t* src0,
                                                      lv_32fc_t* points,
                                                      float scalar,
                                                      unsigned int num_points)
{
    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, num_points);
}

#endif // LV_HAVE_GENERIC

#endif // INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

    const unsigned int bound = num_bytes >> 6;

    // Load the complex symbol into all parts of the register.
    const __m256 xmm_symbol =
        _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Load the scalar into all 8 parts of the register.
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Set the permutation constant (see the lane notes in the aligned version).
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    for (unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
        points += 8;

        // Calculate the scaled distances for 8 points at once.
        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_storeu_ps(target, xmm_result);
        target += 8;
    }
    if (num_bytes >> 5 & 1) {
        // Calculate distances for 4 points.
        xmm_points0 = _mm256_loadu_ps((float*)points);

        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_storeu_ps(target, xmm9);

        target += 4;
    }
    if (num_bytes >> 4 & 1) {
        // Calculate distances for 2 points.
        xmm9 = _mm_loadu_ps((float*)points);

        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        // The two results sit in the high half after the hadd.
        _mm_storeh_pi((__m64*)target, xmm10);

        target += 2;
    }

    // A final odd point falls back on the generic helper.
    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}

#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const int eightsPoints = num_points / 8;
    const int remainder = num_points - 8 * eightsPoints;
    __m256 xmm_points0, xmm_points1, xmm_result;

    // Load the complex symbol into all parts of the register.
    const __m256 xmm_symbol =
        _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));

    // Load the scalar into all 8 parts of the register.
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    for (int i = 0; i < eightsPoints; ++i) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
        points += 8;

        // Calculate the scaled distances for 8 points at once.
        xmm_result = _mm256_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_storeu_ps(target, xmm_result);
        target += 8;
    }

    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, remainder);
}

#endif // LV_HAVE_AVX
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    __m128 xmm_points0, xmm_points1, xmm_result;

    // The main loop consumes 4 points per iteration; leftovers0 flags a
    // remaining pair of points and leftovers1 a final odd point.
    const int quarterPoints = num_points / 4;
    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
    const int leftovers1 = num_points % 2;
    // Load the complex symbol into both halves of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    // Load the scalar into all 4 parts of the register.
    const __m128 xmm_scalar = _mm_load1_ps(&scalar);
    for (int i = 0; i < quarterPoints; ++i) {
        xmm_points0 = _mm_loadu_ps((float*)points);
        xmm_points1 = _mm_loadu_ps((float*)(points + 2));
        points += 4;

        // Calculate the scaled distances for 4 points at once.
        xmm_result = _mm_scaled_norm_dist_ps_sse3(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm_storeu_ps(target, xmm_result);
        target += 4;
    }
    for (int i = 0; i < leftovers0; ++i) {
        // Calculate the scaled distances for a pair of points.
        xmm_points0 = _mm_loadu_ps((float*)points);
        points += 2;

        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);

        _mm_storeh_pi((__m64*)target, xmm_result);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}

#endif // LV_HAVE_SSE3
#ifdef LV_HAVE_SSE
#include <volk/volk_sse_intrinsics.h>
#include <xmmintrin.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const __m128 xmm_scalar = _mm_set1_ps(scalar);
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
    for (unsigned i = 0; i < num_points / 4; ++i) {
        __m128 xmm_points0 = _mm_loadu_ps((float*)points);
        __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
        points += 4;
        __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
        _mm_storeu_ps((float*)target, xmm_result);
        target += 4;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE

#endif // INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H