72 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
73 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
/* Portable scalar fallback, compiled only when LV_HAVE_GENERIC is defined.
 * NOTE(review): the function name, the pointer parameters and the closing
 * braces are not visible in this excerpt; only the trailing num_points
 * parameter is. Element types are presumably complex float ("32fc" in the
 * include guard) -- confirm against the full signature. */
80 #ifdef LV_HAVE_GENERIC
85 unsigned int num_points)
/* One quotient per input point. All three pointers are post-incremented, so
 * every pass reads the next a/b pair and writes the next output slot. */
91 for (
unsigned int number = 0; number < num_points; number++) {
92 *cPtr++ = (*aPtr++) / (*bPtr++);
/* SSE3 variant using unaligned loads/stores (_mm_loadu_ps / _mm_storeu_ps),
 * so a, b and c need no particular alignment.
 * NOTE(review): the function name, pointer parameters, the pointer advances
 * between loads, and the computation of `norm` are elided from this excerpt. */
99 #include <pmmintrin.h>
105 unsigned int num_points)
113 unsigned int number = 0;
/* The vector loop runs once per group of four points. */
114 const unsigned int quarterPoints = num_points / 4;
116 __m128 num01, num23, den01, den23, norm, result;
121 for (; number < quarterPoints; number++) {
/* Each 128-bit load fetches four floats from a (numerator) or b (denominator). */
122 num01 = _mm_loadu_ps((
float*)a);
123 den01 = _mm_loadu_ps((
float*)b);
/* Second pair of loads; presumably a and b were advanced in the elided lines
 * in between -- confirm. */
128 num23 = _mm_loadu_ps((
float*)a);
129 den23 = _mm_loadu_ps((
float*)b);
/* Duplicate each lane of norm: unpacklo yields (n0, n0, n1, n1) and unpackhi
 * yields (n2, n2, n3, n3), so one value divides two adjacent floats. */
135 den01 = _mm_unpacklo_ps(norm, norm);
136 den23 = _mm_unpackhi_ps(norm, norm);
/* Lane-wise divide, then write four floats to c. */
138 result = _mm_div_ps(num01, den01);
139 _mm_storeu_ps((
float*)c, result);
141 result = _mm_div_ps(num23, den23);
142 _mm_storeu_ps((
float*)c, result);
/* Tail loop for the points left over after the vector loop.
 * NOTE(review): `number` counts groups of four in the loop above; the
 * conversion back to a point index is not visible here -- confirm it exists. */
147 for (; number < num_points; number++) {
/* AVX variant using unaligned 256-bit loads/stores.
 * NOTE(review): the function name, pointer parameters, the load arguments,
 * the _mm256_hadd_ps operands and the computation of mul_conj are elided from
 * this excerpt; comments below cover only what is visible. */
158 #include <immintrin.h>
164 unsigned int num_points)
172 unsigned int number = 0;
/* The vector loop runs once per group of four points (one __m256 = 8 floats). */
173 const unsigned int quarterPoints = num_points / 4;
175 __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
180 for (; number < quarterPoints; number++) {
181 num = _mm256_loadu_ps(
183 denum = _mm256_loadu_ps(
/* Square every float of the denominator vector. */
186 sq = _mm256_mul_ps(denum, denum);
187 mag_sq_un = _mm256_hadd_ps(
/* Immediate 0xd8 reorders each 128-bit lane as (e0, e2, e1, e3), undoing the
 * pair interleaving that the horizontal add leaves behind. */
189 mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8);
/* Lane-wise divide of the (elided) numerator product by the squared terms. */
192 div = _mm256_div_ps(mul_conj, mag_sq);
194 _mm256_storeu_ps((
float*)c, div);
/* Convert the group count back to a point index for the scalar tail. */
201 number = quarterPoints * 4;
/* Remaining (num_points % 4) points are divided one at a time. */
203 for (; number < num_points; number++) {
204 *c++ = (*a++) / (*b++);
213 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
214 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
217 #include <inttypes.h>
/* SSE3 variant using aligned loads/stores. _mm_load_ps and _mm_store_ps
 * require 16-byte-aligned addresses, so a, b and c must be aligned accordingly.
 * NOTE(review): the function name, pointer parameters, the pointer advances
 * between loads, and the computation of `norm` are elided from this excerpt. */
222 #include <pmmintrin.h>
228 unsigned int num_points)
236 unsigned int number = 0;
/* The vector loop runs once per group of four points. */
237 const unsigned int quarterPoints = num_points / 4;
239 __m128 num01, num23, den01, den23, norm, result;
244 for (; number < quarterPoints; number++) {
/* Each 128-bit load fetches four floats from a (numerator) or b (denominator). */
245 num01 = _mm_load_ps((
float*)a);
246 den01 = _mm_load_ps((
float*)b);
/* Second pair of loads; presumably a and b were advanced in the elided lines
 * in between -- confirm. */
251 num23 = _mm_load_ps((
float*)a);
252 den23 = _mm_load_ps((
float*)b);
/* Duplicate each lane of norm: unpacklo yields (n0, n0, n1, n1) and unpackhi
 * yields (n2, n2, n3, n3), so one value divides two adjacent floats. */
259 den01 = _mm_unpacklo_ps(norm, norm);
260 den23 = _mm_unpackhi_ps(norm, norm);
/* Lane-wise divide, then write four floats to c. */
262 result = _mm_div_ps(num01, den01);
263 _mm_store_ps((
float*)c, result);
265 result = _mm_div_ps(num23, den23);
266 _mm_store_ps((
float*)c, result);
/* Tail loop for the points left over after the vector loop.
 * NOTE(review): `number` counts groups of four in the loop above; the
 * conversion back to a point index is not visible here -- confirm it exists. */
271 for (; number < num_points; number++) {
/* AVX variant using aligned 256-bit loads/stores. _mm256_load_ps and
 * _mm256_store_ps require 32-byte-aligned addresses.
 * NOTE(review): the function name, pointer parameters, pointer advances, any
 * numerator pre-processing between the loads, and the handling of leftover
 * points are elided from this excerpt. */
281 #include <immintrin.h>
287 unsigned int num_points)
/* Number of full vector iterations; each covers two __m256 per operand. */
303 const unsigned int eigthPoints = num_points / 8;
305 __m256 num01, num23, denum01, denum23, complex_result, result0, result1;
307 for (
unsigned int number = 0; number < eigthPoints; number++) {
/* First 8 floats of numerator and denominator. */
309 num01 = _mm256_load_ps((
float*)a);
310 denum01 = _mm256_load_ps((
float*)b);
/* Next 8 floats; presumably a and b were advanced in the elided lines -- confirm. */
316 num23 = _mm256_load_ps((
float*)a);
317 denum23 = _mm256_load_ps((
float*)b);
/* Square both denominator vectors and add adjacent pairs. Per 128-bit lane the
 * result is (s01[0]+s01[1], s01[2]+s01[3], s23[0]+s23[1], s23[2]+s23[3]),
 * where s01 and s23 are the squared denum01 and denum23. */
322 complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
323 _mm256_mul_ps(denum23, denum23));
/* Spread the pair sums back out: 0x50 selects (c0, c0, c1, c1) per lane, the
 * sums derived from denum01; 0xfa selects (c2, c2, c3, c3), those from
 * denum23. Each sum then lines up with the two floats it came from. */
325 denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
326 denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
/* Lane-wise divide of the numerators by the duplicated sums. */
328 result0 = _mm256_div_ps(num01, denum01);
329 result1 = _mm256_div_ps(num23, denum23);
/* Two stores of 8 floats each; presumably c is advanced between them in an
 * elided line -- confirm. */
331 _mm256_store_ps((
float*)c, result0);
333 _mm256_store_ps((
float*)c, result1);
/* ARM NEON variant processing four points per iteration.
 * NOTE(review): the function name, pointer parameters and the pointer
 * advances inside the loop are elided from this excerpt. The comments below
 * assume the data is interleaved (re, im) float pairs -- confirm. */
343 #include <arm_neon.h>
348 unsigned int num_points)
354 float32x4x2_t aVal, bVal, cVal;
355 float32x4_t bAbs, bAbsInv;
357 const unsigned int quarterPoints = num_points / 4;
358 unsigned int number = 0;
359 for (; number < quarterPoints; number++) {
/* De-interleaving loads: val[0] receives the even-indexed floats and val[1]
 * the odd-indexed ones (real and imaginary parts under the assumption above). */
360 aVal = vld2q_f32((
const float*)(aPtr));
361 bVal = vld2q_f32((
const float*)(bPtr));
/* bAbs = b.val[0]^2 + b.val[1]^2, i.e. the squared magnitude of b. */
367 bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
368 bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
/* Approximate 1/bAbs: start from the hardware reciprocal estimate, then apply
 * two Newton-Raphson steps (vrecpsq_f32(x, d) computes 2 - x*d). The result is
 * a refined approximation, not an exactly rounded division. */
370 bAbsInv = vrecpeq_f32(bAbs);
371 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
372 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
/* First output component: (a0*b0 + a1*b1) / |b|^2. */
374 cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
375 cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
376 cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
/* Second output component: (a1*b0 - a0*b1) / |b|^2 (vmlsq subtracts the product). */
378 cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
379 cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
380 cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
/* Re-interleave the two components and write four points to the output. */
382 vst2q_f32((
float*)(cPtr), cVal);
/* Scalar tail: the remaining (num_points % 4) points use ordinary division. */
386 for (number = quarterPoints * 4; number < num_points; number++) {
387 *cPtr++ = (*aPtr++) / (*bPtr++);