74 #ifndef INCLUDED_volk_32fc_x2_add_32fc_u_H
75 #define INCLUDED_volk_32fc_x2_add_32fc_u_H
78 #include <immintrin.h>
/*
 * Element-wise addition of two input buffers into an output buffer, using
 * 256-bit AVX intrinsics with unaligned loads and stores.
 *
 * num_points: number of elements to add.
 *
 * NOTE(review): the function name, the other parameters and the declarations
 * of aPtr/bPtr/cPtr are not visible in this listing. Presumably the element
 * type is a complex float (two floats per point), which would make one
 * 8-float register hold 4 points -- confirm against the full file.
 */
83 unsigned int num_points)
85 unsigned int number = 0;
/* Number of iterations of the SIMD loop below. */
86 const unsigned int quarterPoints = num_points / 4;
92 __m256 aVal, bVal, cVal;
93 for (; number < quarterPoints; number++) {
/* loadu: 256-bit (8 float) loads with no alignment requirement. */
95 aVal = _mm256_loadu_ps((
float*)aPtr);
96 bVal = _mm256_loadu_ps((
float*)bPtr);
/* Lane-wise float addition of the two registers. */
98 cVal = _mm256_add_ps(aVal, bVal);
/* Unaligned store to the output; the call is truncated in this listing. */
100 _mm256_storeu_ps((
float*)cPtr,
/* Scalar tail: handles indices quarterPoints * 4 .. num_points - 1. */
108 number = quarterPoints * 4;
109 for (; number < num_points; number++) {
110 *cPtr++ = (*aPtr++) + (*bPtr++);
117 #include <immintrin.h>
/*
 * Element-wise addition of two input buffers into an output buffer, using
 * 256-bit AVX intrinsics with aligned loads and stores.
 *
 * num_points: number of elements to add.
 *
 * _mm256_load_ps / _mm256_store_ps require 32-byte-aligned addresses, so the
 * buffers behind aPtr, bPtr and cPtr must be aligned accordingly.
 *
 * NOTE(review): the function name, the other parameters and the pointer
 * declarations are not visible in this listing -- confirm against the full
 * file.
 */
122 unsigned int num_points)
124 unsigned int number = 0;
/* Number of iterations of the SIMD loop below. */
125 const unsigned int quarterPoints = num_points / 4;
131 __m256 aVal, bVal, cVal;
132 for (; number < quarterPoints; number++) {
/* Aligned 256-bit (8 float) loads from both inputs. */
134 aVal = _mm256_load_ps((
float*)aPtr);
135 bVal = _mm256_load_ps((
float*)bPtr);
/* Lane-wise float addition of the two registers. */
137 cVal = _mm256_add_ps(aVal, bVal);
/* Aligned store to the output; the call is truncated in this listing. */
139 _mm256_store_ps((
float*)cPtr,
/* Scalar tail: handles indices quarterPoints * 4 .. num_points - 1. */
147 number = quarterPoints * 4;
148 for (; number < num_points; number++) {
149 *cPtr++ = (*aPtr++) + (*bPtr++);
156 #include <xmmintrin.h>
/*
 * Element-wise addition of two input buffers into an output buffer, using
 * 128-bit SSE intrinsics with unaligned loads and stores.
 *
 * num_points: number of elements to add.
 *
 * NOTE(review): the function name, the other parameters and the pointer
 * declarations are not visible in this listing. Presumably each element is
 * two floats, so one 4-float register holds 2 points -- confirm.
 */
161 unsigned int num_points)
163 unsigned int number = 0;
/* Number of iterations of the SIMD loop below. */
164 const unsigned int halfPoints = num_points / 2;
170 __m128 aVal, bVal, cVal;
171 for (; number < halfPoints; number++) {
/* loadu: 128-bit (4 float) loads with no alignment requirement. */
173 aVal = _mm_loadu_ps((
float*)aPtr);
174 bVal = _mm_loadu_ps((
float*)bPtr);
/* Lane-wise float addition of the two registers. */
176 cVal = _mm_add_ps(aVal, bVal);
/* Unaligned store of the four sums to the output. */
178 _mm_storeu_ps((
float*)cPtr, cVal);
/* Scalar tail: handles indices halfPoints * 2 .. num_points - 1. */
185 number = halfPoints * 2;
186 for (; number < num_points; number++) {
187 *cPtr++ = (*aPtr++) + (*bPtr++);
193 #ifdef LV_HAVE_GENERIC
/*
 * Plain scalar element-wise addition: one output element per iteration, with
 * no SIMD intrinsics.
 *
 * num_points: number of elements to add.
 *
 * NOTE(review): the function name, the other parameters and the declarations
 * of aPtr/bPtr/cPtr are not visible in this listing.
 */
198 unsigned int num_points)
/* This initialiser is redundant: the for statement assigns 0 again. */
203 unsigned int number = 0;
205 for (number = 0; number < num_points; number++) {
/* Add one element from each input and advance all three pointers. */
206 *cPtr++ = (*aPtr++) + (*bPtr++);
213 #include <xmmintrin.h>
/*
 * Element-wise addition of two input buffers into an output buffer, using
 * 128-bit SSE intrinsics with aligned loads and stores.
 *
 * num_points: number of elements to add.
 *
 * _mm_load_ps / _mm_store_ps require 16-byte-aligned addresses, so the
 * buffers behind aPtr, bPtr and cPtr must be aligned accordingly.
 *
 * NOTE(review): the function name, the other parameters and the pointer
 * declarations are not visible in this listing -- confirm against the full
 * file.
 */
218 unsigned int num_points)
220 unsigned int number = 0;
/* Number of iterations of the SIMD loop below. */
221 const unsigned int halfPoints = num_points / 2;
227 __m128 aVal, bVal, cVal;
228 for (; number < halfPoints; number++) {
/* Aligned 128-bit (4 float) loads from both inputs. */
229 aVal = _mm_load_ps((
float*)aPtr);
230 bVal = _mm_load_ps((
float*)bPtr);
/* Lane-wise float addition of the two registers. */
232 cVal = _mm_add_ps(aVal, bVal);
/* Aligned store of the four sums to the output. */
234 _mm_store_ps((
float*)cPtr, cVal);
/* Scalar tail: handles indices halfPoints * 2 .. num_points - 1. */
241 number = halfPoints * 2;
242 for (; number < num_points; number++) {
243 *cPtr++ = (*aPtr++) + (*bPtr++);
250 #include <arm_neon.h>
/*
 * Element-wise addition of two input buffers into an output buffer, using
 * 128-bit ARM NEON intrinsics.
 *
 * num_points: number of elements to add.
 *
 * NOTE(review): the function name, the other parameters and the pointer
 * declarations are not visible in this listing. Presumably each element is
 * two floats, so one float32x4_t holds 2 points -- confirm.
 */
255 unsigned int num_points)
257 unsigned int number = 0;
/* Number of iterations of the SIMD loop below. */
258 const unsigned int halfPoints = num_points / 2;
263 float32x4_t aVal, bVal, cVal;
264 for (number = 0; number < halfPoints; number++) {
/* Load four float32 lanes from each input. */
266 aVal = vld1q_f32((
const float32_t*)(aPtr));
267 bVal = vld1q_f32((
const float32_t*)(bPtr));
/* Lane-wise float addition of the two registers. */
272 cVal = vaddq_f32(aVal, bVal);
/* Store the four sums to the output. */
274 vst1q_f32((
float*)(cPtr), cVal);
/* Scalar tail: handles indices halfPoints * 2 .. num_points - 1. */
281 number = halfPoints * 2;
282 for (; number < num_points; number++) {
283 *cPtr++ = (*aPtr++) + (*bPtr++);