#ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
#define INCLUDED_volk_16ic_x2_multiply_16ic_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        result[n] = in_a[n] * in_b[n];
    }
}
#endif /* LV_HAVE_GENERIC */
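
/*
 * Layout note for the SIMD protokernels below: each lv_16sc_t is a pair of
 * 16-bit integers stored interleaved in memory as (real, imag). For
 * a = ar + j*ai and b = br + j*bi the complex product is
 *
 *     (ar*br - ai*bi) + j*(ar*bi + ai*br).
 *
 * The SSE2/AVX2 kernels compute both terms directly on the interleaved layout
 * with element-wise 16-bit multiplies, 2-byte lane shifts and masks; the NEON
 * kernel de-interleaves first with vld2. Every 16-bit multiply keeps only the
 * low 16 bits of its product; the SSE2/AVX2 combine steps saturate while the
 * NEON combine wraps.
 */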
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++) {
        a = _mm_load_si128((__m128i*)_in_a); // four interleaved 16-bit (real, imag) pairs
        b = _mm_load_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b); // a0.r*b0.r, a0.i*b0.i, a1.r*b1.r, ...

        c_sr = _mm_srli_si128(c, 2); // shift right by 2 bytes, shifting in zeros
        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(real, mask_real); // a0.r*b0.r - a0.i*b0.i, 0, ...

        b_sl = _mm_slli_si128(b, 2); // 0, b0.r, b0.i, b1.r, ...
        a_sl = _mm_slli_si128(a, 2); // 0, a0.r, a0.i, a1.r, ...
        imag1 = _mm_mullo_epi16(a, b_sl); // _, a0.i*b0.r, _, a1.i*b1.r, ...
        imag2 = _mm_mullo_epi16(b, a_sl); // _, b0.i*a0.r, _, b1.i*a1.r, ...
        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag); // 0, a0.i*b0.r + a0.r*b0.i, 0, ...

        result = _mm_or_si128(real, imag);
        _mm_store_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */
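
/*
 * Worked example of one SSE2 iteration above, 16-bit lanes listed from the
 * lowest address: a register holds [a0r, a0i, a1r, a1i, a2r, a2i, a3r, a3i].
 *
 *   c    = mullo(a, b)               -> [a0r*b0r, a0i*b0i, a1r*b1r, a1i*b1i, ...]
 *   c_sr = srli_si128(c, 2)          -> [a0i*b0i, a1r*b1r, a1i*b1i, ..., 0]
 *   real = subs(c, c_sr) & mask_real -> [a0r*b0r - a0i*b0i, 0, a1r*b1r - a1i*b1i, 0, ...]
 *
 *   b_sl  = slli_si128(b, 2)         -> [0, b0r, b0i, b1r, ...]
 *   imag1 = mullo(a, b_sl)           -> [_, a0i*b0r, _, a1i*b1r, ...]
 *   imag2 = mullo(b, a_sl)           -> [_, b0i*a0r, _, b1i*a1r, ...]
 *   imag  = adds(imag1, imag2) & mask_imag -> [0, a0r*b0i + a0i*b0r, 0, ...]
 *
 * or(real, imag) re-assembles the interleaved products out[k] = a[k] * b[k].
 */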
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/* Unaligned variant: identical to the aligned kernel above except for the
 * unaligned loads and stores. */
static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++) {
        a = _mm_loadu_si128((__m128i*)_in_a);
        b = _mm_loadu_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b);

        c_sr = _mm_srli_si128(c, 2);
        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(real, mask_real);

        b_sl = _mm_slli_si128(b, 2);
        a_sl = _mm_slli_si128(a, 2);
        imag1 = _mm_mullo_epi16(a, b_sl);
        imag2 = _mm_mullo_epi16(b, a_sl);
        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag);

        result = _mm_or_si128(real, imag);
        _mm_storeu_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++) {
        a = _mm256_loadu_si256((__m256i*)_in_a); // eight interleaved (real, imag) pairs
        b = _mm256_loadu_si256((__m256i*)_in_b);
        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // shift right by 2 bytes within each 128-bit lane
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(real, mask_real);

        b_sl = _mm256_slli_si256(b, 2);
        a_sl = _mm256_slli_si256(a, 2);
        imag1 = _mm256_mullo_epi16(a, b_sl);
        imag2 = _mm256_mullo_epi16(b, a_sl);
        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag);

        result = _mm256_or_si256(real, imag);
        _mm256_storeu_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */
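
/*
 * Note on the AVX2 kernels: _mm256_srli_si256() and _mm256_slli_si256() shift
 * each 128-bit lane of the ymm register independently rather than the full
 * 256 bits. Since every (real, imag) pair lies within a single 128-bit lane
 * and the shift distance is only 2 bytes, the SSE2 recipe carries over
 * unchanged to 8 complex samples per iteration; the lanes that receive the
 * shifted-in zeros are exactly the ones discarded by mask_real / mask_imag.
 */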
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++) {
        a = _mm256_load_si256((__m256i*)_in_a); // eight interleaved (real, imag) pairs
        b = _mm256_load_si256((__m256i*)_in_b);
        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // shift right by 2 bytes within each 128-bit lane
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(real, mask_real);

        b_sl = _mm256_slli_si256(b, 2);
        a_sl = _mm256_slli_si256(a, 2);
        imag1 = _mm256_mullo_epi16(a, b_sl);
        imag2 = _mm256_mullo_epi16(b, a_sl);
        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag);

        result = _mm256_or_si256(real, imag);
        _mm256_store_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */
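
/*
 * The NEON protokernel below takes a different route: vld2_s16() de-interleaves
 * four complex samples into separate real and imaginary vectors, so ar*br,
 * ai*bi, ar*bi and ai*br can be formed directly and recombined with
 * vsub_s16()/vadd_s16() (modular, non-saturating) before vst2_s16()
 * re-interleaves the result. No masks or byte shifts are needed.
 */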
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    unsigned int quarter_points = num_points / 4;
    int16x4x2_t a_val, b_val, c_val;
    int16x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr); // de-interleave: a0r|a1r|a2r|a3r, a0i|a1i|a2i|a3i
        b_val = vld2_s16((int16_t*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); // ar*br
        tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); // ai*bi
        tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); // ar*bi
        tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); // ai*br

        c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]); // real part
        c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]); // imag part
        vst2_s16((int16_t*)out, c_val); // re-interleave and store

        a_ptr += 4;
        b_ptr += 4;
        out += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *out++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_16ic_x2_multiply_16ic_H */
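
/*
 * Minimal usage sketch (illustrative example, not part of the kernel itself).
 * It calls the public volk_16ic_x2_multiply_16ic() dispatcher on buffers
 * allocated with volk_malloc() at volk_get_alignment(), so VOLK can pick the
 * fastest aligned implementation available on the host.
 */
#if 0 /* example only, kept out of the build */
#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const unsigned int N = 4;
    size_t alignment = volk_get_alignment();
    lv_16sc_t* a = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* b = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* out = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), alignment);

    for (unsigned int i = 0; i < N; i++) {
        a[i] = lv_cmake((int16_t)(i + 1), (int16_t)2); /* (i+1) + 2j */
        b[i] = lv_cmake((int16_t)3, (int16_t)-1);      /*    3 - 1j  */
    }

    volk_16ic_x2_multiply_16ic(out, a, b, N); /* dispatcher selects a protokernel */

    for (unsigned int i = 0; i < N; i++) {
        printf("%d %+dj\n", (int)lv_creal(out[i]), (int)lv_cimag(out[i]));
    }

    volk_free(a);
    volk_free(b);
    volk_free(out);
    return 0;
}
#endif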