Vector Optimized Library of Kernels 3.1.2
Architecture-tuned implementations of math kernels
volk_16i_max_star_horizontal_16i.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H

#include <volk/volk_common.h>

#include <inttypes.h>
#include <stdio.h>

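/*
 * Overview (editorial note, not part of the original source): each kernel in
 * this file performs the same pairwise reduction over adjacent inputs,
 *
 *     target[i] = max(src0[2*i], src0[2*i + 1]),  i = 0 .. num_points/2 - 1,
 *
 * where the winner is decided by the sign of the wrapping int16 difference
 * src0[2*i] - src0[2*i + 1] (see the generic implementation at the bottom).
 * The max* (max-star) in the name presumably refers to the MAP/turbo-decoder
 * operator max*(a, b) = max(a, b) + log(1 + exp(-|a - b|)); these kernels
 * implement its max-log approximation, which drops the correction term.
 */
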
#ifdef LV_HAVE_SSSE3

#include <emmintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>

static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    // Byte-shuffle patterns: select the two bytes of 16-bit elements
    // 0, 2, 4, 6 (the first element of each adjacent pair); 0xff entries
    // make _mm_shuffle_epi8 write zero bytes.
    static const uint8_t shufmask0[16] = {
        0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
    };
    static const uint8_t shufmask1[16] = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d
    };
    // Adding 2 to a byte index in a shuffle mask advances it by one int16
    // element; these masks supply that +2 only where the comparison below
    // selects the second element of a pair.
    static const uint8_t andmask0[16] = {
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
    };
    static const uint8_t andmask1[16] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
    };
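    /*
     * How the masks are used (illustrative walk-through, not in the original
     * source): after _mm_hsub_epi16, lane k of the difference vector holds
     * first_k - second_k, and _mm_cmpgt_epi16 against zero turns each
     * negative lane into 0xffff. ANDing that with the per-byte 0x02 masks
     * and adding the result to a shuffle mask bumps byte indices {0,1} to
     * {2,3}, {4,5} to {6,7}, and so on, making _mm_shuffle_epi8 fetch the
     * second element of exactly those pairs where second > first. For
     * example, if lane 0 compares negative, mask bytes {0x00, 0x01} become
     * {0x02, 0x03} and the shuffle emits element 1 instead of element 0.
     */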

    __m128i xmm0 = {}, xmm1 = {}, xmm2 = {}, xmm3 = {}, xmm4 = {};
    __m128i xmm5 = {}, xmm6 = {}, xmm7 = {}, xmm8 = {};

    xmm4 = _mm_load_si128((__m128i*)shufmask0);
    xmm5 = _mm_load_si128((__m128i*)shufmask1);
    xmm6 = _mm_load_si128((__m128i*)andmask0);
    xmm7 = _mm_load_si128((__m128i*)andmask1);

    __m128i *p_target, *p_src0;

    p_target = (__m128i*)target;
    p_src0 = (__m128i*)src0;

    // Split the work: 32-byte (two-register) blocks, then at most one
    // 16-byte block, then up to 7 remaining int16 input elements.
    int bound = num_bytes >> 5;
    int intermediate = (num_bytes >> 4) & 1;
    int leftovers = (num_bytes >> 1) & 7;

    int i = 0;

    for (i = 0; i < bound; ++i) {
        xmm0 = _mm_load_si128(p_src0);
        xmm1 = _mm_load_si128(&p_src0[1]);

        xmm2 = _mm_xor_si128(xmm2, xmm2);
        p_src0 += 2;

        // Lane k holds the difference first_k - second_k of each pair.
        xmm3 = _mm_hsub_epi16(xmm0, xmm1);

        // 0xffff in the lanes where the second element of a pair is larger.
        xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);

        xmm8 = _mm_and_si128(xmm2, xmm6);
        xmm3 = _mm_and_si128(xmm2, xmm7);

        xmm8 = _mm_add_epi8(xmm8, xmm4);
        xmm3 = _mm_add_epi8(xmm3, xmm5);

        // Gather the winner of each pair: xmm0's four results land in the
        // low 8 bytes, xmm1's in the high 8 bytes, zeros elsewhere.
        xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
        xmm1 = _mm_shuffle_epi8(xmm1, xmm3);

        // Combine the two zero-padded halves into 8 results.
        xmm3 = _mm_add_epi16(xmm0, xmm1);

        _mm_store_si128(p_target, xmm3);

        p_target += 1;
    }

    if (intermediate) {
        xmm0 = _mm_load_si128(p_src0);

        xmm2 = _mm_xor_si128(xmm2, xmm2);
        p_src0 += 1;

        // Only the low half of the hsub result matters here; the high half
        // compares whatever xmm1 last held, but shufmask0's 0xff upper bytes
        // keep that garbage out of the shuffled result.
        xmm3 = _mm_hsub_epi16(xmm0, xmm1);
        xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);

        xmm8 = _mm_and_si128(xmm2, xmm6);

        xmm3 = _mm_add_epi8(xmm8, xmm4);

        xmm0 = _mm_shuffle_epi8(xmm0, xmm3);

        // Store only the low 8 bytes (4 results).
        _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);

        p_target = (__m128i*)((int8_t*)p_target + 8);
    }

    // Scalar tail: i indexes int16 input elements, starting after the
    // 16-element blocks (bound << 4) and the optional 8-element block
    // (intermediate << 3) handled above.
    for (i = (bound << 4) + (intermediate << 3);
         i < (bound << 4) + (intermediate << 3) + leftovers;
         i += 2) {
        target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
    }
}

#endif /*LV_HAVE_SSSE3*/

#ifdef LV_HAVE_NEON

#include <arm_neon.h>
static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target,
                                                         int16_t* src0,
                                                         unsigned int num_points)
{
    // Despite the name, each iteration consumes 16 input points and
    // produces 8 outputs.
    const unsigned int eighth_points = num_points / 16;
    unsigned number;
    int16x8x2_t input_vec;
    int16x8_t diff, max_vec, zeros;
    uint16x8_t comp1, comp2;
    zeros = vdupq_n_s16(0);
    for (number = 0; number < eighth_points; ++number) {
        // vld2q deinterleaves: val[0] holds the even-indexed (first-of-pair)
        // elements, val[1] the odd-indexed (second-of-pair) elements.
        input_vec = vld2q_s16(src0);
        //__VOLK_PREFETCH(src0+16);
        diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
        comp1 = vcgeq_s16(diff, zeros);
        comp2 = vcltq_s16(diff, zeros);

        // comp1 and comp2 are complementary all-ones/all-zeros masks, so
        // the masked add below selects one element from each pair.
        input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1);
        input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2);

        max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]);
        vst1q_s16(target, max_vec);
        src0 += 16;
        target += 8;
    }
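    /*
     * Design note (editorial): vbslq_s16(comp1, ...) or vmaxq_s16 could
     * replace the two ANDs and the add above, but a true vmaxq_s16 would
     * disagree with the generic kernel whenever the int16 difference wraps
     * (e.g. 32767 - (-32768)); the mask-and-add form reproduces the
     * difference-based comparison exactly.
     */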
    // src0 and target already point past the vectorized portion.
    for (number = 0; number < num_points % 16; number += 2) {
        target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0)
                                  ? src0[number]
                                  : src0[number + 1];
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_NEONV7
extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target,
                                                       int16_t* src0,
                                                       unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */

#ifdef LV_HAVE_GENERIC
static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = num_bytes >> 1; // number of int16 input elements

    for (i = 0; i < bound; i += 2) {
        target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
    }
}

#endif /*LV_HAVE_GENERIC*/
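
/*
 * Usage sketch (illustrative, not part of the original header). Callers
 * normally go through the VOLK dispatcher declared in <volk/volk.h> rather
 * than calling a protokernel directly; the buffer names below are
 * hypothetical.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int n = 64; // number of input points; output has n/2 points
 *   int16_t* in = (int16_t*)volk_malloc(n * sizeof(int16_t), volk_get_alignment());
 *   int16_t* out = (int16_t*)volk_malloc(n / 2 * sizeof(int16_t), volk_get_alignment());
 *   // ... fill in[0 .. n-1] ...
 *   volk_16i_max_star_horizontal_16i(out, in, n); // out[i] = max(in[2i], in[2i+1])
 *   volk_free(in);
 *   volk_free(out);
 */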

#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/