Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : #include "EbDefinitions.h"
7 : #include "smmintrin.h"
8 : #include "aom_dsp_rtcd.h"
// Smooths an 8-bit intra edge buffer in place using SSE4.1.
//
// p        - edge samples. p[1] .. p[sz - 1] are overwritten with filtered
//            values; p[0] keeps its value.
// sz       - number of samples in the edge.
// strength - 0 returns immediately. 1 and 2 select the 3-tap kernels (4,8,4)
//            and (5,6,5); 3 selects the 5-tap kernel (2,4,4,4,2). kern[] has
//            only three rows, so any other value indexes outside the table.
//
// Every kernel sums to 16, so each result is rounded as (sum + 8) >> 4.
//
// Memory touched outside p[0 .. sz - 1]: p[-1] is written, 16 bytes starting
// at p[sz] are written, and both loops issue 16-byte loads (from in, in + 16
// and out) that extend past the samples actually used. The caller's buffer
// has to cover all of that.
9 21266000 : void eb_av1_filter_intra_edge_sse4_1(uint8_t *p, int32_t sz, int32_t strength) {
10 21266000 : if (!strength) return;
11 :
// Kernel taps laid out to match the shuffled input: one 4-byte group per
// output for the 3-tap kernels, one 8-byte group per output for the 5-tap
// kernel. The zero taps pad each group to its full width.
12 : DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
13 : { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4
14 : { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5
15 : { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2
16 : };
17 :
// Byte-shuffle index tables.
//   row 0 / row 1: four overlapping 4-byte windows each, feeding outputs 0-3
//                  and 4-7 of the 3-tap path.
//   row 2:         two overlapping 8-byte windows, feeding outputs 0 and 1 of
//                  the 5-tap path (later outputs are derived by adding 2).
//   row 3:         the lane indices 0..15, used to build the tail mask.
// The array is declared with five rows but only four are initialised, and
// only rows 0-3 are referenced below.
18 : DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
19 : { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
20 : { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
21 : { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
22 : { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
23 : };
24 :
25 : // Extend the first and last samples to simplify the loop for the 5-tap case
26 14016900 : p[-1] = p[0];
27 14016900 : __m128i last = _mm_set1_epi8(p[sz - 1]);
// Replicates the last sample into the 16 bytes that follow the edge.
28 14016900 : _mm_storeu_si128((__m128i *)&p[sz], last);
29 :
30 : // Adjust input pointer for filter support area
// The 5-tap window for the output at p[1] starts at p[-1]; the 3-tap window
// for the same output starts at p[0].
31 14016900 : uint8_t *in = (strength == 3) ? p - 1 : p;
32 :
33 : // Avoid modifying first sample
34 14016900 : uint8_t *out = p + 1;
35 14016900 : int32_t len = sz - 1;
36 :
37 14016900 : const int32_t use_3tap_filter = (strength < 3);
38 :
39 14016900 : if (use_3tap_filter) {
40 13555600 : __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
41 6777830 : __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
42 6777810 : __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
43 6777790 : __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
// The first 16 input bytes are loaded before anything is stored to out.
44 6778310 : __m128i in0 = _mm_lddqu_si128((__m128i *)in);
// Each iteration produces up to 8 outputs.
45 17580500 : while (len > 0) {
46 10802100 : int32_t n_out = (len < 8) ? len : 8;
// Gather the 4-byte window of every output, multiply adjacent byte pairs by
// the kernel and add them (maddubs), then add the two partial sums of each
// output (hadd) to get eight 16-bit sums.
47 10802100 : __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
48 10802100 : __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
49 10802100 : d0 = _mm_maddubs_epi16(d0, coef0);
50 10802100 : d1 = _mm_maddubs_epi16(d1, coef0);
51 10802100 : d0 = _mm_hadd_epi16(d0, d1);
// Round and divide by the kernel sum of 16, then narrow to bytes.
52 10802100 : __m128i eight = _mm_set1_epi16(8);
53 10802100 : d0 = _mm_add_epi16(d0, eight);
54 10802100 : d0 = _mm_srai_epi16(d0, 4);
55 10802100 : d0 = _mm_packus_epi16(d0, d0);
// Tail handling: lanes with index < n_out take the filtered byte, the
// remaining lanes keep the bytes currently in memory.
56 10802300 : __m128i out0 = _mm_lddqu_si128((__m128i *)out);
57 21604600 : __m128i n0 = _mm_set1_epi8(n_out);
58 10802300 : __m128i mask = _mm_cmpgt_epi8(n0, iden);
59 10802300 : out0 = _mm_blendv_epi8(out0, d0, mask);
// Only the low 8 bytes are written back.
60 10802300 : _mm_storel_epi64((__m128i *)out, out0);
// Next window = upper half of the current register + 8 fresh bytes from
// in + 16. Reusing the register half matters because the filter runs in
// place: the store above has already overwritten part of that range in
// memory, while the register still holds the unfiltered values.
61 10802300 : __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
62 10802200 : in0 = _mm_alignr_epi8(in1, in0, 8);
63 10802200 : in += 8;
64 10802200 : out += 8;
65 10802200 : len -= n_out;
66 : }
67 : }
68 : else { // 5-tap filter
69 14493700 : __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
// shuf_a selects the 8-byte windows of outputs 0 and 1; adding 2 to every
// index moves on to outputs 2/3 (shuf_b), 4/5 (shuf_c) and 6/7 (shuf_d).
70 7254570 : __m128i two = _mm_set1_epi8(2);
71 7254920 : __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
72 7254920 : __m128i shuf_b = _mm_add_epi8(shuf_a, two);
73 7254920 : __m128i shuf_c = _mm_add_epi8(shuf_b, two);
74 7254920 : __m128i shuf_d = _mm_add_epi8(shuf_c, two);
75 7254900 : __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
// The first 16 input bytes are loaded before anything is stored to out.
76 7254840 : __m128i in0 = _mm_lddqu_si128((__m128i *)in);
// Each iteration produces up to 8 outputs.
77 31020800 : while (len > 0) {
78 23766200 : int32_t n_out = (len < 8) ? len : 8;
79 23766200 : __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
80 23766200 : __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
81 23766200 : __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
82 23766200 : __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
// maddubs leaves four 16-bit partial sums per output; two rounds of
// horizontal adds reduce them to one sum per output, in output order.
83 23766200 : d0 = _mm_maddubs_epi16(d0, coef0);
84 23766200 : d1 = _mm_maddubs_epi16(d1, coef0);
85 23766200 : d2 = _mm_maddubs_epi16(d2, coef0);
86 23766200 : d3 = _mm_maddubs_epi16(d3, coef0);
87 23766200 : d0 = _mm_hadd_epi16(d0, d1);
88 23766200 : d2 = _mm_hadd_epi16(d2, d3);
89 23766200 : d0 = _mm_hadd_epi16(d0, d2);
// Round and divide by the kernel sum of 16, then narrow to bytes.
90 23766200 : __m128i eight = _mm_set1_epi16(8);
91 23766200 : d0 = _mm_add_epi16(d0, eight);
92 23766200 : d0 = _mm_srai_epi16(d0, 4);
93 23766200 : d0 = _mm_packus_epi16(d0, d0);
// Tail handling: lanes with index < n_out take the filtered byte, the
// remaining lanes keep the bytes currently in memory.
94 23767000 : __m128i out0 = _mm_lddqu_si128((__m128i *)out);
95 47533900 : __m128i n0 = _mm_set1_epi8(n_out);
96 23767000 : __m128i mask = _mm_cmpgt_epi8(n0, iden);
97 23767000 : out0 = _mm_blendv_epi8(out0, d0, mask);
// Only the low 8 bytes are written back.
98 23767000 : _mm_storel_epi64((__m128i *)out, out0);
// Next window = upper half of the current register (still unfiltered, even
// where memory was just overwritten) + 8 fresh bytes from in + 16.
99 23767000 : __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
100 23766000 : in0 = _mm_alignr_epi8(in1, in0, 8);
101 23766000 : in += 8;
102 23766000 : out += 8;
103 23766000 : len -= n_out;
104 : }
105 : }
106 : }
107 :
// Smooths a 16-bit-per-sample intra edge buffer in place using SSE4.1.
//
// p        - edge samples. p[1] .. p[sz - 1] are overwritten with filtered
//            values; p[0] keeps its value.
// sz       - number of samples in the edge.
// strength - 0 returns immediately. 1 and 2 select the 3-tap kernels (4,8,4)
//            and (5,6,5); 3 selects the 5-tap kernel (2,4,4,4,2). kern[] has
//            only three rows, so any other value indexes outside the table.
//
// Every kernel sums to 16, so each result is rounded as (sum + 8) >> 4.
//
// Memory touched outside p[0 .. sz - 1]: p[-1] is written, 8 samples starting
// at p[sz] are written, the loops load 8-sample vectors ahead of the samples
// in use, and every iteration stores a full 8-sample vector to out (lanes
// past the tail are stored back with the value just loaded from them). The
// caller's buffer has to cover all of that.
108 0 : void eb_av1_filter_intra_edge_high_sse4_1(uint16_t *p, int32_t sz, int32_t strength) {
109 0 : if (!strength) return;
110 :
// Each row is an (outer, inner) coefficient pair repeated four times. The
// kernels are symmetric, so equally weighted taps are summed first and each
// sum is multiplied once: 3-tap = (a + c) * outer + b * inner,
// 5-tap = (a + e) * outer + (b + c + d) * inner.
111 : DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
112 : { 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4
113 : { 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5
114 : { 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2
115 : };
116 :
// Lane indices 0..7, compared against n_out to build the tail mask.
117 : DECLARE_ALIGNED(16, static const int16_t,
118 : v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
119 :
120 : // Extend the first and last samples to simplify the loop for the 5-tap case
121 0 : p[-1] = p[0];
122 0 : __m128i last = _mm_set1_epi16(p[sz - 1]);
// Replicates the last sample into the 8 samples that follow the edge.
123 0 : _mm_storeu_si128((__m128i *)&p[sz], last);
124 :
125 : // Adjust input pointer for filter support area
// The 5-tap window for the output at p[1] starts at p[-1]; the 3-tap window
// for the same output starts at p[0].
126 0 : uint16_t *in = (strength == 3) ? p - 1 : p;
127 :
128 : // Avoid modifying first sample
129 0 : uint16_t *out = p + 1;
130 0 : int32_t len = sz - 1;
131 :
132 0 : const int32_t use_3tap_filter = (strength < 3);
133 :
134 0 : if (use_3tap_filter) {
135 0 : __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
136 0 : __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
// Sixteen input samples are held in registers before anything is stored.
137 0 : __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
138 0 : __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
// Each iteration produces up to 8 outputs.
139 0 : while (len > 0) {
140 0 : int32_t n_out = (len < 8) ? len : 8;
// in1 / in2 are the input shifted by one and two samples.
141 0 : __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
142 0 : __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
143 0 : __m128i in02 = _mm_add_epi16(in0, in2);
// Interleave (outer sum, centre tap) per output, multiply by the
// (outer, inner) coefficients and add each pair.
144 0 : __m128i d0 = _mm_unpacklo_epi16(in02, in1);
145 0 : __m128i d1 = _mm_unpackhi_epi16(in02, in1);
146 0 : d0 = _mm_mullo_epi16(d0, coef0);
147 0 : d1 = _mm_mullo_epi16(d1, coef0);
148 0 : d0 = _mm_hadd_epi16(d0, d1);
// Round and divide by 16; the shift is logical, so the 16-bit sums are
// treated as unsigned.
149 0 : __m128i eight = _mm_set1_epi16(8);
150 0 : d0 = _mm_add_epi16(d0, eight);
151 0 : d0 = _mm_srli_epi16(d0, 4);
// Tail handling: lanes with index < n_out take the filtered sample, the
// remaining lanes are written back unchanged.
152 0 : __m128i out0 = _mm_lddqu_si128((__m128i *)out);
153 0 : __m128i n0 = _mm_set1_epi16(n_out);
154 0 : __m128i mask = _mm_cmpgt_epi16(n0, iden);
155 0 : out0 = _mm_blendv_epi8(out0, d0, mask);
156 : _mm_storeu_si128((__m128i *)out, out0);
// Advance by 8 samples. in0 takes the vector loaded earlier (so it still
// holds unfiltered values), and a fresh look-ahead vector is loaded.
157 0 : in += 8;
158 0 : in0 = in8;
159 0 : in8 = _mm_lddqu_si128((__m128i *)&in[8]);
160 0 : out += 8;
161 0 : len -= n_out;
162 : }
163 : }
164 : else { // 5-tap filter
165 0 : __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
166 0 : __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
// Sixteen input samples are held in registers before anything is stored.
167 0 : __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
168 0 : __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
// Each iteration produces up to 8 outputs.
169 0 : while (len > 0) {
170 0 : int32_t n_out = (len < 8) ? len : 8;
// in1 .. in4 are the input shifted by one to four samples.
171 0 : __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
172 0 : __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
173 0 : __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
174 0 : __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
// Outer taps share coefficient 2, the three middle taps share coefficient 4.
175 0 : __m128i in04 = _mm_add_epi16(in0, in4);
176 0 : __m128i in123 = _mm_add_epi16(in1, in2);
177 0 : in123 = _mm_add_epi16(in123, in3);
// Interleave (outer sum, middle sum) per output, multiply by the
// (outer, inner) coefficients and add each pair.
178 0 : __m128i d0 = _mm_unpacklo_epi16(in04, in123);
179 0 : __m128i d1 = _mm_unpackhi_epi16(in04, in123);
180 0 : d0 = _mm_mullo_epi16(d0, coef0);
181 0 : d1 = _mm_mullo_epi16(d1, coef0);
182 0 : d0 = _mm_hadd_epi16(d0, d1);
// Round and divide by 16; the shift is logical, so the 16-bit sums are
// treated as unsigned.
183 0 : __m128i eight = _mm_set1_epi16(8);
184 0 : d0 = _mm_add_epi16(d0, eight);
185 0 : d0 = _mm_srli_epi16(d0, 4);
// Tail handling: lanes with index < n_out take the filtered sample, the
// remaining lanes are written back unchanged.
186 0 : __m128i out0 = _mm_lddqu_si128((__m128i *)out);
187 0 : __m128i n0 = _mm_set1_epi16(n_out);
188 0 : __m128i mask = _mm_cmpgt_epi16(n0, iden);
189 0 : out0 = _mm_blendv_epi8(out0, d0, mask);
190 : _mm_storeu_si128((__m128i *)out, out0);
// Advance by 8 samples. in0 takes the vector loaded earlier (so it still
// holds unfiltered values), and a fresh look-ahead vector is loaded.
191 0 : in += 8;
192 0 : in0 = in8;
193 0 : in8 = _mm_lddqu_si128((__m128i *)&in[8]);
194 0 : out += 8;
195 0 : len -= n_out;
196 : }
197 : }
198 : }
199 :
// Doubles the sample density of an 8-bit intra edge in place using SSE4.1.
//
// p  - edge samples; p[-1] is the upper-left sample and must be valid.
// sz - number of edge samples, asserted to be at most 24.
//
// Output is written starting at p[-2] and alternates a copied input sample
// with an interpolated half-sample value:
//   new p[2k]     = old p[k]
//   new p[2k - 1] = (-old p[k-2] + 9*old p[k-1] + 9*old p[k] - old p[k+1] + 8) >> 4,
//                   saturated to 0..255
//
// Memory touched outside p[0 .. sz - 1]: p[-2] and p[sz] are written with
// replicated end samples, 32 bytes starting at p[-2] are loaded, and each
// loop iteration stores 32 bytes. The caller's buffer has to cover all of
// that.
200 5947590 : void eb_av1_upsample_intra_edge_sse4_1(uint8_t *p, int32_t sz) {
201 : // interpolate half-sample positions
202 : assert(sz <= 24);
203 :
// 4-tap interpolation kernel (-1, 9, 9, -1), repeated once per output.
204 : DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
205 : { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
206 : };
207 :
// Byte-shuffle indices gathering four overlapping 4-byte windows per row:
// row 0 for outputs 0-3, row 1 for outputs 4-7.
208 : DECLARE_ALIGNED(16, static const int8_t, v_const[2][16]) = {
209 : { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
210 : { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
211 : };
212 :
213 : // Extend first/last samples (upper-left p[-1], last p[sz-1])
214 : // to support 4-tap filter
215 5947590 : p[-2] = p[-1];
216 5947590 : p[sz] = p[sz - 1];
217 :
// Input and output share the same start address; the expansion is in place.
218 5947590 : uint8_t *in = &p[-2];
219 5947590 : uint8_t *out = &p[-2];
220 :
221 5947590 : int32_t n = sz + 1; // Input length including upper-left sample
222 :
// Both input vectors are loaded before the loop, so the stores inside it
// cannot disturb the source samples.
223 5949060 : __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
224 11898100 : __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
225 :
226 5949040 : __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
227 5949030 : __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
228 5949020 : __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
229 :
// Each iteration consumes 16 input samples and emits 32 output bytes. With
// sz <= 24, n is at most 25, so the loop runs at most twice.
230 12744900 : while (n > 0) {
// in8 is the input shifted by 8 bytes, used for outputs 8-15.
231 6795890 : __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
232 6795890 : __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
233 6795890 : __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
234 6795890 : __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
235 6795890 : __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
// Multiply-add adjacent byte pairs by the kernel, then add the two partial
// sums of each output.
236 6795890 : d0 = _mm_maddubs_epi16(d0, coef0);
237 6795890 : d1 = _mm_maddubs_epi16(d1, coef0);
238 6795890 : d2 = _mm_maddubs_epi16(d2, coef0);
239 6795890 : d3 = _mm_maddubs_epi16(d3, coef0);
240 6795890 : d0 = _mm_hadd_epi16(d0, d1);
241 6795890 : d2 = _mm_hadd_epi16(d2, d3);
// Round, divide by the kernel sum of 16 (arithmetic shift: sums can be
// negative), and saturate to unsigned bytes.
242 6795890 : __m128i eight = _mm_set1_epi16(8);
243 6795890 : d0 = _mm_add_epi16(d0, eight);
244 6795890 : d2 = _mm_add_epi16(d2, eight);
245 6795890 : d0 = _mm_srai_epi16(d0, 4);
246 6795890 : d2 = _mm_srai_epi16(d2, 4);
247 6795890 : d0 = _mm_packus_epi16(d0, d2);
// in1 is the input shifted by one byte; interleaving it with the
// interpolated values yields copied sample, half sample, copied sample, ...
248 6795890 : __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
249 6795890 : __m128i out0 = _mm_unpacklo_epi8(in1, d0);
250 6795890 : __m128i out1 = _mm_unpackhi_epi8(in1, d0);
251 : _mm_storeu_si128((__m128i *)&out[0], out0);
252 6795890 : _mm_storeu_si128((__m128i *)&out[16], out1);
// The second pass works on the vector loaded up front; nothing further is
// read from memory, so the look-ahead register is cleared.
253 6795890 : in0 = in16;
254 6795890 : in16 = _mm_setzero_si128();
255 6795890 : out += 32;
256 6795890 : n -= 16;
257 : }
258 5949020 : }
|