Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : #include "EbAvcStyleMcp_SSSE3.h"
7 :
8 : #include "EbMcp_SSE2.h"
9 : #include "EbDefinitions.h"
10 : #include "EbAvcStyleMcp_SSE2.h"
11 :
12 : #include "emmintrin.h"
13 : #include "tmmintrin.h"
14 :
/*
 * 4-tap luma interpolation coefficients, pre-arranged for _mm_maddubs_epi16.
 *
 * The table is six rows of 16 bytes. Each row holds one pair of signed
 * coefficients repeated eight times, so that a single multiply-add against
 * byte-interleaved pixels applies two taps to eight pixels at once.
 *
 * The horizontal and vertical filters in this file read the rows at byte
 * offsets (frac_pos << 5) - 32 (taps 0/1) and (frac_pos << 5) - 16 (taps 2/3):
 *   frac_pos 1 -> rows 0,1 : -1, 25,  9, -1
 *   frac_pos 2 -> rows 2,3 : -2, 18, 18, -2
 *   frac_pos 3 -> rows 4,5 : -1,  9, 25, -1
 * Every tap set sums to 32, matching the +16 rounding offset and the
 * arithmetic shift by 5 used by those filters.
 *
 * The rows are fetched with _mm_load_si128, so the 16-byte alignment
 * requested here is required, and frac_pos == 0 would index before the
 * start of the table.
 */
15 : EB_EXTERN EB_ALIGN(16) const int8_t AvcStyleLumaIFCoeff8_SSSE3[] = {
16 : -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25,
17 : 9, -1, 9, -1, 9, -1, 9, -1, 9, -1, 9, -1, 9, -1, 9, -1,
18 : -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18,
19 : 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2,
20 : -1, 9, -1, 9, -1, 9, -1, 9, -1, 9, -1, 9, -1, 9, -1, 9,
21 : 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1
22 : };
23 :
24 0 : void avc_style_luma_interpolation_filter_pose_ssse3(
25 : EbByte ref_pic,
26 : uint32_t src_stride,
27 : EbByte dst,
28 : uint32_t dst_stride,
29 : uint32_t pu_width,
30 : uint32_t pu_height,
31 : EbByte temp_buf,
32 : EbBool skip,
33 : uint32_t frac_pos)
34 : {
35 0 : uint32_t tempBufSize = pu_width * pu_height;
36 : (void)frac_pos;
37 0 : avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic, src_stride, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2);
38 0 : avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic, src_stride, temp_buf + tempBufSize, pu_width, pu_width, pu_height, 0, skip, 2);
39 0 : picture_average_kernel_sse2(temp_buf, pu_width, temp_buf + tempBufSize, pu_width, dst, dst_stride, pu_width, pu_height);
40 0 : }
41 :
42 0 : void avc_style_luma_interpolation_filter_posf_ssse3(
43 : EbByte ref_pic,
44 : uint32_t src_stride,
45 : EbByte dst,
46 : uint32_t dst_stride,
47 : uint32_t pu_width,
48 : uint32_t pu_height,
49 : EbByte temp_buf,
50 : EbBool skip,
51 : uint32_t frac_pos)
52 : {
53 0 : uint32_t tempBufSize = pu_width * pu_height;
54 : (void)frac_pos;
55 0 : avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic - src_stride, src_stride, temp_buf + tempBufSize, pu_width, pu_width, skip ? (2 * pu_height + 3) : (pu_height + 3), 0, EB_FALSE, 2);
56 0 : avc_style_luma_interpolation_filter_vertical_ssse3_intrin(temp_buf + tempBufSize + pu_width, pu_width, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2);
57 0 : picture_average_kernel_sse2(temp_buf + tempBufSize + pu_width, skip ? 2 * pu_width : pu_width, temp_buf, pu_width, dst, dst_stride, pu_width, pu_height);
58 0 : }
59 :
60 0 : void avc_style_luma_interpolation_filter_posg_ssse3(
61 : EbByte ref_pic,
62 : uint32_t src_stride,
63 : EbByte dst,
64 : uint32_t dst_stride,
65 : uint32_t pu_width,
66 : uint32_t pu_height,
67 : EbByte temp_buf,
68 : EbBool skip,
69 : uint32_t frac_pos)
70 : {
71 0 : uint32_t tempBufSize = pu_width * pu_height;
72 : (void)frac_pos;
73 0 : avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic, src_stride, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2);
74 0 : avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic + 1, src_stride, temp_buf + tempBufSize, pu_width, pu_width, pu_height, 0, skip, 2);
75 0 : picture_average_kernel_sse2(temp_buf, pu_width, temp_buf + tempBufSize, pu_width, dst, dst_stride, pu_width, pu_height);
76 0 : }
77 :
78 0 : void avc_style_luma_interpolation_filter_posi_ssse3(
79 : EbByte ref_pic,
80 : uint32_t src_stride,
81 : EbByte dst,
82 : uint32_t dst_stride,
83 : uint32_t pu_width,
84 : uint32_t pu_height,
85 : EbByte temp_buf,
86 : EbBool skip,
87 : uint32_t frac_pos)
88 : {
89 0 : uint32_t tempBufSize = pu_width * pu_height;
90 : (void)frac_pos;
91 0 : avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic, src_stride, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2);
92 0 : avc_style_luma_interpolation_filter_posj_ssse3(ref_pic, src_stride, temp_buf + tempBufSize, pu_width, pu_width, pu_height, temp_buf + 2 * tempBufSize, skip, 2);
93 0 : picture_average_kernel_sse2(temp_buf, pu_width, temp_buf + tempBufSize, pu_width, dst, dst_stride, pu_width, pu_height);
94 0 : }
95 :
96 0 : void avc_style_luma_interpolation_filter_posj_ssse3(
97 : EbByte ref_pic,
98 : uint32_t src_stride,
99 : EbByte dst,
100 : uint32_t dst_stride,
101 : uint32_t pu_width,
102 : uint32_t pu_height,
103 : EbByte temp_buf,
104 : EbBool skip,
105 : uint32_t frac_pos)
106 : {
107 : (void)frac_pos;
108 0 : if (skip)
109 0 : avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic - src_stride, src_stride, temp_buf, pu_width, pu_width, (pu_height + 3), 0, EB_FALSE, 2);
110 : else
111 0 : avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic - src_stride, src_stride, temp_buf, pu_width, pu_width, skip ? (2 * pu_height + 3) : (pu_height + 3), 0, EB_FALSE, 2);
112 :
113 0 : avc_style_luma_interpolation_filter_vertical_ssse3_intrin(temp_buf + pu_width, pu_width, dst, dst_stride, pu_width, pu_height, 0, skip, 2);
114 0 : }
115 :
116 0 : void avc_style_luma_interpolation_filter_posk_ssse3(
117 : EbByte ref_pic,
118 : uint32_t src_stride,
119 : EbByte dst,
120 : uint32_t dst_stride,
121 : uint32_t pu_width,
122 : uint32_t pu_height,
123 : EbByte temp_buf,
124 : EbBool skip,
125 : uint32_t frac_pos)
126 : {
127 0 : uint32_t tempBufSize = pu_width * pu_height;
128 : (void)frac_pos;
129 0 : avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic + 1, src_stride, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2);
130 0 : avc_style_luma_interpolation_filter_posj_ssse3(ref_pic, src_stride, temp_buf + tempBufSize, pu_width, pu_width, pu_height, temp_buf + 2 * tempBufSize, skip, 2);
131 0 : picture_average_kernel_sse2(temp_buf, pu_width, temp_buf + tempBufSize, pu_width, dst, dst_stride, pu_width, pu_height);
132 0 : }
133 :
134 0 : void avc_style_luma_interpolation_filter_posp_ssse3(
135 : EbByte ref_pic,
136 : uint32_t src_stride,
137 : EbByte dst,
138 : uint32_t dst_stride,
139 : uint32_t pu_width,
140 : uint32_t pu_height,
141 : EbByte temp_buf,
142 : EbBool skip,
143 : uint32_t frac_pos)
144 : {
145 0 : uint32_t tempBufSize = pu_width * pu_height;
146 : (void)frac_pos;
147 0 : avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic, src_stride, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2);
148 0 : avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic + src_stride, src_stride, temp_buf + tempBufSize, pu_width, pu_width, pu_height, 0, skip, 2);
149 0 : picture_average_kernel_sse2(temp_buf, pu_width, temp_buf + tempBufSize, pu_width, dst, dst_stride, pu_width, pu_height);
150 0 : }
151 :
152 0 : void avc_style_luma_interpolation_filter_posq_ssse3(
153 : EbByte ref_pic,
154 : uint32_t src_stride,
155 : EbByte dst,
156 : uint32_t dst_stride,
157 : uint32_t pu_width,
158 : uint32_t pu_height,
159 : EbByte temp_buf,
160 : EbBool skip,
161 : uint32_t frac_pos)
162 : {
163 0 : uint32_t tempBufSize = pu_width * pu_height;
164 : (void)frac_pos;
165 0 : avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic - src_stride, src_stride, temp_buf + tempBufSize, pu_width, pu_width, skip ? (2 * pu_height + 3) : (pu_height + 3), 0, EB_FALSE, 2);
166 0 : avc_style_luma_interpolation_filter_vertical_ssse3_intrin(temp_buf + tempBufSize + pu_width, pu_width, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2);
167 0 : picture_average_kernel_sse2(temp_buf + tempBufSize + 2 * pu_width, skip ? 2 * pu_width : pu_width, temp_buf, pu_width, dst, dst_stride, pu_width, pu_height);
168 0 : }
169 :
170 0 : void avc_style_luma_interpolation_filter_posr_ssse3(
171 : EbByte ref_pic,
172 : uint32_t src_stride,
173 : EbByte dst,
174 : uint32_t dst_stride,
175 : uint32_t pu_width,
176 : uint32_t pu_height,
177 : EbByte temp_buf,
178 : EbBool skip,
179 : uint32_t frac_pos)
180 : {
181 0 : uint32_t tempBufSize = pu_width * pu_height;
182 : (void)frac_pos;
183 0 : avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic + 1, src_stride, temp_buf, pu_width, pu_width, pu_height, 0, skip, 2);
184 0 : avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic + src_stride, src_stride, temp_buf + tempBufSize, pu_width, pu_width, pu_height, 0, skip, 2);
185 0 : picture_average_kernel_sse2(temp_buf, pu_width, temp_buf + tempBufSize, pu_width, dst, dst_stride, pu_width, pu_height);
186 0 : }
187 :
188 26580 : void avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(
189 : EbByte ref_pic,
190 : uint32_t src_stride,
191 : EbByte dst,
192 : uint32_t dst_stride,
193 : uint32_t pu_width,
194 : uint32_t pu_height,
195 : EbByte temp_buf,
196 : EbBool skip,
197 : uint32_t frac_pos)
198 : {
199 : (void)temp_buf;
200 : __m128i IFOffset, IFCoeff_1_0, IFCoeff_3_2, sum_clip_U8;
201 : uint32_t width_cnt, height_cnt;
202 26580 : uint32_t IFShift = 5;
203 :
204 26580 : src_stride <<= skip;
205 26580 : dst_stride <<= skip;
206 26580 : pu_height >>= skip;
207 26580 : frac_pos <<= 5;
208 26580 : IFOffset = _mm_set1_epi16(0x0010);
209 26580 : IFCoeff_1_0 = _mm_load_si128((__m128i *)(AvcStyleLumaIFCoeff8_SSSE3 + frac_pos - 32));
210 26580 : IFCoeff_3_2 = _mm_load_si128((__m128i *)(AvcStyleLumaIFCoeff8_SSSE3 + frac_pos - 16));
211 :
212 26580 : if (!(pu_width & 15)) { // 16x
213 : __m128i ref0, ref1, ref2, ref3, ref01_lo, ref01_hi, ref23_lo, ref23_hi, sum_lo, sum_hi;
214 :
215 0 : for (height_cnt = 0; height_cnt < pu_height; ++height_cnt) {
216 0 : for (width_cnt = 0; width_cnt < pu_width; width_cnt += 16) {
217 0 : ref0 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt - 1));
218 0 : ref1 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt));
219 0 : ref2 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt + 1));
220 0 : ref3 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt + 2));
221 :
222 0 : ref01_lo = _mm_unpacklo_epi8(ref0, ref1);
223 0 : ref01_hi = _mm_unpackhi_epi8(ref0, ref1);
224 0 : ref23_lo = _mm_unpacklo_epi8(ref2, ref3);
225 0 : ref23_hi = _mm_unpackhi_epi8(ref2, ref3);
226 :
227 0 : sum_lo = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(ref01_lo, IFCoeff_1_0), _mm_maddubs_epi16(ref23_lo, IFCoeff_3_2)), IFOffset), IFShift);
228 0 : sum_hi = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(ref01_hi, IFCoeff_1_0), _mm_maddubs_epi16(ref23_hi, IFCoeff_3_2)), IFOffset), IFShift);
229 0 : sum_clip_U8 = _mm_packus_epi16(sum_lo, sum_hi);
230 0 : _mm_storeu_si128((__m128i *)(dst + width_cnt), sum_clip_U8);
231 : }
232 0 : ref_pic += src_stride;
233 0 : dst += dst_stride;
234 : }
235 : //do the last row if sub-pred ON.
236 0 : if (skip) {
237 0 : ref_pic -= (src_stride >> 1);
238 0 : dst -= (dst_stride >> 1);
239 0 : for (width_cnt = 0; width_cnt < pu_width; width_cnt += 16) {
240 0 : ref0 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt - 1));
241 0 : ref1 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt));
242 0 : ref2 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt + 1));
243 0 : ref3 = _mm_loadu_si128((__m128i *)(ref_pic + width_cnt + 2));
244 :
245 0 : ref01_lo = _mm_unpacklo_epi8(ref0, ref1);
246 0 : ref01_hi = _mm_unpackhi_epi8(ref0, ref1);
247 0 : ref23_lo = _mm_unpacklo_epi8(ref2, ref3);
248 0 : ref23_hi = _mm_unpackhi_epi8(ref2, ref3);
249 :
250 0 : sum_lo = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(ref01_lo, IFCoeff_1_0), _mm_maddubs_epi16(ref23_lo, IFCoeff_3_2)), IFOffset), IFShift);
251 0 : sum_hi = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(ref01_hi, IFCoeff_1_0), _mm_maddubs_epi16(ref23_hi, IFCoeff_3_2)), IFOffset), IFShift);
252 0 : sum_clip_U8 = _mm_packus_epi16(sum_lo, sum_hi);
253 0 : _mm_storeu_si128((__m128i *)(dst + width_cnt), sum_clip_U8);
254 : }
255 : }
256 : }
257 : else { //8x
258 : __m128i sum01, sum23, sum;
259 :
260 4125730 : for (height_cnt = 0; height_cnt < pu_height; ++height_cnt) {
261 97917100 : for (width_cnt = 0; width_cnt < pu_width; width_cnt += 8) {
262 281454000 : sum01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ref_pic + width_cnt - 1)),
263 93817900 : _mm_loadl_epi64((__m128i *)(ref_pic + width_cnt))), IFCoeff_1_0);
264 :
265 281454000 : sum23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ref_pic + width_cnt + 1)),
266 93817900 : _mm_loadl_epi64((__m128i *)(ref_pic + width_cnt + 2))), IFCoeff_3_2);
267 :
268 375272000 : sum = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(sum01, sum23), IFOffset), IFShift);
269 93817900 : sum_clip_U8 = _mm_packus_epi16(sum, sum);
270 :
271 93817900 : _mm_storel_epi64((__m128i *)(dst + width_cnt), sum_clip_U8);
272 : }
273 4099150 : ref_pic += src_stride;
274 4099150 : dst += dst_stride;
275 : }
276 :
277 : //do the last row if sub-pred ON.
278 26580 : if (skip) {
279 0 : ref_pic -= (src_stride >> 1);
280 0 : dst -= (dst_stride >> 1);
281 0 : for (width_cnt = 0; width_cnt < pu_width; width_cnt += 8) {
282 0 : sum01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ref_pic + width_cnt - 1)),
283 0 : _mm_loadl_epi64((__m128i *)(ref_pic + width_cnt))), IFCoeff_1_0);
284 :
285 0 : sum23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ref_pic + width_cnt + 1)),
286 0 : _mm_loadl_epi64((__m128i *)(ref_pic + width_cnt + 2))), IFCoeff_3_2);
287 :
288 0 : sum = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(sum01, sum23), IFOffset), IFShift);
289 0 : sum_clip_U8 = _mm_packus_epi16(sum, sum);
290 :
291 0 : _mm_storel_epi64((__m128i *)(dst + width_cnt), sum_clip_U8);
292 : }
293 : }
294 : }
295 26580 : }
296 :
297 53156 : void avc_style_luma_interpolation_filter_vertical_ssse3_intrin(
298 : EbByte ref_pic,
299 : uint32_t src_stride,
300 : EbByte dst,
301 : uint32_t dst_stride,
302 : uint32_t pu_width,
303 : uint32_t pu_height,
304 : EbByte temp_buf,
305 : EbBool skip,
306 : uint32_t frac_pos)
307 : {
308 : (void)temp_buf;
309 : __m128i IFOffset, IFCoeff_1_0, IFCoeff_3_2, sum_clip_U8;
310 : uint32_t width_cnt, height_cnt;
311 53156 : uint32_t IFShift = 5;
312 53156 : uint32_t srcStrideSkip = src_stride << (skip ? 1 : 0);
313 : EbByte refPicTemp, dstTemp;
314 :
315 53156 : frac_pos <<= 5;
316 53156 : ref_pic -= src_stride;
317 53156 : IFOffset = _mm_set1_epi16(0x0010);
318 53156 : IFCoeff_1_0 = _mm_load_si128((__m128i *)(AvcStyleLumaIFCoeff8_SSSE3 + frac_pos - 32));
319 53156 : IFCoeff_3_2 = _mm_load_si128((__m128i *)(AvcStyleLumaIFCoeff8_SSSE3 + frac_pos - 16));
320 53156 : dst_stride <<= skip;
321 53156 : pu_height >>= skip;
322 53156 : if (!(pu_width & 15)) { //16x
323 :
324 : __m128i sum_lo, sum_hi, ref0, refs, ref2s, ref3s;
325 :
326 0 : for (width_cnt = 0; width_cnt < pu_width; width_cnt += 16) {
327 0 : refPicTemp = ref_pic;
328 0 : dstTemp = dst;
329 :
330 0 : for (height_cnt = 0; height_cnt < pu_height; ++height_cnt) {
331 0 : ref0 = _mm_loadu_si128((__m128i *)(refPicTemp));
332 0 : refs = _mm_loadu_si128((__m128i *)(refPicTemp + src_stride));
333 0 : ref2s = _mm_loadu_si128((__m128i *)(refPicTemp + 2 * src_stride));
334 0 : ref3s = _mm_loadu_si128((__m128i *)(refPicTemp + 3 * src_stride));
335 :
336 0 : sum_lo = _mm_add_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(ref0, refs), IFCoeff_1_0),
337 : _mm_maddubs_epi16(_mm_unpacklo_epi8(ref2s, ref3s), IFCoeff_3_2));
338 :
339 0 : sum_hi = _mm_add_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(ref0, refs), IFCoeff_1_0),
340 : _mm_maddubs_epi16(_mm_unpackhi_epi8(ref2s, ref3s), IFCoeff_3_2));
341 :
342 0 : sum_lo = _mm_srai_epi16(_mm_add_epi16(sum_lo, IFOffset), IFShift);
343 0 : sum_hi = _mm_srai_epi16(_mm_add_epi16(sum_hi, IFOffset), IFShift);
344 0 : sum_clip_U8 = _mm_packus_epi16(sum_lo, sum_hi);
345 : _mm_storeu_si128((__m128i *)(dstTemp), sum_clip_U8);
346 0 : dstTemp += dst_stride;
347 0 : refPicTemp += srcStrideSkip;
348 : }
349 : //do the last row if sub-pred is ON.
350 0 : if (skip) {
351 0 : dstTemp -= (dst_stride >> 1);
352 0 : refPicTemp -= (srcStrideSkip >> 1);
353 : {
354 0 : ref0 = _mm_loadu_si128((__m128i *)(refPicTemp));
355 0 : refs = _mm_loadu_si128((__m128i *)(refPicTemp + src_stride));
356 0 : ref2s = _mm_loadu_si128((__m128i *)(refPicTemp + 2 * src_stride));
357 0 : ref3s = _mm_loadu_si128((__m128i *)(refPicTemp + 3 * src_stride));
358 :
359 0 : sum_lo = _mm_add_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(ref0, refs), IFCoeff_1_0),
360 : _mm_maddubs_epi16(_mm_unpacklo_epi8(ref2s, ref3s), IFCoeff_3_2));
361 :
362 0 : sum_hi = _mm_add_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(ref0, refs), IFCoeff_1_0),
363 : _mm_maddubs_epi16(_mm_unpackhi_epi8(ref2s, ref3s), IFCoeff_3_2));
364 :
365 0 : sum_lo = _mm_srai_epi16(_mm_add_epi16(sum_lo, IFOffset), IFShift);
366 0 : sum_hi = _mm_srai_epi16(_mm_add_epi16(sum_hi, IFOffset), IFShift);
367 0 : sum_clip_U8 = _mm_packus_epi16(sum_lo, sum_hi);
368 : _mm_storeu_si128((__m128i *)(dstTemp), sum_clip_U8);
369 : }
370 : }
371 0 : ref_pic += 16;
372 0 : dst += 16;
373 : }
374 : }
375 : else { //8x
376 : __m128i sum, sum01, sum23;
377 :
378 1194370 : for (width_cnt = 0; width_cnt < pu_width; width_cnt += 8) {
379 1141220 : refPicTemp = ref_pic;
380 1141220 : dstTemp = dst;
381 :
382 184618000 : for (height_cnt = 0; height_cnt < pu_height; ++height_cnt) {
383 550431000 : sum01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPicTemp)),
384 183477000 : _mm_loadl_epi64((__m128i *)(refPicTemp + src_stride))), IFCoeff_1_0);
385 :
386 550431000 : sum23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPicTemp + 2 * src_stride)),
387 183477000 : _mm_loadl_epi64((__m128i *)(refPicTemp + 3 * src_stride))), IFCoeff_3_2);
388 :
389 733908000 : sum = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(sum01, sum23), IFOffset), IFShift);
390 183477000 : sum_clip_U8 = _mm_packus_epi16(sum, sum);
391 183477000 : _mm_storel_epi64((__m128i *)(dstTemp), sum_clip_U8);
392 :
393 183477000 : dstTemp += dst_stride;
394 183477000 : refPicTemp += srcStrideSkip;
395 : }
396 : //do the last row if sub-pred is ON.
397 1141220 : if (skip) {
398 0 : dstTemp -= (dst_stride >> 1);
399 0 : refPicTemp -= (srcStrideSkip >> 1);
400 : {
401 0 : sum01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPicTemp)),
402 0 : _mm_loadl_epi64((__m128i *)(refPicTemp + src_stride))), IFCoeff_1_0);
403 :
404 0 : sum23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPicTemp + 2 * src_stride)),
405 0 : _mm_loadl_epi64((__m128i *)(refPicTemp + 3 * src_stride))), IFCoeff_3_2);
406 :
407 0 : sum = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(sum01, sum23), IFOffset), IFShift);
408 0 : sum_clip_U8 = _mm_packus_epi16(sum, sum);
409 0 : _mm_storel_epi64((__m128i *)(dstTemp), sum_clip_U8);
410 : }
411 : }
412 1141220 : ref_pic += 8;
413 1141220 : dst += 8;
414 : }
415 : }
416 53156 : }
417 :
418 79736 : void avc_style_luma_interpolation_filter_helper_ssse3(
419 : EbByte ref_pic,
420 : uint32_t src_stride,
421 : EbByte dst,
422 : uint32_t dst_stride,
423 : uint32_t pu_width,
424 : uint32_t pu_height,
425 : EbByte temp_buf,
426 : EbBool skip,
427 : uint32_t frac_pos,
428 : uint8_t fractional_position)
429 : {
430 :
431 79736 : switch (fractional_position) {
432 0 : case 0:
433 0 : avc_style_copy_sse2(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
434 0 : case 1:
435 0 : avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
436 26580 : case 2:
437 26580 : avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
438 0 : case 3:
439 0 : avc_style_luma_interpolation_filter_horizontal_ssse3_intrin(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
440 0 : case 4:
441 0 : avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
442 0 : case 5:
443 0 : avc_style_luma_interpolation_filter_pose_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
444 0 : case 6:
445 0 : avc_style_luma_interpolation_filter_posf_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
446 0 : case 7:
447 0 : avc_style_luma_interpolation_filter_posg_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
448 53158 : case 8:
449 53158 : avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
450 0 : case 9:
451 0 : avc_style_luma_interpolation_filter_posi_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
452 0 : case 10:
453 0 : avc_style_luma_interpolation_filter_posj_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
454 0 : case 11:
455 0 : avc_style_luma_interpolation_filter_posk_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
456 0 : case 12:
457 0 : avc_style_luma_interpolation_filter_vertical_ssse3_intrin(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
458 0 : case 13:
459 0 : avc_style_luma_interpolation_filter_posp_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
460 0 : case 14:
461 0 : avc_style_luma_interpolation_filter_posq_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
462 0 : case 15:
463 0 : avc_style_luma_interpolation_filter_posr_ssse3(ref_pic, src_stride, dst, dst_stride, pu_width, pu_height, temp_buf, skip, frac_pos); break;
464 79737 : default:
465 : assert(0);
466 : }
467 79737 : }
|