Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : #include "EbPictureOperators_SSE4_1.h"
7 : #include "smmintrin.h"
8 :
/*
 * compute8x8_satd_sse4
 *
 * Applies an 8-point add/subtract (Hadamard-style) butterfly network first
 * along every row and then along every column of an 8x8 block of 16-bit
 * samples, sums the absolute values of the 64 resulting coefficients and
 * returns that sum divided by four with rounding: (sum + 2) >> 2.
 *
 * diff : 64 samples, read as 8 consecutive rows of 8 int16_t values.
 *
 * All butterfly arithmetic is carried out in wrapping 16-bit lanes; only the
 * final accumulation of absolute values is widened to 32 bits.
 * NOTE(review): this presumably relies on the input range being small enough
 * that no coefficient exceeds 16 bits -- confirm against the callers.
 */
uint64_t compute8x8_satd_sse4(
    int16_t *diff)  // input parameter, pointer to the 8x8 block of samples
{
    const __m128i zero = _mm_setzero_si128();
    int16_t       coeff[8][8];  // row-transformed block, one row per input row
    __m128i       row[8];
    __m128i       pair_add[4], pair_sub[4];
    __m128i       lo[4], hi[4];
    __m128i       acc;
    uint32_t      r, k;
    uint64_t      satd;

    // ---- Horizontal pass: two input rows per iteration -------------------
    for (r = 0; r < 8; r += 2) {
        const uint32_t offset = r << 3;
        const __m128i  top    = _mm_loadu_si128((__m128i *)(diff + offset));
        const __m128i  bottom = _mm_loadu_si128((__m128i *)(diff + 8 + offset));
        __m128i        top_half[2], bottom_half[2];

        // Index 0: sums of adjacent samples, index 1: their differences.
        // Lanes 0..3 hold the four results, lanes 4..7 are zero.
        top_half[0]    = _mm_hadd_epi16(top, zero);
        top_half[1]    = _mm_hsub_epi16(top, zero);
        bottom_half[0] = _mm_hadd_epi16(bottom, zero);
        bottom_half[1] = _mm_hsub_epi16(bottom, zero);

        // Output column k is selected by its three bits:
        //   bit 0 -> first stage uses differences instead of sums
        //   bit 1 -> second stage subtracts instead of adds
        //   bit 2 -> third stage subtracts instead of adds
        for (k = 0; k < 8; ++k) {
            const __m128i a = top_half[k & 1];
            const __m128i b = bottom_half[k & 1];
            __m128i       v;

            if (k & 2) {
                v = _mm_hsub_epi16(a, b);
            } else {
                v = _mm_hadd_epi16(a, b);
            }

            if (k & 4) {
                v = _mm_hsub_epi16(v, zero);
            } else {
                v = _mm_hadd_epi16(v, zero);
            }

            // Lane 0 belongs to the upper row, lane 2 to the lower row.
            coeff[r][k]     = (int16_t)_mm_extract_epi16(v, 0);
            coeff[r + 1][k] = (int16_t)_mm_extract_epi16(v, 2);
        }
    }

    // ---- Vertical pass: all eight columns at once ------------------------
    for (k = 0; k < 8; ++k) {
        row[k] = _mm_loadu_si128((__m128i *)(coeff[k]));
    }

    // Stage 1: neighbouring rows (0,1) (2,3) (4,5) (6,7).
    for (k = 0; k < 4; ++k) {
        pair_add[k] = _mm_add_epi16(row[2 * k], row[2 * k + 1]);
        pair_sub[k] = _mm_sub_epi16(row[2 * k], row[2 * k + 1]);
    }

    // Stage 2: combine the pairs inside rows 0..3 (lo) and rows 4..7 (hi).
    lo[0] = _mm_add_epi16(pair_add[0], pair_add[1]);
    lo[1] = _mm_add_epi16(pair_sub[0], pair_sub[1]);
    lo[2] = _mm_sub_epi16(pair_add[0], pair_add[1]);
    lo[3] = _mm_sub_epi16(pair_sub[0], pair_sub[1]);

    hi[0] = _mm_add_epi16(pair_add[2], pair_add[3]);
    hi[1] = _mm_add_epi16(pair_sub[2], pair_sub[3]);
    hi[2] = _mm_sub_epi16(pair_add[2], pair_add[3]);
    hi[3] = _mm_sub_epi16(pair_sub[2], pair_sub[3]);

    // Stage 3 plus accumulation: lo[k] + hi[k] and lo[k] - hi[k] are the
    // eight final coefficient rows. Their absolute values are zero-extended
    // to 32 bits and added lane-wise into acc.
    acc = zero;
    for (k = 0; k < 4; ++k) {
        const __m128i plus  = _mm_abs_epi16(_mm_add_epi16(lo[k], hi[k]));
        const __m128i minus = _mm_abs_epi16(_mm_sub_epi16(lo[k], hi[k]));

        acc = _mm_add_epi32(acc, _mm_unpacklo_epi16(plus, zero));
        acc = _mm_add_epi32(acc, _mm_unpackhi_epi16(plus, zero));
        acc = _mm_add_epi32(acc, _mm_unpacklo_epi16(minus, zero));
        acc = _mm_add_epi32(acc, _mm_unpackhi_epi16(minus, zero));
    }

    // Fold the four 32-bit partial sums into lane 0.
    acc = _mm_hadd_epi32(acc, zero);
    acc = _mm_hadd_epi32(acc, zero);

    satd = (uint64_t)_mm_extract_epi32(acc, 0);

    // Divide by four, rounding to nearest.
    return (satd + 2) >> 2;
}
246 :
/*
 * compute8x8_satd_u8_sse4
 *
 * Same transform-and-sum as the 16-bit variant, but the input is an 8x8
 * block of unsigned 8-bit samples read with an arbitrary row stride. Each
 * row is zero-extended to 16 bits, an 8-point add/subtract (Hadamard-style)
 * butterfly network is applied along rows and then along columns, and the
 * absolute values of the 64 coefficients are summed.
 *
 * src        : first sample of the block; row r starts at src + r * src_stride
 *              and 8 bytes are read from it.
 * dc_value   : in/out accumulator; the absolute value of the first
 *              coefficient of the first output row (the plain sum of all 64
 *              samples) is ADDED to *dc_value.
 * src_stride : distance between consecutive rows, in bytes.
 *
 * Returns (sum of absolute coefficients + 2) >> 2.
 */
uint64_t compute8x8_satd_u8_sse4(
    uint8_t *src,        // input parameter, pointer to the 8-bit samples
    uint64_t *dc_value,  // in/out, DC magnitude is accumulated here
    uint32_t src_stride) // input parameter, row stride of src
{
    const __m128i zero = _mm_setzero_si128();
    int16_t       coeff[8][8];  // row-transformed block, one row per input row
    __m128i       row[8];
    __m128i       pair_add[4], pair_sub[4];
    __m128i       lo[4], hi[4];
    __m128i       magnitude[8];
    __m128i       acc;
    uint32_t      r, k;
    uint64_t      satd;

    // ---- Horizontal pass: two input rows per iteration -------------------
    for (r = 0; r < 8; r += 2) {
        // Load 8 bytes per row and zero-extend them to eight 16-bit lanes.
        const __m128i top = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(src + (r * src_stride))), zero);
        const __m128i bottom = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(src + ((r + 1) * src_stride))), zero);
        __m128i top_half[2], bottom_half[2];

        // Index 0: sums of adjacent samples, index 1: their differences.
        top_half[0]    = _mm_hadd_epi16(top, zero);
        top_half[1]    = _mm_hsub_epi16(top, zero);
        bottom_half[0] = _mm_hadd_epi16(bottom, zero);
        bottom_half[1] = _mm_hsub_epi16(bottom, zero);

        // Output column k is selected by its three bits:
        //   bit 0 -> first stage uses differences instead of sums
        //   bit 1 -> second stage subtracts instead of adds
        //   bit 2 -> third stage subtracts instead of adds
        for (k = 0; k < 8; ++k) {
            const __m128i a = top_half[k & 1];
            const __m128i b = bottom_half[k & 1];
            __m128i       v;

            if (k & 2) {
                v = _mm_hsub_epi16(a, b);
            } else {
                v = _mm_hadd_epi16(a, b);
            }

            if (k & 4) {
                v = _mm_hsub_epi16(v, zero);
            } else {
                v = _mm_hadd_epi16(v, zero);
            }

            // Lane 0 belongs to the upper row, lane 2 to the lower row.
            coeff[r][k]     = (int16_t)_mm_extract_epi16(v, 0);
            coeff[r + 1][k] = (int16_t)_mm_extract_epi16(v, 2);
        }
    }

    // ---- Vertical pass: all eight columns at once ------------------------
    for (k = 0; k < 8; ++k) {
        row[k] = _mm_loadu_si128((__m128i *)(coeff[k]));
    }

    // Stage 1: neighbouring rows (0,1) (2,3) (4,5) (6,7).
    for (k = 0; k < 4; ++k) {
        pair_add[k] = _mm_add_epi16(row[2 * k], row[2 * k + 1]);
        pair_sub[k] = _mm_sub_epi16(row[2 * k], row[2 * k + 1]);
    }

    // Stage 2: combine the pairs inside rows 0..3 (lo) and rows 4..7 (hi).
    lo[0] = _mm_add_epi16(pair_add[0], pair_add[1]);
    lo[1] = _mm_add_epi16(pair_sub[0], pair_sub[1]);
    lo[2] = _mm_sub_epi16(pair_add[0], pair_add[1]);
    lo[3] = _mm_sub_epi16(pair_sub[0], pair_sub[1]);

    hi[0] = _mm_add_epi16(pair_add[2], pair_add[3]);
    hi[1] = _mm_add_epi16(pair_sub[2], pair_sub[3]);
    hi[2] = _mm_sub_epi16(pair_add[2], pair_add[3]);
    hi[3] = _mm_sub_epi16(pair_sub[2], pair_sub[3]);

    // Stage 3: absolute values of the eight final coefficient rows.
    for (k = 0; k < 4; ++k) {
        magnitude[k]     = _mm_abs_epi16(_mm_add_epi16(lo[k], hi[k]));
        magnitude[k + 4] = _mm_abs_epi16(_mm_sub_epi16(lo[k], hi[k]));
    }

    // Row 0, lane 0 is the all-plus combination, i.e. the DC term.
    *dc_value += _mm_extract_epi16(magnitude[0], 0);

    // Zero-extend every magnitude to 32 bits and add lane-wise.
    acc = zero;
    for (k = 0; k < 8; ++k) {
        acc = _mm_add_epi32(acc, _mm_unpacklo_epi16(magnitude[k], zero));
        acc = _mm_add_epi32(acc, _mm_unpackhi_epi16(magnitude[k], zero));
    }

    // Fold the four 32-bit partial sums into lane 0.
    acc = _mm_hadd_epi32(acc, zero);
    acc = _mm_hadd_epi32(acc, zero);

    satd = (uint64_t)_mm_extract_epi32(acc, 0);

    // Divide by four, rounding to nearest.
    return (satd + 2) >> 2;
}
|