Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
4 : */
5 :
6 : #include "EbNoiseExtractAVX2.h"
7 : #include "EbDefinitions.h"
8 : #include "immintrin.h"
9 : #include "EbUtility.h"
10 :
/* Interleaved byte weights {1,4} consumed by _mm256_maddubs_epi16 in the
 * weak luma filter: each (neighbour, centre) byte pair is multiplied by
 * (1, 4) and summed to a 16-bit lane.
 * NOTE(review): aligned to 16 but loaded with 32-byte _mm256_loadu_si256 —
 * safe since the loads are unaligned, but EB_ALIGN(32) would permit
 * aligned loads; confirm before changing. */
EB_EXTERN EB_ALIGN(16) const uint8_t filterType[] = {
    1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4
};
14 :
/* Weight pairs for _mm256_maddubs_epi16 used by the strong-luma /
 * weak-chroma 3x3 filter:
 *   [0] = {2,4}: (left, centre) weights for the middle row,
 *   [1] = {1,2}: (left, centre) weights for the top and bottom rows.
 * Together with the doubled "next" column terms this builds the /16
 * weighted sum in chroma_weak_luma_strong_filter_avx2_intrin. */
EB_EXTERN EB_ALIGN(16) const uint8_t WeakChromafilter[2][32] = {
    { 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4 },
    { 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 },
};
19 :
20 0 : inline void luma_weak_filter_avx2_intrin(
21 : __m256i top,
22 : __m256i curr,
23 : __m256i bottom,
24 : __m256i curr_prev,
25 : __m256i curr_next,
26 : uint8_t *ptr_denoised,
27 : uint8_t *ptr_noise
28 : )
29 : {
30 : __m256i topFirstHalf, bottomFirstHalf,
31 : filterFirstHalf, filterSecondHalf,
32 : currNextFirstHalf, currNextSecondHalf,
33 : weights, currLeftMidFirstHalfWeight,
34 : currLeftMidFirstHalflo, currLeftMidFirstHalfhi, currPrevPermutation, currPermutation, currNextPermutation,
35 : topPermutation, bottomPermutation;
36 :
37 0 : currPrevPermutation = _mm256_permute4x64_epi64(curr_prev, 216);
38 0 : currPermutation = _mm256_permute4x64_epi64(curr, 216);
39 0 : currLeftMidFirstHalflo = _mm256_unpacklo_epi8(currPrevPermutation, currPermutation);
40 0 : weights = _mm256_loadu_si256((__m256i*)filterType);
41 0 : currLeftMidFirstHalfWeight = _mm256_maddubs_epi16(currLeftMidFirstHalflo, weights);
42 0 : currNextPermutation = _mm256_permute4x64_epi64(curr_next, 88);
43 0 : currNextFirstHalf = _mm256_unpacklo_epi8(currNextPermutation, _mm256_setzero_si256());
44 0 : currLeftMidFirstHalflo = _mm256_add_epi16(currNextFirstHalf, currLeftMidFirstHalfWeight);
45 :
46 0 : currLeftMidFirstHalfhi = _mm256_unpackhi_epi8(currPrevPermutation, currPermutation);
47 0 : currLeftMidFirstHalfWeight = _mm256_maddubs_epi16(currLeftMidFirstHalfhi, weights);
48 0 : currNextPermutation = _mm256_permute4x64_epi64(curr_next, 216);
49 0 : currNextSecondHalf = _mm256_unpackhi_epi8(currNextPermutation, _mm256_setzero_si256());
50 0 : currLeftMidFirstHalfhi = _mm256_add_epi16(currNextSecondHalf, currLeftMidFirstHalfWeight);
51 :
52 0 : topPermutation = _mm256_permute4x64_epi64(top, 216);
53 0 : topFirstHalf = _mm256_unpacklo_epi8(topPermutation, _mm256_setzero_si256());
54 0 : bottomPermutation = _mm256_permute4x64_epi64(bottom, 216);
55 0 : bottomFirstHalf = _mm256_unpacklo_epi8(bottomPermutation, _mm256_setzero_si256());
56 0 : filterFirstHalf = _mm256_adds_epi16(_mm256_adds_epi16(bottomFirstHalf, topFirstHalf), currLeftMidFirstHalflo);
57 0 : filterFirstHalf = _mm256_srli_epi16(filterFirstHalf, 3);
58 :
59 0 : topFirstHalf = _mm256_unpackhi_epi8(topPermutation, _mm256_setzero_si256());
60 0 : bottomFirstHalf = _mm256_unpackhi_epi8(bottomPermutation, _mm256_setzero_si256());
61 0 : filterSecondHalf = _mm256_adds_epi16(_mm256_adds_epi16(bottomFirstHalf, topFirstHalf), currLeftMidFirstHalfhi);
62 0 : filterSecondHalf = _mm256_srli_epi16(filterSecondHalf, 3);
63 :
64 0 : filterFirstHalf = _mm256_permute4x64_epi64(_mm256_packus_epi16(filterFirstHalf, filterSecondHalf), 216);
65 : _mm256_storeu_si256((__m256i *)(ptr_denoised), filterFirstHalf);
66 :
67 0 : _mm256_storeu_si256((__m256i *)(ptr_noise), _mm256_subs_epu8(curr, filterFirstHalf));
68 0 : }
69 0 : inline void chroma_weak_luma_strong_filter_avx2_intrin(
70 : __m256i top,
71 : __m256i curr,
72 : __m256i bottom,
73 : __m256i curr_prev,
74 : __m256i curr_next,
75 : __m256i top_prev,
76 : __m256i top_next,
77 : __m256i bottom_prev,
78 : __m256i bottom_next,
79 : uint8_t *ptr_denoised
80 : )
81 : {
82 : __m256i filterFirstHalf, filterSecondHalf,
83 : currNextFirstHalf, currNextSecondHalf,
84 : weights, currLeftMidFirstHalfWeight,
85 : currLeftMidFirstHalflo, currLeftMidFirstHalfhi, currPrevPermutation, currPermutation, currNextPermutation,
86 : topPermutation, bottomPermutation,
87 : topPrevPermutation, topLeftMidFirstHalflo, topLeftMidFirstHalfWeight, topNextFirstHalf,
88 : topNextPermutation, topLeftMidFirstHalfhi, topNextSecondHalf,
89 : bottomPrevPermutation, bottomLeftMidFirstHalflo, bottomLeftMidFirstHalfWeight, bottomNextPermutation,
90 : bottomNextFirstHalf, bottomLeftMidFirstHalfhi, bottomNextSecondHalf;
91 :
92 : // Curr
93 0 : currPrevPermutation = _mm256_permute4x64_epi64(curr_prev, 216);
94 0 : currPermutation = _mm256_permute4x64_epi64(curr, 216);
95 0 : currLeftMidFirstHalflo = _mm256_unpacklo_epi8(currPrevPermutation, currPermutation);
96 0 : weights = _mm256_loadu_si256((__m256i*)WeakChromafilter[0]);
97 0 : currLeftMidFirstHalfWeight = _mm256_maddubs_epi16(currLeftMidFirstHalflo, weights);
98 0 : currNextPermutation = _mm256_permute4x64_epi64(curr_next, 88);
99 0 : currNextFirstHalf = _mm256_unpacklo_epi8(currNextPermutation, _mm256_setzero_si256());
100 0 : currNextFirstHalf = _mm256_slli_epi16(currNextFirstHalf, 1);
101 0 : currLeftMidFirstHalflo = _mm256_add_epi16(currNextFirstHalf, currLeftMidFirstHalfWeight);
102 :
103 0 : currLeftMidFirstHalfhi = _mm256_unpackhi_epi8(currPrevPermutation, currPermutation);
104 0 : currLeftMidFirstHalfWeight = _mm256_maddubs_epi16(currLeftMidFirstHalfhi, weights);
105 0 : currNextPermutation = _mm256_permute4x64_epi64(curr_next, 216);
106 0 : currNextSecondHalf = _mm256_unpackhi_epi8(currNextPermutation, _mm256_setzero_si256());
107 0 : currNextSecondHalf = _mm256_slli_epi16(currNextSecondHalf, 1);
108 0 : currLeftMidFirstHalfhi = _mm256_add_epi16(currNextSecondHalf, currLeftMidFirstHalfWeight);
109 :
110 : // Top
111 0 : topPrevPermutation = _mm256_permute4x64_epi64(top_prev, 216);
112 0 : topPermutation = _mm256_permute4x64_epi64(top, 216);
113 0 : topLeftMidFirstHalflo = _mm256_unpacklo_epi8(topPrevPermutation, topPermutation);
114 0 : weights = _mm256_loadu_si256((__m256i*)WeakChromafilter[1]);
115 0 : topLeftMidFirstHalfWeight = _mm256_maddubs_epi16(topLeftMidFirstHalflo, weights);
116 0 : topNextPermutation = _mm256_permute4x64_epi64(top_next, 88);
117 0 : topNextFirstHalf = _mm256_unpacklo_epi8(topNextPermutation, _mm256_setzero_si256());
118 0 : topLeftMidFirstHalflo = _mm256_add_epi16(topNextFirstHalf, topLeftMidFirstHalfWeight);
119 :
120 0 : topLeftMidFirstHalfhi = _mm256_unpackhi_epi8(topPrevPermutation, topPermutation);
121 0 : topLeftMidFirstHalfWeight = _mm256_maddubs_epi16(topLeftMidFirstHalfhi, weights);
122 0 : topNextPermutation = _mm256_permute4x64_epi64(top_next, 216);
123 0 : topNextSecondHalf = _mm256_unpackhi_epi8(topNextPermutation, _mm256_setzero_si256());
124 0 : topLeftMidFirstHalfhi = _mm256_add_epi16(topNextSecondHalf, topLeftMidFirstHalfWeight);
125 :
126 : // Bottom
127 0 : bottomPrevPermutation = _mm256_permute4x64_epi64(bottom_prev, 216);
128 0 : bottomPermutation = _mm256_permute4x64_epi64(bottom, 216);
129 0 : bottomLeftMidFirstHalflo = _mm256_unpacklo_epi8(bottomPrevPermutation, bottomPermutation);
130 0 : weights = _mm256_loadu_si256((__m256i*)WeakChromafilter[1]);
131 0 : bottomLeftMidFirstHalfWeight = _mm256_maddubs_epi16(bottomLeftMidFirstHalflo, weights);
132 0 : bottomNextPermutation = _mm256_permute4x64_epi64(bottom_next, 88);
133 0 : bottomNextFirstHalf = _mm256_unpacklo_epi8(bottomNextPermutation, _mm256_setzero_si256());
134 0 : bottomLeftMidFirstHalflo = _mm256_add_epi16(bottomNextFirstHalf, bottomLeftMidFirstHalfWeight);
135 :
136 0 : bottomLeftMidFirstHalfhi = _mm256_unpackhi_epi8(bottomPrevPermutation, bottomPermutation);
137 0 : bottomLeftMidFirstHalfWeight = _mm256_maddubs_epi16(bottomLeftMidFirstHalfhi, weights);
138 0 : bottomNextPermutation = _mm256_permute4x64_epi64(bottom_next, 216);
139 0 : bottomNextSecondHalf = _mm256_unpackhi_epi8(bottomNextPermutation, _mm256_setzero_si256());
140 0 : bottomLeftMidFirstHalfhi = _mm256_add_epi16(bottomNextSecondHalf, bottomLeftMidFirstHalfWeight);
141 :
142 0 : filterFirstHalf = _mm256_adds_epi16(_mm256_adds_epi16(bottomLeftMidFirstHalflo, topLeftMidFirstHalflo), currLeftMidFirstHalflo);
143 0 : filterFirstHalf = _mm256_srli_epi16(filterFirstHalf, 4);
144 0 : filterSecondHalf = _mm256_adds_epi16(_mm256_adds_epi16(bottomLeftMidFirstHalfhi, topLeftMidFirstHalfhi), currLeftMidFirstHalfhi);
145 0 : filterSecondHalf = _mm256_srli_epi16(filterSecondHalf, 4);
146 :
147 0 : filterFirstHalf = _mm256_permute4x64_epi64(_mm256_packus_epi16(filterFirstHalf, filterSecondHalf), 216);
148 : _mm256_storeu_si256((__m256i *)(ptr_denoised), filterFirstHalf);
149 0 : }
150 :
/*
 * Strong chroma filter: plain 3x3 box average over one 32-pixel segment.
 *
 * Sums all nine neighbours in 16-bit lanes, then divides by 9 in fixed
 * point: (sum * 7282) >> 16, since 7282 ~= 2^16 / 9. Filtered bytes are
 * stored to ptr_denoised.
 *
 * permute4x64 imm 216 (0xD8) reorders qword lanes so the lane-split
 * unpacklo/hi widen contiguous pixels — TODO confirm against a scalar
 * reference.
 *
 * Fix: was plain `inline`; made `static inline` so the helper has internal
 * linkage and always links regardless of the compiler's inlining decision
 * (C11 6.7.4p7).
 */
static inline void chroma_strong_avx2_intrin(
    __m256i top,
    __m256i curr,
    __m256i bottom,
    __m256i curr_prev,
    __m256i curr_next,
    __m256i top_prev,
    __m256i top_next,
    __m256i bottom_prev,
    __m256i bottom_next,
    uint8_t *ptr_denoised
)
{
    __m256i currLeftMidFirstHalflo, currLeftMidFirstHalfhi, currPrevPermutation, currPermutation, currNextPermutation,
        topPermutation, topPrevPermutation, topLeftMidFirstHalflo, topNextPermutation, topLeftMidFirstHalfhi,
        bottomPermutation, bottomPrevPermutation, bottomLeftMidFirstHalflo, bottomNextPermutation, bottomLeftMidFirstHalfhi;

    /* Middle row sum (prev + curr + next), widened to 16-bit. */
    currPrevPermutation = _mm256_permute4x64_epi64(curr_prev, 216);
    currPermutation = _mm256_permute4x64_epi64(curr, 216);
    currNextPermutation = _mm256_permute4x64_epi64(curr_next, 216);

    currLeftMidFirstHalflo = _mm256_add_epi16(_mm256_unpacklo_epi8(currPermutation, _mm256_setzero_si256()),
        _mm256_unpacklo_epi8(currPrevPermutation, _mm256_setzero_si256()));
    currLeftMidFirstHalflo = _mm256_add_epi16(_mm256_unpacklo_epi8(currNextPermutation, _mm256_setzero_si256()), currLeftMidFirstHalflo);

    currLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_unpackhi_epi8(currPermutation, _mm256_setzero_si256()),
        _mm256_unpackhi_epi8(currPrevPermutation, _mm256_setzero_si256()));
    currLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_unpackhi_epi8(currNextPermutation, _mm256_setzero_si256()), currLeftMidFirstHalfhi);

    /* Top row sum. */
    topPrevPermutation = _mm256_permute4x64_epi64(top_prev, 216);
    topPermutation = _mm256_permute4x64_epi64(top, 216);
    topNextPermutation = _mm256_permute4x64_epi64(top_next, 216);

    topLeftMidFirstHalflo = _mm256_add_epi16(_mm256_unpacklo_epi8(topPermutation, _mm256_setzero_si256()),
        _mm256_unpacklo_epi8(topPrevPermutation, _mm256_setzero_si256()));
    topLeftMidFirstHalflo = _mm256_add_epi16(_mm256_unpacklo_epi8(topNextPermutation, _mm256_setzero_si256()), topLeftMidFirstHalflo);

    topLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_unpackhi_epi8(topPermutation, _mm256_setzero_si256()),
        _mm256_unpackhi_epi8(topPrevPermutation, _mm256_setzero_si256()));
    topLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_unpackhi_epi8(topNextPermutation, _mm256_setzero_si256()), topLeftMidFirstHalfhi);

    /* Bottom row sum. */
    bottomPrevPermutation = _mm256_permute4x64_epi64(bottom_prev, 216);
    bottomPermutation = _mm256_permute4x64_epi64(bottom, 216);
    bottomNextPermutation = _mm256_permute4x64_epi64(bottom_next, 216);

    bottomLeftMidFirstHalflo = _mm256_add_epi16(_mm256_unpacklo_epi8(bottomPermutation, _mm256_setzero_si256()),
        _mm256_unpacklo_epi8(bottomPrevPermutation, _mm256_setzero_si256()));
    bottomLeftMidFirstHalflo = _mm256_add_epi16(_mm256_unpacklo_epi8(bottomNextPermutation, _mm256_setzero_si256()), bottomLeftMidFirstHalflo);

    bottomLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_unpackhi_epi8(bottomPermutation, _mm256_setzero_si256()),
        _mm256_unpackhi_epi8(bottomPrevPermutation, _mm256_setzero_si256()));
    bottomLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_unpackhi_epi8(bottomNextPermutation, _mm256_setzero_si256()), bottomLeftMidFirstHalfhi);

    /* Nine-pixel sum, still in 16-bit lanes (max 9*255 = 2295, fits). */
    currLeftMidFirstHalflo = _mm256_add_epi16(_mm256_add_epi16(currLeftMidFirstHalflo, topLeftMidFirstHalflo), bottomLeftMidFirstHalflo);
    currLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_add_epi16(currLeftMidFirstHalfhi, topLeftMidFirstHalfhi), bottomLeftMidFirstHalfhi);

    /* Divide low half by 9: widen to 32-bit, (x * 7282) >> 16, repack. */
    topLeftMidFirstHalflo = _mm256_unpacklo_epi16(currLeftMidFirstHalflo, _mm256_setzero_si256());
    topLeftMidFirstHalflo = _mm256_mullo_epi32(topLeftMidFirstHalflo, _mm256_set1_epi32(7282));
    topLeftMidFirstHalflo = _mm256_srli_epi32(topLeftMidFirstHalflo, 16);
    bottomLeftMidFirstHalflo = _mm256_unpackhi_epi16(currLeftMidFirstHalflo, _mm256_setzero_si256());
    bottomLeftMidFirstHalflo = _mm256_mullo_epi32(bottomLeftMidFirstHalflo, _mm256_set1_epi32(7282));
    bottomLeftMidFirstHalflo = _mm256_srli_epi32(bottomLeftMidFirstHalflo, 16);
    currLeftMidFirstHalflo = _mm256_packus_epi32(topLeftMidFirstHalflo, bottomLeftMidFirstHalflo);

    /* Pack low-half result bytes into lane 0 of the output register. */
    currLeftMidFirstHalflo = _mm256_insertf128_si256(_mm256_setzero_si256(), _mm_packus_epi16(_mm256_extracti128_si256(currLeftMidFirstHalflo, 0), _mm256_extracti128_si256(currLeftMidFirstHalflo, 1)), 0);

    /* Same divide-by-9 for the high half. */
    topLeftMidFirstHalfhi = _mm256_unpacklo_epi16(currLeftMidFirstHalfhi, _mm256_setzero_si256());
    topLeftMidFirstHalfhi = _mm256_mullo_epi32(topLeftMidFirstHalfhi, _mm256_set1_epi32(7282));
    topLeftMidFirstHalfhi = _mm256_srli_epi32(topLeftMidFirstHalfhi, 16);

    bottomLeftMidFirstHalfhi = _mm256_unpackhi_epi16(currLeftMidFirstHalfhi, _mm256_setzero_si256());
    bottomLeftMidFirstHalfhi = _mm256_mullo_epi32(bottomLeftMidFirstHalfhi, _mm256_set1_epi32(7282));
    bottomLeftMidFirstHalfhi = _mm256_srli_epi32(bottomLeftMidFirstHalfhi, 16);
    currLeftMidFirstHalfhi = _mm256_packus_epi32(topLeftMidFirstHalfhi, bottomLeftMidFirstHalfhi);

    /* High-half bytes into lane 1, then store all 32 filtered pixels. */
    currLeftMidFirstHalflo = _mm256_insertf128_si256(currLeftMidFirstHalflo, _mm_packus_epi16(_mm256_extracti128_si256(currLeftMidFirstHalfhi, 0), _mm256_extracti128_si256(currLeftMidFirstHalfhi, 1)), 1);
    _mm256_storeu_si256((__m256i *)(ptr_denoised), currLeftMidFirstHalflo);
}
229 : /*******************************************
230 : * noise_extract_luma_weak
231 : * weak filter Luma and store noise.
232 : *******************************************/
233 0 : void noise_extract_luma_weak_avx2_intrin(
234 : EbPictureBufferDesc *input_picture_ptr,
235 : EbPictureBufferDesc *denoised_picture_ptr,
236 : EbPictureBufferDesc *noise_picture_ptr,
237 : uint32_t sb_origin_y,
238 : uint32_t sb_origin_x
239 : )
240 : {
241 : uint32_t ii, jj, kk;
242 : uint32_t picHeight, sb_height;
243 : uint32_t picWidth;
244 : uint32_t inputOriginIndex;
245 : uint32_t inputOriginIndexPad;
246 : uint32_t noiseOriginIndex;
247 :
248 : uint8_t *ptrIn;
249 : uint32_t stride_in;
250 : uint8_t *ptr_denoised, *ptrDenoisedInterm;
251 :
252 : uint8_t *ptr_noise, *ptrNoiseInterm;
253 : uint32_t strideOut;
254 :
255 : __m256i top, curr, bottom, curr_prev, curr_next,
256 : secondtop, secondcurr, secondbottom, secondcurrPrev, secondcurrNext;
257 : (void)sb_origin_x;
258 :
259 : //Luma
260 : {
261 0 : picHeight = input_picture_ptr->height;
262 0 : picWidth = input_picture_ptr->width;
263 0 : sb_height = MIN(BLOCK_SIZE_64, picHeight - sb_origin_y);
264 0 : sb_height = ((sb_origin_y + BLOCK_SIZE_64 >= picHeight) || (sb_origin_y == 0)) ? sb_height - 1 : sb_height;
265 0 : stride_in = input_picture_ptr->stride_y;
266 0 : inputOriginIndex = input_picture_ptr->origin_x + (input_picture_ptr->origin_y + sb_origin_y) * input_picture_ptr->stride_y;
267 0 : ptrIn = &(input_picture_ptr->buffer_y[inputOriginIndex]);
268 :
269 0 : inputOriginIndexPad = denoised_picture_ptr->origin_x + (denoised_picture_ptr->origin_y + sb_origin_y) * denoised_picture_ptr->stride_y;
270 0 : strideOut = denoised_picture_ptr->stride_y;
271 0 : ptr_denoised = &(denoised_picture_ptr->buffer_y[inputOriginIndexPad]);
272 0 : ptrDenoisedInterm = ptr_denoised;
273 :
274 0 : noiseOriginIndex = noise_picture_ptr->origin_x + noise_picture_ptr->origin_y * noise_picture_ptr->stride_y;
275 0 : ptr_noise = &(noise_picture_ptr->buffer_y[noiseOriginIndex]);
276 0 : ptrNoiseInterm = ptr_noise;
277 :
278 : ////Luma
279 : //a = (p[1] +
280 : // p[0 + stride] + 4 * p[1 + stride] + p[2 + stride] +
281 : // p[1 + 2 * stride]) / 8;
282 :
283 0 : top = curr = secondtop = secondcurr = _mm256_setzero_si256();
284 :
285 0 : for (kk = 0; kk + BLOCK_SIZE_64 <= picWidth; kk += BLOCK_SIZE_64)
286 : {
287 0 : for (jj = 0; jj < sb_height; jj++)
288 : {
289 0 : if (sb_origin_y == 0)
290 : {
291 0 : if (jj == 0)
292 : {
293 0 : top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in));
294 0 : secondtop = _mm256_loadu_si256((__m256i*)(ptrIn + kk + 32 + jj * stride_in));
295 0 : curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in));
296 0 : secondcurr = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (1 + jj)*stride_in));
297 0 : _mm256_storeu_si256((__m256i *)(ptr_denoised + kk), top);
298 0 : _mm256_storeu_si256((__m256i *)(ptr_denoised + kk + 32), secondtop);
299 0 : _mm256_storeu_si256((__m256i *)(ptr_noise + kk), _mm256_setzero_si256());
300 0 : _mm256_storeu_si256((__m256i *)(ptr_noise + kk + 32), _mm256_setzero_si256());
301 : }
302 0 : curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in)));
303 0 : curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in)));
304 0 : secondcurrPrev = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) - 1 + ((1 + jj)*stride_in)));
305 0 : secondcurrNext = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + 1 + ((1 + jj)*stride_in)));
306 0 : bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in));
307 0 : secondbottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (2 + jj)* stride_in));
308 0 : ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut);
309 0 : ptrNoiseInterm = ptr_noise + kk + ((1 + jj)*strideOut);
310 : }
311 : else
312 : {
313 0 : if (jj == 0)
314 : {
315 0 : top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in - stride_in));
316 0 : secondtop = _mm256_loadu_si256((__m256i*)(ptrIn + kk + 32 + jj * stride_in - stride_in));
317 0 : curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in - stride_in));
318 0 : secondcurr = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (1 + jj)*stride_in - stride_in));
319 : }
320 0 : curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in - stride_in)));
321 0 : curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in - stride_in)));
322 0 : secondcurrPrev = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) - 1 + ((1 + jj)*stride_in - stride_in)));
323 0 : secondcurrNext = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + 1 + ((1 + jj)*stride_in - stride_in)));
324 0 : bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in - stride_in));
325 0 : secondbottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (2 + jj)* stride_in - stride_in));
326 0 : ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut - strideOut);
327 0 : ptrNoiseInterm = ptr_noise + kk + jj * strideOut;
328 : }
329 :
330 0 : luma_weak_filter_avx2_intrin(
331 : top,
332 : curr,
333 : bottom,
334 : curr_prev,
335 : curr_next,
336 : ptrDenoisedInterm,
337 : ptrNoiseInterm);
338 :
339 0 : luma_weak_filter_avx2_intrin(
340 : secondtop,
341 : secondcurr,
342 : secondbottom,
343 : secondcurrPrev,
344 : secondcurrNext,
345 : ptrDenoisedInterm + 32,
346 : ptrNoiseInterm + 32);
347 :
348 0 : top = curr;
349 0 : curr = bottom;
350 0 : secondtop = secondcurr;
351 0 : secondcurr = secondbottom;
352 : }
353 : }
354 :
355 0 : sb_height = MIN(BLOCK_SIZE_64, picHeight - sb_origin_y);
356 :
357 0 : for (jj = 0; jj < sb_height; jj++) {
358 0 : for (ii = 0; ii < picWidth; ii++) {
359 0 : if (!((jj < sb_height - 1 || sb_origin_y + sb_height < picHeight) && ii > 0 && ii < picWidth - 1)) {
360 0 : ptr_denoised[ii + jj * strideOut] = ptrIn[ii + jj * stride_in];
361 0 : ptr_noise[ii + jj * strideOut] = 0;
362 : }
363 : }
364 : }
365 : }
366 0 : }
367 :
/*
 * Per-LCU variant of the weak luma filter: identical 3x3 weak filtering to
 * noise_extract_luma_weak_avx2_intrin, but restricted to the single 64x64
 * SB at (sb_origin_x, sb_origin_y) instead of sweeping the full picture
 * width — both input and output pointers are offset by sb_origin_x and the
 * width loop is removed.
 *
 * Writes filtered pixels to denoised_picture_ptr->buffer_y and residual
 * noise to noise_picture_ptr->buffer_y.
 *
 * NOTE(review): noiseOriginIndex includes sb_origin_x but not sb_origin_y,
 * so noise rows are SB-relative vertically — confirm intended, same as the
 * full-width variant.
 */
void noise_extract_luma_weak_lcu_avx2_intrin(
    EbPictureBufferDesc *input_picture_ptr,
    EbPictureBufferDesc *denoised_picture_ptr,
    EbPictureBufferDesc *noise_picture_ptr,
    uint32_t sb_origin_y,
    uint32_t sb_origin_x
)
{
    uint32_t ii, jj;
    uint32_t picHeight, sb_height;
    uint32_t picWidth, sb_width;
    uint32_t inputOriginIndex;
    uint32_t inputOriginIndexPad;
    uint32_t noiseOriginIndex;

    uint8_t *ptrIn;
    uint32_t stride_in;
    uint8_t *ptr_denoised, *ptrDenoisedInterm;

    uint8_t *ptr_noise, *ptrNoiseInterm;
    uint32_t strideOut;

    __m256i top, curr, bottom, curr_prev, curr_next,
        secondtop, secondcurr, secondbottom, secondcurrPrev, secondcurrNext;
    (void)sb_origin_x;  /* silences -Wunused in some builds; sb_origin_x IS used below */

    //Luma
    {
        picHeight = input_picture_ptr->height;
        picWidth = input_picture_ptr->width;
        sb_height = MIN(BLOCK_SIZE_64, picHeight - sb_origin_y);
        sb_width = MIN(BLOCK_SIZE_64, picWidth - sb_origin_x);
        /* Shrink by one row at the picture's top/bottom edge; borders are
         * handled by the scalar pass below. */
        sb_height = ((sb_origin_y + BLOCK_SIZE_64 >= picHeight) || (sb_origin_y == 0)) ? sb_height - 1 : sb_height;
        stride_in = input_picture_ptr->stride_y;
        inputOriginIndex = input_picture_ptr->origin_x + sb_origin_x + (input_picture_ptr->origin_y + sb_origin_y) * input_picture_ptr->stride_y;
        ptrIn = &(input_picture_ptr->buffer_y[inputOriginIndex]);

        inputOriginIndexPad = denoised_picture_ptr->origin_x + sb_origin_x + (denoised_picture_ptr->origin_y + sb_origin_y) * denoised_picture_ptr->stride_y;
        strideOut = denoised_picture_ptr->stride_y;
        ptr_denoised = &(denoised_picture_ptr->buffer_y[inputOriginIndexPad]);
        ptrDenoisedInterm = ptr_denoised;

        /* sb_origin_x applied, sb_origin_y omitted: noise rows SB-relative. */
        noiseOriginIndex = noise_picture_ptr->origin_x + sb_origin_x + noise_picture_ptr->origin_y * noise_picture_ptr->stride_y;
        ptr_noise = &(noise_picture_ptr->buffer_y[noiseOriginIndex]);
        ptrNoiseInterm = ptr_noise;

        ////Luma
        //a = (p[1] +
        //    p[0 + stride] + 4 * p[1 + stride] + p[2 + stride] +
        //    p[1 + 2 * stride]) / 8;

        top = curr = secondtop = secondcurr = _mm256_setzero_si256();

        //for (kk = 0; kk + BLOCK_SIZE_64 <= picWidth; kk += BLOCK_SIZE_64)
        {
            for (jj = 0; jj < sb_height; jj++)
            {
                if (sb_origin_y == 0)
                {
                    /* Top picture edge: pass row 0 through unfiltered,
                     * zero its noise, start filtering at row 1. */
                    if (jj == 0)
                    {
                        top = _mm256_loadu_si256((__m256i*)(ptrIn + jj * stride_in));
                        secondtop = _mm256_loadu_si256((__m256i*)(ptrIn + 32 + jj * stride_in));
                        curr = _mm256_loadu_si256((__m256i*)(ptrIn + (1 + jj)*stride_in));
                        secondcurr = _mm256_loadu_si256((__m256i*)((ptrIn + 32) + (1 + jj)*stride_in));
                        _mm256_storeu_si256((__m256i *)(ptr_denoised), top);
                        _mm256_storeu_si256((__m256i *)(ptr_denoised + 32), secondtop);
                        _mm256_storeu_si256((__m256i *)(ptr_noise), _mm256_setzero_si256());
                        _mm256_storeu_si256((__m256i *)(ptr_noise + 32), _mm256_setzero_si256());
                    }
                    curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + ((1 + jj)*stride_in)));
                    curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + ((1 + jj)*stride_in)));
                    secondcurrPrev = _mm256_loadu_si256((__m256i*)((ptrIn + 32) - 1 + ((1 + jj)*stride_in)));
                    secondcurrNext = _mm256_loadu_si256((__m256i*)((ptrIn + 32) + 1 + ((1 + jj)*stride_in)));
                    bottom = _mm256_loadu_si256((__m256i*)((ptrIn)+(2 + jj)* stride_in));
                    secondbottom = _mm256_loadu_si256((__m256i*)((ptrIn + 32) + (2 + jj)* stride_in));
                    ptrDenoisedInterm = ptr_denoised + ((1 + jj)*strideOut);
                    ptrNoiseInterm = ptr_noise + ((1 + jj)*strideOut);
                }
                else
                {
                    /* Interior/bottom SB: window reaches one row above. */
                    if (jj == 0)
                    {
                        top = _mm256_loadu_si256((__m256i*)(ptrIn + jj * stride_in - stride_in));
                        secondtop = _mm256_loadu_si256((__m256i*)(ptrIn + 32 + jj * stride_in - stride_in));
                        curr = _mm256_loadu_si256((__m256i*)(ptrIn + (1 + jj)*stride_in - stride_in));
                        secondcurr = _mm256_loadu_si256((__m256i*)((ptrIn + 32) + (1 + jj)*stride_in - stride_in));
                    }
                    curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + ((1 + jj)*stride_in - stride_in)));
                    curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + ((1 + jj)*stride_in - stride_in)));
                    secondcurrPrev = _mm256_loadu_si256((__m256i*)((ptrIn + 32) - 1 + ((1 + jj)*stride_in - stride_in)));
                    secondcurrNext = _mm256_loadu_si256((__m256i*)((ptrIn + 32) + 1 + ((1 + jj)*stride_in - stride_in)));
                    bottom = _mm256_loadu_si256((__m256i*)((ptrIn)+(2 + jj)* stride_in - stride_in));
                    secondbottom = _mm256_loadu_si256((__m256i*)((ptrIn + 32) + (2 + jj)* stride_in - stride_in));
                    ptrDenoisedInterm = ptr_denoised + ((1 + jj)*strideOut - strideOut);
                    ptrNoiseInterm = ptr_noise + jj * strideOut;  /* SB-relative noise row */
                }

                /* First 32 pixels of the SB row. */
                luma_weak_filter_avx2_intrin(
                    top,
                    curr,
                    bottom,
                    curr_prev,
                    curr_next,
                    ptrDenoisedInterm,
                    ptrNoiseInterm);

                /* Second 32 pixels. */
                luma_weak_filter_avx2_intrin(
                    secondtop,
                    secondcurr,
                    secondbottom,
                    secondcurrPrev,
                    secondcurrNext,
                    ptrDenoisedInterm + 32,
                    ptrNoiseInterm + 32);

                /* Slide the 3-row window down by one row. */
                top = curr;
                curr = bottom;
                secondtop = secondcurr;
                secondcurr = secondbottom;
            }
        }

        sb_height = MIN(BLOCK_SIZE_64, picHeight - sb_origin_y);

        /* Scalar border pass, restricted to this SB: picture-edge pixels
         * are copied through unfiltered with zero noise. */
        for (jj = 0; jj < sb_height; jj++) {
            for (ii = 0; ii < sb_width; ii++) {
                if (!((jj > 0 || sb_origin_y > 0) && (jj < sb_height - 1 || sb_origin_y + sb_height < picHeight) && (ii > 0 || sb_origin_x > 0) && (ii + sb_origin_x) < picWidth - 1)) {
                    ptr_denoised[ii + jj * strideOut] = ptrIn[ii + jj * stride_in];
                    ptr_noise[ii + jj * strideOut] = 0;
                }
            }
        }
    }
}
503 : /*******************************************
504 : * noise_extract_luma_strong
505 : * strong filter Luma.
506 : *******************************************/
507 0 : void noise_extract_luma_strong_avx2_intrin(
508 : EbPictureBufferDesc *input_picture_ptr,
509 : EbPictureBufferDesc *denoised_picture_ptr,
510 : uint32_t sb_origin_y,
511 : uint32_t sb_origin_x
512 : )
513 : {
514 : uint32_t ii, jj, kk;
515 : uint32_t picHeight, sb_height;
516 : uint32_t picWidth;
517 : uint32_t inputOriginIndex;
518 : uint32_t inputOriginIndexPad;
519 :
520 : uint8_t *ptrIn;
521 : uint32_t stride_in;
522 : uint8_t *ptr_denoised, *ptrDenoisedInterm;
523 :
524 : uint32_t strideOut;
525 : __m256i top, curr, bottom, curr_prev, curr_next, top_prev, top_next, bottom_prev, bottom_next,
526 : secondtop, secondcurr, secondcurrPrev, secondcurrNext, secondbottom, secondtopPrev, secondtopNext, secondbottomPrev, secondbottomNext;
527 : (void)sb_origin_x;
528 : //Luma
529 : {
530 0 : picHeight = input_picture_ptr->height;
531 0 : picWidth = input_picture_ptr->width;
532 0 : sb_height = MIN(BLOCK_SIZE_64, picHeight - sb_origin_y);
533 :
534 0 : sb_height = ((sb_origin_y + BLOCK_SIZE_64 >= picHeight) || (sb_origin_y == 0)) ? sb_height - 1 : sb_height;
535 0 : stride_in = input_picture_ptr->stride_y;
536 0 : inputOriginIndex = input_picture_ptr->origin_x + (input_picture_ptr->origin_y + sb_origin_y)* input_picture_ptr->stride_y;
537 0 : ptrIn = &(input_picture_ptr->buffer_y[inputOriginIndex]);
538 :
539 0 : inputOriginIndexPad = denoised_picture_ptr->origin_x + (denoised_picture_ptr->origin_y + sb_origin_y) * denoised_picture_ptr->stride_y;
540 0 : strideOut = denoised_picture_ptr->stride_y;
541 0 : ptr_denoised = &(denoised_picture_ptr->buffer_y[inputOriginIndexPad]);
542 0 : ptrDenoisedInterm = ptr_denoised;
543 :
544 0 : top = curr = secondtop = secondcurr = top_next = top_prev = curr_next = curr_prev = secondcurrPrev = secondcurrNext = secondtopPrev = secondtopNext = _mm256_setzero_si256();
545 0 : for (kk = 0; kk + BLOCK_SIZE_64 <= picWidth; kk += BLOCK_SIZE_64)
546 : {
547 0 : for (jj = 0; jj < sb_height; jj++)
548 : {
549 0 : if (sb_origin_y == 0)
550 : {
551 0 : if (jj == 0)
552 : {
553 0 : top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in));
554 0 : secondtop = _mm256_loadu_si256((__m256i*)(ptrIn + kk + 32 + jj * stride_in));
555 :
556 0 : curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in));
557 0 : secondcurr = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (1 + jj)*stride_in));
558 :
559 0 : top_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((jj)*stride_in)));
560 0 : secondtopPrev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + 32 + ((jj)*stride_in)));
561 :
562 0 : top_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((jj)*stride_in)));
563 0 : secondtopNext = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + 32 + ((jj)*stride_in)));
564 :
565 0 : curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in)));
566 0 : secondcurrPrev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + 32 + ((1 + jj)*stride_in)));
567 :
568 0 : curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in)));
569 0 : secondcurrNext = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + 32 + ((1 + jj)*stride_in)));
570 :
571 0 : _mm256_storeu_si256((__m256i *)(ptr_denoised + kk), top);
572 0 : _mm256_storeu_si256((__m256i *)(ptr_denoised + kk + 32), secondtop);
573 : }
574 0 : bottom_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((2 + jj)*stride_in)));
575 0 : secondbottomPrev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + 32 + ((2 + jj)*stride_in)));
576 :
577 0 : bottom_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((2 + jj)*stride_in)));
578 0 : secondbottomNext = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + 32 + ((2 + jj)*stride_in)));
579 :
580 0 : bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in));
581 0 : secondbottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (2 + jj)* stride_in));
582 0 : ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut);
583 : }
584 : else
585 : {
586 0 : if (jj == 0)
587 : {
588 0 : top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in - stride_in));
589 0 : secondtop = _mm256_loadu_si256((__m256i*)(ptrIn + kk + 32 + jj * stride_in - stride_in));
590 0 : curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in - stride_in));
591 0 : secondcurr = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (1 + jj)*stride_in - stride_in));
592 0 : top_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((jj)*stride_in) - stride_in));
593 0 : secondtopPrev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + 32 + ((jj)*stride_in) - stride_in));
594 :
595 0 : top_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((jj)*stride_in) - stride_in));
596 0 : secondtopNext = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + 32 + ((jj)*stride_in) - stride_in));
597 :
598 0 : curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in - stride_in)));
599 0 : secondcurrPrev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + 32 + ((1 + jj)*stride_in - stride_in)));
600 :
601 0 : curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in - stride_in)));
602 0 : secondcurrNext = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + 32 + ((1 + jj)*stride_in - stride_in)));
603 : }
604 0 : bottom_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((2 + jj)*stride_in) - stride_in));
605 0 : secondbottomPrev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + 32 + ((2 + jj)*stride_in - stride_in)));
606 :
607 0 : bottom_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((2 + jj)*stride_in) - stride_in));
608 0 : secondbottomNext = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + 32 + ((2 + jj)*stride_in - stride_in)));
609 :
610 0 : bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in - stride_in));
611 0 : secondbottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (2 + jj)* stride_in - stride_in));
612 :
613 0 : ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut - strideOut);
614 : }
615 :
616 0 : chroma_weak_luma_strong_filter_avx2_intrin(
617 : top,
618 : curr,
619 : bottom,
620 : curr_prev,
621 : curr_next,
622 : top_prev,
623 : top_next,
624 : bottom_prev,
625 : bottom_next,
626 : ptrDenoisedInterm);
627 :
628 0 : chroma_weak_luma_strong_filter_avx2_intrin(
629 : secondtop,
630 : secondcurr,
631 : secondbottom,
632 : secondcurrPrev,
633 : secondcurrNext,
634 : secondtopPrev,
635 : secondtopNext,
636 : secondbottomPrev,
637 : secondbottomNext,
638 : ptrDenoisedInterm + 32);
639 :
640 0 : top = curr;
641 0 : curr = bottom;
642 0 : top_prev = curr_prev;
643 0 : top_next = curr_next;
644 0 : curr_prev = bottom_prev;
645 0 : curr_next = bottom_next;
646 0 : secondtop = secondcurr;
647 0 : secondcurr = secondbottom;
648 0 : secondtopPrev = secondcurrPrev;
649 0 : secondtopNext = secondcurrNext;
650 0 : secondcurrPrev = secondbottomPrev;
651 0 : secondcurrNext = secondbottomNext;
652 : }
653 : }
654 :
655 0 : sb_height = MIN(BLOCK_SIZE_64, picHeight - sb_origin_y);
656 :
657 0 : for (jj = 0; jj < sb_height; jj++) {
658 0 : for (ii = 0; ii < picWidth; ii++) {
659 0 : if (!((jj < sb_height - 1 || sb_origin_y + sb_height < picHeight) && ii > 0 && ii < picWidth - 1))
660 0 : ptr_denoised[ii + jj * strideOut] = ptrIn[ii + jj * stride_in];
661 : }
662 : }
663 : }
664 0 : }
665 :
666 : /*******************************************
667 : * noise_extract_chroma_strong
668 : * strong filter chroma.
669 : *******************************************/
670 0 : void noise_extract_chroma_strong_avx2_intrin(
671 : EbPictureBufferDesc *input_picture_ptr,
672 : EbPictureBufferDesc *denoised_picture_ptr,
673 : uint32_t sb_origin_y,
674 : uint32_t sb_origin_x
675 : )
676 : {
677 : uint32_t ii, jj, kk;
678 : uint32_t picHeight, sb_height;
679 : uint32_t picWidth;
680 : uint32_t inputOriginIndex;
681 : uint32_t inputOriginIndexPad;
682 :
683 : uint8_t *ptrIn, *ptrInCr;
684 : uint32_t stride_in, strideInCr;
685 : uint8_t *ptr_denoised, *ptrDenoisedInterm, *ptrDenoisedCr, *ptrDenoisedIntermCr;
686 :
687 : uint32_t strideOut, strideOutCr;
688 : __m256i top, curr, bottom, curr_prev, curr_next, top_prev, top_next, bottom_prev, bottom_next,
689 : topCr, currCr, bottomCr, currPrevCr, currNextCr, topPrevCr, topNextCr, bottomPrevCr, bottomNextCr;
690 : (void)sb_origin_x;
691 : {
692 0 : picHeight = input_picture_ptr->height / 2;
693 0 : picWidth = input_picture_ptr->width / 2;
694 0 : sb_height = MIN(BLOCK_SIZE_64 / 2, picHeight - sb_origin_y);
695 :
696 0 : sb_height = ((sb_origin_y + BLOCK_SIZE_64 / 2 >= picHeight) || (sb_origin_y == 0)) ? sb_height - 1 : sb_height;
697 :
698 0 : stride_in = input_picture_ptr->stride_cb;
699 0 : inputOriginIndex = input_picture_ptr->origin_x / 2 + (input_picture_ptr->origin_y / 2 + sb_origin_y) * input_picture_ptr->stride_cb;
700 0 : ptrIn = &(input_picture_ptr->buffer_cb[inputOriginIndex]);
701 :
702 0 : inputOriginIndexPad = denoised_picture_ptr->origin_x / 2 + (denoised_picture_ptr->origin_y / 2 + sb_origin_y) * denoised_picture_ptr->stride_cb;
703 0 : strideOut = denoised_picture_ptr->stride_cb;
704 0 : ptr_denoised = &(denoised_picture_ptr->buffer_cb[inputOriginIndexPad]);
705 0 : ptrDenoisedInterm = ptr_denoised;
706 :
707 0 : strideInCr = input_picture_ptr->stride_cr;
708 0 : inputOriginIndex = input_picture_ptr->origin_x / 2 + (input_picture_ptr->origin_y / 2 + sb_origin_y) * input_picture_ptr->stride_cr;
709 0 : ptrInCr = &(input_picture_ptr->buffer_cr[inputOriginIndex]);
710 :
711 0 : inputOriginIndexPad = denoised_picture_ptr->origin_x / 2 + (denoised_picture_ptr->origin_y / 2 + sb_origin_y) * denoised_picture_ptr->stride_cr;
712 0 : strideOutCr = denoised_picture_ptr->stride_cr;
713 0 : ptrDenoisedCr = &(denoised_picture_ptr->buffer_cr[inputOriginIndexPad]);
714 0 : ptrDenoisedIntermCr = ptrDenoisedCr;
715 : ////Chroma
716 : //a = (4 * p[0] + 4 * p[1] + 4 * p[2] +
717 : // 4 * p[0 + stride] + 4 * p[1 + stride] + 4 * p[2 + stride] +
718 : // 4 * p[0 + 2 * stride] + 4 * p[1 + 2 * stride] + 4 * p[2 + 2 * stride]) / 36;
719 :
720 0 : top = curr = top_next = top_prev = curr_next = curr_prev = topCr = currCr = topNextCr = topPrevCr = currNextCr = currPrevCr = _mm256_setzero_si256();
721 :
722 0 : for (kk = 0; kk + BLOCK_SIZE_64 / 2 <= picWidth; kk += BLOCK_SIZE_64 / 2)
723 : {
724 0 : for (jj = 0; jj < sb_height; jj++)
725 : {
726 0 : if (sb_origin_y == 0)
727 : {
728 0 : if (jj == 0)
729 : {
730 0 : top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in));
731 0 : curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in));
732 0 : top_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((jj)*stride_in)));
733 0 : top_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((jj)*stride_in)));
734 0 : curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in)));
735 0 : curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in)));
736 0 : topCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + jj * strideInCr));
737 0 : currCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + (1 + jj)*strideInCr));
738 0 : topPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((jj)*strideInCr)));
739 0 : topNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((jj)*strideInCr)));
740 0 : currPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((1 + jj)*strideInCr)));
741 0 : currNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((1 + jj)*strideInCr)));
742 0 : _mm256_storeu_si256((__m256i *)(ptr_denoised + kk), top);
743 0 : _mm256_storeu_si256((__m256i *)(ptrDenoisedCr + kk), topCr);
744 : }
745 0 : bottom_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((2 + jj)*stride_in)));
746 0 : bottom_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((2 + jj)*stride_in)));
747 0 : bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in));
748 0 : bottomPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((2 + jj)*strideInCr)));
749 0 : bottomNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((2 + jj)*strideInCr)));
750 0 : bottomCr = _mm256_loadu_si256((__m256i*)((ptrInCr + kk) + (2 + jj)* strideInCr));
751 0 : ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut);
752 0 : ptrDenoisedIntermCr = ptrDenoisedCr + kk + ((1 + jj)*strideOutCr);
753 : }
754 : else
755 : {
756 0 : if (jj == 0)
757 : {
758 0 : top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in - stride_in));
759 0 : curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in - stride_in));
760 0 : top_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((jj)*stride_in) - stride_in));
761 0 : top_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((jj)*stride_in) - stride_in));
762 0 : curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in - stride_in)));
763 0 : curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in - stride_in)));
764 0 : topCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + jj * strideInCr - strideInCr));
765 0 : currCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + (1 + jj)*strideInCr - strideInCr));
766 0 : topPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((jj)*strideInCr) - strideInCr));
767 0 : topNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((jj)*strideInCr) - strideInCr));
768 0 : currPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((1 + jj)*strideInCr - strideInCr)));
769 0 : currNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((1 + jj)*strideInCr - strideInCr)));
770 : }
771 0 : bottom_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((2 + jj)*stride_in) - stride_in));
772 0 : bottom_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((2 + jj)*stride_in) - stride_in));
773 0 : bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in - stride_in));
774 0 : ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut - strideOut);
775 0 : bottomPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((2 + jj)*strideInCr) - strideInCr));
776 0 : bottomNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((2 + jj)*strideInCr) - strideInCr));
777 0 : bottomCr = _mm256_loadu_si256((__m256i*)((ptrInCr + kk) + (2 + jj)* strideInCr - strideInCr));
778 0 : ptrDenoisedIntermCr = ptrDenoisedCr + kk + ((1 + jj)*strideOutCr - strideOutCr);
779 : }
780 :
781 0 : chroma_strong_avx2_intrin(
782 : top,
783 : curr,
784 : bottom,
785 : curr_prev,
786 : curr_next,
787 : top_prev,
788 : top_next,
789 : bottom_prev,
790 : bottom_next,
791 : ptrDenoisedInterm);
792 :
793 0 : chroma_strong_avx2_intrin(
794 : topCr,
795 : currCr,
796 : bottomCr,
797 : currPrevCr,
798 : currNextCr,
799 : topPrevCr,
800 : topNextCr,
801 : bottomPrevCr,
802 : bottomNextCr,
803 : ptrDenoisedIntermCr);
804 :
805 0 : top = curr;
806 0 : curr = bottom;
807 0 : top_prev = curr_prev;
808 0 : top_next = curr_next;
809 0 : curr_prev = bottom_prev;
810 0 : curr_next = bottom_next;
811 0 : topCr = currCr;
812 0 : currCr = bottomCr;
813 0 : topPrevCr = currPrevCr;
814 0 : topNextCr = currNextCr;
815 0 : currPrevCr = bottomPrevCr;
816 0 : currNextCr = bottomNextCr;
817 : }
818 : }
819 :
820 0 : sb_height = MIN(BLOCK_SIZE_64 / 2, picHeight - sb_origin_y);
821 :
822 0 : for (jj = 0; jj < sb_height; jj++) {
823 0 : for (ii = 0; ii < picWidth; ii++) {
824 0 : if (!((jj < sb_height - 1 || (sb_origin_y + sb_height) < picHeight) && ii > 0 && ii < picWidth - 1)) {
825 0 : ptr_denoised[ii + jj * strideOut] = ptrIn[ii + jj * stride_in];
826 0 : ptrDenoisedCr[ii + jj * strideOut] = ptrInCr[ii + jj * stride_in];
827 : }
828 : }
829 : }
830 : }
831 0 : }
832 :
833 : /*******************************************
834 : * noise_extract_chroma_weak
835 : * weak filter chroma.
836 : *******************************************/
837 0 : void noise_extract_chroma_weak_avx2_intrin(
838 : EbPictureBufferDesc *input_picture_ptr,
839 : EbPictureBufferDesc *denoised_picture_ptr,
840 : uint32_t sb_origin_y,
841 : uint32_t sb_origin_x
842 : )
843 : {
844 : uint32_t ii, jj, kk;
845 : uint32_t picHeight, sb_height;
846 : uint32_t picWidth;
847 : uint32_t inputOriginIndex;
848 : uint32_t inputOriginIndexPad;
849 :
850 : uint8_t *ptrIn, *ptrInCr;
851 : uint32_t stride_in, strideInCr;
852 : uint8_t *ptr_denoised, *ptrDenoisedInterm, *ptrDenoisedCr, *ptrDenoisedIntermCr;
853 :
854 : uint32_t strideOut, strideOutCr;
855 :
856 : __m256i top, curr, bottom, curr_prev, curr_next, top_prev, top_next, bottom_prev, bottom_next,
857 : topCr, currCr, bottomCr, currPrevCr, currNextCr, topPrevCr, topNextCr, bottomPrevCr, bottomNextCr;
858 : (void)sb_origin_x;
859 : ////gaussian matrix(Chroma)
860 : //a = (1 * p[0] + 2 * p[1] + 1 * p[2] +
861 : // 2 * p[0 + stride] + 4 * p[1 + stride] + 2 * p[2 + stride] +
862 : // 1 * p[0 + 2 * stride] + 2 * p[1 + 2 * stride] + 1 * p[2 + 2 * stride]) / 16;
863 :
864 : {
865 0 : picHeight = input_picture_ptr->height / 2;
866 0 : picWidth = input_picture_ptr->width / 2;
867 :
868 0 : sb_height = MIN(BLOCK_SIZE_64 / 2, picHeight - sb_origin_y);
869 :
870 0 : sb_height = ((sb_origin_y + BLOCK_SIZE_64 / 2 >= picHeight) || (sb_origin_y == 0)) ? sb_height - 1 : sb_height;
871 0 : stride_in = input_picture_ptr->stride_cb;
872 0 : inputOriginIndex = input_picture_ptr->origin_x / 2 + (input_picture_ptr->origin_y / 2 + sb_origin_y)* input_picture_ptr->stride_cb;
873 0 : ptrIn = &(input_picture_ptr->buffer_cb[inputOriginIndex]);
874 :
875 0 : inputOriginIndexPad = denoised_picture_ptr->origin_x / 2 + (denoised_picture_ptr->origin_y / 2 + sb_origin_y)* denoised_picture_ptr->stride_cb;
876 0 : strideOut = denoised_picture_ptr->stride_cb;
877 0 : ptr_denoised = &(denoised_picture_ptr->buffer_cb[inputOriginIndexPad]);
878 0 : ptrDenoisedInterm = ptr_denoised;
879 :
880 0 : strideInCr = input_picture_ptr->stride_cr;
881 0 : inputOriginIndex = input_picture_ptr->origin_x / 2 + (input_picture_ptr->origin_y / 2 + sb_origin_y) * input_picture_ptr->stride_cr;
882 0 : ptrInCr = &(input_picture_ptr->buffer_cr[inputOriginIndex]);
883 :
884 0 : inputOriginIndexPad = denoised_picture_ptr->origin_x / 2 + (denoised_picture_ptr->origin_y / 2 + sb_origin_y) * denoised_picture_ptr->stride_cr;
885 0 : strideOutCr = denoised_picture_ptr->stride_cr;
886 0 : ptrDenoisedCr = &(denoised_picture_ptr->buffer_cr[inputOriginIndexPad]);
887 0 : ptrDenoisedIntermCr = ptrDenoisedCr;
888 :
889 0 : top = curr = top_next = top_prev = curr_next = curr_prev = topCr = currCr = topNextCr = topPrevCr = currNextCr = currPrevCr = _mm256_setzero_si256();
890 0 : for (kk = 0; kk + BLOCK_SIZE_64 / 2 <= picWidth; kk += BLOCK_SIZE_64 / 2)
891 : {
892 0 : for (jj = 0; jj < sb_height; jj++)
893 : {
894 0 : if (sb_origin_y == 0)
895 : {
896 0 : if (jj == 0)
897 : {
898 0 : top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in));
899 0 : curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in));
900 0 : top_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((jj)*stride_in)));
901 0 : top_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((jj)*stride_in)));
902 0 : curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in)));
903 0 : curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in)));
904 0 : _mm256_storeu_si256((__m256i *)(ptr_denoised + kk), top);
905 0 : topCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + jj * strideInCr));
906 0 : currCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + (1 + jj)*strideInCr));
907 0 : topPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((jj)*strideInCr)));
908 0 : topNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((jj)*strideInCr)));
909 0 : currPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((1 + jj)*strideInCr)));
910 0 : currNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((1 + jj)*strideInCr)));
911 0 : _mm256_storeu_si256((__m256i *)(ptrDenoisedCr + kk), topCr);
912 : }
913 0 : bottom_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((2 + jj)*stride_in)));
914 0 : bottom_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((2 + jj)*stride_in)));
915 0 : bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in));
916 0 : ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut);
917 0 : bottomPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((2 + jj)*strideInCr)));
918 0 : bottomNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((2 + jj)*strideInCr)));
919 0 : bottomCr = _mm256_loadu_si256((__m256i*)((ptrInCr + kk) + (2 + jj)* strideInCr));
920 0 : ptrDenoisedIntermCr = ptrDenoisedCr + kk + ((1 + jj)*strideOutCr);
921 : }
922 : else
923 : {
924 0 : if (jj == 0)
925 : {
926 0 : top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in - stride_in));
927 0 : curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in - stride_in));
928 0 : top_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((jj)*stride_in) - stride_in));
929 0 : top_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((jj)*stride_in) - stride_in));
930 0 : curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in - stride_in)));
931 0 : curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in - stride_in)));
932 0 : topCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + jj * strideInCr - strideInCr));
933 0 : currCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + (1 + jj)*strideInCr - strideInCr));
934 0 : topPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((jj)*strideInCr) - strideInCr));
935 0 : topNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((jj)*strideInCr) - strideInCr));
936 0 : currPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((1 + jj)*strideInCr - strideInCr)));
937 0 : currNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((1 + jj)*strideInCr - strideInCr)));
938 : }
939 0 : bottom_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((2 + jj)*stride_in) - stride_in));
940 0 : bottom_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((2 + jj)*stride_in) - stride_in));
941 0 : bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in - stride_in));
942 0 : ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut - strideOut);
943 0 : bottomPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((2 + jj)*strideInCr) - strideInCr));
944 0 : bottomNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((2 + jj)*strideInCr) - strideInCr));
945 0 : bottomCr = _mm256_loadu_si256((__m256i*)((ptrInCr + kk) + (2 + jj)* strideInCr - strideInCr));
946 0 : ptrDenoisedIntermCr = ptrDenoisedCr + kk + ((1 + jj)*strideOutCr - strideOutCr);
947 : }
948 :
949 0 : chroma_weak_luma_strong_filter_avx2_intrin(
950 : top,
951 : curr,
952 : bottom,
953 : curr_prev,
954 : curr_next,
955 : top_prev,
956 : top_next,
957 : bottom_prev,
958 : bottom_next,
959 : ptrDenoisedInterm);
960 :
961 0 : chroma_weak_luma_strong_filter_avx2_intrin(
962 : topCr,
963 : currCr,
964 : bottomCr,
965 : currPrevCr,
966 : currNextCr,
967 : topPrevCr,
968 : topNextCr,
969 : bottomPrevCr,
970 : bottomNextCr,
971 : ptrDenoisedIntermCr);
972 :
973 0 : top = curr;
974 0 : curr = bottom;
975 0 : top_prev = curr_prev;
976 0 : top_next = curr_next;
977 0 : curr_prev = bottom_prev;
978 0 : curr_next = bottom_next;
979 0 : topCr = currCr;
980 0 : currCr = bottomCr;
981 0 : topPrevCr = currPrevCr;
982 0 : topNextCr = currNextCr;
983 0 : currPrevCr = bottomPrevCr;
984 0 : currNextCr = bottomNextCr;
985 : }
986 : }
987 :
988 0 : sb_height = MIN(BLOCK_SIZE_64 / 2, picHeight - sb_origin_y);
989 0 : for (jj = 0; jj < sb_height; jj++) {
990 0 : for (ii = 0; ii < picWidth; ii++) {
991 0 : if (!((jj < sb_height - 1 || (sb_origin_y + sb_height) < picHeight) && ii > 0 && ii < picWidth - 1)) {
992 0 : ptr_denoised[ii + jj * strideOut] = ptrIn[ii + jj * stride_in];
993 0 : ptrDenoisedCr[ii + jj * strideOut] = ptrInCr[ii + jj * strideInCr];
994 : }
995 : }
996 : }
997 : }
998 0 : }
|