/*
* Copyright(c) 2019 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#ifndef EbPictureOperators_Inline_AVX2_h
#define EbPictureOperators_Inline_AVX2_h

#include <immintrin.h>
#include "EbDefinitions.h"
#include "EbMemory_AVX2.h"
#include "EbPictureOperators_SSE2.h"

#ifdef __cplusplus
extern "C" {
#endif

/* Subtract the prediction from the source for a 4-pixel-wide block,
 * processing 4 rows per iteration. area_height must be a multiple of 4. */
SIMD_INLINE void residual_kernel4_avx2(
    const uint8_t *input, const uint32_t input_stride, const uint8_t *pred,
    const uint32_t pred_stride, int16_t *residual,
    const uint32_t residual_stride, const uint32_t area_height)
{
    const __m256i zero = _mm256_setzero_si256();
    uint32_t y = area_height;

    do {
        const __m256i in = load_u8_4x4_avx2(input, input_stride);
        const __m256i pr = load_u8_4x4_avx2(pred, pred_stride);
        /* Zero-extend the bytes to 16 bits and subtract. */
        const __m256i in_lo = _mm256_unpacklo_epi8(in, zero);
        const __m256i pr_lo = _mm256_unpacklo_epi8(pr, zero);
        const __m256i re_lo = _mm256_sub_epi16(in_lo, pr_lo);
        const __m128i r0 = _mm256_castsi256_si128(re_lo);
        const __m128i r1 = _mm256_extracti128_si256(re_lo, 1);

        store_s16_4x2_sse2(r0, residual + 0 * residual_stride, residual_stride);
        store_s16_4x2_sse2(r1, residual + 2 * residual_stride, residual_stride);

        input += 4 * input_stride;
        pred += 4 * pred_stride;
        residual += 4 * residual_stride;
        y -= 4;
    } while (y);
}

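/*
 * A minimal usage sketch for residual_kernel4_avx2; the wrapper name and the
 * single shared stride are hypothetical, for illustration only. The kernel
 * itself only requires that area_height be a multiple of 4.
 */
static INLINE void residual_4x4_example(const uint8_t *src, const uint8_t *pred,
    int16_t *residual, const uint32_t stride)
{
    /* One 4x4 block: 4 rows of 4 pixels, same stride for all three planes. */
    residual_kernel4_avx2(src, stride, pred, stride, residual, stride, 4);
}
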
/* 8-pixel-wide variant, 4 rows per iteration; area_height must be a multiple
 * of 4. unpacklo/unpackhi pair the loaded rows as (0, 2) and (1, 3), which is
 * why the stores use a 2 * residual_stride stride. */
SIMD_INLINE void residual_kernel8_avx2(
    const uint8_t *input, const uint32_t input_stride, const uint8_t *pred,
    const uint32_t pred_stride, int16_t *residual,
    const uint32_t residual_stride, const uint32_t area_height)
{
    const __m256i zero = _mm256_setzero_si256();
    uint32_t y = area_height;

    do {
        const __m256i in = load_u8_8x4_avx2(input, input_stride);
        const __m256i pr = load_u8_8x4_avx2(pred, pred_stride);
        const __m256i in_lo = _mm256_unpacklo_epi8(in, zero);
        const __m256i in_hi = _mm256_unpackhi_epi8(in, zero);
        const __m256i pr_lo = _mm256_unpacklo_epi8(pr, zero);
        const __m256i pr_hi = _mm256_unpackhi_epi8(pr, zero);
        const __m256i r0 = _mm256_sub_epi16(in_lo, pr_lo);
        const __m256i r1 = _mm256_sub_epi16(in_hi, pr_hi);

        storeu_s16_8x2_avx2(r0, residual + 0 * residual_stride, 2 * residual_stride);
        storeu_s16_8x2_avx2(r1, residual + 1 * residual_stride, 2 * residual_stride);

        input += 4 * input_stride;
        pred += 4 * pred_stride;
        residual += 4 * residual_stride;
        y -= 4;
    } while (y);
}

/* 16-pixel-wide variant, 2 rows per iteration; area_height must be a multiple
 * of 2. The permute reorders the 64-bit lanes so that, after unpacking,
 * re_lo holds row 0 and re_hi holds row 1 as full 16-bit rows. */
SIMD_INLINE void residual_kernel16_avx2(
    const uint8_t *input, const uint32_t input_stride, const uint8_t *pred,
    const uint32_t pred_stride, int16_t *residual,
    const uint32_t residual_stride, const uint32_t area_height)
{
    const __m256i zero = _mm256_setzero_si256();
    uint32_t y = area_height;

    do {
        const __m256i in0 = loadu_u8_16x2_avx2(input, input_stride);
        const __m256i pr0 = loadu_u8_16x2_avx2(pred, pred_stride);
        const __m256i in1 = _mm256_permute4x64_epi64(in0, 0xD8);
        const __m256i pr1 = _mm256_permute4x64_epi64(pr0, 0xD8);
        const __m256i in_lo = _mm256_unpacklo_epi8(in1, zero);
        const __m256i in_hi = _mm256_unpackhi_epi8(in1, zero);
        const __m256i pr_lo = _mm256_unpacklo_epi8(pr1, zero);
        const __m256i pr_hi = _mm256_unpackhi_epi8(pr1, zero);
        const __m256i re_lo = _mm256_sub_epi16(in_lo, pr_lo);
        const __m256i re_hi = _mm256_sub_epi16(in_hi, pr_hi);

        _mm256_storeu_si256((__m256i*)(residual + 0 * residual_stride), re_lo);
        _mm256_storeu_si256((__m256i*)(residual + 1 * residual_stride), re_hi);
        input += 2 * input_stride;
        pred += 2 * pred_stride;
        residual += 2 * residual_stride;
        y -= 2;
    } while (y);
}

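/*
 * A plain-C reference sketch of what the residual_kernel{4,8,16}_avx2 routines
 * above compute: residual[x] = input[x] - pred[x] over an area_width x
 * area_height block. The function name and the explicit area_width parameter
 * are hypothetical; the AVX2 kernels fix the width and require area_height to
 * be a multiple of 4 (widths 4 and 8) or 2 (width 16).
 */
static INLINE void residual_kernel_c_sketch(
    const uint8_t *input, const uint32_t input_stride, const uint8_t *pred,
    const uint32_t pred_stride, int16_t *residual,
    const uint32_t residual_stride, const uint32_t area_width,
    const uint32_t area_height)
{
    uint32_t x, y;

    for (y = 0; y < area_height; y++) {
        for (x = 0; x < area_width; x++)
            residual[x] = (int16_t)input[x] - (int16_t)pred[x];
        input += input_stride;
        pred += pred_stride;
        residual += residual_stride;
    }
}
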
/* Accumulate the squared error of the low 8 bytes of each 128-bit lane of
 * input/recon (16 pixels in total) into the 32-bit lanes of *sum. */
static INLINE void Distortion_AVX2_INTRIN(const __m256i input,
    const __m256i recon, __m256i *const sum) {
    const __m256i in = _mm256_unpacklo_epi8(input, _mm256_setzero_si256());
    const __m256i re = _mm256_unpacklo_epi8(recon, _mm256_setzero_si256());
    const __m256i diff = _mm256_sub_epi16(in, re);
    const __m256i dist = _mm256_madd_epi16(diff, diff);
    *sum = _mm256_add_epi32(*sum, dist);
}

/* Accumulate the SSD of 16 contiguous pixels into the 32-bit lanes of *sum. */
static INLINE void SpatialFullDistortionKernel16_AVX2_INTRIN(
    const uint8_t *const input, const uint8_t *const recon,
    __m256i *const sum)
{
    const __m128i in8 = _mm_loadu_si128((__m128i *)input);
    const __m128i re8 = _mm_loadu_si128((__m128i *)recon);
    const __m256i in16 = _mm256_cvtepu8_epi16(in8);
    const __m256i re16 = _mm256_cvtepu8_epi16(re8);
    const __m256i diff = _mm256_sub_epi16(in16, re16);
    const __m256i dist = _mm256_madd_epi16(diff, diff);
    *sum = _mm256_add_epi32(*sum, dist);
}

/* 32-pixel variant that keeps the low and high unpacked halves in separate
 * accumulators (sum0/sum1). max/min/sub yields the absolute difference
 * without leaving the unsigned 8-bit range. */
static INLINE void SpatialFullDistortionKernel32Leftover_AVX2_INTRIN(
    const uint8_t *const input, const uint8_t *const recon, __m256i *const sum0,
    __m256i *const sum1)
{
    const __m256i in = _mm256_loadu_si256((__m256i *)input);
    const __m256i re = _mm256_loadu_si256((__m256i *)recon);
    const __m256i max = _mm256_max_epu8(in, re);
    const __m256i min = _mm256_min_epu8(in, re);
    const __m256i diff = _mm256_sub_epi8(max, min);
    const __m256i diff_L = _mm256_unpacklo_epi8(diff, _mm256_setzero_si256());
    const __m256i diff_H = _mm256_unpackhi_epi8(diff, _mm256_setzero_si256());
    const __m256i dist_L = _mm256_madd_epi16(diff_L, diff_L);
    const __m256i dist_H = _mm256_madd_epi16(diff_H, diff_H);
    *sum0 = _mm256_add_epi32(*sum0, dist_L);
    *sum1 = _mm256_add_epi32(*sum1, dist_H);
}

/* Accumulate the SSD of 32 contiguous pixels into the 32-bit lanes of *sum,
 * again forming absolute differences with max/min before widening. */
static INLINE void SpatialFullDistortionKernel32_AVX2_INTRIN(
    const uint8_t *const input, const uint8_t *const recon, __m256i *const sum)
{
    const __m256i in = _mm256_loadu_si256((__m256i *)input);
    const __m256i re = _mm256_loadu_si256((__m256i *)recon);
    const __m256i max = _mm256_max_epu8(in, re);
    const __m256i min = _mm256_min_epu8(in, re);
    const __m256i diff = _mm256_sub_epi8(max, min);
    const __m256i diff_L = _mm256_unpacklo_epi8(diff, _mm256_setzero_si256());
    const __m256i diff_H = _mm256_unpackhi_epi8(diff, _mm256_setzero_si256());
    const __m256i dist_L = _mm256_madd_epi16(diff_L, diff_L);
    const __m256i dist_H = _mm256_madd_epi16(diff_H, diff_H);
    const __m256i dist = _mm256_add_epi32(dist_L, dist_H);
    *sum = _mm256_add_epi32(*sum, dist);
}

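/*
 * A plain-C reference sketch of the quantity the spatial full-distortion
 * kernels above accumulate: the sum of squared differences (SSD) between
 * input and recon. The function name and the explicit width parameter are
 * hypothetical; the AVX2 kernels process fixed 16- or 32-pixel chunks and
 * keep the partial sums packed in the 32-bit lanes of *sum.
 */
static INLINE uint64_t spatial_full_distortion_c_sketch(
    const uint8_t *input, const uint8_t *recon, const uint32_t width)
{
    uint64_t ssd = 0;
    uint32_t x;

    for (x = 0; x < width; x++) {
        const int32_t diff = (int32_t)input[x] - (int32_t)recon[x];
        ssd += (uint64_t)(diff * diff);
    }
    return ssd;
}
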
/* Horizontally add the eight 32-bit lanes of src: fold the upper 128-bit half
 * onto the lower one, then finish with the SSE2 helper. */
static INLINE int32_t Hadd32_AVX2_INTRIN(const __m256i src) {
    const __m128i src_L = _mm256_extracti128_si256(src, 0);
    const __m128i src_H = _mm256_extracti128_si256(src, 1);
    const __m128i sum = _mm_add_epi32(src_L, src_H);

    return Hadd32_SSE2_INTRIN(sum);
}

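/*
 * A minimal sketch of how the 32-pixel kernel and Hadd32_AVX2_INTRIN combine:
 * accumulate packed 32-bit partial sums, then reduce them to a scalar. The
 * wrapper name is hypothetical; real callers typically accumulate many rows
 * into sum before the final horizontal add.
 */
static INLINE int32_t ssd_row32_example(const uint8_t *input, const uint8_t *recon)
{
    __m256i sum = _mm256_setzero_si256();

    SpatialFullDistortionKernel32_AVX2_INTRIN(input, recon, &sum);
    return Hadd32_AVX2_INTRIN(sum); /* eight 32-bit lanes -> one int32_t */
}
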
#ifdef __cplusplus
}
#endif

#endif // EbPictureOperators_Inline_AVX2_h