Line data Source code
1 : /*
2 : * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 :
13 : #include "EbDefinitions.h"
14 : // #include <immintrin.h>
15 : // #include "convolve.h"
16 : #include "aom_dsp_rtcd.h"
17 : // #include "convolve_avx2.h"
18 : // #include "EbInterPrediction.h"
19 : // #include "EbMemory_AVX2.h"
20 : #include "synonyms.h"
21 :
22 : #include <assert.h>
23 : #include <immintrin.h>
24 :
25 : // #include "config/aom_config.h"
26 :
27 : // #include "mem.h"
28 : // #include "aom_integer.h"
29 :
30 : // #include "aom_dsp_common.h"
31 : // #include "obmc_intrinsic_ssse3.h"
32 : #include "synonyms.h"
33 :
34 : // Loads and stores to do away with the tedium of casting the address
35 : // to the right type.
36 0 : static INLINE __m128i xx_loadl_32(const void *a) {
37 : int val;
38 0 : memcpy(&val, a, sizeof(val));
39 0 : return _mm_cvtsi32_si128(val);
40 : }
41 :
42 16366300 : static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
43 16366300 : v_d = _mm_hadd_epi32(v_d, v_d);
44 16366300 : v_d = _mm_hadd_epi32(v_d, v_d);
45 16366300 : return _mm_cvtsi128_si32(v_d);
46 : }
47 : ////////////////////////////////////////////////////////////////////////////////
48 : // 8 bit
49 : ////////////////////////////////////////////////////////////////////////////////
50 :
51 0 : static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
52 : const int pre_stride,
53 : const int32_t *wsrc,
54 : const int32_t *mask,
55 : const int height) {
56 0 : int n = 0;
57 0 : __m256i v_sad_d = _mm256_setzero_si256();
58 0 : const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
59 :
60 : do {
61 0 : const __m128i v_p_b_0 = xx_loadl_32(pre);
62 0 : const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
63 0 : const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
64 0 : const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
65 0 : const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
66 :
67 0 : const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
68 :
69 : // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
70 : // boundaries. We use pmaddwd, as it has lower latency on Haswell
71 : // than pmulld but produces the same result with these inputs.
72 0 : const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
73 :
74 0 : const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
75 0 : const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
76 :
77 : // Rounded absolute difference
78 0 : const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
79 0 : const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
80 :
81 0 : v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
82 :
83 0 : n += 8;
84 0 : pre += pre_stride << 1;
85 0 : } while (n < 8 * (height >> 1));
86 :
87 0 : __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
88 0 : __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
89 0 : v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
90 0 : return xx_hsum_epi32_si32(v_sad_d_0);
91 : }
92 :
93 16363000 : static INLINE unsigned int obmc_sad_w8n_avx2(
94 : const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
95 : const int32_t *mask, const int width, const int height) {
96 16363000 : const int pre_step = pre_stride - width;
97 16363000 : int n = 0;
98 16363000 : __m256i v_sad_d = _mm256_setzero_si256();
99 16363000 : const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
100 16363000 : assert(width >= 8);
101 16363000 : assert(IS_POWER_OF_TWO(width));
102 :
103 : do {
104 894102000 : const __m128i v_p0_b = xx_loadl_64(pre + n);
105 894032000 : const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
106 1788160000 : const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
107 :
108 894106000 : const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);
109 :
110 : // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
111 : // boundaries. We use pmaddwd, as it has lower latency on Haswell
112 : // than pmulld but produces the same result with these inputs.
113 894106000 : const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
114 :
115 894106000 : const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
116 894106000 : const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
117 :
118 : // Rounded absolute difference
119 894106000 : const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
120 894106000 : const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
121 :
122 894106000 : v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
123 :
124 894106000 : n += 8;
125 :
126 894106000 : if ((n & (width - 1)) == 0) pre += pre_step;
127 894106000 : } while (n < width * height);
128 :
129 16366900 : __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
130 16366900 : __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
131 16366900 : v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
132 16366900 : return xx_hsum_epi32_si32(v_sad_d_0);
133 : }
134 :
// Emits the definition of aom_obmc_sad<w>x<h>_avx2(). Width 4 is routed to
// the dedicated 4-wide kernel; every other width goes to the kernel that
// handles widths of 8 and above. `w` and `h` must be literal integers
// because they are pasted into the function name.
#define OBMCSADWXH(w, h)                                                     \
    unsigned int aom_obmc_sad##w##x##h##_avx2(                               \
        const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
        const int32_t *msk) {                                                \
        return ((w) == 4)                                                    \
                   ? obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, (h))       \
                   : obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, (w), (h)); \
    }
145 :
146 0 : OBMCSADWXH(128, 128)
147 0 : OBMCSADWXH(128, 64)
148 0 : OBMCSADWXH(64, 128)
149 291507 : OBMCSADWXH(64, 64)
150 315512 : OBMCSADWXH(64, 32)
151 369262 : OBMCSADWXH(32, 64)
152 850705 : OBMCSADWXH(32, 32)
153 781635 : OBMCSADWXH(32, 16)
154 776484 : OBMCSADWXH(16, 32)
155 1960860 : OBMCSADWXH(16, 16)
156 1865820 : OBMCSADWXH(16, 8)
157 2207290 : OBMCSADWXH(8, 16)
158 3437330 : OBMCSADWXH(8, 8)
159 0 : OBMCSADWXH(8, 4)
160 0 : OBMCSADWXH(4, 8)
161 0 : OBMCSADWXH(4, 4)
162 0 : OBMCSADWXH(4, 16)
163 0 : OBMCSADWXH(16, 4)
164 1195760 : OBMCSADWXH(8, 32)
165 1204250 : OBMCSADWXH(32, 8)
166 598090 : OBMCSADWXH(16, 64)
167 515683 : OBMCSADWXH(64, 16)
|