Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : #include "EbMeSadCalculation_SSE2.h"
7 : #include <emmintrin.h>
8 : #include <stdint.h>
9 :
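    // Helper: accumulates the SADs of two horizontally adjacent 8x8 blocks over four
    // alternate rows. _mm_sad_epu8 sums the absolute differences of 16 bytes into two
    // 64-bit lanes, one per 8-byte half, so each call adds one partial 8x8 SAD per lane.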
10 0 : static INLINE void sad8x4x2_sse2_intrin(const uint8_t *src,
11 : const uint32_t src_stride, const uint8_t *ref, const uint32_t ref_stride,
12 : __m128i *sad8x8)
13 : {
14 0 : *sad8x8 = _mm_add_epi32(*sad8x8, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + 0 * src_stride)), _mm_loadu_si128((__m128i*)(ref + 0 * ref_stride))));
15 0 : *sad8x8 = _mm_add_epi32(*sad8x8, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + 2 * src_stride)), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride))));
16 0 : *sad8x8 = _mm_add_epi32(*sad8x8, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + 4 * src_stride)), _mm_loadu_si128((__m128i*)(ref + 4 * ref_stride))));
17 0 : *sad8x8 = _mm_add_epi32(*sad8x8, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + 6 * src_stride)), _mm_loadu_si128((__m128i*)(ref + 6 * ref_stride))));
18 0 : }
19 :
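    // Computes the four 8x8 SADs and the 16x16 SAD for one candidate position and
    // updates the per-block best SAD / best MV arrays where the new SAD is smaller.
    // When sub_sad is set, only every other row is measured and the result is doubled.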
20 0 : void sad_calculation_8x8_16x16_sse2_intrin(
21 : uint8_t *src,
22 : uint32_t src_stride,
23 : uint8_t *ref,
24 : uint32_t ref_stride,
25 : uint32_t *p_best_sad8x8,
26 : uint32_t *p_best_sad16x16,
27 : uint32_t *p_best_mv8x8,
28 : uint32_t *p_best_mv16x16,
29 : uint32_t mv,
30 : uint32_t *p_sad16x16,
31 : EbBool sub_sad)
32 : {
33 : __m128i xmm_sad16x16, xmm_sad8x8[2], xmm_sad16x16_total, sad8x8_0_3, sad8x8_less_than_bitmask, xmm_N1;
34 : __m128i sad8x8_greater_or_eq_bitmask, BestMV8x8, BestSad8x8, xmm_pBestSad8x8, xmm_pBestMV8x8, xmm_mv;
35 :
36 0 : xmm_sad8x8[0] = xmm_sad8x8[1] = _mm_setzero_si128();
37 :
38 : //sad8x8_0, sad8x8_1
39 0 : sad8x4x2_sse2_intrin(src + 0 * src_stride, src_stride, ref + 0 * ref_stride, ref_stride, &xmm_sad8x8[0]);
40 :
41 : //sad8x8_2, sad8x8_3
42 0 : sad8x4x2_sse2_intrin(src + 8 * src_stride, src_stride, ref + 8 * ref_stride, ref_stride, &xmm_sad8x8[1]);
43 :
44 0 : if (sub_sad) {
45 0 : xmm_sad8x8[0] = _mm_slli_epi32(xmm_sad8x8[0], 1);
46 0 : xmm_sad8x8[1] = _mm_slli_epi32(xmm_sad8x8[1], 1);
47 : }
48 : else {
49 : //sad8x8_0, sad8x8_1
50 0 : sad8x4x2_sse2_intrin(src + 1 * src_stride, src_stride, ref + 1 * ref_stride, ref_stride, &xmm_sad8x8[0]);
51 :
52 : //sad8x8_2, sad8x8_3
53 0 : sad8x4x2_sse2_intrin(src + 9 * src_stride, src_stride, ref + 9 * ref_stride, ref_stride, &xmm_sad8x8[1]);
54 : }
55 :
56 0 : xmm_sad16x16 = _mm_add_epi32(xmm_sad8x8[0], xmm_sad8x8[1]);
57 0 : xmm_sad16x16_total = _mm_add_epi32(_mm_srli_si128(xmm_sad16x16, 8), xmm_sad16x16);
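    // Fold the two 64-bit lanes together: after the 8-byte shift and add, the low
    // 32 bits hold the sum of all four 8x8 SADs, i.e. the 16x16 SAD.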
58 :
59 0 : *p_sad16x16 = _mm_cvtsi128_si32(xmm_sad16x16_total);
60 :
61 0 : sad8x8_0_3 = _mm_packs_epi32(xmm_sad8x8[0], xmm_sad8x8[1]);
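    // The four 8x8 SADs sit in the even 32-bit lanes of xmm_sad8x8[0..1] and each fits
    // in 16 bits (at most 16320), so packing with signed saturation gathers them into
    // the four 32-bit lanes of a single register without altering their values.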
62 :
63 0 : xmm_mv = _mm_cvtsi32_si128(mv);
64 0 : xmm_mv = _mm_unpacklo_epi32(xmm_mv, xmm_mv);
65 0 : xmm_mv = _mm_unpacklo_epi64(xmm_mv, xmm_mv);
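    // Broadcast the candidate MV to all four 32-bit lanes so one compare/select
    // updates the best MV of all four 8x8 blocks at once.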
66 :
67 0 : xmm_pBestSad8x8 = _mm_loadu_si128((__m128i*)p_best_sad8x8);
68 0 : xmm_pBestMV8x8 = _mm_loadu_si128((__m128i*)p_best_mv8x8);
69 :
70 : // sad8x8_0 < p_best_sad8x8[0] for 0 to 3
71 0 : sad8x8_less_than_bitmask = _mm_cmplt_epi32(sad8x8_0_3, xmm_pBestSad8x8);
72 :
73 0 : xmm_N1 = _mm_cmpeq_epi8(xmm_sad8x8[0], xmm_sad8x8[0]);
74 :
75 0 : sad8x8_greater_or_eq_bitmask = _mm_sub_epi32(xmm_N1, sad8x8_less_than_bitmask);
76 :
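    // Subtracting the less-than mask from all-ones yields its bitwise complement; the
    // AND/OR pairs below then act as a per-lane select (SSE2 has no blend), i.e.
    // best = (new < best) ? new : best for both the SAD and the MV.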
77 0 : BestSad8x8 = _mm_or_si128(_mm_and_si128(xmm_pBestSad8x8, sad8x8_greater_or_eq_bitmask), _mm_and_si128(sad8x8_less_than_bitmask, sad8x8_0_3));
78 0 : BestMV8x8 = _mm_or_si128(_mm_and_si128(xmm_pBestMV8x8, sad8x8_greater_or_eq_bitmask), _mm_and_si128(sad8x8_less_than_bitmask, xmm_mv));
79 :
80 : _mm_storeu_si128((__m128i*)p_best_sad8x8, BestSad8x8);
81 : _mm_storeu_si128((__m128i*)p_best_mv8x8, BestMV8x8);
82 :
83 0 : uint64_t sad16x16 = _mm_cvtsi128_si64(xmm_sad16x16_total);
84 0 : if (sad16x16 < p_best_sad16x16[0]) {
85 0 : p_best_sad16x16[0] = (uint32_t)sad16x16;
86 0 : p_best_mv16x16[0] = _mm_cvtsi128_si32(xmm_mv);
87 : }
88 0 : }
89 :
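    // Sums the sixteen 16x16 SADs of a 64x64 block into four 32x32 SADs and one 64x64
    // SAD, then updates the corresponding best SAD / best MV entries where improved.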
90 0 : void sad_calculation_32x32_64x64_sse2_intrin(
91 : uint32_t *p_sad16x16,
92 : uint32_t *p_best_sad32x32,
93 : uint32_t *p_best_sad64x64,
94 : uint32_t *p_best_mv32x32,
95 : uint32_t *p_best_mv64x64,
96 : uint32_t mv)
97 : {
98 : __m128i xmm_N1, sad32x32_greater_than_bitmask, sad32x32_less_than_or_eq_bitmask, BestSad32x32, BestMV32x32, xmm_mv;
99 : __m128i Sad16x16_0_7_lo, Sad16x16_0_7_hi, Sad16x16_8_15_lo, Sad16x16_8_15_hi, xmm_sad64x64, xmm_sad64x64_total, xmm_pBestSad32x32, xmm_pBestMV32x32;
100 :
101 0 : Sad16x16_0_7_lo = _mm_unpacklo_epi32(_mm_loadu_si128((__m128i*)p_sad16x16), _mm_loadu_si128((__m128i*)(p_sad16x16 + 4)));
102 0 : Sad16x16_0_7_hi = _mm_unpackhi_epi32(_mm_loadu_si128((__m128i*)p_sad16x16), _mm_loadu_si128((__m128i*)(p_sad16x16 + 4)));
103 0 : Sad16x16_8_15_lo = _mm_unpacklo_epi32(_mm_loadu_si128((__m128i*)(p_sad16x16 + 8)), _mm_loadu_si128((__m128i*)(p_sad16x16 + 12)));
104 0 : Sad16x16_8_15_hi = _mm_unpackhi_epi32(_mm_loadu_si128((__m128i*)(p_sad16x16 + 8)), _mm_loadu_si128((__m128i*)(p_sad16x16 + 12)));
105 :
106 0 : xmm_sad64x64 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(Sad16x16_0_7_lo, Sad16x16_8_15_lo), _mm_unpackhi_epi64(Sad16x16_0_7_lo, Sad16x16_8_15_lo)),
107 : _mm_add_epi32(_mm_unpacklo_epi64(Sad16x16_0_7_hi, Sad16x16_8_15_hi), _mm_unpackhi_epi64(Sad16x16_0_7_hi, Sad16x16_8_15_hi)));
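    // The unpack/add network above transposes the sixteen 16x16 SADs so that each
    // 32-bit lane receives the sum of four consecutive entries of p_sad16x16, i.e.
    // one 32x32 SAD per lane (the register name notwithstanding).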
108 :
109 0 : xmm_sad64x64_total = _mm_add_epi32(_mm_srli_si128(xmm_sad64x64, 8), xmm_sad64x64);
110 :
111 0 : xmm_sad64x64_total = _mm_add_epi32(_mm_srli_si128(xmm_sad64x64_total, 4), xmm_sad64x64_total);
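    // Two folding adds (8-byte, then 4-byte shift) reduce the four 32x32 SADs to the
    // 64x64 SAD in the low 32 bits.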
112 :
113 0 : xmm_mv = _mm_cvtsi32_si128(mv);
114 0 : xmm_mv = _mm_unpacklo_epi32(xmm_mv, xmm_mv);
115 0 : xmm_mv = _mm_unpacklo_epi64(xmm_mv, xmm_mv);
116 :
117 0 : xmm_pBestSad32x32 = _mm_loadu_si128((__m128i*)p_best_sad32x32);
118 0 : xmm_pBestMV32x32 = _mm_loadu_si128((__m128i*)p_best_mv32x32);
119 :
120 0 : sad32x32_greater_than_bitmask = _mm_cmpgt_epi32(xmm_pBestSad32x32, xmm_sad64x64); // lanes where the new 32x32 SAD beats the stored best
121 :
122 0 : xmm_N1 = _mm_cmpeq_epi8(xmm_mv, xmm_mv); // anything compared to itself is equal (get 0xFFFFFFFF)
123 0 : sad32x32_less_than_or_eq_bitmask = _mm_sub_epi32(xmm_N1, sad32x32_greater_than_bitmask);
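    // Same select idiom as in the 8x8/16x16 path: all-ones minus the mask gives its
    // complement, and the AND/OR pairs below keep the smaller SAD and its MV per lane.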
124 :
125 0 : BestSad32x32 = _mm_or_si128(_mm_and_si128(xmm_pBestSad32x32, sad32x32_less_than_or_eq_bitmask), _mm_and_si128(xmm_sad64x64, sad32x32_greater_than_bitmask));
126 0 : BestMV32x32 = _mm_or_si128(_mm_and_si128(xmm_pBestMV32x32, sad32x32_less_than_or_eq_bitmask), _mm_and_si128(xmm_mv, sad32x32_greater_than_bitmask));
127 :
128 : _mm_storeu_si128((__m128i*)p_best_sad32x32, BestSad32x32);
129 : _mm_storeu_si128((__m128i*)p_best_mv32x32, BestMV32x32);
130 :
131 0 : uint32_t sad64x64 = _mm_cvtsi128_si32(xmm_sad64x64_total);
132 0 : if (sad64x64 < p_best_sad64x64[0]) {
133 0 : p_best_sad64x64[0] = sad64x64;
134 0 : p_best_mv64x64[0] = _mm_cvtsi128_si32(xmm_mv);
135 : }
136 0 : }
137 :
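    // Fills a buffer with a 32-bit value: count128 stores of 128 bits each, followed
    // by a tail of count32 (0-3) remaining 32-bit words.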
138 50692 : void initialize_buffer_32bits_sse2_intrin(
139 : uint32_t* pointer,
140 : uint32_t count128,
141 : uint32_t count32,
142 : uint32_t value)
143 : {
144 : __m128i xmm1, xmm2;
145 : uint32_t index128;
146 50692 : xmm2 = _mm_cvtsi32_si128(value);
147 50692 : xmm1 = _mm_or_si128(_mm_slli_si128(xmm2, 4), xmm2);
148 50692 : xmm2 = _mm_or_si128(_mm_slli_si128(xmm1, 8), xmm1);
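    // Two shift-and-OR steps replicate 'value' into all four 32-bit lanes of xmm2.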
149 :
150 2499200 : for (index128 = 0; index128 < count128; ++index128) {
151 : _mm_storeu_si128((__m128i *)pointer, xmm2);
152 2448510 : pointer += 4;
153 : }
154 50692 : if (count32 == 3) { //Initialize 96 bits
155 0 : _mm_storel_epi64((__m128i *)(pointer), xmm2);
156 0 : *(pointer + 2) = _mm_cvtsi128_si32(xmm2);
157 : }
158 50692 : else if (count32 == 2) { // Initialize 64 bits
159 0 : _mm_storel_epi64((__m128i *)pointer, xmm2);
160 : }
161 50692 : else if (count32 == 1) { // Initialize 32 bits
162 44940 : *(pointer) = _mm_cvtsi128_si32(xmm2);
163 : }
164 50692 : }