/*
* Copyright(c) 2019 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#include "emmintrin.h"
#include "EbComputeMean_SSE2.h"

/*
* compute_subd_mean_of_squared_values8x8: vertically sub-sampled sum of squared
* values of an 8x8 block. Only rows 0, 2, 4 and 6 are read; the commented-out
* loads below correspond to the skipped odd rows. The result is sum << 11,
* which equals (sum / 32) << 16, i.e. the mean of the 32 squared samples in
* 16-bit fixed point.
*/
uint64_t compute_subd_mean_of_squared_values8x8_sse2_intrin(
    uint8_t * input_samples, // input parameter, input samples Ptr
    uint16_t input_stride) // input parameter, input stride
{
    __m128i xmm0, xmm_blockMean, xmm_input;

    xmm0 = _mm_setzero_si128();

    // Row 0: widen 8 bytes to 16 bits, then accumulate the squares pairwise
    // into four 32-bit partial sums.
    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input_samples), xmm0);
    xmm_blockMean = _mm_madd_epi16(xmm_input, xmm_input);

    /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/

    // Row 2
    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 3 * input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/

    // Row 4
    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 4 * input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 5 * input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/

    // Row 6
    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 6 * input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    /*xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 7 * input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));*/

    // Horizontal reduction of the four 32-bit partial sums.
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 8));
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 4));

    return (uint64_t)_mm_cvtsi128_si32(xmm_blockMean) << 11;
}

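/*
* Illustrative sketch, not part of the original file: a plain-C reference for
* compute_subd_mean_of_squared_values8x8_sse2_intrin, e.g. for unit-testing
* the intrinsic path. The *_c_ref name is hypothetical; it mirrors the same
* convention as above (32 sub-sampled squared samples, result in Q16).
*/
static uint64_t compute_subd_mean_of_squared_values8x8_c_ref(
    const uint8_t *input_samples,
    uint16_t       input_stride)
{
    uint64_t sum = 0;
    for (uint32_t row = 0; row < 8; row += 2)      // rows 0, 2, 4, 6 only
        for (uint32_t col = 0; col < 8; ++col)
            sum += (uint64_t)input_samples[row * input_stride + col] *
                   input_samples[row * input_stride + col];
    return sum << 11; // (sum / 32) << 16
}
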
/*
* compute_sub_mean8x8: vertically sub-sampled sum of an 8x8 block. Only rows
* 0, 2, 4 and 6 are read (the commented-out loads correspond to the skipped
* rows). The result is sum << 3, which equals (sum / 32) << 8, i.e. the mean
* of the 32 samples in 8-bit fixed point.
*/
uint64_t compute_sub_mean8x8_sse2_intrin(
    uint8_t * input_samples, // input parameter, input samples Ptr
    uint16_t input_stride) // input parameter, input stride
{
    __m128i xmm0 = _mm_setzero_si128(), xmm1, xmm3, xmm_sum1, xmm_sum2;

    // _mm_sad_epu8 against zero sums the 8 bytes of each loaded row.
    xmm1 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples)), xmm0);
    //xmm2 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), xmm0);
    xmm3 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride)), xmm0);
    //xmm4 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 3 * input_stride)), xmm0);
    xmm_sum1 = _mm_add_epi16(xmm1, xmm3);

    input_samples += 4 * input_stride;
    xmm1 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples)), xmm0);
    //xmm2 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), xmm0);
    xmm3 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride)), xmm0);
    //xmm4 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 3 * input_stride)), xmm0);
    xmm_sum2 = _mm_add_epi16(xmm1, xmm3);
    xmm_sum2 = _mm_add_epi16(xmm_sum1, xmm_sum2);

    return (uint64_t)_mm_cvtsi128_si32(xmm_sum2) << 3;
}

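/*
* Illustrative sketch, not part of the original file: the matching plain-C
* reference for compute_sub_mean8x8_sse2_intrin (hypothetical *_c_ref name),
* summing the same sub-sampled rows and returning the Q8 mean.
*/
static uint64_t compute_sub_mean8x8_c_ref(
    const uint8_t *input_samples,
    uint16_t       input_stride)
{
    uint64_t sum = 0;
    for (uint32_t row = 0; row < 8; row += 2)      // rows 0, 2, 4, 6 only
        for (uint32_t col = 0; col < 8; ++col)
            sum += input_samples[row * input_stride + col];
    return sum << 3; // (sum / 32) << 8
}
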
/*
* compute_mean_of_squared_values8x8: sum of squared values over the full 8x8
* block. The result is sum << 10, which equals (sum / 64) << 16, i.e. the
* mean of the 64 squared samples in 16-bit fixed point.
*/
uint64_t compute_mean_of_squared_values8x8_sse2_intrin(
    uint8_t * input_samples, // input parameter, input samples Ptr
    uint32_t input_stride, // input parameter, input stride
    uint32_t input_area_width, // input parameter, input area width
    uint32_t input_area_height) // input parameter, input area height
{
    __m128i xmm0, xmm_blockMean, xmm_input;
    (void)input_area_width;
    (void)input_area_height;

    xmm0 = _mm_setzero_si128();

    // Rows 0..7: widen each row to 16 bits and accumulate the squares into
    // four 32-bit partial sums.
    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input_samples), xmm0);
    xmm_blockMean = _mm_madd_epi16(xmm_input, xmm_input);

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 3 * input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 4 * input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 5 * input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 6 * input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    xmm_input = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(input_samples + 7 * input_stride)), xmm0);
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_madd_epi16(xmm_input, xmm_input));

    // Horizontal reduction of the four 32-bit partial sums.
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 8));
    xmm_blockMean = _mm_add_epi32(xmm_blockMean, _mm_srli_si128(xmm_blockMean, 4));

    return (uint64_t)_mm_cvtsi128_si32(xmm_blockMean) << 10;
}

/*
* compute_mean8x8: sum over the full 8x8 block. The result is sum << 2, which
* equals (sum / 64) << 8, i.e. the mean of the 64 samples in 8-bit fixed
* point.
*/
uint64_t compute_mean8x8_sse2_intrin(
    uint8_t * input_samples, // input parameter, input samples Ptr
    uint32_t input_stride, // input parameter, input stride
    uint32_t input_area_width, // input parameter, input area width
    uint32_t input_area_height) // input parameter, input area height
{
    __m128i xmm0 = _mm_setzero_si128(), xmm1, xmm2, xmm3, xmm4, xmm_sum1, xmm_sum2;

    // _mm_sad_epu8 against zero sums the 8 bytes of each loaded row.
    xmm1 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples)), xmm0);
    xmm2 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), xmm0);
    xmm3 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride)), xmm0);
    xmm4 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 3 * input_stride)), xmm0);
    xmm_sum1 = _mm_add_epi16(_mm_add_epi16(xmm1, xmm2), _mm_add_epi16(xmm3, xmm4));

    input_samples += 4 * input_stride;
    xmm1 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples)), xmm0);
    xmm2 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + input_stride)), xmm0);
    xmm3 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 2 * input_stride)), xmm0);
    xmm4 = _mm_sad_epu8(_mm_loadl_epi64((__m128i *)(input_samples + 3 * input_stride)), xmm0);
    xmm_sum2 = _mm_add_epi16(_mm_add_epi16(xmm1, xmm2), _mm_add_epi16(xmm3, xmm4));
    xmm_sum2 = _mm_add_epi16(xmm_sum1, xmm_sum2);

    (void)input_area_width;
    (void)input_area_height;

    return (uint64_t)_mm_cvtsi128_si32(xmm_sum2) << 2;
}

/*
* compute_interm_var_four8x8_helper: sub-sampled mean and mean of squared
* values for four horizontally adjacent 8x8 blocks (a 32x8 strip).
*/
void compute_interm_var_four8x8_helper_sse2(
    uint8_t * input_samples,
    uint16_t input_stride,
    uint64_t * mean_of8x8_blocks, // mean of four 8x8
    uint64_t * mean_of_squared8x8_blocks) // meanSquared
{
    uint32_t blockIndex = 0;
    // (0,1)
    mean_of8x8_blocks[0] = compute_sub_mean8x8_sse2_intrin(input_samples + blockIndex, input_stride);
    mean_of_squared8x8_blocks[0] = compute_subd_mean_of_squared_values8x8_sse2_intrin(input_samples + blockIndex, input_stride);

    // (0,2)
    blockIndex = blockIndex + 8;
    mean_of8x8_blocks[1] = compute_sub_mean8x8_sse2_intrin(input_samples + blockIndex, input_stride);
    mean_of_squared8x8_blocks[1] = compute_subd_mean_of_squared_values8x8_sse2_intrin(input_samples + blockIndex, input_stride);

    // (0,3)
    blockIndex = blockIndex + 8;
    mean_of8x8_blocks[2] = compute_sub_mean8x8_sse2_intrin(input_samples + blockIndex, input_stride);
    mean_of_squared8x8_blocks[2] = compute_subd_mean_of_squared_values8x8_sse2_intrin(input_samples + blockIndex, input_stride);

    // (0,4)
    blockIndex = blockIndex + 8;
    mean_of8x8_blocks[3] = compute_sub_mean8x8_sse2_intrin(input_samples + blockIndex, input_stride);
    mean_of_squared8x8_blocks[3] = compute_subd_mean_of_squared_values8x8_sse2_intrin(input_samples + blockIndex, input_stride);
}
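
/*
* Illustrative sketch, not part of the original file: one way a caller might
* turn the two outputs into per-block variance estimates, assuming the
* fixed-point convention above (mean in Q8, mean of squares in Q16) so that
* E[x^2] - E[x]^2 can be formed directly in Q16. The function name is
* hypothetical.
*/
static void compute_variance_four8x8_example(
    uint8_t * input_samples, // top-left of a 32x8 strip of four 8x8 blocks
    uint16_t input_stride,
    uint64_t * variance8x8) // output: four pixel-domain variance estimates
{
    uint64_t mean_of8x8_blocks[4];
    uint64_t mean_of_squared8x8_blocks[4];

    compute_interm_var_four8x8_helper_sse2(
        input_samples, input_stride, mean_of8x8_blocks, mean_of_squared8x8_blocks);

    for (uint32_t k = 0; k < 4; ++k) {
        // Q16 variance: Q16 mean of squares minus the square of the Q8 mean.
        const uint64_t var_q16 = mean_of_squared8x8_blocks[k] -
                                 mean_of8x8_blocks[k] * mean_of8x8_blocks[k];
        variance8x8[k] = var_q16 >> 16; // back to plain pixel units
    }
}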