Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
#include <immintrin.h> /* AVX2 */
#include <string.h>

#include "EbDefinitions.h"
#include "synonyms.h"
#include "synonyms_avx2.h"
11 :
12 0 : static INLINE __m256i txb_init_levels_32_avx512(const TranLow *const coeff) {
13 : const __m512i idx =
14 0 : _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0);
15 0 : const __m512i c0 = _mm512_loadu_si512((__m512i *)(coeff + 0 * 16));
16 0 : const __m512i c1 = _mm512_loadu_si512((__m512i *)(coeff + 1 * 16));
17 0 : const __m512i c01 = _mm512_packs_epi32(c0, c1);
18 0 : const __m512i abs01 = _mm512_abs_epi16(c01);
19 0 : const __m512i abs_8 = _mm512_packs_epi16(abs01, abs01);
20 0 : const __m512i res = _mm512_permutexvar_epi32(idx, abs_8);
21 0 : return _mm512_castsi512_si256(res);
22 : }
23 :
24 0 : static INLINE __m512i txb_init_levels_64_avx512(const TranLow *const coeff) {
25 : const __m512i idx =
26 0 : _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
27 0 : const __m512i c0 = _mm512_loadu_si512((__m512i *)(coeff + 0 * 16));
28 0 : const __m512i c1 = _mm512_loadu_si512((__m512i *)(coeff + 1 * 16));
29 0 : const __m512i c2 = _mm512_loadu_si512((__m512i *)(coeff + 2 * 16));
30 0 : const __m512i c3 = _mm512_loadu_si512((__m512i *)(coeff + 3 * 16));
31 0 : const __m512i c01 = _mm512_packs_epi32(c0, c1);
32 0 : const __m512i c23 = _mm512_packs_epi32(c2, c3);
33 0 : const __m512i abs01 = _mm512_abs_epi16(c01);
34 0 : const __m512i abs23 = _mm512_abs_epi16(c23);
35 0 : const __m512i abs_8 = _mm512_packs_epi16(abs01, abs23);
36 0 : return _mm512_permutexvar_epi32(idx, abs_8);
37 : }
38 :
39 0 : void eb_av1_txb_init_levels_avx512(const TranLow *const coeff,
40 : const int32_t width, const int32_t height,
41 : uint8_t *const levels) {
42 0 : const TranLow *cf = coeff;
43 0 : const __m128i x_zeros = _mm_setzero_si128();
44 0 : uint8_t *ls = levels;
45 0 : int32_t i = height;
46 :
47 0 : if (width == 4) {
48 0 : const __m256i y_zeros = _mm256_setzero_si256();
49 :
50 0 : xx_storeu_128(ls - 16, x_zeros);
51 :
52 : do {
53 0 : const __m256i c0 = yy_loadu_256(cf);
54 0 : const __m256i c1 = yy_loadu_256(cf + 8);
55 0 : const __m256i c01 = _mm256_packs_epi32(c0, c1);
56 0 : const __m256i abs01 = _mm256_abs_epi16(c01);
57 0 : const __m256i abs_8 = _mm256_packs_epi16(abs01, y_zeros);
58 0 : const __m256i res_ = _mm256_shuffle_epi32(abs_8, 0xd8);
59 0 : const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
60 0 : yy_storeu_256(ls, res);
61 0 : cf += 4 * 4;
62 0 : ls += 4 * 8;
63 0 : i -= 4;
64 0 : } while (i);
65 :
66 0 : yy_storeu_256(ls, y_zeros);
67 : }
68 0 : else if (width == 8) {
69 0 : const __m256i y_zeros = _mm256_setzero_si256();
70 :
71 0 : yy_storeu_256(ls - 24, y_zeros);
72 :
73 : do {
74 0 : const __m256i res = txb_init_levels_32_avx512(cf);
75 0 : const __m128i res0 = _mm256_castsi256_si128(res);
76 0 : const __m128i res1 = _mm256_extracti128_si256(res, 1);
77 0 : xx_storel_64(ls + 0 * 12 + 0, res0);
78 0 : *(int32_t *)(ls + 0 * 12 + 8) = 0;
79 0 : _mm_storeh_epi64((__m128i *)(ls + 1 * 12 + 0), res0);
80 0 : *(int32_t *)(ls + 1 * 12 + 8) = 0;
81 0 : xx_storel_64(ls + 2 * 12 + 0, res1);
82 0 : *(int32_t *)(ls + 2 * 12 + 8) = 0;
83 0 : _mm_storeh_epi64((__m128i *)(ls + 3 * 12 + 0), res1);
84 0 : *(int32_t *)(ls + 3 * 12 + 8) = 0;
85 0 : cf += 4 * 8;
86 0 : ls += 4 * 12;
87 0 : i -= 4;
88 0 : } while (i);
89 :
90 0 : yy_storeu_256(ls + 0 * 32, y_zeros);
91 0 : xx_storeu_128(ls + 1 * 32, x_zeros);
92 : }
93 0 : else if (width == 16) {
94 0 : const __m256i y_zeros = _mm256_setzero_si256();
95 0 : const __m512i z_zeros = _mm512_setzero_si512();
96 :
97 0 : yy_storeu_256(ls - 40, y_zeros);
98 0 : xx_storel_64(ls - 8, x_zeros);
99 :
100 : do {
101 0 : const __m512i res = txb_init_levels_64_avx512(cf);
102 0 : const __m256i r0 = _mm512_castsi512_si256(res);
103 0 : const __m256i r1 = _mm512_extracti64x4_epi64(res, 1);
104 0 : const __m128i res0 = _mm256_castsi256_si128(r0);
105 0 : const __m128i res1 = _mm256_extracti128_si256(r0, 1);
106 0 : const __m128i res2 = _mm256_castsi256_si128(r1);
107 0 : const __m128i res3 = _mm256_extracti128_si256(r1, 1);
108 0 : xx_storeu_128(ls + 0 * 20, res0);
109 0 : *(int32_t *)(ls + 0 * 20 + 16) = 0;
110 0 : xx_storeu_128(ls + 1 * 20, res1);
111 0 : *(int32_t *)(ls + 1 * 20 + 16) = 0;
112 0 : xx_storeu_128(ls + 2 * 20, res2);
113 0 : *(int32_t *)(ls + 2 * 20 + 16) = 0;
114 0 : xx_storeu_128(ls + 3 * 20, res3);
115 0 : *(int32_t *)(ls + 3 * 20 + 16) = 0;
116 0 : cf += 4 * 16;
117 0 : ls += 4 * 20;
118 0 : i -= 4;
119 0 : } while (i);
120 :
121 : _mm512_storeu_si512((__m512i *)(ls + 0 * 64), z_zeros);
122 0 : xx_storeu_128(ls + 1 * 64, x_zeros);
123 : }
124 : else {
125 0 : const __m512i z_zeros = _mm512_setzero_si512();
126 :
127 0 : _mm512_storeu_si512((__m512i *)(ls - 72), z_zeros);
128 0 : xx_storel_64(ls - 8, x_zeros);
129 :
130 : do {
131 0 : const __m512i res = txb_init_levels_64_avx512(cf);
132 0 : const __m256i res0 = _mm512_castsi512_si256(res);
133 0 : const __m256i res1 = _mm512_extracti64x4_epi64(res, 1);
134 0 : yy_storeu_256(ls, res0);
135 0 : *(int32_t *)(ls + 32) = 0;
136 0 : yy_storeu_256(ls + 36, res1);
137 0 : *(int32_t *)(ls + 36 + 32) = 0;
138 0 : cf += 2 * 32;
139 0 : ls += 2 * 36;
140 0 : i -= 2;
141 0 : } while (i);
142 :
143 : _mm512_storeu_si512((__m512i *)(ls + 0 * 64), z_zeros);
144 0 : _mm512_storeu_si512((__m512i *)(ls + 1 * 64), z_zeros);
145 0 : xx_storeu_128(ls + 2 * 64, x_zeros);
146 : }
147 0 : }
|