Line data Source code
1 : /*
2 : * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <immintrin.h> /* AVX2 */
13 :
14 : #include "EbDefinitions.h"
15 : #include "synonyms.h"
16 : #include "synonyms_avx2.h"
17 :
18 206676000 : static INLINE __m256i txb_init_levels_avx2(const TranLow *const coeff) {
19 206676000 : const __m256i idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
20 206676000 : const __m256i c0 = yy_loadu_256(coeff + 0 * 8);
21 206941000 : const __m256i c1 = yy_loadu_256(coeff + 1 * 8);
22 207095000 : const __m256i c2 = yy_loadu_256(coeff + 2 * 8);
23 207012000 : const __m256i c3 = yy_loadu_256(coeff + 3 * 8);
24 207009000 : const __m256i c01 = _mm256_packs_epi32(c0, c1);
25 207009000 : const __m256i c23 = _mm256_packs_epi32(c2, c3);
26 207009000 : const __m256i abs01 = _mm256_abs_epi16(c01);
27 207009000 : const __m256i abs23 = _mm256_abs_epi16(c23);
28 207009000 : const __m256i res = _mm256_packs_epi16(abs01, abs23);
29 207009000 : return _mm256_permutevar8x32_epi32(res, idx);
30 : }
31 :
32 74170600 : void eb_av1_txb_init_levels_avx2(const TranLow *const coeff,
33 : const int32_t width, const int32_t height,
34 : uint8_t *const levels) {
35 74170600 : const TranLow *cf = coeff;
36 74170600 : const __m128i x_zeros = _mm_setzero_si128();
37 74170600 : const __m256i y_zeros = _mm256_setzero_si256();
38 74170600 : uint8_t *ls = levels;
39 74170600 : int32_t i = height;
40 :
41 74170600 : if (width == 4) {
42 31364000 : xx_storeu_128(ls - 16, x_zeros);
43 :
44 : do {
45 45504400 : const __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
46 45504400 : const __m256i c0 = yy_loadu_256(cf);
47 45500400 : const __m256i c1 = yy_loadu_256(cf + 8);
48 45471700 : const __m256i c01 = _mm256_packs_epi32(c0, c1);
49 45471700 : const __m256i abs01 = _mm256_abs_epi16(c01);
50 45471700 : const __m256i res_ = _mm256_packs_epi16(abs01, y_zeros);
51 45471700 : const __m256i res = _mm256_permutevar8x32_epi32(res_, idx);
52 45471700 : yy_storeu_256(ls, res);
53 45472200 : cf += 4 * 4;
54 45472200 : ls += 4 * 8;
55 45472200 : i -= 4;
56 45472200 : } while (i);
57 :
58 31363100 : yy_storeu_256(ls, y_zeros);
59 : }
60 42806600 : else if (width == 8) {
61 27149800 : yy_storeu_256(ls - 24, y_zeros);
62 :
63 : do {
64 62079400 : const __m256i res = txb_init_levels_avx2(cf);
65 62174800 : const __m128i res0 = _mm256_castsi256_si128(res);
66 62174800 : const __m128i res1 = _mm256_extracti128_si256(res, 1);
67 62174800 : xx_storel_64(ls + 0 * 12 + 0, res0);
68 62180000 : *(int32_t *)(ls + 0 * 12 + 8) = 0;
69 62180000 : _mm_storeh_epi64((__m128i *)(ls + 1 * 12 + 0), res0);
70 62134800 : *(int32_t *)(ls + 1 * 12 + 8) = 0;
71 62134800 : xx_storel_64(ls + 2 * 12 + 0, res1);
72 62096400 : *(int32_t *)(ls + 2 * 12 + 8) = 0;
73 62096400 : _mm_storeh_epi64((__m128i *)(ls + 3 * 12 + 0), res1);
74 62092200 : *(int32_t *)(ls + 3 * 12 + 8) = 0;
75 62092200 : cf += 4 * 8;
76 62092200 : ls += 4 * 12;
77 62092200 : i -= 4;
78 62092200 : } while (i);
79 :
80 27161200 : yy_storeu_256(ls + 0 * 32, y_zeros);
81 27159700 : xx_storeu_128(ls + 1 * 32, x_zeros);
82 : }
83 15656800 : else if (width == 16) {
84 12606100 : yy_storeu_256(ls - 40, y_zeros);
85 12605400 : xx_storel_64(ls - 8, x_zeros);
86 :
87 : do {
88 77433800 : const __m256i res = txb_init_levels_avx2(cf);
89 77485100 : const __m128i res0 = _mm256_castsi256_si128(res);
90 77485100 : const __m128i res1 = _mm256_extracti128_si256(res, 1);
91 77485100 : xx_storeu_128(ls, res0);
92 77477700 : *(int32_t *)(ls + 16) = 0;
93 77477700 : xx_storeu_128(ls + 20, res1);
94 77440300 : *(int32_t *)(ls + 20 + 16) = 0;
95 77440300 : cf += 2 * 16;
96 77440300 : ls += 2 * 20;
97 77440300 : i -= 2;
98 77440300 : } while (i);
99 :
100 12611600 : yy_storeu_256(ls + 0 * 32, y_zeros);
101 12611100 : yy_storeu_256(ls + 1 * 32, y_zeros);
102 12609000 : xx_storeu_128(ls + 2 * 32, x_zeros);
103 : }
104 : else {
105 3050690 : yy_storeu_256(ls - 72, y_zeros);
106 3265180 : yy_storeu_256(ls - 40, y_zeros);
107 3265060 : xx_storel_64(ls - 8, x_zeros);
108 :
109 : do {
110 68058000 : const __m256i res = txb_init_levels_avx2(cf);
111 68088400 : yy_storeu_256(ls, res);
112 68058500 : *(int32_t *)(ls + 32) = 0;
113 68058500 : cf += 32;
114 68058500 : ls += 36;
115 68058500 : } while (--i);
116 :
117 3265620 : yy_storeu_256(ls + 0 * 32, y_zeros);
118 3265510 : yy_storeu_256(ls + 1 * 32, y_zeros);
119 3265520 : yy_storeu_256(ls + 2 * 32, y_zeros);
120 3265480 : yy_storeu_256(ls + 3 * 32, y_zeros);
121 3265510 : xx_storeu_128(ls + 4 * 32, x_zeros);
122 : }
123 74387700 : }
|