Line data Source code
1 : /*
2 : * Copyright (c) 2019, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <immintrin.h>
13 : #include "aom_dsp_rtcd.h"
14 : #include "EbWarpedMotion.h"
15 :
16 : /* This is a modified version of 'eb_warped_filter' from warped_motion.c:
17 : * Each coefficient is stored in 8 bits instead of 16 bits
18 : * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
19 :
20 : This is done in order to avoid overflow: Since the tap with the largest
21 : coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
22 : order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
23 : convolve functions.
24 :
25 : Instead, we use the summation order
26 : ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
27 : The coefficients in this table are rearranged so that they can be
28 : brought into the correct order more quickly.
29 : */
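[Annotation added to this report, not part of the source file: a minimal scalar sketch of the summation order described in the comment above. Assuming f[] is one row of eb_av1_filter_8bit (tap order 0, 2, 4, 6, 1, 3, 5, 7) and p[] holds the eight unsigned 8-bit input pixels for one output position, one horizontal output sample is formed, in 16-bit arithmetic, roughly as:

    int16_t even = (int16_t)(p[0] * f[0] + p[2] * f[1])    /* taps 0 + 2 */
                 + (int16_t)(p[4] * f[2] + p[6] * f[3]);   /* taps 4 + 6 */
    int16_t odd  = (int16_t)(p[1] * f[4] + p[3] * f[5])    /* taps 1 + 3 */
                 + (int16_t)(p[5] * f[6] + p[7] * f[7]);   /* taps 5 + 7 */
    int16_t out  = (int16_t)((even + odd + round_const) >> shift);

This mirrors what filter_src_pixels_avx2 below computes with _mm256_maddubs_epi16 (the pair products) and _mm256_add_epi16 (the even/odd sums); summing the pairs as (0,2), (4,6), (1,3), (5,7) is what avoids overflowing the 16-bit intermediates.]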
30 : /* clang-format off */
31 : DECLARE_ALIGNED(8, const int8_t,
32 : eb_av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
33 : #if WARPEDPIXEL_PREC_BITS == 6
34 : // [-1, 0)
35 : { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
36 : { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
37 : { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
38 : { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
39 : { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
40 : { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
41 : { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
42 : { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
43 : { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
44 : { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
45 : { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
46 : { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
47 : { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
48 : { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
49 : { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
50 : { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
51 : { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
52 : { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
53 : { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
54 : { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
55 : { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
56 : { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
57 : { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
58 : { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
59 : { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
60 : { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
61 : { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
62 : { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
63 : { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
64 : { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
65 : { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
66 : { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
67 : // [0, 1)
68 : { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
69 : { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
70 : { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
71 : {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
72 : {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
73 : {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
74 : {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
75 : {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
76 : {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
77 : {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
78 : {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
79 : {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
80 : {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
81 : {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
82 : {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
83 : {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
84 : {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
85 : {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
86 : {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
87 : {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
88 : {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
89 : {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
90 : {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
91 : {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
92 : {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
93 : {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
94 : {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
95 : {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
96 : {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
97 : {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
98 : { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
99 : { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
100 : // [1, 2)
101 : { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
102 : { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
103 : { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
104 : { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
105 : { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
106 : { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
107 : { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
108 : { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
109 : { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
110 : { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
111 : { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
112 : { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
113 : { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
114 : { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
115 : { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
116 : { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
117 : { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
118 : { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
119 : { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
120 : { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
121 : { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
122 : { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
123 : { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
124 : { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
125 : { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
126 : { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
127 : { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
128 : { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
129 : { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
130 : { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
131 : { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
132 : { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
133 : // dummy (replicate row index 191)
134 : { 0, 0, 2, -1, 0, 0, 127, 0},
135 :
136 : #else
137 : // [-1, 0)
138 : { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0},
139 : { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0},
140 : { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0},
141 : { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0},
142 : { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0},
143 : { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0},
144 : { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0},
145 : { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0},
146 : { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0},
147 : { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0},
148 : { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0},
149 : { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0},
150 : { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0},
151 : { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0},
152 : { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0},
153 : { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0},
154 : // [0, 1)
155 : { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0},
156 : { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1},
157 : {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1},
158 : {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1},
159 : {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2},
160 : {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2},
161 : {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2},
162 : {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2},
163 : {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2},
164 : {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2},
165 : {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2},
166 : {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2},
167 : {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2},
168 : {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1},
169 : {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1},
170 : { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0},
171 : // [1, 2)
172 : { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0},
173 : { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1},
174 : { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2},
175 : { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3},
176 : { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3},
177 : { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3},
178 : { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4},
179 : { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4},
180 : { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4},
181 : { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4},
182 : { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4},
183 : { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3},
184 : { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3},
185 : { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2},
186 : { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1},
187 : { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1},
188 : // dummy (replicate row index 95)
189 : { 0, 0, 4, -3, 0, -1, 127, 1},
190 : #endif // WARPEDPIXEL_PREC_BITS == 6
191 : };
192 : /* clang-format on */
193 :
194 : DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
195 : 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
196 : 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
197 : };
198 :
199 : DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = {
200 : 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
201 : 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
202 : };
203 :
204 : DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = {
205 : 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
206 : 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5
207 : };
208 :
209 : DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = {
210 : 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
211 : 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7
212 : };
213 :
214 : DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = {
215 : 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
216 : 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
217 : };
218 :
219 : DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = {
220 : 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
221 : 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
222 : };
223 :
224 : DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = {
225 : 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
226 : 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
227 : };
228 :
229 : DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = {
230 : 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15,
231 : 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
232 : };
233 :
234 : DECLARE_ALIGNED(32, static const uint8_t,
235 : shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
236 : 5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
237 : 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };
238 :
239 : DECLARE_ALIGNED(32, static const uint8_t,
240 : shuffle_src1[32]) = { 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7,
241 : 9, 9, 11, 11, 13, 4, 6, 6, 8, 8, 10,
242 : 10, 12, 5, 7, 7, 9, 9, 11, 11, 13 };
243 :
244 : DECLARE_ALIGNED(32, static const uint8_t,
245 : shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4,
246 : 6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
247 : 7, 9, 2, 4, 4, 6, 6, 8, 8, 10 };
248 :
249 : DECLARE_ALIGNED(32, static const uint8_t,
250 : shuffle_src3[32]) = { 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8,
251 : 10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11,
252 : 11, 13, 6, 8, 8, 10, 10, 12, 12, 14 };
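[Annotation added to this report, not part of the source file: the four shuffle_src tables above are byte masks for _mm256_shuffle_epi8. They gather the input-pixel pairs belonging to taps (0,2), (4,6), (1,3) and (5,7) of each output position, matching the rearranged coefficient order of eb_av1_filter_8bit, so that the _mm256_maddubs_epi16 calls in filter_src_pixels_avx2 produce the per-pair partial sums directly.]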
253 :
254 2559100000 : static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out,
255 : __m256i *coeff,
256 : const __m256i *shuffle_src,
257 : const __m256i *round_const,
258 : const __m128i *shift, int row) {
259 2559100000 : const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]);
260 2559100000 : const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]);
261 2559100000 : const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]);
262 2559100000 : const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]);
263 :
264 2559100000 : const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]);
265 2559100000 : const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]);
266 2559100000 : const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]);
267 5118200000 : const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]);
268 :
269 2559100000 : const __m256i res_even = _mm256_add_epi16(res_02, res_46);
270 2559100000 : const __m256i res_odd = _mm256_add_epi16(res_13, res_57);
271 : const __m256i res =
272 5118200000 : _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const);
273 2559100000 : horz_out[row] = _mm256_srl_epi16(res, *shift);
274 2559100000 : }
275 :
276 1570640000 : static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta,
277 : int sx,
278 : __m256i *coeff) {
279 1570640000 : __m128i tmp_0 = _mm_loadl_epi64(
280 1570640000 : (__m128i *)&eb_av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
281 1570640000 : __m128i tmp_1 = _mm_loadl_epi64(
282 1570640000 : (__m128i *)&eb_av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
283 1570640000 : __m128i tmp_2 = _mm_loadl_epi64(
284 1570640000 : (__m128i *)&eb_av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
285 1570640000 : __m128i tmp_3 = _mm_loadl_epi64(
286 1570640000 : (__m128i *)&eb_av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
287 1570640000 : __m128i tmp_4 = _mm_loadl_epi64(
288 1570640000 : (__m128i *)&eb_av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
289 1570640000 : __m128i tmp_5 = _mm_loadl_epi64(
290 1570640000 : (__m128i *)&eb_av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
291 1570640000 : __m128i tmp_6 = _mm_loadl_epi64(
292 1570640000 : (__m128i *)&eb_av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
293 1570640000 : __m128i tmp_7 = _mm_loadl_epi64(
294 1570640000 : (__m128i *)&eb_av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
295 :
296 1570640000 : tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
297 1570640000 : tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
298 1570640000 : tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
299 1570640000 : tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);
300 :
301 : __m128i tmp_8 =
302 1570640000 : _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 0 * alpha) >>
303 : WARPEDDIFF_PREC_BITS]);
304 : __m128i tmp_9 =
305 1570640000 : _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 1 * alpha) >>
306 : WARPEDDIFF_PREC_BITS]);
307 : __m128i tmp_10 =
308 1570640000 : _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 2 * alpha) >>
309 : WARPEDDIFF_PREC_BITS]);
310 : __m128i tmp_11 =
311 1570640000 : _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 3 * alpha) >>
312 : WARPEDDIFF_PREC_BITS]);
313 : tmp_2 =
314 1570640000 : _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 4 * alpha) >>
315 : WARPEDDIFF_PREC_BITS]);
316 : tmp_3 =
317 1570640000 : _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 5 * alpha) >>
318 : WARPEDDIFF_PREC_BITS]);
319 : tmp_6 =
320 1570640000 : _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 6 * alpha) >>
321 : WARPEDDIFF_PREC_BITS]);
322 : tmp_7 =
323 3141290000 : _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[((sx + beta) + 7 * alpha) >>
324 : WARPEDDIFF_PREC_BITS]);
325 :
326 1570640000 : tmp_8 = _mm_unpacklo_epi16(tmp_8, tmp_10);
327 1570640000 : tmp_2 = _mm_unpacklo_epi16(tmp_2, tmp_6);
328 1570640000 : tmp_9 = _mm_unpacklo_epi16(tmp_9, tmp_11);
329 1570640000 : tmp_3 = _mm_unpacklo_epi16(tmp_3, tmp_7);
330 :
331 : const __m256i tmp_12 =
332 1570640000 : _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_8, 0x1);
333 : const __m256i tmp_13 =
334 1570640000 : _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_1), tmp_9, 0x1);
335 : const __m256i tmp_14 =
336 1570640000 : _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_4), tmp_2, 0x1);
337 : const __m256i tmp_15 =
338 1570640000 : _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_5), tmp_3, 0x1);
339 :
340 1570640000 : const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
341 1570640000 : const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
342 1570640000 : const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
343 1570640000 : const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
344 :
345 1570640000 : coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
346 1570640000 : coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
347 1570640000 : coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
348 1570640000 : coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
349 1570640000 : }
350 :
351 898176 : static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx,
352 : __m256i *coeff) {
353 898176 : __m128i tmp_0 = _mm_loadl_epi64(
354 898176 : (__m128i *)&eb_av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
355 898176 : __m128i tmp_1 = _mm_loadl_epi64(
356 898176 : (__m128i *)&eb_av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
357 898176 : __m128i tmp_2 = _mm_loadl_epi64(
358 898176 : (__m128i *)&eb_av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
359 898176 : __m128i tmp_3 = _mm_loadl_epi64(
360 898176 : (__m128i *)&eb_av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
361 898176 : __m128i tmp_4 = _mm_loadl_epi64(
362 898176 : (__m128i *)&eb_av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
363 898176 : __m128i tmp_5 = _mm_loadl_epi64(
364 898176 : (__m128i *)&eb_av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
365 898176 : __m128i tmp_6 = _mm_loadl_epi64(
366 898176 : (__m128i *)&eb_av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
367 898176 : __m128i tmp_7 = _mm_loadl_epi64(
368 898176 : (__m128i *)&eb_av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
369 :
370 898176 : tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
371 898176 : tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
372 898176 : tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
373 898176 : tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);
374 :
375 898176 : const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0);
376 898176 : const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1);
377 898176 : const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4);
378 898176 : const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5);
379 :
380 898176 : const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
381 898176 : const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
382 898176 : const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
383 898176 : const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
384 :
385 898176 : coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
386 898176 : coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
387 898176 : coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
388 898176 : coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
389 898176 : }
390 :
391 798570000 : static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx,
392 : __m256i *coeff) {
393 : const __m128i tmp_0 =
394 798570000 : _mm_loadl_epi64((__m128i *)&eb_av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
395 798570000 : const __m128i tmp_1 = _mm_loadl_epi64(
396 798570000 : (__m128i *)&eb_av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]);
397 :
398 : const __m256i res_0 =
399 798570000 : _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1);
400 :
401 1597140000 : coeff[0] = _mm256_shuffle_epi8(
402 : res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2));
403 1597140000 : coeff[1] = _mm256_shuffle_epi8(
404 : res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2));
405 1597140000 : coeff[2] = _mm256_shuffle_epi8(
406 : res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2));
407 798570000 : coeff[3] = _mm256_shuffle_epi8(
408 : res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2));
409 798570000 : }
410 :
411 1569300000 : static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out,
412 : int sx, int alpha, int beta, int row,
413 : const __m256i *shuffle_src,
414 : const __m256i *round_const,
415 : const __m128i *shift) {
416 : __m256i coeff[4];
417 1569300000 : prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff);
418 1617870000 : filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift,
419 : row);
420 1607110000 : }
421 236804000 : static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
422 : __m256i *coeff) {
423 236804000 : const __m128i tmp_0 = _mm_loadl_epi64(
424 236804000 : (__m128i *)&eb_av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
425 236804000 : const __m128i tmp_1 = _mm_loadl_epi64(
426 236804000 : (__m128i *)&eb_av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
427 236804000 : const __m128i tmp_2 = _mm_loadl_epi64(
428 236804000 : (__m128i *)&eb_av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
429 236804000 : const __m128i tmp_3 = _mm_loadl_epi64(
430 236804000 : (__m128i *)&eb_av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
431 236804000 : const __m128i tmp_4 = _mm_loadl_epi64(
432 236804000 : (__m128i *)&eb_av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
433 236804000 : const __m128i tmp_5 = _mm_loadl_epi64(
434 236804000 : (__m128i *)&eb_av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
435 236804000 : const __m128i tmp_6 = _mm_loadl_epi64(
436 236804000 : (__m128i *)&eb_av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
437 236804000 : const __m128i tmp_7 = _mm_loadl_epi64(
438 236804000 : (__m128i *)&eb_av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
439 :
440 236804000 : const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
441 236804000 : const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
442 236804000 : const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
443 236804000 : const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
444 :
445 236804000 : const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
446 236804000 : const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
447 236804000 : const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
448 236804000 : const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
449 :
450 473607000 : coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14));
451 473607000 : coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14));
452 473607000 : coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15));
453 236804000 : coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15));
454 236804000 : }
455 :
456 229262000 : static INLINE void warp_horizontal_filter_avx2(
457 : const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
458 : int32_t sx4, int alpha, int beta, int p_height, int height, int i,
459 : const __m256i *round_const, const __m128i *shift,
460 : const __m256i *shuffle_src) {
461 229262000 : int k, iy, sx, row = 0;
462 : __m256i coeff[4];
463 1785720000 : for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
464 1551920000 : iy = iy4 + k;
465 1551920000 : iy = clamp(iy, 0, height - 1);
466 : const __m128i src_0 =
467 1536200000 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
468 1536200000 : iy = iy4 + k + 1;
469 1536200000 : iy = clamp(iy, 0, height - 1);
470 : const __m128i src_1 =
471 3020130000 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
472 : const __m256i src_01 =
473 1510070000 : _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
474 1510070000 : sx = sx4 + beta * (k + 4);
475 1510070000 : horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src,
476 : round_const, shift);
477 1556460000 : row += 1;
478 : }
479 233803000 : iy = iy4 + k;
480 233803000 : iy = clamp(iy, 0, height - 1);
481 228961000 : const __m256i src_01 = _mm256_castsi128_si256(
482 228961000 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
483 228961000 : sx = sx4 + beta * (k + 4);
484 228961000 : prepare_horizontal_filter_coeff(alpha, sx, coeff);
485 229219000 : filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
486 : shift, row);
487 229051000 : }
488 :
489 102409000 : static INLINE void warp_horizontal_filter_alpha0_avx2(
490 : const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
491 : int32_t sx4, int alpha, int beta, int p_height, int height, int i,
492 : const __m256i *round_const, const __m128i *shift,
493 : const __m256i *shuffle_src) {
494 : (void)alpha;
495 102409000 : int k, iy, sx, row = 0;
496 : __m256i coeff[4];
497 809625000 : for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
498 707262000 : iy = iy4 + k;
499 707262000 : iy = clamp(iy, 0, height - 1);
500 : const __m128i src_0 =
501 705226000 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
502 705226000 : iy = iy4 + k + 1;
503 705226000 : iy = clamp(iy, 0, height - 1);
504 : const __m128i src_1 =
505 1403130000 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
506 : const __m256i src_01 =
507 701564000 : _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
508 701564000 : sx = sx4 + beta * (k + 4);
509 701564000 : prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
510 701394000 : filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
511 : shift, row);
512 707216000 : row += 1;
513 : }
514 102363000 : iy = iy4 + k;
515 102363000 : iy = clamp(iy, 0, height - 1);
516 102313000 : const __m256i src_01 = _mm256_castsi128_si256(
517 102313000 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
518 102313000 : sx = sx4 + beta * (k + 4);
519 102313000 : prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
520 102297000 : filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
521 : shift, row);
522 102372000 : }
523 :
524 898179 : static INLINE void warp_horizontal_filter_beta0_avx2(
525 : const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
526 : int32_t sx4, int alpha, int beta, int p_height, int height, int i,
527 : const __m256i *round_const, const __m128i *shift,
528 : const __m256i *shuffle_src) {
529 : (void)beta;
530 898179 : int k, iy, row = 0;
531 : __m256i coeff[4];
532 898179 : prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff);
533 7184370 : for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
534 6286120 : iy = iy4 + k;
535 6286120 : iy = clamp(iy, 0, height - 1);
536 : const __m128i src_0 =
537 6285460 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
538 6285460 : iy = iy4 + k + 1;
539 6285460 : iy = clamp(iy, 0, height - 1);
540 : const __m128i src_1 =
541 12569300 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
542 : const __m256i src_01 =
543 6284670 : _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
544 6284670 : filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
545 : shift, row);
546 6286180 : row += 1;
547 : }
548 898250 : iy = iy4 + k;
549 898250 : iy = clamp(iy, 0, height - 1);
550 898171 : const __m256i src_01 = _mm256_castsi128_si256(
551 898171 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
552 898171 : filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
553 : shift, row);
554 898178 : }
555 :
556 274245 : static INLINE void warp_horizontal_filter_alpha0_beta0_avx2(
557 : const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
558 : int32_t sx4, int alpha, int beta, int p_height, int height, int i,
559 : const __m256i *round_const, const __m128i *shift,
560 : const __m256i *shuffle_src) {
561 : (void)alpha;
562 274245 : int k, iy, row = 0;
563 : __m256i coeff[4];
564 274245 : prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff);
565 2193920 : for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
566 1919680 : iy = iy4 + k;
567 1919680 : iy = clamp(iy, 0, height - 1);
568 : const __m128i src0 =
569 1919640 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
570 1919640 : iy = iy4 + k + 1;
571 1919640 : iy = clamp(iy, 0, height - 1);
572 : const __m128i src1 =
573 3839190 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
574 : const __m256i src_01 =
575 1919600 : _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
576 1919600 : filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
577 : shift, row);
578 1919680 : row += 1;
579 : }
580 274249 : iy = iy4 + k;
581 274249 : iy = clamp(iy, 0, height - 1);
582 274246 : const __m256i src_01 = _mm256_castsi128_si256(
583 274246 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
584 274246 : filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
585 : shift, row);
586 274246 : }
587 :
588 26608900 : static INLINE void unpack_weights_and_set_round_const_avx2(
589 : ConvolveParams *conv_params, const int round_bits, const int offset_bits,
590 : __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) {
591 26608900 : *res_sub_const =
592 53217800 : _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
593 26608900 : (1 << (offset_bits - conv_params->round_1 - 1)));
594 26608900 : *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));
595 :
596 26608900 : const int w0 = conv_params->fwd_offset;
597 26608900 : const int w1 = conv_params->bck_offset;
598 26608900 : const __m256i wt0 = _mm256_set1_epi16(w0);
599 53217800 : const __m256i wt1 = _mm256_set1_epi16(w1);
600 26608900 : *wt = _mm256_unpacklo_epi16(wt0, wt1);
601 26608900 : }
602 :
603 933780000 : static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta,
604 : int sy,
605 : __m256i *coeffs) {
606 : __m128i filt_00 =
607 1867560000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
608 933780000 : ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
609 : __m128i filt_01 =
610 1867560000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
611 933780000 : ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
612 : __m128i filt_02 =
613 1867560000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
614 933780000 : ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
615 : __m128i filt_03 =
616 1867560000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
617 933780000 : ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
618 :
619 1867560000 : __m128i filt_10 = _mm_loadu_si128(
620 933780000 : (__m128i *)(eb_warped_filter +
621 933780000 : (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
622 1867560000 : __m128i filt_11 = _mm_loadu_si128(
623 933780000 : (__m128i *)(eb_warped_filter +
624 933780000 : (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
625 1867560000 : __m128i filt_12 = _mm_loadu_si128(
626 933780000 : (__m128i *)(eb_warped_filter +
627 933780000 : (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
628 933780000 : __m128i filt_13 = _mm_loadu_si128(
629 933780000 : (__m128i *)(eb_warped_filter +
630 933780000 : (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
631 :
632 : __m256i filt_0 =
633 933780000 : _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
634 : __m256i filt_1 =
635 933780000 : _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
636 : __m256i filt_2 =
637 933780000 : _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
638 : __m256i filt_3 =
639 933780000 : _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
640 :
641 933780000 : __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
642 933780000 : __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
643 933780000 : __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
644 933780000 : __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
645 :
646 933780000 : coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
647 933780000 : coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
648 933780000 : coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
649 933780000 : coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
650 :
651 : filt_00 =
652 1867560000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
653 933780000 : ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
654 : filt_01 =
655 1867560000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
656 933780000 : ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
657 : filt_02 =
658 1867560000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
659 933780000 : ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
660 : filt_03 =
661 1867560000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
662 933780000 : ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
663 :
664 1867560000 : filt_10 = _mm_loadu_si128(
665 933780000 : (__m128i *)(eb_warped_filter +
666 933780000 : (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
667 1867560000 : filt_11 = _mm_loadu_si128(
668 933780000 : (__m128i *)(eb_warped_filter +
669 933780000 : (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
670 1867560000 : filt_12 = _mm_loadu_si128(
671 933780000 : (__m128i *)(eb_warped_filter +
672 933780000 : (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
673 933780000 : filt_13 = _mm_loadu_si128(
674 933780000 : (__m128i *)(eb_warped_filter +
675 933780000 : (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
676 :
677 : filt_0 =
678 933780000 : _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
679 : filt_1 =
680 933780000 : _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
681 : filt_2 =
682 933780000 : _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
683 : filt_3 =
684 933780000 : _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
685 :
686 933780000 : res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
687 933780000 : res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
688 933780000 : res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
689 933780000 : res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
690 :
691 933780000 : coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
692 933780000 : coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
693 933780000 : coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
694 933780000 : coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
695 933780000 : }
696 :
697 105699000 : static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy,
698 : __m256i *coeffs) {
699 : __m128i filt_00 =
700 211399000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
701 105699000 : ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
702 : __m128i filt_01 =
703 211399000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
704 211399000 : ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
705 : __m128i filt_02 =
706 211399000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
707 105699000 : ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
708 : __m128i filt_03 =
709 105699000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
710 105699000 : ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
711 :
712 105699000 : __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00);
713 105699000 : __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01);
714 105699000 : __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02);
715 105699000 : __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03);
716 :
717 105699000 : __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
718 105699000 : __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
719 105699000 : __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
720 105699000 : __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
721 :
722 105699000 : coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
723 105699000 : coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
724 105699000 : coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
725 105699000 : coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
726 :
727 : filt_00 =
728 211399000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
729 105699000 : ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
730 : filt_01 =
731 211399000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
732 105699000 : ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
733 : filt_02 =
734 211399000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
735 105699000 : ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
736 : filt_03 =
737 105699000 : _mm_loadu_si128((__m128i *)(eb_warped_filter +
738 105699000 : ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
739 :
740 105699000 : filt_0 = _mm256_broadcastsi128_si256(filt_00);
741 105699000 : filt_1 = _mm256_broadcastsi128_si256(filt_01);
742 105699000 : filt_2 = _mm256_broadcastsi128_si256(filt_02);
743 105699000 : filt_3 = _mm256_broadcastsi128_si256(filt_03);
744 :
745 105699000 : res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
746 105699000 : res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
747 105699000 : res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
748 105699000 : res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
749 :
750 105699000 : coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
751 105699000 : coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
752 105699000 : coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
753 105699000 : coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
754 105699000 : }
755 :
756 2838490 : static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy,
757 : __m256i *coeffs) {
758 5676990 : const __m128i filt_0 = _mm_loadu_si128(
759 2838490 : (__m128i *)(eb_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
760 2838490 : const __m128i filt_1 = _mm_loadu_si128(
761 2838490 : (__m128i *)(eb_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));
762 :
763 : __m256i res_0 =
764 2838490 : _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);
765 :
766 5676990 : coeffs[0] = _mm256_shuffle_epi8(
767 : res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
768 5676990 : coeffs[1] = _mm256_shuffle_epi8(
769 : res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
770 5676990 : coeffs[2] = _mm256_shuffle_epi8(
771 : res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
772 2838490 : coeffs[3] = _mm256_shuffle_epi8(
773 : res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));
774 :
775 2838490 : coeffs[4] = coeffs[0];
776 2838490 : coeffs[5] = coeffs[1];
777 2838490 : coeffs[6] = coeffs[2];
778 2838490 : coeffs[7] = coeffs[3];
779 2838490 : }
780 :
781 1338280000 : static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out,
782 : __m256i *src,
783 : __m256i *coeffs,
784 : __m256i *res_lo,
785 : __m256i *res_hi, int row) {
786 1338280000 : const __m256i src_6 = horz_out[row + 3];
787 : const __m256i src_7 =
788 1338280000 : _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);
789 :
790 1338280000 : src[6] = _mm256_unpacklo_epi16(src_6, src_7);
791 :
792 1338280000 : const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]);
793 1338280000 : const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]);
794 1338280000 : const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]);
795 2676550000 : const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]);
796 :
797 2676550000 : const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2),
798 : _mm256_add_epi32(res_4, res_6));
799 :
800 1338280000 : src[7] = _mm256_unpackhi_epi16(src_6, src_7);
801 :
802 1338280000 : const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]);
803 1338280000 : const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]);
804 1338280000 : const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]);
805 2676550000 : const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]);
806 :
807 4014830000 : const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3),
808 : _mm256_add_epi32(res_5, res_7));
809 :
810 : // Rearrange pixels back into the order 0 ... 7
811 1338280000 : *res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
812 1338280000 : *res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
813 1338280000 : }
814 :
815 1344320000 : static INLINE void store_vertical_filter_output_avx2(
816 : const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const,
817 : const __m256i *wt, const __m256i *res_sub_const,
818 : const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params,
819 : int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width,
820 : const int round_bits) {
821 1344320000 : __m256i res_lo_1 = *res_lo;
822 1344320000 : __m256i res_hi_1 = *res_hi;
823 :
824 1344320000 : if (conv_params->is_compound) {
825 99195600 : __m128i *const p_0 =
826 99195600 : (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
827 99195600 : __m128i *const p_1 =
828 : (__m128i *)&conv_params
829 99195600 : ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j];
830 :
831 297587000 : res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const),
832 : reduce_bits_vert);
833 :
834 99195600 : const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1);
835 : __m256i res_lo_16;
836 99195600 : if (conv_params->do_average) {
837 49651300 : __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
838 49651300 : __m128i *const dst8_1 =
839 49651300 : (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
840 49651300 : const __m128i p_16_0 = _mm_loadl_epi64(p_0);
841 49651300 : const __m128i p_16_1 = _mm_loadl_epi64(p_1);
842 : const __m256i p_16 =
843 49651300 : _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1);
844 49651300 : if (conv_params->use_jnt_comp_avg) {
845 23963700 : const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16);
846 47927300 : const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt);
847 : const __m256i shifted_32 =
848 23963700 : _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
849 23963700 : res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32);
850 : }
851 : else
852 51375300 : res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1);
853 49651300 : res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const);
854 148954000 : res_lo_16 = _mm256_srai_epi16(
855 : _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits);
856 49651300 : const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16);
857 49651300 : const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo);
858 49651300 : const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1);
859 49651300 : *(uint32_t *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0);
860 49651300 : *(uint32_t *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1);
861 : }
862 : else {
863 49544300 : const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16);
864 49544300 : const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1);
865 49544300 : _mm_storel_epi64(p_0, temp_lo_16_0);
866 49544300 : _mm_storel_epi64(p_1, temp_lo_16_1);
867 : }
868 99195600 : if (p_width > 4) {
869 99209800 : __m128i *const p4_0 =
870 : (__m128i *)&conv_params
871 99209800 : ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
872 99209800 : __m128i *const p4_1 =
873 : (__m128i *)&conv_params
874 99209800 : ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4];
875 297629000 : res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const),
876 : reduce_bits_vert);
877 99209800 : const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1);
878 : __m256i res_hi_16;
879 99209800 : if (conv_params->do_average) {
880 49649000 : __m128i *const dst8_4_0 =
881 49649000 : (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
882 49649000 : __m128i *const dst8_4_1 =
883 49649000 : (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4];
884 49649000 : const __m128i p4_16_0 = _mm_loadl_epi64(p4_0);
885 49649000 : const __m128i p4_16_1 = _mm_loadl_epi64(p4_1);
886 49649000 : const __m256i p4_16 = _mm256_inserti128_si256(
887 : _mm256_castsi128_si256(p4_16_0), p4_16_1, 1);
888 49649000 : if (conv_params->use_jnt_comp_avg) {
889 23963600 : const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16);
890 47927100 : const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt);
891 : const __m256i shifted_32 =
892 23963600 : _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
893 23963600 : res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32);
894 : }
895 : else
896 51370800 : res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1);
897 49649000 : res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const);
898 148947000 : res_hi_16 = _mm256_srai_epi16(
899 : _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits);
900 49649000 : __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16);
901 49649000 : const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi);
902 49649000 : const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1);
903 49649000 : *(uint32_t *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0);
904 49649000 : *(uint32_t *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1);
905 : }
906 : else {
907 49560800 : const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16);
908 49560800 : const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1);
909 49560800 : _mm_storel_epi64(p4_0, temp_hi_16_0);
910 49560800 : _mm_storel_epi64(p4_1, temp_hi_16_1);
911 : }
912 : }
913 : }
914 : else {
915 2490240000 : const __m256i res_lo_round = _mm256_srai_epi32(
916 : _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
917 3735360000 : const __m256i res_hi_round = _mm256_srai_epi32(
918 : _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
919 :
920 1245120000 : const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round);
921 1245120000 : const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit);
922 1245120000 : const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit);
923 1245120000 : const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1);
924 :
925 : // Store, blending with 'pred' if needed
926 1245120000 : __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
927 1245120000 : __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
928 :
929 1245120000 : if (p_width == 4) {
930 0 : *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit0);
931 0 : *(uint32_t *)p1 = _mm_cvtsi128_si32(res_8bit1);
932 : }
933 : else {
934 1245120000 : _mm_storel_epi64(p, res_8bit0);
935 1245120000 : _mm_storel_epi64(p1, res_8bit1);
936 : }
937 : }
938 1344320000 : }
939 :
940 235245000 : static INLINE void warp_vertical_filter_avx2(
941 : uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
942 : int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
943 : int i, int j, int sy4, const int reduce_bits_vert,
944 : const __m256i *res_add_const, const int round_bits,
945 : const __m256i *res_sub_const, const __m256i *round_bits_const,
946 : const __m256i *wt) {
947 235245000 : int k, row = 0;
948 : __m256i src[8];
949 235245000 : const __m256i src_0 = horz_out[0];
950 : const __m256i src_1 =
951 235245000 : _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
952 235245000 : const __m256i src_2 = horz_out[1];
953 : const __m256i src_3 =
954 235245000 : _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
955 235245000 : const __m256i src_4 = horz_out[2];
956 : const __m256i src_5 =
957 235245000 : _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
958 :
959 235245000 : src[0] = _mm256_unpacklo_epi16(src_0, src_1);
960 235245000 : src[2] = _mm256_unpacklo_epi16(src_2, src_3);
961 235245000 : src[4] = _mm256_unpacklo_epi16(src_4, src_5);
962 :
963 235245000 : src[1] = _mm256_unpackhi_epi16(src_0, src_1);
964 235245000 : src[3] = _mm256_unpackhi_epi16(src_2, src_3);
965 235245000 : src[5] = _mm256_unpackhi_epi16(src_4, src_5);
966 :
967 1170020000 : for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
968 934544000 : int sy = sy4 + delta * (k + 4);
969 : __m256i coeffs[8];
970 934544000 : prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs);
971 : __m256i res_lo, res_hi;
972 935932000 : filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
973 : row);
974 934838000 : store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
975 : res_sub_const, round_bits_const, pred,
976 : conv_params, i, j, k, reduce_bits_vert,
977 : p_stride, p_width, round_bits);
978 934780000 : src[0] = src[2];
979 934780000 : src[2] = src[4];
980 934780000 : src[4] = src[6];
981 934780000 : src[1] = src[3];
982 934780000 : src[3] = src[5];
983 934780000 : src[5] = src[7];
984 :
985 934780000 : row += 1;
986 : }
987 235480000 : }
988 :
989 653834 : static INLINE void warp_vertical_filter_gamma0_avx2(
990 : uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
991 : int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
992 : int i, int j, int sy4, const int reduce_bits_vert,
993 : const __m256i *res_add_const, const int round_bits,
994 : const __m256i *res_sub_const, const __m256i *round_bits_const,
995 : const __m256i *wt) {
996 : (void)gamma;
997 653834 : int k, row = 0;
998 : __m256i src[8];
999 653834 : const __m256i src_0 = horz_out[0];
1000 : const __m256i src_1 =
1001 653834 : _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
1002 653834 : const __m256i src_2 = horz_out[1];
1003 : const __m256i src_3 =
1004 653834 : _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
1005 653834 : const __m256i src_4 = horz_out[2];
1006 : const __m256i src_5 =
1007 653834 : _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
1008 :
1009 653834 : src[0] = _mm256_unpacklo_epi16(src_0, src_1);
1010 653834 : src[2] = _mm256_unpacklo_epi16(src_2, src_3);
1011 653834 : src[4] = _mm256_unpacklo_epi16(src_4, src_5);
1012 :
1013 653834 : src[1] = _mm256_unpackhi_epi16(src_0, src_1);
1014 653834 : src[3] = _mm256_unpackhi_epi16(src_2, src_3);
1015 653834 : src[5] = _mm256_unpackhi_epi16(src_4, src_5);
1016 :
1017 3269010 : for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
1018 2615180 : int sy = sy4 + delta * (k + 4);
1019 : __m256i coeffs[8];
1020 2615180 : prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs);
1021 : __m256i res_lo, res_hi;
1022 2615210 : filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
1023 : row);
1024 2615230 : store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
1025 : res_sub_const, round_bits_const, pred,
1026 : conv_params, i, j, k, reduce_bits_vert,
1027 : p_stride, p_width, round_bits);
1028 2615180 : src[0] = src[2];
1029 2615180 : src[2] = src[4];
1030 2615180 : src[4] = src[6];
1031 2615180 : src[1] = src[3];
1032 2615180 : src[3] = src[5];
1033 2615180 : src[5] = src[7];
1034 2615180 : row += 1;
1035 : }
1036 653831 : }
1037 :
1038 105581000 : static INLINE void warp_vertical_filter_delta0_avx2(
1039 : uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
1040 : int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
1041 : int i, int j, int sy4, const int reduce_bits_vert,
1042 : const __m256i *res_add_const, const int round_bits,
1043 : const __m256i *res_sub_const, const __m256i *round_bits_const,
1044 : const __m256i *wt) {
1045 : (void)delta;
1046 105581000 : int k, row = 0;
1047 : __m256i src[8], coeffs[8];
1048 105581000 : const __m256i src_0 = horz_out[0];
1049 : const __m256i src_1 =
1050 105581000 : _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
1051 105581000 : const __m256i src_2 = horz_out[1];
1052 : const __m256i src_3 =
1053 105581000 : _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
1054 105581000 : const __m256i src_4 = horz_out[2];
1055 : const __m256i src_5 =
1056 105581000 : _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
1057 :
1058 105581000 : src[0] = _mm256_unpacklo_epi16(src_0, src_1);
1059 105581000 : src[2] = _mm256_unpacklo_epi16(src_2, src_3);
1060 105581000 : src[4] = _mm256_unpacklo_epi16(src_4, src_5);
1061 :
1062 105581000 : src[1] = _mm256_unpackhi_epi16(src_0, src_1);
1063 105581000 : src[3] = _mm256_unpackhi_epi16(src_2, src_3);
1064 105581000 : src[5] = _mm256_unpackhi_epi16(src_4, src_5);
1065 :
1066 105581000 : prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs);
1067 :
1068 526373000 : for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
1069 : __m256i res_lo, res_hi;
1070 420703000 : filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
1071 : row);
1072 420797000 : store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
1073 : res_sub_const, round_bits_const, pred,
1074 : conv_params, i, j, k, reduce_bits_vert,
1075 : p_stride, p_width, round_bits);
1076 420648000 : src[0] = src[2];
1077 420648000 : src[2] = src[4];
1078 420648000 : src[4] = src[6];
1079 420648000 : src[1] = src[3];
1080 420648000 : src[3] = src[5];
1081 420648000 : src[5] = src[7];
1082 420648000 : row += 1;
1083 : }
1084 105671000 : }
1085 :
1086 223388 : static INLINE void warp_vertical_filter_gamma0_delta0_avx2(
1087 : uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
1088 : int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
1089 : int i, int j, int sy4, const int reduce_bits_vert,
1090 : const __m256i *res_add_const, const int round_bits,
1091 : const __m256i *res_sub_const, const __m256i *round_bits_const,
1092 : const __m256i *wt) {
1093 : (void)gamma;
1094 223388 : int k, row = 0;
1095 : __m256i src[8], coeffs[8];
1096 223388 : const __m256i src_0 = horz_out[0];
1097 : const __m256i src_1 =
1098 223388 : _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
1099 223388 : const __m256i src_2 = horz_out[1];
1100 : const __m256i src_3 =
1101 223388 : _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
1102 223388 : const __m256i src_4 = horz_out[2];
1103 : const __m256i src_5 =
1104 223388 : _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
1105 :
1106 223388 : src[0] = _mm256_unpacklo_epi16(src_0, src_1);
1107 223388 : src[2] = _mm256_unpacklo_epi16(src_2, src_3);
1108 223388 : src[4] = _mm256_unpacklo_epi16(src_4, src_5);
1109 :
1110 223388 : src[1] = _mm256_unpackhi_epi16(src_0, src_1);
1111 223388 : src[3] = _mm256_unpackhi_epi16(src_2, src_3);
1112 223388 : src[5] = _mm256_unpackhi_epi16(src_4, src_5);
1113 :
1114 223388 : prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs);
1115 :
1116 1116920 : for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
1117 : __m256i res_lo, res_hi;
1118 893539 : filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
1119 : row);
1120 893537 : store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
1121 : res_sub_const, round_bits_const, pred,
1122 : conv_params, i, j, k, reduce_bits_vert,
1123 : p_stride, p_width, round_bits);
1124 893536 : src[0] = src[2];
1125 893536 : src[2] = src[4];
1126 893536 : src[4] = src[6];
1127 893536 : src[1] = src[3];
1128 893536 : src[3] = src[5];
1129 893536 : src[5] = src[7];
1130 893536 : row += 1;
1131 : }
1132 223386 : }
1133 :
1134 339906000 : static INLINE void prepare_warp_vertical_filter_avx2(
1135 : uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
1136 : int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
1137 : int i, int j, int sy4, const int reduce_bits_vert,
1138 : const __m256i *res_add_const, const int round_bits,
1139 : const __m256i *res_sub_const, const __m256i *round_bits_const,
1140 : const __m256i *wt) {
1141 339906000 : if (gamma == 0 && delta == 0)
1142 223388 : warp_vertical_filter_gamma0_delta0_avx2(
1143 : pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
1144 : i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
1145 : round_bits_const, wt);
1146 339683000 : else if (gamma == 0 && delta != 0)
1147 653834 : warp_vertical_filter_gamma0_avx2(
1148 : pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
1149 : i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
1150 : round_bits_const, wt);
1151 339029000 : else if (gamma != 0 && delta == 0)
1152 105591000 : warp_vertical_filter_delta0_avx2(
1153 : pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
1154 : i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
1155 : round_bits_const, wt);
1156 : else
1157 233438000 : warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta,
1158 : p_height, p_stride, p_width, i, j, sy4,
1159 : reduce_bits_vert, res_add_const, round_bits,
1160 : res_sub_const, round_bits_const, wt);
1161 342018000 : }
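/* Why the four vertical variants: a minimal scalar sketch
 * (vert_filter_index_sketch is a hypothetical helper, for illustration only).
 * Assuming the warped-filter row index for output row k (k in -4..3) and
 * column col (col in 0..7) is formed as below, with sy4 already carrying the
 * rounding term and the WARPEDPIXEL_PREC_SHIFTS offset folded in via const2 in
 * eb_av1_warp_affine_avx2, the index does not depend on col when gamma == 0
 * and does not depend on k when delta == 0, so coefficient preparation can be
 * hoisted to once per row, or once per block, respectively. */
static INLINE int vert_filter_index_sketch(int sy4, int16_t gamma,
                                           int16_t delta, int k, int col) {
  const int sy = sy4 + delta * (k + 4); /* per-row sub-pel position, as above */
  return (sy + col * gamma) >> WARPEDDIFF_PREC_BITS; /* warped-filter row */
}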
1162 :
1163 332135000 : static INLINE void prepare_warp_horizontal_filter_avx2(
1164 : const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
1165 : int32_t sx4, int alpha, int beta, int p_height, int height, int i,
1166 : const __m256i *round_const, const __m128i *shift,
1167 : const __m256i *shuffle_src) {
1168 332135000 : if (alpha == 0 && beta == 0)
1169 274245 : warp_horizontal_filter_alpha0_beta0_avx2(
1170 : ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
1171 : round_const, shift, shuffle_src);
1172 331861000 : else if (alpha == 0 && beta != 0)
1173 102404000 : warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
1174 : alpha, beta, p_height, height, i,
1175 : round_const, shift, shuffle_src);
1176 229457000 : else if (alpha != 0 && beta == 0)
1177 898172 : warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
1178 : alpha, beta, p_height, height, i,
1179 : round_const, shift, shuffle_src);
1180 : else
1181 228558000 : warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha,
1182 : beta, p_height, height, i, round_const, shift,
1183 : shuffle_src);
1184 332617000 : }
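/* The horizontal dispatch above mirrors the vertical one: alpha is the
 * per-column step of the horizontal sub-pel position and beta the per-row step
 * (see sx = sx4 + beta * (k + 4) further down), so alpha == 0 and/or beta == 0
 * again allow the horizontal filter coefficients to be prepared once and
 * reused across columns and/or rows. */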
1185 :
1186 19023800 : int64_t eb_av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
1187 : const uint8_t *const dst, int p_width,
1188 : int p_height, int dst_stride) {
1189 19023800 : int64_t sum_error = 0;
1190 : int i, j;
1191 : __m256i row_error, col_error;
1192 19023800 : __m256i zero = _mm256_set1_epi16(0);
1193 19023800 : __m256i dup_255 = _mm256_set1_epi16(255);
1194 19023800 : col_error = zero;
1195 :
1196 169151000 : for (i = 0; i < (p_height / 4); i++) {
1197 150127000 : row_error = _mm256_set1_epi16(0);
1198 449895000 : for (j = 0; j < (p_width / 16); j++) {
1199 299768000 : __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
1200 299768000 : (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
1201 299768000 : __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
1202 299768000 : (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride))));
1203 299768000 : __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
1204 299768000 : (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride))));
1205 299768000 : __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
1206 299768000 : (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride))));
1207 299768000 : __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
1208 299768000 : (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride))));
1209 299768000 : __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
1210 299768000 : (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride))));
1211 299768000 : __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
1212 299768000 : (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride))));
1213 599535000 : __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
1214 299768000 : (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride))));
1215 :
1216 : __m256i diff_1 =
1217 599535000 : _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255);
1218 : __m256i diff_2 =
1219 599535000 : _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255);
1220 : __m256i diff_3 =
1221 599535000 : _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255);
1222 : __m256i diff_4 =
1223 599535000 : _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255);
1224 :
1225 299768000 : __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero);
1226 299768000 : __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero);
1227 299768000 : __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero);
1228 299768000 : __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero);
1229 299768000 : __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero);
1230 299768000 : __m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero);
1231 299768000 : __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero);
1232 299768000 : __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero);
1233 :
1234 299768000 : __m256i error_1_lo =
1235 599535000 : _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4);
1236 299768000 : __m256i error_1_hi =
1237 899303000 : _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4);
1238 299768000 : __m256i error_2_lo =
1239 599535000 : _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4);
1240 299768000 : __m256i error_2_hi =
1241 599535000 : _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4);
1242 299768000 : __m256i error_3_lo =
1243 599535000 : _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4);
1244 299768000 : __m256i error_3_hi =
1245 599535000 : _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4);
1246 299768000 : __m256i error_4_lo =
1247 599535000 : _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4);
1248 299768000 : __m256i error_4_hi =
1249 599535000 : _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4);
1250 :
1251 299768000 : __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi);
1252 299768000 : __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi);
1253 299768000 : __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi);
1254 299768000 : __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi);
1255 :
1256 299768000 : __m256i error_1_2 = _mm256_add_epi32(error_1, error_2);
1257 299768000 : __m256i error_3_4 = _mm256_add_epi32(error_3, error_4);
1258 :
1259 299768000 : __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4);
1260 299768000 : row_error = _mm256_add_epi32(row_error, error_1_2_3_4);
1261 : }
1262 150127000 : __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
1263 150127000 : __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
1264 150127000 : __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
1265 150127000 : col_error = _mm256_add_epi64(col_error, col_error_temp);
1266 :     // Error summation for the remaining width, which is not a multiple of 16
1267 150127000 : if (p_width & 0xf) {
1268 0 : for (int k = 0; k < 4; ++k) {
1269 0 : for (int l = j * 16; l < p_width; ++l)
1270 0 : sum_error +=
1271 0 : (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] -
1272 0 : ref[l + ((i * 4) + k) * ref_stride]);
1273 : }
1274 : }
1275 : }
1276 19023800 : __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error);
1277 19023800 : __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1);
1278 19023800 : sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1);
1279 : int64_t sum_error_d_0, sum_error_d_1;
1280 19023800 : _mm_storel_epi64((__m128i *)&sum_error_d_0, sum_error_q_0);
1281 19023800 : _mm_storel_epi64((__m128i *)&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8));
1282 19023800 : sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
1283 :   // Error summation for the remaining height, which is not a multiple of 4
1284 19023800 : if (p_height & 0x3) {
1285 0 : for (int k = i * 4; k < p_height; ++k) {
1286 0 : for (int l = 0; l < p_width; ++l)
1287 0 : sum_error += (int64_t)error_measure(dst[l + k * dst_stride] -
1288 0 : ref[l + k * ref_stride]);
1289 : }
1290 : }
1291 19023800 : return sum_error;
1292 : }
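/* A minimal scalar reference for the routine above (calc_frame_error_sketch_c
 * is a hypothetical name, for illustration only). It assumes error_measure(err)
 * is the lookup error_measure_lut[255 + err], which would explain the dup_255
 * bias added to each 16-bit difference before the gathers; the remainder loops
 * in the AVX2 routine already take exactly this form. */
static int64_t calc_frame_error_sketch_c(const uint8_t *ref, int ref_stride,
                                         const uint8_t *dst, int dst_stride,
                                         int p_width, int p_height) {
  int64_t sum_error = 0;
  for (int r = 0; r < p_height; ++r)
    for (int c = 0; c < p_width; ++c)
      sum_error += (int64_t)error_measure(dst[c + r * dst_stride] -
                                          ref[c + r * ref_stride]);
  return sum_error;
}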
1293 :
1294 26609200 : void eb_av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
1295 : int height, int stride, uint8_t *pred, int p_col,
1296 : int p_row, int p_width, int p_height, int p_stride,
1297 : int subsampling_x, int subsampling_y,
1298 : ConvolveParams *conv_params, int16_t alpha,
1299 : int16_t beta, int16_t gamma, int16_t delta) {
1300 : __m256i horz_out[8];
1301 : int i, j, k;
1302 26609200 : const int bd = 8;
1303 26609200 : const int reduce_bits_horiz = conv_params->round_0;
1304 53218400 : const int reduce_bits_vert = conv_params->is_compound
1305 : ? conv_params->round_1
1306 26609200 : : 2 * FILTER_BITS - reduce_bits_horiz;
1307 26609200 : const int offset_bits_horiz = bd + FILTER_BITS - 1;
1308 26609200 : assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
1309 :
1310 26609200 : const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
1311 : const __m256i reduce_bits_vert_const =
1312 26609200 : _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
1313 26609200 : const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
1314 26609200 : const int round_bits =
1315 26609200 : 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
1316 26609200 : const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
1317 26609200 : assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
1318 :
1319 26609200 : const __m256i round_const = _mm256_set1_epi16(
1320 26609200 : (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
1321 26609200 : const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz);
1322 :
1323 : __m256i res_sub_const, round_bits_const, wt;
1324 26609200 : unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits,
1325 : &res_sub_const, &round_bits_const,
1326 : &wt);
1327 :
1328 : __m256i res_add_const_1;
1329 26611200 : if (conv_params->is_compound == 1)
1330 4275700 : res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const);
1331 : else
1332 22335500 : res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
1333 22335500 : ((1 << reduce_bits_vert) >> 1));
1334 26611200 : const int32_t const1 = alpha * (-4) + beta * (-4) +
1335 26611200 : (1 << (WARPEDDIFF_PREC_BITS - 1)) +
1336 : (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
1337 26611200 : const int32_t const2 = gamma * (-4) + delta * (-4) +
1338 26611200 : (1 << (WARPEDDIFF_PREC_BITS - 1)) +
1339 : (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
1340 26611200 : const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1);
1341 26611200 : const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1));
1342 26611200 : const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz));
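  /* Reading of the constants above (inference from their uses below): const1
   * and const2 fold the WARPEDDIFF rounding term, the WARPEDPIXEL_PREC_SHIFTS
   * table offset, and the -4,-4 block-corner step into sx4/sy4; const3 masks
   * off the WARP_PARAM_REDUCE_BITS low bits; const4/const5 appear to replicate
   * a single boundary pixel as if it had passed through the horizontal stage:
   * offset 1 << (bd + FILTER_BITS - reduce_bits_horiz - 1) plus the pixel
   * scaled by the stage's unit gain 1 << (FILTER_BITS - reduce_bits_horiz). */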
1343 :
1344 : __m256i shuffle_src[4];
1345 26611200 : shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0);
1346 26611200 : shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1);
1347 26611200 : shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2);
1348 26611200 : shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3);
1349 :
1350 118607000 : for (i = 0; i < p_height; i += 8) {
1351 433200000 : for (j = 0; j < p_width; j += 8) {
1352 341205000 : const int32_t src_x = (p_col + j + 4) << subsampling_x;
1353 341205000 : const int32_t src_y = (p_row + i + 4) << subsampling_y;
1354 341205000 : const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
1355 341205000 : const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
1356 341205000 : const int32_t x4 = dst_x >> subsampling_x;
1357 341205000 : const int32_t y4 = dst_y >> subsampling_y;
1358 :
1359 341205000 : int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
1360 341205000 : int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
1361 341205000 : int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
1362 341205000 : int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
1363 :
1364 : // Add in all the constant terms, including rounding and offset
1365 341205000 : sx4 += const1;
1366 341205000 : sy4 += const2;
1367 :
1368 341205000 : sx4 &= ~const3;
1369 341205000 : sy4 &= ~const3;
1370 :
1371 : // Horizontal filter
1372 : // If the block is aligned such that, after clamping, every sample
1373 : // would be taken from the leftmost/rightmost column, then we can
1374 : // skip the expensive horizontal filter.
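      // Three boundary cases follow: ix4 <= -7 or ix4 >= width + 6 means every
      // clamped sample comes from the leftmost or rightmost column, so one
      // pixel per row is replicated via const4/const5; a partial overlap pads
      // the 16-byte row loads with the warp_pad_left / warp_pad_right shuffle
      // masks before filtering; otherwise the fully in-bounds path goes
      // through prepare_warp_horizontal_filter_avx2.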
1375 :
1376 341205000 : if (ix4 <= -7) {
1377 1048310 : int iy, row = 0;
1378 8386170 : for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
1379 7337880 : iy = iy4 + k;
1380 7337880 : iy = clamp(iy, 0, height - 1);
1381 : const __m256i temp_0 =
1382 7337820 : _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
1383 7337820 : iy = iy4 + k + 1;
1384 7337820 : iy = clamp(iy, 0, height - 1);
1385 : const __m256i temp_1 =
1386 7337860 : _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
1387 7337860 : horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
1388 7337860 : row += 1;
1389 : }
1390 1048290 : iy = iy4 + k;
1391 1048290 : iy = clamp(iy, 0, height - 1);
1392 2096630 : horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
1393 : }
1394 340157000 : else if (ix4 >= width + 6) {
1395 178405 : int iy, row = 0;
1396 1427240 : for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
1397 1248840 : iy = iy4 + k;
1398 1248840 : iy = clamp(iy, 0, height - 1);
1399 2497670 : const __m256i temp_0 = _mm256_set1_epi16(
1400 1248840 : const4 + ref[iy * stride + (width - 1)] * const5);
1401 1248840 : iy = iy4 + k + 1;
1402 1248840 : iy = clamp(iy, 0, height - 1);
1403 2497670 : const __m256i temp_1 = _mm256_set1_epi16(
1404 1248840 : const4 + ref[iy * stride + (width - 1)] * const5);
1405 1248840 : horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
1406 1248840 : row += 1;
1407 : }
1408 178405 : iy = iy4 + k;
1409 178405 : iy = clamp(iy, 0, height - 1);
1410 178405 : horz_out[row] =
1411 356810 : _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5);
1412 : }
1413 347928000 : else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
1414 7853550 : const int out_of_boundary_left = -(ix4 - 6);
1415 7853550 : const int out_of_boundary_right = (ix4 + 8) - width;
1416 7853550 : int iy, sx, row = 0;
1417 63456900 : for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
1418 55589700 : iy = iy4 + k;
1419 55589700 : iy = clamp(iy, 0, height - 1);
1420 : __m128i src0 =
1421 55581300 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
1422 55581300 : iy = iy4 + k + 1;
1423 55581300 : iy = clamp(iy, 0, height - 1);
1424 : __m128i src1 =
1425 55538100 : _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
1426 :
1427 55538100 : if (out_of_boundary_left >= 0) {
1428 : const __m128i shuffle_reg_left =
1429 57098900 : _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
1430 28549500 : src0 = _mm_shuffle_epi8(src0, shuffle_reg_left);
1431 28549500 : src1 = _mm_shuffle_epi8(src1, shuffle_reg_left);
1432 : }
1433 55538100 : if (out_of_boundary_right >= 0) {
1434 27052500 : const __m128i shuffle_reg_right = _mm_loadu_si128(
1435 27052500 : (__m128i *)warp_pad_right[out_of_boundary_right]);
1436 27052500 : src0 = _mm_shuffle_epi8(src0, shuffle_reg_right);
1437 27052500 : src1 = _mm_shuffle_epi8(src1, shuffle_reg_right);
1438 : }
1439 55538100 : sx = sx4 + beta * (k + 4);
1440 : const __m256i src_01 =
1441 55538100 : _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
1442 55538100 : horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row,
1443 : shuffle_src, &round_const, &shift);
1444 55603300 : row += 1;
1445 : }
1446 7867200 : iy = iy4 + k;
1447 7867200 : iy = clamp(iy, 0, height - 1);
1448 7949600 : __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
1449 7949600 : if (out_of_boundary_left >= 0) {
1450 : const __m128i shuffle_reg_left =
1451 8163920 : _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
1452 4081960 : src = _mm_shuffle_epi8(src, shuffle_reg_left);
1453 : }
1454 7949600 : if (out_of_boundary_right >= 0) {
1455 : const __m128i shuffle_reg_right =
1456 7736110 : _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]);
1457 3868050 : src = _mm_shuffle_epi8(src, shuffle_reg_right);
1458 : }
1459 7949600 : sx = sx4 + beta * (k + 4);
1460 7949600 : const __m256i src_01 = _mm256_castsi128_si256(src);
1461 : __m256i coeff[4];
1462 7949600 : prepare_horizontal_filter_coeff(alpha, sx, coeff);
1463 7949850 : filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src,
1464 : &round_const, &shift, row);
1465 : }
1466 : else
1467 332125000 : prepare_warp_horizontal_filter_avx2(
1468 : ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height,
1469 : i, &round_const, &shift, shuffle_src);
1470 :
1471 : // Vertical filter
1472 339806000 : prepare_warp_vertical_filter_avx2(
1473 : pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
1474 : p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits,
1475 : &res_sub_const, &round_bits_const, &wt);
1476 : }
1477 : }
1478 26668900 : }
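/* A hedged scalar sketch of the per-block set-up at the top of the main loop
 * above (warp_block_centre_sketch is a hypothetical name, for illustration
 * only; subsampling_x == subsampling_y == 0 is assumed for brevity): the 8x8
 * block centre is pushed through the affine model mat[], which uses
 * WARPEDMODEL_PREC_BITS fractional bits, and split into an integer sample
 * position (ix4, iy4) plus sub-pel remainders (sx4, sy4) that seed the
 * horizontal and vertical filter phases. */
static void warp_block_centre_sketch(const int32_t *mat, int p_col, int p_row,
                                     int i, int j, int32_t *ix4, int32_t *sx4,
                                     int32_t *iy4, int32_t *sy4) {
  const int32_t src_x = p_col + j + 4;
  const int32_t src_y = p_row + i + 4;
  const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
  const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
  *ix4 = dst_x >> WARPEDMODEL_PREC_BITS;
  *sx4 = dst_x & ((1 << WARPEDMODEL_PREC_BITS) - 1);
  *iy4 = dst_y >> WARPEDMODEL_PREC_BITS;
  *sy4 = dst_y & ((1 << WARPEDMODEL_PREC_BITS) - 1);
}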