Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : #ifndef EbMemory_AVX2_h
7 : #define EbMemory_AVX2_h
8 :
#include <string.h>

#include "synonyms.h"
10 :
11 : #ifdef __cplusplus
12 : extern "C" {
13 : #endif
14 :
#ifndef _mm256_set_m128i
/* Defined only when no macro of this name exists yet. Combines two 128-bit
 * integer vectors into one 256-bit vector: `lo` occupies bits 0..127 and
 * `hi` occupies bits 128..255. */
#define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
    _mm256_insertf128_si256(_mm256_castsi128_si256((lo)), (hi), 1)
#endif
19 :
#ifndef _mm256_setr_m128i
/* Defined only when no macro of this name exists yet. Same result as
 * _mm256_set_m128i with the arguments in reverse order: the first argument
 * (`lo`) lands in bits 0..127, the second (`hi`) in bits 128..255. */
#define _mm256_setr_m128i(/* __m128i */ lo, /* __m128i */ hi) \
    _mm256_insertf128_si256(_mm256_castsi128_si256((lo)), (hi), 1)
#endif
24 :
#ifndef _mm256_cvtsi256_si32
/* Defined only when no macro of this name exists yet. Returns the lowest
 * 32-bit element of the 256-bit integer vector `a` as an int. */
#define _mm256_cvtsi256_si32(a) \
    _mm_cvtsi128_si32(_mm256_castsi256_si128((a)))
#endif
29 :
30 114883600 : static INLINE __m256i load_u8_4x4_avx2(const uint8_t *const src,
31 : const uint32_t stride)
32 : {
33 : __m128i src01, src23;
34 114883600 : src01 = _mm_cvtsi32_si128(*(int32_t*)(src + 0 * stride));
35 114883600 : src01 = _mm_insert_epi32(src01, *(int32_t *)(src + 1 * stride), 1);
36 114883600 : src23 = _mm_cvtsi32_si128(*(int32_t*)(src + 2 * stride));
37 114883600 : src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1);
38 229767200 : return _mm256_setr_m128i(src01, src23);
39 : }
40 :
41 337398000 : static INLINE __m256i load_u8_8x2_avx2(const uint8_t *const src,
42 : const ptrdiff_t stride) {
43 337398000 : const __m128i s0 = _mm_loadl_epi64((__m128i *)src);
44 337398000 : const __m128i s1 = _mm_loadl_epi64((__m128i *)(src + stride));
45 674796000 : return _mm256_setr_m128i(s0, s1);
46 : }
47 :
48 611215900 : static INLINE __m256i load_u8_8x4_avx2(const uint8_t *const src,
49 : const uint32_t stride)
50 : {
51 : __m128i src01, src23;
52 611215900 : src01 = _mm_loadl_epi64((__m128i *)(src + 0 * stride));
53 1222519000 : src01 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(src01),
54 611215900 : (double *)(src + 1 * stride)));
55 611302500 : src23 = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
56 1222762000 : src23 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(src23),
57 611302500 : (double *)(src + 3 * stride)));
58 1222915000 : return _mm256_setr_m128i(src01, src23);
59 : }
60 :
61 : static INLINE __m256i load_u8_16x2_avx2(const uint8_t *const src,
62 : const uint32_t stride)
63 : {
64 : const __m128i src0 = _mm_load_si128((__m128i *)(src + 0 * stride));
65 : const __m128i src1 = _mm_load_si128((__m128i *)(src + 1 * stride));
66 : return _mm256_setr_m128i(src0, src1);
67 : }
68 :
69 2528440000 : static INLINE __m256i loadu_8bit_16x2_avx2(const void *const src,
70 : const uint32_t strideInByte)
71 : {
72 2528440000 : const __m128i src0 = _mm_loadu_si128((__m128i *)src);
73 2528440000 : const __m128i src1 = _mm_loadu_si128((__m128i *)((uint8_t *)src + strideInByte));
74 5056875000 : return _mm256_setr_m128i(src0, src1);
75 : }
76 :
77 1159622400 : static INLINE __m256i loadu_u8_16x2_avx2(const uint8_t *const src,
78 : const uint32_t stride)
79 : {
80 1159622400 : return loadu_8bit_16x2_avx2(src, sizeof(*src) * stride);
81 : }
82 :
83 1366486000 : static INLINE __m256i loadu_u16_8x2_avx2(const uint16_t *const src,
84 : const uint32_t stride)
85 : {
86 1366486000 : return loadu_8bit_16x2_avx2(src, sizeof(*src) * stride);
87 : }
88 :
89 969758000 : static INLINE void storeu_8bit_16x2_avx2(const __m256i src,
90 : void *const dst, const int32_t strideInByte) {
91 969758000 : const __m128i d0 = _mm256_castsi256_si128(src);
92 969758000 : const __m128i d1 = _mm256_extracti128_si256(src, 1);
93 : _mm_storeu_si128((__m128i *)dst, d0);
94 969758000 : _mm_storeu_si128((__m128i *)((uint8_t *)dst + strideInByte), d1);
95 969758000 : }
96 :
97 385235400 : static INLINE void storeu_u8_16x2_avx2(const __m256i src,
98 : uint8_t *const dst,
99 : const int32_t stride) {
100 385235400 : storeu_8bit_16x2_avx2(src, dst, sizeof(*dst) * stride);
101 385019100 : }
102 :
103 137868000 : static INLINE void storeu_s16_8x2_avx2(const __m256i src,
104 : int16_t *const dst,
105 : const int32_t stride) {
106 137868000 : storeu_8bit_16x2_avx2(src, dst, sizeof(*dst) * stride);
107 137868000 : }
108 :
109 449856000 : static INLINE void storeu_u16_8x2_avx2(const __m256i src,
110 : uint16_t *const dst,
111 : const int32_t stride) {
112 449856000 : storeu_8bit_16x2_avx2(src, dst, sizeof(*dst) * stride);
113 450015000 : }
114 :
115 : #ifdef __cplusplus
116 : }
117 : #endif
118 : #endif // EbMemory_AVX2_h
|