Line data Source code
1 : /* 2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved 3 : * 4 : * This source code is subject to the terms of the BSD 2 Clause License and 5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 : * was not distributed with this source code in the LICENSE file, you can 7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 : * Media Patent License 1.0 was not distributed with this source code in the 9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 : */ 11 : 12 : #ifndef EBVARIANCE_SSE2_H 13 : #define EBVARIANCE_SSE2_H 14 : 15 : #include "EbDefinitions.h" 16 : #include <assert.h> 17 : #include <emmintrin.h> // SSE2 18 : #include "aom_dsp_rtcd.h" 19 : #include "synonyms.h" 20 : 21 : // Read 4 samples from each of row and row + 1. Interleave the two rows and 22 : // zero-extend them to 16 bit samples stored in the lower half of an SSE 23 : // register. 24 : //static __m128i read64(const uint8_t *p, int32_t stride, int32_t row) { 25 : // __m128i row1 = xx_loadl_32(p + (row + 1) * stride); 26 : // return _mm_unpacklo_epi8(_mm_unpacklo_epi8(row0, row1), _mm_setzero_si128()); 27 : //} 28 : 29 1851520 : static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int32_t stride) { 30 1851520 : const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride)); 31 3703040 : const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride)); 32 5554570 : return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128()); 33 : } 34 : 35 7070680 : static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) { 36 7070680 : const __m128i p0 = _mm_loadl_epi64((const __m128i *)p); 37 14141400 : return _mm_unpacklo_epi8(p0, _mm_setzero_si128()); 38 : } 39 : 40 : // Accumulate 4 32bit numbers in val to 1 32bit number 41 470396 : static INLINE uint32_t add32x4_sse2(__m128i val) { 42 470396 : val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); 43 940792 : val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); 44 470396 : return _mm_cvtsi128_si32(val); 45 : } 46 : 47 0 : static INLINE void variance_kernel_no_sum_sse2(const __m128i src, const __m128i ref, 48 : __m128i *const sse) { 49 0 : const __m128i diff = _mm_sub_epi16(src, ref); 50 0 : *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff)); 51 0 : } 52 : 53 : // Can handle 128 pixels' diff sum (such as 8x16 or 16x8) 54 : // Slightly faster than variance_final_256_pel_no_sum_sse2() 55 : // diff sum of 128 pixels can still fit in 16bit integer 56 0 : static INLINE void variance_final_128_pel_no_sum_sse2(__m128i vsse, 57 : uint32_t *const sse) { 58 0 : *sse = add32x4_sse2(vsse); 59 0 : } 60 : 61 : // Can handle 256 pixels' diff sum (such as 16x16) 62 0 : static INLINE void variance_final_256_pel_no_sum_sse2(__m128i vsse, 63 : uint32_t *const sse) { 64 0 : *sse = add32x4_sse2(vsse); 65 0 : } 66 : 67 0 : static INLINE void variance4_no_sum_sse2(const uint8_t *src, const int32_t src_stride, 68 : const uint8_t *ref, const int32_t ref_stride, 69 : const int32_t h, __m128i *const sse) { 70 0 : assert(h <= 256); // May overflow for larger height. 71 : 72 0 : for (int32_t i = 0; i < h; i += 2) { 73 0 : const __m128i s = load4x2_sse2(src, src_stride); 74 0 : const __m128i r = load4x2_sse2(ref, ref_stride); 75 : 76 0 : variance_kernel_no_sum_sse2(s, r, sse); 77 0 : src += 2 * src_stride; 78 0 : ref += 2 * ref_stride; 79 : } 80 0 : } 81 : 82 0 : static INLINE void variance8_no_sum_sse2(const uint8_t *src, const int32_t src_stride, 83 : const uint8_t *ref, const int32_t ref_stride, 84 : const int32_t h, __m128i *const sse) { 85 0 : assert(h <= 128); // May overflow for larger height. 86 0 : for (int32_t i = 0; i < h; i++) { 87 0 : const __m128i s = load8_8to16_sse2(src); 88 0 : const __m128i r = load8_8to16_sse2(ref); 89 : 90 0 : variance_kernel_no_sum_sse2(s, r, sse); 91 0 : src += src_stride; 92 0 : ref += ref_stride; 93 : } 94 0 : } 95 : 96 : #endif // EBVARIANCE_SSE2_H