LCOV - code coverage report
Current view: top level - ASM_SSE2 - encodetxb_sse2.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 264 264 100.0 %
Date: 2019-11-25 17:38:06 Functions: 17 17 100.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <assert.h>
      13             : #include <emmintrin.h>  // SSE2
      14             : #include <stdint.h>
      15             : #include "EbDefinitions.h"
      16             : #include "EbCabacContextModel.h"
      17             : #include "EbCommonUtils.h"
      18             : 
      /* Returns a copy of `s` whose upper 64 bits are replaced by the 8 bytes read
       * from `src`; the lower 64 bits of `s` are preserved. The float casts only
       * reinterpret the register so that _mm_loadh_pi can be used on integer data. */
      19   336937000 : static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
      20  1010780000 :     return _mm_castps_si128(
      21             :         _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
      22             : }
      23             : 
      /* Gathers four 4-byte rows into one register: row k is read from
       * `src + k * byte_stride` (k = 0..3) and placed in 32-bit lane k.
       * NOTE(review): each row is read through a cast `int32_t *`, so this relies
       * on the target tolerating unaligned 32-bit loads -- confirm for new ports. */
      24   105187000 : static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
      25             :     const int32_t byte_stride) {
      26   105187000 :     return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
      27   105187000 :         *(const int32_t *)((int8_t *)src + 1 * byte_stride),
      28   105187000 :         *(const int32_t *)((int8_t *)src + 2 * byte_stride),
      29   105187000 :         *(const int32_t *)((int8_t *)src + 3 * byte_stride));
      30             : }
      31             : 
      /* Packs two 8-byte rows into one register: the row at `src` fills the low
       * 64 bits and the row at `src + byte_stride` fills the high 64 bits. */
      32   336973000 : static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
      33             :     const int32_t byte_stride) {
      34             :     __m128i dst;
      35   336973000 :     dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
      36   336973000 :     dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
      37   336895000 :     return dst;
      38             : }
      39             : 
      /* Loads the five neighbour tiles that the context kernel sums for a group of
       * 4 rows x 4 columns. Each level[i] holds 4 rows of 4 bytes (row pitch
       * `stride`) whose top-left byte is at, respectively:
       *   src + 1, src + stride, src + offsets[0], src + offsets[1], src + offsets[2].
       * `offsets` must hold 3 entries and `level` must have room for 5 vectors. */
      40    21057500 : static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
      41             :     const int32_t stride,
      42             :     const ptrdiff_t *const offsets,
      43             :     __m128i *const level) {
      44    21057500 :     level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
      45    21054900 :     level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
      46    21048600 :     level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
      47    21047800 :     level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
      48    21046500 :     level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
      49    21046200 : }
      50             : 
      /* Loads the five neighbour tiles that the context kernel sums for a group of
       * 2 rows x 8 columns. Each level[i] holds 2 rows of 8 bytes (row pitch
       * `stride`) whose top-left byte is at, respectively:
       *   src + 1, src + stride, src + offsets[0], src + offsets[1], src + offsets[2].
       * `offsets` must hold 3 entries and `level` must have room for 5 vectors. */
      51    67658400 : static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,
      52             :     const int32_t stride,
      53             :     const ptrdiff_t *const offsets,
      54             :     __m128i *const level) {
      55    67658400 :     level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
      56    67632000 :     level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
      57    67597000 :     level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
      58    67573100 :     level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
      59    67564800 :     level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
      60    67564800 : }
      61             : 
      /* Loads the five neighbour vectors that the context kernel sums for 16
       * consecutive positions of a single row. Each level[i] is an unaligned
       * 16-byte load starting at, respectively:
       *   src + 1, src + stride, src + offsets[0], src + offsets[1], src + offsets[2].
       * `stride` is only used to locate level[1]. `offsets` must hold 3 entries and
       * `level` must have room for 5 vectors. */
      62   189985000 : static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,
      63             :     const int32_t stride,
      64             :     const ptrdiff_t *const offsets,
      65             :     __m128i *const level) {
      66   189985000 :     level[0] = _mm_loadu_si128((__m128i *)(src + 1));
      67   189985000 :     level[1] = _mm_loadu_si128((__m128i *)(src + stride));
      68   189985000 :     level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
      69   189985000 :     level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
      70   189985000 :     level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
      71   189985000 : }
      72             : 
      /* Core of the context computation, applied independently to each of the 16
       * byte lanes: returns min(4, (sum + 1) >> 1), where sum is the total of the
       * five input levels after each has been clamped to 3.
       *
       * Side effect: level[1]..level[4] are overwritten with their clamped values;
       * level[0] is read but not modified.
       * The clamped sum is at most 5 * 3 = 15, so the 8-bit adds cannot wrap. */
      73   277396000 : static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
      74   277396000 :     const __m128i const_3 = _mm_set1_epi8(3);
      75   277396000 :     const __m128i const_4 = _mm_set1_epi8(4);
      76             :     __m128i count;
      77             : 
      78   277396000 :     count = _mm_min_epu8(level[0], const_3);
      79   277396000 :     level[1] = _mm_min_epu8(level[1], const_3);
      80   277396000 :     level[2] = _mm_min_epu8(level[2], const_3);
      81   277396000 :     level[3] = _mm_min_epu8(level[3], const_3);
      82   277396000 :     level[4] = _mm_min_epu8(level[4], const_3);
      83   277396000 :     count = _mm_add_epi8(count, level[1]);
      84   277396000 :     count = _mm_add_epi8(count, level[2]);
      85   277396000 :     count = _mm_add_epi8(count, level[3]);
      86   554793000 :     count = _mm_add_epi8(count, level[4]);
                           // Averaging with zero yields (count + 1) >> 1, i.e. a rounded halving.
      87   554793000 :     count = _mm_avg_epu8(count, _mm_setzero_si128());
      88   277396000 :     count = _mm_min_epu8(count, const_4);
      89   277396000 :     return count;
      90             : }
      91             : 
      /* Fills `coeff_contexts` for a 4-column block handled with the 2D neighbour
       * template.
       *
       *   levels         - level bytes with a row pitch of 4 + TX_PAD_HOR.
       *   height         - number of rows; must be a non-zero multiple of 4 (the
       *                    do/while body always runs at least once).
       *   offsets        - 3 byte offsets of the class-dependent neighbours.
       *   coeff_contexts - output, 4 * height bytes, written with aligned 16-byte
       *                    stores (so it must be 16-byte aligned).
       *
       * Each iteration handles 4 rows (16 positions): the kernel result plus a
       * per-position base offset. Only the first 16 positions use the
       * position-dependent table (which differs for height == 4 and taller
       * blocks); every later group uses the constant 21. Entry 0 is finally
       * forced to 0. */
      92    11074900 : static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
      93             :     const int32_t height,
      94             :     const ptrdiff_t *const offsets,
      95             :     int8_t *const coeff_contexts) {
      96    11074900 :     const int32_t stride = 4 + TX_PAD_HOR;
      97    11074900 :     const __m128i pos_to_offset_large = _mm_set1_epi8(21);
      98    11074900 :     __m128i pos_to_offset =
      99             :         (height == 4)
     100     7527890 :         ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21)
     101    22149900 :         : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21,
     102             :             21, 21);
     103             :     __m128i count;
     104             :     __m128i level[5];
     105    11074900 :     int8_t *cc = coeff_contexts;
     106    11074900 :     int32_t row = height;
     107             : 
     108             :     assert(!(height % 4));
     109             : 
     110             :     do {
     111    17515500 :         load_levels_4x4x5_sse2(levels, stride, offsets, level);
     112    17510000 :         count = get_coeff_contexts_kernel_sse2(level);
     113    17519500 :         count = _mm_add_epi8(count, pos_to_offset);
     114             :         _mm_store_si128((__m128i *)cc, count);
                               // After the first 4 rows every position uses the same base offset.
     115    17519500 :         pos_to_offset = pos_to_offset_large;
     116    17519500 :         levels += 4 * stride;
     117    17519500 :         cc += 16;
     118    17519500 :         row -= 4;
     119    17519500 :     } while (row);
     120             : 
     121    11078900 :     coeff_contexts[0] = 0;
     122    11078900 : }
     123             : 
     /* Fills `coeff_contexts` for an 8-column block handled with the 2D neighbour
      * template.
      *
      *   levels         - level bytes with a row pitch of 8 + TX_PAD_HOR.
      *   height         - number of rows; must be a non-zero multiple of 2.
      *   offsets        - 3 byte offsets of the class-dependent neighbours.
      *   coeff_contexts - output, 8 * height bytes, written with aligned 16-byte
      *                    stores (so it must be 16-byte aligned).
      *
      * Each iteration handles 2 rows (16 positions). Rows 0-1 use
      * pos_to_offset[0], rows 2-3 use pos_to_offset[1], and all later rows use the
      * constant 21; the first two tables depend on whether height is equal to,
      * less than, or greater than 8. Entry 0 is finally forced to 0. */
     124    11872400 : static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
     125             :     const int32_t height,
     126             :     const ptrdiff_t *const offsets,
     127             :     int8_t *coeff_contexts) {
     128    11872400 :     const int32_t stride = 8 + TX_PAD_HOR;
     129    11872400 :     int8_t *cc = coeff_contexts;
     130    11872400 :     int32_t row = height;
     131             :     __m128i count;
     132             :     __m128i level[5];
     133             :     __m128i pos_to_offset[3];
     134             : 
     135             :     assert(!(height % 2));
     136             : 
     137    11872400 :     if (height == 8) {
     138     7254990 :         pos_to_offset[0] =
     139     7254990 :             _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21);
     140     7254990 :         pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
     141             :             21, 21, 21, 21, 21);
     142             :     }
     143     4617450 :     else if (height < 8) {
     144     2024490 :         pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21,
     145             :             21, 21, 21, 21);
     146     2024490 :         pos_to_offset[1] = _mm_setr_epi8(16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21,
     147             :             21, 21, 21, 21, 21);
     148             :     }
     149             :     else {
     150     2592960 :         pos_to_offset[0] = _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
     151             :             11, 11, 11, 11, 11);
     152     2592960 :         pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
     153             :             21, 21, 21, 21, 21);
     154             :     }
     155    11872400 :     pos_to_offset[2] = _mm_set1_epi8(21);
     156             : 
     157             :     do {
     158    58312000 :         load_levels_8x2x5_sse2(levels, stride, offsets, level);
     159    58208600 :         count = get_coeff_contexts_kernel_sse2(level);
     160   116634000 :         count = _mm_add_epi8(count, pos_to_offset[0]);
     161             :         _mm_store_si128((__m128i *)cc, count);
                               // Shift the table queue: [1] is used next, then [2] from there on.
     162    58317100 :         pos_to_offset[0] = pos_to_offset[1];
     163    58317100 :         pos_to_offset[1] = pos_to_offset[2];
     164    58317100 :         levels += 2 * stride;
     165    58317100 :         cc += 16;
     166    58317100 :         row -= 2;
     167    58317100 :     } while (row);
     168             : 
     169    11877500 :     coeff_contexts[0] = 0;
     170    11877500 : }
     171             : 
     /* Fills `coeff_contexts` for a block whose width is a multiple of 16, handled
      * with the 2D neighbour template.
      *
      *   levels         - level bytes with a row pitch of width + TX_PAD_HOR.
      *   real_width,
      *   real_height    - only compared with each other to choose the base-offset
      *                    tables (square, wider-than-tall, taller-than-wide).
      *   width, height  - dimensions actually iterated; width must be a non-zero
      *                    multiple of 16 and height non-zero.
      *   offsets        - 3 byte offsets of the class-dependent neighbours.
      *   coeff_contexts - output, width * height bytes, written with aligned
      *                    16-byte stores (so it must be 16-byte aligned).
      *
      * Rows are processed one at a time in chunks of 16 columns. Within a row the
      * first chunk uses pos_to_offset[0] and every later chunk uses
      * pos_to_offset_large[0]. After each row both queues shift by one, so rows
      * 0..3 use pos_to_offset[0..3] and rows >= 4 keep using pos_to_offset[4];
      * likewise rows >= 2 use pos_to_offset_large[2] (constant 21) for the later
      * chunks. Entry 0 is finally forced to 0. */
     172     9094900 : static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
     173             :     const int32_t real_width,
     174             :     const int32_t real_height,
     175             :     const int32_t width, const int32_t height,
     176             :     const ptrdiff_t *const offsets,
     177             :     int8_t *coeff_contexts) {
     178     9094900 :     const int32_t stride = width + TX_PAD_HOR;
     179     9094900 :     int8_t *cc = coeff_contexts;
     180     9094900 :     int32_t row = height;
     181             :     __m128i pos_to_offset[5];
     182             :     __m128i pos_to_offset_large[3];
     183             :     __m128i count;
     184             :     __m128i level[5];
     185             : 
     186             :     assert(!(width % 16));
     187             : 
     188     9094900 :     pos_to_offset_large[2] = _mm_set1_epi8(21);
     189     9094900 :     if (real_width == real_height) {
     190     3379460 :         pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
     191             :             21, 21, 21, 21);
     192     3379460 :         pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
     193             :             21, 21, 21, 21, 21);
     194     3379460 :         pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
     195             :             21, 21, 21, 21, 21);
     196     3379460 :         pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
     197             :             21, 21, 21, 21, 21);
     198     3379460 :         pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
     199     3379460 :             pos_to_offset_large[2];
     200             :     }
     201     5715440 :     else if (real_width > real_height) {
     202     4723200 :         pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21,
     203             :             21, 21, 21, 21, 21);
     204     4723200 :         pos_to_offset[1] = _mm_setr_epi8(16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21,
     205             :             21, 21, 21, 21, 21);
     206     4723200 :         pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8(
     207             :             16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
     208     4723200 :         pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
     209             :     }
     210             :     else {  // real_width < real_height
     211      992249 :         pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8(
     212             :             11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11);
     213      992249 :         pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
     214             :             21, 21, 21, 21, 21);
     215      992249 :         pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
     216             :             21, 21, 21, 21, 21);
     217      992249 :         pos_to_offset[4] = pos_to_offset_large[2];
     218      992249 :         pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11);
     219             :     }
     220             : 
     221             :     do {
     222   137745000 :         int32_t w = width;
     223             : 
     224             :         do {
     225   186634000 :             load_levels_16x1x5_sse2(levels, stride, offsets, level);
     226   186376000 :             count = get_coeff_contexts_kernel_sse2(level);
     227   373800000 :             count = _mm_add_epi8(count, pos_to_offset[0]);
     228             :             _mm_store_si128((__m128i *)cc, count);
     229   186900000 :             levels += 16;
     230   186900000 :             cc += 16;
     231   186900000 :             w -= 16;
                                   // Columns 16 and beyond of this row use the "large" table.
     232   186900000 :             pos_to_offset[0] = pos_to_offset_large[0];
     233   186900000 :         } while (w);
     234             : 
                               // Advance both table queues to the entries for the next row.
     235   138011000 :         pos_to_offset[0] = pos_to_offset[1];
     236   138011000 :         pos_to_offset[1] = pos_to_offset[2];
     237   138011000 :         pos_to_offset[2] = pos_to_offset[3];
     238   138011000 :         pos_to_offset[3] = pos_to_offset[4];
     239   138011000 :         pos_to_offset_large[0] = pos_to_offset_large[1];
     240   138011000 :         pos_to_offset_large[1] = pos_to_offset_large[2];
                               // Skip the horizontal padding to reach the start of the next row.
     241   138011000 :         levels += TX_PAD_HOR;
     242   138011000 :     } while (--row);
     243             : 
     244     9361100 :     coeff_contexts[0] = 0;
     245     9361100 : }
     246             : 
     /* Fills `coeff_contexts` for a 4-column block in the horizontal class.
      *
      *   levels         - level bytes with a row pitch of 4 + TX_PAD_HOR.
      *   height         - number of rows; must be a non-zero multiple of 4.
      *   offsets        - 3 byte offsets of the class-dependent neighbours.
      *   coeff_contexts - output, 4 * height bytes, written with aligned 16-byte
      *                    stores (so it must be 16-byte aligned).
      *
      * Each iteration handles 4 rows. The base offset depends only on the column
      * and is the same for every row: SIG_COEF_CONTEXTS_2D + 0 for column 0,
      * + 5 for column 1 and + 10 for columns 2 and 3. Unlike the 2D variants,
      * entry 0 is not overwritten afterwards. */
     247     1455480 : static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
     248             :     const int32_t height,
     249             :     const ptrdiff_t *const offsets,
     250             :     int8_t *coeff_contexts) {
     251     1455480 :     const int32_t stride = 4 + TX_PAD_HOR;
     252             :     const __m128i pos_to_offset =
     253     1455480 :         _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
     254             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     255             :             SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
     256             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     257             :             SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
     258             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     259             :             SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
     260             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
     261             :     __m128i count;
     262             :     __m128i level[5];
     263     1455480 :     int32_t row = height;
     264             : 
     265             :     assert(!(height % 4));
     266             : 
     267             :     do {
     268     1883200 :         load_levels_4x4x5_sse2(levels, stride, offsets, level);
     269     1883120 :         count = get_coeff_contexts_kernel_sse2(level);
     270     1883280 :         count = _mm_add_epi8(count, pos_to_offset);
     271             :         _mm_store_si128((__m128i *)coeff_contexts, count);
     272     1883280 :         levels += 4 * stride;
     273     1883280 :         coeff_contexts += 16;
     274     1883280 :         row -= 4;
     275     1883280 :     } while (row);
     276     1455560 : }
     277             : 
     /* Fills `coeff_contexts` for a 4-column block in the vertical class.
      *
      *   levels         - level bytes with a row pitch of 4 + TX_PAD_HOR.
      *   height         - number of rows; must be a non-zero multiple of 4.
      *   offsets        - 3 byte offsets of the class-dependent neighbours.
      *   coeff_contexts - output, 4 * height bytes, written with aligned 16-byte
      *                    stores (so it must be 16-byte aligned).
      *
      * Each iteration handles 4 rows. The base offset depends only on the row:
      * SIG_COEF_CONTEXTS_2D + 0 for row 0, + 5 for row 1 and + 10 for every row
      * from 2 onwards (rows 2-3 via the initial table, later rows via
      * pos_to_offset_large). Entry 0 is not overwritten afterwards. */
     278     1302920 : static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
     279             :     const int32_t height,
     280             :     const ptrdiff_t *const offsets,
     281             :     int8_t *coeff_contexts) {
     282     1302920 :     const int32_t stride = 4 + TX_PAD_HOR;
     283     1302920 :     const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
     284             :     __m128i pos_to_offset =
     285     1302920 :         _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
     286             :             SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
     287             :             SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
     288             :             SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
     289             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     290             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     291             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     292             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
     293             :     __m128i count;
     294             :     __m128i level[5];
     295     1302920 :     int32_t row = height;
     296             : 
     297             :     assert(!(height % 4));
     298             : 
     299             :     do {
     300     1660330 :         load_levels_4x4x5_sse2(levels, stride, offsets, level);
     301     1660290 :         count = get_coeff_contexts_kernel_sse2(level);
     302     1660400 :         count = _mm_add_epi8(count, pos_to_offset);
     303             :         _mm_store_si128((__m128i *)coeff_contexts, count);
     304     1660400 :         pos_to_offset = pos_to_offset_large;
     305     1660400 :         levels += 4 * stride;
     306     1660400 :         coeff_contexts += 16;
     307     1660400 :         row -= 4;
     308     1660400 :     } while (row);
     309     1302990 : }
     310             : 
     /* Fills `coeff_contexts` for an 8-column block in the horizontal class.
      *
      *   levels         - level bytes with a row pitch of 8 + TX_PAD_HOR.
      *   height         - number of rows; must be a non-zero multiple of 2.
      *   offsets        - 3 byte offsets of the class-dependent neighbours.
      *   coeff_contexts - output, 8 * height bytes, written with aligned 16-byte
      *                    stores (so it must be 16-byte aligned).
      *
      * Each iteration handles 2 rows. The base offset depends only on the column
      * and is the same for every row: SIG_COEF_CONTEXTS_2D + 0 for column 0,
      * + 5 for column 1 and + 10 for columns 2..7. Entry 0 is not overwritten
      * afterwards. */
     311     1164390 : static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
     312             :     const int32_t height,
     313             :     const ptrdiff_t *const offsets,
     314             :     int8_t *coeff_contexts) {
     315     1164390 :     const int32_t stride = 8 + TX_PAD_HOR;
     316             :     const __m128i pos_to_offset =
     317     1164390 :         _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
     318             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     319             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     320             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     321             :             SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
     322             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     323             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     324             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
     325     1164390 :     int32_t row = height;
     326             :     __m128i count;
     327             :     __m128i level[5];
     328             : 
     329             :     assert(!(height % 2));
     330             : 
     331             :     do {
     332     4984300 :         load_levels_8x2x5_sse2(levels, stride, offsets, level);
     333     4983560 :         count = get_coeff_contexts_kernel_sse2(level);
     334     4984370 :         count = _mm_add_epi8(count, pos_to_offset);
     335             :         _mm_store_si128((__m128i *)coeff_contexts, count);
     336     4984370 :         levels += 2 * stride;
     337     4984370 :         coeff_contexts += 16;
     338     4984370 :         row -= 2;
     339     4984370 :     } while (row);
     340     1164460 : }
     341             : 
     /* Fills `coeff_contexts` for an 8-column block in the vertical class.
      *
      *   levels         - level bytes with a row pitch of 8 + TX_PAD_HOR.
      *   height         - number of rows; must be a non-zero multiple of 2.
      *   offsets        - 3 byte offsets of the class-dependent neighbours.
      *   coeff_contexts - output, 8 * height bytes, written with aligned 16-byte
      *                    stores (so it must be 16-byte aligned).
      *
      * Each iteration handles 2 rows. The base offset depends only on the row:
      * SIG_COEF_CONTEXTS_2D + 0 for row 0, + 5 for row 1 (both from the initial
      * table) and + 10 for every later row. Entry 0 is not overwritten
      * afterwards. */
     342     1037100 : static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
     343             :     const int32_t height,
     344             :     const ptrdiff_t *const offsets,
     345             :     int8_t *coeff_contexts) {
     346     1037100 :     const int32_t stride = 8 + TX_PAD_HOR;
     347     1037100 :     const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
     348             :     __m128i pos_to_offset =
     349     1037100 :         _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
     350             :             SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
     351             :             SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
     352             :             SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
     353             :             SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
     354             :             SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
     355             :             SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
     356             :             SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5);
     357     1037100 :     int32_t row = height;
     358             :     __m128i count;
     359             :     __m128i level[5];
     360             : 
     361             :     assert(!(height % 2));
     362             : 
     363             :     do {
     364     4421060 :         load_levels_8x2x5_sse2(levels, stride, offsets, level);
     365     4420370 :         count = get_coeff_contexts_kernel_sse2(level);
     366     4421110 :         count = _mm_add_epi8(count, pos_to_offset);
     367             :         _mm_store_si128((__m128i *)coeff_contexts, count);
     368     4421110 :         pos_to_offset = pos_to_offset_large;
     369     4421110 :         levels += 2 * stride;
     370     4421110 :         coeff_contexts += 16;
     371     4421110 :         row -= 2;
     372     4421110 :     } while (row);
     373     1037160 : }
     374             : 
     /* Fills `coeff_contexts` for a block whose width is a multiple of 16, in the
      * horizontal class.
      *
      *   levels         - level bytes with a row pitch of width + TX_PAD_HOR.
      *   width, height  - dimensions iterated; width must be a non-zero multiple
      *                    of 16 and height non-zero.
      *   offsets        - 3 byte offsets of the class-dependent neighbours.
      *   coeff_contexts - output, width * height bytes, written with aligned
      *                    16-byte stores (so it must be 16-byte aligned).
      *
      * Rows are processed one at a time in chunks of 16 columns. The base offset
      * depends only on the column: SIG_COEF_CONTEXTS_2D + 0 for column 0, + 5 for
      * column 1 and + 10 for every other column. The per-row table is therefore
      * re-created at the start of each row and replaced by the constant table
      * after the first chunk. Entry 0 is not overwritten afterwards. */
     375      243991 : static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
     376             :     const int32_t width, const int32_t height,
     377             :     const ptrdiff_t *const offsets,
     378             :     int8_t *coeff_contexts) {
     379      243991 :     const int32_t stride = width + TX_PAD_HOR;
     380             :     const __m128i pos_to_offset_large =
     381      243991 :         _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     382             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     383             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     384             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     385             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     386             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     387             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     388             :             SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
     389             :     __m128i count;
     390             :     __m128i level[5];
     391      243991 :     int32_t row = height;
     392             : 
     393             :     assert(!(width % 16));
     394             : 
     395             :     do {
     396             :         __m128i pos_to_offset =
     397     1823180 :             _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
     398             :                 SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     399             :                 SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     400             :                 SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     401             :                 SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     402             :                 SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     403             :                 SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
     404             :                 SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
     405     1823180 :         int32_t w = width;
     406             : 
     407             :         do {
     408     1823180 :             load_levels_16x1x5_sse2(levels, stride, offsets, level);
     409     1823160 :             count = get_coeff_contexts_kernel_sse2(level);
     410     1823200 :             count = _mm_add_epi8(count, pos_to_offset);
     411             :             _mm_store_si128((__m128i *)coeff_contexts, count);
     412     1823200 :             pos_to_offset = pos_to_offset_large;
     413     1823200 :             levels += 16;
     414     1823200 :             coeff_contexts += 16;
     415     1823200 :             w -= 16;
     416     1823200 :         } while (w);
     417             : 
                               // Skip the horizontal padding to reach the start of the next row.
     418     1823190 :         levels += TX_PAD_HOR;
     419     1823190 :     } while (--row);
     420      244009 : }
     421             : 
     /* Fills `coeff_contexts` for a block whose width is a multiple of 16, in the
      * vertical class.
      *
      *   levels         - level bytes with a row pitch of width + TX_PAD_HOR.
      *   width, height  - dimensions iterated; width must be a non-zero multiple
      *                    of 16 and height non-zero.
      *   offsets        - 3 byte offsets of the class-dependent neighbours.
      *   coeff_contexts - output, width * height bytes, written with aligned
      *                    16-byte stores (so it must be 16-byte aligned).
      *
      * Rows are processed one at a time in chunks of 16 columns. The base offset
      * depends only on the row: SIG_COEF_CONTEXTS_2D + 0 for row 0, + 5 for row 1
      * and + 10 for every later row (the 3-entry queue shifts after each row).
      * Entry 0 is not overwritten afterwards. */
     422      245372 : static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
     423             :     const int32_t width, const int32_t height,
     424             :     const ptrdiff_t *const offsets,
     425             :     int8_t *coeff_contexts) {
     426      245372 :     const int32_t stride = width + TX_PAD_HOR;
     427             :     __m128i pos_to_offset[3];
     428             :     __m128i count;
     429             :     __m128i level[5];
     430      245372 :     int32_t row = height;
     431             : 
     432             :     assert(!(width % 16));
     433             : 
     434      245372 :     pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
     435      245372 :     pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
     436      245372 :     pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
     437             : 
     438             :     do {
     439     1836460 :         int32_t w = width;
     440             : 
     441             :         do {
     442     1836460 :             load_levels_16x1x5_sse2(levels, stride, offsets, level);
     443     1836460 :             count = get_coeff_contexts_kernel_sse2(level);
     444     3673020 :             count = _mm_add_epi8(count, pos_to_offset[0]);
     445             :             _mm_store_si128((__m128i *)coeff_contexts, count);
     446     1836510 :             levels += 16;
     447     1836510 :             coeff_contexts += 16;
     448     1836510 :             w -= 16;
     449     1836510 :         } while (w);
     450             : 
     451     1836510 :         pos_to_offset[0] = pos_to_offset[1];
     452     1836510 :         pos_to_offset[1] = pos_to_offset[2];
                               // Skip the horizontal padding to reach the start of the next row.
     453     1836510 :         levels += TX_PAD_HOR;
     454     1836510 :     } while (--row);
     455      245421 : }
     456             : 
     /* SSE2 entry point: fills `coeff_contexts` with one context value per
      * position of a transform block, then overrides the entry of the last
      * scanned coefficient.
      *
      *   levels         - padded level bytes; the row pitch used throughout is
      *                    get_txb_wide(tx_size) + TX_PAD_HOR.
      *   scan           - scan-order table; only scan[eob - 1] is read here.
      *   eob            - number of coded coefficients. When eob == 1 only
      *                    coeff_contexts[0] is written (with 0).
      *                    NOTE(review): eob == 0 would make last_idx -1 and read
      *                    scan[-1]; presumably callers guarantee eob >= 1 --
      *                    confirm.
      *   tx_size        - selects the dimensions via tx_size_wide/tx_size_high
      *                    (real_*) and get_txb_wide/get_txb_high (iterated size),
      *                    all defined outside this file.
      *   tx_class       - TX_CLASS_2D, TX_CLASS_HORIZ, or anything else (treated
      *                    as vertical); selects the neighbour offsets and helper.
      *   coeff_contexts - output; must be 16-byte aligned (asserted below). */
     457    43903800 : void eb_av1_get_nz_map_contexts_sse2(
     458             :     const uint8_t *const levels,
     459             :     const int16_t *const scan,
     460             :     const uint16_t eob,
     461             :     TxSize tx_size,
     462             :     const TxClass tx_class,
     463             :     int8_t *const coeff_contexts
     464             : ) {
     465    43903800 :     const int32_t last_idx = eob - 1;
     466    43903800 :     if (!last_idx) {
     467     6463950 :         coeff_contexts[0] = 0;
     468     6463950 :         return;
     469             :     }
     470             : 
     471    37439900 :     const int32_t real_width = tx_size_wide[tx_size];
     472    37439900 :     const int32_t real_height = tx_size_high[tx_size];
     473    37439900 :     const int32_t width = get_txb_wide(tx_size);
     474    37460800 :     const int32_t height = get_txb_high(tx_size);
     475             : 
     476    37485100 :     const int32_t stride = width + TX_PAD_HOR;
     477             : 
     478             :     ptrdiff_t offsets[3];
     479             : 
     480             :     /* coeff_contexts must be 16 byte aligned. */
     481             :     assert(!((intptr_t)coeff_contexts & 0xf));
     482             : 
                           // The loaders always read the neighbours at +1 and +stride; offsets[]
                           // supplies the three remaining, class-dependent neighbours.
     483    37485100 :     if (tx_class == TX_CLASS_2D) {
                               // Two columns right, one row down + one column right, two rows down.
     484    32036100 :         offsets[0] = 0 * stride + 2;
     485    32036100 :         offsets[1] = 1 * stride + 1;
     486    32036100 :         offsets[2] = 2 * stride + 0;
     487             : 
     488    32036100 :         if (width == 4)
     489    11074700 :             get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts);
     490    20961400 :         else if (width == 8)
     491    11873000 :             get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts);
                               // NOTE(review): this branch and the else below make the identical call.
     492     9088460 :         else if (width == 16) {
     493     6773810 :             get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
     494             :                 offsets, coeff_contexts);
     495             :         }
     496             :         else {
     497     2314650 :             get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
     498             :                 offsets, coeff_contexts);
     499             :         }
     500             :     }
     501     5448960 :     else if (tx_class == TX_CLASS_HORIZ) {
                               // Two, three and four columns to the right in the same row.
     502     2863710 :         offsets[0] = 2;
     503     2863710 :         offsets[1] = 3;
     504     2863710 :         offsets[2] = 4;
     505     2863710 :         if (width == 4)
     506     1455490 :             get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts);
     507     1408220 :         else if (width == 8)
     508     1164390 :             get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts);
     509             :         else {
     510      243831 :             get_16n_coeff_contexts_hor(levels, width, height, offsets,
     511             :                 coeff_contexts);
     512             :         }
     513             :     }
     514             :     else {  // TX_CLASS_VERT
                               // Two, three and four rows below in the same column.
     515     2585250 :         offsets[0] = 2 * stride;
     516     2585250 :         offsets[1] = 3 * stride;
     517     2585250 :         offsets[2] = 4 * stride;
     518     2585250 :         if (width == 4)
     519     1302920 :             get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts);
     520     1282330 :         else if (width == 8)
     521     1037120 :             get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts);
     522             :         else {
     523      245215 :             get_16n_coeff_contexts_ver(levels, width, height, offsets,
     524             :                 coeff_contexts);
     525             :         }
     526             :     }
     527             : 
                           // Override the context at the last scanned position: 1, 2 or 3 depending
                           // on whether last_idx lies within the first eighth, the first quarter, or
                           // beyond, of (height << bwl) -- presumably the block's coefficient count;
                           // get_txb_bwl is defined elsewhere.
     528    37501900 :     const int32_t bwl = get_txb_bwl(tx_size);
     529    37472400 :     const int32_t pos = scan[last_idx];
     530    37472400 :     if (last_idx <= (height << bwl) / 8)
     531     6834960 :         coeff_contexts[pos] = 1;
     532    30637400 :     else if (last_idx <= (height << bwl) / 4)
     533     5374890 :         coeff_contexts[pos] = 2;
     534             :     else
     535    25262500 :         coeff_contexts[pos] = 3;
     536             : }

Generated by: LCOV version 1.14