LCOV - code coverage report
Current view: top level - ASM_SSE4_1 - EbIntraPrediction16bit_Intrinsic_SSE4_1.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 109 175 62.3 %
Date: 2019-11-25 17:38:06 Functions: 2 3 66.7 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
       4             : */
       5             : 
       6             : #include "EbDefinitions.h"
       7             : #include "smmintrin.h"
       8             : #include "aom_dsp_rtcd.h"
       9    21266000 : void eb_av1_filter_intra_edge_sse4_1(uint8_t *p, int32_t sz, int32_t strength) {
      10    21266000 :     if (!strength) return;
      11             : 
      12             :     DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
      13             :         { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 },  // strength 1: 4,8,4
      14             :         { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 },  // strength 2: 5,6,5
      15             :         { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 }  // strength 3: 2,4,4,4,2
      16             :     };
      17             : 
      18             :     DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
      19             :         { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
      20             :         { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
      21             :         { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
      22             :         { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
      23             :     };
      24             : 
      25             :     // Extend the first and last samples to simplify the loop for the 5-tap case
      26    14016900 :     p[-1] = p[0];
      27    14016900 :     __m128i last = _mm_set1_epi8(p[sz - 1]);
      28    14016900 :     _mm_storeu_si128((__m128i *)&p[sz], last);
      29             : 
      30             :     // Adjust input pointer for filter support area
      31    14016900 :     uint8_t *in = (strength == 3) ? p - 1 : p;
      32             : 
      33             :     // Avoid modifying first sample
      34    14016900 :     uint8_t *out = p + 1;
      35    14016900 :     int32_t len = sz - 1;
      36             : 
      37    14016900 :     const int32_t use_3tap_filter = (strength < 3);
      38             : 
      39    14016900 :     if (use_3tap_filter) {
      40    13555600 :         __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
      41     6777830 :         __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
      42     6777810 :         __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
      43     6777790 :         __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
      44     6778310 :         __m128i in0 = _mm_lddqu_si128((__m128i *)in);
      45    17580500 :         while (len > 0) {
      46    10802100 :             int32_t n_out = (len < 8) ? len : 8;
      47    10802100 :             __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
      48    10802100 :             __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
      49    10802100 :             d0 = _mm_maddubs_epi16(d0, coef0);
      50    10802100 :             d1 = _mm_maddubs_epi16(d1, coef0);
      51    10802100 :             d0 = _mm_hadd_epi16(d0, d1);
      52    10802100 :             __m128i eight = _mm_set1_epi16(8);
      53    10802100 :             d0 = _mm_add_epi16(d0, eight);
      54    10802100 :             d0 = _mm_srai_epi16(d0, 4);
      55    10802100 :             d0 = _mm_packus_epi16(d0, d0);
      56    10802300 :             __m128i out0 = _mm_lddqu_si128((__m128i *)out);
      57    21604600 :             __m128i n0 = _mm_set1_epi8(n_out);
      58    10802300 :             __m128i mask = _mm_cmpgt_epi8(n0, iden);
      59    10802300 :             out0 = _mm_blendv_epi8(out0, d0, mask);
      60    10802300 :             _mm_storel_epi64((__m128i *)out, out0);
      61    10802300 :             __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
      62    10802200 :             in0 = _mm_alignr_epi8(in1, in0, 8);
      63    10802200 :             in += 8;
      64    10802200 :             out += 8;
      65    10802200 :             len -= n_out;
      66             :         }
      67             :     }
      68             :     else {  // 5-tap filter
      69    14493700 :         __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
      70     7254570 :         __m128i two = _mm_set1_epi8(2);
      71     7254920 :         __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
      72     7254920 :         __m128i shuf_b = _mm_add_epi8(shuf_a, two);
      73     7254920 :         __m128i shuf_c = _mm_add_epi8(shuf_b, two);
      74     7254920 :         __m128i shuf_d = _mm_add_epi8(shuf_c, two);
      75     7254900 :         __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
      76     7254840 :         __m128i in0 = _mm_lddqu_si128((__m128i *)in);
      77    31020800 :         while (len > 0) {
      78    23766200 :             int32_t n_out = (len < 8) ? len : 8;
      79    23766200 :             __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
      80    23766200 :             __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
      81    23766200 :             __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
      82    23766200 :             __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
      83    23766200 :             d0 = _mm_maddubs_epi16(d0, coef0);
      84    23766200 :             d1 = _mm_maddubs_epi16(d1, coef0);
      85    23766200 :             d2 = _mm_maddubs_epi16(d2, coef0);
      86    23766200 :             d3 = _mm_maddubs_epi16(d3, coef0);
      87    23766200 :             d0 = _mm_hadd_epi16(d0, d1);
      88    23766200 :             d2 = _mm_hadd_epi16(d2, d3);
      89    23766200 :             d0 = _mm_hadd_epi16(d0, d2);
      90    23766200 :             __m128i eight = _mm_set1_epi16(8);
      91    23766200 :             d0 = _mm_add_epi16(d0, eight);
      92    23766200 :             d0 = _mm_srai_epi16(d0, 4);
      93    23766200 :             d0 = _mm_packus_epi16(d0, d0);
      94    23767000 :             __m128i out0 = _mm_lddqu_si128((__m128i *)out);
      95    47533900 :             __m128i n0 = _mm_set1_epi8(n_out);
      96    23767000 :             __m128i mask = _mm_cmpgt_epi8(n0, iden);
      97    23767000 :             out0 = _mm_blendv_epi8(out0, d0, mask);
      98    23767000 :             _mm_storel_epi64((__m128i *)out, out0);
      99    23767000 :             __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
     100    23766000 :             in0 = _mm_alignr_epi8(in1, in0, 8);
     101    23766000 :             in += 8;
     102    23766000 :             out += 8;
     103    23766000 :             len -= n_out;
     104             :         }
     105             :     }
     106             : }
     107             : 
     108           0 : void eb_av1_filter_intra_edge_high_sse4_1(uint16_t *p, int32_t sz, int32_t strength) {
     109           0 :     if (!strength) return;
     110             : 
     111             :     DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
     112             :         { 4, 8, 4, 8, 4, 8, 4, 8 },  // strength 1: 4,8,4
     113             :         { 5, 6, 5, 6, 5, 6, 5, 6 },  // strength 2: 5,6,5
     114             :         { 2, 4, 2, 4, 2, 4, 2, 4 }   // strength 3: 2,4,4,4,2
     115             :     };
     116             : 
     117             :     DECLARE_ALIGNED(16, static const int16_t,
     118             :     v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
     119             : 
     120             :     // Extend the first and last samples to simplify the loop for the 5-tap case
     121           0 :     p[-1] = p[0];
     122           0 :     __m128i last = _mm_set1_epi16(p[sz - 1]);
     123           0 :     _mm_storeu_si128((__m128i *)&p[sz], last);
     124             : 
     125             :     // Adjust input pointer for filter support area
     126           0 :     uint16_t *in = (strength == 3) ? p - 1 : p;
     127             : 
     128             :     // Avoid modifying first sample
     129           0 :     uint16_t *out = p + 1;
     130           0 :     int32_t len = sz - 1;
     131             : 
     132           0 :     const int32_t use_3tap_filter = (strength < 3);
     133             : 
     134           0 :     if (use_3tap_filter) {
     135           0 :         __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
     136           0 :         __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
     137           0 :         __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
     138           0 :         __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
     139           0 :         while (len > 0) {
     140           0 :             int32_t n_out = (len < 8) ? len : 8;
     141           0 :             __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
     142           0 :             __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
     143           0 :             __m128i in02 = _mm_add_epi16(in0, in2);
     144           0 :             __m128i d0 = _mm_unpacklo_epi16(in02, in1);
     145           0 :             __m128i d1 = _mm_unpackhi_epi16(in02, in1);
     146           0 :             d0 = _mm_mullo_epi16(d0, coef0);
     147           0 :             d1 = _mm_mullo_epi16(d1, coef0);
     148           0 :             d0 = _mm_hadd_epi16(d0, d1);
     149           0 :             __m128i eight = _mm_set1_epi16(8);
     150           0 :             d0 = _mm_add_epi16(d0, eight);
     151           0 :             d0 = _mm_srli_epi16(d0, 4);
     152           0 :             __m128i out0 = _mm_lddqu_si128((__m128i *)out);
     153           0 :             __m128i n0 = _mm_set1_epi16(n_out);
     154           0 :             __m128i mask = _mm_cmpgt_epi16(n0, iden);
     155           0 :             out0 = _mm_blendv_epi8(out0, d0, mask);
     156             :             _mm_storeu_si128((__m128i *)out, out0);
     157           0 :             in += 8;
     158           0 :             in0 = in8;
     159           0 :             in8 = _mm_lddqu_si128((__m128i *)&in[8]);
     160           0 :             out += 8;
     161           0 :             len -= n_out;
     162             :         }
     163             :     }
     164             :     else {  // 5-tap filter
     165           0 :         __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
     166           0 :         __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
     167           0 :         __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
     168           0 :         __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
     169           0 :         while (len > 0) {
     170           0 :             int32_t n_out = (len < 8) ? len : 8;
     171           0 :             __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
     172           0 :             __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
     173           0 :             __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
     174           0 :             __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
     175           0 :             __m128i in04 = _mm_add_epi16(in0, in4);
     176           0 :             __m128i in123 = _mm_add_epi16(in1, in2);
     177           0 :             in123 = _mm_add_epi16(in123, in3);
     178           0 :             __m128i d0 = _mm_unpacklo_epi16(in04, in123);
     179           0 :             __m128i d1 = _mm_unpackhi_epi16(in04, in123);
     180           0 :             d0 = _mm_mullo_epi16(d0, coef0);
     181           0 :             d1 = _mm_mullo_epi16(d1, coef0);
     182           0 :             d0 = _mm_hadd_epi16(d0, d1);
     183           0 :             __m128i eight = _mm_set1_epi16(8);
     184           0 :             d0 = _mm_add_epi16(d0, eight);
     185           0 :             d0 = _mm_srli_epi16(d0, 4);
     186           0 :             __m128i out0 = _mm_lddqu_si128((__m128i *)out);
     187           0 :             __m128i n0 = _mm_set1_epi16(n_out);
     188           0 :             __m128i mask = _mm_cmpgt_epi16(n0, iden);
     189           0 :             out0 = _mm_blendv_epi8(out0, d0, mask);
     190             :             _mm_storeu_si128((__m128i *)out, out0);
     191           0 :             in += 8;
     192           0 :             in0 = in8;
     193           0 :             in8 = _mm_lddqu_si128((__m128i *)&in[8]);
     194           0 :             out += 8;
     195           0 :             len -= n_out;
     196             :         }
     197             :     }
     198             : }
     199             : 
     200     5947590 : void eb_av1_upsample_intra_edge_sse4_1(uint8_t *p, int32_t sz) {
     201             :     // interpolate half-sample positions
     202             :     assert(sz <= 24);
     203             : 
     204             :     DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
     205             :         { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
     206             :     };
     207             : 
     208             :     DECLARE_ALIGNED(16, static const int8_t, v_const[2][16]) = {
     209             :         { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
     210             :         { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
     211             :     };
     212             : 
     213             :     // Extend first/last samples (upper-left p[-1], last p[sz-1])
     214             :     // to support 4-tap filter
     215     5947590 :     p[-2] = p[-1];
     216     5947590 :     p[sz] = p[sz - 1];
     217             : 
     218     5947590 :     uint8_t *in = &p[-2];
     219     5947590 :     uint8_t *out = &p[-2];
     220             : 
     221     5947590 :     int32_t n = sz + 1;  // Input length including upper-left sample
     222             : 
     223     5949060 :     __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
     224    11898100 :     __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
     225             : 
     226     5949040 :     __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
     227     5949030 :     __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
     228     5949020 :     __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
     229             : 
     230    12744900 :     while (n > 0) {
     231     6795890 :         __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
     232     6795890 :         __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
     233     6795890 :         __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
     234     6795890 :         __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
     235     6795890 :         __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
     236     6795890 :         d0 = _mm_maddubs_epi16(d0, coef0);
     237     6795890 :         d1 = _mm_maddubs_epi16(d1, coef0);
     238     6795890 :         d2 = _mm_maddubs_epi16(d2, coef0);
     239     6795890 :         d3 = _mm_maddubs_epi16(d3, coef0);
     240     6795890 :         d0 = _mm_hadd_epi16(d0, d1);
     241     6795890 :         d2 = _mm_hadd_epi16(d2, d3);
     242     6795890 :         __m128i eight = _mm_set1_epi16(8);
     243     6795890 :         d0 = _mm_add_epi16(d0, eight);
     244     6795890 :         d2 = _mm_add_epi16(d2, eight);
     245     6795890 :         d0 = _mm_srai_epi16(d0, 4);
     246     6795890 :         d2 = _mm_srai_epi16(d2, 4);
     247     6795890 :         d0 = _mm_packus_epi16(d0, d2);
     248     6795890 :         __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
     249     6795890 :         __m128i out0 = _mm_unpacklo_epi8(in1, d0);
     250     6795890 :         __m128i out1 = _mm_unpackhi_epi8(in1, d0);
     251             :         _mm_storeu_si128((__m128i *)&out[0], out0);
     252     6795890 :         _mm_storeu_si128((__m128i *)&out[16], out1);
     253     6795890 :         in0 = in16;
     254     6795890 :         in16 = _mm_setzero_si128();
     255     6795890 :         out += 32;
     256     6795890 :         n -= 16;
     257             :     }
     258     5949020 : }

Generated by: LCOV version 1.14