LCOV - code coverage report
Current view: top level - ASM_SSE4_1 - EbPictureOperators_Intrinsic_SSE4_1.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 268 0.0 %
Date: 2019-11-25 17:38:06 Functions: 0 2 0.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
       4             : */
       5             : 
       6             : #include "EbPictureOperators_SSE4_1.h"
       7             : #include "smmintrin.h"
       8             : 
       9           0 : uint64_t compute8x8_satd_sse4(
      10             :     int16_t *diff)       // input parameter, diff samples Ptr
      11             : {
      12           0 :     uint64_t satdBlock8x8 = 0;
      13             :     int16_t m2[8][8];
      14             : 
      15             :     uint32_t j, jj;
      16             :     __m128i s0, s1, s2, s3, s4, s5, s6, s7, s9, s10, s11, s12;
      17           0 :     __m128i s8 = _mm_setzero_si128();
      18             :     __m128i sum01Neg, sum01Pos, sum23Neg, sum23Pos, sum45Neg, sum45Pos, sum67Neg, sum67Pos;
      19             :     __m128i sum0to3Pos, sum4to7Pos, sum0to3Neg, sum4to7Neg, diff0to3Pos, diff4to7Pos, diff0to3Neg, diff4to7Neg;
      20             :     __m128i sum0, sum1, difference0, difference1;
      21             : 
      22           0 :     for (j = 0; j < 8; j += 2)
      23             :     {
      24           0 :         jj = j << 3;
      25           0 :         s0 = _mm_loadu_si128((__m128i*)(diff + jj));
      26           0 :         s10 = _mm_loadu_si128((__m128i*)(diff + 8 + jj));
      27             : 
      28           0 :         sum0 = _mm_hadd_epi16(s0, s8);
      29           0 :         sum1 = _mm_hadd_epi16(s10, s8);
      30             : 
      31           0 :         difference0 = _mm_hsub_epi16(s0, s8);
      32           0 :         difference1 = _mm_hsub_epi16(s10, s8);
      33             : 
      34             :         // m2[j][0]
      35             :         // diff[jj] + diff[jj + 4] + diff[jj + 2] + diff[jj + 6] + diff[jj + 1] + diff[jj + 5] + diff[jj + 3] + diff[jj + 7]
      36             :         // diff[jj] + diff[jj + 1] + diff[jj + 2] + diff[jj + 3] + diff[jj + 4] + diff[jj + 5] + diff[jj + 6] + diff[jj + 7]
      37           0 :         s1 = _mm_hadd_epi16(sum0, sum1);
      38           0 :         s1 = _mm_hadd_epi16(s1, s8);
      39           0 :         m2[j][0] = _mm_extract_epi16(s1, 0);
      40           0 :         m2[j + 1][0] = _mm_extract_epi16(s1, 2);
      41             : 
      42             :         //m2[j][1]
      43             :         //diff[jj] + diff[jj + 4] + diff[jj + 2] + diff[jj + 6] - diff[jj + 1] - diff[jj + 5] - diff[jj + 3] - diff[jj + 7]
      44             :         //diff[jj] - diff[jj + 1] + diff[jj + 2] - diff[jj + 3] + diff[jj + 4] - diff[jj + 5] + diff[jj + 6] - diff[jj + 7]
      45             :         //(diff[jj] - diff[jj + 1]) + (diff[jj + 2] - diff[jj + 3]) + (diff[jj + 4] - diff[jj + 5]) + (diff[jj + 6] - diff[jj + 7])
      46           0 :         s1 = _mm_hadd_epi16(difference0, difference1);
      47           0 :         s1 = _mm_hadd_epi16(s1, s8);
      48           0 :         m2[j][1] = _mm_extract_epi16(s1, 0);
      49           0 :         m2[j + 1][1] = _mm_extract_epi16(s1, 2);
      50             : 
      51             :         //m2[j][2]
      52             :         //diff[jj] + diff[jj + 4] - diff[jj + 2] - diff[jj + 6] + diff[jj + 1] + diff[jj + 5] - diff[jj + 3] - diff[jj + 7]
      53             :         //diff[jj] + diff[jj + 1] - diff[jj + 2] - diff[jj + 3] + diff[jj + 4] + diff[jj + 5] - diff[jj + 6] - diff[jj + 7]
      54           0 :         s1 = _mm_hsub_epi16(sum0, sum1);
      55           0 :         s1 = _mm_hadd_epi16(s1, s8);
      56           0 :         m2[j][2] = _mm_extract_epi16(s1, 0);
      57           0 :         m2[j + 1][2] = _mm_extract_epi16(s1, 2);
      58             : 
      59             :         //m2[j][3]
      60             :         //diff[jj] + diff[jj + 4] - diff[jj + 2] - diff[jj + 6] - diff[jj + 1] - diff[jj + 5] + diff[jj + 3] + diff[jj + 7]
      61             :         //diff[jj] - diff[jj + 1] - diff[jj + 2] + diff[jj + 3] + diff[jj + 4] - diff[jj + 5] - diff[jj + 6] + diff[jj + 7]
      62             :         //diff[jj] - diff[jj + 1] - diff[jj + 2] + diff[jj + 3] + diff[jj + 4] - diff[jj + 5] - diff[jj + 6] + diff[jj + 7]
      63           0 :         s1 = _mm_hsub_epi16(difference0, difference1);
      64           0 :         s1 = _mm_hadd_epi16(s1, s8);
      65           0 :         m2[j][3] = _mm_extract_epi16(s1, 0);
      66           0 :         m2[j + 1][3] = _mm_extract_epi16(s1, 2);
      67             : 
      68             :         //m2[j][4]
      69             :         //diff[jj] - diff[jj + 4] + diff[jj + 2] - diff[jj + 6] + diff[jj + 1] - diff[jj + 5] + diff[jj + 3] - diff[jj + 7]
      70             :         //diff[jj] + diff[jj + 1] + diff[jj + 2] + diff[jj + 3] - diff[jj + 4] - diff[jj + 5] - diff[jj + 6] - diff[jj + 7]
      71           0 :         s1 = _mm_hadd_epi16(sum0, sum1);
      72           0 :         s1 = _mm_hsub_epi16(s1, s8);
      73           0 :         m2[j][4] = _mm_extract_epi16(s1, 0);
      74           0 :         m2[j + 1][4] = _mm_extract_epi16(s1, 2);
      75             : 
      76             :         //m2[j][5]
      77             :         //m1[j][4] - m1[j][5]
      78             :         //diff[jj] - diff[jj + 1] + diff[jj + 2] - diff[jj + 3] - diff[jj + 4]  + diff[jj + 5] - diff[jj + 6] + diff[jj + 7]
      79           0 :         s1 = _mm_hadd_epi16(difference0, difference1);
      80           0 :         s1 = _mm_hsub_epi16(s1, s8);
      81           0 :         m2[j][5] = _mm_extract_epi16(s1, 0);
      82           0 :         m2[j + 1][5] = _mm_extract_epi16(s1, 2);
      83             : 
      84             :         //m2[j][6]
      85             :         //diff[jj] - diff[jj + 4] - diff[jj + 2] + diff[jj + 6] + diff[jj + 1] - diff[jj + 5] - diff[jj + 3] + diff[jj + 7]
      86             :         //diff[jj] + diff[jj + 1] - diff[jj + 2] - diff[jj + 3] - diff[jj + 4] - diff[jj + 5] + diff[jj + 6] + diff[jj + 7]
      87             : 
      88           0 :         s1 = _mm_hsub_epi16(sum0, sum1);
      89           0 :         s1 = _mm_hsub_epi16(s1, s8);
      90           0 :         m2[j][6] = _mm_extract_epi16(s1, 0);
      91           0 :         m2[j + 1][6] = _mm_extract_epi16(s1, 2);
      92             : 
      93             :         //m2[j][7]
      94             :         //diff[jj] - diff[jj + 4] - diff[jj + 2] + diff[jj + 6] - diff[jj + 1] + diff[jj + 5] + diff[jj + 3] - diff[jj + 7]
      95             :         //diff[jj] - diff[jj + 1] - diff[jj + 2] + diff[jj + 3] - diff[jj + 4] + diff[jj + 5] + diff[jj + 6] - diff[jj + 7]
      96           0 :         s1 = _mm_hsub_epi16(difference0, difference1);
      97           0 :         s1 = _mm_hsub_epi16(s1, s8);
      98           0 :         m2[j][7] = _mm_extract_epi16(s1, 0);
      99           0 :         m2[j + 1][7] = _mm_extract_epi16(s1, 2);
     100             :     }
     101             : 
     102             :     // Vertical
     103           0 :     s0 = _mm_loadu_si128((__m128i*)(m2[0]));
     104           0 :     s1 = _mm_loadu_si128((__m128i*)(m2[1]));
     105           0 :     s2 = _mm_loadu_si128((__m128i*)(m2[2]));
     106           0 :     s3 = _mm_loadu_si128((__m128i*)(m2[3]));
     107           0 :     s4 = _mm_loadu_si128((__m128i*)(m2[4]));
     108           0 :     s5 = _mm_loadu_si128((__m128i*)(m2[5]));
     109           0 :     s6 = _mm_loadu_si128((__m128i*)(m2[6]));
     110           0 :     s7 = _mm_loadu_si128((__m128i*)(m2[7]));
     111             : 
     112           0 :     sum01Pos = _mm_add_epi16(s0, s1);
     113           0 :     sum23Pos = _mm_add_epi16(s2, s3);
     114           0 :     sum45Pos = _mm_add_epi16(s4, s5);
     115           0 :     sum67Pos = _mm_add_epi16(s6, s7);
     116             : 
     117           0 :     sum01Neg = _mm_sub_epi16(s0, s1);
     118           0 :     sum23Neg = _mm_sub_epi16(s2, s3);
     119           0 :     sum45Neg = _mm_sub_epi16(s4, s5);
     120           0 :     sum67Neg = _mm_sub_epi16(s6, s7);
     121             : 
     122           0 :     sum0to3Pos = _mm_add_epi16(sum01Pos, sum23Pos);
     123           0 :     sum4to7Pos = _mm_add_epi16(sum45Pos, sum67Pos);
     124           0 :     diff0to3Pos = _mm_sub_epi16(sum01Pos, sum23Pos);
     125           0 :     diff4to7Pos = _mm_sub_epi16(sum45Pos, sum67Pos);
     126             : 
     127           0 :     sum0to3Neg = _mm_add_epi16(sum01Neg, sum23Neg);
     128           0 :     sum4to7Neg = _mm_add_epi16(sum45Neg, sum67Neg);
     129           0 :     diff0to3Neg = _mm_sub_epi16(sum01Neg, sum23Neg);
     130           0 :     diff4to7Neg = _mm_sub_epi16(sum45Neg, sum67Neg);
     131             : 
     132             :     //m2[0][i] = m1[0][i] + m1[1][i]
     133             :     //m2[0][i] = m3[0][i] + m3[2][i] + m3[1][i] + m3[3][i]
     134             :     //m2[0][i] = m2[0][i] + m2[4][i] + m2[2][i] + m2[6][i] + m2[1][i] + m2[5][i] + m2[3][i] + m2[7][i]
     135             :     //m2[0][i] = m2[0][i] + m2[1][i] + m2[2][i] + m2[3][i] + m2[4][i] + m2[5][i] + m2[6][i] + m2[7][i]
     136           0 :     s9 = _mm_add_epi16(sum0to3Pos, sum4to7Pos);
     137           0 :     s9 = _mm_abs_epi16(s9);
     138           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     139           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     140           0 :     s10 = _mm_add_epi32(s10, s11);
     141           0 :     s10 = _mm_hadd_epi32(s10, s8);
     142           0 :     s10 = _mm_hadd_epi32(s10, s8);
     143             : 
     144             :     //m2[1][i] = m1[0][i] - m1[1][i]
     145             :     //m2[1][i] = m3[0][i] + m3[2][i] -(m3[1][i] + m3[3][i])
     146             :     //m2[1][i] = m2[0][i] + m2[4][i] + m2[2][i] + m2[6][i] -(m2[1][i] + m2[5][i] + m2[3][i] + m2[7][i])
     147             :     //m2[1][i] = m2[0][i] - m2[1][i] + m2[2][i] - m2[3][i] + m2[4][i] - m2[5][i] + m2[6][i] - m2[7][i]
     148           0 :     s9 = _mm_add_epi16(sum0to3Neg, sum4to7Neg);
     149           0 :     s9 = _mm_abs_epi16(s9);
     150           0 :     s12 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     151           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     152           0 :     s12 = _mm_add_epi32(s12, s11);
     153           0 :     s12 = _mm_hadd_epi32(s12, s8);
     154           0 :     s12 = _mm_hadd_epi32(s12, s8);
     155           0 :     s12 = _mm_add_epi32(s10, s12);
     156             : 
     157             :     //m2[2][i] = m1[2][i] + m1[3][i]
     158             :     //m2[2][i] = m3[0][i] - m3[2][i] + m3[1][i] - m3[3][i]
     159             :     //m2[2][i] = m2[0][i] + m2[4][i] - (m2[2][i] + m2[6][i]) + m2[1][i] + m2[5][i] - (m2[3][i] + m2[7][i])
     160             :     //m2[2][i] = m2[0][i] + m2[1][i] - m2[2][i] - m2[3][i] + m2[4][i] + m2[5][i] - m2[6][i] - m2[7][i]
     161             :     //m2[2][i] = m2[0][i] + m2[1][i] - (m2[2][i] + m2[3][i]) + m2[4][i] + m2[5][i] - (m2[6][i] + m2[7][i])
     162           0 :     s9 = _mm_add_epi16(diff0to3Pos, diff4to7Pos);
     163           0 :     s9 = _mm_abs_epi16(s9);
     164           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     165           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     166           0 :     s10 = _mm_add_epi32(s10, s11);
     167           0 :     s10 = _mm_hadd_epi32(s10, s8);
     168           0 :     s10 = _mm_hadd_epi32(s10, s8);
     169           0 :     s12 = _mm_add_epi32(s10, s12);
     170             : 
     171             :     //m2[3][i] = m1[2][i] - m1[3][i]
     172             :     //m2[3][i] = m3[0][i] - m3[2][i] - (m3[1][i] - m3[3][i])
     173             :     //m2[3][i] = m2[0][i] + m2[4][i] - (m2[2][i] + m2[6][i]) - (m2[1][i] + m2[5][i] - m2[3][i] - m2[7][i])
     174             :     //m2[3][i] = m2[0][i] - m2[1][i] - m2[2][i] + m2[3][i] + m2[4][i] - m2[5][i] - m2[6][i] + m2[7][i]
     175             :     //m2[3][i] = m2[0][i] - m2[1][i] - (m2[2][i] - m2[3][i]) + (m2[4][i] - m2[5][i]) - (m2[6][i] - m2[7][i])
     176           0 :     s9 = _mm_add_epi16(diff0to3Neg, diff4to7Neg);
     177           0 :     s9 = _mm_abs_epi16(s9);
     178           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     179           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     180           0 :     s10 = _mm_add_epi32(s10, s11);
     181           0 :     s10 = _mm_hadd_epi32(s10, s8);
     182           0 :     s10 = _mm_hadd_epi32(s10, s8);
     183           0 :     s12 = _mm_add_epi32(s10, s12);
     184             : 
     185             :     //m2[4][i] = m1[4][i] + m1[5][i]
     186             :     //m2[4][i] = m3[4][i] + m3[6][i] + m3[5][i] + m3[7][i]
     187             :     //m2[4][i] = m2[0][i] - m2[4][i] + m2[2][i] - m2[6][i] + m2[1][i] - m2[5][i] + m2[3][i] - m2[7][i]
     188             :     //m2[4][i] = m2[0][i] + m2[1][i] + m2[2][i] + m2[3][i] - m2[4][i] - m2[5][i] - m2[6][i] - m2[7][i]
     189             :     //m2[4][i] = m2[0][i] + m2[1][i] + m2[2][i] + m2[3][i] - ( (m2[4][i] + m2[5][i]) + (m2[6][i] + m2[7][i]) )
     190           0 :     s9 = _mm_sub_epi16(sum0to3Pos, sum4to7Pos);
     191           0 :     s9 = _mm_abs_epi16(s9);
     192           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     193           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     194           0 :     s10 = _mm_add_epi32(s10, s11);
     195           0 :     s10 = _mm_hadd_epi32(s10, s8);
     196           0 :     s10 = _mm_hadd_epi32(s10, s8);
     197           0 :     s12 = _mm_add_epi32(s10, s12);
     198             : 
     199             :     //m2[5][i] = m1[4][i] - m1[5][i]
     200             :     //m2[5][i] = m3[4][i] + m3[6][i] - (m3[5][i] + m3[7][i])
     201             :     //m2[5][i] = m2[0][i] - m2[4][i] + m2[2][i] - m2[6][i] - (m2[1][i] - m2[5][i] + m2[3][i] - m2[7][i])
     202             :     //m2[5][i] = m2[0][i] - m2[1][i] + m2[2][i] - m2[3][i] - m2[4][i] + m2[5][i] - m2[6][i] + m2[7][i]
     203             :     //m2[5][i] = m2[0][i] - m2[1][i] + (m2[2][i] - m2[3][i]) - ( (m2[4][i] - m2[5][i]) + (m2[6][i] - m2[7][i]) )
     204           0 :     s9 = _mm_sub_epi16(sum0to3Neg, sum4to7Neg);
     205           0 :     s9 = _mm_abs_epi16(s9);
     206           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     207           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     208           0 :     s10 = _mm_add_epi32(s10, s11);
     209           0 :     s10 = _mm_hadd_epi32(s10, s8);
     210           0 :     s10 = _mm_hadd_epi32(s10, s8);
     211           0 :     s12 = _mm_add_epi32(s10, s12);
     212             : 
     213             :     //m2[6][i] = m1[6][i] + m1[7][i]
     214             :     //m2[6][i] = m3[4][i] - m3[6][i] + m3[5][i] - m3[7][i]
     215             :     //m2[6][i] = m2[0][i] - m2[4][i] - (m2[2][i] - m2[6][i]) + m2[1][i] - m2[5][i] - (m2[3][i] - m2[7][i])
     216             :     //m2[6][i] = m2[0][i] + m2[1][i] - m2[2][i] - m2[3][i] - m2[4][i] - m2[5][i] + m2[6][i] + m2[7][i]
     217             :     //m2[6][i] = (m2[0][i] + m2[1][i]) - (m2[2][i] + m2[3][i]) - ( (m2[4][i] + m2[5][i]) - (m2[6][i] + m2[7][i]) )
     218           0 :     s9 = _mm_sub_epi16(diff0to3Pos, diff4to7Pos);
     219           0 :     s9 = _mm_abs_epi16(s9);
     220           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     221           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     222           0 :     s10 = _mm_add_epi32(s10, s11);
     223           0 :     s10 = _mm_hadd_epi32(s10, s8);
     224           0 :     s10 = _mm_hadd_epi32(s10, s8);
     225           0 :     s12 = _mm_add_epi32(s10, s12);
     226             : 
     227             :     //m2[7][i] = m1[6][i] - m1[7][i]
     228             :     //m2[7][i] = m3[4][i] - m3[6][i] - (m3[5][i] - m3[7][i])
     229             :     //m2[7][i] = m2[0][i] - m2[4][i] - (m2[2][i] - m2[6][i]) - ((m2[1][i] - m2[5][i]) - (m2[3][i] - m2[7][i]))
     230             :     //m2[7][i] = (m2[0][i] - m2[1][i]) - (m2[2][i] - m2[3][i]) - ( (m2[4][i] - m2[5][i]) - (m2[6][i] - m2[7][i]) )
     231           0 :     s9 = _mm_sub_epi16(diff0to3Neg, diff4to7Neg);
     232           0 :     s9 = _mm_abs_epi16(s9);
     233           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     234           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     235           0 :     s10 = _mm_add_epi32(s10, s11);
     236           0 :     s10 = _mm_hadd_epi32(s10, s8);
     237           0 :     s10 = _mm_hadd_epi32(s10, s8);
     238           0 :     s12 = _mm_add_epi32(s10, s12);
     239             : 
     240           0 :     satdBlock8x8 = (uint64_t)_mm_extract_epi32(s12, 0);
     241             : 
     242           0 :     satdBlock8x8 = ((satdBlock8x8 + 2) >> 2);
     243             : 
     244           0 :     return satdBlock8x8;
     245             : }
     246             : 
     247           0 : uint64_t compute8x8_satd_u8_sse4(
     248             :     uint8_t  *src,       // input parameter, diff samples Ptr
     249             :     uint64_t *dc_value,
     250             :     uint32_t  src_stride)
     251             : {
     252           0 :     uint64_t satdBlock8x8 = 0;
     253             :     int16_t m2[8][8];
     254             : 
     255             :     uint32_t j;
     256             :     __m128i s0, s1, s2, s3, s4, s5, s6, s7, s9, s10, s11, s12;
     257           0 :     __m128i s8 = _mm_setzero_si128();
     258             :     __m128i sum01Neg, sum01Pos, sum23Neg, sum23Pos, sum45Neg, sum45Pos, sum67Neg, sum67Pos;
     259             :     __m128i sum0to3Pos, sum4to7Pos, sum0to3Neg, sum4to7Neg, diff0to3Pos, diff4to7Pos, diff0to3Neg, diff4to7Neg;
     260             :     __m128i sum0, sum1, difference0, difference1;
     261             : 
     262           0 :     for (j = 0; j < 8; j += 2)
     263             :     {
     264           0 :         s0 = _mm_loadl_epi64((__m128i*)(src + (j *src_stride)));
     265           0 :         s10 = _mm_loadl_epi64((__m128i*)(src + ((j + 1) *src_stride)));
     266           0 :         s10 = _mm_unpacklo_epi8(s10, _mm_setzero_si128());
     267           0 :         s0 = _mm_unpacklo_epi8(s0, _mm_setzero_si128());
     268             : 
     269           0 :         sum0 = _mm_hadd_epi16(s0, s8);
     270           0 :         sum1 = _mm_hadd_epi16(s10, s8);
     271             : 
     272           0 :         difference0 = _mm_hsub_epi16(s0, s8);
     273           0 :         difference1 = _mm_hsub_epi16(s10, s8);
     274             : 
     275             :         // m2[j][0]
     276             :         // diff[jj] + diff[jj + 4] + diff[jj + 2] + diff[jj + 6] + diff[jj + 1] + diff[jj + 5] + diff[jj + 3] + diff[jj + 7]
     277             :         // diff[jj] + diff[jj + 1] + diff[jj + 2] + diff[jj + 3] + diff[jj + 4] + diff[jj + 5] + diff[jj + 6] + diff[jj + 7]
     278           0 :         s1 = _mm_hadd_epi16(sum0, sum1);
     279           0 :         s1 = _mm_hadd_epi16(s1, s8);
     280           0 :         m2[j][0] = _mm_extract_epi16(s1, 0);
     281           0 :         m2[j + 1][0] = _mm_extract_epi16(s1, 2);
     282             : 
     283             :         //m2[j][1]
     284             :         //diff[jj] + diff[jj + 4] + diff[jj + 2] + diff[jj + 6] - diff[jj + 1] - diff[jj + 5] - diff[jj + 3] - diff[jj + 7]
     285             :         //diff[jj] - diff[jj + 1] + diff[jj + 2] - diff[jj + 3] + diff[jj + 4] - diff[jj + 5] + diff[jj + 6] - diff[jj + 7]
     286             :         //(diff[jj] - diff[jj + 1]) + (diff[jj + 2] - diff[jj + 3]) + (diff[jj + 4] - diff[jj + 5]) + (diff[jj + 6] - diff[jj + 7])
     287           0 :         s1 = _mm_hadd_epi16(difference0, difference1);
     288           0 :         s1 = _mm_hadd_epi16(s1, s8);
     289           0 :         m2[j][1] = _mm_extract_epi16(s1, 0);
     290           0 :         m2[j + 1][1] = _mm_extract_epi16(s1, 2);
     291             : 
     292             :         //m2[j][2]
     293             :         //diff[jj] + diff[jj + 4] - diff[jj + 2] - diff[jj + 6] + diff[jj + 1] + diff[jj + 5] - diff[jj + 3] - diff[jj + 7]
     294             :         //diff[jj] + diff[jj + 1] - diff[jj + 2] - diff[jj + 3] + diff[jj + 4] + diff[jj + 5] - diff[jj + 6] - diff[jj + 7]
     295           0 :         s1 = _mm_hsub_epi16(sum0, sum1);
     296           0 :         s1 = _mm_hadd_epi16(s1, s8);
     297           0 :         m2[j][2] = _mm_extract_epi16(s1, 0);
     298           0 :         m2[j + 1][2] = _mm_extract_epi16(s1, 2);
     299             : 
     300             :         //m2[j][3]
     301             :         //diff[jj] + diff[jj + 4] - diff[jj + 2] - diff[jj + 6] - diff[jj + 1] - diff[jj + 5] + diff[jj + 3] + diff[jj + 7]
     302             :         //diff[jj] - diff[jj + 1] - diff[jj + 2] + diff[jj + 3] + diff[jj + 4] - diff[jj + 5] - diff[jj + 6] + diff[jj + 7]
     303             :         //diff[jj] - diff[jj + 1] - diff[jj + 2] + diff[jj + 3] + diff[jj + 4] - diff[jj + 5] - diff[jj + 6] + diff[jj + 7]
     304           0 :         s1 = _mm_hsub_epi16(difference0, difference1);
     305           0 :         s1 = _mm_hadd_epi16(s1, s8);
     306           0 :         m2[j][3] = _mm_extract_epi16(s1, 0);
     307           0 :         m2[j + 1][3] = _mm_extract_epi16(s1, 2);
     308             : 
     309             :         //m2[j][4]
     310             :         //diff[jj] - diff[jj + 4] + diff[jj + 2] - diff[jj + 6] + diff[jj + 1] - diff[jj + 5] + diff[jj + 3] - diff[jj + 7]
     311             :         //diff[jj] + diff[jj + 1] + diff[jj + 2] + diff[jj + 3] - diff[jj + 4] - diff[jj + 5] - diff[jj + 6] - diff[jj + 7]
     312           0 :         s1 = _mm_hadd_epi16(sum0, sum1);
     313           0 :         s1 = _mm_hsub_epi16(s1, s8);
     314           0 :         m2[j][4] = _mm_extract_epi16(s1, 0);
     315           0 :         m2[j + 1][4] = _mm_extract_epi16(s1, 2);
     316             : 
     317             :         //m2[j][5]
     318             :         //m1[j][4] - m1[j][5]
     319             :         //diff[jj] - diff[jj + 1] + diff[jj + 2] - diff[jj + 3] - diff[jj + 4]  + diff[jj + 5] - diff[jj + 6] + diff[jj + 7]
     320           0 :         s1 = _mm_hadd_epi16(difference0, difference1);
     321           0 :         s1 = _mm_hsub_epi16(s1, s8);
     322           0 :         m2[j][5] = _mm_extract_epi16(s1, 0);
     323           0 :         m2[j + 1][5] = _mm_extract_epi16(s1, 2);
     324             : 
     325             :         //m2[j][6]
     326             :         //diff[jj] - diff[jj + 4] - diff[jj + 2] + diff[jj + 6] + diff[jj + 1] - diff[jj + 5] - diff[jj + 3] + diff[jj + 7]
     327             :         //diff[jj] + diff[jj + 1] - diff[jj + 2] - diff[jj + 3] - diff[jj + 4] - diff[jj + 5] + diff[jj + 6] + diff[jj + 7]
     328             : 
     329           0 :         s1 = _mm_hsub_epi16(sum0, sum1);
     330           0 :         s1 = _mm_hsub_epi16(s1, s8);
     331           0 :         m2[j][6] = _mm_extract_epi16(s1, 0);
     332           0 :         m2[j + 1][6] = _mm_extract_epi16(s1, 2);
     333             : 
     334             :         //m2[j][7]
     335             :         //diff[jj] - diff[jj + 4] - diff[jj + 2] + diff[jj + 6] - diff[jj + 1] + diff[jj + 5] + diff[jj + 3] - diff[jj + 7]
     336             :         //diff[jj] - diff[jj + 1] - diff[jj + 2] + diff[jj + 3] - diff[jj + 4] + diff[jj + 5] + diff[jj + 6] - diff[jj + 7]
     337           0 :         s1 = _mm_hsub_epi16(difference0, difference1);
     338           0 :         s1 = _mm_hsub_epi16(s1, s8);
     339           0 :         m2[j][7] = _mm_extract_epi16(s1, 0);
     340           0 :         m2[j + 1][7] = _mm_extract_epi16(s1, 2);
     341             :     }
     342             : 
     343             :     // Vertical
     344           0 :     s0 = _mm_loadu_si128((__m128i*)(m2[0]));
     345           0 :     s1 = _mm_loadu_si128((__m128i*)(m2[1]));
     346           0 :     s2 = _mm_loadu_si128((__m128i*)(m2[2]));
     347           0 :     s3 = _mm_loadu_si128((__m128i*)(m2[3]));
     348           0 :     s4 = _mm_loadu_si128((__m128i*)(m2[4]));
     349           0 :     s5 = _mm_loadu_si128((__m128i*)(m2[5]));
     350           0 :     s6 = _mm_loadu_si128((__m128i*)(m2[6]));
     351           0 :     s7 = _mm_loadu_si128((__m128i*)(m2[7]));
     352             : 
     353           0 :     sum01Pos = _mm_add_epi16(s0, s1);
     354           0 :     sum23Pos = _mm_add_epi16(s2, s3);
     355           0 :     sum45Pos = _mm_add_epi16(s4, s5);
     356           0 :     sum67Pos = _mm_add_epi16(s6, s7);
     357             : 
     358           0 :     sum01Neg = _mm_sub_epi16(s0, s1);
     359           0 :     sum23Neg = _mm_sub_epi16(s2, s3);
     360           0 :     sum45Neg = _mm_sub_epi16(s4, s5);
     361           0 :     sum67Neg = _mm_sub_epi16(s6, s7);
     362             : 
     363           0 :     sum0to3Pos = _mm_add_epi16(sum01Pos, sum23Pos);
     364           0 :     sum4to7Pos = _mm_add_epi16(sum45Pos, sum67Pos);
     365           0 :     diff0to3Pos = _mm_sub_epi16(sum01Pos, sum23Pos);
     366           0 :     diff4to7Pos = _mm_sub_epi16(sum45Pos, sum67Pos);
     367             : 
     368           0 :     sum0to3Neg = _mm_add_epi16(sum01Neg, sum23Neg);
     369           0 :     sum4to7Neg = _mm_add_epi16(sum45Neg, sum67Neg);
     370           0 :     diff0to3Neg = _mm_sub_epi16(sum01Neg, sum23Neg);
     371           0 :     diff4to7Neg = _mm_sub_epi16(sum45Neg, sum67Neg);
     372             : 
     373             :     //m2[0][i] = m1[0][i] + m1[1][i]
     374             :     //m2[0][i] = m3[0][i] + m3[2][i] + m3[1][i] + m3[3][i]
     375             :     //m2[0][i] = m2[0][i] + m2[4][i] + m2[2][i] + m2[6][i] + m2[1][i] + m2[5][i] + m2[3][i] + m2[7][i]
     376             :     //m2[0][i] = m2[0][i] + m2[1][i] + m2[2][i] + m2[3][i] + m2[4][i] + m2[5][i] + m2[6][i] + m2[7][i]
     377           0 :     s9 = _mm_add_epi16(sum0to3Pos, sum4to7Pos);
     378           0 :     s9 = _mm_abs_epi16(s9);
     379           0 :     *dc_value += _mm_extract_epi16(s9, 0);
     380             : 
     381           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     382           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     383           0 :     s10 = _mm_add_epi32(s10, s11);
     384           0 :     s10 = _mm_hadd_epi32(s10, s8);
     385           0 :     s10 = _mm_hadd_epi32(s10, s8);
     386             : 
     387             :     //m2[1][i] = m1[0][i] - m1[1][i]
     388             :     //m2[1][i] = m3[0][i] + m3[2][i] -(m3[1][i] + m3[3][i])
     389             :     //m2[1][i] = m2[0][i] + m2[4][i] + m2[2][i] + m2[6][i] -(m2[1][i] + m2[5][i] + m2[3][i] + m2[7][i])
     390             :     //m2[1][i] = m2[0][i] - m2[1][i] + m2[2][i] - m2[3][i] + m2[4][i] - m2[5][i] + m2[6][i] - m2[7][i]
     391           0 :     s9 = _mm_add_epi16(sum0to3Neg, sum4to7Neg);
     392           0 :     s9 = _mm_abs_epi16(s9);
     393           0 :     s12 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     394           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     395           0 :     s12 = _mm_add_epi32(s12, s11);
     396           0 :     s12 = _mm_hadd_epi32(s12, s8);
     397           0 :     s12 = _mm_hadd_epi32(s12, s8);
     398           0 :     s12 = _mm_add_epi32(s10, s12);
     399             : 
     400             :     //m2[2][i] = m1[2][i] + m1[3][i]
     401             :     //m2[2][i] = m3[0][i] - m3[2][i] + m3[1][i] - m3[3][i]
     402             :     //m2[2][i] = m2[0][i] + m2[4][i] - (m2[2][i] + m2[6][i]) + m2[1][i] + m2[5][i] - (m2[3][i] + m2[7][i])
     403             :     //m2[2][i] = m2[0][i] + m2[1][i] - m2[2][i] - m2[3][i] + m2[4][i] + m2[5][i] - m2[6][i] - m2[7][i]
     404             :     //m2[2][i] = m2[0][i] + m2[1][i] - (m2[2][i] + m2[3][i]) + m2[4][i] + m2[5][i] - (m2[6][i] + m2[7][i])
     405           0 :     s9 = _mm_add_epi16(diff0to3Pos, diff4to7Pos);
     406           0 :     s9 = _mm_abs_epi16(s9);
     407           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     408           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     409           0 :     s10 = _mm_add_epi32(s10, s11);
     410           0 :     s10 = _mm_hadd_epi32(s10, s8);
     411           0 :     s10 = _mm_hadd_epi32(s10, s8);
     412           0 :     s12 = _mm_add_epi32(s10, s12);
     413             : 
     414             :     //m2[3][i] = m1[2][i] - m1[3][i]
     415             :     //m2[3][i] = m3[0][i] - m3[2][i] - (m3[1][i] - m3[3][i])
     416             :     //m2[3][i] = m2[0][i] + m2[4][i] - (m2[2][i] + m2[6][i]) - (m2[1][i] + m2[5][i] - m2[3][i] - m2[7][i])
     417             :     //m2[3][i] = m2[0][i] - m2[1][i] - m2[2][i] + m2[3][i] + m2[4][i] - m2[5][i] - m2[6][i] + m2[7][i]
     418             :     //m2[3][i] = m2[0][i] - m2[1][i] - (m2[2][i] - m2[3][i]) + (m2[4][i] - m2[5][i]) - (m2[6][i] - m2[7][i])
     419           0 :     s9 = _mm_add_epi16(diff0to3Neg, diff4to7Neg);
     420           0 :     s9 = _mm_abs_epi16(s9);
     421           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     422           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     423           0 :     s10 = _mm_add_epi32(s10, s11);
     424           0 :     s10 = _mm_hadd_epi32(s10, s8);
     425           0 :     s10 = _mm_hadd_epi32(s10, s8);
     426           0 :     s12 = _mm_add_epi32(s10, s12);
     427             : 
     428             :     //m2[4][i] = m1[4][i] + m1[5][i]
     429             :     //m2[4][i] = m3[4][i] + m3[6][i] + m3[5][i] + m3[7][i]
     430             :     //m2[4][i] = m2[0][i] - m2[4][i] + m2[2][i] - m2[6][i] + m2[1][i] - m2[5][i] + m2[3][i] - m2[7][i]
     431             :     //m2[4][i] = m2[0][i] + m2[1][i] + m2[2][i] + m2[3][i] - m2[4][i] - m2[5][i] - m2[6][i] - m2[7][i]
     432             :     //m2[4][i] = m2[0][i] + m2[1][i] + m2[2][i] + m2[3][i] - ( (m2[4][i] + m2[5][i]) + (m2[6][i] + m2[7][i]) )
     433           0 :     s9 = _mm_sub_epi16(sum0to3Pos, sum4to7Pos);
     434           0 :     s9 = _mm_abs_epi16(s9);
     435           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     436           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     437           0 :     s10 = _mm_add_epi32(s10, s11);
     438           0 :     s10 = _mm_hadd_epi32(s10, s8);
     439           0 :     s10 = _mm_hadd_epi32(s10, s8);
     440           0 :     s12 = _mm_add_epi32(s10, s12);
     441             : 
     442             :     //m2[5][i] = m1[4][i] - m1[5][i]
     443             :     //m2[5][i] = m3[4][i] + m3[6][i] - (m3[5][i] + m3[7][i])
     444             :     //m2[5][i] = m2[0][i] - m2[4][i] + m2[2][i] - m2[6][i] - (m2[1][i] - m2[5][i] + m2[3][i] - m2[7][i])
     445             :     //m2[5][i] = m2[0][i] - m2[1][i] + m2[2][i] - m2[3][i] - m2[4][i] + m2[5][i] - m2[6][i] + m2[7][i]
     446             :     //m2[5][i] = m2[0][i] - m2[1][i] + (m2[2][i] - m2[3][i]) - ( (m2[4][i] - m2[5][i]) + (m2[6][i] - m2[7][i]) )
     447           0 :     s9 = _mm_sub_epi16(sum0to3Neg, sum4to7Neg);
     448           0 :     s9 = _mm_abs_epi16(s9);
     449           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     450           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     451           0 :     s10 = _mm_add_epi32(s10, s11);
     452           0 :     s10 = _mm_hadd_epi32(s10, s8);
     453           0 :     s10 = _mm_hadd_epi32(s10, s8);
     454           0 :     s12 = _mm_add_epi32(s10, s12);
     455             : 
     456             :     //m2[6][i] = m1[6][i] + m1[7][i]
     457             :     //m2[6][i] = m3[4][i] - m3[6][i] + m3[5][i] - m3[7][i]
     458             :     //m2[6][i] = m2[0][i] - m2[4][i] - (m2[2][i] - m2[6][i]) + m2[1][i] - m2[5][i] - (m2[3][i] - m2[7][i])
     459             :     //m2[6][i] = m2[0][i] + m2[1][i] - m2[2][i] - m2[3][i] - m2[4][i] - m2[5][i] + m2[6][i] + m2[7][i]
     460             :     //m2[6][i] = (m2[0][i] + m2[1][i]) - (m2[2][i] + m2[3][i]) - ( (m2[4][i] + m2[5][i]) - (m2[6][i] + m2[7][i]) )
     461           0 :     s9 = _mm_sub_epi16(diff0to3Pos, diff4to7Pos);
     462           0 :     s9 = _mm_abs_epi16(s9);
     463           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     464           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     465           0 :     s10 = _mm_add_epi32(s10, s11);
     466           0 :     s10 = _mm_hadd_epi32(s10, s8);
     467           0 :     s10 = _mm_hadd_epi32(s10, s8);
     468           0 :     s12 = _mm_add_epi32(s10, s12);
     469             : 
     470             :     //m2[7][i] = m1[6][i] - m1[7][i]
     471             :     //m2[7][i] = m3[4][i] - m3[6][i] - (m3[5][i] - m3[7][i])
     472             :     //m2[7][i] = m2[0][i] - m2[4][i] - (m2[2][i] - m2[6][i]) - ((m2[1][i] - m2[5][i]) - (m2[3][i] - m2[7][i]))
     473             :     //m2[7][i] = (m2[0][i] - m2[1][i]) - (m2[2][i] - m2[3][i]) - ( (m2[4][i] - m2[5][i]) - (m2[6][i] - m2[7][i]) )
     474           0 :     s9 = _mm_sub_epi16(diff0to3Neg, diff4to7Neg);
     475           0 :     s9 = _mm_abs_epi16(s9);
     476           0 :     s10 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
     477           0 :     s11 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
     478           0 :     s10 = _mm_add_epi32(s10, s11);
     479           0 :     s10 = _mm_hadd_epi32(s10, s8);
     480           0 :     s10 = _mm_hadd_epi32(s10, s8);
     481           0 :     s12 = _mm_add_epi32(s10, s12);
     482             : 
     483           0 :     satdBlock8x8 = (uint64_t)_mm_extract_epi32(s12, 0);
     484             : 
     485           0 :     satdBlock8x8 = ((satdBlock8x8 + 2) >> 2);
     486             : 
     487           0 :     return satdBlock8x8;
     488             : }

Generated by: LCOV version 1.14