/*
* Copyright(c) 2019 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#include "EbPackUnPack_AVX2.h"

#include <emmintrin.h>
#include <immintrin.h>
#include <stdint.h>

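/*
 * eb_enc_un_pack8_bit_data_avx2_intrin()
 * Converts a block of 16-bit samples (typically 10-bit content) to 8-bit
 * output by dropping the two least-significant bits (>> 2). Dedicated SIMD
 * paths handle widths of 8, 16, 32 and 64; the final branch covers arbitrary
 * widths. Most paths process two rows per loop iteration.
 */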
void eb_enc_un_pack8_bit_data_avx2_intrin(
    uint16_t *in_16bit_buffer,
    uint32_t  in_stride,
    uint8_t  *out_8bit_buffer,
    uint32_t  out_stride,
    uint32_t  width,
    uint32_t  height) {
    uint32_t x, y;

    __m256i ymm_00FF, in_pixel0, in_pixel1;
    __m256i in_pixel0_shftR_2_U8;
    ymm_00FF = _mm256_set1_epi16(0x00FF);

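    /* Width 8: SSE2 path, two rows of 8 pixels per iteration; each row is
     * shifted, masked, packed to bytes and stored as 8 bytes. */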
    if (width == 8) {
        __m128i xmm_00FF, in_pixel0, in_pixel1, in_pixel1_shftR_2_U8;
        __m128i in_pixel0_shftR_2_U8, in_pixel0_shftR_2, in_pixel1_shftR_2;
        xmm_00FF = _mm_set1_epi16(0x00FF);
        for (y = 0; y < height; y += 2) {
            in_pixel0 = _mm_loadu_si128((__m128i*) in_16bit_buffer);
            in_pixel1 = _mm_loadu_si128((__m128i*)
                (in_16bit_buffer + in_stride));

            in_pixel0_shftR_2 = _mm_and_si128(_mm_srli_epi16(in_pixel0, 2),
                xmm_00FF);
            in_pixel1_shftR_2 = _mm_and_si128(_mm_srli_epi16(in_pixel1, 2),
                xmm_00FF);

            in_pixel0_shftR_2_U8 = _mm_packus_epi16(in_pixel0_shftR_2,
                in_pixel0_shftR_2);
            in_pixel1_shftR_2_U8 = _mm_packus_epi16(in_pixel1_shftR_2,
                in_pixel1_shftR_2);

            _mm_storel_epi64((__m128i*)out_8bit_buffer, in_pixel0_shftR_2_U8);
            _mm_storel_epi64((__m128i*)(out_8bit_buffer + out_stride),
                in_pixel1_shftR_2_U8);

            out_8bit_buffer += 2 * out_stride;
            in_16bit_buffer += 2 * in_stride;
        }
    }
    else if (width == 16) {
        for (y = 0; y < height; y += 2) {
            in_pixel0 = _mm256_loadu_si256((__m256i*) in_16bit_buffer);
            in_pixel1 = _mm256_loadu_si256((__m256i*)
                (in_16bit_buffer + in_stride));

            in_pixel0_shftR_2_U8 = _mm256_packus_epi16(_mm256_and_si256(
                _mm256_srli_epi16(in_pixel0, 2), ymm_00FF), _mm256_and_si256(
                _mm256_srli_epi16(in_pixel1, 2), ymm_00FF));

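            /* _mm256_packus_epi16 packs within each 128-bit lane, so the
             * 64-bit lanes hold row0[0..7], row1[0..7], row0[8..15],
             * row1[8..15]; hence extract indices 0/2 go to output row 0 and
             * 1/3 go to output row 1. */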
            *(uint64_t *)out_8bit_buffer =
                _mm256_extract_epi64(in_pixel0_shftR_2_U8, 0);
            *(uint64_t *)(out_8bit_buffer + 8) =
                _mm256_extract_epi64(in_pixel0_shftR_2_U8, 2);
            *(uint64_t *)(out_8bit_buffer + out_stride) =
                _mm256_extract_epi64(in_pixel0_shftR_2_U8, 1);
            *(uint64_t *)(out_8bit_buffer + out_stride + 8) =
                _mm256_extract_epi64(in_pixel0_shftR_2_U8, 3);

            out_8bit_buffer += 2 * out_stride;
            in_16bit_buffer += 2 * in_stride;
        }
    }
    else if (width == 32) {
        __m256i in_pixel2, in_pixel3;
        __m256i out8_0_U8, out8_1_U8;

        for (y = 0; y < height; y += 2) {
            in_pixel0 = _mm256_loadu_si256((__m256i*)in_16bit_buffer);
            in_pixel1 = _mm256_loadu_si256((__m256i*)(in_16bit_buffer + 16));
            in_pixel2 = _mm256_loadu_si256((__m256i*)
                (in_16bit_buffer + in_stride));
            in_pixel3 = _mm256_loadu_si256((__m256i*)
                (in_16bit_buffer + in_stride + 16));

            out8_0_U8 = _mm256_packus_epi16(
                _mm256_and_si256(_mm256_srli_epi16(in_pixel0, 2), ymm_00FF),
                _mm256_and_si256(_mm256_srli_epi16(in_pixel1, 2), ymm_00FF));
            out8_1_U8 = _mm256_packus_epi16(
                _mm256_and_si256(_mm256_srli_epi16(in_pixel2, 2), ymm_00FF),
                _mm256_and_si256(_mm256_srli_epi16(in_pixel3, 2), ymm_00FF));

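            /* Both halves of a 32-pixel row are packed into one register, so
             * the in-lane interleave places the row's 8-byte chunks in
             * extract order 0, 2, 1, 3 across output offsets 0..24. */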
            *(uint64_t *)out_8bit_buffer = _mm256_extract_epi64(out8_0_U8, 0);
            *(uint64_t *)(out_8bit_buffer + 8) =
                _mm256_extract_epi64(out8_0_U8, 2);
            *(uint64_t *)(out_8bit_buffer + 16) =
                _mm256_extract_epi64(out8_0_U8, 1);
            *(uint64_t *)(out_8bit_buffer + 24) =
                _mm256_extract_epi64(out8_0_U8, 3);
            out_8bit_buffer += out_stride;

            *(uint64_t *)out_8bit_buffer = _mm256_extract_epi64(out8_1_U8, 0);
            *(uint64_t *)(out_8bit_buffer + 8) =
                _mm256_extract_epi64(out8_1_U8, 2);
            *(uint64_t *)(out_8bit_buffer + 16) =
                _mm256_extract_epi64(out8_1_U8, 1);
            *(uint64_t *)(out_8bit_buffer + 24) =
                _mm256_extract_epi64(out8_1_U8, 3);
            out_8bit_buffer += out_stride;
            in_16bit_buffer += 2 * in_stride;
        }
    }
    else if (width == 64) {
        __m256i in_pixel2, in_pixel3;
        __m256i out8_0_U8, out8_1_U8;

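        /* Width 64: one full row per iteration, four 256-bit loads producing
         * 64 output bytes. */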
        for (y = 0; y < height; ++y) {
            in_pixel0 = _mm256_loadu_si256((__m256i*)in_16bit_buffer);
            in_pixel1 = _mm256_loadu_si256((__m256i*)(in_16bit_buffer + 16));
            in_pixel2 = _mm256_loadu_si256((__m256i*)(in_16bit_buffer + 32));
            in_pixel3 = _mm256_loadu_si256((__m256i*)(in_16bit_buffer + 48));

            out8_0_U8 = _mm256_packus_epi16(
                _mm256_and_si256(_mm256_srli_epi16(in_pixel0, 2), ymm_00FF),
                _mm256_and_si256(_mm256_srli_epi16(in_pixel1, 2), ymm_00FF));
            out8_1_U8 = _mm256_packus_epi16(
                _mm256_and_si256(_mm256_srli_epi16(in_pixel2, 2), ymm_00FF),
                _mm256_and_si256(_mm256_srli_epi16(in_pixel3, 2), ymm_00FF));

            *(uint64_t *)out_8bit_buffer = _mm256_extract_epi64(out8_0_U8, 0);
            *(uint64_t *)(out_8bit_buffer + 8) =
                _mm256_extract_epi64(out8_0_U8, 2);
            *(uint64_t *)(out_8bit_buffer + 16) =
                _mm256_extract_epi64(out8_0_U8, 1);
            *(uint64_t *)(out_8bit_buffer + 24) =
                _mm256_extract_epi64(out8_0_U8, 3);
            *(uint64_t *)(out_8bit_buffer + 32) =
                _mm256_extract_epi64(out8_1_U8, 0);
            *(uint64_t *)(out_8bit_buffer + 40) =
                _mm256_extract_epi64(out8_1_U8, 2);
            *(uint64_t *)(out_8bit_buffer + 48) =
                _mm256_extract_epi64(out8_1_U8, 1);
            *(uint64_t *)(out_8bit_buffer + 56) =
                _mm256_extract_epi64(out8_1_U8, 3);

            out_8bit_buffer += out_stride;
            in_16bit_buffer += in_stride;
        }
    }
    else {
        uint32_t in_strideDiff = (2 * in_stride) - width;
        uint32_t out_strideDiff = (2 * out_stride) - width;

        uint32_t in_strideDiff64 = in_stride - width;
        uint32_t out_strideDiff64 = out_stride - width;

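        /* Generic widths: pick the widest chunk size (64/32/16/8/4 pixels)
         * that evenly divides the width. The strideDiff values skip the
         * remainder of a row pair to reach the next pair; strideDiff64 skips
         * to the next row in the single-row 64-pixel path. */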
        if (!(width & 63)) {
            __m256i in_pixel2, in_pixel3;
            __m256i out8_0_U8, out8_1_U8;

            for (x = 0; x < height; x += 1) {
                for (y = 0; y < width; y += 64) {
                    in_pixel0 = _mm256_loadu_si256((__m256i*)in_16bit_buffer);
                    in_pixel1 = _mm256_loadu_si256(
                        (__m256i*)(in_16bit_buffer + 16));
                    in_pixel2 = _mm256_loadu_si256(
                        (__m256i*)(in_16bit_buffer + 32));
                    in_pixel3 = _mm256_loadu_si256(
                        (__m256i*)(in_16bit_buffer + 48));

                    out8_0_U8 = _mm256_packus_epi16(_mm256_and_si256(
                        _mm256_srli_epi16(in_pixel0, 2), ymm_00FF),
                        _mm256_and_si256(_mm256_srli_epi16(in_pixel1, 2),
                        ymm_00FF));
                    out8_1_U8 = _mm256_packus_epi16(_mm256_and_si256(
                        _mm256_srli_epi16(in_pixel2, 2), ymm_00FF),
                        _mm256_and_si256(_mm256_srli_epi16(in_pixel3, 2),
                        ymm_00FF));

                    *(uint64_t *)out_8bit_buffer =
                        _mm256_extract_epi64(out8_0_U8, 0);
                    *(uint64_t *)(out_8bit_buffer + 8) =
                        _mm256_extract_epi64(out8_0_U8, 2);
                    *(uint64_t *)(out_8bit_buffer + 16) =
                        _mm256_extract_epi64(out8_0_U8, 1);
                    *(uint64_t *)(out_8bit_buffer + 24) =
                        _mm256_extract_epi64(out8_0_U8, 3);
                    *(uint64_t *)(out_8bit_buffer + 32) =
                        _mm256_extract_epi64(out8_1_U8, 0);
                    *(uint64_t *)(out_8bit_buffer + 40) =
                        _mm256_extract_epi64(out8_1_U8, 2);
                    *(uint64_t *)(out_8bit_buffer + 48) =
                        _mm256_extract_epi64(out8_1_U8, 1);
                    *(uint64_t *)(out_8bit_buffer + 56) =
                        _mm256_extract_epi64(out8_1_U8, 3);

                    out_8bit_buffer += 64;
                    in_16bit_buffer += 64;
                }
                in_16bit_buffer += in_strideDiff64;
                out_8bit_buffer += out_strideDiff64;
            }
        }
        else if (!(width & 31)) {
            __m256i in_pixel2, in_pixel3;
            __m256i out8_0_U8, out8_1_U8;

            for (x = 0; x < height; x += 2) {
                for (y = 0; y < width; y += 32) {
                    in_pixel0 = _mm256_loadu_si256((__m256i*)in_16bit_buffer);
                    in_pixel1 = _mm256_loadu_si256((__m256i*)
                        (in_16bit_buffer + 16));
                    in_pixel2 = _mm256_loadu_si256((__m256i*)
                        (in_16bit_buffer + in_stride));
                    in_pixel3 = _mm256_loadu_si256((__m256i*)
                        (in_16bit_buffer + in_stride + 16));

                    out8_0_U8 = _mm256_packus_epi16(_mm256_and_si256(
                        _mm256_srli_epi16(in_pixel0, 2), ymm_00FF),
                        _mm256_and_si256(_mm256_srli_epi16(in_pixel1, 2),
                        ymm_00FF));
                    out8_1_U8 = _mm256_packus_epi16(_mm256_and_si256(
                        _mm256_srli_epi16(in_pixel2, 2), ymm_00FF),
                        _mm256_and_si256(_mm256_srli_epi16(in_pixel3, 2),
                        ymm_00FF));

                    *(uint64_t *)out_8bit_buffer =
                        _mm256_extract_epi64(out8_0_U8, 0);
                    *(uint64_t *)(out_8bit_buffer + 8) =
                        _mm256_extract_epi64(out8_0_U8, 2);
                    *(uint64_t *)(out_8bit_buffer + 16) =
                        _mm256_extract_epi64(out8_0_U8, 1);
                    *(uint64_t *)(out_8bit_buffer + 24) =
                        _mm256_extract_epi64(out8_0_U8, 3);

                    *(uint64_t *)(out_8bit_buffer + out_stride) =
                        _mm256_extract_epi64(out8_1_U8, 0);
                    *(uint64_t *)(out_8bit_buffer + out_stride + 8) =
                        _mm256_extract_epi64(out8_1_U8, 2);
                    *(uint64_t *)(out_8bit_buffer + out_stride + 16) =
                        _mm256_extract_epi64(out8_1_U8, 1);
                    *(uint64_t *)(out_8bit_buffer + out_stride + 24) =
                        _mm256_extract_epi64(out8_1_U8, 3);

                    out_8bit_buffer += 32;
                    in_16bit_buffer += 32;
                }
                in_16bit_buffer += in_strideDiff;
                out_8bit_buffer += out_strideDiff;
            }
        }
        else if (!(width & 15)) {
            for (x = 0; x < height; x += 2) {
                for (y = 0; y < width; y += 16) {
                    in_pixel0 = _mm256_loadu_si256((__m256i*) in_16bit_buffer);
                    in_pixel1 = _mm256_loadu_si256((__m256i*)
                        (in_16bit_buffer + in_stride));

                    in_pixel0_shftR_2_U8 = _mm256_packus_epi16(
                        _mm256_and_si256(_mm256_srli_epi16(in_pixel0, 2),
                        ymm_00FF), _mm256_and_si256(_mm256_srli_epi16(
                        in_pixel1, 2), ymm_00FF));

                    *(uint64_t *)out_8bit_buffer =
                        _mm256_extract_epi64(in_pixel0_shftR_2_U8, 0);
                    *(uint64_t *)(out_8bit_buffer + 8) =
                        _mm256_extract_epi64(in_pixel0_shftR_2_U8, 2);
                    *(uint64_t *)(out_8bit_buffer + out_stride) =
                        _mm256_extract_epi64(in_pixel0_shftR_2_U8, 1);
                    *(uint64_t *)(out_8bit_buffer + out_stride + 8) =
                        _mm256_extract_epi64(in_pixel0_shftR_2_U8, 3);

                    out_8bit_buffer += 16;
                    in_16bit_buffer += 16;
                }
                in_16bit_buffer += in_strideDiff;
                out_8bit_buffer += out_strideDiff;
            }
        }
        else if (!(width & 7)) {
            __m128i xmm_00FF, in_pixel0, in_pixel1, in_pixel1_shftR_2_U8;
            __m128i in_pixel0_shftR_2_U8, in_pixel0_shftR_2, in_pixel1_shftR_2;
            xmm_00FF = _mm_set1_epi16(0x00FF);
            for (x = 0; x < height; x += 2) {
                for (y = 0; y < width; y += 8) {
                    in_pixel0 = _mm_loadu_si128((__m128i*) in_16bit_buffer);
                    in_pixel1 = _mm_loadu_si128((__m128i*)
                        (in_16bit_buffer + in_stride));

                    in_pixel0_shftR_2 = _mm_and_si128(
                        _mm_srli_epi16(in_pixel0, 2), xmm_00FF);
                    in_pixel1_shftR_2 = _mm_and_si128(
                        _mm_srli_epi16(in_pixel1, 2), xmm_00FF);

                    in_pixel0_shftR_2_U8 = _mm_packus_epi16(in_pixel0_shftR_2,
                        in_pixel0_shftR_2);
                    in_pixel1_shftR_2_U8 = _mm_packus_epi16(in_pixel1_shftR_2,
                        in_pixel1_shftR_2);

                    _mm_storel_epi64((__m128i*)out_8bit_buffer,
                        in_pixel0_shftR_2_U8);
                    _mm_storel_epi64((__m128i*)(out_8bit_buffer + out_stride),
                        in_pixel1_shftR_2_U8);

                    out_8bit_buffer += 8;
                    in_16bit_buffer += 8;
                }
                in_16bit_buffer += in_strideDiff;
                out_8bit_buffer += out_strideDiff;
            }
        }
        else {
            __m128i xmm_00FF, in_pixel0, in_pixel1, in_pixel1_shftR_2_U8;
            __m128i in_pixel0_shftR_2_U8, in_pixel0_shftR_2, in_pixel1_shftR_2;
            xmm_00FF = _mm_set1_epi16(0x00FF);
            uint32_t width_down4 = width & (~0x3);
            uint16_t in_pixel;
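            /* Widths that are not a multiple of 8: process 4 pixels at a time
             * with 64-bit loads, then finish each row pair with the scalar
             * loop below. */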
            for (x = 0; x < height; x += 2) {
                for (y = 0; y < width_down4; y += 4) {
                    in_pixel0 = _mm_loadl_epi64((__m128i*)in_16bit_buffer);
                    in_pixel1 = _mm_loadl_epi64((__m128i*)
                        (in_16bit_buffer + in_stride));

                    in_pixel0_shftR_2 = _mm_and_si128(
                        _mm_srli_epi16(in_pixel0, 2), xmm_00FF);
                    in_pixel1_shftR_2 = _mm_and_si128(
                        _mm_srli_epi16(in_pixel1, 2), xmm_00FF);

                    in_pixel0_shftR_2_U8 = _mm_packus_epi16(in_pixel0_shftR_2,
                        in_pixel0_shftR_2);
                    in_pixel1_shftR_2_U8 = _mm_packus_epi16(in_pixel1_shftR_2,
                        in_pixel1_shftR_2);

                    *(uint32_t*)out_8bit_buffer =
                        _mm_cvtsi128_si32(in_pixel0_shftR_2_U8);
                    *(uint32_t*)(out_8bit_buffer + out_stride) =
                        _mm_cvtsi128_si32(in_pixel1_shftR_2_U8);

                    out_8bit_buffer += 4;
                    in_16bit_buffer += 4;
                }

                /* Handle the leftover pixels in both rows when the width
                 * is not a multiple of 4.
                 */
                for (; y < width; y++) {
                    in_pixel = *in_16bit_buffer;
                    *out_8bit_buffer = (uint8_t)(in_pixel >> 2);
                    in_pixel = *(in_16bit_buffer + in_stride);
                    *(out_8bit_buffer + out_stride) = (uint8_t)(in_pixel >> 2);
                    ++out_8bit_buffer;
                    ++in_16bit_buffer;
                }

                in_16bit_buffer += in_strideDiff;
                out_8bit_buffer += out_strideDiff;
            }
        }
    }
}
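
/*
 * Illustrative scalar sketch (not part of the original file): a minimal
 * reference of what the AVX2 paths above compute, assuming 10-bit input so
 * the shifted values already fit in a byte. The function name is
 * hypothetical and chosen only for this example.
 */
void un_pack8_bit_data_scalar_sketch(
    const uint16_t *in_16bit_buffer,
    uint32_t        in_stride,
    uint8_t        *out_8bit_buffer,
    uint32_t        out_stride,
    uint32_t        width,
    uint32_t        height) {
    for (uint32_t y = 0; y < height; y++) {
        for (uint32_t x = 0; x < width; x++) {
            /* Drop the two least-significant bits of each sample. */
            out_8bit_buffer[y * out_stride + x] =
                (uint8_t)(in_16bit_buffer[y * in_stride + x] >> 2);
        }
    }
}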
