row = _mm_setzero_si128();
w0 = _mm_shuffle_epi32(inf_u_q, _MM_SHUFFLE(N, N, N, N));
flags = _mm_and_si128(w0, _mm_set_epi32(0x8880, 0x4440, 0x2220, 0x1110));
insig = _mm_cmpeq_epi32(flags, _mm_setzero_si128());
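// proceed only when at least one of the quad's four samples is significant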
if (_mm_movemask_epi8(insig) != 0xFFFF)
U_q = _mm_shuffle_epi32(U_q, _MM_SHUFFLE(N, N, N, N));
flags = _mm_mullo_epi16(flags, _mm_set_epi16(1, 1, 2, 2, 4, 4, 8, 8));
__m128i ms_vec = frwd_fetch<0xFF>(magsgn);
w0 = _mm_srli_epi32(flags, 15);
m_n = _mm_sub_epi32(U_q, w0);
m_n = _mm_andnot_si128(insig, m_n);
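// inclusive prefix sum of the per-sample bit counts (m_n) gives each sample's
// bit offset into the fetched MagSgn bytes; total_mn is the number of bits consumed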
__m128i inc_sum = m_n;
inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 4));
inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 8));
int total_mn = _mm_extract_epi16(inc_sum, 6);
__m128i ex_sum = _mm_bslli_si128(inc_sum, 4);
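// convert bit offsets to byte/bit positions: pshufb gathers the bytes holding
// each sample's bits, and a 16-bit multiply by 2^bit_idx acts as a per-lane
// variable left shift; d0/d1 are then merged into aligned sample bits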
__m128i byte_idx = _mm_srli_epi32(ex_sum, 3);
__m128i bit_idx = _mm_and_si128(ex_sum, _mm_set1_epi32(7));
byte_idx = _mm_shuffle_epi8(byte_idx,
  _mm_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000));
byte_idx = _mm_add_epi32(byte_idx, _mm_set1_epi32(0x03020100));
__m128i d0 = _mm_shuffle_epi8(ms_vec, byte_idx);
byte_idx = _mm_add_epi32(byte_idx, _mm_set1_epi32(0x01010101));
__m128i d1 = _mm_shuffle_epi8(ms_vec, byte_idx);
bit_idx = _mm_or_si128(bit_idx, _mm_slli_epi32(bit_idx, 16));
__m128i bit_shift = _mm_shuffle_epi8(
  _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
               1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
bit_shift = _mm_add_epi16(bit_shift, _mm_set1_epi16(0x0101));
d0 = _mm_mullo_epi16(d0, bit_shift);
d0 = _mm_srli_epi16(d0, 8);
d1 = _mm_mullo_epi16(d1, bit_shift);
d1 = _mm_and_si128(d1, _mm_set1_epi32((si32)0xFF00FF00));
d0 = _mm_or_si128(d0, d1);
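// build each sample value: mask to its m_n low bits, add an extra top bit
// (the implicit MSB) when the 0x800 flag is set, move the sign to bit 31,
// and shift the magnitude so it starts at bit p-1; insignificant lanes stay zero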
__m128i ones = _mm_set1_epi32(1);
__m128i twos = _mm_set1_epi32(2);
__m128i U_q_m1 = _mm_sub_epi32(U_q, ones);
U_q_m1 = _mm_and_si128(U_q_m1, _mm_set_epi32(0, 0, 0, 0x1F));
w0 = _mm_sub_epi32(twos, w0);
shift = _mm_sll_epi32(w0, U_q_m1);
ms_vec = _mm_and_si128(d0, _mm_sub_epi32(shift, ones));
w0 = _mm_and_si128(flags, _mm_set1_epi32(0x800));
w0 = _mm_cmpeq_epi32(w0, _mm_setzero_si128());
w0 = _mm_andnot_si128(w0, shift);
ms_vec = _mm_or_si128(ms_vec, w0);
w0 = _mm_slli_epi32(ms_vec, 31);
ms_vec = _mm_or_si128(ms_vec, ones);
__m128i tvn = ms_vec;
ms_vec = _mm_add_epi32(ms_vec, twos);
ms_vec = _mm_slli_epi32(ms_vec, (si32)p - 1);
ms_vec = _mm_or_si128(ms_vec, w0);
row = _mm_andnot_si128(insig, ms_vec);
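// gather the values decoded for the quad's bottom row into vn; the next row
// pair derives its Emax exponents from these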
ms_vec = _mm_andnot_si128(insig, tvn);
if (N == 0)  // the template parameter selects which vn lanes this quad feeds
  tvn = _mm_shuffle_epi8(ms_vec,
    _mm_set_epi32(-1, -1, 0x0F0E0D0C, 0x07060504));
else
  tvn = _mm_shuffle_epi8(ms_vec,
    _mm_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1));
vn = _mm_or_si128(vn, tvn);
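// 16-bit variant: the same procedure applied to two quads (eight samples) at a
// time, using 16-bit lanes throughout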
row = _mm_setzero_si128();
w0 = _mm_shuffle_epi8(inf_u_q,
  _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504,
                0x0100, 0x0100, 0x0100, 0x0100));
flags = _mm_and_si128(w0,
  _mm_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110,
                (si16)0x8880, 0x4440, 0x2220, 0x1110));
insig = _mm_cmpeq_epi16(flags, _mm_setzero_si128());
if (_mm_movemask_epi8(insig) != 0xFFFF)
U_q = _mm_shuffle_epi8(U_q,
  _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504,
                0x0100, 0x0100, 0x0100, 0x0100));
flags = _mm_mullo_epi16(flags, _mm_set_epi16(1, 2, 4, 8, 1, 2, 4, 8));
__m128i ms_vec = frwd_fetch<0xFF>(magsgn);
w0 = _mm_srli_epi16(flags, 15);
m_n = _mm_sub_epi16(U_q, w0);
m_n = _mm_andnot_si128(insig, m_n);
__m128i inc_sum = m_n;
inc_sum = _mm_add_epi16(inc_sum, _mm_bslli_si128(inc_sum, 2));
inc_sum = _mm_add_epi16(inc_sum, _mm_bslli_si128(inc_sum, 4));
inc_sum = _mm_add_epi16(inc_sum, _mm_bslli_si128(inc_sum, 8));
int total_mn = _mm_extract_epi16(inc_sum, 7);
__m128i ex_sum = _mm_bslli_si128(inc_sum, 2);
__m128i byte_idx = _mm_srli_epi16(ex_sum, 3);
__m128i bit_idx = _mm_and_si128(ex_sum, _mm_set1_epi16(7));
byte_idx = _mm_shuffle_epi8(byte_idx,
  _mm_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
                0x0606, 0x0404, 0x0202, 0x0000));
byte_idx = _mm_add_epi16(byte_idx, _mm_set1_epi16(0x0100));
__m128i d0 = _mm_shuffle_epi8(ms_vec, byte_idx);
byte_idx = _mm_add_epi16(byte_idx, _mm_set1_epi16(0x0101));
__m128i d1 = _mm_shuffle_epi8(ms_vec, byte_idx);
__m128i bit_shift = _mm_shuffle_epi8(
  _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
               1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
bit_shift = _mm_add_epi16(bit_shift, _mm_set1_epi16(0x0101));
d0 = _mm_mullo_epi16(d0, bit_shift);
d0 = _mm_srli_epi16(d0, 8);
d1 = _mm_mullo_epi16(d1, bit_shift);
d1 = _mm_and_si128(d1, _mm_set1_epi16((si16)0xFF00));
d0 = _mm_or_si128(d0, d1);
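// the two quads can carry different U_q values, so the low and high 64-bit
// halves are shifted separately (by Uq0 and Uq1) and then recombined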
__m128i shift, t0, t1, Uq0, Uq1;
__m128i ones = _mm_set1_epi16(1);
__m128i twos = _mm_set1_epi16(2);
__m128i U_q_m1 = _mm_sub_epi32(U_q, ones);
Uq0 = _mm_and_si128(U_q_m1, _mm_set_epi32(0, 0, 0, 0x1F));
Uq1 = _mm_bsrli_si128(U_q_m1, 14);
w0 = _mm_sub_epi16(twos, w0);
t0 = _mm_and_si128(w0, _mm_set_epi64x(0, -1));
t1 = _mm_and_si128(w0, _mm_set_epi64x(-1, 0));
t0 = _mm_sll_epi16(t0, Uq0);
t1 = _mm_sll_epi16(t1, Uq1);
shift = _mm_or_si128(t0, t1);
ms_vec = _mm_and_si128(d0, _mm_sub_epi16(shift, ones));
w0 = _mm_and_si128(flags, _mm_set1_epi16(0x800));
w0 = _mm_cmpeq_epi16(w0, _mm_setzero_si128());
w0 = _mm_andnot_si128(w0, shift);
ms_vec = _mm_or_si128(ms_vec, w0);
w0 = _mm_slli_epi16(ms_vec, 15);
ms_vec = _mm_or_si128(ms_vec, ones);
__m128i tvn = ms_vec;
ms_vec = _mm_add_epi16(ms_vec, twos);
ms_vec = _mm_slli_epi16(ms_vec, (si32)p - 1);
ms_vec = _mm_or_si128(ms_vec, w0);
row = _mm_andnot_si128(insig, ms_vec);
ms_vec = _mm_andnot_si128(insig, tvn);
w0 = _mm_shuffle_epi8(ms_vec,
  _mm_set_epi16(-1, -1, -1, -1, -1, -1, 0x0706, 0x0302));
vn = _mm_or_si128(vn, w0);
w0 = _mm_shuffle_epi8(ms_vec,
  _mm_set_epi16(-1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1));
vn = _mm_or_si128(vn, w0);
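// the codeblock decoder proper: sanity checks on the number of coding passes
// and on the available precision (missing_msbs) come before any decoding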
ui32 missing_msbs, ui32 num_passes,
static bool insufficient_precision = false;
static bool modify_code = false;
static bool truncate_spp_mrp = false;
if (num_passes > 1 && lengths2 == 0)
OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
          "one coding pass, but zero length for "
          "2nd and potential 3rd pass.");
OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
          "This codeblock has %d passes.", num_passes);
if (missing_msbs > 30)
if (insufficient_precision == false)
insufficient_precision = true;
OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
          "codeblock. This message will not be "
          "displayed again.");
else if (missing_msbs == 30)
if (modify_code == false) {
OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
          "pass. The code can be modified to support "
          "this case. This message will not be "
          "displayed again.");
else if (missing_msbs == 29)
if (num_passes > 1) {
if (truncate_spp_mrp == false) {
truncate_spp_mrp = true;
OJPH_WARN(0x00010005, "Not enough precision to decode the SigProp "
          "nor MagRef passes; both will be skipped. "
          "This message will not be displayed "
          "again.");
ui32 p = 30 - missing_msbs;
OJPH_WARN(0x00010006, "Wrong codeblock length.");
lcup = (int)lengths1;
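// scup (the length of the VLC part of the cleanup segment) is stored in the
// last two bytes of that segment and must lie in [2, min(lcup, 4079)]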
scup = (((int)coded_data[lcup - 1]) << 4) + (coded_data[lcup - 2] & 0xF);
if (scup < 2 || scup > lcup || scup > 4079)
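// scratch keeps two 16-bit entries per quad (a flags/info word and u_q) for the
// VLC stage; sstr is its row stride, rounded up to a multiple of 8 entries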
ui16 scratch[8 * 513] = {0};
ui32 sstr = ((width + 2u) + 7u) & ~7u;
assert((stride & 0x3) == 0);
ui32 mmsbp2 = missing_msbs + 2;
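// initialize the MEL decoder and the reverse-reading VLC decoder over the cleanup
// segment, then decode the first row of quads (no row above to supply context)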
mel_init(&mel, coded_data, lcup, scup);
rev_init(&vlc, coded_data, lcup, scup);
for (ui32 x = 0; x < width; sp += 4)
t0 = (run == -1) ? t0 : 0;
c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
if (c_q == 0 && x < width)
t1 = (run == -1) ? t1 : 0;
t1 = x < width ? t1 : 0;
c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
if (uvlc_mode == 0xc0)
uvlc_mode += (run == -1) ? 0x40 : 0;
ui32 len = uvlc_entry & 0xF;
ui32 tmp = vlc_val & ((1 << len) - 1);
len = uvlc_entry & 0x7;
ui16 u_q = (ui16)(1 + (uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len));
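// remaining rows of quads: the VLC context now also incorporates the
// significance of the row above (read through sp[-sstr])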
for (ui32 y = 2; y < height; y += 2)
ui16 *sp = scratch + (y >> 1) * sstr;
for (ui32 x = 0; x < width; sp += 4)
c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
t0 = (run == -1) ? t0 : 0;
c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
c_q |= sp[0 - (si32)sstr] & 0x80;
c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
t1 = vlc_tbl1[c_q + (vlc_val & 0x7F)];
if (c_q == 0 && x < width)
t1 = (run == -1) ? t1 : 0;
t1 = x < width ? t1 : 0;
c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
c_q |= sp[2 - (si32)sstr] & 0x80;
ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
ui32 len = uvlc_entry & 0xF;
ui32 tmp = vlc_val & ((1 << len) - 1);
len = uvlc_entry & 0x7;
ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len));
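// MagSgn decoding, 32-bit sample path: read the MagSgn bit-stream forward and
// reconstruct the samples of the first pair of rows, tracking v_n for later rows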
const int v_n_size = 512 + 8;
ui32 v_n_scratch[2 * v_n_size] = {0};
frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
ui32 *vp = v_n_scratch;
ui32 *dp = decoded_data;
for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
__m128i inf_u_q, U_q;
inf_u_q = _mm_loadu_si128((__m128i*)sp);
U_q = _mm_srli_epi32(inf_u_q, 16);
w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
int i = _mm_movemask_epi8(w0);
__m128i vn = _mm_set1_epi32(2);
__m128i row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
__m128i row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
w0 = _mm_loadu_si128((__m128i*)vp);
w0 = _mm_and_si128(w0, _mm_set_epi32(0, 0, 0, -1));
w0 = _mm_or_si128(w0, vn);
_mm_storeu_si128((__m128i*)vp, w0);
w0 = _mm_unpacklo_epi32(row0, row1);
w1 = _mm_unpackhi_epi32(row0, row1);
row0 = _mm_unpacklo_epi32(w0, w1);
row1 = _mm_unpackhi_epi32(w0, w1);
_mm_store_si128((__m128i*)dp, row0);
_mm_store_si128((__m128i*)(dp + stride), row1);
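// for each subsequent row pair, first compute the position of the most
// significant set bit of each v_n from the previous row using an SSSE3 nibble
// lookup (a vectorized bit scan); the results are stored after the v_n values
// and consumed as Emax below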
for (ui32 y = 2; y < height; y += 2)
ui32 *vp = v_n_scratch;
const __m128i lut_lo = _mm_set_epi8(
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 31);
const __m128i lut_hi = _mm_set_epi8(
  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 31);
const __m128i nibble_mask = _mm_set1_epi8(0x0F);
const __m128i byte_offset8 = _mm_set1_epi16(8);
const __m128i byte_offset16 = _mm_set1_epi16(16);
const __m128i cc = _mm_set1_epi32(31);
for (ui32 x = 0; x <= width; x += 8, vp += 4)
v = _mm_loadu_si128((__m128i*)vp);
t = _mm_and_si128(nibble_mask, v);
v = _mm_and_si128(_mm_srli_epi16(v, 4), nibble_mask);
t = _mm_shuffle_epi8(lut_lo, t);
v = _mm_shuffle_epi8(lut_hi, v);
v = _mm_min_epu8(v, t);
t = _mm_srli_epi16(v, 8);
v = _mm_or_si128(v, byte_offset8);
v = _mm_min_epu8(v, t);
t = _mm_srli_epi32(v, 16);
v = _mm_or_si128(v, byte_offset16);
v = _mm_min_epu8(v, t);
v = _mm_sub_epi16(cc, v);
_mm_storeu_si128((__m128i*)(vp + v_n_size), v);
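// kappa per quad: 1 when the current quad has at most one significant sample
// (gamma), otherwise at least the larger of the two exponents stored above the
// quad; U_q = u_q + kappa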
ui32 *vp = v_n_scratch;
ui16 *sp = scratch + (y >> 1) * sstr;
ui32 *dp = decoded_data + y * stride;
for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
__m128i inf_u_q, U_q;
__m128i gamma, emax, kappa, u_q;
inf_u_q = _mm_loadu_si128((__m128i*)sp);
gamma = _mm_and_si128(inf_u_q, _mm_set1_epi32(0xF0));
w0 = _mm_sub_epi32(gamma, _mm_set1_epi32(1));
gamma = _mm_and_si128(gamma, w0);
gamma = _mm_cmpeq_epi32(gamma, _mm_setzero_si128());
emax = _mm_loadu_si128((__m128i*)(vp + v_n_size));
w0 = _mm_bsrli_si128(emax, 4);
emax = _mm_max_epi16(w0, emax);
emax = _mm_andnot_si128(gamma, emax);
kappa = _mm_set1_epi32(1);
kappa = _mm_max_epi16(emax, kappa);
u_q = _mm_srli_epi32(inf_u_q, 16);
U_q = _mm_add_epi32(u_q, kappa);
w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
int i = _mm_movemask_epi8(w0);
__m128i vn = _mm_set1_epi32(2);
__m128i row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
__m128i row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
w0 = _mm_loadu_si128((__m128i*)vp);
w0 = _mm_and_si128(w0, _mm_set_epi32(0, 0, 0, -1));
w0 = _mm_or_si128(w0, vn);
_mm_storeu_si128((__m128i*)vp, w0);
w0 = _mm_unpacklo_epi32(row0, row1);
w1 = _mm_unpackhi_epi32(row0, row1);
row0 = _mm_unpacklo_epi32(w0, w1);
row1 = _mm_unpackhi_epi32(w0, w1);
_mm_store_si128((__m128i*)dp, row0);
_mm_store_si128((__m128i*)(dp + stride), row1);
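// 16-bit sample path: mirrors the 32-bit path above, but keeps v_n in 16-bit
// scratch and expands each decoded 16-bit result into the 32-bit output rows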
const int v_n_size = 512 + 8;
ui16 v_n_scratch[2 * v_n_size] = {0};
frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
ui16 *vp = v_n_scratch;
ui32 *dp = decoded_data;
for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
__m128i inf_u_q, U_q;
inf_u_q = _mm_loadu_si128((__m128i*)sp);
U_q = _mm_srli_epi32(inf_u_q, 16);
w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
int i = _mm_movemask_epi8(w0);
__m128i vn = _mm_set1_epi16(2);
w0 = _mm_loadu_si128((__m128i*)vp);
w0 = _mm_and_si128(w0, _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1));
w0 = _mm_or_si128(w0, vn);
_mm_storeu_si128((__m128i*)vp, w0);
w0 = _mm_shuffle_epi8(row,
  _mm_set_epi16(0x0D0C, -1, 0x0908, -1,
                0x0504, -1, 0x0100, -1));
_mm_store_si128((__m128i*)dp, w0);
w1 = _mm_shuffle_epi8(row,
  _mm_set_epi16(0x0F0E, -1, 0x0B0A, -1,
                0x0706, -1, 0x0302, -1));
_mm_store_si128((__m128i*)(dp + stride), w1);
for (ui32 y = 2; y < height; y += 2)
ui16 *vp = v_n_scratch;
const __m128i lut_lo = _mm_set_epi8(
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 15);
const __m128i lut_hi = _mm_set_epi8(
  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 15);
const __m128i nibble_mask = _mm_set1_epi8(0x0F);
const __m128i byte_offset8 = _mm_set1_epi16(8);
const __m128i cc = _mm_set1_epi16(15);
for (ui32 x = 0; x <= width; x += 16, vp += 8)
v = _mm_loadu_si128((__m128i*)vp);
t = _mm_and_si128(nibble_mask, v);
v = _mm_and_si128(_mm_srli_epi16(v, 4), nibble_mask);
t = _mm_shuffle_epi8(lut_lo, t);
v = _mm_shuffle_epi8(lut_hi, v);
v = _mm_min_epu8(v, t);
t = _mm_srli_epi16(v, 8);
v = _mm_or_si128(v, byte_offset8);
v = _mm_min_epu8(v, t);
v = _mm_sub_epi16(cc, v);
_mm_storeu_si128((__m128i*)(vp + v_n_size), v);
ui16 *vp = v_n_scratch;
ui16 *sp = scratch + (y >> 1) * sstr;
ui32 *dp = decoded_data + y * stride;
for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
__m128i inf_u_q, U_q;
__m128i gamma, emax, kappa, u_q;
inf_u_q = _mm_loadu_si128((__m128i*)sp);
gamma = _mm_and_si128(inf_u_q, _mm_set1_epi32(0xF0));
w0 = _mm_sub_epi32(gamma, _mm_set1_epi32(1));
gamma = _mm_and_si128(gamma, w0);
gamma = _mm_cmpeq_epi32(gamma, _mm_setzero_si128());
emax = _mm_loadu_si128((__m128i*)(vp + v_n_size));
w0 = _mm_bsrli_si128(emax, 2);
emax = _mm_max_epi16(w0, emax);
emax = _mm_shuffle_epi8(emax,
  _mm_set_epi16(-1, 0x0706, -1, 0x0504,
                -1, 0x0302, -1, 0x0100));
emax = _mm_andnot_si128(gamma, emax);
kappa = _mm_set1_epi32(1);
kappa = _mm_max_epi16(emax, kappa);
u_q = _mm_srli_epi32(inf_u_q, 16);
U_q = _mm_add_epi32(u_q, kappa);
w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
int i = _mm_movemask_epi8(w0);
__m128i vn = _mm_set1_epi16(2);
w0 = _mm_loadu_si128((__m128i*)vp);
w0 = _mm_and_si128(w0, _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1));
w0 = _mm_or_si128(w0, vn);
_mm_storeu_si128((__m128i*)vp, w0);
w0 = _mm_shuffle_epi8(row,
  _mm_set_epi16(0x0D0C, -1, 0x0908, -1,
                0x0504, -1, 0x0100, -1));
_mm_store_si128((__m128i*)dp, w0);
w1 = _mm_shuffle_epi8(row,
  _mm_set_epi16(0x0F0E, -1, 0x0B0A, -1,
                0x0706, -1, 0x0302, -1));
_mm_store_si128((__m128i*)(dp + stride), w1);
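// build the sigma significance map used by the SigProp and MagRef passes: each
// 16-bit word covers a 4-wide by 4-high block of samples, one bit per sample,
// assembled from the significance (rho) bits of two scratch rows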
ui16* const sigma = scratch;
ui32 mstr = (width + 3u) >> 2;
mstr = ((mstr + 2u) + 7u) & ~7u;
const __m128i mask_3 = _mm_set1_epi32(0x30);
const __m128i mask_C = _mm_set1_epi32(0xC0);
const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
for (y = 0; y < height; y += 4)
ui16* sp = scratch + (y >> 1) * sstr;
ui16* dp = sigma + (y >> 2) * mstr;
for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
__m128i s0, s1, u3, uC, t0, t1;
s0 = _mm_loadu_si128((__m128i*)(sp));
u3 = _mm_and_si128(s0, mask_3);
u3 = _mm_srli_epi32(u3, 4);
uC = _mm_and_si128(s0, mask_C);
uC = _mm_srli_epi32(uC, 2);
t0 = _mm_or_si128(u3, uC);
s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
u3 = _mm_and_si128(s1, mask_3);
u3 = _mm_srli_epi32(u3, 2);
uC = _mm_and_si128(s1, mask_C);
t1 = _mm_or_si128(u3, uC);
__m128i r = _mm_or_si128(t0, t1);
r = _mm_shuffle_epi8(r, shuffle_mask);
_mm_store_ss((float*)dp, _mm_castsi128_ps(r));
ui16* dp = sigma + (y >> 2) * mstr;
__m128i zero = _mm_setzero_si128();
for (ui32 x = 0; x < width; x += 32, dp += 8)
  _mm_store_si128((__m128i*)dp, zero);
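// SigProp pass: within each 4-row stripe, samples that are still insignificant
// but have a significant neighbor read one bit from the sigprop stream; samples
// that become significant then read a sign bit as well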
ui16 prev_row_sig[256 + 8] = {0};
frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
for (ui32 y = 0; y < height; y += 4)
ui32 pattern = 0xFFFFu;
if (height - y < 4) {
if (height - y < 3) {
ui16 *prev_sig = prev_row_sig;
ui16 *cur_sig = sigma + (y >> 2) * mstr;
ui32 *dpp = decoded_data + y * stride;
for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
pattern = pattern >> (s * 4);
ui32 ns = *(ui32*)(cur_sig + mstr);
ui32 u = (ps & 0x88888888) >> 3;
u |= (ns & 0x11111111) << 3;
mbr |= (cs & 0x77777777) << 1;
mbr |= (cs & 0xEEEEEEEE) >> 1;
__m128i cwd_vec = frwd_fetch<0>(&sigprop);
ui32 cwd = (ui32)_mm_extract_epi16(cwd_vec, 0);
ui32 col_mask = 0xFu;
ui32 inv_sig = ~cs & pattern;
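// scan the stripe column by column: a bit is read only for members of the pass,
// and each newly significant sample immediately extends the membership (new_sig)
// of the samples that follow it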
for (int i = 0; i < 16; i += 4, col_mask <<= 4)
if ((col_mask & new_sig) == 0)
ui32 sample_mask = 0x1111u & col_mask;
if (new_sig & sample_mask)
new_sig &= ~sample_mask;
ui32 t = 0x33u << i;
new_sig |= t & inv_sig;
if (new_sig & sample_mask)
new_sig &= ~sample_mask;
ui32 t = 0x76u << i;
new_sig |= t & inv_sig;
if (new_sig & sample_mask)
new_sig &= ~sample_mask;
ui32 t = 0xECu << i;
new_sig |= t & inv_sig;
if (new_sig & sample_mask)
new_sig &= ~sample_mask;
ui32 t = 0xC8u << i;
new_sig |= t & inv_sig;
cwd |= (ui32)_mm_extract_epi16(cwd_vec, 1) << (16 - cnt);
__m128i new_sig_vec = _mm_set1_epi16((si16)new_sig);
new_sig_vec = _mm_shuffle_epi8(new_sig_vec,
  _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0));
new_sig_vec = _mm_and_si128(new_sig_vec,
  _mm_set1_epi64x((si64)0x8040201008040201));
new_sig_vec = _mm_cmpeq_epi8(new_sig_vec,
  _mm_set1_epi64x((si64)0x8040201008040201));
__m128i inc_sum = new_sig_vec;
inc_sum = _mm_abs_epi8(inc_sum);
inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8;
__m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
cwd_vec = _mm_set1_epi16((si16)cwd);
cwd_vec = _mm_shuffle_epi8(cwd_vec,
  _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0));
cwd_vec = _mm_and_si128(cwd_vec,
  _mm_set1_epi64x((si64)0x8040201008040201));
cwd_vec = _mm_cmpeq_epi8(cwd_vec,
  _mm_set1_epi64x((si64)0x8040201008040201));
cwd_vec = _mm_abs_epi8(cwd_vec);
__m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum);
__m128i m =
  _mm_set_epi8(-1, -1, -1, 12, -1, -1, -1, 8, -1, -1, -1, 4, -1, -1, -1, 0);
__m128i val = _mm_set1_epi32(3 << (p - 2));
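// per output column: select the newly significant samples and OR the decoded
// sign (bit 31) together with the value 3 << (p - 2) into them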
for (int c = 0; c < 4; ++c) {
__m128i s0, s0_ns, s0_val;
s0 = _mm_load_si128((__m128i*)dp);
s0_ns = _mm_shuffle_epi8(new_sig_vec, m);
s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF));
s0_val = _mm_shuffle_epi8(v, m);
s0_val = _mm_slli_epi32(s0_val, 31);
s0_val = _mm_or_si128(s0_val, val);
s0_val = _mm_and_si128(s0_val, s0_ns);
s0 = _mm_or_si128(s0, s0_val);
_mm_store_si128((__m128i*)dp, s0);
m = _mm_add_epi32(m, _mm_set1_epi32(1));
*prev_sig = (ui16)(new_sig);
new_sig |= (t & 0x7777) << 1;
new_sig |= (t & 0xEEEE) >> 1;
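// MagRef pass: every sample that is already significant reads one refinement
// bit from the magref stream (consumed in reverse), which adjusts its magnitude
// around bit p - 2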
rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
for (ui32 y = 0; y < height; y += 4)
ui16 *cur_sig = sigma + (y >> 2) * mstr;
ui32 *dpp = decoded_data + y * stride;
for (ui32 i = 0; i < width; i += 4, dpp += 4)
ui16 sig = *cur_sig++;
__m128i sig_vec = _mm_set1_epi16((si16)sig);
sig_vec = _mm_shuffle_epi8(sig_vec,
  _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0));
sig_vec = _mm_and_si128(sig_vec,
  _mm_set1_epi64x((si64)0x8040201008040201));
sig_vec = _mm_cmpeq_epi8(sig_vec,
  _mm_set1_epi64x((si64)0x8040201008040201));
sig_vec = _mm_abs_epi8(sig_vec);
__m128i inc_sum = sig_vec;
inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
total_bits = _mm_extract_epi16(inc_sum, 7) >> 8;
__m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
__m128i cwd_vec = _mm_set1_epi16((si16)cwd);
cwd_vec = _mm_shuffle_epi8(cwd_vec,
  _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0));
cwd_vec = _mm_and_si128(cwd_vec,
  _mm_set1_epi64x((si64)0x8040201008040201));
cwd_vec = _mm_cmpeq_epi8(cwd_vec,
  _mm_set1_epi64x((si64)0x8040201008040201));
cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1));
cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec);
cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1));
__m128i m =
  _mm_set_epi8(-1, -1, -1, 12, -1, -1, -1, 8, -1, -1, -1, 4, -1, -1, -1, 0);
for (int c = 0; c < 4; ++c) {
__m128i s0, s0_sig, s0_idx, s0_val;
s0 = _mm_load_si128((__m128i*)dp);
s0_sig = _mm_shuffle_epi8(sig_vec, m);
s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128());
s0_idx = _mm_shuffle_epi8(ex_sum, m);
s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx);
s0_val = _mm_andnot_si128(s0_sig, s0_val);
s0_val = _mm_slli_epi32(s0_val, (si32)p - 2);
s0 = _mm_xor_si128(s0, s0_val);
_mm_store_si128((__m128i*)dp, s0);
m = _mm_add_epi32(m, _mm_set1_epi32(1));