OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_codestream_sse2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2022, Aous Naman
6// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2022, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_codestream_sse2.cpp
34// Author: Aous Naman
35// Date: 15 May 2022
36//***************************************************************************/
37
38#include <climits>
39#include <immintrin.h>
40#include "ojph_defs.h"
41
42namespace ojph {
43 namespace local {
44
// Body of what appears to be sse2_find_max_val32(ui32 *address): the
// signature line is absent from this listing, but the symbol index at the
// end of the page declares it and this body reduces four 32-bit lanes.
//
// Bitwise-ORs the four 32-bit words found at `address` and returns the
// combined value. Note that the reduction is an OR, not an arithmetic
// maximum.
//
// Side effect: the 16 bytes at `address` are overwritten with the reduced
// vector; word 0 holds the OR of all four inputs, the other words hold
// partial ORs.
47 {
48 __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
49 x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
50 x0 = _mm_or_si128(x0, x1); // lanes 0 and 1 now hold w0|w2 and w1|w3
51 x1 = _mm_shuffle_epi32(x0, 0x55); // x1 = x0[1,1,1,1]
52 x0 = _mm_or_si128(x0, x1); // lane 0 now holds w0|w1|w2|w3
// The vector is written back so that lane 0 can be read through the pointer.
53 _mm_storeu_si128((__m128i*)address, x0);
54 return *address;
// NOTE(review): _mm_cvtsi128_si32 is an SSE2 intrinsic that is expected to
// compile to movd, so the remark below may be outdated. Switching to it
// would also remove the write-back to `address`; confirm no caller relies
// on that side effect before changing.
55 // A single movd t, xmm0 can do the trick, but it is not available
56 // in SSE2 intrinsics. extract_epi32 is available in sse4.1
57 // ui32 t = (ui32)_mm_extract_epi16(x0, 0);
58 // t |= (ui32)_mm_extract_epi16(x0, 1) << 16;
59 // return t;
60 }
61
// Body of what appears to be sse2_find_max_val64(ui64 *address): the
// signature line is absent from this listing, but the symbol index at the
// end of the page declares it and this body reduces two 64-bit halves.
//
// Bitwise-ORs the upper 64 bits of the 128-bit value at `address` into the
// lower 64 bits and returns what `address` then points to. The reduction is
// an OR, not an arithmetic maximum.
//
// Side effect: the 16 bytes at `address` are overwritten; the low 64 bits
// hold the OR of both halves and the high 64 bits are left unchanged
// (they are OR-ed with themselves).
64 {
65 __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
66 x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
67 x0 = _mm_or_si128(x0, x1); // low 64 bits now hold q0|q1
// The vector is written back so the low half can be read through the pointer.
68 _mm_storeu_si128((__m128i*)address, x0);
69 return *address;
// NOTE(review): the commented-out alternative below assembles only a 32-bit
// value (`ui32 t`), so it looks copied from the 32-bit variant and would not
// be a drop-in replacement here.
70 // A single movd t, xmm0 can do the trick, but it is not available
71 // in SSE2 intrinsics. extract_epi32 is available in sse4.1
72 // ui32 t = (ui32)_mm_extract_epi16(x0, 0);
73 // t |= (ui32)_mm_extract_epi16(x0, 1) << 16;
74 // return t;
75 }
76
78 void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
79 float delta_inv, ui32 count, ui32* max_val)
80 {
81 ojph_unused(delta_inv);
82
83 // convert to sign and magnitude and keep max_val
84 ui32 shift = 31 - K_max;
85 __m128i m0 = _mm_set1_epi32(INT_MIN);
86 __m128i zero = _mm_setzero_si128();
87 __m128i one = _mm_set1_epi32(1);
88 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
89 __m128i *p = (__m128i*)sp;
90 for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
91 {
92 __m128i v = _mm_loadu_si128(p);
93 __m128i sign = _mm_cmplt_epi32(v, zero);
94 __m128i val = _mm_xor_si128(v, sign); // negate 1's complement
95 __m128i ones = _mm_and_si128(sign, one);
96 val = _mm_add_epi32(val, ones); // 2's complement
97 sign = _mm_and_si128(sign, m0);
98 val = _mm_slli_epi32(val, (int)shift);
99 tmax = _mm_or_si128(tmax, val);
100 val = _mm_or_si128(val, sign);
101 _mm_storeu_si128((__m128i*)dp, val);
102 }
103 _mm_storeu_si128((__m128i*)max_val, tmax);
104 }
105
107 void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
108 float delta_inv, ui32 count, ui32* max_val)
109 {
110 ojph_unused(K_max);
111
112 //quantize and convert to sign and magnitude and keep max_val
113
114 __m128 d = _mm_set1_ps(delta_inv);
115 __m128i zero = _mm_setzero_si128();
116 __m128i one = _mm_set1_epi32(1);
117 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
118 float *p = (float*)sp;
119 for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
120 {
121 __m128 vf = _mm_loadu_ps(p);
122 vf = _mm_mul_ps(vf, d); // multiply
123 __m128i val = _mm_cvtps_epi32(vf); // convert to int
124 __m128i sign = _mm_cmplt_epi32(val, zero); // get sign
125 val = _mm_xor_si128(val, sign); // negate 1's complement
126 __m128i ones = _mm_and_si128(sign, one);
127 val = _mm_add_epi32(val, ones); // 2's complement
128 tmax = _mm_or_si128(tmax, val);
129 sign = _mm_slli_epi32(sign, 31);
130 val = _mm_or_si128(val, sign);
131 _mm_storeu_si128((__m128i*)dp, val);
132 }
133 _mm_storeu_si128((__m128i*)max_val, tmax);
134 }
135
137 void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
138 float delta, ui32 count)
139 {
140 ojph_unused(delta);
141 ui32 shift = 31 - K_max;
142 __m128i m1 = _mm_set1_epi32(INT_MAX);
143 __m128i zero = _mm_setzero_si128();
144 __m128i one = _mm_set1_epi32(1);
145 si32 *p = (si32*)dp;
146 for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
147 {
148 __m128i v = _mm_load_si128((__m128i*)sp);
149 __m128i val = _mm_and_si128(v, m1);
150 val = _mm_srli_epi32(val, (int)shift);
151 __m128i sign = _mm_cmplt_epi32(v, zero);
152 val = _mm_xor_si128(val, sign); // negate 1's complement
153 __m128i ones = _mm_and_si128(sign, one);
154 val = _mm_add_epi32(val, ones); // 2's complement
155 _mm_storeu_si128((__m128i*)p, val);
156 }
157 }
158
160 void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
161 float delta, ui32 count)
162 {
163 ojph_unused(K_max);
164 __m128i m1 = _mm_set1_epi32(INT_MAX);
165 __m128 d = _mm_set1_ps(delta);
166 float *p = (float*)dp;
167 for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
168 {
169 __m128i v = _mm_load_si128((__m128i*)sp);
170 __m128i vali = _mm_and_si128(v, m1);
171 __m128 valf = _mm_cvtepi32_ps(vali);
172 valf = _mm_mul_ps(valf, d);
173 __m128i sign = _mm_andnot_si128(m1, v);
174 valf = _mm_or_ps(valf, _mm_castsi128_ps(sign));
175 _mm_storeu_ps(p, valf);
176 }
177 }
178
180 void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
181 float delta_inv, ui32 count, ui64* max_val)
182 {
183 ojph_unused(delta_inv);
184
185 // convert to sign and magnitude and keep max_val
186 ui32 shift = 63 - K_max;
187 __m128i m0 = _mm_set1_epi64x(LLONG_MIN);
188 __m128i zero = _mm_setzero_si128();
189 __m128i one = _mm_set1_epi64x(1);
190 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
191 __m128i *p = (__m128i*)sp;
192 for (ui32 i = 0; i < count; i += 2, p += 1, dp += 2)
193 {
194 __m128i v = _mm_loadu_si128(p);
195 __m128i sign = _mm_cmplt_epi32(v, zero);
196 sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
197 __m128i val = _mm_xor_si128(v, sign); // negate 1's complement
198 __m128i ones = _mm_and_si128(sign, one);
199 val = _mm_add_epi64(val, ones); // 2's complement
200 sign = _mm_and_si128(sign, m0);
201 val = _mm_slli_epi64(val, (int)shift);
202 tmax = _mm_or_si128(tmax, val);
203 val = _mm_or_si128(val, sign);
204 _mm_storeu_si128((__m128i*)dp, val);
205 }
206 _mm_storeu_si128((__m128i*)max_val, tmax);
207 }
208
210 void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
211 float delta, ui32 count)
212 {
213 ojph_unused(delta);
214 ui32 shift = 63 - K_max;
215 __m128i m1 = _mm_set1_epi64x(LLONG_MAX);
216 __m128i zero = _mm_setzero_si128();
217 __m128i one = _mm_set1_epi64x(1);
218 si64 *p = (si64*)dp;
219 for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
220 {
221 __m128i v = _mm_load_si128((__m128i*)sp);
222 __m128i val = _mm_and_si128(v, m1);
223 val = _mm_srli_epi64(val, (int)shift);
224 __m128i sign = _mm_cmplt_epi32(v, zero);
225 sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
226 val = _mm_xor_si128(val, sign); // negate 1's complement
227 __m128i ones = _mm_and_si128(sign, one);
228 val = _mm_add_epi64(val, ones); // 2's complement
229 _mm_storeu_si128((__m128i*)p, val);
230 }
231 }
232 }
233}
void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
ui32 sse2_find_max_val32(ui32 *address)
ui64 sse2_find_max_val64(ui64 *address)
void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, float delta_inv, ui32 count, ui64 *max_val)
void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
int64_t si64
Definition ojph_defs.h:57
uint64_t ui64
Definition ojph_defs.h:56
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
#define ojph_unused(x)
Definition ojph_defs.h:78