libmsnumpress
Numerical compression schemes for proteomics mass spectrometry data
Loading...
Searching...
No Matches
MSNumpress.hpp
Go to the documentation of this file.
1/*
2 MSNumpress.hpp
3 johan.teleman@immun.lth.se
4
5 Copyright 2013 Johan Teleman
6
7 Licensed under the Apache License, Version 2.0 (the "License");
8 you may not use this file except in compliance with the License.
9 You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing, software
14 distributed under the License is distributed on an "AS IS" BASIS,
15 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 See the License for the specific language governing permissions and
17 limitations under the License.
18 */
19/*
20 ==================== encodeInt ====================
21 Some of the encodings described below use an integer compression referred to simply as
22
23 encodeInt()
24
25 This encoding works on a 4 byte integer, by truncating initial zeros or ones.
26 If the initial (most significant) half byte is 0x0 or 0xf, the number of such
27 halfbytes starting from the most significant is stored in a halfbyte. This initial
28 count is then followed by the rest of the int's halfbytes, in little-endian order.
29 A count halfbyte c of
30
31 0 <= c <= 8 is interpreted as an initial c 0x0 halfbytes
32 9 <= c <= 15 is interpreted as an initial (c-8) 0xf halfbytes
33
34 Ex:
35 int c rest
36 0 => 0x8
37 -1 => 0xf 0xf
38 23 => 0x6 0x7 0x1
39 */
40
41#ifndef _MSNUMPRESS_HPP_
42#define _MSNUMPRESS_HPP_
43
44#include <cstddef>
45#include <vector>
46
47// defines whether to throw an exception when a number cannot be encoded safely
48// with the given parameters
49#ifndef THROW_ON_OVERFLOW
50#define THROW_ON_OVERFLOW true
51#endif
52
53namespace ms {
54namespace numpress {
55
56namespace MSNumpress {
57
58 /**
59 * Compute the maximal linear fixed point that prevents integer overflow.
60 *
61 * @data pointer to array of double to be encoded (need memorycont. repr.)
62 * @dataSize number of doubles from *data to encode
63 *
64 * @return the linear fixed point safe to use
65 */
66 double optimalLinearFixedPoint(
67 const double *data,
68 size_t dataSize);
69
70 /**
71 * Compute the optimal linear fixed point with a desired m/z accuracy.
72 *
73 * @note If the desired accuracy cannot be reached without overflowing 64
74 * bit integers, then a negative value is returned. You need to check for
75 * this and in that case abandon numpress or use optimalLinearFixedPoint
76 * which returns the largest safe value.
77 *
78 * @data pointer to array of double to be encoded (need memorycont. repr.)
79 * @dataSize number of doubles from *data to encode
80 * @mass_acc desired m/z accuracy in Th
81 *
82 * @return the linear fixed point that satisfies the accuracy requirement (or -1 in case of failure).
83 */
84 double optimalLinearFixedPointMass(
85 const double *data,
86 size_t dataSize,
87 double mass_acc);
88
89 /**
90 * Encodes the doubles in data by first using a
91 * - lossy conversion to a 4 byte 5 decimal fixed point representation
92 * - storing the residuals from a linear prediction after first two values
93 * - encoding by encodeInt (see above)
94 *
95 * The resulting binary is maximally 8 + dataSize * 5 bytes, but much less if the
96 * data is reasonably smooth on the first order.
97 *
98 * This encoding is suitable for typical m/z or retention time binary arrays.
99 * On a test set, the encoding was empirically shown to be accurate to at least 0.002 ppm.
100 *
101 * @data pointer to array of double to be encoded (need memorycont. repr.)
102 * @dataSize number of doubles from *data to encode
103 * @result pointer to where resulting bytes should be stored
104 * @fixedPoint the scaling factor used for getting the fixed point repr.
105 * This is stored in the binary and automatically extracted
106 * on decoding.
107 * @return the number of encoded bytes
108 */
109 size_t encodeLinear(
110 const double *data,
111 const size_t dataSize,
112 unsigned char *result,
113 double fixedPoint);
114
115 /**
116 * Calls lower level encodeLinear while handling vector sizes appropriately
117 *
118 * @data vector of doubles to be encoded
119 * @result vector of resulting bytes (will be resized to the number of bytes)
120 */
121 void encodeLinear(
122 const std::vector<double> &data,
123 std::vector<unsigned char> &result,
124 double fixedPoint);
125
126 /**
127 * Decodes data encoded by encodeLinear.
128 *
129 * result vector guaranteed to be shorter or equal to (|data| - 8) * 2
130 *
131 * Note that this method may throw a const char* if it deems the input data to be corrupt, i.e.
132 * that the last encoded int does not use the last byte in the data. In addition the last encoded
133 * int needs to use either the last halfbyte, or the second last followed by a 0x0 halfbyte.
134 *
135 * @data pointer to array of bytes to be decoded (need memorycont. repr.)
136 * @dataSize number of bytes from *data to decode
137 * @result pointer to where resulting doubles should be stored
138 * @return the number of decoded doubles, or -1 if dataSize < 4 or 4 < dataSize < 8
139 */
140 size_t decodeLinear(
141 const unsigned char *data,
142 const size_t dataSize,
143 double *result);
144
145 /**
146 * Calls lower level decodeLinear while handling vector sizes appropriately
147 *
148 * Note that this method may throw a const char* if it deems the input data to be corrupt, i.e.
149 * that the last encoded int does not use the last byte in the data. In addition the last encoded
150 * int needs to use either the last halfbyte, or the second last followed by a 0x0 halfbyte.
151 *
152 * @data vector of bytes to be decoded
153 * @result vector of resulting double (will be resized to the number of doubles)
154 */
155 void decodeLinear(
156 const std::vector<unsigned char> &data,
157 std::vector<double> &result);
158
159/////////////////////////////////////////////////////////////
160
161
162 /**
163 * Encodes the doubles in data by storing the residuals from a linear prediction after first two values.
164 *
165 * The resulting binary is the same size as the input data.
166 *
167 * This encoding is suitable for typical m/z or retention time binary arrays, and is
168 * intended to be used before zlib compression to improve compression.
169 *
170 * @data pointer to array of doubles to be encoded (need memorycont. repr.)
171 * @dataSize number of doubles from *data to encode
172 * @result pointer to where resulting bytes should be stored
173 */
174 size_t encodeSafe(
175 const double *data,
176 const size_t dataSize,
177 unsigned char *result);
178
179
180 /**
181 * Decodes data encoded by encodeSafe.
182 *
183 * result vector is the same size as the input data.
184 *
185 * Might throw const char* if something goes wrong during decoding.
186 *
187 * @data pointer to array of bytes to be decoded (need memorycont. repr.)
188 * @dataSize number of bytes from *data to decode
189 * @result pointer to where resulting doubles should be stored
190 * @return the number of decoded bytes
191 */
192 size_t decodeSafe(
193 const unsigned char *data,
194 const size_t dataSize,
195 double *result);
196
197/////////////////////////////////////////////////////////////
198
199 /**
200 * Encodes ion counts by simply rounding to the nearest 4 byte integer,
201 * and compressing each integer with encodeInt.
202 *
203 * The handleable range is therefore 0 -> 4294967294.
204 * The resulting binary is maximally dataSize * 5 bytes, but much less if the
205 * data is close to 0 on average.
206 *
207 * @data pointer to array of double to be encoded (need memorycont. repr.)
208 * @dataSize number of doubles from *data to encode
209 * @result pointer to where resulting bytes should be stored
210 * @return the number of encoded bytes
211 */
212 size_t encodePic(
213 const double *data,
214 const size_t dataSize,
215 unsigned char *result);
216
217 /**
218 * Calls lower level encodePic while handling vector sizes appropriately
219 *
220 * @data vector of doubles to be encoded
221 * @result vector of resulting bytes (will be resized to the number of bytes)
222 */
223 void encodePic(
224 const std::vector<double> &data,
225 std::vector<unsigned char> &result);
226
227 /**
228 * Decodes data encoded by encodePic
229 *
230 * result vector guaranteed to be shorter or equal to |data| * 2
231 *
232 * Note that this method may throw a const char* if it deems the input data to be corrupt, i.e.
233 * that the last encoded int does not use the last byte in the data. In addition the last encoded
234 * int needs to use either the last halfbyte, or the second last followed by a 0x0 halfbyte.
235 *
236 * @data pointer to array of bytes to be decoded (need memorycont. repr.)
237 * @dataSize number of bytes from *data to decode
238 * @result pointer to where resulting doubles should be stored
239 * @return the number of decoded doubles
240 */
241 size_t decodePic(
242 const unsigned char *data,
243 const size_t dataSize,
244 double *result);
245
246 /**
247 * Calls lower level decodePic while handling vector sizes appropriately
248 *
249 * Note that this method may throw a const char* if it deems the input data to be corrupt, i.e.
250 * that the last encoded int does not use the last byte in the data. In addition the last encoded
251 * int needs to use either the last halfbyte, or the second last followed by a 0x0 halfbyte.
252 *
253 * @data vector of bytes to be decoded
254 * @result vector of resulting double (will be resized to the number of doubles)
255 */
256 void decodePic(
257 const std::vector<unsigned char> &data,
258 std::vector<double> &result);
259
260/////////////////////////////////////////////////////////////
261
262
263 double optimalSlofFixedPoint(
264 const double *data,
265 size_t dataSize);
266
267 /**
268 * Encodes ion counts by taking the natural logarithm, and storing a
269 * fixed point representation of this. This is calculated as
270 *
271 * unsigned short fp = log(d + 1) * fixedPoint + 0.5
272 *
273 * the result vector is exactly |data| * 2 + 8 bytes long
274 *
275 * @data pointer to array of double to be encoded (need memorycont. repr.)
276 * @dataSize number of doubles from *data to encode
277 * @result pointer to where resulting bytes should be stored
278 * @return the number of encoded bytes
279 */
280 size_t encodeSlof(
281 const double *data,
282 const size_t dataSize,
283 unsigned char *result,
284 double fixedPoint);
285
286 /**
287 * Calls lower level encodeSlof while handling vector sizes appropriately
288 *
289 * @data vector of doubles to be encoded
290 * @result vector of resulting bytes (will be resized to the number of bytes)
291 */
292 void encodeSlof(
293 const std::vector<double> &data,
294 std::vector<unsigned char> &result,
295 double fixedPoint);
296
297 /**
298 * Decodes data encoded by encodeSlof
299 *
300 * The return will include exactly (|data| - 8) / 2 doubles.
301 *
302 * Note that this method may throw a const char* if it deems the input data to be corrupt.
303 *
304 * @data pointer to array of bytes to be decoded (need memorycont. repr.)
305 * @dataSize number of bytes from *data to decode
306 * @result pointer to where resulting doubles should be stored
307 * @return the number of decoded doubles
308 */
309 size_t decodeSlof(
310 const unsigned char *data,
311 const size_t dataSize,
312 double *result);
313
314 /**
315 * Calls lower level decodeSlof while handling vector sizes appropriately
316 *
317 * Note that this method may throw a const char* if it deems the input data to be corrupt.
318 *
319 * @data vector of bytes to be decoded
320 * @result vector of resulting double (will be resized to the number of doubles)
321 */
322 void decodeSlof(
323 const std::vector<unsigned char> &data,
324 std::vector<double> &result);
325
326} // namespace MSNumpress
327} // namespace numpress
328} // namespace ms
329
330#endif // _MSNUMPRESS_HPP_
void encodeLinear()
void optimalLinearFixedPointMass()
void optimalSlofFixedPoint()
void optimalLinearFixedPoint()
size_t encodePic(const double *data, size_t dataSize, unsigned char *result)
size_t decodeLinear(const unsigned char *data, const size_t dataSize, double *result)
size_t decodeSlof(const unsigned char *data, const size_t dataSize, double *result)
size_t encodeSlof(const double *data, size_t dataSize, unsigned char *result, double fixedPoint)
size_t encodeSafe(const double *data, const size_t dataSize, unsigned char *result)
size_t decodeSafe(const unsigned char *data, const size_t dataSize, double *result)
size_t decodePic(const unsigned char *data, const size_t dataSize, double *result)