libmsnumpress
Numerical compression schemes for proteomics mass spectrometry data
Loading...
Searching...
No Matches
MSNumpress.cpp
Go to the documentation of this file.
1/*
2 MSNumpress.cpp
3 johan.teleman@immun.lth.se
4
5 Copyright 2013 Johan Teleman
6
7 Licensed under the Apache License, Version 2.0 (the "License");
8 you may not use this file except in compliance with the License.
9 You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing, software
14 distributed under the License is distributed on an "AS IS" BASIS,
15 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 See the License for the specific language governing permissions and
17 limitations under the License.
18 */
19
20#include <iostream>
21#include <cmath>
22#include <climits>
23#include <algorithm>
24#include "MSNumpress.hpp"
25
26namespace ms {
27namespace numpress {
28namespace MSNumpress {
29
30using std::cout;
31using std::cerr;
32using std::endl;
33using std::min;
34using std::max;
35using std::abs;
36
37// This is only valid on systems were ints use more bytes than chars...
38
39const int ONE = 1;
40static bool is_little_endian() {
41 return *((char*)&(ONE)) == 1;
42}
44
45
46
47/////////////////////////////////////////////////////////////
48
49static void encodeFixedPoint(
50 double fixedPoint,
51 unsigned char *result
52) {
53 int i;
54 unsigned char *fp = (unsigned char*)&fixedPoint;
55 for (i=0; i<8; i++) {
56 result[i] = fp[IS_LITTLE_ENDIAN ? (7-i) : i];
57 }
58}
59
60
61
62static double decodeFixedPoint(
63 const unsigned char *data
64) {
65 int i;
66 double fixedPoint;
67 unsigned char *fp = (unsigned char*)&fixedPoint;
68
69 for (i=0; i<8; i++) {
70 fp[i] = data[IS_LITTLE_ENDIAN ? (7-i) : i];
71 }
72
73 return fixedPoint;
74}
75
76/////////////////////////////////////////////////////////////
77
78/**
79 * Encodes the int x as a number of halfbytes in res.
80 * res_length is incremented by the number of halfbytes,
81 * which will be 1 <= n <= 9
82 */
83static void encodeInt(
84 const unsigned int x,
85 unsigned char* res,
86 size_t *res_length
87) {
88 // get the bit pattern of a signed int x_inp
89 unsigned int m;
90 unsigned char i, l; // numbers between 0 and 9
91
92 unsigned int mask = 0xf0000000;
93 unsigned int init = x & mask;
94
95 if (init == 0) {
96 l = 8;
97 for (i=0; i<8; i++) {
98 m = mask >> (4*i);
99 if ((x & m) != 0) {
100 l = i;
101 break;
102 }
103 }
104 res[0] = l;
105 for (i=l; i<8; i++) {
106 res[1+i-l] = static_cast<unsigned char>( x >> (4*(i-l)) );
107 }
108 *res_length += 1+8-l;
109
110 } else if (init == mask) {
111 l = 7;
112 for (i=0; i<8; i++) {
113 m = mask >> (4*i);
114 if ((x & m) != m) {
115 l = i;
116 break;
117 }
118 }
119 res[0] = l + 8;
120 for (i=l; i<8; i++) {
121 res[1+i-l] = static_cast<unsigned char>( x >> (4*(i-l)) );
122 }
123 *res_length += 1+8-l;
124
125 } else {
126 res[0] = 0;
127 for (i=0; i<8; i++) {
128 res[1+i] = static_cast<unsigned char>( x >> (4*i) );
129 }
130 *res_length += 9;
131
132 }
133}
134
135
136
137/**
138 * Decodes an int from the half bytes in bp. Lossless reverse of encodeInt
139 *
140 * @param data ptr to the char data to decode
141 * @param di position in the char data array to start decoding (will be advanced)
142 * @param max_di size of data array
143 * @param half helper variable (do not change between multiple calls)
144 * @param res result (a 32 bit integer)
145 *
146 * @note the helper variable indicates whether we look at the first half byte
147 * or second half byte of the current data (thus whether to interpret the first
148 * half byte of data[*di] or the second half byte).
149 *
150 */
151static void decodeInt(
152 const unsigned char *data,
153 size_t *di,
154 size_t max_di,
155 size_t *half,
156 unsigned int *res
157) {
158 size_t n, i;
159 unsigned int mask, m;
160 unsigned char head;
161 unsigned char hb;
162
163 // Extract the first half byte, specifying the number of leading zero half
164 // bytes of the final integer.
165 // If half is zero, we look at the first half byte, otherwise we look at
166 // the second (lower) half byte and advance the counter to the next char.
167 if (*half == 0) {
168 head = data[*di] >> 4;
169 } else {
170 head = data[*di] & 0xf;
171 (*di)++;
172 }
173
174 *half = 1-(*half); // switch to other half byte
175 *res = 0;
176
177 if (head <= 8) {
178 n = head;
179 } else { // we have n leading ones, fill n half bytes in res with 0xf
180 n = head - 8;
181 mask = 0xf0000000;
182 for (i=0; i<n; i++) {
183 m = mask >> (4*i);
184 *res = *res | m;
185 }
186 }
187
188 if (n == 8) {
189 return;
190 }
191
192 if (*di + ((8 - n) - (1 - *half)) / 2 >= max_di) {
193 throw "[MSNumpress::decodeInt] Corrupt input data! ";
194 }
195
196 for (i=n; i<8; i++) {
197 if (*half == 0) {
198 hb = data[*di] >> 4;
199 } else {
200 hb = data[*di] & 0xf;
201 (*di)++;
202 }
203 *res = *res | ( static_cast<unsigned int>(hb) << ((i-n)*4));
204 *half = 1 - (*half);
205 }
206}
207
208
209
210
211/////////////////////////////////////////////////////////////
212
214 const double *data,
215 size_t dataSize,
216 double mass_acc
217) {
218 if (dataSize < 3) return 0; // we just encode the first two points as floats
219
220 // We calculate the maximal fixedPoint we need to achieve a specific mass
221 // accuracy. Note that the maximal error we will make by encoding as int is
222 // 0.5 due to rounding errors.
223 double maxFP = 0.5 / mass_acc;
224
225 // There is a maximal value for the FP given by the int length (32bit)
226 // which means we cannot choose a value higher than that. In case we cannot
227 // achieve the desired accuracy, return failure (-1).
228 double maxFP_overflow = optimalLinearFixedPoint(data, dataSize);
229 if (maxFP > maxFP_overflow) return -1;
230
231 return maxFP;
232}
233
235 const double *data,
236 size_t dataSize
237) {
238 /*
239 * safer impl - apparently not needed though
240 *
241 if (dataSize == 0) return 0;
242
243 double maxDouble = 0;
244 double x;
245
246 for (size_t i=0; i<dataSize; i++) {
247 x = data[i];
248 maxDouble = max(maxDouble, x);
249 }
250
251 return floor(0xFFFFFFFF / maxDouble);
252 */
253 if (dataSize == 0) return 0;
254 if (dataSize == 1) return floor(0x7FFFFFFFl / data[0]);
255 double maxDouble = max(data[0], data[1]);
256 double extrapol;
257 double diff;
258
259 for (size_t i=2; i<dataSize; i++) {
260 extrapol = data[i-1] + (data[i-1] - data[i-2]);
261 diff = data[i] - extrapol;
262 maxDouble = max(maxDouble, ceil(abs(diff)+1));
263 }
264
265 return floor(0x7FFFFFFFl / maxDouble);
266}
267
268
269
271 const double *data,
272 size_t dataSize,
273 unsigned char *result,
274 double fixedPoint
275) {
276 long long ints[3];
277 size_t i, ri;
278 unsigned char halfBytes[10];
279 size_t halfByteCount;
280 size_t hbi;
281 long long extrapol;
282 int diff;
283
284 //printf("Encoding %d doubles with fixed point %f\n", (int)dataSize, fixedPoint);
285 encodeFixedPoint(fixedPoint, result);
286
287
288 if (dataSize == 0) return 8;
289
290 ints[1] = static_cast<long long>(data[0] * fixedPoint + 0.5);
291 for (i=0; i<4; i++) {
292 result[8+i] = (ints[1] >> (i*8)) & 0xff;
293 }
294
295 if (dataSize == 1) return 12;
296
297 ints[2] = static_cast<long long>(data[1] * fixedPoint + 0.5);
298 for (i=0; i<4; i++) {
299 result[12+i] = (ints[2] >> (i*8)) & 0xff;
300 }
301
302 halfByteCount = 0;
303 ri = 16;
304
305 for (i=2; i<dataSize; i++) {
306 ints[0] = ints[1];
307 ints[1] = ints[2];
308 if (THROW_ON_OVERFLOW &&
309 data[i] * fixedPoint + 0.5 > LLONG_MAX ) {
310 throw "[MSNumpress::encodeLinear] Next number overflows LLONG_MAX.";
311 }
312
313 ints[2] = static_cast<long long>(data[i] * fixedPoint + 0.5);
314 extrapol = ints[1] + (ints[1] - ints[0]);
315
316 if (THROW_ON_OVERFLOW &&
317 ( ints[2] - extrapol > INT_MAX
318 || ints[2] - extrapol < INT_MIN )) {
319 throw "[MSNumpress::encodeLinear] Cannot encode a number that exceeds the bounds of [-INT_MAX, INT_MAX].";
320 }
321
322 diff = static_cast<int>(ints[2] - extrapol);
323 //printf("%lu %lu %lu, extrapol: %ld diff: %d \n", ints[0], ints[1], ints[2], extrapol, diff);
324 encodeInt(
325 static_cast<unsigned int>(diff),
326 &halfBytes[halfByteCount],
327 &halfByteCount
328 );
329 /*
330 printf("%d (%d): ", diff, (int)halfByteCount);
331 for (size_t j=0; j<halfByteCount; j++) {
332 printf("%x ", halfBytes[j] & 0xf);
333 }
334 printf("\n");
335 */
336
337
338 for (hbi=1; hbi < halfByteCount; hbi+=2) {
339 result[ri] = static_cast<unsigned char>(
340 (halfBytes[hbi-1] << 4) | (halfBytes[hbi] & 0xf)
341 );
342 //printf("%x \n", result[ri]);
343 ri++;
344 }
345 if (halfByteCount % 2 != 0) {
346 halfBytes[0] = halfBytes[halfByteCount-1];
347 halfByteCount = 1;
348 } else {
349 halfByteCount = 0;
350 }
351 }
352 if (halfByteCount == 1) {
353 result[ri] = static_cast<unsigned char>(halfBytes[0] << 4);
354 ri++;
355 }
356 return ri;
357}
358
359
360
362 const unsigned char *data,
363 const size_t dataSize,
364 double *result
365) {
366 size_t i;
367 size_t ri = 0;
368 unsigned int init, buff;
369 int diff;
370 long long ints[3];
371 //double d;
372 size_t di;
373 size_t half;
374 long long extrapol;
375 long long y;
376 double fixedPoint;
377
378 //printf("Decoding %d bytes with fixed point %f\n", (int)dataSize, fixedPoint);
379
380 if (dataSize == 8) return 0;
381
382 if (dataSize < 8)
383 throw "[MSNumpress::decodeLinear] Corrupt input data: not enough bytes to read fixed point! ";
384
385 fixedPoint = decodeFixedPoint(data);
386
387
388 if (dataSize < 12)
389 throw "[MSNumpress::decodeLinear] Corrupt input data: not enough bytes to read first value! ";
390
391 ints[1] = 0;
392 for (i=0; i<4; i++) {
393 ints[1] = ints[1] | ((0xff & (init = data[8+i])) << (i*8));
394 }
395 result[0] = ints[1] / fixedPoint;
396
397 if (dataSize == 12) return 1;
398 if (dataSize < 16)
399 throw "[MSNumpress::decodeLinear] Corrupt input data: not enough bytes to read second value! ";
400
401 ints[2] = 0;
402 for (i=0; i<4; i++) {
403 ints[2] = ints[2] | ((0xff & (init = data[12+i])) << (i*8));
404 }
405 result[1] = ints[2] / fixedPoint;
406
407 half = 0;
408 ri = 2;
409 di = 16;
410
411 //printf(" di ri half int[0] int[1] extrapol diff\n");
412
413 while (di < dataSize) {
414 if (di == (dataSize - 1) && half == 1) {
415 if ((data[di] & 0xf) == 0x0) {
416 break;
417 }
418 }
419 //printf("%7d %7d %7d %lu %lu %ld", di, ri, half, ints[0], ints[1], extrapol);
420
421 ints[0] = ints[1];
422 ints[1] = ints[2];
423 decodeInt(data, &di, dataSize, &half, &buff);
424 diff = static_cast<int>(buff);
425
426 extrapol = ints[1] + (ints[1] - ints[0]);
427 y = extrapol + diff;
428 //printf(" %d \n", diff);
429 result[ri++] = y / fixedPoint;
430 ints[2] = y;
431 }
432
433 return ri;
434}
435
436
437
439 const std::vector<double> &data,
440 std::vector<unsigned char> &result,
441 double fixedPoint
442) {
443 size_t dataSize = data.size();
444 result.resize(dataSize * 5 + 8);
445 size_t encodedLength = encodeLinear(&data[0], dataSize, &result[0], fixedPoint);
446 result.resize(encodedLength);
447}
448
449
450
452 const std::vector<unsigned char> &data,
453 std::vector<double> &result
454) {
455 size_t dataSize = data.size();
456 result.resize((dataSize - 8) * 2);
457 size_t decodedLength = decodeLinear(&data[0], dataSize, &result[0]);
458 result.resize(decodedLength);
459}
460
461/////////////////////////////////////////////////////////////
462
463
465 const double *data,
466 const size_t dataSize,
467 unsigned char *result
468) {
469 size_t i, j, ri = 0;
470 double latest[3];
471 double extrapol, diff;
472 const unsigned char *fp;
473
474 //printf("d0 d1 d2 extrapol diff\n");
475
476 if (dataSize == 0) return ri;
477
478 latest[1] = data[0];
479 fp = (unsigned char*)data;
480 for (i=0; i<8; i++) {
481 result[ri++] = fp[IS_LITTLE_ENDIAN ? (7-i) : i];
482 }
483
484 if (dataSize == 1) return ri;
485
486 latest[2] = data[1];
487 fp = (unsigned char*)&(data[1]);
488 for (i=0; i<8; i++) {
489 result[ri++] = fp[IS_LITTLE_ENDIAN ? (7-i) : i];
490 }
491
492 fp = (unsigned char*)&diff;
493 for (i=2; i<dataSize; i++) {
494 latest[0] = latest[1];
495 latest[1] = latest[2];
496 latest[2] = data[i];
497 extrapol = latest[1] + (latest[1] - latest[0]);
498 diff = latest[2] - extrapol;
499 //printf("%f %f %f %f %f\n", latest[0], latest[1], latest[2], extrapol, diff);
500 for (j=0; j<8; j++) {
501 result[ri++] = fp[IS_LITTLE_ENDIAN ? (7-j) : j];
502 }
503 }
504
505 return ri;
506}
507
508
509
511 const unsigned char *data,
512 const size_t dataSize,
513 double *result
514) {
515 size_t i, di, ri;
516 double extrapol, diff;
517 double latest[3];
518 unsigned char *fp;
519
520 if (dataSize % 8 != 0)
521 throw "[MSNumpress::decodeSafe] Corrupt input data: number of bytes needs to be multiple of 8! ";
522
523 //printf("d0 d1 extrapol diff\td2\n");
524
525 try {
526 fp = (unsigned char*)&(latest[1]);
527 for (i=0; i<8; i++) {
528 fp[i] = data[IS_LITTLE_ENDIAN ? (7-i) : i];
529 }
530 result[0] = latest[1];
531
532 if (dataSize == 8) return 1;
533
534 fp = (unsigned char*)&(latest[2]);
535 for (i=0; i<8; i++) {
536 fp[i] = data[8 + (IS_LITTLE_ENDIAN ? (7-i) : i)];
537 }
538 result[1] = latest[2];
539
540 ri = 2;
541
542 fp = (unsigned char*)&diff;
543 for (di = 16; di < dataSize; di += 8) {
544 latest[0] = latest[1];
545 latest[1] = latest[2];
546
547 for (i=0; i<8; i++) {
548 fp[i] = data[di + (IS_LITTLE_ENDIAN ? (7-i) : i)];
549 }
550
551 extrapol = latest[1] + (latest[1] - latest[0]);
552 latest[2] = extrapol + diff;
553
554 //printf("%f %f %f %f\t%f \n", latest[0], latest[1], extrapol, diff, latest[2]);
555
556 result[ri++] = latest[2];
557 }
558 } catch (...) {
559 throw "[MSNumpress::decodeSafe] Unknown error during decode! ";
560 }
561
562 return ri;
563}
564
565/////////////////////////////////////////////////////////////
566
567
569 const double *data,
570 size_t dataSize,
571 unsigned char *result
572) {
573 size_t i, ri;
574 unsigned int x;
575 unsigned char halfBytes[10];
576 size_t halfByteCount;
577 size_t hbi;
578
579 //printf("Encoding %d doubles\n", (int)dataSize);
580
581 halfByteCount = 0;
582 ri = 0;
583
584 for (i=0; i<dataSize; i++) {
585
586 if (THROW_ON_OVERFLOW &&
587 (data[i] + 0.5 > INT_MAX || data[i] < -0.5) ){
588 throw "[MSNumpress::encodePic] Cannot use Pic to encode a number larger than INT_MAX or smaller than 0.";
589 }
590 x = static_cast<unsigned int>(data[i] + 0.5);
591 //printf("%d %d %d, extrapol: %d diff: %d \n", ints[0], ints[1], ints[2], extrapol, diff);
592 encodeInt(x, &halfBytes[halfByteCount], &halfByteCount);
593
594 for (hbi=1; hbi < halfByteCount; hbi+=2) {
595 result[ri] = static_cast<unsigned char>(
596 (halfBytes[hbi-1] << 4) | (halfBytes[hbi] & 0xf)
597 );
598 //printf("%x \n", result[ri]);
599 ri++;
600 }
601 if (halfByteCount % 2 != 0) {
602 halfBytes[0] = halfBytes[halfByteCount-1];
603 halfByteCount = 1;
604 } else {
605 halfByteCount = 0;
606 }
607 }
608 if (halfByteCount == 1) {
609 result[ri] = static_cast<unsigned char>(halfBytes[0] << 4);
610 ri++;
611 }
612 return ri;
613}
614
615
616
618 const unsigned char *data,
619 const size_t dataSize,
620 double *result
621) {
622 size_t ri;
623 unsigned int x;
624 size_t di;
625 size_t half;
626
627 //printf("ri di half dSize count\n");
628
629 half = 0;
630 ri = 0;
631 di = 0;
632
633 while (di < dataSize) {
634 if (di == (dataSize - 1) && half == 1) {
635 if ((data[di] & 0xf) == 0x0) {
636 break;
637 }
638 }
639
640 decodeInt(&data[0], &di, dataSize, &half, &x);
641
642 //printf("%7d %7d %7d %7d %7d\n", ri, di, half, dataSize, count);
643
644 //printf("count: %d \n", count);
645 result[ri++] = static_cast<double>(x);
646 }
647
648 return ri;
649}
650
651
652
654 const std::vector<double> &data,
655 std::vector<unsigned char> &result
656) {
657 size_t dataSize = data.size();
658 result.resize(dataSize * 5);
659 size_t encodedLength = encodePic(&data[0], dataSize, &result[0]);
660 result.resize(encodedLength);
661}
662
663
664
666 const std::vector<unsigned char> &data,
667 std::vector<double> &result
668) {
669 size_t dataSize = data.size();
670 result.resize(dataSize * 2);
671 size_t decodedLength = decodePic(&data[0], dataSize, &result[0]);
672 result.resize(decodedLength);
673}
674
675
676/////////////////////////////////////////////////////////////
677
678
680 const double *data,
681 size_t dataSize
682) {
683 if (dataSize == 0) return 0;
684
685 double maxDouble = 1;
686 double x;
687 double fp;
688
689 for (size_t i=0; i<dataSize; i++) {
690 x = log(data[i]+1);
691 maxDouble = max(maxDouble, x);
692 }
693
694 fp = floor(0xFFFF / maxDouble);
695
696 //cout << " max val: " << maxDouble << endl;
697 //cout << "fixed point: " << fp << endl;
698
699 return fp;
700}
701
702
703
705 const double *data,
706 size_t dataSize,
707 unsigned char *result,
708 double fixedPoint
709) {
710 size_t i, ri;
711 double temp;
712 unsigned short x;
713 encodeFixedPoint(fixedPoint, result);
714
715 ri = 8;
716 for (i=0; i<dataSize; i++) {
717 temp = log(data[i]+1) * fixedPoint;
718
719 if (THROW_ON_OVERFLOW &&
720 temp > USHRT_MAX ) {
721 throw "[MSNumpress::encodeSlof] Cannot encode a number that overflows USHRT_MAX.";
722 }
723
724 x = static_cast<unsigned short>(temp + 0.5);
725 result[ri++] = x & 0xff;
726 result[ri++] = (x >> 8) & 0xff;
727 }
728 return ri;
729}
730
731
732
734 const unsigned char *data,
735 const size_t dataSize,
736 double *result
737) {
738 size_t i, ri;
739 unsigned short x;
740 double fixedPoint;
741
742 if (dataSize < 8)
743 throw "[MSNumpress::decodeSlof] Corrupt input data: not enough bytes to read fixed point! ";
744
745 ri = 0;
746 fixedPoint = decodeFixedPoint(data);
747
748 for (i=8; i<dataSize; i+=2) {
749 x = static_cast<unsigned short>(data[i] | (data[i+1] << 8));
750 result[ri++] = exp(x / fixedPoint) - 1;
751 }
752 return ri;
753}
754
755
756
758 const std::vector<double> &data,
759 std::vector<unsigned char> &result,
760 double fixedPoint
761) {
762 size_t dataSize = data.size();
763 result.resize(dataSize * 2 + 8);
764 size_t encodedLength = encodeSlof(&data[0], dataSize, &result[0], fixedPoint);
765 result.resize(encodedLength);
766}
767
768
769
771 const std::vector<unsigned char> &data,
772 std::vector<double> &result
773) {
774 size_t dataSize = data.size();
775 result.resize((dataSize - 8) / 2);
776 size_t decodedLength = decodeSlof(&data[0], dataSize, &result[0]);
777 result.resize(decodedLength);
778}
779
780}
781} // namespace numpress
782} // namespace ms
void encodeLinear()
void optimalLinearFixedPointMass()
void optimalSlofFixedPoint()
void optimalLinearFixedPoint()
#define THROW_ON_OVERFLOW
static void encodeFixedPoint(double fixedPoint, unsigned char *result)
size_t encodePic(const double *data, size_t dataSize, unsigned char *result)
size_t decodeLinear(const unsigned char *data, const size_t dataSize, double *result)
static double decodeFixedPoint(const unsigned char *data)
size_t decodeSlof(const unsigned char *data, const size_t dataSize, double *result)
size_t encodeSlof(const double *data, size_t dataSize, unsigned char *result, double fixedPoint)
size_t encodeSafe(const double *data, const size_t dataSize, unsigned char *result)
size_t decodeSafe(const unsigned char *data, const size_t dataSize, double *result)
static void decodeInt(const unsigned char *data, size_t *di, size_t max_di, size_t *half, unsigned int *res)
size_t decodePic(const unsigned char *data, const size_t dataSize, double *result)
static bool is_little_endian()
static void encodeInt(const unsigned int x, unsigned char *res, size_t *res_length)