XRootD
Loading...
Searching...
No Matches
XrdOucCRC32C.cc
Go to the documentation of this file.
1/* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
2 * Copyright (C) 2013, 2015 Mark Adler
3 * Version 1.3 31 Dec 2015 Mark Adler
4 */
5
6/*
7 This software is provided 'as-is', without any express or implied
8 warranty. In no event will the author be held liable for any damages
9 arising from the use of this software.
10
11 Permission is granted to anyone to use this software for any purpose,
12 including commercial applications, and to alter it and redistribute it
13 freely, subject to the following restrictions:
14
15 1. The origin of this software must not be misrepresented; you must not
16 claim that you wrote the original software. If you use this software
17 in a product, an acknowledgment in the product documentation would be
18 appreciated but is not required.
19 2. Altered source versions must be plainly marked as such, and must not be
20 misrepresented as being the original software.
21 3. This notice may not be removed or altered from any source distribution.
22
23 Mark Adler
24 madler@alumni.caltech.edu
25 */
26
27/* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a
28 CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software
29 version is provided as a fall-back, as well as for speed comparisons. */
30
31/* Version history:
32 1.0 10 Feb 2013 First version
33 1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel
34 1.2 1 Nov 2015 Add const qualifier to avoid compiler warning
35 Load entire input into memory (test code)
36 Argument gives number of times to repeat (test code)
37 Argument < 0 forces software implementation (test code)
38 1.3 31 Dec 2015 Check for Intel architecture using compiler macro
39 Support big-endian processors in software calculation
40 Add header for external use
41 */
42
43/* Modification history:
44 22 Nov 2019 Rename orignal crc32.c to XrdOucCRC32.cc and crc32c.h to
45 XrdOucCRC32C.hh with corresponding change to include
46 statement herein. Add required casts to allow C++
47 compilation.
48 */
49
50#include <pthread.h>
52
53/* CRC-32C (iSCSI) polynomial in reversed bit order. */
54#define POLY 0x82f63b78
55
56#ifdef __x86_64__
57
58/* Hardware CRC-32C for Intel and compatible processors. */
59
60/* Multiply a matrix times a vector over the Galois field of two elements,
61 GF(2). Each element is a bit in an unsigned integer. mat must have at
62 least as many entries as the power of two for most significant one bit in
63 vec. */
64static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec) {
65 uint32_t sum = 0;
66 while (vec) {
67 if (vec & 1)
68 sum ^= *mat;
69 vec >>= 1;
70 mat++;
71 }
72 return sum;
73}
74
75/* Multiply a matrix by itself over GF(2). Both mat and square must have 32
76 rows. */
77static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat) {
78 for (unsigned n = 0; n < 32; n++)
79 square[n] = gf2_matrix_times(mat, mat[n]);
80}
81
82/* Construct an operator to apply len zeros to a crc. len must be a power of
83 two. If len is not a power of two, then the result is the same as for the
84 largest power of two less than len. The result for len == 0 is the same as
85 for len == 1. A version of this routine could be easily written for any
86 len, but that is not needed for this application. */
87static void crc32c_zeros_op(uint32_t *even, size_t len) {
88 uint32_t odd[32]; /* odd-power-of-two zeros operator */
89
90 /* put operator for one zero bit in odd */
91 odd[0] = POLY; /* CRC-32C polynomial */
92 uint32_t row = 1;
93 for (unsigned n = 1; n < 32; n++) {
94 odd[n] = row;
95 row <<= 1;
96 }
97
98 /* put operator for two zero bits in even */
99 gf2_matrix_square(even, odd);
100
101 /* put operator for four zero bits in odd */
102 gf2_matrix_square(odd, even);
103
104 /* first square will put the operator for one zero byte (eight zero bits),
105 in even -- next square puts operator for two zero bytes in odd, and so
106 on, until len has been rotated down to zero */
107 do {
108 gf2_matrix_square(even, odd);
109 len >>= 1;
110 if (len == 0)
111 return;
112 gf2_matrix_square(odd, even);
113 len >>= 1;
114 } while (len);
115
116 /* answer ended up in odd -- copy to even */
117 for (unsigned n = 0; n < 32; n++)
118 even[n] = odd[n];
119}
120
121/* Take a length and build four lookup tables for applying the zeros operator
122 for that length, byte-by-byte on the operand. */
123static void crc32c_zeros(uint32_t zeros[][256], size_t len) {
124 uint32_t op[32];
125
126 crc32c_zeros_op(op, len);
127 for (unsigned n = 0; n < 256; n++) {
128 zeros[0][n] = gf2_matrix_times(op, n);
129 zeros[1][n] = gf2_matrix_times(op, n << 8);
130 zeros[2][n] = gf2_matrix_times(op, n << 16);
131 zeros[3][n] = gf2_matrix_times(op, n << 24);
132 }
133}
134
135/* Apply the zeros operator table to crc. */
136static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) {
137 return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
138 zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
139}
140
141/* Block sizes for three-way parallel crc computation. LONG and SHORT must
142 both be powers of two. The associated string constants must be set
143 accordingly, for use in constructing the assembler instructions. */
144#define LONG 8192
145#define LONGx1 "8192"
146#define LONGx2 "16384"
147#define SHORT 256
148#define SHORTx1 "256"
149#define SHORTx2 "512"
150
151/* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
152static pthread_once_t crc32c_once_hw = PTHREAD_ONCE_INIT;
153static uint32_t crc32c_long[4][256];
154static uint32_t crc32c_short[4][256];
155
156/* Initialize tables for shifting crcs. */
157static void crc32c_init_hw(void) {
158 crc32c_zeros(crc32c_long, LONG);
159 crc32c_zeros(crc32c_short, SHORT);
160}
161
162/* Compute CRC-32C using the Intel hardware instruction. */
163static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) {
164 /* populate shift tables the first time through */
165 pthread_once(&crc32c_once_hw, crc32c_init_hw);
166
167 /* pre-process the crc */
168 crc = ~crc;
169 uint64_t crc0 = crc; /* 64-bits for crc32q instruction */
170
171 /* compute the crc for up to seven leading bytes to bring the data pointer
172 to an eight-byte boundary */
173 unsigned char const *next = (unsigned char const *)buf;
174 while (len && ((uintptr_t)next & 7) != 0) {
175 __asm__("crc32b\t" "(%1), %0"
176 : "=r"(crc0)
177 : "r"(next), "0"(crc0));
178 next++;
179 len--;
180 }
181
182 /* compute the crc on sets of LONG*3 bytes, executing three independent crc
183 instructions, each on LONG bytes -- this is optimized for the Nehalem,
184 Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
185 throughput of one crc per cycle, but a latency of three cycles */
186 while (len >= LONG*3) {
187 uint64_t crc1 = 0;
188 uint64_t crc2 = 0;
189 unsigned char const * const end = next + LONG;
190 do {
191 __asm__("crc32q\t" "(%3), %0\n\t"
192 "crc32q\t" LONGx1 "(%3), %1\n\t"
193 "crc32q\t" LONGx2 "(%3), %2"
194 : "=r"(crc0), "=r"(crc1), "=r"(crc2)
195 : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
196 next += 8;
197 } while (next < end);
198 crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
199 crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
200 next += LONG*2;
201 len -= LONG*3;
202 }
203
204 /* do the same thing, but now on SHORT*3 blocks for the remaining data less
205 than a LONG*3 block */
206 while (len >= SHORT*3) {
207 uint64_t crc1 = 0;
208 uint64_t crc2 = 0;
209 unsigned char const * const end = next + SHORT;
210 do {
211 __asm__("crc32q\t" "(%3), %0\n\t"
212 "crc32q\t" SHORTx1 "(%3), %1\n\t"
213 "crc32q\t" SHORTx2 "(%3), %2"
214 : "=r"(crc0), "=r"(crc1), "=r"(crc2)
215 : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
216 next += 8;
217 } while (next < end);
218 crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
219 crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
220 next += SHORT*2;
221 len -= SHORT*3;
222 }
223
224 /* compute the crc on the remaining eight-byte units less than a SHORT*3
225 block */
226 {
227 unsigned char const * const end = next + (len - (len & 7));
228 while (next < end) {
229 __asm__("crc32q\t" "(%1), %0"
230 : "=r"(crc0)
231 : "r"(next), "0"(crc0));
232 next += 8;
233 }
234 len &= 7;
235 }
236
237 /* compute the crc for up to seven trailing bytes */
238 while (len) {
239 __asm__("crc32b\t" "(%1), %0"
240 : "=r"(crc0)
241 : "r"(next), "0"(crc0));
242 next++;
243 len--;
244 }
245
246 /* return a post-processed crc */
247 return ~crc0;
248}
249
250/* Check for SSE 4.2. SSE 4.2 was first supported in Nehalem processors
251 introduced in November, 2008. This does not check for the existence of the
252 cpuid instruction itself, which was introduced on the 486SL in 1992, so this
253 will fail on earlier x86 processors. cpuid works on all Pentium and later
254 processors. */
255#define SSE42(have) \
256 do { \
257 uint32_t eax, ecx; \
258 eax = 1; \
259 __asm__("cpuid" \
260 : "=c"(ecx) \
261 : "a"(eax) \
262 : "%ebx", "%edx"); \
263 (have) = (ecx >> 20) & 1; \
264 } while (0)
265
266/* Compute a CRC-32C. If the crc32 instruction is available, use the hardware
267 version. Otherwise, use the software version. */
268uint32_t crc32c(uint32_t crc, void const *buf, size_t len) {
269 int sse42;
270
271 SSE42(sse42);
272 return sse42 ? crc32c_hw(crc, buf, len) : crc32c_sw(crc, buf, len);
273}
274
275#else /* !__x86_64__ */
276
277uint32_t crc32c(uint32_t crc, void const *buf, size_t len) {
278 return crc32c_sw(crc, buf, len);
279}
280
281#endif
282
283/* Construct table for software CRC-32C little-endian calculation. */
284static pthread_once_t crc32c_once_little = PTHREAD_ONCE_INIT;
285static uint32_t crc32c_table_little[8][256];
286static void crc32c_init_sw_little(void) {
287 for (unsigned n = 0; n < 256; n++) {
288 uint32_t crc = n;
289 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
290 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
291 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
292 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
293 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
294 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
295 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
296 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
297 crc32c_table_little[0][n] = crc;
298 }
299 for (unsigned n = 0; n < 256; n++) {
300 uint32_t crc = crc32c_table_little[0][n];
301 for (unsigned k = 1; k < 8; k++) {
302 crc = crc32c_table_little[0][crc & 0xff] ^ (crc >> 8);
303 crc32c_table_little[k][n] = crc;
304 }
305 }
306}
307
308/* Compute a CRC-32C in software assuming a little-endian architecture,
309 constructing the required table if that hasn't already been done. */
310uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len) {
311 unsigned char const *next = (unsigned char const *)buf;
312
314 crc = ~crc;
315 while (len && ((uintptr_t)next & 7) != 0) {
316 crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
317 len--;
318 }
319 if (len >= 8) {
320 uint64_t crcw = crc;
321 do {
322 crcw ^= *(uint64_t const *)next;
323 crcw = crc32c_table_little[7][crcw & 0xff] ^
324 crc32c_table_little[6][(crcw >> 8) & 0xff] ^
325 crc32c_table_little[5][(crcw >> 16) & 0xff] ^
326 crc32c_table_little[4][(crcw >> 24) & 0xff] ^
327 crc32c_table_little[3][(crcw >> 32) & 0xff] ^
328 crc32c_table_little[2][(crcw >> 40) & 0xff] ^
329 crc32c_table_little[1][(crcw >> 48) & 0xff] ^
330 crc32c_table_little[0][crcw >> 56];
331 next += 8;
332 len -= 8;
333 } while (len >= 8);
334 crc = crcw;
335 }
336 while (len) {
337 crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
338 len--;
339 }
340 return ~crc;
341}
342
343/* Swap the bytes in a uint64_t. (Only for big-endian.) */
344#if defined(__has_builtin) || (defined(__GNUC__) && \
345 (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
346# define swap __builtin_bswap64
347#else
348static inline uint64_t swap(uint64_t x) {
349 x = ((x << 8) & 0xff00ff00ff00ff00) | ((x >> 8) & 0xff00ff00ff00ff);
350 x = ((x << 16) & 0xffff0000ffff0000) | ((x >> 16) & 0xffff0000ffff);
351 return (x << 32) | (x >> 32);
352}
353#endif
354
355/* Construct tables for software CRC-32C big-endian calculation. */
356static pthread_once_t crc32c_once_big = PTHREAD_ONCE_INIT;
357static uint32_t crc32c_table_big_byte[256];
358static uint64_t crc32c_table_big[8][256];
359static void crc32c_init_sw_big(void) {
360 for (unsigned n = 0; n < 256; n++) {
361 uint32_t crc = n;
362 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
363 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
364 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
365 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
366 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
367 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
368 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
369 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
370 crc32c_table_big_byte[n] = crc;
371 }
372 for (unsigned n = 0; n < 256; n++) {
373 uint32_t crc = crc32c_table_big_byte[n];
374 crc32c_table_big[0][n] = swap(crc);
375 for (unsigned k = 1; k < 8; k++) {
376 crc = crc32c_table_big_byte[crc & 0xff] ^ (crc >> 8);
377 crc32c_table_big[k][n] = swap(crc);
378 }
379 }
380}
381
382/* Compute a CRC-32C in software assuming a big-endian architecture,
383 constructing the required tables if that hasn't already been done. */
384uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len) {
385 unsigned char const *next = (unsigned char const *)buf;
386
387 pthread_once(&crc32c_once_big, crc32c_init_sw_big);
388 crc = ~crc;
389 while (len && ((uintptr_t)next & 7) != 0) {
390 crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
391 len--;
392 }
393 if (len >= 8) {
394 uint64_t crcw = swap(crc);
395 do {
396 crcw ^= *(uint64_t const *)next;
397 crcw = crc32c_table_big[0][crcw & 0xff] ^
398 crc32c_table_big[1][(crcw >> 8) & 0xff] ^
399 crc32c_table_big[2][(crcw >> 16) & 0xff] ^
400 crc32c_table_big[3][(crcw >> 24) & 0xff] ^
401 crc32c_table_big[4][(crcw >> 32) & 0xff] ^
402 crc32c_table_big[5][(crcw >> 40) & 0xff] ^
403 crc32c_table_big[6][(crcw >> 48) & 0xff] ^
404 crc32c_table_big[7][(crcw >> 56)];
405 next += 8;
406 len -= 8;
407 } while (len >= 8);
408 crc = swap(crcw);
409 }
410 while (len) {
411 crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
412 len--;
413 }
414 return ~crc;
415}
416
417/* Table-driven software CRC-32C. This is about 15 times slower than using the
418 hardware instructions. Determine the endianess of the processor and proceed
419 accordingly. Ideally the endianess will be determined at compile time, in
420 which case the unused functions and tables for the other endianess will be
421 removed by the optimizer. If not, then the proper routines and tables will
422 be used, even if the endianess is changed mid-stream. (Yes, there are
423 processors that permit that -- go figure.) */
424uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len) {
425 static int const little = 1;
426 if (*(char const *)&little)
427 return crc32c_sw_little(crc, buf, len);
428 else
429 return crc32c_sw_big(crc, buf, len);
430}
431
432#ifdef TEST
433
434#include <cstdio>
435#include <cstdlib>
436#include "load.h"
437
438int main(int argc, char **argv) {
439 /* interpret argument */
440 if (argc > 2) {
441 fputs("only one argument permitted\n", stderr);
442 return 1;
443 }
444 long rep = argc == 1 ? 1 : strtol(argv[1], NULL, 10);
445 if (rep == 0) {
446 fputs("usage: crc32c [[-]nnn] < data\n"
447 " where nnn is the number of times to repeat\n"
448 " negative forces the software implementation\n", stderr);
449 return 0;
450 }
451 int sw = rep < 0;
452 if (sw)
453 rep = -rep;
454
455 /* read input */
456 void *dat = NULL;
457 size_t size, len;
458 int ret = load(stdin, 0, &dat, &size, &len);
459 if (ret) {
460 fputs("error reading from stdin\n", stderr);
461 return 1;
462 }
463
464 /* compute crc rep times */
465 uint32_t crc;
466 do {
467 crc = sw ? crc32c_sw(0, dat, len) : crc32c(0, dat, len);
468 } while (--rep);
469
470 /* show crc */
471 printf("0x%08x\n", crc);
472
473 /* clean up */
474 free(dat);
475 return 0;
476}
477
478#endif /* TEST */
int main(int argc, char *argv[])
Definition XrdMain.cc:161
static pthread_once_t crc32c_once_little
uint32_t crc32c(uint32_t crc, void const *buf, size_t len)
static uint64_t swap(uint64_t x)
uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len)
uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len)
static uint64_t crc32c_table_big[8][256]
static void crc32c_init_sw_little(void)
static uint32_t crc32c_table_big_byte[256]
static void crc32c_init_sw_big(void)
#define POLY
uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len)
static uint32_t crc32c_table_little[8][256]
static pthread_once_t crc32c_once_big