-
Notifications
You must be signed in to change notification settings - Fork 40
/
fsst.h
227 lines (211 loc) · 12.4 KB
/
fsst.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
/*
* the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019
*
* ===================================================================================================================================
* this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
*
* Copyright 2018-2020, CWI, TU Munich, FSU Jena
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
* IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
* ===================================================================================================================================
*
* FSST: Fast Static Symbol Table compression
* see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf
*
* FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e.
* where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual
* strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is
* block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text.
*
* FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences)
* onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression
* transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte-sequences could
* be seen as strings again and fit in whatever your program is that manipulates strings.
*
* useful property: FSST ensures that strings that are equal, are also equal in their compressed form.
*
* In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of
* unsigned char* pointers to their starts. A seperate length array (of unsigned int) denotes how many bytes each string consists of.
*
* This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program
* that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings).
*
* We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are
* also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length.
*/
#ifndef FSST_INCLUDED_H
#define FSST_INCLUDED_H
#ifdef _MSC_VER
#define __restrict__
#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
#define __ORDER_LITTLE_ENDIAN__ 2
#include <intrin.h>
static inline int __builtin_ctzl(unsigned long long x) {
unsigned long ret;
_BitScanForward64(&ret, x);
return (int)ret;
}
#endif
#ifdef __cplusplus
#define FSST_FALLTHROUGH [[fallthrough]]
#include <cstring>
extern "C" {
#else
#define FSST_FALLTHROUGH
#endif
#include <stddef.h>
/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */
#define FSST_ESC 255
/* Data structure needed for compressing strings - use fsst_duplicate() to create thread-local copies. Use fsst_destroy() to free. */
typedef void* fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */
/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */
typedef struct {
unsigned long long version; /* version id */
unsigned char zeroTerminated; /* terminator is a single-byte code that does not appear in longer symbols */
unsigned char len[255]; /* len[x] is the byte-length of the symbol x (1 < len[x] <= 8). */
unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). */
} fsst_decoder_t;
/* Calibrate a FSST symboltable from a batch of strings (it is best to provide at least 16KB of data). */
fsst_encoder_t*
fsst_create(
size_t n, /* IN: number of strings in batch to sample from. */
const size_t lenIn[], /* IN: byte-lengths of the inputs */
const unsigned char *strIn[], /* IN: string start pointers. */
int zeroTerminated /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */
);
/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */
fsst_encoder_t*
fsst_duplicate(
fsst_encoder_t *encoder /* IN: the symbol table to duplicate. */
);
#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of deserialized fsst header, produced/consumed by fsst_export() resp. fsst_import() */
/* Space-efficient symbol table serialization (smaller than sizeof(fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */
unsigned int /* OUT: number of bytes written in buf, at most sizeof(fsst_decoder_t) */
fsst_export(
fsst_encoder_t *encoder, /* IN: the symbol table to dump. */
unsigned char *buf /* OUT: pointer to a byte-buffer where to serialize this symbol table. */
);
/* Deallocate encoder. */
void
fsst_destroy(fsst_encoder_t*);
/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */
unsigned int /* OUT: number of bytes consumed in buf (0 on failure). */
fsst_import(
fsst_decoder_t *decoder, /* IN: this symbol table will be overwritten. */
unsigned char *buf /* OUT: pointer to a byte-buffer where fsst_export() serialized this symbol table. */
);
/* Return a decoder structure from an encoder. */
fsst_decoder_t
fsst_decoder(
fsst_encoder_t *encoder
);
/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */
/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */
size_t /* OUT: the number of compressed strings (<=n) that fit the output buffer. */
fsst_compress(
fsst_encoder_t *encoder, /* IN: encoder obtained from fsst_create(). */
size_t nstrings, /* IN: number of strings in batch to compress. */
const size_t lenIn[], /* IN: byte-lengths of the inputs */
const unsigned char *strIn[], /* IN: input string start pointers. */
size_t outsize, /* IN: byte-length of output buffer. */
unsigned char *output, /* OUT: memory buffer to put the compressed strings in (one after the other). */
size_t lenOut[], /* OUT: byte-lengths of the compressed strings. */
unsigned char *strOut[] /* OUT: output string start pointers. Will all point into [output,output+size). */
);
/* Decompress a single string, inlined for speed. */
inline size_t /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */
fsst_decompress(
const fsst_decoder_t *decoder, /* IN: use this symbol table for compression. */
size_t lenIn, /* IN: byte-length of compressed string. */
const unsigned char *strIn, /* IN: compressed string. */
size_t size, /* IN: byte-length of output buffer. */
unsigned char *output /* OUT: memory buffer to put the decompressed string in. */
) {
unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len;
unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output;
unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol;
size_t code, posOut = 0, posIn = 0;
#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */
#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long))
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
while (posOut+32 <= size && posIn+4 <= lenIn) {
unsigned int nextBlock, escapeMask;
memcpy(&nextBlock, strIn+posIn, sizeof(unsigned int));
escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u);
if (escapeMask == 0) {
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
} else {
unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3;
switch(firstEscapePos) { /* Duff's device */
case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
// fall through
case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
// fall through
case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
// fall through
case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */
}
}
}
if (posOut+24 <= size) { // handle the possibly 3 last bytes without a loop
if (posIn+2 <= lenIn) {
strOut[posOut] = strIn[posIn+1];
if (strIn[posIn] != FSST_ESC) {
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
if (strIn[posIn] != FSST_ESC) {
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
} else {
posIn += 2; strOut[posOut++] = strIn[posIn-1];
}
} else {
posIn += 2; posOut++;
}
}
if (posIn < lenIn) { // last code cannot be an escape
code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
}
}
#else
while (posOut+8 <= size && posIn < lenIn)
if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */
FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */
posOut += len[code];
} else {
strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */
posIn++; posOut++;
}
#endif
#endif
while (posIn < lenIn)
if ((code = strIn[posIn++]) < FSST_ESC) {
size_t posWrite = posOut, endWrite = posOut + len[code];
unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite;
if ((posOut = endWrite) > size) endWrite = size;
for(; posWrite < endWrite; posWrite++) /* only write if there is room */
strOut[posWrite] = symbolPointer[posWrite];
} else {
if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */
posIn++; posOut++;
}
if (posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0;
return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */
}
#ifdef __cplusplus
}
#endif
#endif /* FSST_INCLUDED_H */