-
Notifications
You must be signed in to change notification settings - Fork 21
/
h0_lz77.cpp
142 lines (91 loc) · 3.11 KB
/
h0_lz77.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
// Copyright (c) 2017, Nicola Prezza. All rights reserved.
// Use of this source code is governed
// by a MIT license that can be found in the LICENSE file.
/*
* h0_lz77.cpp
*
* Compute the LZ77 parsing in zero-order compressed space
* using a dynamic Huffman-encoded FM index with
* a sparse SA sampling (1 sample every 256 characters)
*
* Space is ( n*H0 + n + (n/k)*log n )(1+o(1)) bits.
*
* Type of input text here is uchar
*
*
*/
#include <chrono>
#include "dynamic/dynamic.hpp"
#include "dynamic/algorithms/h0_lz77.hpp"
using namespace std;
using namespace dyn;
ulint sa_rate = 0;
bool int_file = false;
void help(){
cout << "Build LZ77 using a zero-order compressed FM index." << endl << endl;
cout << "Usage: h0_lz77 [options] <input_file> <output_file> " << endl;
cout << "Options: " << endl;
cout << "-s <sample_rate> store one SA sample every sample_rate positions. default: 256." << endl;
cout << "-i Interpret the file as a stream of 32-bits integers." << endl;
cout << "input_file: file to be parsed" << endl;
cout << "output_file: LZ77 triples <start,length,trailing_character> will be saved in binary format in this file" << endl << endl;
cout << "Note: the file should terminate with a character (or int if -i) not appearing elsewhere." << endl;
exit(0);
}
void parse_args(char** argv, int argc, int &ptr){
assert(ptr<argc);
string s(argv[ptr]);
ptr++;
if(s.compare("-s")==0){
sa_rate = atoi(argv[ptr++]);
}else if(s.compare("-i")==0){
int_file = true;
}else{
cout << "Error: unrecognized '" << s << "' option." << endl;
help();
}
}
int main(int argc,char** argv) {
using std::chrono::high_resolution_clock;
using std::chrono::duration_cast;
using std::chrono::duration;
if(argc < 3) help();
//parse options
int ptr = 1;
if(argc<3) help();
while(ptr<argc-2)
parse_args(argv, argc, ptr);
string in = string(argv[ptr++]);
string out = string(argv[ptr]);
using lz77_t = h0_lz77<wt_fmi>;
lz77_t lz77;
ulint DEFAULT_SA_RATE = lz77_t::DEFAULT_SA_RATE;
sa_rate = not sa_rate ? DEFAULT_SA_RATE : sa_rate;
auto t1 = high_resolution_clock::now();
cout << "Sample rate is " << sa_rate << endl;
if(not int_file){
{
cout << "Detecting alphabet ... " << flush;
std::ifstream ifs(in);
lz77 = lz77_t(ifs, sa_rate);
cout << "done." << endl;
}
std::ifstream ifs(in);
std::ofstream os(out, ios::binary);
lz77.parse(ifs,os,1,true);
}else{
lz77 = lz77_t(~uint(0), sa_rate);
std::ifstream ifs(in, ios::binary);
std::ofstream os(out, ios::binary);
lz77.parse_int(ifs,os,1,true);
}
auto t2 = high_resolution_clock::now();
uint64_t sec = std::chrono::duration_cast<std::chrono::seconds>(t2 - t1).count();
ulint bitsize = lz77.bit_size();
cout << endl << "done" << endl;
cout << " Total time: " << (double)sec << " seconds" << endl;
cout << " Size of the structures (bits): " << bitsize << endl;
cout << " Size of the structures (Bytes): " << bitsize/8 << endl;
cout << " Size of the structures (KB): " << (bitsize/8)/1024 << endl;
cout << " Size of the structures (MB): " << ((bitsize/8)/1024)/1024 << endl;
}