forked from HIITMetagenomics/dsm-framework
-
Notifications
You must be signed in to change notification settings - Fork 0
/
TextStorage.h
165 lines (133 loc) · 4.32 KB
/
TextStorage.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#ifndef _TextStorage_H_
#define _TextStorage_H_
#include "TextCollection.h"
#include "Tools.h"
#include <cassert>
#include <stdexcept>
// Include from libcds
#include <static_bitsequence.h>
// Re-define word size to ulong:
#undef W
#define W (CHAR_BIT*sizeof(unsigned long))
#undef bitset
#undef bitget
class TextStoragePlainText;
/**
* Text collection that supports fast extraction.
* Defines an abstract interface class.
* See subclasses TextStorageLzIndex and TextStoragePlainText
* below.
*
* TODO store in DNA alphabet, use delta encoded bitvector
*/
class TextStorage
{
public:
    // Define a shortcut
    typedef TextCollection::TextPosition TextPosition;

    // Storage type tags; subclasses hand one of these to Save(FILE *, char).
    const static char TYPE_PLAIN_TEXT = 0;
    const static char TYPE_LZ_INDEX = 1;

    // Call DeleteText() for each pointer returned by GetText()
    // to avoid possible memory leaks.
    virtual uchar * GetText(TextCollection::DocId docId) const = 0;
    virtual uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const = 0;
    virtual void DeleteText(uchar *) const = 0;

    // Factory counterpart of Save(); defined outside this header.
    static TextStorage * Load(FILE *file);
    virtual void Save(FILE *file) const = 0;

    // NOTE(review): offsets_ is owned through a raw pointer and no copy
    // constructor / assignment operator is declared, so copying a concrete
    // subclass would delete the same bitvector twice. Consider disabling copies.
    virtual ~TextStorage()
    {
        delete offsets_;
        offsets_ = 0;
    }

    /**
     * Document identifier for text position i (requires i < n_),
     * computed as offsets_->rank1(i) - 1.
     */
    TextCollection::DocId DocIdAtTextPos(TextCollection::TextPosition i) const
    {
        assert(i < n_);
        return offsets_->rank1(i)-1;
    }

    /**
     * Start position of text i (requires i < numberOfTexts_),
     * computed as offsets_->select1(i + 1).
     */
    TextCollection::TextPosition TextStartPos(TextCollection::DocId i) const
    {
        assert(i < (TextCollection::DocId)numberOfTexts_);
        return offsets_->select1(i+1);
    }

    /**
     * True when position i (requires i < n_) is the last position of the
     * whole collection, or when the following position is marked as a
     * text start in offsets_.
     */
    bool IsEndmarker(TextCollection::TextPosition i) const
    {
        assert(i < n_);
        if (i >= n_ - 1)
            return true;
        return offsets_->access(i+1);
    }

protected:
    /**
     * Builds the text-start bitvector for a concatenated collection.
     *
     * Bit 0 is set, and bit i+1 is set for every i in [0, n-2] where
     * text[i] == '\0'. The text itself is only read, not stored.
     *
     * @param text  collection of n bytes
     * @param n     number of bytes in text; must be non-zero
     * @throws std::invalid_argument if n is zero
     */
    TextStorage(uchar const * text, TextPosition n)
        : n_(n), offsets_(0), numberOfTexts_(0)
    {
        // Everything below evaluates n_ - 1. With n == 0 that would wrap
        // around (TextPosition is presumably unsigned -- confirm in
        // TextCollection.h) and the scans would run far past the buffer.
        if (n == 0)
            throw std::invalid_argument("TextStorage::TextStorage(): text length must be non-zero.");

        // Scratch bitmap, one bit per text position; "()" zero-fills it.
        const unsigned long words = n / (sizeof(uint)*8) + 1;
        uint *startpos = new uint[words]();

        // Read offsets by finding text end positions:
        set_field(startpos,1,0,1);
        for (TextPosition i = 0; i < n_ - 1; ++i)
            if (text[i] == '\0')
                set_field(startpos,1,i+1,1);

        // Release the scratch bitmap on both the normal and the failing path.
        try
        {
            offsets_ = new static_bitsequence_brw32(startpos, n, 16);
        }
        catch (...)
        {
            delete [] startpos;
            throw;
        }
        delete [] startpos;

        // Sanity check: the bitvector must agree with the '\0' bytes.
        for (ulong i = 0; i < n_-1; ++i)
            if ((text[i] == '\0') != IsEndmarker(i))
                std::cout << "misplaced endmarker at i = " << i << std::endl;

        numberOfTexts_ = offsets_->rank1(n_ - 1);
    }

    // Deserialising constructor; defined outside this header.
    TextStorage(std::FILE *);

    // Shared part of Save(); `type` is one of the TYPE_* tags above.
    void Save(FILE *file, char type) const;

    TextPosition n_;                 // length of the whole collection
    //CSA::DeltaVector *offsets_;
    static_bitsequence * offsets_ ;  // owned; bit set at each text start
    TextPosition numberOfTexts_;     // offsets_->rank1(n_ - 1) at build time
};
/******************************************************************
* Plain text collection.
*/
class TextStoragePlainText : public TextStorage
{
public:
TextStoragePlainText(uchar *text, TextPosition n)
: TextStorage(text, n), text_(text)
{ }
TextStoragePlainText(FILE *file)
: TextStorage(file), text_(0)
{
text_ = new uchar[n_];
if (std::fread(this->text_, sizeof(uchar), n_, file) != n_)
throw std::runtime_error("TextStorage::Load(): file read error (text_).");
}
void Save(FILE *file) const
{
TextStorage::Save(file, TYPE_PLAIN_TEXT);
if (std::fwrite(this->text_, sizeof(uchar), n_, file) != n_)
throw std::runtime_error("TextStorage::Save(): file write error (text_).");
}
~TextStoragePlainText()
{
delete [] text_;
text_ = 0;
n_ = 0;
}
uchar * GetText(TextCollection::DocId docId) const
{
assert(docId < (TextCollection::DocId)numberOfTexts_);
TextPosition offset = offsets_->select1(docId+1);
return &text_[offset];
}
uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const
{
assert(i < (TextCollection::DocId)numberOfTexts_);
assert(j < (TextCollection::DocId)numberOfTexts_);
TextPosition offset = offsets_->select1(i+1);
return &text_[offset];
}
// No operation, since text is a pointer to this->text_
void DeleteText(uchar *text) const
{ }
private:
uchar *text_;
}; // class TextStoragePlainText
#endif // _TextStorage_H_