forked from goldendict/goldendict
-
Notifications
You must be signed in to change notification settings - Fork 0
/
btreeidx.hh
277 lines (217 loc) · 9.5 KB
/
btreeidx.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
/* This file is (c) 2008-2012 Konstantin Isakov <[email protected]>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifndef __BTREEIDX_HH_INCLUDED__
#define __BTREEIDX_HH_INCLUDED__
#include "dictionary.hh"
#include "file.hh"
#include <string>
#include <vector>
#include <map>
#include <algorithm>
#include <QVector>
#include <QSet>
#include <QList>
#include "cpp_features.hh"
#if defined( _MSC_VER ) && _MSC_VER < 1800 // VS2012 and older
#include <stdint_msvc.h>
#else
#include <stdint.h>
#endif
/// A base for the dictionary which creates a btree index to look up
/// the words.
namespace BtreeIndexing {
using std::string;
using gd::wstring;
using std::vector;
using std::map;
enum
{
/// This is to be bumped up each time the internal format changes.
/// The value isn't used here by itself, it is supposed to be added
/// to each dictionary's internal format version.
FormatVersion = 4
};
// These exceptions which might be thrown during the index traversal
DEF_EX( exIndexWasNotOpened, "The index wasn't opened", Dictionary::Ex )
DEF_EX( exFailedToDecompressNode, "Failed to decompress a btree's node", Dictionary::Ex )
DEF_EX( exCorruptedChainData, "Corrupted chain data in the leaf of a btree encountered", Dictionary::Ex )
/// This structure describes a word linked to its translation. The
/// translation is represented as an abstract 32-bit offset.
struct WordArticleLink
{
string word, prefix; // in utf8
uint32_t articleOffset;
WordArticleLink()
{}
WordArticleLink( string const & word_, uint32_t articleOffset_, string const & prefix_ = string() ):
word( word_ ), prefix( prefix_ ), articleOffset( articleOffset_ )
{}
};
/// Information needed to open the index
struct IndexInfo
{
uint32_t btreeMaxElements, rootOffset;
IndexInfo( uint32_t btreeMaxElements_, uint32_t rootOffset_ ):
btreeMaxElements( btreeMaxElements_ ), rootOffset( rootOffset_ )
{}
};
/// Base btree indexing class which allows using what buildIndex() function
/// created. It's quite low-lovel and is basically a set of 'building blocks'
/// functions.
class BtreeIndex
{
public:
BtreeIndex();
/// Opens the index. The file reference is saved to be used for
/// subsequent lookups.
/// The mutex is the one to be locked when working with the file.
void openIndex( IndexInfo const &, File::Class &, Mutex & );
/// Finds articles that match the given string. A case-insensitive search
/// is performed.
vector< WordArticleLink > findArticles( wstring const &, bool ignoreDiacritics = false );
/// Find all unique article links in the index
void findAllArticleLinks( QVector< WordArticleLink > & articleLinks );
/// Retrieve all unique headwords from index
void getAllHeadwords( QSet< QString > & headwords );
/// Find all article links and/or headwords in the index
void findArticleLinks( QVector< WordArticleLink > * articleLinks,
QSet< uint32_t > * offsets,
QSet< QString > * headwords,
QAtomicInt * isCancelled = 0 );
/// Retrieve headwords for presented article addresses
void getHeadwordsFromOffsets( QList< uint32_t > & offsets,
QVector< QString > & headwords,
QAtomicInt * isCancelled = 0 );
protected:
/// Finds the offset in the btree leaf for the given word, either matching
/// by an exact match, or by finding the smallest entry that might match
/// by prefix. It can return zero if there isn't even a possible prefx
/// match. The input string must already be folded. The exactMatch is set
/// to true when an exact match is located, and to false otherwise.
/// The located leaf is loaded to 'leaf', and the pointer to the next
/// leaf is saved to 'nextLeaf'.
/// However, due to root node being permanently cached, the 'leaf' passed
/// might not get used at all if the root node was the terminal one. In that
/// case, the returned pointer wouldn't belong to 'leaf' at all. To that end,
/// the leafEnd pointer always holds the pointer to the first byte outside
/// the node data.
char const * findChainOffsetExactOrPrefix( wstring const & target,
bool & exactMatch,
vector< char > & leaf,
uint32_t & nextLeaf,
char const * & leafEnd );
/// Reads a node or leaf at the given offset. Just uncompresses its data
/// to the given vector and does nothing more.
void readNode( uint32_t offset, vector< char > & out );
/// Reads the word-article links' chain at the given offset. The pointer
/// is updated to point to the next chain, if there's any.
vector< WordArticleLink > readChain( char const * & );
/// Drops any aliases which arose due to folding. Only case-folded aliases
/// are left.
void antialias( wstring const &, vector< WordArticleLink > &, bool ignoreDiactitics );
protected:
Mutex * idxFileMutex;
File::Class * idxFile;
private:
uint32_t indexNodeSize;
uint32_t rootOffset;
bool rootNodeLoaded;
vector< char > rootNode; // We load root note here and keep it at all times,
// since all searches always start with it.
};
/// A base for the dictionary that utilizes a btree index build using
/// buildIndex() function declared below.
class BtreeDictionary: public Dictionary::Class, public BtreeIndex
{
public:
BtreeDictionary( string const & id, vector< string > const & dictionaryFiles );
/// Btree-indexed dictionaries are usually a good source for compound searches.
virtual Dictionary::Features getFeatures() const throw()
{ return Dictionary::SuitableForCompoundSearching; }
/// This function does the search using the btree index. Derivatives usually
/// need not to implement this function.
virtual sptr< Dictionary::WordSearchRequest > prefixMatch( wstring const &,
unsigned long )
THROW_SPEC( std::exception );
virtual sptr< Dictionary::WordSearchRequest > stemmedMatch( wstring const &,
unsigned minLength,
unsigned maxSuffixVariation,
unsigned long maxResults )
THROW_SPEC( std::exception );
virtual bool isLocalDictionary()
{ return true; }
virtual bool getHeadwords( QStringList &headwords );
virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text );
string const & ftsIndexName() const
{ return ftsIdxName; }
Mutex & getFtsMutex()
{ return ftsIdxMutex; }
virtual uint32_t getFtsIndexVersion()
{ return 0; }
// Sort articles offsets for full-text search in dictionary-specific order
// to increase of articles retrieving speed
// Default - simple sorting in increase order
virtual void sortArticlesOffsetsForFTS( QVector< uint32_t > & offsets,
QAtomicInt & isCancelled )
{ Q_UNUSED( isCancelled ); std::sort( offsets.begin(), offsets.end() ); }
/// Called before each matching operation to ensure that any child init
/// has completed. Mainly used for deferred init. The default implementation
/// does nothing.
/// The function returns an empty string if the initialization is or was
/// successful, or a human-readable error string otherwise.
virtual string const & ensureInitDone();
protected:
Mutex ftsIdxMutex;
string ftsIdxName;
friend class BtreeWordSearchRequest;
friend class FTSResultsRequest;
};
class BtreeWordSearchRequest: public Dictionary::WordSearchRequest
{
friend class BtreeWordSearchRunnable;
protected:
BtreeDictionary & dict;
wstring str;
unsigned long maxResults;
unsigned minLength;
int maxSuffixVariation;
bool allowMiddleMatches;
QAtomicInt isCancelled;
QSemaphore hasExited;
public:
BtreeWordSearchRequest( BtreeDictionary & dict_,
wstring const & str_,
unsigned minLength_,
int maxSuffixVariation_,
bool allowMiddleMatches_,
unsigned long maxResults_,
bool startRunnable = true );
virtual void findMatches();
void run(); // Run from another thread by BtreeWordSearchRunnable
virtual void cancel()
{
isCancelled.ref();
}
~BtreeWordSearchRequest();
};
// Everything below is for building the index data.
/// This represents the index in its source form, as a map which binds folded
/// words to sequences of their unfolded source forms and the corresponding
/// article offsets. The words are utf8-encoded -- it doesn't break Unicode
/// sorting, but conserves space.
struct IndexedWords: public map< string, vector< WordArticleLink > >
{
/// Instead of adding to the map directly, use this function. It does folding
/// itself, and for phrases/sentences it adds additional entries beginning with
/// each new word.
void addWord( wstring const & word, uint32_t articleOffset, unsigned int maxHeadwordSize = 256U );
/// Differs from addWord() in that it only adds a single entry. We use this
/// for zip's file names.
void addSingleWord( wstring const & word, uint32_t articleOffset );
};
/// Builds the index, as a compressed btree. Returns IndexInfo.
/// All the data is stored to the given file, beginning from its current
/// position.
IndexInfo buildIndex( IndexedWords const &, File::Class & file );
}
#endif