Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Introduce fuzzyRules storage and exploitation. #835

Closed
wants to merge 12 commits into from
Closed
14 changes: 14 additions & 0 deletions include/zim/archive.h
Original file line number Diff line number Diff line change
Expand Up @@ -222,13 +222,27 @@ namespace zim
*
* Get an entry using its path.
* The path must contain the namespace.
* The path must be exact: either an entry exists with that path or it does not.
*
* @param path The entry's path.
* @return The Entry.
* @exception EntryNotFound If no entry has the asked path.
*/
Entry getEntryByPath(const std::string& path) const;

/** Get an entry using a url.
*
* Get an entry using a full url.
* A full url is composed of a path and a querystring (separated by a '?')
* If the path is not in the archive, libzim will try to find the entry
* using a combination of the querystring and the fuzzyRules stored in the archive.
*
* @param url The url (including querystring) to search for.
* @return The Entry.
* @exception EntryNotFound If no entry has been found.
*/
Entry getEntryByUrl(const std::string& url) const;

/** Get an entry using its "title" index.
*
* Use the index of the entry to get the idx'th entry
Expand Down
1 change: 1 addition & 0 deletions include/zim/item.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ namespace zim

#ifdef ZIM_PRIVATE
cluster_index_type getClusterIndex() const;
blob_index_type getBlobIndex() const;
#endif

private: // data
Expand Down
38 changes: 38 additions & 0 deletions include/zim/writer/creator.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#define ZIM_WRITER_CREATOR_H

#include <memory>
#include <vector>
#include <zim/zim.h>
#include <zim/writer/item.h>

Expand Down Expand Up @@ -195,6 +196,36 @@ namespace zim
const std::string& targetpath,
const Hints& hints = Hints());


/**
* Add an alias of an existing entry.
*
* The existing entry pointed to by `targetPath` is cloned and updated with
* `path` and `title`.
*
* The alias entry will share the same type (redirection or item)
* and namespace as `targetPath`.
*
* If `targetPath` is an item, the new entry will be an item pointing
* to the same data as the `targetPath` item (not a redirection to `targetPath`).
* However, the alias entry is not counted in the media type counter
* and it is not fulltext indexed (only title indexed).
*
* Hints can be given to influence creator handling (front article, ...)
* as it is done for redirection.
*
* @param path the path of the alias
* @param title the title of the alias
* @param targetPath the path of the aliased entry.
* @param hints hints associated to the alias.
*/
void addAlias(
const std::string& path,
const std::string& title,
const std::string& targetPath,
const Hints& hints = Hints()
);

/**
* Finalize the zim creation.
*/
Expand All @@ -214,6 +245,13 @@ namespace zim
*/
void setUuid(const zim::Uuid& uuid) { m_uuid = uuid; }

/**
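* Add fuzzy rules to the archive.
*
* NOTE(review): the parameter descriptions below are inferred from the
* parameter names only; confirm them against the implementation.
*
* @param match presumably the pattern a requested path is matched against.
* @param replace presumably the replacement applied when `match` matches.
* @param split_str presumably the separator string used to split the result.
* @param splitlast presumably whether to split on the last occurrence of `split_str`.
* @param arg_list presumably lists of querystring argument names to use.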
*/
void addFuzzyRules(const std::string& match, const std::string& replace, const std::string& split_str, bool splitlast, const std::vector<std::vector<std::string>>& arg_list);

private:
std::unique_ptr<CreatorData> data;

Expand Down
7 changes: 3 additions & 4 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,11 @@ else
thread_dep = dependency('', required:false)
endif

icu_dep = dependency('icu-i18n', static:static_linkage)
pkg_requires += ['icu-i18n']

if xapian_dep.found()
pkg_requires += ['xapian-core']
icu_dep = dependency('icu-i18n', static:static_linkage)
pkg_requires += ['icu-i18n']
else
icu_dep = dependency('icu-i18n', required:false, static:static_linkage)
endif

gtest_dep = dependency('gtest', version: '>=1.10.0', main:true, fallback:['gtest', 'gtest_main_dep'], required:false)
Expand Down
20 changes: 20 additions & 0 deletions src/archive.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,26 @@
throw EntryNotFound("Cannot find entry");
}

// Resolve a full url (path plus optional querystring) to an entry.
//
// The url is split with urlSplit() into a path and a list of query
// parameters. The exact path is tried first. If that lookup throws
// EntryNotFound, each candidate path returned by
// FuzzyRules::get_fuzzy_paths(path, queryParams) is tried in turn and the
// first one that resolves is returned.
//
// Throws EntryNotFound when neither the exact path nor any candidate path
// resolves to an entry.
Entry Archive::getEntryByUrl(const std::string& url) const

Check warning on line 253 in src/archive.cpp

View check run for this annotation

Codecov / codecov/patch

src/archive.cpp#L253

Added line #L253 was not covered by tests
{
std::string path;
std::vector<std::pair<std::string, std::string>> queryParams;

Check warning on line 256 in src/archive.cpp

View check run for this annotation

Codecov / codecov/patch

src/archive.cpp#L255-L256

Added lines #L255 - L256 were not covered by tests
// Separate the path from the querystring parameters.
std::tie(path, queryParams) = urlSplit(url);
try {
// Fast path: the path exists as-is in the archive.
return getEntryByPath(path);
} catch (const EntryNotFound& e) {
// Exact lookup failed: fall back to the paths derived from the fuzzy rules.
for(const auto& path_to_try: m_impl->getFuzzyRules().get_fuzzy_paths(path, queryParams)) {
try {
return getEntryByPath(path_to_try);
} catch (const EntryNotFound& e) {
// This candidate does not exist either; move on to the next one.
continue;

Check warning on line 265 in src/archive.cpp

View check run for this annotation

Codecov / codecov/patch

src/archive.cpp#L265

Added line #L265 was not covered by tests
}
}
}

// Reached only when the exact path and every candidate path failed.
throw EntryNotFound("Cannot find entry");
}

Entry Archive::getEntryByTitle(entry_index_type idx) const
{
return Entry(m_impl, entry_index_type(m_impl->getIndexByTitle(title_index_t(idx))));
Expand Down
18 changes: 18 additions & 0 deletions src/fileimpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "envvalue.h"
#include "md5.h"
#include "tools.h"
#include "fuzzy_rules.h"

log_define("zim.file.impl")

Expand Down Expand Up @@ -236,6 +237,7 @@
m_byTitleDirentLookup.reset(new ByTitleDirentLookup(mp_titleDirentAccessor.get()));

readMimeTypes();
readFuzzyRules();
}

std::unique_ptr<IndirectDirentAccessor> FileImpl::getTitleAccessor(const std::string& path)
Expand Down Expand Up @@ -365,6 +367,22 @@
}
}

// Load the archive's fuzzy rules from the entry "FuzzyRules" in namespace 'M'.
//
// If no such entry is found, or if it is a redirect, `fuzzyRules` is left
// untouched. Otherwise the entry's blob is fetched from its cluster and
// `fuzzyRules` is replaced by a FuzzyRules object built from that blob.
void FileImpl::readFuzzyRules() {
// r.first tells whether the lookup succeeded; r.second is then passed to getDirent().
auto r = findx('M', "FuzzyRules");
if (!r.first) {
// No rules
return;
}
auto fuzzy_rule_dirent = getDirent(r.second);
if (fuzzy_rule_dirent->isRedirect()) {
// A redirect is reported on stderr and ignored; no exception is thrown here.
std::cerr << "Error: 'M/FuzzyRules' is a redirect." << std::endl;
return;

Check warning on line 379 in src/fileimpl.cpp

View check run for this annotation

Codecov / codecov/patch

src/fileimpl.cpp#L379

Added line #L379 was not covered by tests
}
// Fetch the blob the dirent points to and build the rules from it.
auto cluster = getCluster(fuzzy_rule_dirent->getClusterNumber());
auto blob = cluster->getBlob(fuzzy_rule_dirent->getBlobNumber());
fuzzyRules = FuzzyRules(blob);
}

FileImpl::FindxResult FileImpl::findx(char ns, const std::string& url)
{
return direntLookup().find(ns, url);
Expand Down
6 changes: 6 additions & 0 deletions src/fileimpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "fileheader.h"
#include "zim_types.h"
#include "direntreader.h"
#include "fuzzy_rules.h"


namespace zim
Expand Down Expand Up @@ -68,6 +69,8 @@
typedef std::vector<std::string> MimeTypes;
MimeTypes mimeTypes;

FuzzyRules fuzzyRules;

mutable std::vector<entry_index_type> m_articleListByCluster;
mutable std::mutex m_articleListByClusterMutex;

Expand Down Expand Up @@ -148,6 +151,8 @@

const std::string& getMimeType(uint16_t idx) const;

// Read-only access to the `fuzzyRules` member (assigned by readFuzzyRules()).
const FuzzyRules& getFuzzyRules() const { return fuzzyRules; }

Check warning on line 154 in src/fileimpl.h

View check run for this annotation

Codecov / codecov/patch

src/fileimpl.h#L154

Added line #L154 was not covered by tests

std::string getChecksum();
bool verify();
bool is_multiPart() const;
Expand All @@ -165,6 +170,7 @@
ClusterHandle readCluster(cluster_index_t idx);
offset_type getMimeListEndUpperLimit() const;
void readMimeTypes();
void readFuzzyRules();
void quickCheckForCorruptFile();

bool checkChecksum();
Expand Down
Loading
Loading