From 777e013ade14d6e25eb504eee6d9b0670b527061 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier <mgautier@kymeria.fr> Date: Mon, 19 Feb 2024 14:51:06 +0100 Subject: [PATCH 1/2] Fix documentation of `getEntryByPath`. --- include/zim/archive.h | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/include/zim/archive.h b/include/zim/archive.h index 7539b96c6..e3d3bcb48 100644 --- a/include/zim/archive.h +++ b/include/zim/archive.h @@ -51,6 +51,27 @@ namespace zim * An `Archive` is read-only, and internal states (as caches) are protected * from race-condition. Therefore, all methods of `Archive` are threadsafe. * + * Zim archives exist with two different namespace schemes: an old one and a new one. + * The method `hasNewNamespaceScheme` tells which namespace scheme is used by the archive. + * + * When using the old namespace scheme: + * - User entries may be stored in different namespaces (historically `A`, `I`, `J` or `-`). + * So the paths of the entries contain the namespace as a "top level directory": `A/foo.html`, `I/image.png`, ... + * - All APIs taking or returning a path expect/return a path with the namespace. + * + * When using the new namespace scheme: + * - User entries are always stored without namespace. + * (For information, they are all stored in the same namespace `C`. Still, consider that there is no namespace, as the whole API hides it.) + * As there is no namespace, paths don't contain it: `foo.html`, `image.png`, ... + * - All APIs taking or returning a path expect/return a path without namespace. + * + * This difference may seem complex to handle, but it is not. + * As all paths returned by the API are consistent with the paths it expects, you simply have to use the path as it is. + * Forget about the namespace and, if a path has one, simply consider it as a subdirectory. + * The only place it could be problematic is when you already have a path stored somewhere (bookmark, ...) 
+ * using a scheme and use it on an archive with another scheme. For this case, the method `getEntryByPath` + * has a compatibility layer trying to transform a path to the new scheme as a fallback if the entry is not found. + * * All methods of archive may throw an `ZimFileFormatError` if the file is invalid. */ class LIBZIM_API Archive @@ -220,8 +241,15 @@ namespace zim /** Get an entry using a path. * - * Get an entry using its path. - * The path must contains the namespace. + * Search for an entry in the zim archive, using its path. + * On an archive with the new namespace scheme, the path must not contain the namespace. + * On an archive without the new namespace scheme, the path must contain the namespace. + * A compatibility layer exists to accept "old" paths on a new archive (and the opposite) + * to help use saved paths (bookmarks) on a new archive. + * On a new archive, we first search the path in `C` namespace, then try to remove the potential namespace in path + * and search again in `C` namespace with path "without namespace". + * On an old archive, we first assume path contains a namespace and if not (or no entry found) search in + * namespaces `A`, `I`, `J` and `-`. * * @param path The entry's path. * @return The Entry. @@ -242,7 +270,7 @@ namespace zim /** Get an entry using a title. * - * Get an entry using its path. + * Get an entry using its title. * * @param title The entry's title. * @return The Entry. @@ -282,6 +310,8 @@ namespace zim Entry getRandomEntry() const; /** Check in an entry has path in the archive. + * + * The path follows the same requirements as for `getEntryByPath`. * * @param path The entry's path. * @return True if the path in the archive, false else. @@ -386,7 +416,9 @@ namespace zim /** Find a range of entries starting with path. * - * The path is the "long path". (Ie, with the namespace) + * When using new namespace scheme, path must not contain the namespace (`foo.html`). + * When using old namespace scheme, path must contain the namespace (`A/foo.html`). 
+ * Contrary to `getEntryByPath`, there is no compatibility layer: the path must follow the archive scheme. * * @param path The path prefix to search for. * @return A range starting from the first entry starting with path @@ -397,7 +429,7 @@ namespace zim /** Find a range of entry starting with title. * - * The entry title is search in `A` namespace. + * When using old namespace scheme, entry title is searched in `A` namespace. * * @param title The title prefix to search for. * @return A range starting from the first entry starting with title From 605466034808124eb158672235ff67d4dc1faf40 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier <mgautier@kymeria.fr> Date: Mon, 19 Feb 2024 14:52:24 +0100 Subject: [PATCH 2/2] Add new `getEntryByPathWithNamespace` as a private method. This allows "advanced" tools (such as zimdump) to access content in any namespace, even in new archives. --- include/zim/archive.h | 9 +++++++++ src/archive.cpp | 9 +++++++++ test/archive.cpp | 13 +++++++++++++ 3 files changed, 31 insertions(+) diff --git a/include/zim/archive.h b/include/zim/archive.h index e3d3bcb48..0e44e6508 100644 --- a/include/zim/archive.h +++ b/include/zim/archive.h @@ -505,6 +505,15 @@ namespace zim cluster_index_type getClusterCount() const; offset_type getClusterOffset(cluster_index_type idx) const; entry_index_type getMainEntryIndex() const; + + /** Get an entry using a path and a namespace. + * + * @param ns The namespace to search in + * @param path The entry's path (without namespace) + * @return The entry + * @exception EntryNotFound If no entry has been found. 
+ */ + Entry getEntryByPathWithNamespace(char ns, const std::string& path) const; #endif private: diff --git a/src/archive.cpp b/src/archive.cpp index 17f33157d..1237baea7 100644 --- a/src/archive.cpp +++ b/src/archive.cpp @@ -250,6 +250,15 @@ namespace zim throw EntryNotFound("Cannot find entry"); } + Entry Archive::getEntryByPathWithNamespace(char ns, const std::string& path) const + { + auto r = m_impl->findx(ns, path); /* r.first is tested below as the "found" flag; r.second is converted to the entry index */ + if (r.first) { + return Entry(m_impl, entry_index_type(r.second)); + } + throw EntryNotFound("Cannot find entry"); /* reached only when findx reported no match for (ns, path) */ + } + Entry Archive::getEntryByTitle(entry_index_type idx) const { return Entry(m_impl, entry_index_type(m_impl->getIndexByTitle(title_index_t(idx)))); diff --git a/test/archive.cpp b/test/archive.cpp index 955b55a3d..838c4551b 100644 --- a/test/archive.cpp +++ b/test/archive.cpp @@ -192,7 +192,12 @@ TEST(ZimArchive, openCreatedArchive) auto titleMeta = archive.getMetadataItem("Title"); ASSERT_EQ(std::string(titleMeta.getData()), "This is a title"); ASSERT_EQ(titleMeta.getMimetype(), "text/plain;charset=utf-8"); + + auto titleMeta_with_ns = archive.getEntryByPathWithNamespace('M', "Title"); + ASSERT_EQ(titleMeta.getIndex(), titleMeta_with_ns.getIndex()); + ASSERT_EQ(archive.getMetadata("Counter"), "text/html=2"); + auto illu48 = archive.getIllustrationItem(48); ASSERT_EQ(illu48.getPath(), "Illustration_48x48@1"); ASSERT_EQ(std::string(illu48.getData()), "PNGBinaryContent48"); @@ -210,6 +215,9 @@ TEST(ZimArchive, openCreatedArchive) ASSERT_THROW(foo.getRedirectEntry(), zim::InvalidType); ASSERT_THROW(foo.getRedirectEntryIndex(), zim::InvalidType); + auto foo_with_ns = archive.getEntryByPathWithNamespace('C', "foo"); + ASSERT_EQ(foo.getIndex(), foo_with_ns.getIndex()); + auto foo2 = archive.getEntryByPath("foo2"); ASSERT_EQ(foo2.getPath(), "foo2"); ASSERT_EQ(foo2.getTitle(), "AFoo"); @@ -227,6 +235,11 @@ TEST(ZimArchive, openCreatedArchive) ASSERT_EQ(main.getRedirectEntry().getIndex(), foo.getIndex()); 
ASSERT_EQ(main.getRedirectEntryIndex(), foo.getIndex()); ASSERT_EQ(archive.getMainEntryIndex(), main.getIndex()); + + // Non-existent entries + ASSERT_THROW(archive.getEntryByPath("non/existant/path"), zim::EntryNotFound); + ASSERT_THROW(archive.getEntryByPath("C/non/existant/path"), zim::EntryNotFound); + ASSERT_THROW(archive.getEntryByPathWithNamespace('C', "non/existant/path"), zim::EntryNotFound); } #if WITH_TEST_DATA