Merge pull request #859 from openzim/getEntryByPath_ns

openzim · Feb 22, 2024 · 121e3af · 121e3af
2 parents a7776b3 + 6054660
commit 121e3af
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 5 deletions.
diff --git a/include/zim/archive.h b/include/zim/archive.h
@@ -51,6 +51,27 @@ namespace zim
    * An `Archive` is read-only, and internal states (as caches) are protected
    * from race-condition. Therefore, all methods of `Archive` are threadsafe.
    *
+   * Zim archives exist with two different namespace schemes: an old one and a new one.
+   * The method `hasNewNamespaceScheme` tells which namespace scheme is used by the archive.
+   *
+   * When using old namespace scheme:
+   * - User entries may be stored in different namespaces (historically `A`, `I`, `J` or `-`).
+   *   So the path of an entry contains the namespace as a "top level directory": `A/foo.html`, `I/image.png`, ...
+   * - All APIs taking or returning a path expect (or return) a path with the namespace.
+   *
+   * When using new namespace scheme:
+   * - User entries are always stored without namespace.
+   *   (For information, they are all stored in the same namespace `C`. Still, consider that there is no namespace, as the whole API hides it.)
+   *   As there is no namespace, paths don't contain it: `foo.html`, `image.png`, ...
+   * - All APIs taking or returning a path expect (or return) a path without the namespace.
+   *
+   * This difference may seem complex to handle, but it is not.
+   * As all paths returned by the API are consistent with the paths it expects, you simply have to use a path as it is.
+   * Forget about the namespace and, if a path has one, simply consider it as a subdirectory.
+   * The only place where it could be problematic is when you already have a path stored somewhere (bookmark, ...)
+   * using one scheme and you use it on an archive with the other scheme. For this case, the method `getEntryByPath`
+   * has a compatibility layer that tries to transform the path to the new scheme as a fallback if the entry is not found.
+   *
    * All methods of archive may throw an `ZimFileFormatError` if the file is invalid.
    */
   class LIBZIM_API Archive
@@ -220,8 +241,15 @@ namespace zim
 
       /** Get an entry using a path.
        *
-       *  Get an entry using its path.
-       *  The path must contains the namespace.
+       *  Search for an entry in the zim archive, using its path.
+       *  On an archive with the new namespace scheme, the path must not contain the namespace.
+       *  On an archive with the old namespace scheme, the path must contain the namespace.
+       *  A compatibility layer exists to accept an "old" path on a new archive (and the opposite)
+       *  to help using a saved path (bookmark) on a new archive.
+       *  On a new archive, we first search for the path in the `C` namespace, then try to remove the potential
+       *  namespace from the path and search again in the `C` namespace with the path "without namespace".
+       *  On an old archive, we first assume the path contains a namespace and, if it does not (or no entry is found),
+       *  search in namespaces `A`, `I`, `J` and `-`.
        *
        *  @param path The entry's path.
        *  @return The Entry.
@@ -242,7 +270,7 @@ namespace zim
 
       /** Get an entry using a title.
        *
-       *  Get an entry using its path.
+       *  Get an entry using its title.
        *
        *  @param title The entry's title.
        *  @return The Entry.
@@ -282,6 +310,8 @@ namespace zim
       Entry getRandomEntry() const;
 
       /** Check in an entry has path in the archive.
+       *
+       *  The path follows the same requirements as for `getEntryByPath`.
        *
        *  @param path The entry's path.
        *  @return True if the path in the archive, false else.
@@ -386,7 +416,9 @@ namespace zim
 
       /** Find a range of entries starting with path.
        *
-       * The path is the "long path". (Ie, with the namespace)
+       * When using new namespace scheme, path must not contain the namespace (`foo.html`).
+       * When using old namespace scheme, path must contain the namespace (`A/foo.html`).
+       * Contrary to `getEntryByPath`, there is no compatibility layer: the path must follow the archive's scheme.
        *
        * @param path The path prefix to search for.
        * @return A range starting from the first entry starting with path
@@ -397,7 +429,7 @@ namespace zim
 
       /** Find a range of entry starting with title.
        *
-       * The entry title is search in `A` namespace.
+       * When using the old namespace scheme, the entry title is searched in the `A` namespace.
        *
        * @param title The title prefix to search for.
        * @return A range starting from the first entry starting with title
@@ -473,6 +505,15 @@ namespace zim
       cluster_index_type getClusterCount() const;
       offset_type getClusterOffset(cluster_index_type idx) const;
       entry_index_type getMainEntryIndex() const;
+
+      /** Get an entry using a path and a namespace.
+       *
+       * @param ns The namespace to search in
+       * @param path The entry's path (without namespace)
+       * @return The entry
+       * @exception EntryNotFound If no entry has been found.
+       */
+      Entry getEntryByPathWithNamespace(char ns, const std::string& path) const;
 #endif
 
     private:

diff --git a/src/archive.cpp b/src/archive.cpp
@@ -250,6 +250,15 @@ namespace zim
     throw EntryNotFound("Cannot find entry");
   }
 
+  Entry Archive::getEntryByPathWithNamespace(char ns, const std::string& path) const
+  {
+    auto r = m_impl->findx(ns, path);  // look up `path` within namespace `ns`
+    if (r.first) {  // `first` is tested as the "entry found" flag
+      return Entry(m_impl, entry_index_type(r.second));  // `second` is used as the index of the entry
+    }
+    throw EntryNotFound("Cannot find entry");  // no match: signal it with EntryNotFound
+  }
+
   Entry Archive::getEntryByTitle(entry_index_type idx) const
   {
     return Entry(m_impl, entry_index_type(m_impl->getIndexByTitle(title_index_t(idx))));

diff --git a/test/archive.cpp b/test/archive.cpp
@@ -192,7 +192,12 @@ TEST(ZimArchive, openCreatedArchive)
   auto titleMeta = archive.getMetadataItem("Title");
   ASSERT_EQ(std::string(titleMeta.getData()), "This is a title");
   ASSERT_EQ(titleMeta.getMimetype(), "text/plain;charset=utf-8");
+
+  auto titleMeta_with_ns = archive.getEntryByPathWithNamespace('M', "Title");
+  ASSERT_EQ(titleMeta.getIndex(), titleMeta_with_ns.getIndex());
+
   ASSERT_EQ(archive.getMetadata("Counter"), "text/html=2");
+
   auto illu48 = archive.getIllustrationItem(48);
   ASSERT_EQ(illu48.getPath(), "Illustration_48x48@1");
   ASSERT_EQ(std::string(illu48.getData()), "PNGBinaryContent48");
@@ -210,6 +215,9 @@ TEST(ZimArchive, openCreatedArchive)
   ASSERT_THROW(foo.getRedirectEntry(), zim::InvalidType);
   ASSERT_THROW(foo.getRedirectEntryIndex(), zim::InvalidType);
 
+  auto foo_with_ns = archive.getEntryByPathWithNamespace('C', "foo");
+  ASSERT_EQ(foo.getIndex(), foo_with_ns.getIndex());
+
   auto foo2 = archive.getEntryByPath("foo2");
   ASSERT_EQ(foo2.getPath(), "foo2");
   ASSERT_EQ(foo2.getTitle(), "AFoo");
@@ -227,6 +235,11 @@ TEST(ZimArchive, openCreatedArchive)
   ASSERT_EQ(main.getRedirectEntry().getIndex(), foo.getIndex());
   ASSERT_EQ(main.getRedirectEntryIndex(), foo.getIndex());
   ASSERT_EQ(archive.getMainEntryIndex(), main.getIndex());
+
+  // Non-existent entries
+  ASSERT_THROW(archive.getEntryByPath("non/existant/path"), zim::EntryNotFound);
+  ASSERT_THROW(archive.getEntryByPath("C/non/existant/path"), zim::EntryNotFound);
+  ASSERT_THROW(archive.getEntryByPathWithNamespace('C', "non/existant/path"), zim::EntryNotFound);
 }
 
 #if WITH_TEST_DATA