From 32267114fc4bcda4d9f63dee1b4cd88ab0471572 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Thu, 22 Feb 2024 11:32:08 +0100 Subject: [PATCH 1/8] Make FilePart constructors explicit. --- src/file_part.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/file_part.h b/src/file_part.h index 6362baf83..afa003092 100644 --- a/src/file_part.h +++ b/src/file_part.h @@ -39,13 +39,13 @@ class FilePart { using FDSharedPtr = std::shared_ptr; public: - FilePart(const std::string& filename) : + explicit FilePart(const std::string& filename) : m_filename(filename), m_fhandle(std::make_shared(FS::openFile(filename))), m_size(m_fhandle->getSize()) {} #ifndef _WIN32 - FilePart(int fd) : + explicit FilePart(int fd) : FilePart(getFilePathFromFD(fd)) {} #endif From a8aa2ea537b61fc37d1041e5a5236852b15d75af Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Thu, 22 Feb 2024 11:37:05 +0100 Subject: [PATCH 2/8] Introduce offset in FilePart. FilePart don't have to "map" the whole file on the FS. Only a section of the file may be a part of our archive. (Think about file parts being stored in a zip) --- src/file_part.h | 15 +++++++++++++++ src/file_reader.cpp | 25 +++++++++++++++---------- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/file_part.h b/src/file_part.h index afa003092..ae42ee962 100644 --- a/src/file_part.h +++ b/src/file_part.h @@ -32,6 +32,12 @@ namespace zim { +/** A part of file. + * + * `FilePart` references a part(section) of a physical file. + * Most of the time, `FilePart` will reference the whole file (m_offset==0 and m_size==m_fhandle->getSize()) + * but in some situation, it can reference only a part of the file (in case the content is store in a zip archive.) + */ class FilePart { typedef DEFAULTFS FS; @@ -42,11 +48,18 @@ class FilePart { explicit FilePart(const std::string& filename) : m_filename(filename), m_fhandle(std::make_shared(FS::openFile(filename))), + m_offset(0), m_size(m_fhandle->getSize()) {} #ifndef _WIN32 explicit FilePart(int fd) : FilePart(getFilePathFromFD(fd)) {} + + FilePart(int fd, offset_t offset, zsize_t size): + m_filename(getFilePathFromFD(fd)), + m_fhandle(std::make_shared(FS::openFile(m_filename))), + m_offset(offset), + m_size(size) {} #endif ~FilePart() = default; @@ -55,12 +68,14 @@ class FilePart { const FDSharedPtr& shareable_fhandle() const { return m_fhandle; }; zsize_t size() const { return m_size; }; + offset_t offset() const { return m_offset; } bool fail() const { return !m_size; }; bool good() const { return bool(m_size); }; private: const std::string m_filename; FDSharedPtr m_fhandle; + offset_t m_offset; zsize_t m_size; }; diff --git a/src/file_reader.cpp b/src/file_reader.cpp index cfabe335f..2fcf09661 100644 --- a/src/file_reader.cpp +++ b/src/file_reader.cpp @@ -66,11 +66,12 @@ char MultiPartFileReader::read(offset_t offset) const { offset += _offset; auto part_pair = source->locate(offset); auto& fhandle = part_pair->second->fhandle(); - offset_t local_offset = offset - part_pair->first.min; - ASSERT(local_offset, <=, part_pair->first.max); + offset_t logical_local_offset = offset - part_pair->first.min; + ASSERT(logical_local_offset, <=, part_pair->first.max); + offset_t physical_local_offset = logical_local_offset + part_pair->second->offset(); char ret; try { - fhandle.readAt(&ret, zsize_t(1), local_offset); + fhandle.readAt(&ret, zsize_t(1), physical_local_offset); } catch (std::runtime_error& e) { //Error while reading. std::ostringstream s; @@ -79,7 +80,8 @@ char MultiPartFileReader::read(offset_t offset) const { s << " - File part size is " << part_pair->second->size().v << "\n"; s << " - File part range is " << part_pair->first.min << "-" << part_pair->first.max << "\n"; s << " - Reading offset at " << offset.v << "\n"; - s << " - local offset is " << local_offset.v << "\n"; + s << " - logical local offset is " << logical_local_offset.v << "\n"; + s << " - physical local offset is " << physical_local_offset.v << "\n"; s << " - error is " << strerror(errno) << "\n"; std::error_code ec(errno, std::generic_category()); throw std::system_error(ec, s.str()); @@ -98,11 +100,12 @@ void MultiPartFileReader::read(char* dest, offset_t offset, zsize_t size) const for(auto current = found_range.first; current!=found_range.second; current++){ auto part = current->second; Range partRange = current->first; - offset_t local_offset = offset-partRange.min; + offset_t logical_local_offset = offset - partRange.min; ASSERT(size.v, >, 0U); - zsize_t size_to_get = zsize_t(std::min(size.v, part->size().v-local_offset.v)); + zsize_t size_to_get = zsize_t(std::min(size.v, part->size().v-logical_local_offset.v)); + offset_t physical_local_offset = logical_local_offset + part->offset(); try { - part->fhandle().readAt(dest, size_to_get, local_offset); + part->fhandle().readAt(dest, size_to_get, physical_local_offset); } catch (std::runtime_error& e) { std::ostringstream s; s << "Cannot read chars.\n"; @@ -112,7 +115,8 @@ void MultiPartFileReader::read(char* dest, offset_t offset, zsize_t size) const s << " - size_to_get is " << size_to_get.v << "\n"; s << " - total size is " << size.v << "\n"; s << " - Reading offset at " << offset.v << "\n"; - s << " - local offset is " << local_offset.v << "\n"; + s << " - logical local offset is " << logical_local_offset.v << "\n"; + s << " - physical local offset is " << physical_local_offset.v << "\n"; s << " - error is " << strerror(errno) << "\n"; std::error_code ec(errno, std::generic_category()); throw std::system_error(ec, s.str()); @@ -189,10 +193,11 @@ const Buffer MultiPartFileReader::get_buffer(offset_t offset, zsize_t size) cons // The range is in only one part auto range = found_range.first->first; auto part = found_range.first->second; - auto local_offset = offset + _offset - range.min; + auto logical_local_offset = offset + _offset - range.min; ASSERT(size, <=, part->size()); int fd = part->fhandle().getNativeHandle(); - return Buffer::makeBuffer(makeMmappedBuffer(fd, local_offset, size), size); + auto physical_local_offset = logical_local_offset + part->offset(); + return Buffer::makeBuffer(makeMmappedBuffer(fd, physical_local_offset, size), size); } catch(MMapException& e) #endif { From 93e9ccda3ea0c92165400f35df6fff67d2a86e01 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Thu, 22 Feb 2024 15:13:23 +0100 Subject: [PATCH 3/8] Remove archiveStartOffset from FileImpl. Has FilePart now have offsets, we can move the "offset concept" out of FileImpl. FileImpl always read from "offset" 0 in FileCompound. If it appears that we have an offset inside a file, it will be FilePart which will handle it. --- src/file_compound.cpp | 8 ++++++++ src/file_compound.h | 1 + src/fileimpl.cpp | 15 ++++----------- src/fileimpl.h | 2 -- src/item.cpp | 7 +++---- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/file_compound.cpp b/src/file_compound.cpp index a8f6bf1fa..59abd5ba6 100644 --- a/src/file_compound.cpp +++ b/src/file_compound.cpp @@ -20,6 +20,7 @@ #include "file_compound.h" #include "buffer.h" +#include "zim_types.h" #include #include @@ -77,6 +78,13 @@ FileCompound::FileCompound(int fd): { addPart(new FilePart(fd)); } + +FileCompound::FileCompound(int fd, offset_t offset, zsize_t size): + _filename(), + _fsize(0) +{ + addPart(new FilePart(fd, offset, size)); +} #endif FileCompound::~FileCompound() { diff --git a/src/file_compound.h b/src/file_compound.h index 1f56fd9f4..74526f0bb 100644 --- a/src/file_compound.h +++ b/src/file_compound.h @@ -60,6 +60,7 @@ class FileCompound : private std::map { #ifndef _WIN32 explicit FileCompound(int fd); + FileCompound(int fd, offset_t offset, zsize_t size); #endif ~FileCompound(); diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index 152f98aa0..7aa134d6b 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -67,17 +67,15 @@ sectionSubReader(const Reader& zimReader, const std::string& sectionName, } std::shared_ptr -makeFileReader(std::shared_ptr zimFile, offset_t offset, zsize_t size) +makeFileReader(std::shared_ptr zimFile) { if (zimFile->fail()) { return nullptr; } else if ( zimFile->is_multiPart() ) { - ASSERT(offset.v, ==, 0u); - ASSERT(size, ==, zimFile->fsize()); return std::make_shared(zimFile); } else { const auto& firstAndOnlyPart = zimFile->begin()->second; - return std::make_shared(firstAndOnlyPart->shareable_fhandle(), offset, size); + return std::make_shared(firstAndOnlyPart->shareable_fhandle(), firstAndOnlyPart->offset(), firstAndOnlyPart->size()); } } @@ -173,18 +171,13 @@ class Grouping {} FileImpl::FileImpl(int fd, offset_t offset, zsize_t size) - : FileImpl(std::make_shared(fd), offset, size) + : FileImpl(std::make_shared(fd, offset, size)) {} #endif FileImpl::FileImpl(std::shared_ptr _zimFile) - : FileImpl(_zimFile, offset_t(0), _zimFile->fsize()) - {} - - FileImpl::FileImpl(std::shared_ptr _zimFile, offset_t offset, zsize_t size) : zimFile(_zimFile), - archiveStartOffset(offset), - zimReader(makeFileReader(zimFile, offset, size)), + zimReader(makeFileReader(zimFile)), direntReader(new DirentReader(zimReader)), clusterCache(envValue("ZIM_CLUSTERCACHE", CLUSTER_CACHE_SIZE)), m_newNamespaceScheme(false), diff --git a/src/fileimpl.h b/src/fileimpl.h index cc85c46d6..e8ce762f2 100644 --- a/src/fileimpl.h +++ b/src/fileimpl.h @@ -47,7 +47,6 @@ namespace zim class FileImpl { std::shared_ptr zimFile; - offset_t archiveStartOffset; std::shared_ptr zimReader; std::shared_ptr direntReader; Fileheader header; @@ -107,7 +106,6 @@ namespace zim FileImpl(int fd, offset_t offset, zsize_t size); #endif - offset_t getArchiveStartOffset() const { return archiveStartOffset; } time_t getMTime() const; const std::string& getFilename() const { return zimFile->filename(); } diff --git a/src/item.cpp b/src/item.cpp index 931ba47e8..4ce4b990d 100644 --- a/src/item.cpp +++ b/src/item.cpp @@ -70,8 +70,6 @@ std::pair Item::getDirectAccessInformation() const auto full_offset = m_file->getBlobOffset(m_dirent->getClusterNumber(), m_dirent->getBlobNumber()); - full_offset += m_file->getArchiveStartOffset().v; - auto part_its = m_file->getFileParts(full_offset, zsize_t(getSize())); auto first_part = part_its.first; if (++part_its.first != part_its.second) { @@ -80,8 +78,9 @@ std::pair Item::getDirectAccessInformation() const } auto range = first_part->first; auto part = first_part->second; - const offset_type local_offset(full_offset - range.min); - return std::make_pair(part->filename(), local_offset); + const offset_type logical_local_offset(full_offset - range.min); + const auto physical_local_offset = logical_local_offset + part->offset().v; + return std::make_pair(part->filename(), physical_local_offset); } cluster_index_type Item::getClusterIndex() const From 036e3cae4b7f9a055e74eb73b3a03806dd9048c2 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Thu, 22 Feb 2024 11:39:31 +0100 Subject: [PATCH 4/8] Introduce new type `FdInput` --- include/zim/archive.h | 9 +++------ include/zim/zim.h | 14 ++++++++++++++ src/archive.cpp | 4 ++-- src/file_compound.cpp | 5 ++--- src/file_compound.h | 2 +- src/file_part.h | 14 ++++++++------ src/fileimpl.cpp | 4 ++-- src/fileimpl.h | 2 +- test/archive.cpp | 4 ++-- 9 files changed, 35 insertions(+), 23 deletions(-) diff --git a/include/zim/archive.h b/include/zim/archive.h index 0e44e6508..4e884e26f 100644 --- a/include/zim/archive.h +++ b/include/zim/archive.h @@ -111,13 +111,10 @@ namespace zim * * Note: This function is not available under Windows. * - * @param fd The descriptor of a seekable file with a continuous segment - * representing a complete ZIM archive. - * @param offset The offset of the ZIM archive relative to the beginning - * of the file (rather than the current position associated with fd). - * @param size The size of the ZIM archive. + * @param fd A FdInput (tuple) containing the fd (int), offset (offset_type) and size (size_type) + * referencing a continuous segment representing a complete ZIM archive. */ - Archive(int fd, offset_type offset, size_type size); + explicit Archive(FdInput fd); #endif /** Return the filename of the zim file. diff --git a/include/zim/zim.h b/include/zim/zim.h index 80e8596d2..4b27dc27a 100644 --- a/include/zim/zim.h +++ b/include/zim/zim.h @@ -58,6 +58,20 @@ namespace zim // An offset. typedef uint64_t offset_type; + struct FdInput { + // An open file descriptor + int fd; + + // The (absolute) offset of the data "pointed" by FdInput in fd. + offset_type offset; + + // The size (length) of the data "pointed" by FdInput + size_type size; + + FdInput(int fd, offset_type offset, size_type size): + fd(fd), offset(offset), size(size) {} + }; + enum class Compression { None = 1, diff --git a/src/archive.cpp b/src/archive.cpp index 1237baea7..be1b42b68 100644 --- a/src/archive.cpp +++ b/src/archive.cpp @@ -41,8 +41,8 @@ namespace zim : m_impl(new FileImpl(fd)) { } - Archive::Archive(int fd, offset_type offset, size_type size) - : m_impl(new FileImpl(fd, offset_t(offset), zsize_t(size))) + Archive::Archive(FdInput fd) + : m_impl(new FileImpl(fd)) { } #endif diff --git a/src/file_compound.cpp b/src/file_compound.cpp index 59abd5ba6..1ff2aeca9 100644 --- a/src/file_compound.cpp +++ b/src/file_compound.cpp @@ -20,7 +20,6 @@ #include "file_compound.h" #include "buffer.h" -#include "zim_types.h" #include #include @@ -79,11 +78,11 @@ FileCompound::FileCompound(int fd): addPart(new FilePart(fd)); } -FileCompound::FileCompound(int fd, offset_t offset, zsize_t size): +FileCompound::FileCompound(FdInput fd): _filename(), _fsize(0) { - addPart(new FilePart(fd, offset, size)); + addPart(new FilePart(fd)); } #endif diff --git a/src/file_compound.h b/src/file_compound.h index 74526f0bb..2ed1b17a0 100644 --- a/src/file_compound.h +++ b/src/file_compound.h @@ -60,7 +60,7 @@ class FileCompound : private std::map { #ifndef _WIN32 explicit FileCompound(int fd); - FileCompound(int fd, offset_t offset, zsize_t size); + explicit FileCompound(FdInput fd); #endif ~FileCompound(); diff --git a/src/file_part.h b/src/file_part.h index ae42ee962..cdd711033 100644 --- a/src/file_part.h +++ b/src/file_part.h @@ -36,7 +36,9 @@ namespace zim { * * `FilePart` references a part(section) of a physical file. * Most of the time, `FilePart` will reference the whole file (m_offset==0 and m_size==m_fhandle->getSize()) - * but in some situation, it can reference only a part of the file (in case the content is store in a zip archive.) + * but in some situation, it can reference only a part of the file: + * We have this case on android where the zim file is split in different part and stored in a "resource" (zip) archive + * using no-compression. */ class FilePart { typedef DEFAULTFS FS; @@ -55,11 +57,11 @@ class FilePart { explicit FilePart(int fd) : FilePart(getFilePathFromFD(fd)) {} - FilePart(int fd, offset_t offset, zsize_t size): - m_filename(getFilePathFromFD(fd)), + explicit FilePart(FdInput fdInput): + m_filename(getFilePathFromFD(fdInput.fd)), m_fhandle(std::make_shared(FS::openFile(m_filename))), - m_offset(offset), - m_size(size) {} + m_offset(fdInput.offset), + m_size(fdInput.size) {} #endif ~FilePart() = default; @@ -76,7 +78,7 @@ class FilePart { const std::string m_filename; FDSharedPtr m_fhandle; offset_t m_offset; - zsize_t m_size; + zsize_t m_size; // The total size of the (starting at m_offset) of the part }; }; diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index 7aa134d6b..baa5bf005 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -170,8 +170,8 @@ class Grouping : FileImpl(std::make_shared(fd)) {} - FileImpl::FileImpl(int fd, offset_t offset, zsize_t size) - : FileImpl(std::make_shared(fd, offset, size)) + FileImpl::FileImpl(FdInput fd) + : FileImpl(std::make_shared(fd)) {} #endif diff --git a/src/fileimpl.h b/src/fileimpl.h index e8ce762f2..76214129f 100644 --- a/src/fileimpl.h +++ b/src/fileimpl.h @@ -103,7 +103,7 @@ namespace zim explicit FileImpl(const std::string& fname); #ifndef _WIN32 explicit FileImpl(int fd); - FileImpl(int fd, offset_t offset, zsize_t size); + explicit FileImpl(FdInput fd); #endif time_t getMTime() const; diff --git a/test/archive.cpp b/test/archive.cpp index 838c4551b..74a1d85ed 100644 --- a/test/archive.cpp +++ b/test/archive.cpp @@ -624,7 +624,7 @@ TEST(ZimArchive, openZIMFileEmbeddedInAnotherFile) for(auto i=0UL; i < normalZims.size(); i++) { const zim::Archive archive1(normalZims[i].path); const int fd = OPEN_READ_ONLY(embeddedZims[i].path); - const zim::Archive archive2(fd, 8, archive1.getFilesize()); + const zim::Archive archive2(zim::FdInput(fd, 8, archive1.getFilesize())); checkEquivalence(archive1, archive2); } @@ -693,7 +693,7 @@ TEST(ZimArchive, getDirectAccessInformationFromEmbeddedArchive) for(auto i=0UL; i < normalZims.size(); i++) { const int fd = OPEN_READ_ONLY(embeddedZims[i].path); const auto size = zim::DEFAULTFS::openFile(normalZims[i].path).getSize(); - const zim::Archive archive(fd, 8, size.v); + const zim::Archive archive(zim::FdInput(fd, 8, size.v)); zim::entry_index_type checkedItemCount = 0; for ( auto entry : archive.iterEfficient() ) { if (!entry.isRedirect()) { From 49ac0d75c68702575ee7a73453702b9d33792a15 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Thu, 22 Feb 2024 15:18:14 +0100 Subject: [PATCH 5/8] Now Archive can be create from a vector of FdInput. --- include/zim/archive.h | 13 +++++++++++++ src/archive.cpp | 4 ++++ src/file_compound.cpp | 9 +++++++++ src/file_compound.h | 2 ++ src/fileimpl.cpp | 4 ++++ src/fileimpl.h | 1 + 6 files changed, 33 insertions(+) diff --git a/include/zim/archive.h b/include/zim/archive.h index 4e884e26f..80a27cf6b 100644 --- a/include/zim/archive.h +++ b/include/zim/archive.h @@ -115,6 +115,19 @@ namespace zim * referencing a continuous segment representing a complete ZIM archive. */ explicit Archive(FdInput fd); + + /** Archive constructor. + * + * Construct an archive from several file descriptors. + * Each part may be embedded in a file. + * Fds (int) can be the same between FdInput if the parts belong to the same file. + * + * Note: This function is not available under Windows. + * + * @param fds A vector of FdInput (tuple) containing the fd (int), offset (offset_type) and size (size_type) + * referencing a series of segments representing a complete ZIM archive. + */ + explicit Archive(const std::vector& fds); #endif /** Return the filename of the zim file. diff --git a/src/archive.cpp b/src/archive.cpp index be1b42b68..561dc6559 100644 --- a/src/archive.cpp +++ b/src/archive.cpp @@ -44,6 +44,10 @@ namespace zim Archive::Archive(FdInput fd) : m_impl(new FileImpl(fd)) { } + + Archive::Archive(const std::vector& fds) + : m_impl(new FileImpl(fds)) + { } #endif const std::string& Archive::getFilename() const diff --git a/src/file_compound.cpp b/src/file_compound.cpp index 1ff2aeca9..cc1799abd 100644 --- a/src/file_compound.cpp +++ b/src/file_compound.cpp @@ -84,6 +84,15 @@ FileCompound::FileCompound(FdInput fd): { addPart(new FilePart(fd)); } + +FileCompound::FileCompound(const std::vector& fds): + _filename(), + _fsize(0) +{ + for (auto& fd: fds) { + addPart(new FilePart(fd)); + } +} #endif FileCompound::~FileCompound() { diff --git a/src/file_compound.h b/src/file_compound.h index 2ed1b17a0..e11a03cff 100644 --- a/src/file_compound.h +++ b/src/file_compound.h @@ -25,6 +25,7 @@ #include "zim_types.h" #include "debug.h" #include +#include #include #include @@ -61,6 +62,7 @@ class FileCompound : private std::map { #ifndef _WIN32 explicit FileCompound(int fd); explicit FileCompound(FdInput fd); + explicit FileCompound(const std::vector& fds); #endif ~FileCompound(); diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index baa5bf005..994bcab7c 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -173,6 +173,10 @@ class Grouping FileImpl::FileImpl(FdInput fd) : FileImpl(std::make_shared(fd)) {} + + FileImpl::FileImpl(const std::vector& fds) + : FileImpl(std::make_shared(fds)) + {} #endif FileImpl::FileImpl(std::shared_ptr _zimFile) diff --git a/src/fileimpl.h b/src/fileimpl.h index 76214129f..f46aa8c6c 100644 --- a/src/fileimpl.h +++ b/src/fileimpl.h @@ -104,6 +104,7 @@ namespace zim #ifndef _WIN32 explicit FileImpl(int fd); explicit FileImpl(FdInput fd); + explicit FileImpl(const std::vector& fds); #endif time_t getMTime() const; From 9cf67baec995d83afaeefab352725bc27eaf702d Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Thu, 22 Feb 2024 15:18:26 +0100 Subject: [PATCH 6/8] Remove unused includes. --- src/_dirent.h | 2 -- src/buffer.cpp | 6 +----- src/buffer.h | 5 ----- src/bufferstreamer.h | 1 + src/cluster.cpp | 3 --- src/cluster.h | 3 --- src/compression.h | 8 +++++--- src/dirent.cpp | 2 -- src/dirent_accessor.h | 1 - src/dirent_lookup.h | 1 - src/endian_tools.h | 4 ++-- src/entry.cpp | 2 -- src/file_compound.cpp | 1 - src/file_compound.h | 2 -- src/file_part.h | 1 - src/fileheader.h | 2 -- src/fileimpl.cpp | 1 - src/fileimpl.h | 3 --- src/fs_unix.cpp | 1 - src/fs_unix.h | 2 -- src/fs_windows.h | 1 - src/istreamreader.h | 5 ++--- src/item.cpp | 3 ++- src/narrowdown.h | 1 - src/rawstreamreader.h | 1 - src/search.cpp | 2 -- src/suggestion.cpp | 1 - src/tools.cpp | 2 -- test/cluster.cpp | 2 +- test/creator.cpp | 1 + test/tools.h | 2 +- 31 files changed, 16 insertions(+), 56 deletions(-) diff --git a/src/_dirent.h b/src/_dirent.h index 907e9e122..91642049a 100644 --- a/src/_dirent.h +++ b/src/_dirent.h @@ -25,10 +25,8 @@ #include #include #include -#include #include "zim_types.h" -#include "debug.h" namespace zim { diff --git a/src/buffer.cpp b/src/buffer.cpp index 6cc789653..5f9132257 100644 --- a/src/buffer.cpp +++ b/src/buffer.cpp @@ -19,14 +19,10 @@ */ #include "buffer.h" +#include "debug.h" #include -#include -#include #include -#include -#include -#include #ifndef _WIN32 # include diff --git a/src/buffer.h b/src/buffer.h index b14e60919..339c0da5e 100644 --- a/src/buffer.h +++ b/src/buffer.h @@ -21,15 +21,10 @@ #ifndef ZIM_BUFFER_H_ #define ZIM_BUFFER_H_ -#include -#include #include -#include #include "config.h" #include "zim_types.h" -#include "endian_tools.h" -#include "debug.h" #include namespace zim { diff --git a/src/bufferstreamer.h b/src/bufferstreamer.h index ff447d97e..18f198221 100644 --- a/src/bufferstreamer.h +++ b/src/bufferstreamer.h @@ -21,6 +21,7 @@ #define ZIM_BUFFERSTREAMER_H #include "debug.h" +#include "endian_tools.h" #include diff --git a/src/cluster.cpp b/src/cluster.cpp index 7ac93e359..f0e384297 100644 --- a/src/cluster.cpp +++ b/src/cluster.cpp @@ -23,15 +23,12 @@ #include #include #include "buffer_reader.h" -#include "endian_tools.h" #include "bufferstreamer.h" #include "decoderstreamreader.h" #include "rawstreamreader.h" #include #include -#include -#include "compression.h" #include "log.h" #include "config.h" diff --git a/src/cluster.h b/src/cluster.h index e8c966285..6d2932d31 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -26,14 +26,11 @@ #include #include "buffer.h" #include "zim_types.h" -#include "file_reader.h" -#include #include #include #include #include "zim_types.h" -#include "zim/error.h" namespace zim { diff --git a/src/compression.h b/src/compression.h index c6e03e0bf..b8c898930 100644 --- a/src/compression.h +++ b/src/compression.h @@ -22,10 +22,8 @@ #ifndef _LIBZIM_COMPRESSION_ #define _LIBZIM_COMPRESSION_ -#include -#include "string.h" +#include "reader.h" -#include "file_reader.h" #include #include "config.h" @@ -36,6 +34,10 @@ #include "zim_types.h" #include "constants.h" +#include +#include +#include + //#define DEB(X) std::cerr << __func__ << " " << X << std::endl ; #define DEB(X) diff --git a/src/dirent.cpp b/src/dirent.cpp index 9c262121f..3316d36d2 100644 --- a/src/dirent.cpp +++ b/src/dirent.cpp @@ -24,9 +24,7 @@ #include #include "buffer.h" #include "bufferstreamer.h" -#include "endian_tools.h" #include "log.h" -#include #include log_define("zim.dirent") diff --git a/src/dirent_accessor.h b/src/dirent_accessor.h index 501e9b618..b9a3b3719 100644 --- a/src/dirent_accessor.h +++ b/src/dirent_accessor.h @@ -21,7 +21,6 @@ #define ZIM_DIRENT_ACCESSOR_H #include "zim_types.h" -#include "debug.h" #include "lrucache.h" #include diff --git a/src/dirent_lookup.h b/src/dirent_lookup.h index ffb76efaa..1a5215615 100644 --- a/src/dirent_lookup.h +++ b/src/dirent_lookup.h @@ -27,7 +27,6 @@ #include #include #include -#include #include namespace zim diff --git a/src/endian_tools.h b/src/endian_tools.h index e51a58c99..fce33ba41 100644 --- a/src/endian_tools.h +++ b/src/endian_tools.h @@ -21,10 +21,10 @@ #ifndef ENDIAN_H #define ENDIAN_H -#include -#include #include +#include + namespace zim { diff --git a/src/entry.cpp b/src/entry.cpp index e28b8788b..4d3e14a34 100644 --- a/src/entry.cpp +++ b/src/entry.cpp @@ -21,9 +21,7 @@ #include #include #include -#include "_dirent.h" #include "fileimpl.h" -#include "file_part.h" #include "log.h" #include diff --git a/src/file_compound.cpp b/src/file_compound.cpp index cc1799abd..c3fcd8262 100644 --- a/src/file_compound.cpp +++ b/src/file_compound.cpp @@ -19,7 +19,6 @@ */ #include "file_compound.h" -#include "buffer.h" #include #include diff --git a/src/file_compound.h b/src/file_compound.h index e11a03cff..05fc13c38 100644 --- a/src/file_compound.h +++ b/src/file_compound.h @@ -26,8 +26,6 @@ #include "debug.h" #include #include -#include -#include namespace zim { diff --git a/src/file_part.h b/src/file_part.h index cdd711033..c2733423f 100644 --- a/src/file_part.h +++ b/src/file_part.h @@ -22,7 +22,6 @@ #define ZIM_FILE_PART_H_ #include -#include #include #include diff --git a/src/fileheader.h b/src/fileheader.h index 95be6917b..468800e65 100644 --- a/src/fileheader.h +++ b/src/fileheader.h @@ -21,10 +21,8 @@ #ifndef ZIM_FILEHEADER_H #define ZIM_FILEHEADER_H -#include #include #include -#include #include // max may be defined as a macro by window includes diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index 994bcab7c..7882d381b 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/src/fileimpl.h b/src/fileimpl.h index f46aa8c6c..b07d2c7f1 100644 --- a/src/fileimpl.h +++ b/src/fileimpl.h @@ -24,17 +24,14 @@ #include #include -#include #include #include #include -#include "lrucache.h" #include "concurrent_cache.h" #include "_dirent.h" #include "dirent_accessor.h" #include "dirent_lookup.h" #include "cluster.h" -#include "buffer.h" #include "file_reader.h" #include "file_compound.h" #include "fileheader.h" diff --git a/src/fs_unix.cpp b/src/fs_unix.cpp index 6ac076750..872393d3e 100644 --- a/src/fs_unix.cpp +++ b/src/fs_unix.cpp @@ -19,7 +19,6 @@ #include "fs_unix.h" #include -#include #include #include diff --git a/src/fs_unix.h b/src/fs_unix.h index 51aab05ac..f45826aa2 100644 --- a/src/fs_unix.h +++ b/src/fs_unix.h @@ -22,8 +22,6 @@ #include "zim_types.h" -#include - #include #include #include diff --git a/src/fs_windows.h b/src/fs_windows.h index 9e4ae07f1..25521545f 100644 --- a/src/fs_windows.h +++ b/src/fs_windows.h @@ -22,7 +22,6 @@ #include "zim_types.h" -#include #include typedef void* HANDLE; diff --git a/src/istreamreader.h b/src/istreamreader.h index 4255d3f33..0d55186c6 100644 --- a/src/istreamreader.h +++ b/src/istreamreader.h @@ -21,12 +21,11 @@ #ifndef ZIM_IDATASTREAM_H #define ZIM_IDATASTREAM_H -#include -#include - #include "endian_tools.h" #include "reader.h" +#include + namespace zim { diff --git a/src/item.cpp b/src/item.cpp index 4ce4b990d..ff70828ff 100644 --- a/src/item.cpp +++ b/src/item.cpp @@ -22,9 +22,10 @@ #include #include "cluster.h" #include "fileimpl.h" -#include "file_part.h" #include "log.h" +#include + log_define("zim.item") using namespace zim; diff --git a/src/narrowdown.h b/src/narrowdown.h index 361a07840..6ce4caf73 100644 --- a/src/narrowdown.h +++ b/src/narrowdown.h @@ -21,7 +21,6 @@ #ifndef ZIM_NARROWDOWN_H #define ZIM_NARROWDOWN_H -#include "zim_types.h" #include "debug.h" #include diff --git a/src/rawstreamreader.h b/src/rawstreamreader.h index 43596fcc7..504f67e63 100644 --- a/src/rawstreamreader.h +++ b/src/rawstreamreader.h @@ -23,7 +23,6 @@ #include "istreamreader.h" #include "reader.h" -#include "debug.h" namespace zim { diff --git a/src/search.cpp b/src/search.cpp index d0e9e710f..cac358531 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -27,7 +27,6 @@ #include #include "fileimpl.h" #include "search_internal.h" -#include "fs.h" #include "tools.h" #include @@ -39,7 +38,6 @@ #else # include #endif -#include #include "xapian.h" #include diff --git a/src/suggestion.cpp b/src/suggestion.cpp index 8ed711714..33305c662 100644 --- a/src/suggestion.cpp +++ b/src/suggestion.cpp @@ -22,7 +22,6 @@ #include #include #include "suggestion_internal.h" -#include #include "fileimpl.h" #include "tools.h" #include "constants.h" diff --git a/src/tools.cpp b/src/tools.cpp index 58894f384..17ef33697 100644 --- a/src/tools.cpp +++ b/src/tools.cpp @@ -28,11 +28,9 @@ #include #include #include -#include #include #include #include -#include #include #ifdef _WIN32 diff --git a/test/cluster.cpp b/test/cluster.cpp index bc5993044..f97b83aa0 100644 --- a/test/cluster.cpp +++ b/test/cluster.cpp @@ -49,7 +49,7 @@ #include "../src/buffer.h" #include "../src/cluster.h" -#include "../src/file_part.h" +#include "../src/file_reader.h" #include "../src/file_compound.h" #include "../src/buffer_reader.h" #include "../src/writer/cluster.h" diff --git a/test/creator.cpp b/test/creator.cpp index db13e8c12..b884dce83 100644 --- a/test/creator.cpp +++ b/test/creator.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "tools.h" #include "../src/file_compound.h" diff --git a/test/tools.h b/test/tools.h index 5ca1e2398..e65a1afed 100644 --- a/test/tools.h +++ b/test/tools.h @@ -22,7 +22,7 @@ #include -#include +#include #include #ifdef _WIN32 #include From 0c5d552d5a21f5944837a1e84ab8cd4807180aeb Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 23 Feb 2024 11:05:35 +0100 Subject: [PATCH 7/8] Add test on multi part embedded file. --- scripts/download_test_data.py | 2 +- test/archive.cpp | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/scripts/download_test_data.py b/scripts/download_test_data.py index d320fb392..90eb509ae 100755 --- a/scripts/download_test_data.py +++ b/scripts/download_test_data.py @@ -26,7 +26,7 @@ import tarfile import sys -TEST_DATA_VERSION = "0.3" +TEST_DATA_VERSION = "0.5" ARCHIVE_URL_TEMPL = "https://github.com/openzim/zim-testing-suite/releases/download/v{version}/zim-testing-suite-{version}.tar.gz" if __name__ == "__main__": diff --git a/test/archive.cpp b/test/archive.cpp index 74a1d85ed..12108058f 100644 --- a/test/archive.cpp +++ b/test/archive.cpp @@ -629,6 +629,33 @@ TEST(ZimArchive, openZIMFileEmbeddedInAnotherFile) checkEquivalence(archive1, archive2); } } + +TEST(ZimArchive, openZIMFileMultiPartEmbeddedInAnotherFile) +{ + auto normalZims = getDataFilePath("small.zim"); + auto embeddedZims = getDataFilePath("small.zim.embedded.multi"); + + ASSERT_EQ(normalZims.size(), embeddedZims.size()) << "We must have same number of zim files. (This is a test data issue)"; + for(auto i=0UL; i < normalZims.size(); i++) { + const zim::Archive archive1(normalZims[i].path); + auto archive_size = archive1.getFilesize(); + + std::vector fds; + zim::offset_type start_offset = std::string("BEGINZIMMULTIPART").size(); + while (archive_size > 2048) { + int fd = OPEN_READ_ONLY(embeddedZims[i].path); + fds.push_back(zim::FdInput(fd, start_offset, 2048)); + start_offset += 2048 + std::string("NEWSECTIONZIMMULTI").size(); + archive_size -= 2048; + } + int fd = OPEN_READ_ONLY(embeddedZims[i].path); + fds.push_back(zim::FdInput(fd, start_offset, archive_size)); + + const zim::Archive archive2(fds); + + checkEquivalence(archive1, archive2); + } +} #endif // not _WIN32 #endif // WITH_TEST_DATA From 57fa6f1e1af6999821a86c0b651565e12ed70e1b Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 23 Feb 2024 11:19:59 +0100 Subject: [PATCH 8/8] Be explicit that fd are used only a Archive creation. Close opened file descriptor in test before checking equivalence. --- include/zim/archive.h | 6 ++++++ test/archive.cpp | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/include/zim/archive.h b/include/zim/archive.h index 80a27cf6b..666e0f931 100644 --- a/include/zim/archive.h +++ b/include/zim/archive.h @@ -97,6 +97,8 @@ namespace zim /** Archive constructor. * * Construct an archive from a file descriptor. + * Fd is used only at Archive creation. + * Ownership of the fd is not taken and it must be closed by caller. * * Note: This function is not available under Windows. * @@ -108,6 +110,8 @@ namespace zim * * Construct an archive from a descriptor of a file with an embedded ZIM * archive inside. + * Fd is used only at Archive creation. + * Ownership of the fd is not taken and it must be closed by caller. * * Note: This function is not available under Windows. * @@ -120,6 +124,8 @@ namespace zim * * Construct an archive from several file descriptors. * Each part may be embedded in a file. + * Fds are used only at Archive creation. + * Ownership of the fds is not taken and they must be closed by caller. * Fds (int) can be the same between FdInput if the parts belong to the same file. * * Note: This function is not available under Windows. diff --git a/test/archive.cpp b/test/archive.cpp index 12108058f..5a1d57d03 100644 --- a/test/archive.cpp +++ b/test/archive.cpp @@ -625,6 +625,7 @@ TEST(ZimArchive, openZIMFileEmbeddedInAnotherFile) const zim::Archive archive1(normalZims[i].path); const int fd = OPEN_READ_ONLY(embeddedZims[i].path); const zim::Archive archive2(zim::FdInput(fd, 8, archive1.getFilesize())); + close(fd); checkEquivalence(archive1, archive2); } @@ -653,6 +654,10 @@ TEST(ZimArchive, openZIMFileMultiPartEmbeddedInAnotherFile) const zim::Archive archive2(fds); + for(auto &fd: fds) { + close(fd.fd); + } + checkEquivalence(archive1, archive2); } }