Skip to content

Commit

Permalink
feat: support case-insensitive lookups (fixes gh #232)
Browse files Browse the repository at this point in the history
  • Loading branch information
mhx committed Nov 18, 2024
1 parent 4551ec4 commit 94f8b4c
Show file tree
Hide file tree
Showing 10 changed files with 300 additions and 15 deletions.
9 changes: 8 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ set(PARALLEL_HASHMAP_REQUIRED_VERSION 1.3.8)
set(PARALLEL_HASHMAP_PREFERRED_VERSION 1.3.12)

set(BOOST_REQUIRED_VERSION 1.67.0)
set(LIBICUUC_REQUIRED_VERSION 70.0)
set(LIBCRYPTO_REQUIRED_VERSION 3.0.0)
set(LIBLZ4_REQUIRED_VERSION 1.9.3)
set(LIBLZMA_REQUIRED_VERSION 5.2.5)
Expand Down Expand Up @@ -190,8 +191,13 @@ if(WITH_LIBDWARFS)
OPTIONAL_COMPONENTS process)

if(APPLE)
find_program(HOMEBREW_EXE brew)
execute_process(
COMMAND ${HOMEBREW_EXE} --prefix icu4c
OUTPUT_VARIABLE LIBICU4C_PREFIX
OUTPUT_STRIP_TRAILING_WHITESPACE)
list(PREPEND CMAKE_PREFIX_PATH ${LIBICU4C_PREFIX})
if(USE_HOMEBREW_LIBARCHIVE)
find_program(HOMEBREW_EXE brew)
execute_process(
COMMAND ${HOMEBREW_EXE} --prefix libarchive
OUTPUT_VARIABLE LIBARCHIVE_PREFIX
Expand All @@ -204,6 +210,7 @@ if(WITH_LIBDWARFS)
find_package(cpptrace REQUIRED CONFIG)
endif()

pkg_check_modules(LIBICUUC REQUIRED IMPORTED_TARGET icu-uc>=${LIBICUUC_REQUIRED_VERSION})
pkg_check_modules(LIBCRYPTO REQUIRED IMPORTED_TARGET libcrypto>=${LIBCRYPTO_REQUIRED_VERSION})
pkg_check_modules(LIBARCHIVE REQUIRED IMPORTED_TARGET libarchive>=${LIBARCHIVE_REQUIRED_VERSION})
pkg_check_modules(XXHASH REQUIRED IMPORTED_TARGET libxxhash>=${XXHASH_REQUIRED_VERSION})
Expand Down
16 changes: 12 additions & 4 deletions cmake/dwarfs-config.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,20 @@ set(DWARFS_PREFIX_DIR "${PACKAGE_PREFIX_DIR}")

find_package(PkgConfig)

if(@APPLE@ AND @USE_HOMEBREW_LIBARCHIVE@) # APPLE AND USE_HOMEBREW_LIBARCHIVE
if(@APPLE@) # APPLE
find_program(HOMEBREW_EXE brew)
execute_process(
COMMAND ${HOMEBREW_EXE} --prefix libarchive
OUTPUT_VARIABLE LIBARCHIVE_PREFIX
COMMAND ${HOMEBREW_EXE} --prefix icu4c
OUTPUT_VARIABLE LIBICU4C_PREFIX
OUTPUT_STRIP_TRAILING_WHITESPACE)
list(PREPEND CMAKE_PREFIX_PATH ${LIBARCHIVE_PREFIX})
list(PREPEND CMAKE_PREFIX_PATH ${LIBICU4C_PREFIX})
if(@USE_HOMEBREW_LIBARCHIVE@) # USE_HOMEBREW_LIBARCHIVE
execute_process(
COMMAND ${HOMEBREW_EXE} --prefix libarchive
OUTPUT_VARIABLE LIBARCHIVE_PREFIX
OUTPUT_STRIP_TRAILING_WHITESPACE)
list(PREPEND CMAKE_PREFIX_PATH ${LIBARCHIVE_PREFIX})
endif()
endif()

find_dependency(Threads REQUIRED)
Expand All @@ -24,6 +31,7 @@ find_dependency(gflags CONFIG REQUIRED)
find_dependency(Boost @BOOST_REQUIRED_VERSION@ REQUIRED CONFIG
COMPONENTS chrono context filesystem iostreams program_options regex system thread
OPTIONAL_COMPONENTS process)
pkg_check_modules(LIBICUUC REQUIRED IMPORTED_TARGET icu-uc>=@LIBICUUC_REQUIRED_VERSION@)
pkg_check_modules(LIBCRYPTO REQUIRED IMPORTED_TARGET libcrypto>=@LIBCRYPTO_REQUIRED_VERSION@)
pkg_check_modules(LIBARCHIVE REQUIRED IMPORTED_TARGET libarchive>=@LIBARCHIVE_REQUIRED_VERSION@)
pkg_check_modules(XXHASH REQUIRED IMPORTED_TARGET libxxhash>=@XXHASH_REQUIRED_VERSION@)
Expand Down
2 changes: 1 addition & 1 deletion cmake/libdwarfs.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ add_cpp2_thrift_library(thrift/features.thrift
TARGET dwarfs_features_thrift OUTPUT_PATH dwarfs)

target_link_libraries(dwarfs_common PRIVATE dwarfs_folly_lite PkgConfig::LIBCRYPTO PkgConfig::XXHASH PkgConfig::ZSTD)
target_link_libraries(dwarfs_reader PUBLIC dwarfs_common)
target_link_libraries(dwarfs_reader PUBLIC dwarfs_common PkgConfig::LIBICUUC)
target_link_libraries(dwarfs_writer PUBLIC dwarfs_common PkgConfig::ZSTD)
target_link_libraries(dwarfs_extractor PUBLIC dwarfs_reader)
target_link_libraries(dwarfs_rewrite PUBLIC dwarfs_reader dwarfs_writer)
Expand Down
11 changes: 11 additions & 0 deletions doc/dwarfs.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,17 @@ options:
overlays and want the file system to reflect its read-only
state, you can set this option.

- `-o case_insensitive`:
Perform case-insensitive lookups in the mounted file system,
i.e. an entry originally named `ReadMe.txt` can be accessed as
`readme.txt`, `README.TXT`, or `rEaDmE.tXt`. This works across
all platforms. When mounting a file system with many files, this
may be slightly slower and consume slightly more memory as
case-insensitive lookup requires an additional mapping table that is
built on-demand. Note that this is not supported if the file
system contains directories with entries that only differ in
case.

- `-o (no_)cache_image`:
By default, `dwarfs` tries to ensure that the compressed file
system image will not be cached by the kernel (i.e. the default
Expand Down
1 change: 1 addition & 0 deletions include/dwarfs/reader/metadata_options.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ struct metadata_options {
bool enable_nlink{false};
bool readonly{false};
bool check_consistency{false};
bool case_insensitive_lookup{false};
size_t block_size{512};
std::optional<file_stat::uid_type> fs_uid{};
std::optional<file_stat::gid_type> fs_gid{};
Expand Down
97 changes: 88 additions & 9 deletions src/reader/internal/metadata_v2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@

#include <range/v3/view/enumerate.hpp>

#include <unicode/unistr.h>

#include <dwarfs/error.h>
#include <dwarfs/file_stat.h>
#include <dwarfs/fstypes.h>
Expand Down Expand Up @@ -416,7 +418,7 @@ class metadata_ final : public metadata_v2::impl {
, symlinks_(meta_.compact_symlinks()
? string_table(lgr, "symlinks", *meta_.compact_symlinks())
: string_table(meta_.symlinks()))
// clang-format off
, dir_icase_cache_{build_dir_icase_cache()} // clang-format off
PERFMON_CLS_PROXY_INIT(perfmon, "metadata_v2")
PERFMON_CLS_TIMER_INIT(find)
PERFMON_CLS_TIMER_INIT(getattr)
Expand Down Expand Up @@ -921,6 +923,54 @@ class metadata_ final : public metadata_v2::impl {
return packed_nlinks;
}

/**
 * Normalize a UTF-8 string into a key for case-insensitive comparison.
 *
 * The string is decoded from UTF-8, Unicode default case folding is applied,
 * and the result is re-encoded as UTF-8.
 *
 * Case folding is used rather than `UnicodeString::toLower()` because the
 * argument-less `toLower()` follows ICU's *default locale*. Under e.g. a
 * Turkish locale, `I` would be mapped to dotless `ı`, so a lookup for `FILE`
 * would no longer match an entry named `file`. Case folding is
 * locale-independent, so the same names always compare equal regardless of
 * the environment the process runs in.
 *
 * The returned string is intended purely as a comparison / sort key. It is
 * lowercase for the vast majority of scripts, but case folding does not
 * guarantee that for every code point, so it should not be shown to users
 * as "the lowercase name".
 *
 * \param str  UTF-8 encoded input (taken by value; its storage is reused
 *             for the result)
 * \return     UTF-8 encoded, case-folded copy of `str`
 */
static std::string utf8_to_lower(std::string str) {
  auto ustr = icu::UnicodeString::fromUTF8(str);
  // Locale-independent full case folding (U_FOLD_CASE_DEFAULT).
  ustr.foldCase();
  // toUTF8String() appends, so the input must be cleared first.
  str.clear();
  ustr.toUTF8String(str);
  return str;
}

/**
 * Build the per-directory lookup tables used for case-insensitive `find()`.
 *
 * When `options_.case_insensitive_lookup` is disabled, an empty vector is
 * returned. Otherwise one slot is appended per directory inode in
 * `[0, meta_.directories().size() - 1)`. For each directory, the entry
 * positions are sorted by the `utf8_to_lower()`-normalized entry name:
 *
 *  - if that order equals the stored order, the slot is left
 *    default-constructed (no extra table needed);
 *  - otherwise the slot holds the permutation mapping "position in
 *    normalized order" to "position in the directory's entry range".
 *
 * A timed debug message reports how many directories needed a table and the
 * accumulated table size.
 */
std::vector<packed_int_vector<uint32_t>> build_dir_icase_cache() const {
  std::vector<packed_int_vector<uint32_t>> result;

  if (!options_.case_insensitive_lookup) {
    return result;
  }

  auto td = LOG_TIMED_DEBUG;
  size_t cached_dir_count = 0;
  size_t cached_bytes = 0;

  result.reserve(meta_.directories().size());

  // NOTE(review): the `- 1` presumably skips a trailing sentinel directory
  // record -- confirm against the metadata layout.
  for (uint32_t inode = 0; inode < meta_.directories().size() - 1; ++inode) {
    auto& order = result.emplace_back();
    directory_view dir{inode, global_};
    auto range = dir.entry_range();

    // Normalized name for every entry, indexed by position within `range`.
    std::vector<std::string> keys;
    keys.reserve(range.size());
    for (auto ix : range) {
      keys.push_back(utf8_to_lower(dir_entry_view_impl::name(ix, global_)));
    }

    // Positions 0..n-1, reordered by normalized name.
    std::vector<uint32_t> perm(range.size());
    std::iota(perm.begin(), perm.end(), 0);
    std::sort(perm.begin(), perm.end(), [&keys](auto lhs, auto rhs) {
      return keys[lhs] < keys[rhs];
    });

    // Identity permutation: the stored order can be searched directly.
    if (std::is_sorted(perm.begin(), perm.end())) {
      continue;
    }

    order.reset(std::bit_width(perm.size()), perm.size());
    for (size_t pos = 0; pos < perm.size(); ++pos) {
      order.set(pos, perm[pos]);
    }

    ++cached_dir_count;
    cached_bytes += order.size_in_bytes();
  }

  td << "built case-insensitive directory cache for " << cached_dir_count
     << " directories (" << size_with_unit(cached_bytes) << ")";

  return result;
}

size_t total_file_entries() const {
return (dev_inode_offset_ - file_inode_offset_) +
(meta_.dir_entries()
Expand All @@ -944,6 +994,7 @@ class metadata_ final : public metadata_v2::impl {
const int unique_files_;
const metadata_options options_;
const string_table symlinks_;
std::vector<packed_int_vector<uint32_t>> const dir_icase_cache_;
PERFMON_CLS_PROXY_DECL
PERFMON_CLS_TIMER_DECL(find)
PERFMON_CLS_TIMER_DECL(getattr)
Expand Down Expand Up @@ -1695,15 +1746,43 @@ metadata_<LoggerPolicy>::find(directory_view dir, std::string_view name) const {

auto range = dir.entry_range();

auto it = std::lower_bound(
range.begin(), range.end(), name, [&](auto ix, std::string_view name) {
return internal::dir_entry_view_impl::name(ix, global_) < name;
});
if (options_.case_insensitive_lookup) {
auto const& cache = dir_icase_cache_[dir.inode()];
auto ixr = boost::irange<uint32_t>(0, range.size());
auto key = utf8_to_lower(std::string(name));

auto it = std::lower_bound(
ixr.begin(), ixr.end(), key, [&](auto ix, std::string const& key) {
if (!cache.empty()) {
ix = cache[ix];
}
return utf8_to_lower(internal::dir_entry_view_impl::name(
range[ix], global_)) < key;
});

if (it != ixr.end()) {
auto ix = *it;
if (!cache.empty()) {
ix = cache[ix];
}
ix = range[ix];
if (utf8_to_lower(internal::dir_entry_view_impl::name(ix, global_)) ==
key) {
return dir_entry_view{dir_entry_view_impl::from_dir_entry_index_shared(
ix, global_.self_dir_entry(dir.inode()), global_)};
}
}
} else {
auto it = std::lower_bound(
range.begin(), range.end(), name, [&](auto ix, std::string_view name) {
return internal::dir_entry_view_impl::name(ix, global_) < name;
});

if (it != range.end()) {
if (internal::dir_entry_view_impl::name(*it, global_) == name) {
return dir_entry_view{dir_entry_view_impl::from_dir_entry_index_shared(
*it, global_.self_dir_entry(dir.inode()), global_)};
if (it != range.end()) {
if (internal::dir_entry_view_impl::name(*it, global_) == name) {
return dir_entry_view{dir_entry_view_impl::from_dir_entry_index_shared(
*it, global_.self_dir_entry(dir.inode()), global_)};
}
}
}

Expand Down
163 changes: 163 additions & 0 deletions test/dwarfs_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2054,3 +2054,166 @@ TEST(filesystem, multi_image) {
EXPECT_EQ("baz", fs.read_string(fs.open(baz->inode())));
}
}

// Verifies `metadata_options::case_insensitive_lookup`:
//  - with the option off, only exact-case paths resolve;
//  - with the option on, both exact-case and differently-cased paths resolve
//    (and files yield the expected contents), while paths that differ by
//    more than case still fail to resolve.
TEST(filesystem, case_insensitive_lookup) {
  auto input = std::make_shared<test::os_access_mock>();

  // Source tree: a set of mixed-case, non-ASCII directory names plus a few
  // files, most of them inside one of those directories.
  input->add_dir("");
  input->add_dir(u8"hEllÖwÖrLD");
  input->add_dir(u8"FÜñKÿStrÍñg");
  input->add_dir(u8"unícødérøcks");
  input->add_dir(u8"JÄLAPEÑOPEPPÉR");
  input->add_dir(u8"SpIcYsÜsHiRoLL");
  input->add_dir(u8"CAFÉMØCHAlatte");
  input->add_dir(u8"ČhàŧGƤŦ");
  input->add_dir(u8"lõREMÏpSüM");
  input->add_dir(u8"ŠåmpŁËŠTrInG");
  input->add_dir(u8"pythonprogramming");
  input->add_dir(u8"DÃTâScïÊNcË");
  input->add_dir(u8"AIISFÛTÛRË");
  input->add_dir(u8"readability");
  input->add_file(u8"TëStCãSeSçÉNâRïÖ", "testcasescenario");
  input->add_file(u8"lõREMÏpSüM/ÆSTHETÎCcøding", "aestheticcoding");
  input->add_file(u8"lõREMÏpSüM/smîLëyFÀÇë😊", "smileyface");
  input->add_file(u8"lõREMÏpSüM/NØRTHèast", "northeast");
  input->add_file(u8"lõREMÏpSüM/SPACEadventure", "spaceadventure");
  input->add_file(u8"lõREMÏpSüM/cõMPLEXïTy🚀", "complexity");
  input->add_file(u8"lõREMÏpSüM/thisisatest", "thisisatest");

  // Paths spelled exactly as they were added above.
  std::vector<std::u8string> case_sensitive_dirs{
      u8"/hEllÖwÖrLD",       u8"/FÜñKÿStrÍñg",    u8"/unícødérøcks",
      u8"/JÄLAPEÑOPEPPÉR",   u8"/SpIcYsÜsHiRoLL", u8"/CAFÉMØCHAlatte",
      u8"/ČhàŧGƤŦ",          u8"/lõREMÏpSüM",     u8"/ŠåmpŁËŠTrInG",
      u8"/pythonprogramming", u8"/DÃTâScïÊNcË",    u8"/AIISFÛTÛRË",
      u8"/readability",
  };

  std::vector<std::pair<std::u8string, std::string>> case_sensitive_files{
      {u8"/TëStCãSeSçÉNâRïÖ", "testcasescenario"},
      {u8"/lõREMÏpSüM/ÆSTHETÎCcøding", "aestheticcoding"},
      {u8"/lõREMÏpSüM/smîLëyFÀÇë😊", "smileyface"},
      {u8"/lõREMÏpSüM/NØRTHèast", "northeast"},
      {u8"/lõREMÏpSüM/SPACEadventure", "spaceadventure"},
      {u8"/lõREMÏpSüM/cõMPLEXïTy🚀", "complexity"},
      {u8"/lõREMÏpSüM/thisisatest", "thisisatest"},
  };

  // The same paths, differing from the originals only in letter case.
  std::vector<std::u8string> case_insensitive_dirs{
      u8"/HELlÖwÖRLD",       u8"/FÜÑKÿSTríÑg",    u8"/uNÍcødéRøcks",
      u8"/JÄLApeñOPePPÉR",   u8"/SpiCysÜshiRoLL", u8"/CAféMØchAlatte",
      u8"/čhàŧgƥŧ",          u8"/lõremÏpsüM",     u8"/šåmpŁëšTrInG",
      u8"/pyTHonproGRamming", u8"/DãtÂScïêNcË",    u8"/AiisFÛTÛRË",
      u8"/reADabiLIty",
  };

  std::vector<std::pair<std::u8string, std::string>> case_insensitive_files{
      {u8"/TësTcãSeSçéNâRïÖ", "testcasescenario"},
      {u8"/lõRemïpSüM/ÆstHETÎCcØDing", "aestheticcoding"},
      {u8"/lõremïPSüM/smîlËYfàÇë😊", "smileyface"},
      {u8"/lõREMÏPsÜM/NØRthÈAst", "northeast"},
      {u8"/lõRemïPsüM/SPACEadvENTure", "spaceadventure"},
      {u8"/LÕREMÏpSüM/CõMPlexïTy🚀", "complexity"},
      {u8"/lõrEMÏpSüM/thiSISatest", "thisisatest"},
  };

  // Near misses: each differs from a real path by an added, removed or
  // replaced character, so it must not resolve in either mode.
  std::vector<std::u8string> non_matching_entries{
      u8"/HELlÖwÖRLDx",
      u8"/FÜÑKÿSTríÑj",
      u8"/uNÍcødéRcks",
      u8"/JÄLApeñOPePPÉ",
      u8"/SpiCysÜshiRoLLx",
      u8"/CAféMØchAltte",
      u8"/čhàŧgƥŧx",
      u8"/lõremÏpsü",
      u8"/šåmpŁëšTrnG",
      u8"/pyTHonproGRammin",
      u8"/DãtÂScïêNcËx",
      u8"/AiisFÛTÛTË",
      u8"/reADabiLItx",
      u8"/TësRcãSeSçéNâRïÖ",
      u8"/lõRemïpüM/ÆstHETÎCcØDing",
      u8"/lõremïPSüM/mîlËYfàÇë😊",
      u8"/lõRMÏPsÜM/NØRthÈAst",
      u8"/lõRemïPsüM/SPACEadvENTurex",
      u8"/LÕREMÏpSüM/CõMPexïTy🚀",
      u8"/lõrEMÏpSüM/thiSISatesy",
  };

  test::test_logger lgr;
  auto fsimage = build_dwarfs(lgr, input, "null");

  auto mm = std::make_shared<test::mmap_mock>(std::move(fsimage));

  // Case-sensitive mode: only the exact spellings resolve.
  {
    reader::filesystem_v2 fs(lgr, *input, mm,
                             {.metadata = {.case_insensitive_lookup = false}});

    for (auto const& dir : case_sensitive_dirs) {
      auto name = u8string_to_string(dir);
      auto dev = fs.find(name);
      EXPECT_TRUE(dev) << name;
    }

    for (auto const& [file, content] : case_sensitive_files) {
      auto name = u8string_to_string(file);
      auto dev = fs.find(name);
      // Fatal assertion: `dev` is dereferenced on the next line.
      ASSERT_TRUE(dev) << name;
      EXPECT_EQ(content, fs.read_string(fs.open(dev->inode()))) << name;
    }

    for (auto const& dir : case_insensitive_dirs) {
      auto name = u8string_to_string(dir);
      auto dev = fs.find(name);
      EXPECT_FALSE(dev) << name;
    }

    for (auto const& [file, content] : case_insensitive_files) {
      auto name = u8string_to_string(file);
      auto dev = fs.find(name);
      EXPECT_FALSE(dev) << name;
    }

    for (auto const& ent : non_matching_entries) {
      auto name = u8string_to_string(ent);
      auto dev = fs.find(name);
      EXPECT_FALSE(dev) << name;
    }
  }

  // Case-insensitive mode: exact and re-cased spellings resolve, near
  // misses still do not.
  {
    reader::filesystem_v2 fs(lgr, *input, mm,
                             {.metadata = {.case_insensitive_lookup = true}});

    for (auto const& dir : case_sensitive_dirs) {
      auto name = u8string_to_string(dir);
      auto dev = fs.find(name);
      EXPECT_TRUE(dev) << name;
    }

    for (auto const& [file, content] : case_sensitive_files) {
      auto name = u8string_to_string(file);
      auto dev = fs.find(name);
      // Fatal assertion: `dev` is dereferenced on the next line.
      ASSERT_TRUE(dev) << name;
      EXPECT_EQ(content, fs.read_string(fs.open(dev->inode()))) << name;
    }

    for (auto const& dir : case_insensitive_dirs) {
      auto name = u8string_to_string(dir);
      auto dev = fs.find(name);
      EXPECT_TRUE(dev) << name;
    }

    for (auto const& [file, content] : case_insensitive_files) {
      auto name = u8string_to_string(file);
      auto dev = fs.find(name);
      // Fatal assertion: `dev` is dereferenced on the next line.
      ASSERT_TRUE(dev) << name;
      EXPECT_EQ(content, fs.read_string(fs.open(dev->inode()))) << name;
    }

    for (auto const& ent : non_matching_entries) {
      auto name = u8string_to_string(ent);
      auto dev = fs.find(name);
      EXPECT_FALSE(dev) << name;
    }
  }
}
Loading

0 comments on commit 94f8b4c

Please sign in to comment.