diff --git a/CMakeLists.txt b/CMakeLists.txt index 51b6b4f39..9f5aaef40 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,7 @@ set(PARALLEL_HASHMAP_REQUIRED_VERSION 1.3.8) set(PARALLEL_HASHMAP_PREFERRED_VERSION 1.3.12) set(BOOST_REQUIRED_VERSION 1.67.0) +set(LIBICUUC_REQUIRED_VERSION 70.0) set(LIBCRYPTO_REQUIRED_VERSION 3.0.0) set(LIBLZ4_REQUIRED_VERSION 1.9.3) set(LIBLZMA_REQUIRED_VERSION 5.2.5) @@ -190,8 +191,13 @@ if(WITH_LIBDWARFS) OPTIONAL_COMPONENTS process) if(APPLE) + find_program(HOMEBREW_EXE brew) + execute_process( + COMMAND ${HOMEBREW_EXE} --prefix icu4c + OUTPUT_VARIABLE LIBICU4C_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE) + list(PREPEND CMAKE_PREFIX_PATH ${LIBICU4C_PREFIX}) if(USE_HOMEBREW_LIBARCHIVE) - find_program(HOMEBREW_EXE brew) execute_process( COMMAND ${HOMEBREW_EXE} --prefix libarchive OUTPUT_VARIABLE LIBARCHIVE_PREFIX @@ -204,6 +210,7 @@ if(WITH_LIBDWARFS) find_package(cpptrace REQUIRED CONFIG) endif() + pkg_check_modules(LIBICUUC REQUIRED IMPORTED_TARGET icu-uc>=${LIBICUUC_REQUIRED_VERSION}) pkg_check_modules(LIBCRYPTO REQUIRED IMPORTED_TARGET libcrypto>=${LIBCRYPTO_REQUIRED_VERSION}) pkg_check_modules(LIBARCHIVE REQUIRED IMPORTED_TARGET libarchive>=${LIBARCHIVE_REQUIRED_VERSION}) pkg_check_modules(XXHASH REQUIRED IMPORTED_TARGET libxxhash>=${XXHASH_REQUIRED_VERSION}) diff --git a/cmake/dwarfs-config.cmake.in b/cmake/dwarfs-config.cmake.in index e4ebb2eaa..b2b2252e7 100644 --- a/cmake/dwarfs-config.cmake.in +++ b/cmake/dwarfs-config.cmake.in @@ -9,13 +9,20 @@ set(DWARFS_PREFIX_DIR "${PACKAGE_PREFIX_DIR}") find_package(PkgConfig) -if(@APPLE@ AND @USE_HOMEBREW_LIBARCHIVE@) # APPLE AND USE_HOMEBREW_LIBARCHIVE +if(@APPLE@) # APPLE find_program(HOMEBREW_EXE brew) execute_process( - COMMAND ${HOMEBREW_EXE} --prefix libarchive - OUTPUT_VARIABLE LIBARCHIVE_PREFIX + COMMAND ${HOMEBREW_EXE} --prefix icu4c + OUTPUT_VARIABLE LIBICU4C_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE) - list(PREPEND CMAKE_PREFIX_PATH ${LIBARCHIVE_PREFIX}) + 
list(PREPEND CMAKE_PREFIX_PATH ${LIBICU4C_PREFIX}) + if(@USE_HOMEBREW_LIBARCHIVE@) # USE_HOMEBREW_LIBARCHIVE + execute_process( + COMMAND ${HOMEBREW_EXE} --prefix libarchive + OUTPUT_VARIABLE LIBARCHIVE_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE) + list(PREPEND CMAKE_PREFIX_PATH ${LIBARCHIVE_PREFIX}) + endif() endif() find_dependency(Threads REQUIRED) @@ -24,6 +31,7 @@ find_dependency(gflags CONFIG REQUIRED) find_dependency(Boost @BOOST_REQUIRED_VERSION@ REQUIRED CONFIG COMPONENTS chrono context filesystem iostreams program_options regex system thread OPTIONAL_COMPONENTS process) +pkg_check_modules(LIBICUUC REQUIRED IMPORTED_TARGET icu-uc>=@LIBICUUC_REQUIRED_VERSION@) pkg_check_modules(LIBCRYPTO REQUIRED IMPORTED_TARGET libcrypto>=@LIBCRYPTO_REQUIRED_VERSION@) pkg_check_modules(LIBARCHIVE REQUIRED IMPORTED_TARGET libarchive>=@LIBARCHIVE_REQUIRED_VERSION@) pkg_check_modules(XXHASH REQUIRED IMPORTED_TARGET libxxhash>=@XXHASH_REQUIRED_VERSION@) diff --git a/cmake/libdwarfs.cmake b/cmake/libdwarfs.cmake index 9907dff04..8c89179f3 100644 --- a/cmake/libdwarfs.cmake +++ b/cmake/libdwarfs.cmake @@ -162,7 +162,7 @@ add_cpp2_thrift_library(thrift/features.thrift TARGET dwarfs_features_thrift OUTPUT_PATH dwarfs) target_link_libraries(dwarfs_common PRIVATE dwarfs_folly_lite PkgConfig::LIBCRYPTO PkgConfig::XXHASH PkgConfig::ZSTD) -target_link_libraries(dwarfs_reader PUBLIC dwarfs_common) +target_link_libraries(dwarfs_reader PUBLIC dwarfs_common PkgConfig::LIBICUUC) target_link_libraries(dwarfs_writer PUBLIC dwarfs_common PkgConfig::ZSTD) target_link_libraries(dwarfs_extractor PUBLIC dwarfs_reader) target_link_libraries(dwarfs_rewrite PUBLIC dwarfs_reader dwarfs_writer) diff --git a/doc/dwarfs.md b/doc/dwarfs.md index 698ba111e..69f5136e3 100644 --- a/doc/dwarfs.md +++ b/doc/dwarfs.md @@ -104,6 +104,17 @@ options: overlays and want the file system to reflect its read-only state, you can set this option. 
+- `-o case_insensitive`: + Perform case-insensitive lookups in the mounted file system, + i.e. an entry originally named `ReadMe.txt` can be accessed as + `readme.txt`, `README.TXT`, or `rEaDmE.tXt`. This works across + all platforms. When mounting a file system with many files, this + may be slightly slower and consume slightly more memory as case- + insensitive lookup requires an additional mapping table that is + built on-demand. Note that this is not supported if the file + system contains directories with entries that only differ in + case. + - `-o (no_)cache_image`: By default, `dwarfs` tries to ensure that the compressed file system image will not be cached by the kernel (i.e. the default diff --git a/include/dwarfs/reader/metadata_options.h b/include/dwarfs/reader/metadata_options.h index 5bb42fd9a..e8e629b45 100644 --- a/include/dwarfs/reader/metadata_options.h +++ b/include/dwarfs/reader/metadata_options.h @@ -32,6 +32,7 @@ struct metadata_options { bool enable_nlink{false}; bool readonly{false}; bool check_consistency{false}; + bool case_insensitive_lookup{false}; size_t block_size{512}; std::optional fs_uid{}; std::optional fs_gid{}; diff --git a/src/reader/internal/metadata_v2.cpp b/src/reader/internal/metadata_v2.cpp index dec8ddd98..18d0f83d1 100644 --- a/src/reader/internal/metadata_v2.cpp +++ b/src/reader/internal/metadata_v2.cpp @@ -46,6 +46,8 @@ #include +#include + #include #include #include @@ -416,7 +418,7 @@ class metadata_ final : public metadata_v2::impl { , symlinks_(meta_.compact_symlinks() ? 
string_table(lgr, "symlinks", *meta_.compact_symlinks()) : string_table(meta_.symlinks())) - // clang-format off + , dir_icase_cache_{build_dir_icase_cache()} // clang-format off PERFMON_CLS_PROXY_INIT(perfmon, "metadata_v2") PERFMON_CLS_TIMER_INIT(find) PERFMON_CLS_TIMER_INIT(getattr) @@ -921,6 +923,54 @@ class metadata_ final : public metadata_v2::impl { return packed_nlinks; } + static std::string utf8_to_lower(std::string str) { /* returns the key used to compare UTF-8 names case-insensitively */ + auto ustr = icu::UnicodeString::fromUTF8(str); + ustr.foldCase(); /* locale-independent case folding; the no-argument toLower() follows the process default locale (e.g. Turkish dotless i) */ + str.clear(); + ustr.toUTF8String(str); + return str; + } + + std::vector> build_dir_icase_cache() const { + std::vector> cache; + + if (options_.case_insensitive_lookup) { + auto td = LOG_TIMED_DEBUG; + size_t num_cached_dirs = 0; + size_t total_cache_size = 0; + + cache.reserve(meta_.directories().size()); + + for (uint32_t inode = 0; inode < meta_.directories().size() - 1; + ++inode) { + auto& pv = cache.emplace_back(); + directory_view dir{inode, global_}; + auto range = dir.entry_range(); + std::vector names(range.size()); + std::transform(range.begin(), range.end(), names.begin(), [&](auto ix) { + return utf8_to_lower(dir_entry_view_impl::name(ix, global_)); + }); + std::vector entries(range.size()); + std::iota(entries.begin(), entries.end(), 0); + std::sort(entries.begin(), entries.end(), + [&](auto a, auto b) { return names[a] < names[b]; }); + if (!std::is_sorted(entries.begin(), entries.end())) { + pv.reset(std::bit_width(entries.size()), entries.size()); + for (size_t i = 0; i < entries.size(); ++i) { + pv.set(i, entries[i]); + } + ++num_cached_dirs; + total_cache_size += pv.size_in_bytes(); + } + } + + td << "built case-insensitive directory cache for " << num_cached_dirs + << " directories (" << size_with_unit(total_cache_size) << ")"; + } + + return cache; + } + size_t total_file_entries() const { return (dev_inode_offset_ - file_inode_offset_) + (meta_.dir_entries() @@ -944,6 +994,7 @@ class metadata_ final : public metadata_v2::impl { const int 
unique_files_; const metadata_options options_; const string_table symlinks_; + std::vector> const dir_icase_cache_; PERFMON_CLS_PROXY_DECL PERFMON_CLS_TIMER_DECL(find) PERFMON_CLS_TIMER_DECL(getattr) @@ -1695,15 +1746,43 @@ metadata_::find(directory_view dir, std::string_view name) const { auto range = dir.entry_range(); - auto it = std::lower_bound( - range.begin(), range.end(), name, [&](auto ix, std::string_view name) { - return internal::dir_entry_view_impl::name(ix, global_) < name; - }); + if (options_.case_insensitive_lookup) { + auto const& cache = dir_icase_cache_[dir.inode()]; + auto ixr = boost::irange(0, range.size()); + auto key = utf8_to_lower(std::string(name)); + + auto it = std::lower_bound( + ixr.begin(), ixr.end(), key, [&](auto ix, std::string const& key) { + if (!cache.empty()) { + ix = cache[ix]; + } + return utf8_to_lower(internal::dir_entry_view_impl::name( + range[ix], global_)) < key; + }); + + if (it != ixr.end()) { + auto ix = *it; + if (!cache.empty()) { + ix = cache[ix]; + } + ix = range[ix]; + if (utf8_to_lower(internal::dir_entry_view_impl::name(ix, global_)) == + key) { + return dir_entry_view{dir_entry_view_impl::from_dir_entry_index_shared( + ix, global_.self_dir_entry(dir.inode()), global_)}; + } + } + } else { + auto it = std::lower_bound( + range.begin(), range.end(), name, [&](auto ix, std::string_view name) { + return internal::dir_entry_view_impl::name(ix, global_) < name; + }); - if (it != range.end()) { - if (internal::dir_entry_view_impl::name(*it, global_) == name) { - return dir_entry_view{dir_entry_view_impl::from_dir_entry_index_shared( - *it, global_.self_dir_entry(dir.inode()), global_)}; + if (it != range.end()) { + if (internal::dir_entry_view_impl::name(*it, global_) == name) { + return dir_entry_view{dir_entry_view_impl::from_dir_entry_index_shared( + *it, global_.self_dir_entry(dir.inode()), global_)}; + } } } diff --git a/test/dwarfs_test.cpp b/test/dwarfs_test.cpp index c1d8de6c1..08104b21d 100644 --- 
a/test/dwarfs_test.cpp +++ b/test/dwarfs_test.cpp @@ -2054,3 +2054,166 @@ TEST(filesystem, multi_image) { EXPECT_EQ("baz", fs.read_string(fs.open(baz->inode()))); } } + +TEST(filesystem, case_insensitive_lookup) { + auto input = std::make_shared(); + + input->add_dir(""); + input->add_dir(u8"hEllÖwÖrLD"); + input->add_dir(u8"FÜñKÿStrÍñg"); + input->add_dir(u8"unícødérøcks"); + input->add_dir(u8"JÄLAPEÑOPEPPÉR"); + input->add_dir(u8"SpIcYsÜsHiRoLL"); + input->add_dir(u8"CAFÉMØCHAlatte"); + input->add_dir(u8"ČhàŧGƤŦ"); + input->add_dir(u8"lõREMÏpSüM"); + input->add_dir(u8"ŠåmpŁËŠTrInG"); + input->add_dir(u8"pythonprogramming"); + input->add_dir(u8"DÃTâScïÊNcË"); + input->add_dir(u8"AIISFÛTÛRË"); + input->add_dir(u8"readability"); + input->add_file(u8"TëStCãSeSçÉNâRïÖ", "testcasescenario"); + input->add_file(u8"lõREMÏpSüM/ÆSTHETÎCcøding", "aestheticcoding"); + input->add_file(u8"lõREMÏpSüM/smîLëyFÀÇë😊", "smileyface"); + input->add_file(u8"lõREMÏpSüM/NØRTHèast", "northeast"); + input->add_file(u8"lõREMÏpSüM/SPACEadventure", "spaceadventure"); + input->add_file(u8"lõREMÏpSüM/cõMPLEXïTy🚀", "complexity"); + input->add_file(u8"lõREMÏpSüM/thisisatest", "thisisatest"); + + std::vector case_sensitive_dirs{ + u8"/hEllÖwÖrLD", u8"/FÜñKÿStrÍñg", u8"/unícødérøcks", + u8"/JÄLAPEÑOPEPPÉR", u8"/SpIcYsÜsHiRoLL", u8"/CAFÉMØCHAlatte", + u8"/ČhàŧGƤŦ", u8"/lõREMÏpSüM", u8"/ŠåmpŁËŠTrInG", + u8"/pythonprogramming", u8"/DÃTâScïÊNcË", u8"/AIISFÛTÛRË", + u8"/readability", + }; + + std::vector> case_sensitive_files{ + {u8"/TëStCãSeSçÉNâRïÖ", "testcasescenario"}, + {u8"/lõREMÏpSüM/ÆSTHETÎCcøding", "aestheticcoding"}, + {u8"/lõREMÏpSüM/smîLëyFÀÇë😊", "smileyface"}, + {u8"/lõREMÏpSüM/NØRTHèast", "northeast"}, + {u8"/lõREMÏpSüM/SPACEadventure", "spaceadventure"}, + {u8"/lõREMÏpSüM/cõMPLEXïTy🚀", "complexity"}, + {u8"/lõREMÏpSüM/thisisatest", "thisisatest"}, + }; + + std::vector case_insensitive_dirs{ + u8"/HELlÖwÖRLD", u8"/FÜÑKÿSTríÑg", u8"/uNÍcødéRøcks", + u8"/JÄLApeñOPePPÉR", u8"/SpiCysÜshiRoLL", 
u8"/CAféMØchAlatte", + u8"/čhàŧgƥŧ", u8"/lõremÏpsüM", u8"/šåmpŁëšTrInG", + u8"/pyTHonproGRamming", u8"/DãtÂScïêNcË", u8"/AiisFÛTÛRË", + u8"/reADabiLIty", + }; + + std::vector> case_insensitive_files{ + {u8"/TësTcãSeSçéNâRïÖ", "testcasescenario"}, + {u8"/lõRemïpSüM/ÆstHETÎCcØDing", "aestheticcoding"}, + {u8"/lõremïPSüM/smîlËYfàÇë😊", "smileyface"}, + {u8"/lõREMÏPsÜM/NØRthÈAst", "northeast"}, + {u8"/lõRemïPsüM/SPACEadvENTure", "spaceadventure"}, + {u8"/LÕREMÏpSüM/CõMPlexïTy🚀", "complexity"}, + {u8"/lõrEMÏpSüM/thiSISatest", "thisisatest"}, + }; + + std::vector non_matching_entries{ + u8"/HELlÖwÖRLDx", + u8"/FÜÑKÿSTríÑj", + u8"/uNÍcødéRcks", + u8"/JÄLApeñOPePPÉ", + u8"/SpiCysÜshiRoLLx", + u8"/CAféMØchAltte", + u8"/čhàŧgƥŧx", + u8"/lõremÏpsü", + u8"/šåmpŁëšTrnG", + u8"/pyTHonproGRammin", + u8"/DãtÂScïêNcËx", + u8"/AiisFÛTÛTË", + u8"/reADabiLItx", + u8"/TësRcãSeSçéNâRïÖ", + u8"/lõRemïpüM/ÆstHETÎCcØDing", + u8"/lõremïPSüM/mîlËYfàÇë😊", + u8"/lõRMÏPsÜM/NØRthÈAst", + u8"/lõRemïPsüM/SPACEadvENTurex", + u8"/LÕREMÏpSüM/CõMPexïTy🚀", + u8"/lõrEMÏpSüM/thiSISatesy", + }; + + test::test_logger lgr; + auto fsimage = build_dwarfs(lgr, input, "null"); + + auto mm = std::make_shared(std::move(fsimage)); + + { + reader::filesystem_v2 fs(lgr, *input, mm, + {.metadata = {.case_insensitive_lookup = false}}); + + for (auto const& dir : case_sensitive_dirs) { + auto name = u8string_to_string(dir); + auto dev = fs.find(name); + EXPECT_TRUE(dev) << name; + } + + for (auto const& [file, content] : case_sensitive_files) { + auto name = u8string_to_string(file); + auto dev = fs.find(name); + EXPECT_TRUE(dev) << name; + EXPECT_EQ(content, fs.read_string(fs.open(dev->inode()))) << name; + } + + for (auto const& dir : case_insensitive_dirs) { + auto name = u8string_to_string(dir); + auto dev = fs.find(name); + EXPECT_FALSE(dev) << name; + } + + for (auto const& [file, content] : case_insensitive_files) { + auto name = u8string_to_string(file); + auto dev = fs.find(name); + EXPECT_FALSE(dev) << name; + 
} + + for (auto const& ent : non_matching_entries) { + auto name = u8string_to_string(ent); + auto dev = fs.find(name); + EXPECT_FALSE(dev) << name; + } + } + + { + reader::filesystem_v2 fs(lgr, *input, mm, + {.metadata = {.case_insensitive_lookup = true}}); + + for (auto const& dir : case_sensitive_dirs) { + auto name = u8string_to_string(dir); + auto dev = fs.find(name); + EXPECT_TRUE(dev) << name; + } + + for (auto const& [file, content] : case_sensitive_files) { + auto name = u8string_to_string(file); + auto dev = fs.find(name); + EXPECT_TRUE(dev) << name; + EXPECT_EQ(content, fs.read_string(fs.open(dev->inode()))) << name; + } + + for (auto const& dir : case_insensitive_dirs) { + auto name = u8string_to_string(dir); + auto dev = fs.find(name); + EXPECT_TRUE(dev) << name; + } + + for (auto const& [file, content] : case_insensitive_files) { + auto name = u8string_to_string(file); + auto dev = fs.find(name); + EXPECT_TRUE(dev) << name; + EXPECT_EQ(content, fs.read_string(fs.open(dev->inode()))) << name; + } + + for (auto const& ent : non_matching_entries) { + auto name = u8string_to_string(ent); + auto dev = fs.find(name); + EXPECT_FALSE(dev) << name; + } + } +} diff --git a/test/tools_test.cpp b/test/tools_test.cpp index 76826085e..7d74113a5 100644 --- a/test/tools_test.cpp +++ b/test/tools_test.cpp @@ -1053,6 +1053,7 @@ TEST_P(tools_test, end_to_end) { std::vector all_options{ "-s", + "-ocase_insensitive", #ifndef _WIN32 "-oenable_nlink", "-oreadonly", @@ -1074,6 +1075,7 @@ TEST_P(tools_test, end_to_end) { for (unsigned bitmask = 0; bitmask < combinations; ++bitmask) { std::vector args; + bool case_insensitive{false}; #ifndef _WIN32 bool enable_nlink{false}; bool readonly{false}; @@ -1083,6 +1085,9 @@ TEST_P(tools_test, end_to_end) { for (size_t i = 0; i < all_options.size(); ++i) { if ((1 << i) & bitmask) { auto const& opt = all_options[i]; + if (opt == "-ocase_insensitive") { + case_insensitive = true; + } #ifndef _WIN32 if (opt == "-oreadonly") { readonly = 
true; @@ -1139,6 +1144,12 @@ TEST_P(tools_test, end_to_end) { EXPECT_EQ(st.st_gid, 3456) << runner.cmdline(); } #endif + EXPECT_TRUE(fs::exists(mountpoint / "format.sh")) << runner.cmdline(); + EXPECT_EQ(case_insensitive, fs::exists(mountpoint / "FORMAT.SH")) + << runner.cmdline(); + EXPECT_EQ(case_insensitive, fs::exists(mountpoint / "fOrMaT.Sh")) + << runner.cmdline(); + auto perfmon = dwarfs::getxattr(mountpoint, "user.dwarfs.driver.perfmon"); #if DWARFS_PERFMON_ENABLED diff --git a/tools/src/dwarfs_main.cpp b/tools/src/dwarfs_main.cpp index 219ba0f15..293edfebf 100644 --- a/tools/src/dwarfs_main.cpp +++ b/tools/src/dwarfs_main.cpp @@ -179,6 +179,7 @@ struct options { #endif int enable_nlink{0}; int readonly{0}; + int case_insensitive{0}; int cache_image{0}; int cache_files{0}; size_t cachesize{0}; @@ -258,6 +259,7 @@ constexpr struct ::fuse_opt dwarfs_opts[] = { DWARFS_OPT("seq_detector=%s", seq_detector_thresh_str, 0), DWARFS_OPT("enable_nlink", enable_nlink, 1), DWARFS_OPT("readonly", readonly, 1), + DWARFS_OPT("case_insensitive", case_insensitive, 1), DWARFS_OPT("cache_image", cache_image, 1), DWARFS_OPT("no_cache_image", cache_image, 0), DWARFS_OPT("cache_files", cache_files, 1), @@ -1224,6 +1226,7 @@ void usage(std::ostream& os, std::filesystem::path const& progname) { << " -o imagesize=NUM filesystem image size in bytes\n" << " -o enable_nlink show correct hardlink numbers\n" << " -o readonly show read-only file system\n" + << " -o case_insensitive perform case-insensitive lookups\n" << " -o (no_)cache_image (don't) keep image in kernel cache\n" << " -o (no_)cache_files (don't) keep files in kernel cache\n" << " -o debuglevel=NAME " << logger::all_level_names() << "\n" @@ -1464,6 +1467,7 @@ void load_filesystem(dwarfs_userdata& userdata) { fsopts.inode_reader.readahead = opts.readahead; fsopts.metadata.enable_nlink = bool(opts.enable_nlink); fsopts.metadata.readonly = bool(opts.readonly); + fsopts.metadata.case_insensitive_lookup = 
bool(opts.case_insensitive); fsopts.metadata.block_size = opts.blocksize; #ifndef _WIN32 fsopts.metadata.fs_uid = opts.fs_uid; diff --git a/vcpkg.json b/vcpkg.json index fbb2f7619..17079a6c3 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -20,6 +20,7 @@ "double-conversion", "fmt", "glog", + "icu", "libarchive", "libevent", "libflac",