From 267e8bf98db4dabfe80bfbc38d73f5e31a61b1dd Mon Sep 17 00:00:00 2001 From: David Moles Date: Sun, 4 Dec 2022 10:13:27 -0800 Subject: [PATCH] Unicode-normalize manifest and file paths before comparing; fixes issue #70 --- ...actPayloadFileExistsInManifestsVistor.java | 22 +++++-- .../verify/internal/ManifestVerifier.java | 31 ++++++---- ...PayloadFileExistsInAllManifestsVistor.java | 10 +-- ...dFileExistsInAtLeastOneManifestVistor.java | 4 +- .../jscancella/verify/BagVeriferTest.java | 61 +++++++++++-------- .../verify/internal/ManifestVerifierTest.java | 22 +++---- .../bagit.txt | 2 + .../data/contr\303\264le.txt" | 1 + .../manifest-md5.txt | 1 + 9 files changed, 95 insertions(+), 59 deletions(-) create mode 100644 src/test/resources/bagWithAccentedCharactersInFilename/bagit.txt create mode 100644 "src/test/resources/bagWithAccentedCharactersInFilename/data/contr\303\264le.txt" create mode 100644 src/test/resources/bagWithAccentedCharactersInFilename/manifest-md5.txt diff --git a/src/main/java/com/github/jscancella/verify/internal/AbstractPayloadFileExistsInManifestsVistor.java b/src/main/java/com/github/jscancella/verify/internal/AbstractPayloadFileExistsInManifestsVistor.java index 0e20936..a269d87 100644 --- a/src/main/java/com/github/jscancella/verify/internal/AbstractPayloadFileExistsInManifestsVistor.java +++ b/src/main/java/com/github/jscancella/verify/internal/AbstractPayloadFileExistsInManifestsVistor.java @@ -6,6 +6,7 @@ import java.nio.file.SimpleFileVisitor; import java.nio.file.attribute.BasicFileAttributes; import java.util.ResourceBundle; +import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,23 +27,36 @@ abstract public class AbstractPayloadFileExistsInManifestsVistor extends SimpleF /** * constructor must be called before using! - * + * * @param ignoreHiddenFiles Should hidden files be ignored */ public AbstractPayloadFileExistsInManifestsVistor(final boolean ignoreHiddenFiles) { super(); this.ignoreHiddenFiles = ignoreHiddenFiles; } - + @Override public FileVisitResult preVisitDirectory(final Path dir, final BasicFileAttributes attrs) throws IOException { FileVisitResult result = FileVisitResult.CONTINUE; - + if(ignoreHiddenFiles && PathUtils.isHidden(dir)){ logger.debug(messages.getString("skipping_hidden_file"), dir); result = FileVisitResult.SKIP_SUBTREE; } - + return result; } + + /** + * Returns true if the path exists in the provided set of manifest paths, false otherwise. + * @param path The file path. + * @param manifestPaths The manifest paths. + * @return true if the path exists, false otherwise. + */ + protected static boolean inManifest(final Path path, final Set manifestPaths) { + final String normalizedPath = ManifestVerifier.toNormalizedString(path); + return manifestPaths.stream().anyMatch( + (mp) -> ManifestVerifier.toNormalizedString(mp).equals(normalizedPath) + ); + } } diff --git a/src/main/java/com/github/jscancella/verify/internal/ManifestVerifier.java b/src/main/java/com/github/jscancella/verify/internal/ManifestVerifier.java index e2bbd48..637f6e6 100644 --- a/src/main/java/com/github/jscancella/verify/internal/ManifestVerifier.java +++ b/src/main/java/com/github/jscancella/verify/internal/ManifestVerifier.java @@ -31,19 +31,19 @@ public enum ManifestVerifier {; //using enum to enforce singleton private static final ResourceBundle messages = ResourceBundle.getBundle("MessageBundle"); /** - * Verify that all the files in the payload directory are listed in the payload manifest and + * Verify that all the files in the payload directory are listed in the payload manifest and * all files listed in all manifests exist. - * + * * @param bag the bag which contains the manifests to check * @param ignoreHiddenFiles to include hidden files when checking - * + * * @throws IOException if there is an error while reading a file from the filesystem * @throws MaliciousPathException if a path is outside the bag * @throws InvalidBagitFileFormatException if a manifest is not formatted correctly * @throws FileNotInPayloadDirectoryException if a file listed in a manifest is not in the payload directory */ public static void verifyManifests(final Bag bag, final boolean ignoreHiddenFiles)throws IOException{ - + final Set allFilesListedInManifests = getAllFilesListedInManifests(bag); checkAllFilesListedInManifestExist(allFilesListedInManifests); @@ -54,12 +54,21 @@ public static void verifyManifests(final Bag bag, final boolean ignoreHiddenFile } } + /** + * Returns the path as a String in {{java.text.Normalizer.Form#NFD}} (canonical) normalized form. + * @param path the path to normalize + * @return String the normalized string + */ + static String toNormalizedString(final Path path) { + return Normalizer.normalize(path.toString(), Normalizer.Form.NFD); + } + /* * get the full path (absolute) of all the files listed in all the manifests */ private static Set getAllFilesListedInManifests(final Bag bag) throws IOException { logger.debug(messages.getString("all_files_in_manifests")); - + final Set filesListedInManifests = new HashSet<>(); try(DirectoryStream directoryStream = Files.newDirectoryStream(bag.getTagFileDir(), new ManifestFilter())){ @@ -78,7 +87,7 @@ private static Set getAllFilesListedInManifests(final Bag bag) throws IOEx */ private static void checkAllFilesListedInManifestExist(final Set files) { logger.info(messages.getString("check_all_files_in_manifests_exist")); - + for (final Path file : files) { if(!Files.exists(file)){ if(existsNormalized(file)){ @@ -91,21 +100,21 @@ private static void checkAllFilesListedInManifestExist(final Set files) { } } } - + /** * if a file is parially normalized or of a different normalization then the manifest specifies it will fail the existence test. * This method checks for that by normalizing what is on disk with the normalized filename and see if they match. - * + * * @return true if the normalized filename matches one on disk in the specified folder */ private static boolean existsNormalized(final Path file){ boolean existsNormalized = false; - final String normalizedFile = Normalizer.normalize(file.toString(), Normalizer.Form.NFD); + final String normalizedFile = toNormalizedString(file); final Path parent = file.getParent(); if(parent != null){ try(DirectoryStream files = Files.newDirectoryStream(parent)){ for(final Path fileToCheck : files){ - final String normalizedFileToCheck = Normalizer.normalize(fileToCheck.toString(), Normalizer.Form.NFD); + final String normalizedFileToCheck = toNormalizedString(fileToCheck); if(normalizedFile.equals(normalizedFileToCheck)){ existsNormalized = true; break; @@ -116,7 +125,7 @@ private static boolean existsNormalized(final Path file){ logger.error(messages.getString("error_reading_normalized_file"), parent, normalizedFile, e); } } - + return existsNormalized; } diff --git a/src/main/java/com/github/jscancella/verify/internal/PayloadFileExistsInAllManifestsVistor.java b/src/main/java/com/github/jscancella/verify/internal/PayloadFileExistsInAllManifestsVistor.java index e8e811b..698b8dc 100644 --- a/src/main/java/com/github/jscancella/verify/internal/PayloadFileExistsInAllManifestsVistor.java +++ b/src/main/java/com/github/jscancella/verify/internal/PayloadFileExistsInAllManifestsVistor.java @@ -1,5 +1,6 @@ package com.github.jscancella.verify.internal; +import java.io.IOException; import java.nio.file.FileVisitResult; import java.nio.file.Files; import java.nio.file.Path; @@ -29,7 +30,7 @@ public final class PayloadFileExistsInAllManifestsVistor extends AbstractPayload /** * Implements {@link SimpleFileVisitor} to ensure that the encountered file is in one of the manifests. - * + * * @param manifests the set of manifests to check * @param rootDir the root directory of the bag * @param ignoreHiddenFiles if the checker should ignore hidden files or not @@ -41,7 +42,7 @@ public PayloadFileExistsInAllManifestsVistor(final Set manifests, fina } @Override - public FileVisitResult visitFile(final Path path, final BasicFileAttributes attrs){ + public FileVisitResult visitFile(final Path path, final BasicFileAttributes attrs) throws IOException { if(Files.isRegularFile(path)){ for(final Manifest manifest : manifests){ final Set relativePaths = manifest @@ -49,8 +50,8 @@ public FileVisitResult visitFile(final Path path, final BasicFileAttributes attr .map(entry -> entry.getRelativeLocation()) .collect(Collectors.toSet()); final Path relativePath = rootDir.relativize(path); - - if(!relativePaths.contains(relativePath)){ + + if(!inManifest(relativePath, relativePaths)){ final String formattedMessage = messages.getString("file_not_in_manifest_error"); throw new FileNotInManifestException(MessageFormatter.format(formattedMessage, path, manifest.getBagitAlgorithmName()).getMessage()); } @@ -59,4 +60,5 @@ public FileVisitResult visitFile(final Path path, final BasicFileAttributes attr logger.debug(messages.getString("file_in_all_manifests"), path); return FileVisitResult.CONTINUE; } + } diff --git a/src/main/java/com/github/jscancella/verify/internal/PayloadFileExistsInAtLeastOneManifestVistor.java b/src/main/java/com/github/jscancella/verify/internal/PayloadFileExistsInAtLeastOneManifestVistor.java index 310753c..50442f3 100644 --- a/src/main/java/com/github/jscancella/verify/internal/PayloadFileExistsInAtLeastOneManifestVistor.java +++ b/src/main/java/com/github/jscancella/verify/internal/PayloadFileExistsInAtLeastOneManifestVistor.java @@ -27,7 +27,7 @@ public final class PayloadFileExistsInAtLeastOneManifestVistor extends AbstractP /** * Implements {@link SimpleFileVisitor} to ensure that the encountered file is in one of the manifests. - * + * * @param filesListedInManifests the set of files listed in all the manifests * @param ignoreHiddenFiles if the checker should ignore hidden files or not */ @@ -42,7 +42,7 @@ public FileVisitResult visitFile(final Path path, final BasicFileAttributes attr logger.debug(messages.getString("skipping_hidden_file"), path); } else { - if(Files.isRegularFile(path) && !filesListedInManifests.contains(path.toAbsolutePath())){ + if(Files.isRegularFile(path) && !inManifest(path.toAbsolutePath(), filesListedInManifests)){ final String formattedMessage = messages.getString("file_not_in_any_manifest_error"); throw new FileNotInManifestException(MessageFormatter.format(formattedMessage, path).getMessage()); } diff --git a/src/test/java/com/github/jscancella/verify/BagVeriferTest.java b/src/test/java/com/github/jscancella/verify/BagVeriferTest.java index b739a71..6b0a92b 100644 --- a/src/test/java/com/github/jscancella/verify/BagVeriferTest.java +++ b/src/test/java/com/github/jscancella/verify/BagVeriferTest.java @@ -25,126 +25,133 @@ public class BagVeriferTest extends TempFolderTest { Security.addProvider(new BouncyCastleProvider()); } } - + @AfterAll public static void tearDown() { BagitChecksumNameMapping.clear("sha3256"); } - + private Path rootDir = Paths.get(new File("src/test/resources/bags/v0_97/bag").toURI()); - + @Test public void testValidWhenHiddenFolderNotIncluded() throws Exception{ Path copyDir = copyBagToTempFolder(rootDir); Files.createDirectory(copyDir.resolve("data").resolve(".someHiddenFolder")); TestUtils.makeFilesHiddenOnWindows(copyDir); - + Bag bag = Bag.read(copyDir); bag.isValid(true); } - + @Test public void testValidWithHiddenFile() throws Exception{ Path copyDir = copyBagToTempFolder(rootDir); Files.createFile(copyDir.resolve("data").resolve(".someHiddenFile")); TestUtils.makeFilesHiddenOnWindows(copyDir); - + Bag bag = Bag.read(copyDir); bag.isValid(true); } - + @Test public void testInvalidWithHiddenFile() throws Exception{ Path copyDir = copyBagToTempFolder(rootDir); Files.createFile(copyDir.resolve("data").resolve(".someHiddenFile")); TestUtils.makeFilesHiddenOnWindows(copyDir); - + Bag bag = Bag.read(copyDir); Assertions.assertThrows(FileNotInManifestException.class, () -> { bag.isValid(false); }); } - + + @Test + public void testValidWithAccentedCharactersInFilename() throws Exception{ + Path bagDir = Paths.get("src", "test", "resources", "bagWithAccentedCharactersInFilename"); + Bag bag = Bag.read(bagDir); + bag.isValid(true); + } + @Test public void testMD5Bag() throws Exception{ Path bagDir = Paths.get("src", "test", "resources", "md5Bag"); Bag bag = Bag.read(bagDir); bag.isValid(true); } - + @Test public void testSHA1Bag() throws Exception{ Path bagDir = Paths.get("src", "test", "resources", "sha1Bag"); Bag bag = Bag.read(bagDir); bag.isValid(true); } - + @Test public void testSHA224Bag() throws Exception{ Path bagDir = Paths.get("src", "test", "resources", "sha224Bag"); Bag bag = Bag.read(bagDir); bag.isValid(true); } - + @Test public void testSHA256Bag() throws Exception{ Path bagDir = Paths.get("src", "test", "resources", "sha256Bag"); Bag bag = Bag.read(bagDir); bag.isValid(true); } - + @Test public void testSHA512Bag() throws Exception{ Path bagDir = Paths.get("src", "test", "resources", "sha512Bag"); Bag bag = Bag.read(bagDir); bag.isValid(true); } - + @Test public void testVersion0_97IsValid() throws Exception{ Bag bag = Bag.read(rootDir); - + bag.isValid(true); } - + @Test public void testIsComplete() throws Exception{ Bag bag = Bag.read(rootDir); - + bag.isComplete(true); } - + @Test public void testCorruptPayloadFile() throws Exception{ rootDir = Paths.get(new File("src/test/resources/corruptPayloadFile").toURI()); Bag bag = Bag.read(rootDir); - + Assertions.assertThrows(CorruptChecksumException.class, () -> { bag.isValid(true); }); } - + @Test public void testCorruptTagFile() throws Exception{ rootDir = Paths.get(new File("src/test/resources/corruptTagFile").toURI()); Bag bag = Bag.read(rootDir); - + Assertions.assertThrows(CorruptChecksumException.class, () -> { bag.isValid(true); }); } - + @Test public void testThrowsNoSuchAlgorithmExceptionWhenNotInHasherMap() throws Exception{ Path sha3BagDir = Paths.get(getClass().getClassLoader().getResource("sha3Bag").toURI()); - + Assertions.assertThrows(NoSuchBagitAlgorithmException.class, () -> { Bag.read(sha3BagDir); }); } - + @Test public void testAddSHA3SupportViaInterface() throws Exception{ boolean successful = BagitChecksumNameMapping.add("sha3256", SHA3Hasher.class); Assertions.assertTrue(successful); - + Path sha3BagDir = Paths.get(new File("src/test/resources/sha3Bag").toURI()); Bag bag = Bag.read(sha3BagDir); Assertions.assertTrue(bag.isValid(true)); } - + /* * Technically valid but highly discouraged */ @@ -152,7 +159,7 @@ public void testAddSHA3SupportViaInterface() throws Exception{ public void testManifestsWithLeadingDotSlash() throws Exception{ Path bagPath = Paths.get(new File("src/test/resources/bag-with-leading-dot-slash-in-manifest").toURI()); Bag bag = Bag.read(bagPath); - + Assertions.assertTrue(bag.isValid(true)); } } diff --git a/src/test/java/com/github/jscancella/verify/internal/ManifestVerifierTest.java b/src/test/java/com/github/jscancella/verify/internal/ManifestVerifierTest.java index f58927b..74687ab 100644 --- a/src/test/java/com/github/jscancella/verify/internal/ManifestVerifierTest.java +++ b/src/test/java/com/github/jscancella/verify/internal/ManifestVerifierTest.java @@ -12,40 +12,40 @@ import com.github.jscancella.exceptions.FileNotInPayloadDirectoryException; public class ManifestVerifierTest { - + private Path rootDir = Paths.get(new File("src/test/resources/bags/v0_97/bag").toURI()); @Test public void testErrorWhenManifestListFileThatDoesntExist() throws Exception{ rootDir = Paths.get(new File("src/test/resources/filesInManifestDontExist").toURI()); Bag bag = Bag.read(rootDir); - - Assertions.assertThrows(FileNotInPayloadDirectoryException.class, + + Assertions.assertThrows(FileNotInPayloadDirectoryException.class, () -> { ManifestVerifier.verifyManifests(bag, true); }); } - + @Test public void testErrorWhenFileIsntInManifest() throws Exception{ rootDir = Paths.get(new File("src/test/resources/filesInPayloadDirAreNotInManifest").toURI()); Bag bag = Bag.read(rootDir); - - Assertions.assertThrows(FileNotInManifestException.class, + + Assertions.assertThrows(FileNotInManifestException.class, () -> { ManifestVerifier.verifyManifests(bag, true); }); } - + @Test public void testBagWithTagFilesInPayloadIsValid() throws Exception{ rootDir = Paths.get(new File("src/test/resources/bags/v0_96/bag-with-tagfiles-in-payload-manifest").toURI()); Bag bag = Bag.read(rootDir); - + ManifestVerifier.verifyManifests(bag, true); } - + @Test - public void testNotALlFilesListedInAllManifestsThrowsException() throws Exception{ + public void testNotAllFilesListedInAllManifestsThrowsException() throws Exception{ Path bagDir = Paths.get(new File("src/test/resources/notAllFilesListedInAllManifestsBag").toURI()); Bag bag = Bag.read(bagDir); - Assertions.assertThrows(FileNotInManifestException.class, + Assertions.assertThrows(FileNotInManifestException.class, () -> { ManifestVerifier.verifyManifests(bag, true); }); } } diff --git a/src/test/resources/bagWithAccentedCharactersInFilename/bagit.txt b/src/test/resources/bagWithAccentedCharactersInFilename/bagit.txt new file mode 100644 index 0000000..7b9e48d --- /dev/null +++ b/src/test/resources/bagWithAccentedCharactersInFilename/bagit.txt @@ -0,0 +1,2 @@ +BagIt-Version: 0.97 +Tag-File-Character-Encoding: UTF-8 \ No newline at end of file diff --git "a/src/test/resources/bagWithAccentedCharactersInFilename/data/contr\303\264le.txt" "b/src/test/resources/bagWithAccentedCharactersInFilename/data/contr\303\264le.txt" new file mode 100644 index 0000000..d606037 --- /dev/null +++ "b/src/test/resources/bagWithAccentedCharactersInFilename/data/contr\303\264le.txt" @@ -0,0 +1 @@ +test2 \ No newline at end of file diff --git a/src/test/resources/bagWithAccentedCharactersInFilename/manifest-md5.txt b/src/test/resources/bagWithAccentedCharactersInFilename/manifest-md5.txt new file mode 100644 index 0000000..3f57fb1 --- /dev/null +++ b/src/test/resources/bagWithAccentedCharactersInFilename/manifest-md5.txt @@ -0,0 +1 @@ +ad0234829205b9033196ba818f7a872b data/contrĂ´le.txt