Skip to content

Commit

Permalink
fix: improvements to config loading logic (#1410)
Browse files Browse the repository at this point in the history
  • Loading branch information
junfuchen99 authored and MikeDombo committed Feb 23, 2023
1 parent 3692066 commit 0054611
Show file tree
Hide file tree
Showing 6 changed files with 180 additions and 94 deletions.
1 change: 1 addition & 0 deletions src/main/java/com/aws/greengrass/config/Configuration.java
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ private void read(Reader in, String extension, long timestamp) throws IOExceptio
mergeMap(timestamp, yamlMapper.readValue(in, Map.class));
break;
case "tlog":
case "tlog~":
ConfigurationReader.mergeTLogInto(this, in, false, null);
break;
default:
Expand Down
69 changes: 42 additions & 27 deletions src/main/java/com/aws/greengrass/config/ConfigurationReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -104,41 +104,56 @@ private static void mergeTLogInto(Configuration c, Path p) throws IOException {
}

/**
* Validate the tlog contents at the given path. Throws an IOException if any entry is invalid.
* Validate the tlog contents at the given path.
*
* @param tlogPath path to the file to validate.
* @throws IOException if any entry is invalid.
* @return true if all entries in the file are valid;
* false if file doesn't exist, is empty, or contains invalid entry
*/
public static void validateTlog(Path tlogPath) throws IOException {
try (BufferedReader in = Files.newBufferedReader(tlogPath)) {
// We have seen two different file corruption scenarios
// 1. The last line of config file is corrupted with non-UTF8 characters and BufferedReader::readLine
// throws a MalformedInputException.
// 2. The config file is filled with kilobytes of null bytes and the first line read from file is not
// parseable.
// To handle both scenarios and make sure we can fall back to backup config files, we decided to validate
// the entire file.
public static boolean validateTlog(Path tlogPath) {
try {
if (!Files.exists(tlogPath)) {
logger.atError().setEventType("validate-tlog").kv("path", tlogPath)
.log("Transaction log file does not exist at given path");
return false;
}
try (BufferedReader in = Files.newBufferedReader(tlogPath)) {
// We have seen two different file corruption scenarios
// 1. The last line of config file is corrupted with non-UTF8 characters and BufferedReader::readLine
// throws a MalformedInputException.
// 2. The config file is filled with kilobytes of null bytes and the first line read from file is not
// parseable.
// To handle both scenarios and make sure we can fall back to backup config files, we decided to
// validate the entire file.

// Specific description of scenario 2:
// We have been seeing that very rarely the transaction log gets corrupted when a device (specifically
// raspberry pi using an SD card) has a power outage.
// The corruption is happening at the hardware level and there really isn't anything that we can do
// about it right now.
// The corruption that we see is that the tlog file is filled with kilobytes of null
// bytes, depending on how large the configuration was before dumping the entire config to disk.
// Specific description of scenario 2:
// We have been seeing that very rarely the transaction log gets corrupted when a device (specifically
// raspberry pi using an SD card) has a power outage.
// The corruption is happening at the hardware level and there really isn't anything that we can do
// about it right now.
// The corruption that we see is that the tlog file is filled with kilobytes of null
// bytes, depending on how large the configuration was before dumping the entire config to disk.

String l = in.readLine();
// if file is empty, throw an IOException so that nucleus can recover with backup config files
if (l == null) {
throw new IOException(String.format("Empty transaction log file at %s", tlogPath));
}
String l = in.readLine();
// if file is empty, return false
if (l == null) {
logger.atError().setEventType("validate-tlog").kv("path", tlogPath)
.log("Empty transaction log file");
return false;
}

// if file is not empty, validate that the entire file is parseable
while (l != null) {
Coerce.toObject(l, TLOG_LINE_REF);
l = in.readLine();
// if file is not empty, validate that the entire file is parseable
while (l != null) {
Coerce.toObject(l, TLOG_LINE_REF);
l = in.readLine();
}
}
} catch (IOException e) {
logger.atError().setCause(e).setEventType("validate-tlog").kv("path", tlogPath)
.log("Unable to validate the transaction log content");
return false;
}
return true;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,13 +195,17 @@ private static Writer newTlogWriter(Path outputPath) throws IOException {
StandardOpenOption.SYNC, StandardOpenOption.CREATE);
}

public static Path getOldTlogPath(Path tlogPath) {
return tlogPath.resolveSibling(tlogPath.getFileName() + ".old");
}

/**
* Discard current tlog. Start a new tlog with the current kernel configs.
*/
private synchronized void truncateTlog() {
logger.atDebug(TRUNCATE_TLOG_EVENT).log("started");
truncateQueued.set(false);
Path oldTlogPath = tlogOutputPath.resolveSibling(tlogOutputPath.getFileName() + ".old");
Path oldTlogPath = getOldTlogPath(tlogOutputPath);
// close existing writer
flush(out);
if (out instanceof Commitable) {
Expand Down
149 changes: 87 additions & 62 deletions src/main/java/com/aws/greengrass/lifecyclemanager/KernelLifecycle.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
Expand Down Expand Up @@ -265,94 +266,56 @@ void initConfigAndTlog(String configFilePath) {
void initConfigAndTlog() {
try {
Path transactionLogPath = nucleusPaths.configPath().resolve(Kernel.DEFAULT_CONFIG_TLOG_FILE);
boolean readFromNonTlog = false;
boolean readFromTlog = true;

if (Objects.nonNull(kernelCommandLine.getProvidedConfigPathName())) {
// If a config file is provided, kernel will use the provided file as a new base
// and ignore existing config and tlog files.
// This is used by the nucleus bootstrap workflow
kernel.getConfig().read(kernelCommandLine.getProvidedConfigPathName());
readFromNonTlog = true;
readFromTlog = false;
} else {
Path externalConfig = nucleusPaths.configPath().resolve(Kernel.DEFAULT_CONFIG_YAML_FILE_READ);
boolean externalConfigFromCmd = Utils.isNotEmpty(kernelCommandLine.getProvidedInitialConfigPath());
if (externalConfigFromCmd) {
externalConfig = Paths.get(kernelCommandLine.getProvidedInitialConfigPath());
}

Path bootstrapTlogPath = nucleusPaths.configPath().resolve(Kernel.DEFAULT_BOOTSTRAP_CONFIG_TLOG_FILE);

boolean bootstrapTlogExists = Files.exists(bootstrapTlogPath);
boolean tlogExists = Files.exists(transactionLogPath);

IOException tlogValidationError = null;
if (tlogExists) {
try {
ConfigurationReader.validateTlog(transactionLogPath);
} catch (IOException e) {
tlogValidationError = e;
}
}
// config.tlog is valid if any incomplete tlog truncation is handled correctly and the tlog content
// is validated
boolean transactionTlogValid =
handleIncompleteTlogTruncation(transactionLogPath) && ConfigurationReader.validateTlog(
transactionLogPath);

// if tlog is present, read the tlog first because the yaml config file may not be up to date
if (tlogExists && tlogValidationError == null) {
// if config.tlog is valid, read the tlog first because the yaml config file may not be up to date
if (transactionTlogValid) {
kernel.getConfig().read(transactionLogPath);
} else {
// if config.tlog is not valid, try to read config from backup tlogs
readConfigFromBackUpTLog(transactionLogPath, bootstrapTlogPath);
readFromTlog = false;
}

// tlog recovery logic if the main tlog isn't valid
if (tlogValidationError != null) {
// Attempt to load from backup tlog file
Path backupTlogPath = CommitableFile.getBackupFile(transactionLogPath);
boolean backupValid = false;
if (Files.exists(backupTlogPath)) {
try {
ConfigurationReader.validateTlog(backupTlogPath);
backupValid = true;
} catch (IOException e) {
logger.atError().log("Backup transaction log at {} is invalid", backupTlogPath, e);
}
}

if (backupValid) {
logger.atError()
.log("Transaction log {} is invalid and so is the backup at {}, will attempt to "
+ "load configuration from {}", transactionLogPath, backupTlogPath,
bootstrapTlogPath, tlogValidationError);
kernel.getConfig().read(backupTlogPath);
readFromNonTlog = true;
} else if (bootstrapTlogExists) {
// If no backup or if the backup was invalid, then try loading from bootstrap
logger.atError()
.log("Transaction log {} is invalid and no usable backup exists, will attempt to load "
+ "configuration from {}", transactionLogPath, bootstrapTlogPath,
tlogValidationError);
kernel.getConfig().read(bootstrapTlogPath);
readFromNonTlog = true;
} else {
// There are no files to load from
logger.atError()
.log("Transaction log {} is invalid and no usable backup exists", transactionLogPath,
tlogValidationError);
}
// read from external configs
Path externalConfig = nucleusPaths.configPath().resolve(Kernel.DEFAULT_CONFIG_YAML_FILE_READ);
boolean externalConfigFromCmd = Utils.isNotEmpty(kernelCommandLine.getProvidedInitialConfigPath());
if (externalConfigFromCmd) {
externalConfig = Paths.get(kernelCommandLine.getProvidedInitialConfigPath());
}

// not validating its content since the file could be in non-tlog format
boolean externalConfigExists = Files.exists(externalConfig);
// If there is no tlog, or the path was provided via commandline, read in that file
if ((externalConfigFromCmd || !tlogExists) && externalConfigExists) {
if ((externalConfigFromCmd || !transactionTlogValid) && externalConfigExists) {
kernel.getConfig().read(externalConfig);
readFromNonTlog = true;
readFromTlog = false;
}

// If no bootstrap was present, then write one out now that we've loaded our config so that we can
// fallback to something
if (!bootstrapTlogExists) {
// fallback to something in future
if (!Files.exists(bootstrapTlogPath)) {
kernel.writeEffectiveConfigAsTransactionLog(bootstrapTlogPath);
}
}

// write new tlog and config files
// only dump out the current config if we read from a source which was not the tlog
if (readFromNonTlog) {
if (!readFromTlog) {
kernel.writeEffectiveConfigAsTransactionLog(transactionLogPath);
}
kernel.writeEffectiveConfig();
Expand All @@ -366,6 +329,68 @@ void initConfigAndTlog() {
}
}

/*
* Check if last tlog truncation was interrupted and undo its effect
*
* @param transactionLogPath path to config.tlog
* @return true if last tlog truncation was complete or if we are able to undo its effect;
* false only if there was an IO error while undoing its effect (renaming the old tlog file)
*/
private boolean handleIncompleteTlogTruncation(Path transactionLogPath) {
Path oldTlogPath = ConfigurationWriter.getOldTlogPath(transactionLogPath);
// At the beginning of tlog truncation, the original config.tlog file is moved to config.tlog.old
// If .old file exists, then the last truncation was incomplete, so we need to undo its effect by moving it
// back to the original location.
if (Files.exists(oldTlogPath)) {
// we don't need to validate the content of old tlog here, since the existence of old tlog itself signals
// that the content in config.tlog at the moment is unusable
logger.atWarn().log("Config tlog truncation was interrupted by last nucleus shutdown and an old version "
+ "of config.tlog exists. Undoing the effect of incomplete truncation by moving {} back to {}",
oldTlogPath, transactionLogPath);
try {
Files.move(oldTlogPath, transactionLogPath, StandardCopyOption.REPLACE_EXISTING);
} catch (IOException e) {
logger.atError().setCause(e).log("An IO error occurred while moving the old tlog file. Will "
+ "attempt to load from backup configs");
return false;
}
}
// also delete the new file (config.tlog+) as part of undoing the effect of incomplete truncation
Path newTlogPath = CommitableFile.getNewFile(transactionLogPath);
try {
Files.deleteIfExists(newTlogPath);
} catch (IOException e) {
// do not throw since it does not impact loading configs
logger.atWarn().setCause(e).log("Failed to delete {}", newTlogPath);
}
return true;
}

/*
* Read configs from backup tlog files.
* the fallback order is config.tlog~ -> bootstrap.tlog -> bootstrap.tlog~
*
* @param transactionLogPath path to main config tlog
* @param bootstrapTlogPath path to bootstrap config tlog
* @throws IOException IO error while reading file
*/
private void readConfigFromBackUpTLog(Path transactionLogPath, Path bootstrapTlogPath) throws IOException {
List<Path> tlogBackupPathsInOrder =
Arrays.asList(CommitableFile.getBackupFile(transactionLogPath), // config.tlog~
bootstrapTlogPath, // bootstrap.tlog
CommitableFile.getBackupFile(bootstrapTlogPath) // bootstrap.tlog~
);
for (Path tlogBackupPath : tlogBackupPathsInOrder) {
if (ConfigurationReader.validateTlog(tlogBackupPath)) {
logger.atError().log("Transaction log {} is invalid, will attempt to load configuration from {}",
transactionLogPath, tlogBackupPath);
kernel.getConfig().read(tlogBackupPath);
return;
}
}
logger.atError().log("Transaction log {} is invalid and no usable backup exists", transactionLogPath);
}

@SuppressWarnings("PMD.CloseResource")
private Queue<String> findBuiltInServicesAndPlugins() {
Queue<String> autostart = new LinkedList<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.api.extension.ExtensionContext;
import org.junit.jupiter.api.io.TempDir;

import java.io.IOException;
Expand All @@ -21,10 +22,11 @@

import static com.aws.greengrass.lifecyclemanager.GreengrassService.SERVICES_NAMESPACE_TOPIC;
import static com.aws.greengrass.lifecyclemanager.GreengrassService.SERVICE_LIFECYCLE_NAMESPACE_TOPIC;
import static com.aws.greengrass.testcommons.testutilities.ExceptionLogProtector.ignoreExceptionOfType;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;

@ExtendWith(GGExtension.class)
class ConfigurationReaderTest {
Expand Down Expand Up @@ -207,12 +209,13 @@ void GIVEN_tlog_WHEN_merge_THEN_topics_created_with_correct_modtime() throws Exc
}

@Test
void GIVEN_corrupted_tlog_WHEN_validate_tlog_THEN_correct_exception_is_thrown() throws Exception {
void GIVEN_corrupted_tlog_WHEN_validate_tlog_THEN_return_false(ExtensionContext context) throws Exception {
ignoreExceptionOfType(context, MalformedInputException.class);
Path emptyTlogPath = Files.createTempFile(tempDir, null, null);
assertThrows(IOException.class, () -> ConfigurationReader.validateTlog(emptyTlogPath));
assertFalse(ConfigurationReader.validateTlog(emptyTlogPath));

// test a config file with non-UTF8 encoding
Path corruptedTlogPath = Paths.get(this.getClass().getResource("corruptedConfig.tlog").toURI());
assertThrows(MalformedInputException.class, () -> ConfigurationReader.validateTlog(corruptedTlogPath));
assertFalse(ConfigurationReader.validateTlog(corruptedTlogPath));
}
}
Loading

0 comments on commit 0054611

Please sign in to comment.