# Patch summary (reviewer preamble; text before the first "diff --git" header is not part of any hunk).
#
# NOTE: the line breaks of this patch have been collapsed, and the .csproj hunk appears to have
# lost its element markup (only +/- markers, "all" and "PreserveNewest" remain). Restore the
# original line structure before trying to apply it.
#
# Files touched:
#   * src/CsvProc9000/Csv/CsvReaderFactory.cs
#       - adds an ILogger constructor dependency (null-checked like fileSystem) stored in _logger;
#       - adds a private GetEncoding(fileName) that opens the file, feeds it to a Ude
#         CharsetDetector, logs the charset and confidence at Trace level, and resolves the
#         charset name through CodePagesEncodingProvider.Instance.GetEncoding;
#       - Create() now calls GetEncoding(file) first and passes the result to both the
#         StreamReader and CsvConfiguration.Encoding; Delimiter/HasHeaderRecord are unchanged.
#   * src/CsvProc9000/Csv/CsvWriterFactory.cs
#       - sets CsvConfiguration.Encoding = Encoding.UTF8; the StreamWriter is still built
#         from the file path alone.
#   * src/CsvProc9000/CsvProc9000.csproj
#       - contents not recoverable from this text.
#
# NOTE(review): detector.Charset is handed to GetEncoding without any null/unsupported check and
#   the returned value goes straight into StreamReader - confirm what happens when detection
#   fails or the provider does not know the name (the provider is documented to return null
#   for names it does not support).
# NOTE(review): GetEncoding opens the file (without FileShare.None) before Create() opens it
#   again with FileShare.None, so detection and reading use two separate opens - confirm the
#   gap between them is acceptable given the comment about concurrent writers.
# NOTE(review): the constructor takes the non-generic ILogger - verify the container can
#   resolve it.
# NOTE(review): [NotNull] is no longer namespace-qualified; presumably it now binds to
#   System.Diagnostics.CodeAnalysis rather than JetBrains.Annotations - confirm intended.
# NOTE(review): CsvWriterFactory declares UTF-8 in the configuration but passes no encoding to
#   StreamWriter - confirm the two are meant to agree implicitly.
#
diff --git a/src/CsvProc9000/Csv/CsvReaderFactory.cs b/src/CsvProc9000/Csv/CsvReaderFactory.cs index 2728a36..2937b68 100644 --- a/src/CsvProc9000/Csv/CsvReaderFactory.cs +++ b/src/CsvProc9000/Csv/CsvReaderFactory.cs @@ -1,11 +1,14 @@ using CsvHelper; using CsvHelper.Configuration; using CsvProc9000.Csv.Contracts; +using Microsoft.Extensions.Logging; using System; using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.IO; using System.IO.Abstractions; +using System.Text; +using Ude; namespace CsvProc9000.Csv { @@ -13,11 +16,14 @@ namespace CsvProc9000.Csv internal sealed class CsvReaderFactory : ICsvReaderFactory { private readonly IFileSystem _fileSystem; + private readonly ILogger _logger; public CsvReaderFactory( - [JetBrains.Annotations.NotNull] IFileSystem fileSystem) + [NotNull] IFileSystem fileSystem, + [NotNull] ILogger logger) { _fileSystem = fileSystem ?? throw new ArgumentNullException(nameof(fileSystem)); + _logger = logger ?? throw new ArgumentNullException(nameof(logger)); } public IReader Create(string file, string delimiter) @@ -27,16 +33,37 @@ public IReader Create(string file, string delimiter) * We're using FileShare.None here, so that no other process can modify the file while we're reading it * and additionally we'll be 'notified' when another process is still writing to a file */ + var encoding = GetEncoding(file); var fileStream = _fileSystem.FileStream.Create(file, FileMode.Open, FileAccess.Read, FileShare.None); - var streamReader = new StreamReader(fileStream); + var streamReader = new StreamReader(fileStream, encoding); var csvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture) { - Delimiter = delimiter, HasHeaderRecord = true + Delimiter = delimiter, + HasHeaderRecord = true, + Encoding = encoding }; var reader = new CsvReader(streamReader, csvConfiguration); return reader; } + + private Encoding GetEncoding(string fileName) + { + using var fileStream = _fileSystem.FileStream.Create(fileName, 
FileMode.Open, FileAccess.Read); + + _logger.LogTrace("Trying to detect charset of '{File}'...", fileName); + + var detector = new CharsetDetector(); + detector.Feed(fileStream); + detector.DataEnd(); + var charset = detector.Charset; + + _logger.LogTrace("Found charset '{Charset}' with a confidence of {Confidence}", + charset, detector.Confidence); + + var encoding = CodePagesEncodingProvider.Instance.GetEncoding(charset); + return encoding; + } } } diff --git a/src/CsvProc9000/Csv/CsvWriterFactory.cs b/src/CsvProc9000/Csv/CsvWriterFactory.cs index c44fdb5..71d82e6 100644 --- a/src/CsvProc9000/Csv/CsvWriterFactory.cs +++ b/src/CsvProc9000/Csv/CsvWriterFactory.cs @@ -4,6 +4,7 @@ using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.IO; +using System.Text; namespace CsvProc9000.Csv { @@ -12,7 +13,11 @@ internal sealed class CsvWriterFactory : ICsvWriterFactory { public IWriter Create(string file, string delimiter) { - var csvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture) { Delimiter = delimiter }; + var csvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture) + { + Delimiter = delimiter, + Encoding = Encoding.UTF8 + }; var streamWriter = new StreamWriter(file); var writer = new CsvWriter(streamWriter, csvConfiguration); diff --git a/src/CsvProc9000/CsvProc9000.csproj b/src/CsvProc9000/CsvProc9000.csproj index 3f45871..212e631 100644 --- a/src/CsvProc9000/CsvProc9000.csproj +++ b/src/CsvProc9000/CsvProc9000.csproj @@ -5,37 +5,39 @@ - + all - - - - - + + + + + + + - + - + - + PreserveNewest - + - +