Skip to content

Commit

Permalink
fix: reading a csv file now reliably detects the encoding (#27)
Browse files Browse the repository at this point in the history
This currently helps with an unusual Windows-1252 encoding, which .NET did not detect by itself.
The files are now also saved as UTF-8.
  • Loading branch information
wgnf authored Nov 2, 2021
1 parent dc00e28 commit f835628
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 15 deletions.
33 changes: 30 additions & 3 deletions src/CsvProc9000/Csv/CsvReaderFactory.cs
Original file line number Diff line number Diff line change
@@ -1,23 +1,29 @@
using CsvHelper;
using CsvHelper.Configuration;
using CsvProc9000.Csv.Contracts;
using Microsoft.Extensions.Logging;
using System;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.IO;
using System.IO.Abstractions;
using System.Text;
using Ude;

namespace CsvProc9000.Csv
{
[ExcludeFromCodeCoverage] // simple factory
internal sealed class CsvReaderFactory : ICsvReaderFactory
{
private readonly IFileSystem _fileSystem;
private readonly ILogger<CsvReaderFactory> _logger;

/// <summary>
/// Creates the factory and stores the dependencies it needs for opening files and logging.
/// </summary>
/// <exception cref="ArgumentNullException">
/// Thrown when <c>fileSystem</c> or <c>logger</c> is <c>null</c>.
/// </exception>
public CsvReaderFactory(
[JetBrains.Annotations.NotNull] IFileSystem fileSystem)
[NotNull] IFileSystem fileSystem,
[NotNull] ILogger<CsvReaderFactory> logger)
{
// Null dependencies are rejected here, naming the offending parameter in the exception.
_fileSystem = fileSystem ?? throw new ArgumentNullException(nameof(fileSystem));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}

public IReader Create(string file, string delimiter)
Expand All @@ -27,16 +33,37 @@ public IReader Create(string file, string delimiter)
* We're using FileShare.None here, so that no other process can modify the file while we're reading it
* and additionally we'll be 'notified' when another process is still writing to a file
*/
var encoding = GetEncoding(file);
var fileStream = _fileSystem.FileStream.Create(file, FileMode.Open, FileAccess.Read, FileShare.None);
var streamReader = new StreamReader(fileStream);
var streamReader = new StreamReader(fileStream, encoding);

var csvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture)
{
Delimiter = delimiter, HasHeaderRecord = true
Delimiter = delimiter,
HasHeaderRecord = true,
Encoding = encoding
};
var reader = new CsvReader(streamReader, csvConfiguration);

return reader;
}

/// <summary>
/// Detects the text encoding of a file by feeding its content to the Ude charset detector.
/// </summary>
/// <param name="fileName">Path of the file whose encoding should be detected.</param>
/// <returns>
/// The encoding matching the detected charset. Falls back to <see cref="Encoding.UTF8"/>
/// when no charset could be detected or the detected charset name cannot be resolved,
/// so that callers never receive <c>null</c>.
/// </returns>
private Encoding GetEncoding(string fileName)
{
    // The stream is only needed for detection and is disposed when this method returns.
    using var fileStream = _fileSystem.FileStream.Create(fileName, FileMode.Open, FileAccess.Read);

    _logger.LogTrace("Trying to detect charset of '{File}'...", fileName);

    var detector = new CharsetDetector();
    detector.Feed(fileStream);
    detector.DataEnd();
    var charset = detector.Charset;

    // The detector reports no charset when it cannot decide (e.g. for an empty file).
    if (string.IsNullOrWhiteSpace(charset))
    {
        _logger.LogWarning("Could not detect charset of '{File}', falling back to UTF-8", fileName);
        return Encoding.UTF8;
    }

    _logger.LogTrace("Found charset '{Charset}' with a confidence of {Confidence}",
        charset, detector.Confidence);

    // The code-pages provider returns null for names it does not supply,
    // so a null result here is not an error yet.
    var encoding = CodePagesEncodingProvider.Instance.GetEncoding(charset);
    if (encoding != null)
    {
        return encoding;
    }

    try
    {
        // Resolves the encodings that are built into the runtime (UTF-8, UTF-16, ASCII, ...).
        return Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        // The detector can report charset names that .NET does not know at all.
        _logger.LogWarning("Charset '{Charset}' of '{File}' is not supported, falling back to UTF-8",
            charset, fileName);
        return Encoding.UTF8;
    }
}
}
}
7 changes: 6 additions & 1 deletion src/CsvProc9000/Csv/CsvWriterFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.IO;
using System.Text;

namespace CsvProc9000.Csv
{
Expand All @@ -12,7 +13,11 @@ internal sealed class CsvWriterFactory : ICsvWriterFactory
{
public IWriter Create(string file, string delimiter)
{
var csvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture) { Delimiter = delimiter };
var csvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture)
{
Delimiter = delimiter,
Encoding = Encoding.UTF8
};

var streamWriter = new StreamWriter(file);
var writer = new CsvWriter(streamWriter, csvConfiguration);
Expand Down
24 changes: 13 additions & 11 deletions src/CsvProc9000/CsvProc9000.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -5,37 +5,39 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="CsvHelper" Version="27.1.1"/>
<PackageReference Include="CsvHelper" Version="27.1.1" />
<PackageReference Include="JetBrains.Annotations" Version="2021.3.0">
<PrivateAssets>all</PrivateAssets>
</PackageReference>
<PackageReference Include="Microsoft.Extensions.Hosting" Version="5.0.0"/>
<PackageReference Include="Microsoft.Extensions.Hosting.WindowsServices" Version="5.0.1"/>
<PackageReference Include="Serilog.AspNetCore" Version="4.1.0"/>
<PackageReference Include="Serilog.Settings.Configuration" Version="3.3.0"/>
<PackageReference Include="System.IO.Abstractions" Version="13.2.47"/>
<PackageReference Include="Microsoft.Extensions.Hosting" Version="5.0.0" />
<PackageReference Include="Microsoft.Extensions.Hosting.WindowsServices" Version="5.0.1" />
<PackageReference Include="Serilog.AspNetCore" Version="4.1.0" />
<PackageReference Include="Serilog.Settings.Configuration" Version="3.3.0" />
<PackageReference Include="System.IO.Abstractions" Version="13.2.47" />
<PackageReference Include="System.Text.Encoding.CodePages" Version="5.0.0" />
<PackageReference Include="UDE.CSharp" Version="1.1.0" />
</ItemGroup>

<ItemGroup>
<Compile Remove="logs\**"/>
<Compile Remove="logs\**" />
</ItemGroup>

<ItemGroup>
<EmbeddedResource Remove="logs\**"/>
<EmbeddedResource Remove="logs\**" />
</ItemGroup>

<ItemGroup>
<None Remove="logs\**"/>
<None Remove="logs\**" />
<None Update="CreateService.bat">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>

<ItemGroup>
<Content Remove="logs\**"/>
<Content Remove="logs\**" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\CsvProc9000.Model\CsvProc9000.Model.csproj"/>
<ProjectReference Include="..\CsvProc9000.Model\CsvProc9000.Model.csproj" />
</ItemGroup>
</Project>

0 comments on commit f835628

Please sign in to comment.