Skip to content

Commit

Permalink
Merge pull request #12066 from NaN-git/utf-8
Browse files Browse the repository at this point in the history
try to calculate width of UTF-8 encoded characters
  • Loading branch information
Mic92 authored Dec 28, 2024
2 parents bff9296 + 92e3095 commit b3eab02
Show file tree
Hide file tree
Showing 7 changed files with 1,636 additions and 29 deletions.
1 change: 1 addition & 0 deletions maintainers/flake-module.nix
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@
''^src/libutil/util\.cc$''
''^src/libutil/util\.hh$''
''^src/libutil/variant-wrapper\.hh$''
''^src/libutil/widecharwidth/widechar_width\.h$'' # vendored source
''^src/libutil/windows/file-descriptor\.cc$''
''^src/libutil/windows/file-path\.cc$''
''^src/libutil/windows/processes\.cc$''
Expand Down
4 changes: 4 additions & 0 deletions src/libutil-tests/terminal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ TEST(filterANSIEscapes, utf8)
ASSERT_EQ(filterANSIEscapes("fóóbär", true, 3), "fóó");
ASSERT_EQ(filterANSIEscapes("f€€bär", true, 4), "f€€b");
ASSERT_EQ(filterANSIEscapes("f𐍈𐍈bär", true, 4), "f𐍈𐍈b");
ASSERT_EQ(filterANSIEscapes("f🔍bar", true, 6), "f🔍bar");
ASSERT_EQ(filterANSIEscapes("f🔍bar", true, 3), "f🔍");
ASSERT_EQ(filterANSIEscapes("f🔍bar", true, 2), "f");
ASSERT_EQ(filterANSIEscapes("foo\u0301", true, 3), "foó");
}

TEST(filterANSIEscapes, osc8)
Expand Down
6 changes: 6 additions & 0 deletions src/libutil/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ deps_private += cpuid
nlohmann_json = dependency('nlohmann_json', version : '>= 3.9')
deps_public += nlohmann_json

cxx = meson.get_compiler('cpp')

config_h = configure_file(
configuration : configdata,
output : 'config-util.hh',
Expand Down Expand Up @@ -168,6 +170,10 @@ sources = files(
)

include_dirs = [include_directories('.')]
if not cxx.has_header('widechar_width.h', required : false)
# use vendored widechar_width.h
include_dirs += include_directories('./widecharwidth')
endif

headers = [config_h] + files(
'abstract-setting-to-json.hh',
Expand Down
1 change: 1 addition & 0 deletions src/libutil/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ mkMesonLibrary (finalAttrs: {
./nix-meson-build-support
../../.version
./.version
./widecharwidth
./meson.build
./meson.options
./linux/meson.build
Expand Down
90 changes: 61 additions & 29 deletions src/libutil/terminal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,53 @@
# include <sys/ioctl.h>
#endif
#include <unistd.h>
#include <widechar_width.h>

namespace {

/**
 * Decode the single UTF-8 encoded character at the start of `s` and estimate
 * how many terminal columns it occupies.
 *
 * @param s Text to inspect. Only the first encoded character is examined.
 * @return A pair `{width, bytes}` where `bytes` is the number of bytes to
 *         consume from `s` and `width` is the estimated column count:
 *         - empty input: `{0, 0}`;
 *         - invalid start byte: `{1, 1}`;
 *         - missing or malformed continuation byte at offset `i`: `{i, i}`,
 *           i.e. the bytes before the offending one, one column per byte;
 *         - overlong encoding or value >= 0x110000: `{bytes, bytes}`,
 *           i.e. one column per byte;
 *         - otherwise the width reported by `widechar_wcwidth()`, with
 *           `widechar_ambiguous` mapped to 1, `widechar_widened_in_9` mapped
 *           to 2 and any other negative result mapped to 0.
 */
inline std::pair<int, size_t> charWidthUTF8Helper(std::string_view s)
{
    // Reading s[0] on an empty view would be undefined behaviour.
    if (s.empty())
        return {0, 0};

    size_t bytes = 1;
    // Go through unsigned char so the value does not depend on whether plain
    // `char` is signed on the target platform.
    uint32_t ch = static_cast<unsigned char>(s[0]);
    // Valid code points for a sequence of this length lie in [lowest, limit).
    // Anything below `lowest` could have been encoded with fewer bytes, which
    // makes the sequence overlong and therefore ill-formed.
    uint32_t lowest = 0;
    uint32_t limit = 1U << 7;
    if ((ch & 0x80U) == 0U) {
        // Plain ASCII: a single byte, nothing more to decode.
    } else if ((ch & 0xe0U) == 0xc0U) {
        ch &= 0x1fU;
        bytes = 2;
        lowest = 1U << 7;
        limit = 1U << 11;
    } else if ((ch & 0xf0U) == 0xe0U) {
        ch &= 0x0fU;
        bytes = 3;
        lowest = 1U << 11;
        limit = 1U << 16;
    } else if ((ch & 0xf8U) == 0xf0U) {
        ch &= 0x07U;
        bytes = 4;
        lowest = 1U << 16;
        limit = 0x110000U;
    } else {
        return {1, 1}; // invalid UTF-8 start byte
    }

    for (size_t i = 1; i < bytes; i++) {
        const uint32_t next = i < s.size() ? static_cast<unsigned char>(s[i]) : 0U;
        if ((next & 0xc0U) != 0x80U) {
            // invalid UTF-8 encoding; assume one character per byte
            return {static_cast<int>(i), i};
        }
        ch = (ch << 6) | (next & 0x3fU);
    }

    // Fallback for overlong or out-of-range sequences: one column per byte.
    int width = static_cast<int>(bytes);
    if (ch >= lowest && ch < limit) {
        width = widechar_wcwidth(ch);
        if (width == widechar_ambiguous) {
            width = 1; // just a guess...
        } else if (width == widechar_widened_in_9) {
            width = 2;
        } else if (width < 0) {
            // Every other negative result is treated as taking no columns.
            width = 0;
        }
    }
    return {width, bytes};
}

}

namespace nix {

Expand All @@ -30,7 +77,7 @@ std::string filterANSIEscapes(std::string_view s, bool filterAll, unsigned int w
size_t w = 0;
auto i = s.begin();

while (w < (size_t) width && i != s.end()) {
while (i != s.end()) {

if (*i == '\e') {
std::string e;
Expand Down Expand Up @@ -61,46 +108,31 @@ std::string filterANSIEscapes(std::string_view s, bool filterAll, unsigned int w
}

else if (*i == '\t') {
i++; t += ' '; w++;
while (w < (size_t) width && w % 8) {
t += ' '; w++;
}
do {
if (++w > (size_t) width)
return t;
t += ' ';
} while (w % 8);
i++;
}

else if (*i == '\r' || *i == '\a')
// do nothing for now
i++;

else {
w++;
// Copy one UTF-8 character.
if ((*i & 0xe0) == 0xc0) {
t += *i++;
if (i != s.end() && ((*i & 0xc0) == 0x80)) t += *i++;
} else if ((*i & 0xf0) == 0xe0) {
t += *i++;
if (i != s.end() && ((*i & 0xc0) == 0x80)) {
t += *i++;
if (i != s.end() && ((*i & 0xc0) == 0x80)) t += *i++;
}
} else if ((*i & 0xf8) == 0xf0) {
t += *i++;
if (i != s.end() && ((*i & 0xc0) == 0x80)) {
t += *i++;
if (i != s.end() && ((*i & 0xc0) == 0x80)) {
t += *i++;
if (i != s.end() && ((*i & 0xc0) == 0x80)) t += *i++;
}
}
} else
t += *i++;
auto [chWidth, bytes] = charWidthUTF8Helper({i, s.end()});
w += chWidth;
if (w > (size_t) width) {
break;
}
t += {i, i + bytes};
i += bytes;
}
}

return t;
}


//////////////////////////////////////////////////////////////////////

static Sync<std::pair<unsigned short, unsigned short>> windowSize{{0, 0}};
Expand Down
4 changes: 4 additions & 0 deletions src/libutil/widecharwidth/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
widecharwidth - wcwidth implementation
Written in 2018 by ridiculous_fish
To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
Loading

0 comments on commit b3eab02

Please sign in to comment.