From 81bab5783a9e7ee0c23fccc8cb6e77eb6aa4f675 Mon Sep 17 00:00:00 2001 From: Guillaume Melquiond Date: Tue, 15 Oct 2024 11:14:21 +0200 Subject: [PATCH] Fix unicode classification of non-spacing marks. This commit adds the ~2000 non-spacing marks into the IdentPart category. This includes all the combining marks, and thus fixes #19512. This also means that characters in the range 1DC0-1DFF can no longer appear at the start of an identifier (which does not make sense anyway, as they are combining marks). This commit also fixes a few exceptions, which were actually no exception: - the dot is already in Symbol, - phonetic extensions are already in Letter. --- clib/unicode.ml | 13 ++----------- doc/changelog/03-notations/19693-fix-19512.rst | 5 +++++ 2 files changed, 7 insertions(+), 11 deletions(-) create mode 100644 doc/changelog/03-notations/19693-fix-19512.rst diff --git a/clib/unicode.ml b/clib/unicode.ml index a93a095d4b22..266059bd5e7a 100644 --- a/clib/unicode.ml +++ b/clib/unicode.ml @@ -98,25 +98,16 @@ let classify = Unicodetable.nd; (* Number, decimal digits. *) Unicodetable.nl; (* Number, letter. *) Unicodetable.no; (* Number, other. *) + Unicodetable.mn; (* Non-spacing marks. *) ]; - (* Workaround. Some characters seems to be missing in - Camomile's category tables. We add them manually. *) - mk_lookup_table_from_unicode_tables_for Letter - [ - [(0x01D00, 0x01D7F)]; (* Phonetic Extensions. *) - [(0x01D80, 0x01DBF)]; (* Phonetic Extensions Suppl. *) - [(0x01DC0, 0x01DFF)]; (* Combining Diacritical Marks Suppl.*) - ]; - - (* Exceptions (from a previous version of this function). *) + (* Exceptions from Number, other. *) mk_lookup_table_from_unicode_tables_for Symbol [ [(0x000B2, 0x000B3)]; (* Superscript 2-3. *) single 0x000B9; (* Superscript 1. *) single 0x02070; (* Superscript 0. *) [(0x02074, 0x02079)]; (* Superscript 4-9. *) - single 0x0002E; (* Dot. 
*) ]; mk_lookup_table_from_unicode_tables_for Separator [ diff --git a/doc/changelog/03-notations/19693-fix-19512.rst b/doc/changelog/03-notations/19693-fix-19512.rst new file mode 100644 index 000000000000..acb9f979df1d --- /dev/null +++ b/doc/changelog/03-notations/19693-fix-19512.rst @@ -0,0 +1,5 @@ +- **Fixed:** + Recognized all Unicode non-spacing marks as valid identifier characters + (`#19693 <https://github.com/coq/coq/pull/19693>`_, + fixes `#19512 <https://github.com/coq/coq/issues/19512>`_, + by Guillaume Melquiond).