-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
core: Add optional comparison to ICU (generalCategory)
- Loading branch information
Showing
8 changed files
with
262 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,3 +71,5 @@ library | |
extra-libraries: icuuc | ||
pkgconfig-depends: | ||
icu-uc >= 72.1 | ||
build-tool-depends: | ||
c2hs:c2hs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
-- | | ||
-- Module : ICU.Char | ||
-- Copyright : (c) 2023 Pierre Le Marre | ||
-- License : Apache-2.0 | ||
-- Maintainer : [email protected] | ||
-- Stability : experimental | ||
-- | ||
-- Unicode character general properties | ||
-- | ||
-- @since 0.3.0 | ||
|
||
module ICU.Char | ||
( unicodeVersion | ||
, charAge | ||
, UGeneralCategory(..) | ||
, toGeneralCategory | ||
, charType | ||
) where | ||
|
||
#include <unicode/uchar.h> | ||
|
||
import Data.Char (ord) | ||
import qualified Data.Char as Char | ||
import Data.Int (Int8) | ||
import Data.Version (Version, makeVersion) | ||
import Data.Word (Word32) | ||
import Foreign (Ptr) | ||
import Foreign.C (CInt) | ||
import Foreign.Marshal.Array (allocaArray, peekArray) | ||
import System.IO.Unsafe (unsafePerformIO) | ||
|
||
type UChar32 = Word32 | ||
|
||
foreign import capi "icu.h value __hs_U_MAX_VERSION_LENGTH" maxVersionLength :: Int | ||
|
||
foreign import ccall unsafe "icu.h __hs_u_getUnicodeVersion" u_getUnicodeVersion | ||
:: Ptr Int8 -> IO () | ||
|
||
-- | ICU Unicode version | ||
unicodeVersion :: Version | ||
unicodeVersion | ||
= makeVersion | ||
. fmap fromIntegral | ||
. unsafePerformIO | ||
$ allocaArray | ||
maxVersionLength | ||
(\ptr -> u_getUnicodeVersion ptr *> peekArray maxVersionLength ptr) | ||
|
||
foreign import ccall unsafe "icu.h __hs_u_charAge" u_charAge | ||
:: UChar32 -> Ptr Int8 -> IO () | ||
|
||
-- | Character age | ||
charAge :: Char -> Version | ||
charAge c | ||
= makeVersion | ||
. fmap fromIntegral | ||
. unsafePerformIO | ||
$ allocaArray | ||
maxVersionLength | ||
(\ptr -> u_charAge cp ptr *> peekArray maxVersionLength ptr) | ||
where | ||
cp = fromIntegral (ord c) | ||
|
||
foreign import ccall safe "icu.h __hs_u_charType" u_charType | ||
:: UChar32 -> Int8 | ||
|
||
{#enum define UGeneralCategory { | ||
U_UNASSIGNED as Unassigned, | ||
U_UPPERCASE_LETTER as UppercaseLetter, | ||
U_LOWERCASE_LETTER as LowercaseLetter, | ||
U_TITLECASE_LETTER as TitlecaseLetter, | ||
U_MODIFIER_LETTER as ModifierLetter, | ||
U_OTHER_LETTER as OtherLetter, | ||
U_NON_SPACING_MARK as NonSpacingMark, | ||
U_ENCLOSING_MARK as EnclosingMark, | ||
U_COMBINING_SPACING_MARK as CombiningSpacingMark, | ||
U_DECIMAL_DIGIT_NUMBER as DecimalDigitNumber, | ||
U_LETTER_NUMBER as LetterNumber, | ||
U_OTHER_NUMBER as OtherNumber, | ||
U_SPACE_SEPARATOR as SpaceSeparator, | ||
U_LINE_SEPARATOR as LineSeparator, | ||
U_PARAGRAPH_SEPARATOR as ParagraphSeparator, | ||
U_CONTROL_CHAR as ControlChar, | ||
U_FORMAT_CHAR as FormatChar, | ||
U_PRIVATE_USE_CHAR as PrivateUseChar, | ||
U_SURROGATE as Surrogate, | ||
U_DASH_PUNCTUATION as DashPunctuation, | ||
U_START_PUNCTUATION as StartPunctuation, | ||
U_END_PUNCTUATION as EndPunctuation, | ||
U_CONNECTOR_PUNCTUATION as ConnectorPunctuation, | ||
U_OTHER_PUNCTUATION as OtherPunctuation, | ||
U_MATH_SYMBOL as MathSymbol, | ||
U_CURRENCY_SYMBOL as CurrencySymbol, | ||
U_MODIFIER_SYMBOL as ModifierSymbol, | ||
U_OTHER_SYMBOL as OtherSymbol, | ||
U_INITIAL_PUNCTUATION as InitialPunctuation, | ||
U_FINAL_PUNCTUATION as FinalPunctuation | ||
} | ||
deriving (Bounded, Eq, Ord, Show) #} | ||
|
||
-- | General category | ||
charType :: Char -> UGeneralCategory | ||
charType = toEnum . fromIntegral . u_charType . fromIntegral . ord | ||
|
||
toGeneralCategory :: UGeneralCategory -> Char.GeneralCategory | ||
toGeneralCategory = \case | ||
Unassigned -> Char.NotAssigned | ||
UppercaseLetter -> Char.UppercaseLetter | ||
LowercaseLetter -> Char.LowercaseLetter | ||
TitlecaseLetter -> Char.TitlecaseLetter | ||
ModifierLetter -> Char.ModifierLetter | ||
OtherLetter -> Char.OtherLetter | ||
NonSpacingMark -> Char.NonSpacingMark | ||
EnclosingMark -> Char.EnclosingMark | ||
CombiningSpacingMark -> Char.SpacingCombiningMark | ||
DecimalDigitNumber -> Char.DecimalNumber | ||
LetterNumber -> Char.LetterNumber | ||
OtherNumber -> Char.OtherNumber | ||
SpaceSeparator -> Char.Space | ||
LineSeparator -> Char.LineSeparator | ||
ParagraphSeparator -> Char.ParagraphSeparator | ||
ControlChar -> Char.Control | ||
FormatChar -> Char.Format | ||
PrivateUseChar -> Char.PrivateUse | ||
Surrogate -> Char.Surrogate | ||
DashPunctuation -> Char.DashPunctuation | ||
StartPunctuation -> Char.OpenPunctuation | ||
EndPunctuation -> Char.ClosePunctuation | ||
ConnectorPunctuation -> Char.ConnectorPunctuation | ||
OtherPunctuation -> Char.OtherPunctuation | ||
MathSymbol -> Char.MathSymbol | ||
CurrencySymbol -> Char.CurrencySymbol | ||
ModifierSymbol -> Char.ModifierSymbol | ||
OtherSymbol -> Char.OtherSymbol | ||
InitialPunctuation -> Char.InitialQuote | ||
FinalPunctuation -> Char.FinalQuote |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
{-# LANGUAGE CPP, BlockArguments, GADTs #-} | ||
|
||
module ICU.CharSpec | ||
( spec | ||
) where | ||
|
||
import Control.Applicative (Alternative(..)) | ||
import Data.Foldable (traverse_) | ||
import Data.Version (showVersion, versionBranch) | ||
import Numeric (showHex) | ||
import Test.Hspec | ||
( describe | ||
, expectationFailure | ||
, it | ||
, pendingWith | ||
, Spec | ||
, HasCallStack, SpecWith ) | ||
|
||
import qualified ICU.Char as ICU | ||
import qualified Unicode.Char as U | ||
|
||
spec :: Spec | ||
spec = do | ||
describe "General" do | ||
checkAndGatherErrors | ||
"charType" | ||
(GeneralCategory . U.generalCategory) | ||
(GeneralCategory . ICU.toGeneralCategory . ICU.charType) | ||
-- TODO: other functions | ||
where | ||
ourUnicodeVersion = versionBranch U.unicodeVersion | ||
theirUnicodeVersion = versionBranch ICU.unicodeVersion | ||
showCodePoint c = ("U+" ++) . fmap U.toUpper . showHex (U.ord c) | ||
|
||
-- There is no feature to display warnings other than `trace`, so | ||
-- hack our own: | ||
-- 1. Compare given functions in pure code and gather warning & errors | ||
-- 2. Create dummy spec that throw an expectation failure, if relevant. | ||
-- 3. Create pending spec for each Char that raises a Unicode version | ||
-- mismatch between ICU and unicode-data. | ||
checkAndGatherErrors | ||
:: forall a. (HasCallStack, Eq a, Show a) | ||
=> String | ||
-> (Char -> a) | ||
-> (Char -> a) | ||
-> SpecWith () | ||
checkAndGatherErrors label f fRef = do | ||
it label (maybe (pure ()) expectationFailure err) | ||
if null ws | ||
then pure () | ||
else describe (label ++ " (Unicode version conflict)") | ||
(traverse_ mkWarning ws) | ||
where | ||
Acc ws err = foldr check (Acc [] Nothing) [minBound..maxBound] | ||
check c acc | ||
-- Test passed | ||
| n == nRef = acc | ||
-- Unicode version mismatch: char is not mapped in one of the libs: | ||
-- add warning. | ||
| age' > ourUnicodeVersion || age' > theirUnicodeVersion | ||
= acc{warnings=c : warnings acc} | ||
-- Error | ||
| otherwise = | ||
let !msg = mconcat | ||
[ showCodePoint c ": expected " | ||
, show nRef | ||
, ", got ", show n, "" ] | ||
in acc{firstError = firstError acc <|> Just msg} | ||
where | ||
!n = f c | ||
!nRef = fRef c | ||
age = ICU.charAge c | ||
age' = take 3 (versionBranch age) | ||
mkWarning c = it (showCodePoint c "") . pendingWith $ mconcat | ||
[ "Incompatible ICU Unicode version: expected " | ||
, showVersion U.unicodeVersion | ||
, ", got: " | ||
, showVersion ICU.unicodeVersion | ||
, " (ICU character age is: " | ||
, showVersion (ICU.charAge c) | ||
, ")" ] | ||
|
||
-- | Helper to compare our GeneralCategory to 'Data.Char.GeneralCategory'. | ||
data GeneralCategory = forall c. (Show c, Enum c) => GeneralCategory c | ||
|
||
instance Show GeneralCategory where | ||
show (GeneralCategory a) = show a | ||
|
||
instance Eq GeneralCategory where | ||
GeneralCategory a == GeneralCategory b = fromEnum a == fromEnum b | ||
|
||
-- | Warning accumulator | ||
data Acc = Acc { warnings :: ![Char], firstError :: !(Maybe String) } | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,19 @@ | ||
{-# LANGUAGE CPP #-} | ||
|
||
module Main where | ||
|
||
import Test.Hspec | ||
import qualified Unicode.CharSpec | ||
#ifdef HAS_ICU | ||
import qualified ICU.CharSpec as ICU | ||
#endif | ||
|
||
main :: IO () | ||
main = hspec spec | ||
|
||
spec :: Spec | ||
spec = describe "Unicode.Char" Unicode.CharSpec.spec | ||
spec = do | ||
describe "Unicode.Char" Unicode.CharSpec.spec | ||
#ifdef HAS_ICU | ||
describe "ICU.Char" ICU.spec | ||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters