From defa062c1acde92ab56d081a3f8e3a09d9339ea1 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Mon, 17 Jun 2024 16:52:49 +0200 Subject: [PATCH] core: Improve integerValue --- .../Modules/DerivedNumericValues.hs | 1 + unicode-data/Changelog.md | 4 +++ .../bench/Unicode/Char/NumericBench.hs | 17 +++++++++--- unicode-data/lib/Unicode/Char/Numeric.hs | 27 ++++++++++++++++--- .../lib/Unicode/Char/Numeric/Compat.hs | 16 +++++------ .../Internal/Char/DerivedNumericValues.hs | 1 + 6 files changed, 50 insertions(+), 16 deletions(-) diff --git a/ucd2haskell/exe/UCD2Haskell/Modules/DerivedNumericValues.hs b/ucd2haskell/exe/UCD2Haskell/Modules/DerivedNumericValues.hs index a5f6f516..d9de3422 100644 --- a/ucd2haskell/exe/UCD2Haskell/Modules/DerivedNumericValues.hs +++ b/ucd2haskell/exe/UCD2Haskell/Modules/DerivedNumericValues.hs @@ -53,6 +53,7 @@ genNumericValuesModule moduleName = Fold step mempty done , "" , "import Data.Ratio ((%))" , "" + , "{-# NOINLINE numericValue #-}" , "numericValue :: Char -> Maybe Rational" , "numericValue = \\case" <> mkEntries values , " _ -> Nothing" diff --git a/unicode-data/Changelog.md b/unicode-data/Changelog.md index 4f7d7372..654da281 100644 --- a/unicode-data/Changelog.md +++ b/unicode-data/Changelog.md @@ -1,5 +1,9 @@ # Changelog +## TBD + +- Changed `integerValue` from `Char -> Maybe Int` to `(Integral a) => Char -> Maybe a`. + ## 0.4.0.1 (December 2022) - Fix [Unicode blocks handling on big-endian architectures](https://github.com/composewell/unicode-data/issues/97). 
diff --git a/unicode-data/bench/Unicode/Char/NumericBench.hs b/unicode-data/bench/Unicode/Char/NumericBench.hs index c6df39e7..d105af54 100644 --- a/unicode-data/bench/Unicode/Char/NumericBench.hs +++ b/unicode-data/bench/Unicode/Char/NumericBench.hs @@ -2,7 +2,8 @@ module Unicode.Char.NumericBench ( benchmarks ) where -import Test.Tasty.Bench (Benchmark) +import Data.Int (Int64) +import Test.Tasty.Bench (Benchmark, bgroup) import Unicode.Char.Bench ( Bench (..), @@ -22,7 +23,15 @@ benchmarks r = bgroupWithCharRange "Unicode.Char.Numeric" r $ \chars -> , bgroupWithChars "numericValue" chars [ Bench "unicode-data" Num.numericValue ] - , bgroupWithChars "integerValue" chars - [ Bench "unicode-data" Num.integerValue - ] + , bgroup "integerValue" + [ bgroupWithChars "Integer" chars + [ Bench "unicode-data" (Num.integerValue :: Char -> Maybe Integer) + ] + , bgroupWithChars "Int64" chars + [ Bench "unicode-data" (Num.integerValue :: Char -> Maybe Int64) + ] + , bgroupWithChars "Int" chars + [ Bench "unicode-data" (Num.integerValue :: Char -> Maybe Int) + ] + ] ] diff --git a/unicode-data/lib/Unicode/Char/Numeric.hs b/unicode-data/lib/Unicode/Char/Numeric.hs index 770337fa..01cb5ff6 100644 --- a/unicode-data/lib/Unicode/Char/Numeric.hs +++ b/unicode-data/lib/Unicode/Char/Numeric.hs @@ -26,11 +26,16 @@ module Unicode.Char.Numeric ) where import Data.Char (digitToInt, intToDigit, isDigit, isHexDigit, isOctDigit) +import Data.Int (Int64) import Data.Maybe (isJust) -import Data.Ratio (numerator, denominator) +import Data.Ratio (denominator, numerator) + import qualified Unicode.Char.Numeric.Compat as Compat import qualified Unicode.Internal.Char.DerivedNumericValues as V +-- $setup +-- >>> import Data.Int (Int32, Int64) + -- | Selects Unicode numeric characters, including digits from various -- scripts, Roman numerals, et cetera. -- @@ -86,16 +91,30 @@ numericValue = V.numericValue -- -- This is a special case of 'numericValue'. 
-- --- __Note:__ a character may have a numeric value but return 'False' with +-- __Warning:__ There is a risk of /integer overflow/ depending on the chosen +-- concrete return type. As of Unicode 15.0 the results range from 0 to 1e12. +-- +-- >>> integerValue '\x5146' :: Maybe Int64 -- OK +-- Just 1000000000000 +-- >>> integerValue '\x5146' :: Maybe Int32 -- Will overflow! +-- Just (-727379968) +-- +-- Therefore it is advised to use: @'integerValue' \@'Int64'@. +-- +-- __Note:__ A character may have a numeric value but return 'False' with -- the predicate 'Unicode.Char.Numeric.Compat.isNumber', because -- 'Unicode.Char.Numeric.Compat.isNumber' only tests -- 'Unicode.Char.General.GeneralCategory': some CJK characters are -- 'Unicode.Char.General.OtherLetter' and do have a numeric value. -- -- @since 0.3.1 -integerValue :: Char -> Maybe Int +{-# INLINE integerValue #-} +{-# SPECIALIZE integerValue :: Char -> Maybe Integer #-} +{-# SPECIALIZE integerValue :: Char -> Maybe Int64 #-} +{-# SPECIALIZE integerValue :: Char -> Maybe Int #-} +integerValue :: (Integral a) => Char -> Maybe a integerValue c = do r <- V.numericValue c if denominator r == 1 - then Just (fromIntegral (numerator r)) + then Just (fromInteger (numerator r)) else Nothing diff --git a/unicode-data/lib/Unicode/Char/Numeric/Compat.hs b/unicode-data/lib/Unicode/Char/Numeric/Compat.hs index afb52fa7..3e9f473a 100644 --- a/unicode-data/lib/Unicode/Char/Numeric/Compat.hs +++ b/unicode-data/lib/Unicode/Char/Numeric/Compat.hs @@ -21,16 +21,16 @@ import qualified Unicode.Internal.Char.UnicodeData.GeneralCategory as UC -- scripts, Roman numerals, et cetera. 
-- -- This function returns 'True' if its argument has one of the --- following 'GeneralCategory's, or 'False' otherwise: +-- following 'Unicode.Char.General.GeneralCategory's, or 'False' otherwise: -- --- * 'DecimalNumber' --- * 'LetterNumber' --- * 'OtherNumber' +-- * 'Unicode.Char.General.DecimalNumber' +-- * 'Unicode.Char.General.LetterNumber' +-- * 'Unicode.Char.General.OtherNumber' -- -- __Note:__ a character may have a numeric value (see --- 'Unicode.Char.Numeric.numericValue') but return --- 'False', because 'isNumber' only tests 'GeneralCategory': --- some CJK characters are 'OtherLetter' and do have a numeric value. +-- 'Unicode.Char.Numeric.numericValue') but return 'False', because 'isNumber' +-- only tests 'Unicode.Char.General.GeneralCategory': some CJK characters are +-- 'Unicode.Char.General.OtherLetter' and do have a numeric value. -- Use 'Unicode.Char.Numeric.isNumeric' to cover those cases as well. -- -- prop> isNumber c == Data.Char.isNumber c @@ -48,4 +48,4 @@ isNumber c = UC.LetterNumber -> True UC.OtherNumber -> True _ -> False - where cp = ord c + where !cp = ord c diff --git a/unicode-data/lib/Unicode/Internal/Char/DerivedNumericValues.hs b/unicode-data/lib/Unicode/Internal/Char/DerivedNumericValues.hs index 3123bb0a..a47c4341 100644 --- a/unicode-data/lib/Unicode/Internal/Char/DerivedNumericValues.hs +++ b/unicode-data/lib/Unicode/Internal/Char/DerivedNumericValues.hs @@ -15,6 +15,7 @@ where import Data.Ratio ((%)) +{-# NOINLINE numericValue #-} numericValue :: Char -> Maybe Rational numericValue = \case '0' -> Just 0