Skip to content

Commit

Permalink
core: Improve integerValue
Browse files Browse the repository at this point in the history
  • Loading branch information
wismill committed Jun 18, 2024
1 parent ad24b7b commit defa062
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ genNumericValuesModule moduleName = Fold step mempty done
, ""
, "import Data.Ratio ((%))"
, ""
, "{-# NOINLINE numericValue #-}"
, "numericValue :: Char -> Maybe Rational"
, "numericValue = \\case" <> mkEntries values
, " _ -> Nothing"
Expand Down
4 changes: 4 additions & 0 deletions unicode-data/Changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

## TBD

- Changed `integerValue` from `Char -> Maybe Int` to `(Integral a) => Char -> Maybe a`.

## 0.4.0.1 (December 2022)

- Fix [Unicode blocks handling on big-endian architectures](https://github.com/composewell/unicode-data/issues/97).
Expand Down
17 changes: 13 additions & 4 deletions unicode-data/bench/Unicode/Char/NumericBench.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ module Unicode.Char.NumericBench
( benchmarks
) where

import Test.Tasty.Bench (Benchmark)
import Data.Int (Int64)
import Test.Tasty.Bench (Benchmark, bgroup)

import Unicode.Char.Bench (
Bench (..),
Expand All @@ -22,7 +23,15 @@ benchmarks r = bgroupWithCharRange "Unicode.Char.Numeric" r $ \chars ->
, bgroupWithChars "numericValue" chars
[ Bench "unicode-data" Num.numericValue
]
, bgroupWithChars "integerValue" chars
[ Bench "unicode-data" Num.integerValue
]
, bgroup "integerValue"
[ bgroupWithChars "Integer" chars
[ Bench "unicode-data" (Num.integerValue :: Char -> Maybe Integer)
]
, bgroupWithChars "Int64" chars
[ Bench "unicode-data" (Num.integerValue :: Char -> Maybe Int64)
]
, bgroupWithChars "Int" chars
[ Bench "unicode-data" (Num.integerValue :: Char -> Maybe Int)
]
]
]
27 changes: 23 additions & 4 deletions unicode-data/lib/Unicode/Char/Numeric.hs
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,16 @@ module Unicode.Char.Numeric
) where

import Data.Char (digitToInt, intToDigit, isDigit, isHexDigit, isOctDigit)
import Data.Int (Int64)
import Data.Maybe (isJust)
import Data.Ratio (numerator, denominator)
import Data.Ratio (denominator, numerator)

import qualified Unicode.Char.Numeric.Compat as Compat
import qualified Unicode.Internal.Char.DerivedNumericValues as V

-- $setup
-- >>> import Data.Int (Int32, Int64)

-- | Selects Unicode numeric characters, including digits from various
-- scripts, Roman numerals, et cetera.
--
Expand Down Expand Up @@ -86,16 +91,30 @@ numericValue = V.numericValue
--
-- This is a special case of 'numericValue'.
--
-- __Note:__ a character may have a numeric value but return 'False' with
-- __Warning:__ There is a risk of /integer overflow/ depending on the chosen
-- concrete return type. As of Unicode 15.0 the results range from 0 to 1e12.
--
-- >>> integerValue '\x5146' :: Maybe Int64 -- OK
-- Just 1000000000000
-- >>> integerValue '\x5146' :: Maybe Int32 -- Will overflow!
-- Just (-727379968)
--
-- Therefore it is advised to use: @'integerValue' \@'Int64'@.
--
-- __Note:__ A character may have a numeric value but return 'False' with
-- the predicate 'Unicode.Char.Numeric.Compat.isNumber', because
-- 'Unicode.Char.Numeric.Compat.isNumber' only tests
-- 'Unicode.Char.General.GeneralCategory': some CJK characters are
-- 'Unicode.Char.General.OtherLetter' and do have a numeric value.
--
-- @since 0.3.1
integerValue :: Char -> Maybe Int
{-# INLINE integerValue #-}
{-# SPECIALIZE integerValue :: Char -> Maybe Integer #-}
{-# SPECIALIZE integerValue :: Char -> Maybe Int64 #-}
{-# SPECIALIZE integerValue :: Char -> Maybe Int #-}
integerValue :: (Integral a) => Char -> Maybe a
integerValue c = do
r <- V.numericValue c
if denominator r == 1
then Just (fromIntegral (numerator r))
then Just (fromInteger (numerator r))
else Nothing
16 changes: 8 additions & 8 deletions unicode-data/lib/Unicode/Char/Numeric/Compat.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,16 @@ import qualified Unicode.Internal.Char.UnicodeData.GeneralCategory as UC
-- scripts, Roman numerals, et cetera.
--
-- This function returns 'True' if its argument has one of the
-- following 'GeneralCategory's, or 'False' otherwise:
-- following 'Unicode.Char.General.GeneralCategory's, or 'False' otherwise:
--
-- * 'DecimalNumber'
-- * 'LetterNumber'
-- * 'OtherNumber'
-- * 'Unicode.Char.General.DecimalNumber'
-- * 'Unicode.Char.General.LetterNumber'
-- * 'Unicode.Char.General.OtherNumber'
--
-- __Note:__ A character may have a numeric value (see
-- 'Unicode.Char.Numeric.numericValue') but return
-- 'False', because 'isNumber' only tests 'GeneralCategory':
-- some CJK characters are 'OtherLetter' and do have a numeric value.
-- 'Unicode.Char.Numeric.numericValue') but return 'False', because 'isNumber'
-- only tests 'Unicode.Char.General.GeneralCategory': some CJK characters are
-- 'Unicode.Char.General.OtherLetter' and do have a numeric value.
-- Use 'Unicode.Char.Numeric.isNumeric' to cover those cases as well.
--
-- prop> isNumber c == Data.Char.isNumber c
Expand All @@ -48,4 +48,4 @@ isNumber c =
UC.LetterNumber -> True
UC.OtherNumber -> True
_ -> False
where cp = ord c
where !cp = ord c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ where

import Data.Ratio ((%))

{-# NOINLINE numericValue #-}
numericValue :: Char -> Maybe Rational
numericValue = \case
'0' -> Just 0
Expand Down

0 comments on commit defa062

Please sign in to comment.