Skip to content

Commit

Permalink
Improve name aliases
Browse files Browse the repository at this point in the history
Encode the explicit length of the aliases.
  • Loading branch information
wismill committed Mar 15, 2023
1 parent 663d630 commit 871e6f2
Show file tree
Hide file tree
Showing 9 changed files with 769 additions and 587 deletions.
80 changes: 49 additions & 31 deletions ucd2haskell/exe/Parser/Text.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1732,8 +1732,7 @@ genNamesModule moduleName =
where
shows' = \case
'\0' -> \s -> '\\' : '0' : s
-- Note: names are ASCII
c -> (c :)
c -> (c :) -- Note: names are ASCII

genAliasesModule
:: Monad m
Expand All @@ -1750,12 +1749,14 @@ genAliasesModule moduleName =
, showHexCodepoint char
, "'# -> \""
, mkCharAliasesLiteral char aliases
, "\"#\n"
, "\"#"
]

mkCharAliasesLiteral :: Char -> Aliases -> String
mkCharAliasesLiteral char aliasesList =
enumMapToAddrLiteral 0 0xfff (reverse index) (mconcat (reverse aliases))
enumMapToAddrLiteral 0 0xfff
(reverse index)
(mconcat (reverse ("\\0":aliases)))
where
(index, aliases, _) = Map.foldlWithKey'
(addAliasType char)
Expand All @@ -1779,40 +1780,39 @@ genAliasesModule moduleName =
( 0 : index
, aliasesAcc
, lastAliasIndex )
aliases -> -- traceShow (char, ty, fromIntegral lastAliasIndex :: Word8)
( fromIntegral lastAliasIndex : index
, encodedAliases
, lastAliasIndex' )
aliases -> if lastAliasIndex < 0xff
then
( fromIntegral lastAliasIndex : index
, encodedAliases
, lastAliasIndex' )
else error . mconcat $
[ "Cannot encode char ", show char
, "aliases. Offset: ", show lastAliasIndex, " >= 0xff" ]
where
(encodedAliases, lastAliasIndex') =
addEncodedAliases (aliasesAcc, lastAliasIndex) aliases
addEncodedAliases acc@(as, offset) = \case
Alias alias : rest -> if offset' < 0xff
then addEncodedAliases
-- next offset : null-terminated string
( mconcat ["\\", show nextAliasOffset, alias, "\\0"]:as
, offset' )
rest
else error . mconcat $
[ "Cannot encode alias “", alias, "” offset for char : "
, show char
, " . Offset: ", show offset', " >= 0xff" ]
addEncodedAliases (as, offset) = \case
Alias alias : rest -> addEncodedAliases
( mconcat ["\\", show len, alias] : as
, offset' )
rest
where
-- offset + length + null
offset' = offset + length alias + 2
nextAliasOffset = if null rest then 0 else offset'
[] -> acc
len = length alias
offset' = offset + len + 1
[] -> ("\\0" : as, offset + 1)

done names = unlines
[ apacheLicense 2022 moduleName
, "{-# LANGUAGE DeriveGeneric, PatternSynonyms #-}"
, "{-# OPTIONS_HADDOCK hide #-}"
, ""
, "module " <> moduleName
, "(NameAliasType(..), maxNameAliasType, nameAliases)"
, "(NameAliasType(..), pattern MaxNameAliasType, nameAliases)"
, "where"
, ""
, "import Data.Ix (Ix)"
, "import GHC.Exts (Addr#, Char#)"
, "import GHC.Exts (Addr#, Char#, Int#)"
, "import GHC.Generics (Generic)"
, ""
, "-- | Type of name alias. See Unicode Standard 15.0.0, section 4.8."
, "--"
Expand All @@ -1831,21 +1831,39 @@ genAliasesModule moduleName =
, " | Abbreviation"
, " -- ^ Commonly occurring abbreviations (or acronyms) for control codes,"
, " -- format characters, spaces, and variation selectors."
, " deriving (Enum, Bounded, Eq, Ord, Ix, Show)"
, " deriving (Generic, Enum, Bounded, Eq, Ord, Ix, Show)"
, ""
, "-- $setup"
, "-- >>> import GHC.Exts (Int(..))"
, ""
, "-- | >>> maxNameAliasType == fromEnum (maxBound :: NameAliasType)"
, "maxNameAliasType :: Int"
, "maxNameAliasType = 4"
, "-- |"
, "-- >>> I# MaxNameAliasType == fromEnum (maxBound :: NameAliasType)"
, "-- True"
, "pattern MaxNameAliasType :: Int#"
, "pattern MaxNameAliasType = "
<> show (fromEnum (maxBound :: AliasType)) <> "#"
, ""
, "-- | Detailed character names aliases."
, "-- The names are listed in the original order of the UCD."
, "--"
, "-- See 'nameAliases' if the alias type is not required."
, "-- Encoding:"
, "--"
, "-- * If there is no alias, return @\"\\\\xff\"#@."
, "-- * For each type of alias, the aliases are encoded as list of (length, alias)."
, "-- The list terminates with @\\\\0@."
, "-- * The list are then concatenated in order of type of alias and"
, "-- terminates with @\\\\0@."
, "-- * The first "
<> show (fromEnum (maxBound :: AliasType) + 1)
<> " bytes represent each one the index of the first element of the"
, "-- corresponding list of aliases. When the list is empty, then the index is 0."
, "-- * Example: @\\\"\\\\5\\\\0\\\\13\\\\0\\\\0\\\\3XXX\\\\2YY\\\\0\\\\4ZZZZ\\\\0\\\\0\\\"#@"
, "-- represents: @[('Correction',[\\\"XXX\\\", \\\"YY\\\"]),('Alternate', [\\\"ZZZZ\\\"])]@."
, "--"
, "-- @since 0.1.0"
, "nameAliases :: Char# -> Addr#"
, "nameAliases = \\case"
, mconcat names
, mconcat (intersperse "\n" names)
, " _ -> \"\\xff\"#"
]

Expand Down
124 changes: 109 additions & 15 deletions unicode-data-names/bench/Main.hs
Original file line number Diff line number Diff line change
@@ -1,10 +1,22 @@
{-# LANGUAGE CPP, ExistentialQuantification #-}
{-# OPTIONS_GHC -Wno-orphans #-}

import Control.DeepSeq (NFData, deepseq)
import Control.DeepSeq (NFData, deepseq, force)
import Control.Exception (evaluate)
import Data.Ix (Ix(..))
import Test.Tasty.Bench (Benchmark, bgroup, bcompare, bench, nf, defaultMain)
import Data.Proxy (Proxy(..))
import GHC.Exts (Char(..), indexCharOffAddr#)
import Test.Tasty (askOption, includingOptions)
import Test.Tasty.Bench (Benchmark, bgroup, bcompare, bench, benchIngredients, nf, env)
import Test.Tasty.Options
( IsOption(defaultValue, optionHelp, optionName, parseValue)
, OptionDescription(..) )
import Test.Tasty.Runners (TestTree, defaultMainWithIngredients)

import qualified Unicode.Char as UChar
import qualified Unicode.Char.General.Names as String
import qualified Unicode.Internal.Char.UnicodeData.DerivedName as DerivedName
import qualified Unicode.Internal.Char.UnicodeData.NameAliases as NameAliases
#ifdef HAS_BYTESTRING
import qualified Unicode.Char.General.Names.ByteString as ByteString
import Data.ByteString ()
Expand All @@ -20,15 +32,77 @@ import qualified ICU.Names.Text as ICUText
#endif
#endif

--------------------------------------------------------------------------------
-- CLI options
--------------------------------------------------------------------------------

data CharRange = CharRange !Char !Char

instance IsOption CharRange where
defaultValue = CharRange minBound maxBound
parseValue = \case
"ascii" -> Just (CharRange minBound '\x7f')
"bmp" -> Just (CharRange minBound '\xffff')
"planes0To3" -> Just (CharRange minBound '\x3FFFF')
-- [TODO] handle errors
s ->
let (l, u) = drop 1 <$> break (== '-') s
in Just (CharRange (UChar.chr (read l)) (UChar.chr (read u)))
optionName = pure "chars"
optionHelp = pure "Range of chars to test"

data Filter = NoFilter | WithName | WithNameAlias

instance IsOption Filter where
defaultValue = WithName
parseValue = \case
"name" -> Just WithName
"alias" -> Just WithNameAlias
"none" -> Just NoFilter
_ -> Nothing
optionName = pure "chars-filter"
optionHelp = pure "Filter the chars to test"

--------------------------------------------------------------------------------
-- Benchmark utils
--------------------------------------------------------------------------------

-- Orphan instance
instance NFData String.NameAliasType

-- | A unit benchmark
data Bench = forall a. (NFData a) => Bench
{ -- | Name
_title :: !String
-- | Function to benchmark
, _func :: Char -> a }

hasName :: Char -> Bool
hasName (C# c#) = case DerivedName.name c# of
(# _, 0# #) -> False
_ -> True

hasNameAlias :: Char -> Bool
hasNameAlias (C# c#) =
let addr# = NameAliases.nameAliases c#
in case indexCharOffAddr# addr# 0# of
'\xff'# -> False
_ -> True

--------------------------------------------------------------------------------
-- Benchmark
--------------------------------------------------------------------------------

main :: IO ()
main = defaultMain
main = do
let customOpts = [ Option (Proxy :: Proxy CharRange)
, Option (Proxy :: Proxy Filter)]
ingredients = includingOptions customOpts : benchIngredients
defaultMainWithIngredients ingredients
(askOption (askOption . benchmarks))

benchmarks :: CharRange -> Filter -> TestTree
benchmarks charRange charFilter = bgroup "All"
[ bgroup "Unicode.Char.General.Names"
[ bgroup "name"
[ bgroup' "name" "String"
Expand Down Expand Up @@ -96,33 +170,33 @@ main = defaultMain
, bgroup "nameAliasesByType"
[ bgroup' "nameAliasesByType" "String"
[ Bench "unicode-data"
(\c -> (`String.nameAliasesByType` c) <$> [minBound..maxBound])
(\c -> fold_ (`String.nameAliasesByType` c))
]
#ifdef HAS_BYTESTRING
, bgroup' "nameAliasesByType" "ByteString"
[ Bench "unicode-data"
(\c -> (`ByteString.nameAliasesByType` c) <$> [minBound..maxBound])
(\c -> fold_ (`String.nameAliasesByType` c))
]
#endif
#ifdef HAS_TEXT
, bgroup' "nameAliasesByType" "Text"
[ Bench "unicode-data"
(\c -> (`Text.nameAliasesByType` c) <$> [minBound..maxBound])
(\c -> fold_ (`String.nameAliasesByType` c))
]
#endif
]
, bgroup "nameAliasesWithTypes"
[ bgroup' "nameAliasesWithTypes" "String"
[ Bench "unicode-data" (show . String.nameAliasesWithTypes)
[ Bench "unicode-data" String.nameAliasesWithTypes
]
#ifdef HAS_BYTESTRING
, bgroup' "nameAliasesWithTypes" "ByteString"
[ Bench "unicode-data" (show . ByteString.nameAliasesWithTypes)
[ Bench "unicode-data" ByteString.nameAliasesWithTypes
]
#endif
#ifdef HAS_TEXT
, bgroup' "nameAliasesWithTypes" "Text"
[ Bench "unicode-data" (show . Text.nameAliasesWithTypes)
[ Bench "unicode-data" Text.nameAliasesWithTypes
]
#endif
]
Expand Down Expand Up @@ -151,18 +225,38 @@ main = defaultMain

-- [NOTE] Works if groupTitle uniquely identifies the benchmark group.
benchNF' superGroupTitle groupTitle title = case title of
"unicode-data" -> benchNF title
"unicode-data" -> benchCharsNF title
_ ->
bcompare ( mconcat
[ "$NF == \"unicode-data\" && $(NF-1) == \""
, groupTitle
, "\" && $(NF-2) == \""
, superGroupTitle
, "\"" ] )
. benchNF title
. benchCharsNF title

benchNF :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark
benchNF t f = bench t $ nf (fold_ f) (minBound, maxBound)
{-# INLINE benchCharsNF #-}
benchCharsNF
:: forall a. (NFData a)
=> String
-> (Char -> a)
-> Benchmark
benchCharsNF t f =
-- Avoid side-effects with garbage collection (see tasty-bench doc)
env
(evaluate (force chars')) -- initialize
(bench t . nf (foldr (deepseq . f) ())) -- benchmark
where
CharRange l u = charRange
extraFilter = case charFilter of
NoFilter -> const True
WithName -> hasName
WithNameAlias -> hasNameAlias
chars = filter isValid [l..u]
-- Ensure to have sufficiently chars
n = 0x10FFFF `div` length chars
chars' = mconcat (replicate n chars)
isValid c = UChar.generalCategory c < UChar.Surrogate && extraFilter c

fold_ :: forall a. (NFData a) => (Char -> a) -> (Char, Char) -> ()
fold_ f = foldr (deepseq . f) () . range
fold_ :: forall a. (NFData a) => (String.NameAliasType -> a) -> ()
fold_ f = foldr (deepseq . f) () (range (minBound, maxBound))
Loading

0 comments on commit 871e6f2

Please sign in to comment.