Skip to content

Commit

Permalink
core: Improve Blocks
Browse files Browse the repository at this point in the history
  • Loading branch information
wismill committed Jun 18, 2024
1 parent e324a09 commit 093f2d6
Show file tree
Hide file tree
Showing 6 changed files with 424 additions and 378 deletions.
57 changes: 28 additions & 29 deletions ucd2haskell/exe/UCD2Haskell/Modules/Blocks.hs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ recipe = ModuleRecipe
genBlocksModule

data Acc = Acc
{ blocks :: ![BB.Builder]
{ count :: !Word
, blocks :: ![BB.Builder]
, defs :: ![BB.Builder]
, ranges :: ![(Int, Int)] }

Expand All @@ -33,10 +34,11 @@ genBlocksModule moduleName = Fold step initial done

done Acc{..} = let ranges' = reverse ranges in unlinesBB
[ apacheLicense 2022 moduleName
, "{-# LANGUAGE LambdaCase #-}"
, "{-# OPTIONS_HADDOCK hide #-}"
, ""
, "module " <> moduleName
, "(Block(..), BlockDefinition(..), block, blockDefinition)"
, "(Block(..), block, blockDefinition)"
, "where"
, ""
, "import Data.Ix (Ix)"
Expand All @@ -53,25 +55,25 @@ genBlocksModule moduleName = Fold step initial done
, " = " <> mconcat (L.intersperse "\n | " (reverse blocks))
, " deriving (Enum, Bounded, Eq, Ord, Ix, Show)"
, ""
, "-- | Block definition: range and name."
, "--"
, "-- @since 0.3.1"
, "data BlockDefinition = BlockDefinition"
, " { blockRange :: !(Int, Int) -- ^ Range"
, " , blockName :: !String -- ^ Name"
, " } deriving (Eq, Ord, Show)"
, ""
, "-- | Block definition"
, "--"
, "-- Undefined for values greater than " <> BB.wordDec (pred count) <> "."
, "--"
, "-- Returned value:"
, "--"
, "-- * Lower bound"
, "-- * Upper bound"
, "-- * Name (null terminated ASCII string)"
, "--"
, "-- @since 0.3.1"
, "blockDefinition :: Block -> BlockDefinition"
, "blockDefinition b = case b of"
, "blockDefinition :: Int# -> (# Int#, Int#, Addr# #)"
, "blockDefinition = \\case"
, mconcat (reverse defs)
, "-- | Character block, if defined."
, "-- | Character block, if defined, else -1."
, "--"
, "-- @since 0.3.1"
, "block :: Char -> Maybe Int"
, "block (C# c#) = getBlock 0# " <> BB.intDec (length ranges - 1) <> BB.char7 '#'
, "block :: Char# -> Int#"
, "block c# = getBlock 0# " <> BB.intDec (length ranges - 1) <> BB.char7 '#'
, " where"
, " -- [NOTE] Encoding"
, " -- A range is encoded as two LE Word32:"
Expand All @@ -83,7 +85,7 @@ genBlocksModule moduleName = Fold step initial done
, ""
, " -- Binary search"
, " getBlock l# u# = if isTrue# (l# ># u#)"
, " then Nothing"
, " then -1#"
, " else"
, " let k# = l# +# uncheckedIShiftRL# (u# -# l#) 1#"
, " j# = k# `uncheckedIShiftL#` 1#"
Expand All @@ -99,7 +101,7 @@ genBlocksModule moduleName = Fold step initial done
, " then getBlock l# (k# -# 1#)"
, " -- cp in block: get block index"
, " else let block# = cpL0# `uncheckedShiftRL#` 21#"
, " in Just (I# (word2Int# block#))"
, " in word2Int# block#"
, ""
, " getRawCodePoint# = lookupWord32# ranges#"
, ""
Expand All @@ -111,7 +113,7 @@ genBlocksModule moduleName = Fold step initial done
, " \"" <> enumMapToAddrLiteral 4 0xff (mkRanges ranges') "\"#"
]

initial = Acc mempty mempty mempty
initial = Acc 0 mempty mempty mempty

step Acc{..} (Prop.Entry range blockName) = case range of
U.SingleChar c -> error ("genBlocksModule: expected range, got: " <> show c)
Expand All @@ -120,8 +122,9 @@ genBlocksModule moduleName = Fold step initial done
blockRange = (ord start, ord end)
blockName' = BB.shortByteString blockName
in Acc
{ blocks = mkBlockConstructor blockID blockName' blockRange : blocks
, defs = mkBlockDef blockID blockName' blockRange : defs
{ count = succ count
, blocks = mkBlockConstructor blockID blockName' blockRange : blocks
, defs = mkBlockDef count blockName' blockRange : defs
, ranges = blockRange : ranges }

mkBlockConstructor blockID blockName (l, u) = mconcat
Expand All @@ -135,16 +138,12 @@ genBlocksModule moduleName = Fold step initial done
, "."
]

mkBlockDef blockID blockName (l, u) = mconcat
mkBlockDef blockIndex blockName (l, u) = mconcat
[ " "
, blockID
, " -> BlockDefinition (0x"
, showPaddedHexB l
, ", 0x"
, showPaddedHexB u
, ") \""
, blockName
, "\"\n"
, if u == ord maxBound then "_ " else BB.wordDec blockIndex <> "#"
, " -> (# 0x", showPaddedHexB l, "#, 0x", showPaddedHexB u, "#, \""
, blockName -- NOTE: name is ASCII
, "\\0\"# #)\n"
]

-- [NOTE] Encoding: a range is encoded as two LE Word32:
Expand Down
38 changes: 34 additions & 4 deletions unicode-data/lib/Unicode/Char/General/Blocks.hs
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
{-# LANGUAGE PatternSynonyms #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE DeriveGeneric #-}

-- |
-- Module : Unicode.Char.General.Blocks
-- Copyright : (c) 2020 Composewell Technologies and Contributors
Expand All @@ -10,19 +14,45 @@
-- @since 0.3.1

module Unicode.Char.General.Blocks
( B.Block(..)
, B.BlockDefinition(..)
( -- * Blocks
B.Block(..)
, block
, B.blockDefinition
-- * Blocks definitions
, BlockDefinition(..)
, blockDefinition
)

where

import GHC.Exts (Char (..), Int (..), dataToTag#, tagToEnum#)

import Unicode.Internal.Bits (unpackCString#)
import qualified Unicode.Internal.Char.Blocks as B

-- | Character [block](https://www.unicode.org/glossary/#block), if defined.
--
-- @since 0.3.1
{-# INLINE block #-}
block :: Char -> Maybe B.Block
block = fmap toEnum . B.block
block (C# c#) = case B.block c# of
-1# -> Nothing
b# -> Just (tagToEnum# b# :: B.Block)

-- | Block definition: range and name.
--
-- @since 0.3.1
data BlockDefinition = BlockDefinition
{ blockRange :: !(Int, Int) -- ^ Range
, blockName :: !String -- ^ Name
} deriving (Eq, Ord, Show)

-- | Block definition
--
-- @since 0.3.1
blockDefinition :: B.Block -> BlockDefinition
blockDefinition b = case B.blockDefinition (dataToTag# b) of
(# lower#, upper#, name# #) -> BlockDefinition range name
where
!range = (I# lower#, I# upper#)
-- Note: names are ASCII. See Unicode Standard 15.0.0, section 3.4.
!name = unpackCString# name#
13 changes: 11 additions & 2 deletions unicode-data/lib/Unicode/Internal/Bits.hs
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,13 @@
-- Fast, static bitmap lookup utilities

module Unicode.Internal.Bits
( lookupBit64,
( -- * Bitmap lookup
lookupBit64,
lookupWord8AsInt,
lookupWord16AsInt,
lookupWord32#
lookupWord32#,
-- * CString
unpackCString#
) where

#include "MachDeps.h"
Expand All @@ -36,6 +39,12 @@ import GHC.Exts
byteSwap16#, byteSwap32#)
#endif

#if MIN_VERSION_base(4,15,0)
import GHC.Exts (unpackCString#)
#else
import GHC.CString (unpackCString#)
#endif

-- | @lookup64 addr index@ looks up the bit stored at bit index @index@ using a
-- bitmap starting at the address @addr@. Looks up the 64-bit word containing
-- the bit and then the bit in that word. The caller must make sure that the
Expand Down
Loading

0 comments on commit 093f2d6

Please sign in to comment.