diff --git a/Data/Text/Normalize.hs b/Data/Text/Normalize.hs index aa09564..047878b 100644 --- a/Data/Text/Normalize.hs +++ b/Data/Text/Normalize.hs @@ -16,7 +16,6 @@ module Data.Text.Normalize NormalizationMode(..) -- * Normalization API , normalize - , normalizeQC ) where import Data.Text (Text) @@ -25,9 +24,8 @@ import Data.Unicode.Types (NormalizationMode(..)) -- Internal modules import Data.Unicode.Internal.NormalizeStream ( DecomposeMode(..) - , normalizeQC , stream - , unstream + , unstreamD , unstreamC ) @@ -36,7 +34,7 @@ import Data.Unicode.Internal.NormalizeStream normalize :: NormalizationMode -> Text -> Text normalize mode = case mode of - NFD -> (unstream Canonical) . stream - NFKD -> (unstream Kompat) . stream - NFC -> (unstreamC Canonical) . stream - NFKC -> (unstreamC Kompat) . stream + NFD -> unstreamD Canonical . stream + NFKD -> unstreamD Kompat . stream + NFC -> unstreamC Canonical . stream + NFKC -> unstreamC Kompat . stream diff --git a/Data/Unicode/Internal/NormalizeStream.hs b/Data/Unicode/Internal/NormalizeStream.hs index 38d54e1..9497d8b 100644 --- a/Data/Unicode/Internal/NormalizeStream.hs +++ b/Data/Unicode/Internal/NormalizeStream.hs @@ -17,23 +17,19 @@ module Data.Unicode.Internal.NormalizeStream ( UC.DecomposeMode(..) - , normalizeQC , stream - , unstream + , unstreamD , unstreamC ) where import Data.Char (chr, ord) -import Data.Unicode.Types (NormalizationMode(..)) import GHC.ST (ST(..)) import GHC.Types (SPEC(..)) -import Unicode.Char.Normalization - ( QuickCheck(..) - , isNFD - , isNFKD - , isNFC - , isNFKC +import Unicode.Char.Normalization (isNFD, isNFKD) +import Unicode.Internal.Char.DerivedNormalizationProperties + ( isNFC_QC + , isNFKC_QC ) import qualified Unicode.Char as UC @@ -56,57 +52,73 @@ import Data.Text.Internal.Private (runText) import Data.Text.Internal.Unsafe.Char (unsafeWrite) ------------------------------------------------------------------------------- --- Reorder buffer to hold characters till the next starter boundary +-- Buffer to hold sorted combining characters till the next starter boundary ------------------------------------------------------------------------------- -- | A list of combining characters, ordered by 'UC.combiningClass'. -- Couple of top levels are unrolled and unpacked for efficiency. -data ReBuf = Empty | One !Char | Many !Char !Char ![Char] +data ReBuf + = Empty + | One !Char + | Many !Char !Char ![Char] +-- | Insert a combining character in its canonical position in a buffer. {-# INLINE insertIntoReBuf #-} insertIntoReBuf :: Char -> ReBuf -> ReBuf -insertIntoReBuf c Empty = One c -insertIntoReBuf c (One c0) - | UC.combiningClass c < UC.combiningClass c0 - = Many c c0 [] - | otherwise - = Many c0 c [] -insertIntoReBuf c (Many c0 c1 cs) - | cc < UC.combiningClass c0 - = Many c c0 (c1 : cs) - | cc < UC.combiningClass c1 - = Many c0 c (c1 : cs) - | otherwise - = Many c0 c1 (cs' ++ (c : cs'')) - where - cc = UC.combiningClass c - (cs', cs'') = span ((<= cc) . UC.combiningClass) cs - -writeStr :: A.MArray s -> Int -> [Char] -> ST s Int +insertIntoReBuf c = \case + Empty -> One c + One c0 + | UC.combiningClass c < UC.combiningClass c0 + -> Many c c0 [] + | otherwise + -> Many c0 c [] + Many c0 c1 cs + | cc < UC.combiningClass c0 + -> Many c c0 (c1 : cs) + | cc < UC.combiningClass c1 + -> Many c0 c (c1 : cs) + | otherwise + -> Many c0 c1 (cs' ++ (c : cs'')) + where + cc = UC.combiningClass c + (cs', cs'') = span ((<= cc) . UC.combiningClass) cs + +-- | Write a string in an array. +writeStr + :: A.MArray s + -> Int -- ^ Array offset + -> String -- ^ String to write + -> ST s Int writeStr marr di str = go di str where - go i [] = return i + go i [] = pure i go i (c : cs) = do n <- unsafeWrite marr i c go (i + n) cs +-- | Write combining character buffer in an array. {-# INLINE writeReorderBuffer #-} -writeReorderBuffer :: A.MArray s -> Int -> ReBuf -> ST s Int -writeReorderBuffer _ di Empty = return di - -writeReorderBuffer marr di (One c) = do - n <- unsafeWrite marr di c - return (di + n) - -writeReorderBuffer marr di (Many c1 c2 str) = do - n1 <- unsafeWrite marr di c1 - n2 <- unsafeWrite marr (di + n1) c2 - writeStr marr (di + n1 + n2) str +writeReorderBuffer + :: A.MArray s + -> Int -- ^ Array offset + -> ReBuf -- ^ Sorted combining characters + -> ST s Int +writeReorderBuffer marr di = \case + Empty -> pure di + One c -> do + n <- unsafeWrite marr di c + pure (di + n) + Many c1 c2 str -> do + n1 <- unsafeWrite marr di c1 + n2 <- unsafeWrite marr (di + n1) c2 + writeStr marr (di + n1 + n2) str ------------------------------------------------------------------------------- -- Decomposition of Hangul characters is done algorithmically ------------------------------------------------------------------------------- +-- | Decompose and write Hangul character. +-- Decomposition of Hangul characters is done algorithmically. -- {-# INLINE decomposeCharHangul #-} decomposeCharHangul :: A.MArray s -> Int -> Char -> ST s Int decomposeCharHangul marr j c = @@ -122,42 +134,64 @@ decomposeCharHangul marr j c = where (l, v, t) = UC.decomposeHangul c +-- | Decompose a character and write the result in a target array. {-# INLINE decomposeChar #-} decomposeChar - :: UC.DecomposeMode - -> A.MArray s -- destination array for decomposition - -> Int -- array index - -> ReBuf -- reorder buffer - -> Char -- char to be decomposed + :: UC.DecomposeMode -- ^ Decomposition mode: Canonical or Kompat + -> A.MArray s -- ^ Destination array for decomposition + -> Int -- ^ Array index + -> ReBuf -- ^ Reorder buffer + -> Char -- ^ Char to be decomposed -> ST s (Int, ReBuf) decomposeChar mode marr index reBuf ch + -- ASCII + C1 control codes: we know it’s always allowed and a starter + | ch <= '\x9f' = do + writeStarter index reBuf ch + -- Quick check + | quickCheck = + reorder index reBuf ch + -- Hangul | UC.isHangul ch = do j <- writeReorderBuffer marr index reBuf (, Empty) <$> decomposeCharHangul marr j ch - | UC.isDecomposable mode ch = - decomposeAll marr index reBuf (UC.decompose mode ch) + -- Decomposable | otherwise = - reorder marr index reBuf ch + decomposeAll index reBuf (UC.decompose mode ch) where - {-# INLINE decomposeAll #-} - decomposeAll _ i rbuf [] = return (i, rbuf) - decomposeAll arr i rbuf (x : xs) - | UC.isDecomposable mode x = do - (i', rbuf') <- decomposeAll arr i rbuf (UC.decompose mode x) - decomposeAll arr i' rbuf' xs - | otherwise = do - (i', rbuf') <- reorder arr i rbuf x - decomposeAll arr i' rbuf' xs + quickCheck = case mode of + UC.Canonical -> isNFD ch + UC.Kompat -> isNFKD ch + -- Decompose a character recursively and write the result in the array + {-# INLINE decomposeAll #-} + decomposeAll i rbuf = \case + [] -> pure (i, rbuf) + x:xs + | UC.isDecomposable mode x -> do + (i', rbuf') <- decomposeAll i rbuf (UC.decompose mode x) + decomposeAll i' rbuf' xs + | otherwise -> do + (i', rbuf') <- reorder i rbuf x + decomposeAll i' rbuf' xs + + -- Given a normalized char, buffer it if combining else write it {-# INLINE reorder #-} - reorder arr i rbuf c - | UC.isCombining c = return (i, insertIntoReBuf c rbuf) - | otherwise = do - j <- writeReorderBuffer arr i rbuf - n <- unsafeWrite arr j c - return (j + n, Empty) + reorder i rbuf c + | UC.isCombining c = pure (i, insertIntoReBuf c rbuf) + | otherwise = writeStarter i rbuf c + + -- Write a normalized char + {-# INLINE writeStarter #-} + writeStarter i rbuf c = do + j <- writeReorderBuffer marr i rbuf + n <- unsafeWrite marr j c + pure (j + n, Empty) + +------------------------------------------------------------------------------- +-- Stream text +------------------------------------------------------------------------------- #if !MIN_VERSION_text(2,0,0) -- | /O(n)/ Convert a 'Text' into a 'Stream Char'. @@ -178,65 +212,29 @@ stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len) {-# INLINE [0] stream #-} #endif --- | Perform Unicode normalization on @Text@ according to the specified --- normalization mode, using the quick check algorithm to speed up case when --- the text is already normalized. -normalizeQC - :: NormalizationMode - -> Text - -> Text -normalizeQC mode t = case stream t of - str@(Stream next0 s0 _len) -> - let quickCheck s cc r = case next0 s of - Done -> r - Skip s' -> quickCheck s' cc r - Yield c s' -> - -- [TODO] the ASCII check speeds up latin scripts, - -- but it slows down the other scripts. - if c <= '\x7f' - -- For ASCII we know it’s always allowed and a starter - then quickCheck s' 0 r - -- Otherwise, lookup the combining class and QC property - else let cc' = UC.combiningClass c in - if cc > cc' && cc' /= 0 - then No - else case check c of - No -> No - Maybe -> quickCheck s' cc' Maybe - Yes -> quickCheck s' cc' r - -- let cc' = UC.combiningClass c - -- in if cc > cc' && cc' /= 0 - -- then No - -- else case check c of - -- No -> No - -- Maybe -> quickCheck s' cc' Maybe - -- Yes -> quickCheck s' cc' r - check = case mode of - NFD -> isNFD - NFKD -> isNFKD - NFC -> isNFC - NFKC -> isNFKC - in case quickCheck s0 0 Yes of - Yes -> t - _ -> case mode of - NFD -> unstream UC.Canonical str - NFKD -> unstream UC.Kompat str - NFC -> unstreamC UC.Canonical str - NFKC -> unstreamC UC.Kompat str +------------------------------------------------------------------------------- +-- Normalization Forms NFD & NFKD +------------------------------------------------------------------------------- + +-- [TODO] use SPEC?? -- | /O(n)/ Convert a 'Stream Char' into a decompose-normalized 'Text'. -unstream :: UC.DecomposeMode -> Stream Char -> Text -unstream mode (Stream next0 s0 len) = runText $ \done -> do +unstreamD :: UC.DecomposeMode -> Stream Char -> Text +unstreamD mode (Stream next0 s0 len) = runText $ \done -> do -- Before encoding each char we perform a buffer realloc check assuming -- worst case encoding size of two 16-bit units for the char. Just add an -- extra space to the buffer so that we do not end up reallocating even when -- all the chars are encoded as single unit. let margin = 1 + maxDecomposeLen - mlen = (upperBound 4 len + margin) + mlen = upperBound 4 len + margin arr0 <- A.new mlen let outer !arr !maxi = encode where - -- keep the common case loop as small as possible + -- si: internal state + -- di: current char index + -- rbuf: current ordered buffer of combining characters + + -- Keep the common case loop as small as possible encode !si !di rbuf = -- simply check for the worst case if maxi < di + margin @@ -246,14 +244,14 @@ unstream mode (Stream next0 s0 len) = runText $ \done -> do Done -> do di' <- writeReorderBuffer arr di rbuf done arr di' - Skip si' -> encode si' di rbuf + Skip si' -> encode si' di rbuf Yield c si' -> do - (di', rbuf') <- decomposeChar mode arr di rbuf c - encode si' di' rbuf' - -- n <- unsafeWrite arr di c - -- encode si' (di + n) rbuf + (di', rbuf') <- decomposeChar mode arr di rbuf c + encode si' di' rbuf' + -- n <- unsafeWrite arr di c + -- encode si' (di + n) rbuf - -- keep uncommon case separate from the common case code + -- Keep uncommon case separate from the common case code {-# NOINLINE realloc #-} realloc !si !di rbuf = do let newlen = maxi * 2 @@ -262,14 +260,15 @@ unstream mode (Stream next0 s0 len) = runText $ \done -> do outer arr' (newlen - 1) si di rbuf outer arr0 (mlen - 1) s0 0 Empty -{-# INLINE [0] unstream #-} +{-# INLINE [0] unstreamD #-} --- we can generate this from UCD +-- [TODO] we can generate this from UCD +-- [TODO] is https://www.unicode.org/reports/tr15/#Stream_Safe_Text_Format the source for this? maxDecomposeLen :: Int maxDecomposeLen = 32 ------------------------------------------------------------------------------- --- Composition +-- Normalization Forms NFC & NFKC ------------------------------------------------------------------------------- -- If we are composing we do not need to first decompose Hangul. We can just @@ -277,24 +276,34 @@ maxDecomposeLen = 32 -- syllable followed by a jamo T. We need to compose this case as well. -- Hold an L to wait for V, hold an LV to wait for T. +-- | Jamo buffer data JamoBuf - = Jamo !Char -- Jamo L, V or T - | Hangul !Char -- Hangul Syllable LV or LVT + = Jamo !Char + -- ^ Conjoining jamo L, V or T + | Hangul !Char + -- ^ Hangul syllable LV or LVT | HangulLV !Char + -- ^ Hangul syllable LV +-- | Generic Composition buffer data RegBuf = RegOne !Char | RegMany !Char !Char ![Char] +-- | Composition state data ComposeState = ComposeNone + -- ^ None | ComposeReg !RegBuf + -- ^ Generic buffer | ComposeJamo !JamoBuf + -- ^ Conjoining jamo buffer ------------------------------------------------------------------------------- -- Composition of Jamo into Hangul syllables, done algorithmically ------------------------------------------------------------------------------- +-- | Write a jamo buffer to a target array {-# INLINE writeJamoBuf #-} writeJamoBuf :: A.MArray s -> Int -> JamoBuf -> ST s Int writeJamoBuf arr i jbuf = do @@ -303,27 +312,34 @@ writeJamoBuf arr i jbuf = do where - getCh (Jamo ch) = ch - getCh (Hangul ch) = ch - getCh (HangulLV ch) = ch + getCh = \case + Jamo ch -> ch + Hangul ch -> ch + HangulLV ch -> ch +-- | Initialize Jamo buffer with a precomposed Hangul character {-# INLINE initHangul #-} initHangul :: Char -> Int -> ST s (Int, ComposeState) initHangul c i = return (i, ComposeJamo (Hangul c)) +-- | Initialize Jamo buffer with conjoining jamo {-# INLINE initJamo #-} initJamo :: Char -> Int -> ST s (Int, ComposeState) initJamo c i = return (i, ComposeJamo (Jamo c)) +-- | Insert a conjoining jamo in the buffer {-# INLINE insertJamo #-} insertJamo :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState) insertJamo arr i jbuf ch + -- Leading consonant (L): write buffer and start new one | ich <= UC.jamoLLast = do j <- writeJamoBuf arr i jbuf return (j, ComposeJamo (Jamo ch)) + -- Jamo that is neither L, V nor T | ich < UC.jamoVFirst = flushAndWrite arr i jbuf ch + -- Vowel (V): | ich <= UC.jamoVLast = do case jbuf of Jamo c -> @@ -334,10 +350,12 @@ insertJamo arr i jbuf ch lv = chr (UC.hangulFirst + lvi) in return (i, ComposeJamo (HangulLV lv)) Nothing -> writeTwo arr i c ch - Hangul c -> writeTwo arr i c ch + Hangul c -> writeTwo arr i c ch HangulLV c -> writeTwo arr i c ch + -- Jamo that is neither L, V nor T | ich <= UC.jamoTFirst = do flushAndWrite arr i jbuf ch + -- Trailing consonant (T) | otherwise = do let ti = ich - UC.jamoTFirst case jbuf of @@ -354,23 +372,27 @@ insertJamo arr i jbuf ch ich = ord ch + -- Write buffer and the following character, and start an empty buffer. {-# INLINE flushAndWrite #-} flushAndWrite marr ix jb c = do j <- writeJamoBuf marr ix jb n <- unsafeWrite marr j c return (j + n, ComposeNone) + -- Write LVT syllable and start an empty buffer {-# INLINE writeLVT #-} writeLVT marr ix lv ti = do n <- unsafeWrite marr ix (chr ((ord lv) + ti)) return (ix + n, ComposeNone) + -- Write two chars and start an empty buffer {-# INLINE writeTwo #-} writeTwo marr ix c1 c2 = do n <- unsafeWrite marr ix c1 m <- unsafeWrite marr (ix + n) c2 return ((ix + n + m), ComposeNone) +-- | Write buffer and start a new one with a pre-composed Hangul syllable. {-# INLINE insertHangul #-} insertHangul :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState) @@ -378,24 +400,27 @@ insertHangul arr i jbuf ch = do j <- writeJamoBuf arr i jbuf return (j, ComposeJamo (Hangul ch)) +-- | Insert a combining character in its canonical position in a buffer. {-# INLINE insertIntoRegBuf #-} insertIntoRegBuf :: Char -> RegBuf -> RegBuf -insertIntoRegBuf c (RegOne c0) - | UC.combiningClass c < UC.combiningClass c0 - = RegMany c c0 [] - | otherwise - = RegMany c0 c [] -insertIntoRegBuf c (RegMany c0 c1 cs) - | cc < UC.combiningClass c0 - = RegMany c c0 (c1 : cs) - | cc < UC.combiningClass c1 - = RegMany c0 c (c1 : cs) - | otherwise - = RegMany c0 c1 (cs' ++ (c : cs'')) - where - cc = UC.combiningClass c - (cs', cs'') = span ((<= cc) . UC.combiningClass) cs +insertIntoRegBuf c = \case + RegOne c0 + | UC.combiningClass c < UC.combiningClass c0 + -> RegMany c c0 [] + | otherwise + -> RegMany c0 c [] + RegMany c0 c1 cs + | cc < UC.combiningClass c0 + -> RegMany c c0 (c1 : cs) + | cc < UC.combiningClass c1 + -> RegMany c0 c (c1 : cs) + | otherwise + -> RegMany c0 c1 (cs' ++ (c : cs'')) + where + cc = UC.combiningClass c + (cs', cs'') = span ((<= cc) . UC.combiningClass) cs +-- | Write generic compose buffer to a target array {-# INLINE writeRegBuf #-} writeRegBuf :: A.MArray s -> Int -> RegBuf -> ST s Int writeRegBuf arr i = \case @@ -424,6 +449,7 @@ writeRegBuf arr i = \case cc = UC.combiningClass c (same, bigger) = span ((== cc) . UC.combiningClass) cs +-- | Finalize composition {-# INLINE flushComposeState #-} flushComposeState :: A.MArray s -> Int -> ComposeState -> ST s Int flushComposeState arr i = \case @@ -431,18 +457,28 @@ flushComposeState arr i = \case ComposeReg rbuf -> writeRegBuf arr i rbuf ComposeJamo jbuf -> writeJamoBuf arr i jbuf +-- | Compose characters if possible and write the result in a target array. {-# INLINE composeChar #-} composeChar :: UC.DecomposeMode - -> A.MArray s -- destination array for composition - -> Char -- input char - -> Int -- array index - -> ComposeState + -> A.MArray s -- ^ Destination array for composition + -> Char -- ^ Input char + -> Int -- ^ Array index + -> ComposeState -- ^ Compose state -> ST s (Int, ComposeState) composeChar mode marr = go0 where + quickCheck = case mode of + UC.Canonical -> isNFC_QC + UC.Kompat -> isNFKC_QC + + -- ch: input char + -- i: array index + -- st: compose state + + -- Start normalization with initial compose state go0 ch !i !st = case st of ComposeReg rbuf @@ -482,34 +518,61 @@ composeChar mode marr = go0 initReg ch i where ich = ord ch + -- Write jamo buffer and initialize new buffer {-# INLINE jamoToReg #-} jamoToReg arr i jbuf ch = do j <- writeJamoBuf arr i jbuf initReg ch j + -- Start new normalization with empty previous buffer {-# INLINE initReg #-} initReg !ch !i + -- [TODO] Check if quick check is needed here. Probably not. | UC.isDecomposable mode ch = go (UC.decompose mode ch) i ComposeNone | otherwise = pure (i, ComposeReg (RegOne ch)) + -- Start normalization with a non-empty generic buffer {-# INLINE composeReg #-} - composeReg rbuf !ch !i !st - | UC.isDecomposable mode ch = - go (UC.decompose mode ch) i st - | UC.isCombining ch = do - pure (i, ComposeReg (insertIntoRegBuf ch rbuf)) - -- The first char in RegBuf may or may not be a starter. In - -- case it is not we rely on composeStarters failing. - | RegOne s <- rbuf - , UC.isCombiningStarter ch - , Just x <- UC.composeStarters s ch = - pure (i, (ComposeReg (RegOne x))) - | otherwise = do - j <- writeRegBuf marr i rbuf - pure (j, ComposeReg (RegOne ch)) - + -- composeReg rbuf !ch !i !st + -- | UC.isDecomposable mode ch = + -- go (UC.decompose mode ch) i st + -- | UC.isCombining ch = do + -- pure (i, ComposeReg (insertIntoRegBuf ch rbuf)) + -- -- The first char in RegBuf may or may not be a starter. In + -- -- case it is not we rely on composeStarters failing. + -- | RegOne s <- rbuf + -- , UC.isCombiningStarter ch + -- , Just x <- UC.composeStarters s ch = + -- pure (i, (ComposeReg (RegOne x))) + -- | otherwise = do + -- j <- writeRegBuf marr i rbuf + -- pure (j, ComposeReg (RegOne ch)) + composeReg rbuf !ch !i !st = case quickCheck ch of + -- QC Yes: Non decomposable, never compose with previous, starter + 3 -> do + n <- writeRegBuf marr i rbuf + initReg ch n + -- QC Yes: Non decomposable, never compose with previous, combining + 2 -> pure (i, ComposeReg (insertIntoRegBuf ch rbuf)) + -- QC No: Decomposable + 0 -> go (UC.decompose mode ch) i st + -- QC Maybe: Non decomposable, may compose + _ -- Expected: 1 + | UC.isCombining ch -> + pure (i, ComposeReg (insertIntoRegBuf ch rbuf)) + -- The first char in RegBuf may or may not be a starter. In + -- case it is not we rely on composeStarters failing. + | RegOne s <- rbuf + , UC.isCombiningStarter ch + , Just x <- UC.composeStarters s ch -> + pure (i, (ComposeReg (RegOne x))) + | otherwise -> do + j <- writeRegBuf marr i rbuf + pure (j, ComposeReg (RegOne ch)) + + -- Recursively decompose, then compose if possible go [] !i !st = pure (i, st) go (ch : rest) i st = case st of @@ -564,37 +627,42 @@ composeChar mode marr = go0 -- | /O(n)/ Convert a 'Stream Char' into a composed normalized 'Text'. unstreamC :: UC.DecomposeMode -> Stream Char -> Text unstreamC mode (Stream next0 s0 len) = runText $ \done -> do - -- Before encoding each char we perform a buffer realloc check assuming - -- worst case encoding size of two 16-bit units for the char. Just add an - -- extra space to the buffer so that we do not end up reallocating even when - -- all the chars are encoded as single unit. - let margin = 1 + maxDecomposeLen - mlen = (upperBound 4 len + margin) - arr0 <- A.new mlen - let outer !arr !maxi = encode SPEC - where - -- keep the common case loop as small as possible - encode !_ !si !di st = - -- simply check for the worst case - if maxi < di + margin - then realloc si di st - else - case next0 si of - Done -> do - di' <- flushComposeState arr di st - done arr di' - Skip si' -> encode SPEC si' di st - Yield c si' -> do - (di', st') <- composeChar mode arr c di st - encode SPEC si' di' st' - - -- keep uncommon case separate from the common case code - {-# NOINLINE realloc #-} - realloc !si !di st = do - let newlen = maxi * 2 - arr' <- A.new newlen - A.copyM arr' 0 arr 0 di - outer arr' (newlen - 1) si di st - - outer arr0 (mlen - 1) s0 0 ComposeNone + -- Before encoding each char we perform a buffer realloc check assuming + -- worst case encoding size of two 16-bit units for the char. Just add an + -- extra space to the buffer so that we do not end up reallocating even when + -- all the chars are encoded as single unit. + let margin = 1 + maxDecomposeLen + mlen = upperBound 4 len + margin + arr0 <- A.new mlen + let outer !arr !maxi = encode SPEC + where + -- si: internal state + -- di: current char index [TODO] check this + -- [FIXME] cc: combining class of the previous character + -- st: compose state + + -- keep the common case loop as small as possible + encode !_ !si !di st = + -- simply check for the worst case + if maxi < di + margin + then realloc si di st + else + case next0 si of + Done -> do + di' <- flushComposeState arr di st + done arr di' + Skip si' -> encode SPEC si' di st + Yield c si' -> do + (di', st') <- composeChar mode arr c di st + encode SPEC si' di' st' + + -- keep uncommon case separate from the common case code + {-# NOINLINE realloc #-} + realloc !si !di st = do + let newlen = maxi * 2 + arr' <- A.new newlen + A.copyM arr' 0 arr 0 di + outer arr' (newlen - 1) si di st + + outer arr0 (mlen - 1) s0 0 ComposeNone {-# INLINE [0] unstreamC #-} diff --git a/benchmark/Benchmark.hs b/benchmark/Benchmark.hs index 7235763..2f1745c 100644 --- a/benchmark/Benchmark.hs +++ b/benchmark/Benchmark.hs @@ -20,9 +20,9 @@ import Path.IO (listDir) import System.FilePath (dropExtensions, takeFileName) import Gauge.Main (Benchmark, bench, bgroup, defaultMain, env, nf) -#ifdef USE_TASTY -import Gauge.Main (bcompare) -#endif +-- #ifdef USE_TASTY +-- import Gauge.Main (bcompare) +-- #endif import qualified Data.Text as T import qualified Data.Text.Normalize as UTText @@ -47,14 +47,6 @@ unicodeTransformTextFuncs = , ("NFKC", UTText.normalize UTText.NFKC) ] -unicodeTransformTextFuncsQuickCheck :: [(String, Text -> Text)] -unicodeTransformTextFuncsQuickCheck = - [ ("NFD", UTText.normalizeQC UTText.NFD) - , ("NFKD", UTText.normalizeQC UTText.NFKD) - , ("NFC", UTText.normalizeQC UTText.NFC) - , ("NFKC", UTText.normalizeQC UTText.NFKC) - ] - dataDir :: Path Rel Dir dataDir = $(mkRelDir "benchmark") $(mkRelDir "data") @@ -74,15 +66,15 @@ makeTestName implName dataName = implName ++ "/" ++ dataName makeBenchRef :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark makeBenchRef impl (dataName, setup) = env setup (makeBench impl dataName) -makeBenchComp :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark -#ifdef USE_TASTY -makeBenchComp impl (dataName, setup) = env setup - ( bcompare ("$NF == \"" <> (makeTestName (fst impl) dataName) - <> "\" && $(NF-1) == \"unicode-transforms-text\"") - . makeBench impl dataName) -#else -makeBenchComp = makeBenchRef -#endif +-- makeBenchComp :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark +-- #ifdef USE_TASTY +-- makeBenchComp impl (dataName, setup) = env setup +-- ( bcompare ("$NF == \"" <> (makeTestName (fst impl) dataName) +-- <> "\" && $(NF-1) == \"unicode-transforms-text\"") +-- . makeBench impl dataName) +-- #else +-- makeBenchComp = makeBenchRef +-- #endif strInput :: FilePath -> (String, IO String) strInput file = (dataName file, @@ -106,7 +98,4 @@ main = do bgroup "unicode-transforms-text" $ makeBenchRef <$> unicodeTransformTextFuncs <*> (map txtInput dataFiles) - , bgroup "unicode-transforms-text (QC)" - $ makeBenchComp <$> unicodeTransformTextFuncsQuickCheck - <*> (map txtInput dataFiles) ] diff --git a/test/NormalizationTest.hs b/test/NormalizationTest.hs index 452dd15..9ff69d9 100644 --- a/test/NormalizationTest.hs +++ b/test/NormalizationTest.hs @@ -22,9 +22,9 @@ import Data.List.Split (splitOn) import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Encoding as T -import Data.Text.Normalize +import Data.Text.Normalize ( NormalizationMode(NFD, NFKD, NFC, NFKC) - , normalize, normalizeQC) + , normalize) import Text.Printf (printf) #if !MIN_VERSION_base(4,8,0) @@ -52,8 +52,7 @@ checkEqual opName op (c1, c2) = checkOp :: String -> NormalizationMode -> [(Text, Text)] -> IO Bool checkOp name op pairs = do res1 <- mapM (checkEqual name ((normalize op))) pairs - res2 <- mapM (checkEqual name ((normalizeQC op))) pairs - return $ all (== True) res1 && all (== True) res2 + return $ all (== True) res1 checkNFC :: (Text, Text, Text, Text, Text) -> IO Bool checkNFC (c1, c2, c3, c4, c5) = diff --git a/unicode-transforms.cabal b/unicode-transforms.cabal index fe57109..cbe6a4a 100644 --- a/unicode-transforms.cabal +++ b/unicode-transforms.cabal @@ -185,6 +185,7 @@ benchmark bench if flag(use-gauge) build-depends: gauge >=0.2.0 && <0.3 else + cpp-options: -DUSE_TASTY build-depends: tasty-bench>= 0.2.5 && <0.4 mixins: tasty-bench (Test.Tasty.Bench as Gauge.Main) if flag(dev)