From 74ef7df6633f43306b5ac4e0246fb5adedd8af56 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Sun, 19 Jun 2022 12:03:02 +0200 Subject: [PATCH] Reorganize code --- Data/Unicode/Internal/NormalizeStream.hs | 588 +++++++++++------------ 1 file changed, 292 insertions(+), 296 deletions(-) diff --git a/Data/Unicode/Internal/NormalizeStream.hs b/Data/Unicode/Internal/NormalizeStream.hs index 9497d8b..d04aa94 100644 --- a/Data/Unicode/Internal/NormalizeStream.hs +++ b/Data/Unicode/Internal/NormalizeStream.hs @@ -51,6 +51,29 @@ import Data.Text.Internal.Fusion.Types (Step(..), Stream(..)) import Data.Text.Internal.Private (runText) import Data.Text.Internal.Unsafe.Char (unsafeWrite) +------------------------------------------------------------------------------- +-- Stream text +------------------------------------------------------------------------------- + +#if !MIN_VERSION_text(2,0,0) +-- | /O(n)/ Convert a 'Text' into a 'Stream Char'. +stream :: Text -> Stream Char +stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len) + where + !end = off+len + {-# INLINE next #-} + next !i + | i >= end = Done + -- shift generates only two branches instead of three in case of + -- range check, works quite a bit faster with llvm backend. + | (n `shiftR` 10) == 0x36 = Yield (chr2 n n2) (i + 2) + | otherwise = Yield (unsafeChr n) (i + 1) + where + n = A.unsafeIndex arr i + n2 = A.unsafeIndex arr (i + 1) +{-# INLINE [0] stream #-} +#endif + ------------------------------------------------------------------------------- -- Buffer to hold sorted combining characters till the next starter boundary ------------------------------------------------------------------------------- @@ -114,25 +137,59 @@ writeReorderBuffer marr di = \case writeStr marr (di + n1 + n2) str ------------------------------------------------------------------------------- --- Decomposition of Hangul characters is done algorithmically +-- Normalization Forms NFD & NFKD ------------------------------------------------------------------------------- --- | Decompose and write Hangul character. --- Decomposition of Hangul characters is done algorithmically. --- {-# INLINE decomposeCharHangul #-} -decomposeCharHangul :: A.MArray s -> Int -> Char -> ST s Int -decomposeCharHangul marr j c = - if t == chr UC.jamoTFirst then do - n1 <- unsafeWrite marr j l - n2 <- unsafeWrite marr (j + n1) v - return (j + n1 + n2) - else do - n1 <- unsafeWrite marr j l - n2 <- unsafeWrite marr (j + n1) v - n3 <- unsafeWrite marr (j + n1 + n2) t - return (j + n1 + n2 + n3) - where - (l, v, t) = UC.decomposeHangul c +-- [TODO] use SPEC?? + +-- | /O(n)/ Convert a 'Stream Char' into a decompose-normalized 'Text'. +unstreamD :: UC.DecomposeMode -> Stream Char -> Text +unstreamD mode (Stream next0 s0 len) = runText $ \done -> do + -- Before encoding each char we perform a buffer realloc check assuming + -- worst case encoding size of two 16-bit units for the char. Just add an + -- extra space to the buffer so that we do not end up reallocating even when + -- all the chars are encoded as single unit. + let margin = 1 + maxDecomposeLen + mlen = upperBound 4 len + margin + arr0 <- A.new mlen + let outer !arr !maxi = encode + where + -- si: internal state + -- di: current char index + -- rbuf: current ordered buffer of combining characters + + -- Keep the common case loop as small as possible + encode !si !di rbuf = + -- simply check for the worst case + if maxi < di + margin + then realloc si di rbuf + else + case next0 si of + Done -> do + di' <- writeReorderBuffer arr di rbuf + done arr di' + Skip si' -> encode si' di rbuf + Yield c si' -> do + (di', rbuf') <- decomposeChar mode arr di rbuf c + encode si' di' rbuf' + -- n <- unsafeWrite arr di c + -- encode si' (di + n) rbuf + + -- Keep uncommon case separate from the common case code + {-# NOINLINE realloc #-} + realloc !si !di rbuf = do + let newlen = maxi * 2 + arr' <- A.new newlen + A.copyM arr' 0 arr 0 di + outer arr' (newlen - 1) si di rbuf + + outer arr0 (mlen - 1) s0 0 Empty +{-# INLINE [0] unstreamD #-} + +-- [TODO] we can generate this from UCD +-- [TODO] is https://www.unicode.org/reports/tr15/#Stream_Safe_Text_Format the source for this? +maxDecomposeLen :: Int +maxDecomposeLen = 32 -- | Decompose a character and write the result in a target array. {-# INLINE decomposeChar #-} @@ -189,107 +246,27 @@ decomposeChar mode marr index reBuf ch n <- unsafeWrite marr j c pure (j + n, Empty) -------------------------------------------------------------------------------- --- Stream text -------------------------------------------------------------------------------- - -#if !MIN_VERSION_text(2,0,0) --- | /O(n)/ Convert a 'Text' into a 'Stream Char'. -stream :: Text -> Stream Char -stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len) +-- | Decompose and write Hangul character. +-- Decomposition of Hangul characters is done algorithmically. +-- {-# INLINE decomposeCharHangul #-} +decomposeCharHangul :: A.MArray s -> Int -> Char -> ST s Int +decomposeCharHangul marr j c = + if t == chr UC.jamoTFirst then do + n1 <- unsafeWrite marr j l + n2 <- unsafeWrite marr (j + n1) v + return (j + n1 + n2) + else do + n1 <- unsafeWrite marr j l + n2 <- unsafeWrite marr (j + n1) v + n3 <- unsafeWrite marr (j + n1 + n2) t + return (j + n1 + n2 + n3) where - !end = off+len - {-# INLINE next #-} - next !i - | i >= end = Done - -- shift generates only two branches instead of three in case of - -- range check, works quite a bit faster with llvm backend. - | (n `shiftR` 10) == 0x36 = Yield (chr2 n n2) (i + 2) - | otherwise = Yield (unsafeChr n) (i + 1) - where - n = A.unsafeIndex arr i - n2 = A.unsafeIndex arr (i + 1) -{-# INLINE [0] stream #-} -#endif - -------------------------------------------------------------------------------- --- Normalization Forms NFD & NFKD -------------------------------------------------------------------------------- - --- [TODO] use SPEC?? - --- | /O(n)/ Convert a 'Stream Char' into a decompose-normalized 'Text'. -unstreamD :: UC.DecomposeMode -> Stream Char -> Text -unstreamD mode (Stream next0 s0 len) = runText $ \done -> do - -- Before encoding each char we perform a buffer realloc check assuming - -- worst case encoding size of two 16-bit units for the char. Just add an - -- extra space to the buffer so that we do not end up reallocating even when - -- all the chars are encoded as single unit. - let margin = 1 + maxDecomposeLen - mlen = upperBound 4 len + margin - arr0 <- A.new mlen - let outer !arr !maxi = encode - where - -- si: internal state - -- di: current char index - -- rbuf: current ordered buffer of combining characters - - -- Keep the common case loop as small as possible - encode !si !di rbuf = - -- simply check for the worst case - if maxi < di + margin - then realloc si di rbuf - else - case next0 si of - Done -> do - di' <- writeReorderBuffer arr di rbuf - done arr di' - Skip si' -> encode si' di rbuf - Yield c si' -> do - (di', rbuf') <- decomposeChar mode arr di rbuf c - encode si' di' rbuf' - -- n <- unsafeWrite arr di c - -- encode si' (di + n) rbuf - - -- Keep uncommon case separate from the common case code - {-# NOINLINE realloc #-} - realloc !si !di rbuf = do - let newlen = maxi * 2 - arr' <- A.new newlen - A.copyM arr' 0 arr 0 di - outer arr' (newlen - 1) si di rbuf - - outer arr0 (mlen - 1) s0 0 Empty -{-# INLINE [0] unstreamD #-} - --- [TODO] we can generate this from UCD --- [TODO] is https://www.unicode.org/reports/tr15/#Stream_Safe_Text_Format the source for this? -maxDecomposeLen :: Int -maxDecomposeLen = 32 + (l, v, t) = UC.decomposeHangul c ------------------------------------------------------------------------------- -- Normalization Forms NFC & NFKC ------------------------------------------------------------------------------- --- If we are composing we do not need to first decompose Hangul. We can just --- compose assuming there could be some partially composed syllables e.g. LV --- syllable followed by a jamo T. We need to compose this case as well. - --- Hold an L to wait for V, hold an LV to wait for T. --- | Jamo buffer -data JamoBuf - = Jamo !Char - -- ^ Conjoining jamo L, V or T - | Hangul !Char - -- ^ Hangul syllable LV or LVT - | HangulLV !Char - -- ^ Hangul syllable LV - --- | Generic Composition buffer -data RegBuf - = RegOne !Char - | RegMany !Char !Char ![Char] - -- | Composition state data ComposeState = ComposeNone @@ -299,155 +276,53 @@ data ComposeState | ComposeJamo !JamoBuf -- ^ Conjoining jamo buffer -------------------------------------------------------------------------------- --- Composition of Jamo into Hangul syllables, done algorithmically -------------------------------------------------------------------------------- - --- | Write a jamo buffer to a target array -{-# INLINE writeJamoBuf #-} -writeJamoBuf :: A.MArray s -> Int -> JamoBuf -> ST s Int -writeJamoBuf arr i jbuf = do - n <- unsafeWrite arr i (getCh jbuf) - return (i + n) - - where - - getCh = \case - Jamo ch -> ch - Hangul ch -> ch - HangulLV ch -> ch - --- | Initialize Jamo buffer with a precomposed Hangul character -{-# INLINE initHangul #-} -initHangul :: Char -> Int -> ST s (Int, ComposeState) -initHangul c i = return (i, ComposeJamo (Hangul c)) - --- | Initialize Jamo buffer with conjoining jamo -{-# INLINE initJamo #-} -initJamo :: Char -> Int -> ST s (Int, ComposeState) -initJamo c i = return (i, ComposeJamo (Jamo c)) - --- | Insert a conjoining jamo in the buffer -{-# INLINE insertJamo #-} -insertJamo - :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState) -insertJamo arr i jbuf ch - -- Leading consonant (L): write buffer and start new one - | ich <= UC.jamoLLast = do - j <- writeJamoBuf arr i jbuf - return (j, ComposeJamo (Jamo ch)) - -- Jamo that is neither L, V nor T - | ich < UC.jamoVFirst = - flushAndWrite arr i jbuf ch - -- Vowel (V): - | ich <= UC.jamoVLast = do - case jbuf of - Jamo c -> - case UC.jamoLIndex c of - Just li -> - let vi = ich - UC.jamoVFirst - lvi = li * UC.jamoNCount + vi * UC.jamoTCount - lv = chr (UC.hangulFirst + lvi) - in return (i, ComposeJamo (HangulLV lv)) - Nothing -> writeTwo arr i c ch - Hangul c -> writeTwo arr i c ch - HangulLV c -> writeTwo arr i c ch - -- Jamo that is neither L, V nor T - | ich <= UC.jamoTFirst = do - flushAndWrite arr i jbuf ch - -- Trailing consonant (T) - | otherwise = do - let ti = ich - UC.jamoTFirst - case jbuf of - Jamo c -> writeTwo arr i c ch - Hangul c - | UC.isHangulLV c -> do - writeLVT arr i c ti - | otherwise -> - writeTwo arr i c ch - HangulLV c -> - writeLVT arr i c ti - - where - - ich = ord ch - - -- Write buffer and the following character, and start an empty buffer. - {-# INLINE flushAndWrite #-} - flushAndWrite marr ix jb c = do - j <- writeJamoBuf marr ix jb - n <- unsafeWrite marr j c - return (j + n, ComposeNone) - - -- Write LVT syllable and start an empty buffer - {-# INLINE writeLVT #-} - writeLVT marr ix lv ti = do - n <- unsafeWrite marr ix (chr ((ord lv) + ti)) - return (ix + n, ComposeNone) - - -- Write two chars and start an empty buffer - {-# INLINE writeTwo #-} - writeTwo marr ix c1 c2 = do - n <- unsafeWrite marr ix c1 - m <- unsafeWrite marr (ix + n) c2 - return ((ix + n + m), ComposeNone) - --- | Write buffer and start a new one with a pre-composed Hangul syllable. -{-# INLINE insertHangul #-} -insertHangul - :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState) -insertHangul arr i jbuf ch = do - j <- writeJamoBuf arr i jbuf - return (j, ComposeJamo (Hangul ch)) +-- | Generic Composition buffer +data RegBuf + = RegOne !Char + | RegMany !Char !Char ![Char] --- | Insert a combining character in its canonical position in a buffer. -{-# INLINE insertIntoRegBuf #-} -insertIntoRegBuf :: Char -> RegBuf -> RegBuf -insertIntoRegBuf c = \case - RegOne c0 - | UC.combiningClass c < UC.combiningClass c0 - -> RegMany c c0 [] - | otherwise - -> RegMany c0 c [] - RegMany c0 c1 cs - | cc < UC.combiningClass c0 - -> RegMany c c0 (c1 : cs) - | cc < UC.combiningClass c1 - -> RegMany c0 c (c1 : cs) - | otherwise - -> RegMany c0 c1 (cs' ++ (c : cs'')) - where - cc = UC.combiningClass c - (cs', cs'') = span ((<= cc) . UC.combiningClass) cs +-- | /O(n)/ Convert a 'Stream Char' into a composed normalized 'Text'. +unstreamC :: UC.DecomposeMode -> Stream Char -> Text +unstreamC mode (Stream next0 s0 len) = runText $ \done -> do + -- Before encoding each char we perform a buffer realloc check assuming + -- worst case encoding size of two 16-bit units for the char. Just add an + -- extra space to the buffer so that we do not end up reallocating even when + -- all the chars are encoded as single unit. + let margin = 1 + maxDecomposeLen + mlen = upperBound 4 len + margin + arr0 <- A.new mlen + let outer !arr !maxi = encode SPEC + where + -- si: internal state + -- di: current char index [TODO] check this + -- [FIXME] cc: combining class of the previous character + -- st: compose state --- | Write generic compose buffer to a target array -{-# INLINE writeRegBuf #-} -writeRegBuf :: A.MArray s -> Int -> RegBuf -> ST s Int -writeRegBuf arr i = \case - RegOne c -> do - n <- unsafeWrite arr i c - return (i + n) - RegMany st c [] -> - case UC.compose st c of - Just x -> do - n <- unsafeWrite arr i x - return (i + n) - Nothing -> do - n <- unsafeWrite arr i st - m <- unsafeWrite arr (i + n) c - return (i + n + m) - RegMany st0 c0 cs0 -> go [] st0 (c0 : cs0) + -- keep the common case loop as small as possible + encode !_ !si !di st = + -- simply check for the worst case + if maxi < di + margin + then realloc si di st + else + case next0 si of + Done -> do + di' <- flushComposeState arr di st + done arr di' + Skip si' -> encode SPEC si' di st + Yield c si' -> do + (di', st') <- composeChar mode arr c di st + encode SPEC si' di' st' - where + -- keep uncommon case separate from the common case code + {-# NOINLINE realloc #-} + realloc !si !di st = do + let newlen = maxi * 2 + arr' <- A.new newlen + A.copyM arr' 0 arr 0 di + outer arr' (newlen - 1) si di st - -- arguments: uncombined chars, starter, unprocessed str - go uncs st [] = writeStr arr i (st : uncs) - go uncs st (c : cs) = case UC.compose st c of - Nothing -> go (uncs ++ (c : same)) st bigger - Just x -> go uncs x cs - where - cc = UC.combiningClass c - (same, bigger) = span ((== cc) . UC.combiningClass) cs + outer arr0 (mlen - 1) s0 0 ComposeNone +{-# INLINE [0] unstreamC #-} -- | Finalize composition {-# INLINE flushComposeState #-} @@ -624,45 +499,166 @@ composeChar mode marr = go0 | otherwise -> go rest i (ComposeReg (RegOne ch)) --- | /O(n)/ Convert a 'Stream Char' into a composed normalized 'Text'. -unstreamC :: UC.DecomposeMode -> Stream Char -> Text -unstreamC mode (Stream next0 s0 len) = runText $ \done -> do - -- Before encoding each char we perform a buffer realloc check assuming - -- worst case encoding size of two 16-bit units for the char. Just add an - -- extra space to the buffer so that we do not end up reallocating even when - -- all the chars are encoded as single unit. - let margin = 1 + maxDecomposeLen - mlen = upperBound 4 len + margin - arr0 <- A.new mlen - let outer !arr !maxi = encode SPEC - where - -- si: internal state - -- di: current char index [TODO] check this - -- [FIXME] cc: combining class of the previous character - -- st: compose state +-- | Write generic compose buffer to a target array +{-# INLINE writeRegBuf #-} +writeRegBuf :: A.MArray s -> Int -> RegBuf -> ST s Int +writeRegBuf arr i = \case + RegOne c -> do + n <- unsafeWrite arr i c + return (i + n) + RegMany st c [] -> + case UC.compose st c of + Just x -> do + n <- unsafeWrite arr i x + return (i + n) + Nothing -> do + n <- unsafeWrite arr i st + m <- unsafeWrite arr (i + n) c + return (i + n + m) + RegMany st0 c0 cs0 -> go [] st0 (c0 : cs0) - -- keep the common case loop as small as possible - encode !_ !si !di st = - -- simply check for the worst case - if maxi < di + margin - then realloc si di st - else - case next0 si of - Done -> do - di' <- flushComposeState arr di st - done arr di' - Skip si' -> encode SPEC si' di st - Yield c si' -> do - (di', st') <- composeChar mode arr c di st - encode SPEC si' di' st' + where - -- keep uncommon case separate from the common case code - {-# NOINLINE realloc #-} - realloc !si !di st = do - let newlen = maxi * 2 - arr' <- A.new newlen - A.copyM arr' 0 arr 0 di - outer arr' (newlen - 1) si di st + -- arguments: uncombined chars, starter, unprocessed str + go uncs st [] = writeStr arr i (st : uncs) + go uncs st (c : cs) = case UC.compose st c of + Nothing -> go (uncs ++ (c : same)) st bigger + Just x -> go uncs x cs + where + cc = UC.combiningClass c + (same, bigger) = span ((== cc) . UC.combiningClass) cs - outer arr0 (mlen - 1) s0 0 ComposeNone -{-# INLINE [0] unstreamC #-} +-- | Insert a combining character in its canonical position in a buffer. +{-# INLINE insertIntoRegBuf #-} +insertIntoRegBuf :: Char -> RegBuf -> RegBuf +insertIntoRegBuf c = \case + RegOne c0 + | UC.combiningClass c < UC.combiningClass c0 + -> RegMany c c0 [] + | otherwise + -> RegMany c0 c [] + RegMany c0 c1 cs + | cc < UC.combiningClass c0 + -> RegMany c c0 (c1 : cs) + | cc < UC.combiningClass c1 + -> RegMany c0 c (c1 : cs) + | otherwise + -> RegMany c0 c1 (cs' ++ (c : cs'')) + where + cc = UC.combiningClass c + (cs', cs'') = span ((<= cc) . UC.combiningClass) cs + +------------------------------------------------------------------------------- +-- Composition of Jamo into Hangul syllables, done algorithmically +------------------------------------------------------------------------------- + +-- If we are composing we do not need to first decompose Hangul. We can just +-- compose assuming there could be some partially composed syllables e.g. LV +-- syllable followed by a jamo T. We need to compose this case as well. + +-- Hold an L to wait for V, hold an LV to wait for T. +-- | Jamo buffer +data JamoBuf + = Jamo !Char + -- ^ Conjoining jamo L, V or T + | Hangul !Char + -- ^ Hangul syllable LV or LVT + | HangulLV !Char + -- ^ Hangul syllable LV + +-- | Write a jamo buffer to a target array +{-# INLINE writeJamoBuf #-} +writeJamoBuf :: A.MArray s -> Int -> JamoBuf -> ST s Int +writeJamoBuf arr i jbuf = do + n <- unsafeWrite arr i (getCh jbuf) + return (i + n) + + where + + getCh = \case + Jamo ch -> ch + Hangul ch -> ch + HangulLV ch -> ch + +-- | Initialize Jamo buffer with a precomposed Hangul character +{-# INLINE initHangul #-} +initHangul :: Char -> Int -> ST s (Int, ComposeState) +initHangul c i = return (i, ComposeJamo (Hangul c)) + +-- | Initialize Jamo buffer with conjoining jamo +{-# INLINE initJamo #-} +initJamo :: Char -> Int -> ST s (Int, ComposeState) +initJamo c i = return (i, ComposeJamo (Jamo c)) + +-- | Insert a conjoining jamo in the buffer +{-# INLINE insertJamo #-} +insertJamo + :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState) +insertJamo arr i jbuf ch + -- Leading consonant (L): write buffer and start new one + | ich <= UC.jamoLLast = do + j <- writeJamoBuf arr i jbuf + return (j, ComposeJamo (Jamo ch)) + -- Jamo that is neither L, V nor T + | ich < UC.jamoVFirst = + flushAndWrite arr i jbuf ch + -- Vowel (V): + | ich <= UC.jamoVLast = do + case jbuf of + Jamo c -> + case UC.jamoLIndex c of + Just li -> + let vi = ich - UC.jamoVFirst + lvi = li * UC.jamoNCount + vi * UC.jamoTCount + lv = chr (UC.hangulFirst + lvi) + in return (i, ComposeJamo (HangulLV lv)) + Nothing -> writeTwo arr i c ch + Hangul c -> writeTwo arr i c ch + HangulLV c -> writeTwo arr i c ch + -- Jamo that is neither L, V nor T + | ich <= UC.jamoTFirst = do + flushAndWrite arr i jbuf ch + -- Trailing consonant (T) + | otherwise = do + let ti = ich - UC.jamoTFirst + case jbuf of + Jamo c -> writeTwo arr i c ch + Hangul c + | UC.isHangulLV c -> do + writeLVT arr i c ti + | otherwise -> + writeTwo arr i c ch + HangulLV c -> + writeLVT arr i c ti + + where + + ich = ord ch + + -- Write buffer and the following character, and start an empty buffer. + {-# INLINE flushAndWrite #-} + flushAndWrite marr ix jb c = do + j <- writeJamoBuf marr ix jb + n <- unsafeWrite marr j c + return (j + n, ComposeNone) + + -- Write LVT syllable and start an empty buffer + {-# INLINE writeLVT #-} + writeLVT marr ix lv ti = do + n <- unsafeWrite marr ix (chr ((ord lv) + ti)) + return (ix + n, ComposeNone) + + -- Write two chars and start an empty buffer + {-# INLINE writeTwo #-} + writeTwo marr ix c1 c2 = do + n <- unsafeWrite marr ix c1 + m <- unsafeWrite marr (ix + n) c2 + return ((ix + n + m), ComposeNone) + +-- | Write buffer and start a new one with a pre-composed Hangul syllable. +{-# INLINE insertHangul #-} +insertHangul + :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState) +insertHangul arr i jbuf ch = do + j <- writeJamoBuf arr i jbuf + return (j, ComposeJamo (Hangul ch))