Start quick check implementation

composewell · Jun 7, 2022 · f6e236a · f6e236a
1 parent 278f273
commit f6e236a
Show file tree

Hide file tree

Showing 5 changed files with 107 additions and 12 deletions.
diff --git a/Data/Text/Normalize.hs b/Data/Text/Normalize.hs
@@ -16,6 +16,7 @@ module Data.Text.Normalize
       NormalizationMode(..)
     -- * Normalization API
     , normalize
+    , normalizeQC
     ) where
 
 import Data.Text (Text)
@@ -24,6 +25,7 @@ import Data.Unicode.Types (NormalizationMode(..))
 -- Internal modules
 import Data.Unicode.Internal.NormalizeStream
     ( DecomposeMode(..)
+    , normalizeQC
     , stream
     , unstream
     , unstreamC
@@ -34,7 +36,7 @@ import Data.Unicode.Internal.NormalizeStream
 normalize :: NormalizationMode -> Text -> Text
 normalize mode =
     case mode of
-      NFD  -> (unstream Canonical)   . stream
-      NFKD -> (unstream Kompat)  . stream
-      NFC  -> (unstreamC Canonical)  . stream
+      NFD  -> (unstream Canonical) . stream
+      NFKD -> (unstream Kompat) . stream
+      NFC  -> (unstreamC Canonical) . stream
       NFKC -> (unstreamC Kompat) . stream
diff --git a/Data/Unicode/Internal/NormalizeStream.hs b/Data/Unicode/Internal/NormalizeStream.hs
@@ -17,18 +17,27 @@
 module Data.Unicode.Internal.NormalizeStream
     (
       UC.DecomposeMode(..)
+    , normalizeQC
     , stream
     , unstream
     , unstreamC
     )
     where
 
 import Data.Char (chr, ord)
+import Data.Unicode.Types (NormalizationMode(..))
 import GHC.ST (ST(..))
 import GHC.Types (SPEC(..))
+import Unicode.Char.Normalization
+    ( QuickCheck(..)
+    , isNFD
+    , isNFKD
+    , isNFC
+    , isNFKC
+    )
 
-import qualified Data.Text.Array as A
 import qualified Unicode.Char as UC
+import qualified Data.Text.Array as A
 
 #if MIN_VERSION_text(2,0,0)
 import Data.Text.Internal.Fusion (stream)
@@ -169,6 +178,52 @@ stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len)
 {-# INLINE [0] stream #-}
 #endif
 
+-- | Perform Unicode normalization on @Text@ according to the specified
+-- normalization mode, using the quick check algorithm to speed up case when 
+-- the text is already normalized.
+normalizeQC
+    :: NormalizationMode
+    -> Text
+    -> Text
+normalizeQC mode t = case stream t of
+    str@(Stream next0 s0 _len) ->
+        let quickCheck s cc r = case next0 s of
+                Done       -> r
+                Skip    s' -> quickCheck s' cc r
+                Yield c s' ->
+                    -- [TODO] the ASCII check speeds up latin scripts,
+                    --        but it slows down the other scripts.
+                    if c <= '\x7f'
+                        -- For ASCII we know it’s always allowed and a starter
+                        then quickCheck s' 0 r
+                        -- Otherwise, lookup the combining class and QC property
+                        else let cc' = UC.combiningClass c in
+                            if cc > cc' && cc' /= 0
+                                then No
+                                else case check c of
+                                    No    -> No
+                                    Maybe -> quickCheck s' cc' Maybe
+                                    Yes   -> quickCheck s' cc' r
+                    -- let cc' = UC.combiningClass c
+                    -- in if cc > cc' && cc' /= 0
+                    --     then No
+                    --     else case check c of
+                    --         No    -> No
+                    --         Maybe -> quickCheck s' cc' Maybe
+                    --         Yes   -> quickCheck s' cc' r
+            check = case mode of
+                NFD  -> isNFD
+                NFKD -> isNFKD
+                NFC  -> isNFC
+                NFKC -> isNFKC
+        in case quickCheck s0 0 Yes of
+            Yes -> t
+            _   -> case mode of
+                NFD  -> unstream  UC.Canonical str
+                NFKD -> unstream  UC.Kompat    str
+                NFC  -> unstreamC UC.Canonical str
+                NFKC -> unstreamC UC.Kompat    str
+
 -- | /O(n)/ Convert a 'Stream Char' into a decompose-normalized 'Text'.
 unstream :: UC.DecomposeMode -> Stream Char -> Text
 unstream mode (Stream next0 s0 len) = runText $ \done -> do

diff --git a/benchmark/Benchmark.hs b/benchmark/Benchmark.hs
@@ -20,6 +20,9 @@ import Path.IO (listDir)
 import System.FilePath (dropExtensions, takeFileName)
 
 import Gauge.Main (Benchmark, bench, bgroup, defaultMain, env, nf)
+#ifdef USE_TASTY
+import Test.Tasty.Bench (bcompare)
+#endif
 
 import qualified Data.Text as T
 import qualified Data.Text.Normalize as UTText
@@ -44,6 +47,14 @@ unicodeTransformTextFuncs =
     , ("NFKC", UTText.normalize UTText.NFKC)
     ]
 
+unicodeTransformTextFuncsQuickCheck :: [(String, Text -> Text)]
+unicodeTransformTextFuncsQuickCheck =
+    [ ("NFD", UTText.normalizeQC UTText.NFD)
+    , ("NFKD", UTText.normalizeQC UTText.NFKD)
+    , ("NFC", UTText.normalizeQC UTText.NFC)
+    , ("NFKC", UTText.normalizeQC UTText.NFKC)
+    ]
+
 dataDir :: Path Rel Dir
 dataDir = $(mkRelDir "benchmark") </> $(mkRelDir "data")
 
@@ -53,9 +64,25 @@ dataDir = $(mkRelDir "benchmark") </> $(mkRelDir "data")
 dataSetSize :: Int
 dataSetSize = 1000000
 
-makeBench :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark
-makeBench (implName, func) (dataName, setup) =
-    env setup (\txt -> bench (implName ++ "/" ++ dataName) (nf func txt))
+makeBench :: (NFData a, NFData b) => (String, a -> b) -> String -> a -> Benchmark
+makeBench (implName, func) dataName =
+    \txt -> bench (makeTestName implName dataName) (nf func txt)
+
+makeTestName :: String -> String -> String
+makeTestName implName dataName = implName ++ "/" ++ dataName
+
+makeBenchRef :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark
+makeBenchRef impl (dataName, setup) = env setup (makeBench impl dataName)
+
+makeBenchComp :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark
+#ifdef USE_TASTY
+makeBenchComp impl (dataName, setup) = env setup
+    ( bcompare ("$NF == \"" <> (makeTestName (fst impl) dataName) 
+               <> "\" && $(NF-1) == \"unicode-transforms-text\"")
+    . makeBench impl dataName)
+#else
+makeBenchComp = makeBenchRef
+#endif
 
 strInput :: FilePath -> (String, IO String)
 strInput file = (dataName file,
@@ -73,10 +100,13 @@ main = do
         [
 #ifdef BENCH_ICU
           bgroup "text-icu"
-              $ makeBench <$> textICUFuncs <*> (map txtInput dataFiles)
+              $ makeBenchComp <$> textICUFuncs <*> (map txtInput dataFiles)
         ,
 #endif
           bgroup "unicode-transforms-text"
-            $ makeBench <$> unicodeTransformTextFuncs
+            $ makeBenchRef <$> unicodeTransformTextFuncs
+                        <*> (map txtInput dataFiles)
+        , bgroup "unicode-transforms-text (QC)"
+            $ makeBenchComp <$> unicodeTransformTextFuncsQuickCheck
                         <*> (map txtInput dataFiles)
         ]
diff --git a/cabal.project b/cabal.project
@@ -0,0 +1,5 @@
+packages: .
+source-repository-package
+    type: git
+    location: https://github.com/composewell/unicode-data.git
+    tag: d81b67cc76d7f312a35de8e2a42c8e856c393885
diff --git a/test/NormalizationTest.hs b/test/NormalizationTest.hs
@@ -22,7 +22,9 @@ import Data.List.Split (splitOn)
 import Data.Text (Text)
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as T
-import Data.Text.Normalize (NormalizationMode(NFD, NFKD, NFC, NFKC), normalize)
+import Data.Text.Normalize 
+    ( NormalizationMode(NFD, NFKD, NFC, NFKC)
+    , normalize, normalizeQC)
 import Text.Printf (printf)
 
 #if !MIN_VERSION_base(4,8,0)
@@ -49,8 +51,9 @@ checkEqual opName op (c1, c2) =
 
 checkOp :: String -> NormalizationMode -> [(Text, Text)] -> IO Bool
 checkOp name op pairs = do
-    res <- mapM (checkEqual name ((normalize op))) pairs
-    return $ all (== True) res
+    res1 <- mapM (checkEqual name ((normalize   op))) pairs
+    res2 <- mapM (checkEqual name ((normalizeQC op))) pairs
+    return $ all (== True) res1 && all (== True) res2
 
 checkNFC :: (Text, Text, Text, Text, Text) -> IO Bool
 checkNFC (c1, c2, c3, c4, c5) =