From f6e236a29815c560757012f74a815cda63ca7be6 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Tue, 7 Jun 2022 18:03:37 +0200 Subject: [PATCH] Start quick check implementation --- Data/Text/Normalize.hs | 8 ++-- Data/Unicode/Internal/NormalizeStream.hs | 57 +++++++++++++++++++++++- benchmark/Benchmark.hs | 40 ++++++++++++++--- cabal.project | 5 +++ test/NormalizationTest.hs | 9 ++-- 5 files changed, 107 insertions(+), 12 deletions(-) create mode 100644 cabal.project diff --git a/Data/Text/Normalize.hs b/Data/Text/Normalize.hs index c0d1826..aa09564 100644 --- a/Data/Text/Normalize.hs +++ b/Data/Text/Normalize.hs @@ -16,6 +16,7 @@ module Data.Text.Normalize NormalizationMode(..) -- * Normalization API , normalize + , normalizeQC ) where import Data.Text (Text) @@ -24,6 +25,7 @@ import Data.Unicode.Types (NormalizationMode(..)) -- Internal modules import Data.Unicode.Internal.NormalizeStream ( DecomposeMode(..) + , normalizeQC , stream , unstream , unstreamC @@ -34,7 +36,7 @@ import Data.Unicode.Internal.NormalizeStream normalize :: NormalizationMode -> Text -> Text normalize mode = case mode of - NFD -> (unstream Canonical) . stream - NFKD -> (unstream Kompat) . stream - NFC -> (unstreamC Canonical) . stream + NFD -> (unstream Canonical) . stream + NFKD -> (unstream Kompat) . stream + NFC -> (unstreamC Canonical) . stream NFKC -> (unstreamC Kompat) . stream diff --git a/Data/Unicode/Internal/NormalizeStream.hs b/Data/Unicode/Internal/NormalizeStream.hs index e5fb2c3..38d54e1 100644 --- a/Data/Unicode/Internal/NormalizeStream.hs +++ b/Data/Unicode/Internal/NormalizeStream.hs @@ -17,6 +17,7 @@ module Data.Unicode.Internal.NormalizeStream ( UC.DecomposeMode(..) + , normalizeQC , stream , unstream , unstreamC @@ -24,11 +25,19 @@ module Data.Unicode.Internal.NormalizeStream where import Data.Char (chr, ord) +import Data.Unicode.Types (NormalizationMode(..)) import GHC.ST (ST(..)) import GHC.Types (SPEC(..)) +import Unicode.Char.Normalization + ( QuickCheck(..) + , isNFD + , isNFKD + , isNFC + , isNFKC + ) -import qualified Data.Text.Array as A import qualified Unicode.Char as UC +import qualified Data.Text.Array as A #if MIN_VERSION_text(2,0,0) import Data.Text.Internal.Fusion (stream) @@ -169,6 +178,52 @@ stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len) {-# INLINE [0] stream #-} #endif +-- | Perform Unicode normalization on @Text@ according to the specified +-- normalization mode, using the quick check algorithm to speed up case when +-- the text is already normalized. +normalizeQC + :: NormalizationMode + -> Text + -> Text +normalizeQC mode t = case stream t of + str@(Stream next0 s0 _len) -> + let quickCheck s cc r = case next0 s of + Done -> r + Skip s' -> quickCheck s' cc r + Yield c s' -> + -- [TODO] the ASCII check speeds up latin scripts, + -- but it slows down the other scripts. + if c <= '\x7f' + -- For ASCII we know it’s always allowed and a starter + then quickCheck s' 0 r + -- Otherwise, lookup the combining class and QC property + else let cc' = UC.combiningClass c in + if cc > cc' && cc' /= 0 + then No + else case check c of + No -> No + Maybe -> quickCheck s' cc' Maybe + Yes -> quickCheck s' cc' r + -- let cc' = UC.combiningClass c + -- in if cc > cc' && cc' /= 0 + -- then No + -- else case check c of + -- No -> No + -- Maybe -> quickCheck s' cc' Maybe + -- Yes -> quickCheck s' cc' r + check = case mode of + NFD -> isNFD + NFKD -> isNFKD + NFC -> isNFC + NFKC -> isNFKC + in case quickCheck s0 0 Yes of + Yes -> t + _ -> case mode of + NFD -> unstream UC.Canonical str + NFKD -> unstream UC.Kompat str + NFC -> unstreamC UC.Canonical str + NFKC -> unstreamC UC.Kompat str + -- | /O(n)/ Convert a 'Stream Char' into a decompose-normalized 'Text'. unstream :: UC.DecomposeMode -> Stream Char -> Text unstream mode (Stream next0 s0 len) = runText $ \done -> do diff --git a/benchmark/Benchmark.hs b/benchmark/Benchmark.hs index 4f35cab..cc6bef9 100644 --- a/benchmark/Benchmark.hs +++ b/benchmark/Benchmark.hs @@ -20,6 +20,9 @@ import Path.IO (listDir) import System.FilePath (dropExtensions, takeFileName) import Gauge.Main (Benchmark, bench, bgroup, defaultMain, env, nf) +#ifdef USE_TASTY +import Test.Tasty.Bench (bcompare) +#endif import qualified Data.Text as T import qualified Data.Text.Normalize as UTText @@ -44,6 +47,14 @@ unicodeTransformTextFuncs = , ("NFKC", UTText.normalize UTText.NFKC) ] +unicodeTransformTextFuncsQuickCheck :: [(String, Text -> Text)] +unicodeTransformTextFuncsQuickCheck = + [ ("NFD", UTText.normalizeQC UTText.NFD) + , ("NFKD", UTText.normalizeQC UTText.NFKD) + , ("NFC", UTText.normalizeQC UTText.NFC) + , ("NFKC", UTText.normalizeQC UTText.NFKC) + ] + dataDir :: Path Rel Dir dataDir = $(mkRelDir "benchmark") $(mkRelDir "data") @@ -53,9 +64,25 @@ dataDir = $(mkRelDir "benchmark") $(mkRelDir "data") dataSetSize :: Int dataSetSize = 1000000 -makeBench :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark -makeBench (implName, func) (dataName, setup) = - env setup (\txt -> bench (implName ++ "/" ++ dataName) (nf func txt)) +makeBench :: (NFData a, NFData b) => (String, a -> b) -> String -> a -> Benchmark +makeBench (implName, func) dataName = + \txt -> bench (makeTestName implName dataName) (nf func txt) + +makeTestName :: String -> String -> String +makeTestName implName dataName = implName ++ "/" ++ dataName + +makeBenchRef :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark +makeBenchRef impl (dataName, setup) = env setup (makeBench impl dataName) + +makeBenchComp :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark +#ifdef USE_TASTY +makeBenchComp impl (dataName, setup) = env setup + ( bcompare ("$NF == \"" <> (makeTestName (fst impl) dataName) + <> "\" && $(NF-1) == \"unicode-transforms-text\"") + . makeBench impl dataName) +#else +makeBenchComp = makeBenchRef +#endif strInput :: FilePath -> (String, IO String) strInput file = (dataName file, @@ -73,10 +100,13 @@ main = do [ #ifdef BENCH_ICU bgroup "text-icu" - $ makeBench <$> textICUFuncs <*> (map txtInput dataFiles) + $ makeBenchComp <$> textICUFuncs <*> (map txtInput dataFiles) , #endif bgroup "unicode-transforms-text" - $ makeBench <$> unicodeTransformTextFuncs + $ makeBenchRef <$> unicodeTransformTextFuncs + <*> (map txtInput dataFiles) + , bgroup "unicode-transforms-text (QC)" + $ makeBenchComp <$> unicodeTransformTextFuncsQuickCheck <*> (map txtInput dataFiles) ] diff --git a/cabal.project b/cabal.project new file mode 100644 index 0000000..8d0d630 --- /dev/null +++ b/cabal.project @@ -0,0 +1,5 @@ +packages: . +source-repository-package + type: git + location: https://github.com/composewell/unicode-data.git + tag: d81b67cc76d7f312a35de8e2a42c8e856c393885 diff --git a/test/NormalizationTest.hs b/test/NormalizationTest.hs index 2d0e14e..452dd15 100644 --- a/test/NormalizationTest.hs +++ b/test/NormalizationTest.hs @@ -22,7 +22,9 @@ import Data.List.Split (splitOn) import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Encoding as T -import Data.Text.Normalize (NormalizationMode(NFD, NFKD, NFC, NFKC), normalize) +import Data.Text.Normalize + ( NormalizationMode(NFD, NFKD, NFC, NFKC) + , normalize, normalizeQC) import Text.Printf (printf) #if !MIN_VERSION_base(4,8,0) @@ -49,8 +51,9 @@ checkEqual opName op (c1, c2) = checkOp :: String -> NormalizationMode -> [(Text, Text)] -> IO Bool checkOp name op pairs = do - res <- mapM (checkEqual name ((normalize op))) pairs - return $ all (== True) res + res1 <- mapM (checkEqual name ((normalize op))) pairs + res2 <- mapM (checkEqual name ((normalizeQC op))) pairs + return $ all (== True) res1 && all (== True) res2 checkNFC :: (Text, Text, Text, Text, Text) -> IO Bool checkNFC (c1, c2, c3, c4, c5) =