Skip to content

Commit

Permalink
Start quick check implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
wismill committed Jun 7, 2022
1 parent 278f273 commit f6e236a
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 12 deletions.
8 changes: 5 additions & 3 deletions Data/Text/Normalize.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ module Data.Text.Normalize
NormalizationMode(..)
-- * Normalization API
, normalize
, normalizeQC
) where

import Data.Text (Text)
Expand All @@ -24,6 +25,7 @@ import Data.Unicode.Types (NormalizationMode(..))
-- Internal modules
import Data.Unicode.Internal.NormalizeStream
( DecomposeMode(..)
, normalizeQC
, stream
, unstream
, unstreamC
Expand All @@ -34,7 +36,7 @@ import Data.Unicode.Internal.NormalizeStream
normalize :: NormalizationMode -> Text -> Text
normalize mode =
case mode of
NFD -> (unstream Canonical) . stream
NFKD -> (unstream Kompat) . stream
NFC -> (unstreamC Canonical) . stream
NFD -> (unstream Canonical) . stream
NFKD -> (unstream Kompat) . stream
NFC -> (unstreamC Canonical) . stream
NFKC -> (unstreamC Kompat) . stream
57 changes: 56 additions & 1 deletion Data/Unicode/Internal/NormalizeStream.hs
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,27 @@
module Data.Unicode.Internal.NormalizeStream
(
UC.DecomposeMode(..)
, normalizeQC
, stream
, unstream
, unstreamC
)
where

import Data.Char (chr, ord)
import Data.Unicode.Types (NormalizationMode(..))
import GHC.ST (ST(..))
import GHC.Types (SPEC(..))
import Unicode.Char.Normalization
( QuickCheck(..)
, isNFD
, isNFKD
, isNFC
, isNFKC
)

import qualified Data.Text.Array as A
import qualified Unicode.Char as UC
import qualified Data.Text.Array as A

#if MIN_VERSION_text(2,0,0)
import Data.Text.Internal.Fusion (stream)
Expand Down Expand Up @@ -169,6 +178,52 @@ stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len)
{-# INLINE [0] stream #-}
#endif

-- | Perform Unicode normalization on @Text@ according to the specified
-- normalization mode, using the quick check algorithm to speed up case when
-- the text is already normalized.
normalizeQC
:: NormalizationMode
-> Text
-> Text
normalizeQC mode t = case stream t of
str@(Stream next0 s0 _len) ->
let quickCheck s cc r = case next0 s of
Done -> r
Skip s' -> quickCheck s' cc r
Yield c s' ->
-- [TODO] the ASCII check speeds up latin scripts,
-- but it slows down the other scripts.
if c <= '\x7f'
-- For ASCII we know it’s always allowed and a starter
then quickCheck s' 0 r
-- Otherwise, lookup the combining class and QC property
else let cc' = UC.combiningClass c in
if cc > cc' && cc' /= 0
then No
else case check c of
No -> No
Maybe -> quickCheck s' cc' Maybe
Yes -> quickCheck s' cc' r
-- let cc' = UC.combiningClass c
-- in if cc > cc' && cc' /= 0
-- then No
-- else case check c of
-- No -> No
-- Maybe -> quickCheck s' cc' Maybe
-- Yes -> quickCheck s' cc' r
check = case mode of
NFD -> isNFD
NFKD -> isNFKD
NFC -> isNFC
NFKC -> isNFKC
in case quickCheck s0 0 Yes of
Yes -> t
_ -> case mode of
NFD -> unstream UC.Canonical str
NFKD -> unstream UC.Kompat str
NFC -> unstreamC UC.Canonical str
NFKC -> unstreamC UC.Kompat str

-- | /O(n)/ Convert a 'Stream Char' into a decompose-normalized 'Text'.
unstream :: UC.DecomposeMode -> Stream Char -> Text
unstream mode (Stream next0 s0 len) = runText $ \done -> do
Expand Down
40 changes: 35 additions & 5 deletions benchmark/Benchmark.hs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ import Path.IO (listDir)
import System.FilePath (dropExtensions, takeFileName)

import Gauge.Main (Benchmark, bench, bgroup, defaultMain, env, nf)
#ifdef USE_TASTY
import Test.Tasty.Bench (bcompare)
#endif

import qualified Data.Text as T
import qualified Data.Text.Normalize as UTText
Expand All @@ -44,6 +47,14 @@ unicodeTransformTextFuncs =
, ("NFKC", UTText.normalize UTText.NFKC)
]

unicodeTransformTextFuncsQuickCheck :: [(String, Text -> Text)]
unicodeTransformTextFuncsQuickCheck =
[ ("NFD", UTText.normalizeQC UTText.NFD)
, ("NFKD", UTText.normalizeQC UTText.NFKD)
, ("NFC", UTText.normalizeQC UTText.NFC)
, ("NFKC", UTText.normalizeQC UTText.NFKC)
]

dataDir :: Path Rel Dir
dataDir = $(mkRelDir "benchmark") </> $(mkRelDir "data")

Expand All @@ -53,9 +64,25 @@ dataDir = $(mkRelDir "benchmark") </> $(mkRelDir "data")
dataSetSize :: Int
dataSetSize = 1000000

makeBench :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark
makeBench (implName, func) (dataName, setup) =
env setup (\txt -> bench (implName ++ "/" ++ dataName) (nf func txt))
makeBench :: (NFData a, NFData b) => (String, a -> b) -> String -> a -> Benchmark
makeBench (implName, func) dataName =
\txt -> bench (makeTestName implName dataName) (nf func txt)

makeTestName :: String -> String -> String
makeTestName implName dataName = implName ++ "/" ++ dataName

makeBenchRef :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark
makeBenchRef impl (dataName, setup) = env setup (makeBench impl dataName)

makeBenchComp :: (NFData a, NFData b) => (String, a -> b) -> (String, IO a) -> Benchmark
#ifdef USE_TASTY
makeBenchComp impl (dataName, setup) = env setup
( bcompare ("$NF == \"" <> (makeTestName (fst impl) dataName)
<> "\" && $(NF-1) == \"unicode-transforms-text\"")
. makeBench impl dataName)
#else
makeBenchComp = makeBenchRef
#endif

strInput :: FilePath -> (String, IO String)
strInput file = (dataName file,
Expand All @@ -73,10 +100,13 @@ main = do
[
#ifdef BENCH_ICU
bgroup "text-icu"
$ makeBench <$> textICUFuncs <*> (map txtInput dataFiles)
$ makeBenchComp <$> textICUFuncs <*> (map txtInput dataFiles)
,
#endif
bgroup "unicode-transforms-text"
$ makeBench <$> unicodeTransformTextFuncs
$ makeBenchRef <$> unicodeTransformTextFuncs
<*> (map txtInput dataFiles)
, bgroup "unicode-transforms-text (QC)"
$ makeBenchComp <$> unicodeTransformTextFuncsQuickCheck
<*> (map txtInput dataFiles)
]
5 changes: 5 additions & 0 deletions cabal.project
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
packages: .
source-repository-package
type: git
location: https://github.com/composewell/unicode-data.git
tag: d81b67cc76d7f312a35de8e2a42c8e856c393885
9 changes: 6 additions & 3 deletions test/NormalizationTest.hs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ import Data.List.Split (splitOn)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Data.Text.Normalize (NormalizationMode(NFD, NFKD, NFC, NFKC), normalize)
import Data.Text.Normalize
( NormalizationMode(NFD, NFKD, NFC, NFKC)
, normalize, normalizeQC)
import Text.Printf (printf)

#if !MIN_VERSION_base(4,8,0)
Expand All @@ -49,8 +51,9 @@ checkEqual opName op (c1, c2) =

checkOp :: String -> NormalizationMode -> [(Text, Text)] -> IO Bool
checkOp name op pairs = do
res <- mapM (checkEqual name ((normalize op))) pairs
return $ all (== True) res
res1 <- mapM (checkEqual name ((normalize op))) pairs
res2 <- mapM (checkEqual name ((normalizeQC op))) pairs
return $ all (== True) res1 && all (== True) res2

checkNFC :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFC (c1, c2, c3, c4, c5) =
Expand Down

0 comments on commit f6e236a

Please sign in to comment.