-
Notifications
You must be signed in to change notification settings - Fork 0
/
Encoder.hs
170 lines (133 loc) · 5.65 KB
/
Encoder.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
module Encoder (encode) where
import Shared
import System.FilePath
import System.Directory
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Builder as BLB
import Data.Binary.Put
import Data.Word (Word8)
import Data.IntMap (IntMap)
import qualified Data.IntMap as IntMap
import qualified Data.IntMap.Strict as SIntMap
import Data.List
data HuffTree = Leaf Int Int | Node Int HuffTree HuffTree deriving Show
type FileInfo = (FileEntry, IntMap Int)
encode :: ([FilePath], FilePath) -> IO ()
encode (inFiles, outFile) = flatList inFiles >>= writeEncoding outFile
{- IO functions to encode and write the results -}
writeEncoding :: FilePath -> [FilePath] -> IO ()
writeEncoding outFile inFiles = do
putStrLn "\nReading frequences and writing header"
filesInfo <- mapM readInfo inFiles
BL.writeFile outFile . runPut . putHeader $ map fst filesInfo
putStrLn "\nStarting compression"
mapM_ (compress outFile) filesInfo
putStrLn "\nFinished"
compress :: FilePath -> FileInfo -> IO ()
compress outFile (fileEntry, sizesMap) = do
let filePath = path fileEntry
putComp = makePutComp sizesMap
putStrLn $ "Compressing: " ++ filePath
BL.readFile filePath >>= BL.appendFile outFile . runPut . putComp . BL.unpack
readInfo :: FilePath -> IO FileInfo
readInfo filePath = do
putStrLn $ "reading: " ++ filePath
fileSize <- getFileSize filePath
freqs <- readFrequences filePath
return $ generateInfo filePath fileSize freqs
{- Put functions used by the IO functions -}
putHeader :: [FileEntry] -> Put
putHeader entries = do
putWord8 0 {- flag to be used in the future -}
putNum entries
mapM_ putEntry entries
putNum :: [FileEntry] -> Put
putNum = putInt16be . fromIntegral . length
putEntry :: FileEntry -> Put
putEntry fileEntry = do
putInt32be $ origSize fileEntry
putInt32be $ comprSize fileEntry
putInt16be . fromIntegral . length $ path fileEntry
putLazyByteString . BLB.toLazyByteString . BLB.string8 $ path fileEntry
makePutComp :: IntMap Int -> [Word8] -> Put
makePutComp sizesMap = putCompression codebook encoder
where
codebook = makeCodebook sizesMap
encoder = getHuffEncoder $ IntMap.toList sizesMap
putCompression :: [Int] -> ([Word8] -> [Word8]) -> [Word8] -> Put
putCompression codebook encoder input = do
putCodebook codebook
mapM_ putWord8 $ encoder input
putCodebook :: [Int] -> Put
putCodebook = mapM_ (putWord8 . fromIntegral)
{- Functions to generate info required by the Put functions -}
generateInfo :: FilePath -> Integer -> IntMap Int -> FileInfo
generateInfo filePath fileSize freqs = (fileEntry, sizesMap)
where
sizesMap = codesSize 0 $ treeFromFreqs freqs
compSize = 256 + compressedSize freqs sizesMap
fileEntry = FileEntry {origSize = fromInteger fileSize,
comprSize = fromIntegral compSize,
path = filePath}
makeCodebook :: IntMap Int -> [Int]
makeCodebook sizesMap = map (codebookLookup sizesMap) [0..255]
codebookLookup :: IntMap Int -> Int -> Int
codebookLookup sizesMap n = IntMap.findWithDefault 0 n sizesMap
compressedSize :: IntMap Int -> IntMap Int -> Int
compressedSize freqs sizes = bytesNum + ceil
where
bitsNum = sum . IntMap.elems $ IntMap.unionWith (*) freqs sizes
bytesNum = div bitsNum 8
ceil = if mod bitsNum 8 /= 0 then 1 else 0
{- Maps every byte to the bit-lenght of it's encoding from the Huffman tree -}
codesSize :: Int -> HuffTree -> IntMap Int
codesSize d (Leaf val _) = IntMap.singleton val d
codesSize d (Node _ l r) = IntMap.union (codesSize (d+1) l) (codesSize (d+1) r)
{- Functions to make a Huffman tree from a map of bytes frequences -}
treeFromFreqs :: IntMap Int -> HuffTree
treeFromFreqs = findTree . map toLeaf . sortOn snd . IntMap.toList
findTree :: [HuffTree] -> HuffTree
findTree [] = Leaf 0 0
findTree [x] = x
findTree (a:b:lst) = findTree . priorityInsert lst $ mergeCouple (a,b)
mergeCouple :: (HuffTree, HuffTree) -> HuffTree
mergeCouple (l, r) = Node (treeFreq l + treeFreq r) l r
priorityInsert :: [HuffTree] -> HuffTree -> [HuffTree]
priorityInsert [] tree = [tree]
priorityInsert (x:xs) tree
| treeFreq x < treeFreq tree = x : priorityInsert xs tree
| otherwise = tree : x : xs
treeFreq :: HuffTree -> Int
treeFreq (Leaf _ freq) = freq
treeFreq (Node freq _ _) = freq
toLeaf :: (Int, Int) -> HuffTree
toLeaf (val, freq) = Leaf val freq
{- Functions to read the frequence of each byte in a file -}
{- NOTE: uses strict IntMaps and foldl' to run in constant memory -}
readFrequences :: FilePath -> IO (IntMap Int)
readFrequences = (BL.foldl' mapFrequences SIntMap.empty <$>) . BL.readFile
mapFrequences :: SIntMap.IntMap Int -> Word8 -> SIntMap.IntMap Int
mapFrequences fr n = SIntMap.insertWith (+) (fromIntegral n) 1 fr
{- To unfold a list of files/directories into a flat list of existing files -}
flatList :: [FilePath] -> IO [FilePath]
flatList [] = return []
flatList (filePath:paths) = do
isDir <- doesDirectoryExist filePath
isFile <- doesFileExist filePath
following <- flatList paths
if isDir then do
children <- listDirectory filePath
flatChildren <- flatList $ map (filePath </>) children
return $ following ++ flatChildren
else if isFile then return $ filePath : following
else do
putStrLn $ "Warning: " ++ filePath ++ " is not a file. Skipped."
return following
{- Functions that work with lists of bits (big-endian) -}
getHuffEncoder :: [(Int, Int)] -> ([Word8] -> [Word8])
getHuffEncoder = huffEncoder . IntMap.fromList . canonicalList
huffEncoder :: IntMap BitList -> [Word8] -> [Word8]
huffEncoder cMap = wrdsFromBits . concatMap (wrdMapper cMap)
wrdMapper :: IntMap BitList -> Word8 -> BitList
wrdMapper cMap x = lst
where (Just lst) = IntMap.lookup (fromIntegral x) cMap