From 08ba10e26ca7c2679e62c46f1e2f4bd2d538b250 Mon Sep 17 00:00:00 2001
From: Alexander Mankuta <alex@pointless.one>
Date: Wed, 6 Dec 2023 12:50:06 +0200
Subject: [PATCH] A whole bunch of CFF fixes

## Corrupted CFF index data

There was a subtle bug in the CFF Index implementation that resulted in
data corruption. In certain circumstances some items didn't get
properly encoded. This happened when items were not previously accessed.

This resulted, for instance, in missing glyphs. But only sometimes
because indexes might still have contained data that shouldn't have been
there. In combination with incorrect encoding (see further) this
resulted in some glyphs still being rendered, sometimes even correctly.

Along with the fix a rather large API change landed. This resulted in
quite a big diff.

## Incorrect CFF encoding in subsets

TTFunk used to reuse encoding from the original font. This mapping was
incorrect for subset fonts which used not just a subset of glyphs but
also a different encoding.

A separate issue was that some fonts have empty CFF encoding. This
incorrect mapping resulted in encoding that mapped all codes to glyph 0.

This had impact on Prawn in particular. PDF spec explicitly says that
CFF encoding is not to be used in OpenType fonts. `cmap` table should
directly index charstrings in the CFF table. Despite this PDF renderers
still use CFF encoding to retrieve glyphs. So TTFunk has to discard the
original CFF encoding and supply its own.
---
 CHANGELOG.md                               |  35 +++++
 lib/ttfunk/otf_encoder.rb                  |  11 +-
 lib/ttfunk/subset/code_page.rb             |   1 +
 lib/ttfunk/table/cff.rb                    |  12 +-
 lib/ttfunk/table/cff/charset.rb            |  31 +++--
 lib/ttfunk/table/cff/charstring.rb         |   4 -
 lib/ttfunk/table/cff/charstrings_index.rb  |  18 +--
 lib/ttfunk/table/cff/encoding.rb           |  46 +++----
 lib/ttfunk/table/cff/fd_selector.rb        |  20 +--
 lib/ttfunk/table/cff/font_dict.rb          |   6 +-
 lib/ttfunk/table/cff/font_index.rb         |  23 ++--
 lib/ttfunk/table/cff/index.rb              | 137 +++++++++++---------
 lib/ttfunk/table/cff/one_based_index.rb    |   2 +-
 lib/ttfunk/table/cff/private_dict.rb       |   4 +-
 lib/ttfunk/table/cff/subr_index.rb         |   4 +-
 lib/ttfunk/table/cff/top_dict.rb           |  18 ++-
 lib/ttfunk/table/cff/top_index.rb          |  15 ++-
 spec/ttfunk/table/cff/charset_spec.rb      |  33 +++--
 spec/ttfunk/table/cff/encoding_spec.rb     |  31 ++---
 spec/ttfunk/table/cff/fd_selector_spec.rb  |  20 +--
 spec/ttfunk/table/cff/font_dict_spec.rb    |   6 +-
 spec/ttfunk/table/cff/font_index_spec.rb   |   2 +-
 spec/ttfunk/table/cff/index_spec.rb        | 143 ++++++++++++++++++---
 spec/ttfunk/table/cff/private_dict_spec.rb |   4 +-
 spec/ttfunk/table/cff/top_dict_spec.rb     |   5 +-
 spec/ttfunk/table/cff/top_index_spec.rb    |   2 +-
 26 files changed, 401 insertions(+), 232 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 546ede4a..113c68ae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,41 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
 ## [Unreleased]
 
+### Fixed
+
+* Corrupted CFF index data
+
+  There was a subtle bug in the CFF Index implementation that resulted in
+  data corruption. In certain circumstances some items didn't get
+  properly encoded. This happened when items were not previously accessed.
+
+  This resulted, for instance, in missing glyphs. But only sometimes
+  because indexes might still have contained data that shouldn't have been
+  there. In combination with incorrect encoding (see further) this
+  resulted in some glyphs still being rendered, sometimes even correctly.
+
+  Along with the fix a rather large API change landed. This resulted in
+  quite a big diff.
+
+  Alexander Mankuta
+
+* Incorrect CFF encoding in subsets
+
+  TTFunk used to reuse encoding from the original font. This mapping was
+  incorrect for subset fonts which used not just a subset of glyphs but
+  also a different encoding.
+
+  A separate issue was that some fonts have empty CFF encoding. This
+  incorrect mapping resulted in encoding that mapped all codes to glyph 0.
+
+  This had impact on Prawn in particular. PDF spec explicitly says that
+  CFF encoding is not to be used in OpenType fonts. `cmap` table should
+  directly index charstrings in the CFF table. Despite this PDF renderers
+  still use CFF encoding to retrieve glyphs. So TTFunk has to discard the
+  original CFF encoding and supply its own.
+
+  Alexander Mankuta
+
 ## 1.7.0
 
 ### Changes
diff --git a/lib/ttfunk/otf_encoder.rb b/lib/ttfunk/otf_encoder.rb
index 408e1a52..3bddfde6 100644
--- a/lib/ttfunk/otf_encoder.rb
+++ b/lib/ttfunk/otf_encoder.rb
@@ -27,7 +27,7 @@ def base_table
     end
 
     def cff_table
-      @cff_table ||= original.cff.encode(new_to_old_glyph, old_to_new_glyph)
+      @cff_table ||= original.cff.encode(subset)
     end
 
     def vorg_table
@@ -48,14 +48,5 @@ def optimal_table_order
         (tables.keys - ['DSIG'] - OPTIMAL_TABLE_ORDER) +
         ['DSIG']
     end
-
-    def collect_glyphs(glyph_ids)
-      # CFF top indexes are supposed to contain only one font, although they're
-      # capable of supporting many (no idea why this is true, maybe for CFF
-      # v2??). Anyway it's cool to do top_index[0], don't worry about it.
-      glyph_ids.each_with_object({}) do |id, h|
-        h[id] = original.cff.top_index[0].charstrings_index[id]
-      end
-    end
   end
 end
diff --git a/lib/ttfunk/subset/code_page.rb b/lib/ttfunk/subset/code_page.rb
index b943b18c..d2ad9976 100644
--- a/lib/ttfunk/subset/code_page.rb
+++ b/lib/ttfunk/subset/code_page.rb
@@ -40,6 +40,7 @@ def initialize(original, code_page, encoding)
 
       def to_unicode_map
         self.class.unicode_mapping_for(encoding)
+          .select { |codepoint, _unicode| @subset[codepoint] }
       end
 
       def use(character)
diff --git a/lib/ttfunk/table/cff.rb b/lib/ttfunk/table/cff.rb
index 3f40a082..4ecd459b 100644
--- a/lib/ttfunk/table/cff.rb
+++ b/lib/ttfunk/table/cff.rb
@@ -31,18 +31,18 @@ def tag
         TAG
       end
 
-      def encode(new_to_old, old_to_new)
+      def encode(subset)
         EncodedString.new do |result|
-          sub_tables = [
+          result.concat(
             header.encode,
             name_index.encode,
-            top_index.encode(&:encode),
+            top_index.encode,
             string_index.encode,
             global_subr_index.encode
-          ]
+          )
 
-          sub_tables.each { |tb| result << tb }
-          top_index[0].finalize(result, new_to_old, old_to_new)
+          charmap = subset.new_cmap_table[:charmap]
+          top_index[0].finalize(result, charmap)
         end
       end
 
diff --git a/lib/ttfunk/table/cff/charset.rb b/lib/ttfunk/table/cff/charset.rb
index 3c78441f..fcaded54 100644
--- a/lib/ttfunk/table/cff/charset.rb
+++ b/lib/ttfunk/table/cff/charset.rb
@@ -35,7 +35,7 @@ def strings_for_charset_id(charset_id)
         end
 
         attr_reader :entries, :length
-        attr_reader :top_dict, :format, :count, :offset_or_id
+        attr_reader :top_dict, :format, :items_count, :offset_or_id
 
         def initialize(top_dict, file, offset_or_id = nil, length = nil)
           @top_dict = top_dict
@@ -44,7 +44,7 @@ def initialize(top_dict, file, offset_or_id = nil, length = nil)
           if offset
             super(file, offset, length)
           else
-            @count = self.class.strings_for_charset_id(offset_or_id).size
+            @items_count = self.class.strings_for_charset_id(offset_or_id).size
           end
         end
 
@@ -52,7 +52,7 @@ def each
           return to_enum(__method__) unless block_given?
 
           # +1 adjusts for the implicit .notdef glyph
-          (count + 1).times { |i| yield self[i] }
+          (items_count + 1).times { |i| yield self[i] }
         end
 
         def [](glyph_id)
@@ -73,13 +73,18 @@ def offset
           end
         end
 
-        # mapping is new -> old glyph ids
-        def encode(mapping)
+        def encode(charmap)
           # no offset means no charset was specified (i.e. we're supposed to
           # use a predefined charset) so there's nothing to encode
           return '' unless offset
 
-          sids = mapping.keys.sort.map { |new_gid| sid_for(mapping[new_gid]) }
+          sids =
+            charmap
+              .values
+              .reject { |mapping| mapping[:new].zero? }
+              .sort_by { |mapping| mapping[:new] }
+              .map { |mapping| sid_for(mapping[:old]) }
+
           ranges = TTFunk::BinUtils.rangify(sids)
           range_max = ranges.map(&:last).max
 
@@ -138,7 +143,7 @@ def find_string(sid)
 
             idx = sid - 390
 
-            if idx < file.cff.string_index.count
+            if idx < file.cff.string_index.items_count
               file.cff.string_index[idx]
             end
           else
@@ -153,23 +158,23 @@ def parse!
 
           case format_sym
           when :array_format
-            @count = top_dict.charstrings_index.count - 1
-            @length = count * element_width
+            @items_count = top_dict.charstrings_index.items_count - 1
+            @length = @items_count * element_width
             @entries = OneBasedArray.new(read(length, 'n*'))
 
           when :range_format8, :range_format16
             # The number of ranges is not explicitly specified in the font.
             # Instead, software utilizing this data simply processes ranges
             # until all glyphs in the font are covered.
-            @count = 0
+            @items_count = 0
             @entries = []
             @length = 0
 
-            until count >= top_dict.charstrings_index.count - 1
+            until @items_count >= top_dict.charstrings_index.items_count - 1
               @length += 1 + element_width
               sid, num_left = read(element_width, element_format)
-              entries << (sid..(sid + num_left))
-              @count += num_left + 1
+              @entries << (sid..(sid + num_left))
+              @items_count += num_left + 1
             end
           end
         end
diff --git a/lib/ttfunk/table/cff/charstring.rb b/lib/ttfunk/table/cff/charstring.rb
index d7cb05ff..c3b58e89 100644
--- a/lib/ttfunk/table/cff/charstring.rb
+++ b/lib/ttfunk/table/cff/charstring.rb
@@ -91,10 +91,6 @@ def render(x: 0, y: 0, font_size: 72)
           )
         end
 
-        def encode
-          raw
-        end
-
         private
 
         def parse!
diff --git a/lib/ttfunk/table/cff/charstrings_index.rb b/lib/ttfunk/table/cff/charstrings_index.rb
index be86824a..e3be83b8 100644
--- a/lib/ttfunk/table/cff/charstrings_index.rb
+++ b/lib/ttfunk/table/cff/charstrings_index.rb
@@ -11,21 +11,21 @@ def initialize(top_dict, *remaining_args)
           @top_dict = top_dict
         end
 
-        def [](index)
-          entry_cache[index] ||= TTFunk::Table::Cff::Charstring.new(
+        private
+
+        def decode_item(index, _offset, _length)
+          TTFunk::Table::Cff::Charstring.new(
             index, top_dict, font_dict_for(index), super
           )
         end
 
-        # gets passed a mapping of new => old glyph ids
-        def encode(mapping)
-          super() do |_entry, index|
-            self[mapping[index]].encode if mapping.include?(index)
-          end
+        def encode_items(charmap)
+          charmap
+            .reject { |code, mapping| mapping[:new].zero? && !code.zero? }
+            .sort_by { |_code, mapping| mapping[:new] }
+            .map { |(_code, mapping)| items[mapping[:old]] }
         end
 
-        private
-
         def font_dict_for(index)
           # only CID-keyed fonts contain an FD selector and font dicts
           if top_dict.is_cid_font?
diff --git a/lib/ttfunk/table/cff/encoding.rb b/lib/ttfunk/table/cff/encoding.rb
index bd4c76b3..6ab0ba3f 100644
--- a/lib/ttfunk/table/cff/encoding.rb
+++ b/lib/ttfunk/table/cff/encoding.rb
@@ -22,7 +22,7 @@ def codes_for_encoding_id(encoding_id)
           end
         end
 
-        attr_reader :top_dict, :format, :count, :offset_or_id
+        attr_reader :top_dict, :format, :items_count, :offset_or_id
 
         def initialize(top_dict, file, offset_or_id = nil, length = nil)
           @top_dict = top_dict
@@ -30,8 +30,10 @@ def initialize(top_dict, file, offset_or_id = nil, length = nil)
 
           if offset
             super(file, offset, length)
+            @supplemental = format >> 7 == 1
           else
-            @count = self.class.codes_for_encoding_id(offset_or_id).size
+            @items_count = self.class.codes_for_encoding_id(offset_or_id).size
+            @supplemental = false
           end
         end
 
@@ -39,7 +41,7 @@ def each
           return to_enum(__method__) unless block_given?
 
           # +1 adjusts for the implicit .notdef glyph
-          (count + 1).times { |i| yield self[i] }
+          (items_count + 1).times { |i| yield self[i] }
         end
 
         def [](glyph_id)
@@ -62,16 +64,18 @@ def offset
           end
         end
 
-        def encode(new_to_old, old_to_new)
-          # no offset means no encoding was specified (i.e. we're supposed to
-          # use a predefined encoding) so there's nothing to encode
-          return '' unless offset
-          return encode_supplemental(new_to_old, old_to_new) if supplemental?
+        def encode(charmap)
+          # Any subset encoding is all but guaranteed to be different from the
+          # standard encoding so we don't even attempt to see if it matches. We
+          # assume it's different and just encode it anew.
+
+          return encode_supplemental(charmap) if supplemental?
 
           codes =
-            new_to_old.keys.sort.map do |new_gid|
-              code_for(new_to_old[new_gid])
-            end
+            charmap
+              .reject { |_code, mapping| mapping[:new].zero? }
+              .sort_by { |_code, mapping| mapping[:new] }
+              .map { |(code, _m)| code }
 
           ranges = TTFunk::BinUtils.rangify(codes)
 
@@ -95,18 +99,16 @@ def encode(new_to_old, old_to_new)
 
         def supplemental?
           # high-order bit set to 1 indicates supplemental encoding
-          @format >> 7 == 1
+          @supplemental
         end
 
         private
 
-        def encode_supplemental(_new_to_old, old_to_new)
+        def encode_supplemental(charmap)
           new_entries =
-            @entries.each_with_object({}) do |(code, old_gid), ret|
-              if (new_gid = old_to_new[old_gid])
-                ret[code] = new_gid
-              end
-            end
+            charmap
+              .reject { |_code, mapping| mapping[:new].zero? }
+              .transform_values { |mapping| mapping[:new] }
 
           result = [format_int(:supplemental), new_entries.size].pack('CC')
           fmt = element_format(:supplemental)
@@ -150,22 +152,22 @@ def parse!
 
           case format_sym
           when :array_format
-            @count = entry_count
+            @items_count = entry_count
             @entries = OneBasedArray.new(read(length, 'C*'))
 
           when :range_format
             @entries = []
-            @count = 0
+            @items_count = 0
 
             entry_count.times do
               code, num_left = read(element_width, element_format)
               @entries << (code..(code + num_left))
-              @count += num_left + 1
+              @items_count += num_left + 1
             end
 
           when :supplemental
             @entries = {}
-            @count = entry_count
+            @items_count = entry_count
 
             entry_count.times do
               code, glyph = read(element_width, element_format)
diff --git a/lib/ttfunk/table/cff/fd_selector.rb b/lib/ttfunk/table/cff/fd_selector.rb
index 22fde67f..639465b7 100644
--- a/lib/ttfunk/table/cff/fd_selector.rb
+++ b/lib/ttfunk/table/cff/fd_selector.rb
@@ -12,7 +12,7 @@ class FdSelector < TTFunk::SubTable
         RANGE_ENTRY_SIZE = 3
         ARRAY_ENTRY_SIZE = 1
 
-        attr_reader :top_dict, :count, :entries, :n_glyphs
+        attr_reader :top_dict, :items_count, :entries, :n_glyphs
 
         def initialize(top_dict, file, offset, length = nil)
           @top_dict = top_dict
@@ -48,16 +48,16 @@ def [](glyph_id)
         def each
           return to_enum(__method__) unless block_given?
 
-          count.times { |i| yield self[i] }
+          items_count.times { |i| yield self[i] }
         end
 
-        # mapping is new -> old glyph ids
-        def encode(mapping)
+        def encode(charmap)
           # get list of [new_gid, fd_index] pairs
           new_indices =
-            mapping.keys.sort.map do |new_gid|
-              [new_gid, self[mapping[new_gid]]]
-            end
+            charmap
+              .reject { |code, mapping| mapping[:new].zero? && !code.zero? }
+              .sort_by { |_code, mapping| mapping[:new] }
+              .map { |(_code, mapping)| [mapping[:new], self[mapping[:old]]] }
 
           ranges = rangify_gids(new_indices)
           total_range_size = ranges.size * RANGE_ENTRY_SIZE
@@ -108,10 +108,10 @@ def parse!
 
           case format_sym
           when :array_format
-            @n_glyphs = top_dict.charstrings_index.count
+            @n_glyphs = top_dict.charstrings_index.items_count
             data = io.read(n_glyphs)
             @length += data.bytesize
-            @count = data.bytesize
+            @items_count = data.bytesize
             @entries = data.bytes
 
           when :range_format
@@ -135,7 +135,7 @@ def parse!
             last_start_gid, last_fd_index = ranges.last
             @entries << [(last_start_gid...(n_glyphs + 1)), last_fd_index]
 
-            @count = entries.reduce(0) { |sum, entry| sum + entry.first.size }
+            @items_count = entries.reduce(0) { |sum, entry| sum + entry.first.size }
           end
         end
 
diff --git a/lib/ttfunk/table/cff/font_dict.rb b/lib/ttfunk/table/cff/font_dict.rb
index 39e4e8ec..5ac1c349 100644
--- a/lib/ttfunk/table/cff/font_dict.rb
+++ b/lib/ttfunk/table/cff/font_dict.rb
@@ -15,7 +15,7 @@ def initialize(top_dict, file, offset, length = nil)
           super(file, offset, length)
         end
 
-        def encode(_mapping)
+        def encode
           EncodedString.new do |result|
             each do |operator, operands|
               case OPERATOR_CODES[operator]
@@ -30,8 +30,8 @@ def encode(_mapping)
           end
         end
 
-        def finalize(new_cff_data, mapping)
-          encoded_private_dict = private_dict.encode(mapping)
+        def finalize(new_cff_data)
+          encoded_private_dict = private_dict.encode
           encoded_offset = encode_integer32(new_cff_data.length)
           encoded_length = encode_integer32(encoded_private_dict.length)
 
diff --git a/lib/ttfunk/table/cff/font_index.rb b/lib/ttfunk/table/cff/font_index.rb
index c5f0dc4c..d282ba58 100644
--- a/lib/ttfunk/table/cff/font_index.rb
+++ b/lib/ttfunk/table/cff/font_index.rb
@@ -11,18 +11,21 @@ def initialize(top_dict, file, offset, length = nil)
           @top_dict = top_dict
         end
 
-        def [](index)
-          entry_cache[index] ||=
-            begin
-              start, finish = absolute_offsets_for(index)
-              TTFunk::Table::Cff::FontDict.new(
-                top_dict, file, start, (finish - start) + 1
-              )
-            end
+        def finalize(new_cff_data)
+          each { |font_dict| font_dict.finalize(new_cff_data) }
         end
 
-        def finalize(new_cff_data, mapping)
-          each { |font_dict| font_dict.finalize(new_cff_data, mapping) }
+        private
+
+        def decode_item(_index, offset, length)
+          TTFunk::Table::Cff::FontDict.new(
+            top_dict, file, offset, length
+          )
+        end
+
+        def encode_items(*)
+          # Re-encode font dicts
+          map(&:encode)
         end
       end
     end
diff --git a/lib/ttfunk/table/cff/index.rb b/lib/ttfunk/table/cff/index.rb
index d570f4d4..fb0dfec9 100644
--- a/lib/ttfunk/table/cff/index.rb
+++ b/lib/ttfunk/table/cff/index.rb
@@ -6,72 +6,80 @@ class Cff < TTFunk::Table
       class Index < TTFunk::SubTable
         include Enumerable
 
-        # number of objects in the index
-        attr_reader :count
-
-        # offset array element size
-        attr_reader :offset_size
-
-        attr_reader :raw_offset_length, :offsets, :raw_data
-        attr_reader :data_start_pos
-
         def [](index)
-          entry_cache[index] ||= raw_data[
-            offsets[index]...offsets[index + 1]
-          ]
+          return if index >= items_count
+
+          entry_cache[index] ||=
+            decode_item(
+              index,
+              data_reference_offset + offsets[index],
+              offsets[index + 1] - offsets[index]
+            )
         end
 
-        def each
-          return to_enum(__method__) unless block_given?
+        def each(&block)
+          return to_enum(__method__) unless block
 
-          count.times { |i| yield self[i] }
+          items_count.times do |i|
+            yield self[i]
+          end
         end
 
-        def encode
-          result = EncodedString.new
+        def items_count
+          items.length
+        end
 
-          entries =
-            each_with_object([]).with_index do |(entry, ret), index|
-              new_entry = block_given? ? yield(entry, index) : entry
-              ret << new_entry if new_entry
-            end
+        def encode(*args)
+          new_items = encode_items(*args)
 
-          # "An empty INDEX is represented by a count field with a 0 value and
-          # no additional fields. Thus, the total size of an empty INDEX is 2
-          # bytes."
-          result << [entries.size].pack('n')
-          return result if entries.empty?
+          if new_items.empty?
+            return [0].pack('n')
+          end
 
-          offset_size = (Math.log2(entries.size) / 8.0).round + 1
-          result << [offset_size].pack('C')
-          data_offset = 1
+          if new_items.length > 0xffff
+            raise Error, 'Too many items in a CFF index'
+          end
 
-          data = EncodedString.new
+          offsets_array =
+            new_items
+              .each_with_object([1]) do |item, offsets|
+                offsets << offsets.last + item.length
+              end
 
-          entries.each do |entry|
-            result << encode_offset(data_offset, offset_size)
-            data << entry
-            data_offset += entry.length
-          end
+          offset_size = (offsets_array.last.bit_length / 8.0).ceil
 
-          unless entries.empty?
-            result << encode_offset(data_offset, offset_size)
-          end
+          offsets_array.map! { |offset| encode_offset(offset, offset_size) }
 
-          result << data
+          EncodedString.new.concat(
+            [new_items.length, offset_size].pack('nC'),
+            *offsets_array,
+            *new_items
+          )
         end
 
         private
 
+        attr_reader :items, :offsets, :data_reference_offset
+
         def entry_cache
           @entry_cache ||= {}
         end
 
-        def absolute_offsets_for(index)
-          [
-            table_offset + offsets[index] + data_start_pos,
-            table_offset + offsets[index + 1] + data_start_pos
-          ]
+        # Returns an array of EncodedString elements (plain strings,
+        # placeholders, or EncodedString instances). Each element is supposed to
+        # represent an encoded item.
+        #
+        # This is the place to do all the filtering, reordering, or individual
+        # item encoding.
+        #
+        # It gets all the arguments `encode` gets.
+        def encode_items(*)
+          items
+        end
+
+        # By default do nothing
+        def decode_item(index, _offset, _length)
+          items[index]
         end
 
         def encode_offset(offset, offset_size)
@@ -88,35 +96,38 @@ def encode_offset(offset, offset_size)
         end
 
         def parse!
-          @count = read(2, 'n').first
+          @entry_cache = {}
 
-          if count.zero?
+          num_entries = read(2, 'n').first
+
+          if num_entries.zero?
             @length = 2
-            @data = []
+            @items = []
             return
           end
 
-          @offset_size = read(1, 'C').first
+          offset_size = read(1, 'C').first
 
-          # read an extra offset_size bytes to get rid of the first offset,
-          # which is always 1
-          io.read(offset_size)
+          @offsets =
+            Array.new(num_entries + 1) do
+              unpack_offset(io.read(offset_size), offset_size)
+            end
 
-          @raw_offset_length = count * offset_size
-          raw_offsets = io.read(raw_offset_length)
+          @data_reference_offset = table_offset + 3 + offsets.length * offset_size - 1
 
-          @offsets = [0] + Array.new(count) do |idx|
-            start = offset_size * idx
-            finish = offset_size * (idx + 1)
-            unpack_offset(raw_offsets[start...finish]) - 1
-          end
+          @length =
+            2 + # num entries
+            1 + # offset size
+            offsets.length * offset_size + # offsets
+            offsets.last - 1 # items
 
-          @raw_data = io.read(offsets.last)
-          @data_start_pos = 3 + offset_size + raw_offset_length
-          @length = data_start_pos + raw_data.size
+          @items =
+            offsets.each_cons(2).map do |offset, next_offset|
+              io.read(next_offset - offset)
+            end
         end
 
-        def unpack_offset(offset_data)
+        def unpack_offset(offset_data, offset_size)
           padding = "\x00" * (4 - offset_size)
           (padding + offset_data).unpack1('N')
         end
diff --git a/lib/ttfunk/table/cff/one_based_index.rb b/lib/ttfunk/table/cff/one_based_index.rb
index 147e287a..f2ebe4a2 100644
--- a/lib/ttfunk/table/cff/one_based_index.rb
+++ b/lib/ttfunk/table/cff/one_based_index.rb
@@ -11,7 +11,7 @@ class OneBasedIndex
         def_delegators :base_index,
           :each,
           :table_offset,
-          :count,
+          :items_count,
           :length,
           :encode
 
diff --git a/lib/ttfunk/table/cff/private_dict.rb b/lib/ttfunk/table/cff/private_dict.rb
index 1dc0a06e..ef364290 100644
--- a/lib/ttfunk/table/cff/private_dict.rb
+++ b/lib/ttfunk/table/cff/private_dict.rb
@@ -18,7 +18,7 @@ class PrivateDict < TTFunk::Table::Cff::Dict
 
         # @TODO: use mapping to determine which subroutines are still used.
         # For now, just encode them all.
-        def encode(_mapping)
+        def encode
           EncodedString.new do |result|
             each do |operator, operands|
               case OPERATOR_CODES[operator]
@@ -72,7 +72,7 @@ def nominal_width_x
         private
 
         def encode_subrs
-          EncodedString.new.tap do |result|
+          EncodedString.new do |result|
             result << Placeholder.new(
               :"subrs_#{@table_offset}", length: PLACEHOLDER_LENGTH
             )
diff --git a/lib/ttfunk/table/cff/subr_index.rb b/lib/ttfunk/table/cff/subr_index.rb
index 6cf8065e..6a1ec73a 100644
--- a/lib/ttfunk/table/cff/subr_index.rb
+++ b/lib/ttfunk/table/cff/subr_index.rb
@@ -5,9 +5,9 @@ class Table
     class Cff < TTFunk::Table
       class SubrIndex < TTFunk::Table::Cff::Index
         def bias
-          if count < 1240
+          if items.length < 1240
             107
-          elsif count < 33_900
+          elsif items.length < 33_900
             1131
           else
             32_768
diff --git a/lib/ttfunk/table/cff/top_dict.rb b/lib/ttfunk/table/cff/top_dict.rb
index 40148edf..50e0a29d 100644
--- a/lib/ttfunk/table/cff/top_dict.rb
+++ b/lib/ttfunk/table/cff/top_dict.rb
@@ -47,16 +47,16 @@ def encode(*)
           end
         end
 
-        def finalize(new_cff_data, new_to_old, old_to_new)
+        def finalize(new_cff_data, charmap)
           if charset
             finalize_subtable(
-              new_cff_data, :charset, charset.encode(new_to_old)
+              new_cff_data, :charset, charset.encode(charmap)
             )
           end
 
           if encoding
             finalize_subtable(
-              new_cff_data, :encoding, encoding.encode(new_to_old, old_to_new)
+              new_cff_data, :encoding, encoding.encode(charmap)
             )
           end
 
@@ -64,7 +64,7 @@ def finalize(new_cff_data, new_to_old, old_to_new)
             finalize_subtable(
               new_cff_data,
               :charstrings_index,
-              charstrings_index.encode(new_to_old, &:encode)
+              charstrings_index.encode(charmap)
             )
           end
 
@@ -72,24 +72,22 @@ def finalize(new_cff_data, new_to_old, old_to_new)
             finalize_subtable(
               new_cff_data,
               :font_index,
-              font_index.encode do |font_dict|
-                font_dict.encode(new_to_old)
-              end
+              font_index.encode
             )
 
-            font_index.finalize(new_cff_data, new_to_old)
+            font_index.finalize(new_cff_data)
           end
 
           if font_dict_selector
             finalize_subtable(
               new_cff_data,
               :font_dict_selector,
-              font_dict_selector.encode(new_to_old)
+              font_dict_selector.encode(charmap)
             )
           end
 
           if private_dict
-            encoded_private_dict = private_dict.encode(new_to_old)
+            encoded_private_dict = private_dict.encode
             encoded_offset = encode_integer32(new_cff_data.length)
             encoded_length = encode_integer32(encoded_private_dict.length)
 
diff --git a/lib/ttfunk/table/cff/top_index.rb b/lib/ttfunk/table/cff/top_index.rb
index 08efbc4d..8c120c97 100644
--- a/lib/ttfunk/table/cff/top_index.rb
+++ b/lib/ttfunk/table/cff/top_index.rb
@@ -4,12 +4,15 @@ module TTFunk
   class Table
     class Cff < TTFunk::Table
       class TopIndex < TTFunk::Table::Cff::Index
-        def [](index)
-          entry_cache[index] ||=
-            begin
-              start, finish = absolute_offsets_for(index)
-              TTFunk::Table::Cff::TopDict.new(file, start, (finish - start) + 1)
-            end
+        private
+
+        def decode_item(_index, offset, length)
+          TTFunk::Table::Cff::TopDict.new(file, offset, length)
+        end
+
+        def encode_items(*)
+          # Re-encode the top dict
+          map(&:encode)
         end
       end
     end
diff --git a/spec/ttfunk/table/cff/charset_spec.rb b/spec/ttfunk/table/cff/charset_spec.rb
index 175ce3f7..91c609e5 100644
--- a/spec/ttfunk/table/cff/charset_spec.rb
+++ b/spec/ttfunk/table/cff/charset_spec.rb
@@ -28,8 +28,8 @@
         # From the spec: There is one less element in the glyph name array than
         # nGlyphs (i.e. charstrings count) because the .notdef glyph name is
         # omitted.
-        expect(charset.count).to(
-          eq(font.cff.top_index[0].charstrings_index.count - 1)
+        expect(charset.items_count).to(
+          eq(font.cff.top_index[0].charstrings_index.items_count - 1)
         )
       end
 
@@ -58,8 +58,8 @@
         # From the spec: There is one less element in the glyph name array than
         # nGlyphs (i.e. charstrings count) because the .notdef glyph name is
         # omitted.
-        expect(charset.count).to(
-          eq(font.cff.top_index[0].charstrings_index.count - 1)
+        expect(charset.items_count).to(
+          eq(font.cff.top_index[0].charstrings_index.items_count - 1)
         )
       end
 
@@ -85,8 +85,8 @@
         # From the spec: There is one less element in the glyph name array than
         # nGlyphs (i.e. charstrings count) because the .notdef glyph name is
         # omitted.
-        expect(charset.count).to(
-          eq(font.cff.top_index[0].charstrings_index.count - 1)
+        expect(charset.items_count).to(
+          eq(font.cff.top_index[0].charstrings_index.items_count - 1)
         )
       end
 
@@ -115,14 +115,21 @@
 
   describe '#encode' do
     let(:font_path) { test_font('NotoSansCJKsc-Thin', :otf) }
-    let(:encoded) { charset.encode(subset_mapping) }
+    let(:encoded) { charset.encode(charmap) }
 
     context 'when the subset contains non-sequential SIDs' do
-      let(:subset_mapping) do
+      let(:charmap) do
         # the idea here is to demonstrate that non-sequental SIDs can sometimes
         # be more compactly represented as individual elements as opposed to
         # ranges (supposed to be new => old glyph IDs)
-        { 1 => 1, 4 => 4, 10 => 10, 14 => 14, 15 => 15, 21 => 21 }
+        {
+          0x20 => { old: 1, new: 1 },
+          0x23 => { old: 4, new: 4 },
+          0x29 => { old: 10, new: 10 },
+          0x2d => { old: 14, new: 14 },
+          0x2e => { old: 15, new: 15 },
+          0x34 => { old: 21, new: 21 }
+        }
       end
 
       it 'encodes using the array-based format' do
@@ -141,10 +148,10 @@
     end
 
     context 'when the subset contains few sequential SIDs' do
-      let(:subset_mapping) do
+      let(:charmap) do
         # i.e. the first 20 characters, in order
         # (supposed to be new => old glyph IDs)
-        Hash[(1..20).map { |i| [i, i] }]
+        Hash[(1..20).map { |i| [0x20 + i, { old: i, new: i }] }]
       end
 
       it 'encodes using the 8-bit range-based format' do
@@ -158,10 +165,10 @@
     end
 
     context 'when the subset contains many sequential SIDs' do
-      let(:subset_mapping) do
+      let(:charmap) do
         # we want to get a 2-byte range to demonstrate the 16-bit format
         # (supposed to be new => old glyph IDs)
-        Hash[(1..2**10).map { |i| [i, i] }]
+        Hash[(1..2**10).map { |i| [0x20 + i, { old: i, new: i }] }]
       end
 
       it 'encodes using the 16-bit range-based format' do
diff --git a/spec/ttfunk/table/cff/encoding_spec.rb b/spec/ttfunk/table/cff/encoding_spec.rb
index 9dd80d21..710a4cd5 100644
--- a/spec/ttfunk/table/cff/encoding_spec.rb
+++ b/spec/ttfunk/table/cff/encoding_spec.rb
@@ -32,14 +32,21 @@
 
   describe '#encode' do
     let(:font_path) { test_font('AlbertTextBold', :otf) }
-    let(:encoded) { encoding.encode(subset_mapping, subset_mapping.invert) }
+    let(:encoded) { encoding.encode(charmap) }
 
     context 'when the subset contains non-sequential codes' do
-      let(:subset_mapping) do
+      let(:charmap) do
         # the idea here is to demonstrate that non-sequental codes can
         # sometimes be more compactly represented as individual elements
         # as opposed to ranges (supposed to be new => old glyph IDs)
-        { 1 => 1, 4 => 4, 10 => 10, 14 => 14, 15 => 15, 21 => 21 }
+        {
+          0x20 => { old: 1, new: 1 },
+          0x23 => { old: 4, new: 4 },
+          0x29 => { old: 10, new: 10 },
+          0x2d => { old: 14, new: 14 },
+          0x2e => { old: 15, new: 15 },
+          0x34 => { old: 21, new: 13 }
+        }
       end
 
       it 'encodes using the array-based format' do
@@ -48,13 +55,7 @@
 
       it 'encodes correctly' do
         # format (0x00), codes (1 byte each)
-        expect(encoded.bytes).to eq(
-          [
-            0,
-            subset_mapping.count,
-            *subset_mapping.map { |old_gid, _| encoding[old_gid] }
-          ]
-        )
+        expect(encoded).to eq("\x00\x06\x20\x23\x29\x34\x2d\x2e")
       end
 
       # unfortunately I haven't been able to find an example font that defines
@@ -83,16 +84,16 @@
           font.cff.top_index[0], file, fake_offset, encoded.length
         )
 
-        expect(new_encoding.to_a).to eq([0, 26, 29, 35, 39, 40, 46])
+        expect(new_encoding.to_a).to eq([0, 0x20, 0x23, 0x29, 0x34, 0x2d, 0x2e])
       end
       # rubocop: enable RSpec/AnyInstance
     end
 
     context 'when the subset contains sequential codes' do
-      let(:subset_mapping) do
+      let(:charmap) do
         # i.e. the first 20 characters, in order
         # (supposed to be new => old glyph IDs)
-        Hash[(1..20).map { |i| [i, i] }]
+        Hash[(1..20).map { |i| [0x20 + i, { old: i, new: i }] }]
       end
 
       it 'encodes using the range-based format' do
@@ -100,9 +101,9 @@
       end
 
       it 'encodes correctly' do
-        # format (0x01), count (0x01, start code (0x1D, i.e. 26),
+        # format (0x01), count (0x01), start code (0x21, i.e. 33),
         # rest (0x13, i.e. 19)
-        expect(encoded.bytes).to eq([0x01, 0x01, 0x1A, 0x13])
+        expect(encoded.bytes).to eq([0x01, 0x01, 0x21, 0x13])
       end
     end
   end
diff --git a/spec/ttfunk/table/cff/fd_selector_spec.rb b/spec/ttfunk/table/cff/fd_selector_spec.rb
index c6adb0a8..e83bde77 100644
--- a/spec/ttfunk/table/cff/fd_selector_spec.rb
+++ b/spec/ttfunk/table/cff/fd_selector_spec.rb
@@ -24,7 +24,7 @@
       instance_double(
         TTFunk::Table::Cff::CharstringsIndex,
         :charstrings_index,
-        count: entry_count
+        items_count: entry_count
       )
     end
     let(:fd_selector) do
@@ -38,7 +38,7 @@
     end
 
     it 'includes entries for all the glyphs in the font' do
-      expect(fd_selector.count).to eq(entry_count)
+      expect(fd_selector.items_count).to eq(entry_count)
     end
 
     it 'parses the entries correctly' do
@@ -46,8 +46,12 @@
     end
 
     it 'encodes correctly' do
-      mapping = { 1 => 1, 3 => 3, 5 => 5 }
-      expect(fd_selector.encode(mapping)).to eq("\x00\x02\x04\x06")
+      charmap = {
+        0x20 => { old: 1, new: 1 },
+        0x22 => { old: 3, new: 3 },
+        0x24 => { old: 5, new: 5 }
+      }
+      expect(fd_selector.encode(charmap)).to eq("\x00\x02\x04\x06")
     end
   end
 
@@ -56,8 +60,8 @@
 
     it 'includes entries for all the glyphs in the font' do
       # the charstrings index doesn't contain an entry for the .notdef glyph
-      expect(fd_selector.count).to(
-        eq(font.cff.top_index[0].charstrings_index.count + 1)
+      expect(fd_selector.items_count).to(
+        eq(font.cff.top_index[0].charstrings_index.items_count + 1)
       )
     end
 
@@ -78,8 +82,8 @@
     end
 
     it 'encodes correctly' do
-      mapping = Hash[(0..15).map { |i| [i, i] }]
-      result = fd_selector.encode(mapping)
+      charmap = Hash[(0..15).map { |i| [i, { old: i, new: i }] }]
+      result = fd_selector.encode(charmap)
       expect(result).to(
         #   fmt | count |  range 1  |  range 2  | n glyphs
         eq("\x03\x00\x02\x00\x00\x05\x00\x01\x0F\x00\x10")
diff --git a/spec/ttfunk/table/cff/font_dict_spec.rb b/spec/ttfunk/table/cff/font_dict_spec.rb
index 153e0773..6a29b029 100644
--- a/spec/ttfunk/table/cff/font_dict_spec.rb
+++ b/spec/ttfunk/table/cff/font_dict_spec.rb
@@ -24,11 +24,11 @@
     end
 
     it 'produces an encoded dict that can be re-parsed successfully' do
-      result = font_dict.encode({})
+      result = font_dict.encode
       dict_length = result.length
-      private_dict_length = font_dict.private_dict.encode({}).length
+      private_dict_length = font_dict.private_dict.encode.length
 
-      font_dict.finalize(result, {})
+      font_dict.finalize(result)
       io = StringIO.new(result.string)
       file = TestFile.new(io)
       new_dict = described_class.new(top_dict, file, 0, dict_length)
diff --git a/spec/ttfunk/table/cff/font_index_spec.rb b/spec/ttfunk/table/cff/font_index_spec.rb
index 5f1d4d85..3c141b84 100644
--- a/spec/ttfunk/table/cff/font_index_spec.rb
+++ b/spec/ttfunk/table/cff/font_index_spec.rb
@@ -9,7 +9,7 @@
   let(:font_path) { test_font('NotoSansCJKsc-Thin', :otf) }
 
   it 'provides access to font dicts by index' do
-    expect(font_index.count).to eq(19)
+    expect(font_index.items_count).to eq(19)
     expect(font_index[0]).to be_a(TTFunk::Table::Cff::FontDict)
   end
 end
diff --git a/spec/ttfunk/table/cff/index_spec.rb b/spec/ttfunk/table/cff/index_spec.rb
index 3abe0a34..9d206bc1 100644
--- a/spec/ttfunk/table/cff/index_spec.rb
+++ b/spec/ttfunk/table/cff/index_spec.rb
@@ -34,26 +34,139 @@
     [0x00, 0x00] => []
   }
 
-  test_cases.each_with_index do |(bytes, decoded_values), idx|
-    context "test case #{idx}" do
-      subject(:index) do
-        io = StringIO.new(bytes.pack('C*'))
-        described_class.new(
-          TestFile.new(io), 0, bytes.size
-        )
-      end
+  describe 'decoding' do
+    test_cases.each_with_index do |(bytes, decoded_values), idx|
+      context "test case #{idx}" do
+        subject(:index) do
+          io = StringIO.new(bytes.pack('C*'))
+          described_class.new(
+            TestFile.new(io), 0, bytes.size
+          )
+        end
 
-      it 'parses correctly' do
-        expect(index.map(&:bytes)).to eq(decoded_values)
-      end
+        it 'parses correctly' do
+          expect(index.map(&:bytes)).to eq(decoded_values)
+        end
 
-      it 'encodes correctly' do
-        expect(index.encode.bytes).to eq(bytes)
+        it 'encodes correctly' do
+          expect(index.encode.bytes).to eq(bytes)
+        end
+
+        it 'calculates the length correctly' do
+          expect(index.length).to eq(bytes.size)
+        end
       end
+    end
+  end
+
+  describe 'encoding' do
+    it 'properly encodes items (change)' do
+      inc_index_class =
+        Class.new(described_class) do
+          private
+
+          def encode_items(*)
+            # Increase each byte by 1
+            items.map { |i| [i.unpack1('C') + 1].pack('C') }
+          end
+        end
+
+      data = [
+        # count
+        0x00, 0x03,
+        # offset len
+        0x01,
+        # offsets
+        0x01, 0x02, 0x03, 0x04,
+        # data
+        0x01, 0x02, 0x03
+      ].pack('C*')
 
-      it 'calculates the length correctly' do
-        expect(index.length).to eq(bytes.size)
+      index =
+        inc_index_class.new(
+          TestFile.new(StringIO.new(data)), 0, data.length
+        )
+
+      expect(index.encode.string).to eq("\00\03\01\01\02\03\04\02\03\04")
+    end
+
+    it 'properly encodes items (filter)' do
+      dup_index_class =
+        Class.new(described_class) do
+          private
+
+          def encode_items(*)
+            # duplicate each item
+            items.flat_map { |i| [i, i] }
+          end
+        end
+
+      data = [
+        # count
+        0x00, 0x03,
+        # offset len
+        0x01,
+        # offsets
+        0x01, 0x02, 0x03, 0x04,
+        # data
+        0x01, 0x02, 0x03
+      ].pack('C*')
+
+      index =
+        dup_index_class.new(
+          TestFile.new(StringIO.new(data)), 0, data.length
+        )
+
+      expect(index.encode.string).to eq("\00\06\01\01\02\03\04\05\06\07\01\01\02\02\03\03")
+    end
+
+    [
+      { item_size: 1, data_size: 6, offset_size: 1 },
+      { item_size: 0xff, data_size: 262, offset_size: 2 },
+      { item_size: 0xffff, data_size: 65_544, offset_size: 3 },
+      { item_size: 0xffffff, data_size: 16_777_226, offset_size: 4 }
+    ].each do |params|
+      it "properly encodes offset size #{params[:offset_size]}" do
+        gen_index_class =
+          Class.new(described_class) do
+            attr_accessor :item_size
+
+            private
+
+            def encode_items(*)
+              ["\00" * item_size]
+            end
+          end
+
+        gen_index = gen_index_class.new(TestFile.new(StringIO.new("\00\00")), 0, 2)
+        gen_index.item_size = params[:item_size]
+
+        data = gen_index.encode.string
+
+        expect(data.length).to eq params[:data_size]
+
+        index =
+          described_class.new(
+            TestFile.new(StringIO.new(data)), 0, data.length
+          )
+
+        expect(index.items_count).to eq 1
       end
     end
+
+    it 'raises on more items than is possible to encode' do
+      gen_index_class =
+        Class.new(described_class) do
+          private
+
+          def encode_items(*)
+            ["\00"] * 0x10000
+          end
+        end
+
+      gen_index = gen_index_class.new(TestFile.new(StringIO.new("\00\00")), 0, 2)
+
+      expect { gen_index.encode }.to raise_error(/too many items/i)
+    end
   end
 end
diff --git a/spec/ttfunk/table/cff/private_dict_spec.rb b/spec/ttfunk/table/cff/private_dict_spec.rb
index b6f806b7..fb767a5b 100644
--- a/spec/ttfunk/table/cff/private_dict_spec.rb
+++ b/spec/ttfunk/table/cff/private_dict_spec.rb
@@ -41,7 +41,7 @@
 
   describe '#encode' do
     it 'produces an encoded dict that can be re-parsed successfully' do
-      result = private_dict.encode({})
+      result = private_dict.encode
       dict_length = result.length
 
       private_dict.finalize(result)
@@ -57,7 +57,7 @@
         )
       )
 
-      expect(new_dict.subr_index.count).to eq(private_dict.subr_index.count)
+      expect(new_dict.subr_index.items_count).to eq(private_dict.subr_index.items_count)
     end
   end
 end
diff --git a/spec/ttfunk/table/cff/top_dict_spec.rb b/spec/ttfunk/table/cff/top_dict_spec.rb
index abc5b4e3..e50c6eb0 100644
--- a/spec/ttfunk/table/cff/top_dict_spec.rb
+++ b/spec/ttfunk/table/cff/top_dict_spec.rb
@@ -18,13 +18,12 @@
 
   describe '#encode' do
     it 'produces an encoded dict that can be re-parsed successfully' do
-      new_to_old = font.cmap.unicode.first.code_map
-      old_to_new = new_to_old.invert
+      charmap = font.cmap.unicode.first.code_map.transform_values { |v| { old: v, new: v } }
       encoded = top_dict.encode
       top_dict_length = encoded.length
       top_dict_hash = top_dict.to_h
       placeholders = encoded.placeholders.dup
-      top_dict.finalize(encoded, new_to_old, old_to_new)
+      top_dict.finalize(encoded, charmap)
 
       file = TestFile.new(StringIO.new(encoded.string))
       new_top_dict = described_class.new(file, 0, top_dict_length)
diff --git a/spec/ttfunk/table/cff/top_index_spec.rb b/spec/ttfunk/table/cff/top_index_spec.rb
index fd243c02..4ed2c41e 100644
--- a/spec/ttfunk/table/cff/top_index_spec.rb
+++ b/spec/ttfunk/table/cff/top_index_spec.rb
@@ -13,7 +13,7 @@
     end
 
     it 'always contains a single top dict' do
-      expect(top_index.count).to eq(1)
+      expect(top_index.items_count).to eq(1)
     end
   end