From e0270967c0cfe06c1fae2cae69f70b7d05c8deab Mon Sep 17 00:00:00 2001
From: James Healy
Date: Fri, 18 Oct 2019 14:57:32 +1100
Subject: [PATCH 1/2] PageState should track much more of the graphics state

---
 lib/pdf/reader/page_state.rb         | 175 +++++++++++++++++++++++++++
 lib/pdf/reader/page_text_receiver.rb |   4 +
 2 files changed, 179 insertions(+)

diff --git a/lib/pdf/reader/page_state.rb b/lib/pdf/reader/page_state.rb
index ecad5423..4ffc2727 100644
--- a/lib/pdf/reader/page_state.rb
+++ b/lib/pdf/reader/page_state.rb
@@ -10,7 +10,31 @@ class PDF::Reader
   # directly to PDF operators.
   class PageState
 
+    # TODO: items not tracked in the graphics state yet:
+    #   clipping_path, black_generation, undercolor_removal, transfer, halftone
+    #
     DEFAULT_GRAPHICS_STATE = {
+      :colorspace_fill => :DeviceGray,
+      :colorspace_stroke => :DeviceGray,
+      :color_fill => [0.0], # black
+      :color_stroke => [0.0], # black
+      :line_width => 1.0,
+      :line_join => 0,
+      :line_cap => 0,
+      :miter_limit => 10.0,
+      :dash_pattern => { :array => [], :phase => 0 },
+      :rendering_intent => :RelativeColorimetric,
+      :stroke_adjustment => false,
+      :blend_mode => :Normal,
+      :soft_mask => :None,
+      :alpha_constant_fill => 1.0,
+      :alpha_constant_stroke => 1.0,
+      :alpha_source => false,
+      :overprint_fill => false,
+      :overprint_stroke => false,
+      :overprint_mode => 0,
+      :flatness => 1.0,
+      :smoothness => 0, # appropriate default value?
       :char_spacing => 0,
       :word_spacing => 0,
       :h_scaling => 1.0,
@@ -30,6 +54,7 @@ def initialize(page)
       @font_stack = [build_fonts(page.fonts)]
       @xobject_stack = [page.xobjects]
       @cs_stack = [page.color_spaces]
+      @gs_stack = [page.graphic_states]
       @stack = [DEFAULT_GRAPHICS_STATE.dup]
       state[:ctm] = identity_matrix
     end
@@ -52,6 +77,110 @@ def restore_graphics_state
       @stack.pop
     end
 
+    def set_color_rendering_intent(value)
+      state[:rendering_intent] = value
+    end
+
+    def set_flatness_tolerance(value)
+      state[:flatness] = value
+    end
+
+    # Looks up the named graphics state dictionary and copies each entry we
+    # recognise into the current graphics state. Unknown names are ignored.
+    #
+    # TODO we're not handling the following keys in graphics state dictionaries:
+    #   :BG, :BG2, :UCR, :UCR2, :TR, :TR2, :HT, :SA, :BM, :SMask
+    #
+    def set_graphics_state_parameters(name)
+      gs = find_graphics_state(name)
+      return if gs.nil?
+      set_line_width(gs[:LW]) if gs[:LW]
+      set_line_cap_style(gs[:LC]) if gs[:LC]
+      set_line_join_style(gs[:LJ]) if gs[:LJ]
+      set_miter_limit(gs[:ML]) if gs[:ML]
+      set_line_dash(gs[:D].first, gs[:D].last) if gs[:D]
+      set_color_rendering_intent(gs[:RI]) if gs[:RI]
+      # :OP, :op, :AIS and :TK hold booleans, so test for presence rather than
+      # truthiness; an explicit false must still be applied.
+      if !gs[:OP].nil?
+        set_overprint_stroke(gs[:OP])
+        # when only :OP is present it applies to both stroking and filling
+        set_overprint_fill(gs[:op].nil? ? gs[:OP] : gs[:op])
+      elsif !gs[:op].nil?
+        set_overprint_fill(gs[:op])
+      end
+      set_overprint_mode(gs[:OPM]) if gs[:OPM]
+      set_text_font_and_size(gs[:Font].first, gs[:Font].last) if gs[:Font]
+      set_flatness_tolerance(gs[:FL]) if gs[:FL]
+      set_smoothness(gs[:SM]) if gs[:SM]
+      set_alpha_constant_stroke(gs[:CA]) if gs[:CA]
+      set_alpha_constant_fill(gs[:ca]) if gs[:ca]
+      set_alpha_source(gs[:AIS]) unless gs[:AIS].nil?
+      set_text_knockout(gs[:TK]) unless gs[:TK].nil?
+    end
+
+    def set_line_cap_style(value)
+      state[:line_cap] = value.to_i
+    end
+
+    def set_line_dash(array, phase)
+      state[:dash_pattern] = { :array => array, :phase => phase }
+    end
+
+    def set_line_join_style(value)
+      state[:line_join] = value.to_i
+    end
+
+    def set_line_width(value)
+      state[:line_width] = value
+    end
+
+    def set_miter_limit(value)
+      state[:miter_limit] = value
+    end
+
+    #####################################################
+    # Colour Operators
+    #####################################################
+
+    def set_cmyk_color_for_stroking(c, m, y, k)
+      set_stroke_color_space(:DeviceCMYK)
+      state[:color_stroke] = [c, m, y, k]
+    end
+
+    def set_cmyk_color_for_nonstroking(c, m, y, k)
+      set_nonstroke_color_space(:DeviceCMYK)
+      state[:color_fill] = [c, m, y, k]
+    end
+
+    def set_gray_for_stroking(value)
+      set_stroke_color_space(:DeviceGray)
+      state[:color_stroke] = [value]
+    end
+
+    def set_gray_for_nonstroking(value)
+      set_nonstroke_color_space(:DeviceGray)
+      state[:color_fill] = [value]
+    end
+
+    def set_rgb_color_for_stroking(r, g, b)
+      set_stroke_color_space(:DeviceRGB)
+      state[:color_stroke] = [r, g, b]
+    end
+
+    def set_rgb_color_for_nonstroking(r, g, b)
+      set_nonstroke_color_space(:DeviceRGB)
+      state[:color_fill] = [r, g, b]
+    end
+
+    def set_stroke_color_space(name)
+      state[:colorspace_stroke] = name
+    end
+
+    def set_nonstroke_color_space(name)
+      state[:colorspace_fill] = name
+    end
+
     #####################################################
     # Matrix Operators
     #####################################################
@@ -201,9 +330,13 @@ def invoke_xobject(label)
         form = PDF::Reader::FormXObject.new(@page, xobject, :cache => @cache)
         @font_stack.unshift(form.font_objects)
         @xobject_stack.unshift(form.xobjects)
+        @cs_stack.unshift(form.color_spaces)
+        @gs_stack.unshift(form.graphic_states)
         yield form if block_given?
         @font_stack.shift
         @xobject_stack.shift
+        @cs_stack.shift
+        @gs_stack.shift
       else
         yield xobject if block_given?
       end
@@ -261,6 +394,13 @@ def find_color_space(label)
       dict ? dict[label] : nil
     end
 
+    def find_graphics_state(label)
+      dict = @gs_stack.detect { |graphic_states|
+        graphic_states.has_key?(label)
+      }
+      dict ? dict[label] : nil
+    end
+
     def find_xobject(label)
       dict = @xobject_stack.detect { |xobjects|
         xobjects.has_key?(label)
@@ -407,5 +547,40 @@ def identity_matrix
                                0, 0)
     end
 
+    #####################################################
+    # Graphic state updates that don't have operators, so no need for public methods
+    #####################################################
+    def set_overprint_stroke(value)
+      state[:overprint_stroke] = value
+    end
+
+    def set_overprint_fill(value)
+      state[:overprint_fill] = value
+    end
+
+    def set_overprint_mode(value)
+      state[:overprint_mode] = value
+    end
+
+    def set_smoothness(value)
+      state[:smoothness] = value
+    end
+
+    def set_alpha_constant_stroke(value)
+      state[:alpha_constant_stroke] = value
+    end
+
+    def set_alpha_constant_fill(value)
+      state[:alpha_constant_fill] = value
+    end
+
+    def set_alpha_source(value)
+      state[:alpha_source] = value
+    end
+
+    def set_text_knockout(value)
+      state[:text_knockout] = value
+    end
+
   end
 end
diff --git a/lib/pdf/reader/page_text_receiver.rb b/lib/pdf/reader/page_text_receiver.rb
index 356d5a12..ee1e1748 100644
--- a/lib/pdf/reader/page_text_receiver.rb
+++ b/lib/pdf/reader/page_text_receiver.rb
@@ -21,6 +21,10 @@ class PageTextReceiver
     ########## BEGIN FORWARDERS ##########
     # Graphics State Operators
     def_delegators :@state, :save_graphics_state, :restore_graphics_state
+    def_delegators :@state, :set_color_rendering_intent
+    def_delegators :@state, :set_flatness_tolerance, :set_graphics_state_parameters
+    def_delegators :@state, :set_line_cap_style, :set_line_dash, :set_line_join_style
+    def_delegators :@state, :set_line_width, :set_miter_limit
 
     # Matrix Operators
     def_delegators :@state, :concatenate_matrix

From 951f12bfac42037444a9db4f7c4f91257a27cf36 Mon Sep 17 00:00:00 2001
From: James Healy
Date: Fri, 18 Oct 2019 15:30:11 +1100
Subject: [PATCH 2/2] HACK track the graphics state for each character

How does it vary over the page? Are there some characters that we should
ignore?
---
 lib/pdf/reader/page_text_receiver.rb |  8 +++++++-
 lib/pdf/reader/text_run.rb           | 11 ++++++-----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/lib/pdf/reader/page_text_receiver.rb b/lib/pdf/reader/page_text_receiver.rb
index ee1e1748..816e2ff3 100644
--- a/lib/pdf/reader/page_text_receiver.rb
+++ b/lib/pdf/reader/page_text_receiver.rb
@@ -26,6 +26,12 @@ class PageTextReceiver
     def_delegators :@state, :set_line_cap_style, :set_line_dash, :set_line_join_style
     def_delegators :@state, :set_line_width, :set_miter_limit
 
+    # Graphics State Operators (colour)
+    def_delegators :@state, :set_cmyk_color_for_stroking, :set_cmyk_color_for_nonstroking
+    def_delegators :@state, :set_gray_for_stroking, :set_gray_for_nonstroking
+    def_delegators :@state, :set_rgb_color_for_stroking, :set_rgb_color_for_nonstroking
+    def_delegators :@state, :set_stroke_color_space, :set_nonstroke_color_space
+
     # Matrix Operators
     def_delegators :@state, :concatenate_matrix
 
@@ -140,7 +146,7 @@ def internal_show_text(string)
           th = 1
           scaled_glyph_width = glyph_width * @state.font_size * th
           unless utf8_chars == SPACE
-            @characters << TextRun.new(newx, newy, scaled_glyph_width, @state.font_size, utf8_chars)
+            @characters << TextRun.new(newx, newy, scaled_glyph_width, @state.font_size, utf8_chars, @state.clone_state)
           end
           @state.process_glyph_displacement(glyph_width, 0, utf8_chars == SPACE)
         end
diff --git a/lib/pdf/reader/text_run.rb b/lib/pdf/reader/text_run.rb
index e47311a9..b2d62923 100644
--- a/lib/pdf/reader/text_run.rb
+++ b/lib/pdf/reader/text_run.rb
@@ -7,15 +7,16 @@ class PDF::Reader
   class TextRun
     include Comparable
 
-    attr_reader :origin, :width, :font_size, :text
+    attr_reader :origin, :width, :font_size, :text, :state
 
     alias :to_s :text
 
-    def initialize(x, y, width, font_size, text)
+    def initialize(x, y, width, font_size, text, state = {})
       @origin = PDF::Reader::Point.new(x, y)
       @width = width
       @font_size = font_size
       @text = text
+      @state = state
     end
 
     # Allows collections of TextRun objects to be sorted. They will be sorted
@@ -62,14 +63,14 @@ def +(other)
       raise ArgumentError, "#{other} cannot be merged with this run" unless mergable?(other)
 
       if (other.x - endx) <( font_size * 0.2)
-        TextRun.new(x, y, other.endx - x, font_size, text + other.text)
+        TextRun.new(x, y, other.endx - x, font_size, text + other.text, {})
       else
-        TextRun.new(x, y, other.endx - x, font_size, "#{text} #{other.text}")
+        TextRun.new(x, y, other.endx - x, font_size, "#{text} #{other.text}", {})
       end
     end
 
     def inspect
-      "#{text} w:#{width} f:#{font_size} @#{x},#{y}"
+      "#{text} w:#{width} f:#{font_size} @#{x},#{y} #{@state.inspect}"
     end
 
     def intersect?(other_run)