Skip to content

Commit

Permalink
Merge branch pull request coolwanglu#493 into merge_all
Browse files Browse the repository at this point in the history
  • Loading branch information
jwuttke committed Sep 29, 2016
2 parents 3bbb286 + 9dbd504 commit 551f1b9
Show file tree
Hide file tree
Showing 9 changed files with 112 additions and 25 deletions.
8 changes: 6 additions & 2 deletions 3rdparty/poppler/git/CairoFontEngine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -377,14 +377,18 @@ _ft_new_face (FT_Library lib,

CairoFreeTypeFont::CairoFreeTypeFont(Ref ref,
cairo_font_face_t *cairo_font_face,
FT_Face ft_face,
int *codeToGID,
Guint codeToGIDLen,
GBool substitute) : CairoFont(ref,
cairo_font_face,
codeToGID,
codeToGIDLen,
substitute,
gTrue) { }
gTrue),
// Caution: this field is added by pdf2htmlEX to determine whitespace. Please merge during update.
ft_face(ft_face)
{ }

CairoFreeTypeFont::~CairoFreeTypeFont() { }

Expand Down Expand Up @@ -547,7 +551,7 @@ CairoFreeTypeFont *CairoFreeTypeFont::create(GfxFont *gfxFont, XRef *xref,

delete fontLoc;
return new CairoFreeTypeFont(ref,
font_face,
font_face, face,
codeToGID, codeToGIDLen,
substitute);

Expand Down
6 changes: 4 additions & 2 deletions 3rdparty/poppler/git/CairoFontEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,12 @@ class CairoFreeTypeFont : public CairoFont {
public:
static CairoFreeTypeFont *create(GfxFont *gfxFont, XRef *xref, FT_Library lib, GBool useCIDs);
virtual ~CairoFreeTypeFont();

// Caution: this function is added by pdf2htmlEX to determine whitespace. Please merge during update.
// Returns the FT_Face stored in this font's ft_face member. The accessor does
// not modify the font, so it is const-qualified and usable on const objects.
FT_Face get_ft_face() const { return ft_face; }
private:
CairoFreeTypeFont(Ref ref, cairo_font_face_t *cairo_font_face,
CairoFreeTypeFont(Ref ref, cairo_font_face_t *cairo_font_face, FT_Face ft_face,
int *codeToGID, Guint codeToGIDLen, GBool substitute);
FT_Face ft_face;
};

//------------------------------------------------------------------------
Expand Down
12 changes: 12 additions & 0 deletions src/HTMLRenderer/HTMLRenderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include <fstream>
#include <memory>

#include <ft2build.h>
#include FT_FREETYPE_H
#include <OutputDev.h>
#include <GfxState.h>
#include <Stream.h>
Expand Down Expand Up @@ -42,6 +44,7 @@
#include "util/const.h"
#include "util/misc.h"

class CairoFontEngine;

namespace pdf2htmlEX {

Expand Down Expand Up @@ -217,6 +220,10 @@ struct HTMLRenderer : OutputDev
// make sure the current HTML style consistent with PDF
void prepare_text_line(GfxState * state);

// Check whether this char has a non-empty glyph in this font. If not sure, return true.
// A char that has an empty glyph or no glyph is usually a whitespace.
bool has_glyph(CharCode code, GfxFont* font);

////////////////////////////////////////////////////
// PDF stuffs
////////////////////////////////////////////////////
Expand Down Expand Up @@ -341,6 +348,11 @@ struct HTMLRenderer : OutputDev

CoveredTextDetector covered_text_detector;
DrawingTracer tracer;

#if ENABLE_SVG
FT_Library ft_lib;
std::unique_ptr<CairoFontEngine> font_engine;
#endif
};

} //namespace pdf2htmlEX
Expand Down
26 changes: 26 additions & 0 deletions src/HTMLRenderer/font.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include "CairoFontEngine.h"
#include "CairoOutputDev.h"
#include <Gfx.h>
#include FT_OUTLINE_H
#endif

namespace pdf2htmlEX {
Expand Down Expand Up @@ -1086,4 +1087,29 @@ void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, cons
f_css.fs << "}" << endl;
}

// Report whether `code` maps to a non-empty glyph in `font`.
//
// Without ENABLE_SVG no font inspection is available and the answer is always
// true. With ENABLE_SVG, Type 3 fonts are reported as having a glyph without
// being inspected; every other font is looked up through font_engine and its
// FreeType face is queried.
//
// NOTE(review): when the font or its face cannot be obtained, or the glyph
// fails to load, this returns false. The declaration comment says "If not
// sure, return true" - confirm which behaviour is intended.
bool HTMLRenderer::has_glyph(CharCode code, GfxFont* font)
{
#if ENABLE_SVG
    if (font->getType() == fontType3)
        return true;

    auto * cairo_font = static_cast<CairoFreeTypeFont*>(font_engine->getFont(font, cur_doc, false, xref));
    FT_Face ft_face = cairo_font ? cairo_font->get_ft_face() : nullptr;
    if (!ft_face)
        return false;

    // Glyph index 0 means the font has no glyph for this code.
    const auto glyph_index = cairo_font->getGlyph(code, nullptr, 0);
    if (glyph_index == 0)
        return false;

    const bool load_failed = FT_Load_Glyph(ft_face, glyph_index, FT_LOAD_NO_SCALE) != 0;
    if (load_failed)
        return false;

    // An outline glyph with zero contours is an empty glyph.
    const FT_GlyphSlot loaded = ft_face->glyph;
    const bool empty_outline =
        loaded->format == FT_GLYPH_FORMAT_OUTLINE && loaded->outline.n_contours == 0;
    return !empty_outline;
#else
    return true;
#endif
}

} //namespace pdf2htmlEX
12 changes: 12 additions & 0 deletions src/HTMLRenderer/general.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
#include "util/css_const.h"
#include "util/encoding.h"

#if ENABLE_SVG
#include "CairoFontEngine.h"
#endif

namespace pdf2htmlEX {

using std::fixed;
Expand Down Expand Up @@ -86,11 +90,19 @@ HTMLRenderer::HTMLRenderer(const Param & param)
[this](double * box, bool partial) { covered_text_detector.add_char_bbox_clipped(box, partial); };
tracer.on_non_char_drawn =
[this](double * box) { covered_text_detector.add_non_char_bbox(box); };

#if ENABLE_SVG
FT_Init_FreeType(&ft_lib);
font_engine = std::unique_ptr<CairoFontEngine>(new CairoFontEngine(ft_lib));
#endif
}

// Release renderer-wide resources: finalize ffw and, when SVG support is
// compiled in, the font engine followed by the FreeType library it uses.
HTMLRenderer::~HTMLRenderer()
{
    ffw_finalize();
#if ENABLE_SVG
    // font_engine is a member, so it would otherwise be destroyed only after
    // this body has finished - that is, after the FreeType library it was
    // constructed with has already been shut down. Destroy it explicitly
    // first so the engine never outlives ft_lib.
    font_engine.reset();
    FT_Done_FreeType(ft_lib);
#endif
}

void HTMLRenderer::process(PDFDoc *doc)
Expand Down
42 changes: 27 additions & 15 deletions src/HTMLRenderer/text.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
while (len > 0)
{
auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy);
HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0]));
HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)%s\n", (wchar_t)u[0], u[0], has_glyph(code, font) ? "":" no glyph"));

if(!(equal(ox, 0) && equal(oy, 0)))
{
Expand Down Expand Up @@ -113,24 +113,36 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
}
else
{
Unicode uu;
if(cur_text_state.font_info->use_tounicode)
if (uLen == 1 && is_illegal_unicode(u[0]) && !has_glyph(code, font))
{
uu = check_unicode(u, uLen, code, font);
// Convert illegal html unicode to a whitespace, if it has no glyph.
// Add a zero-width space AFTER the offset to make sure words are
// delimited, and make sure the ZWSP can be optimized out if the
// offset is represented by a space (see HTMLTextLine::dump_unicode).
html_text_page.get_cur_line()->append_offset(ddx * draw_text_scale);
html_text_page.get_cur_line()->append_unicodes(&zero_width_space, 1, 0);
}
else
{
uu = unicode_from_font(code, font);
}
html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx);
/*
* In PDF, word_space is appended if (n == 1 and *p = ' ')
* but in HTML, word_space is appended if (uu == ' ')
*/
int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 1 : 0);
if(space_count != 0)
{
html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count);
Unicode uu;
if(cur_text_state.font_info->use_tounicode)
{
uu = check_unicode(u, uLen, code, font);
}
else
{
uu = unicode_from_font(code, font);
}
html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx);
/*
* In PDF, word_space is appended if (n == 1 and *p = ' ')
* but in HTML, word_space is appended if (uu == ' ')
*/
int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 1 : 0);
if(space_count != 0)
{
html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count);
}
}
}
}
Expand Down
26 changes: 20 additions & 6 deletions src/HTMLTextLine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

#include "util/encoding.h"
#include "util/css_const.h"
#include "util/unicode.h"

namespace pdf2htmlEX {

Expand All @@ -32,6 +33,7 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
,clip_x1(0)
,clip_y1(0)
,width(0)
,last_output_unicode(0)
{ }

void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
Expand Down Expand Up @@ -88,16 +90,25 @@ void HTMLTextLine::dump_char(std::ostream & out, int pos)
int c = text[pos];
if (c > 0)
{
Unicode u = c;
writeUnicodes(out, &u, 1);
dump_unicode(out, c);
}
else if (c < 0)
{
auto dt = decomposed_text[- c - 1];
writeUnicodes(out, &dt.front(), dt.size());
for (auto it = dt.begin(), end = dt.end(); it != end; it++)
dump_unicode(out, *it);
}
}

// Write a single unicode character to `out` and remember it as the last
// character emitted. A zero-width space that directly follows a plain space
// is dropped: it is neither written nor recorded.
void HTMLTextLine::dump_unicode(std::ostream & out, Unicode u)
{
    const bool redundant_zwsp = (last_output_unicode == ' ') && (u == zero_width_space);
    if (!redundant_zwsp)
    {
        writeUnicodes(out, &u, 1);
        last_output_unicode = u;
    }
}

void HTMLTextLine::dump_chars(ostream & out, int begin, int len)
{
static const Color transparent(0, 0, 0, true);
Expand Down Expand Up @@ -162,6 +173,7 @@ void HTMLTextLine::dump_text(ostream & out)
<< " " << CSS::BOTTOM_CN << all_manager.bottom.install(line_state.y - clip_y1)
;
// it will be closed by the first state
last_output_unicode = 0;
}

std::vector<State*> stack;
Expand Down Expand Up @@ -249,8 +261,7 @@ void HTMLTextLine::dump_text(ostream & out)
double space_off = state_iter1->single_space_offset();
if(std::abs(target - space_off) <= param.h_eps)
{
Unicode u = ' ';
writeUnicodes(out, &u, 1);
dump_unicode(out, ' ');
actual_offset = space_off;
done = true;
}
Expand All @@ -269,7 +280,10 @@ void HTMLTextLine::dump_text(ostream & out)
double threshold = state_iter1->em_size() * (param.space_threshold);

out << "<span class=\"" << CSS::WHITESPACE_CN
<< ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
<< ' ' << CSS::WHITESPACE_CN << wid << "\">";
if (target > (threshold - EPS))
dump_unicode(out, ' ');
out << "</span>";
}
}
}
Expand Down
3 changes: 3 additions & 0 deletions src/HTMLTextLine.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ class HTMLTextLine
*/
void dump_chars(std::ostream & out, int begin, int len);
void dump_char(std::ostream & out, int pos);
void dump_unicode(std::ostream & out, Unicode u);

const Param & param;
AllStateManager & all_manager;
Expand All @@ -128,6 +129,8 @@ class HTMLTextLine
*/
std::vector<int> text;
std::vector<std::vector<Unicode> > decomposed_text;

Unicode last_output_unicode; //last unicode written to html (chars in tags excluded)
};

} // namespace pdf2htmlEX
Expand Down
2 changes: 2 additions & 0 deletions src/util/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

namespace pdf2htmlEX {

// Code point U+200B (ZERO WIDTH SPACE).
const Unicode zero_width_space = 0x200B;

/**
* Check whether a unicode character is illegal for the output HTML.
* Unlike PDF readers, browsers have special treatment for such characters (normally treated as
Expand Down

0 comments on commit 551f1b9

Please sign in to comment.