Skip to content

Commit

Permalink
PDFBOX-4371: new option to detect all angles and to extract text by a…
Browse files Browse the repository at this point in the history
…ngle

git-svn-id: https://svn.apache.org/repos/asf/pdfbox/trunk@1846262 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
THausherr committed Nov 9, 2018
1 parent 80a03a5 commit 9119af7
Showing 1 changed file with 178 additions and 41 deletions.
219 changes: 178 additions & 41 deletions tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,32 @@
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;

/**
* This is the main program that simply parses the pdf document and transforms it
* into text.
*
* @author Ben Litchfield
* @author Tilman Hausherr
*/
public final class ExtractText
{
Expand All @@ -55,6 +64,7 @@ public final class ExtractText
private static final String DEBUG = "-debug";
private static final String HTML = "-html";
private static final String ALWAYSNEXT = "-alwaysNext";
private static final String ROTATION_MAGIC = "-rotationMagic";
private static final String STD_ENCODING = "UTF-8";

/*
Expand Down Expand Up @@ -98,6 +108,7 @@ public void startExtraction( String[] args ) throws IOException
boolean sort = false;
boolean separateBeads = true;
boolean alwaysNext = false;
boolean rotationMagic = false;
String password = "";
String encoding = STD_ENCODING;
String pdfFile = null;
Expand Down Expand Up @@ -156,6 +167,10 @@ else if (args[i].equals(ALWAYSNEXT))
{
alwaysNext = true;
}
else if (args[i].equals(ROTATION_MAGIC))
{
rotationMagic = true;
}
else if( args[i].equals( END_PAGE ) )
{
i++;
Expand Down Expand Up @@ -221,43 +236,41 @@ else if( args[i].equals( CONSOLE ) )
}
output = new OutputStreamWriter( new FileOutputStream( outputFile ), encoding );
}
startTime = startProcessing("Starting text extraction");
if (debug)
{
System.err.println("Writing to " + outputFile);
}

PDFTextStripper stripper;
if(toHTML)
{
// HTML stripper can't work page by page because of startDocument() callback
stripper = new PDFText2HTML();
}
else
{
stripper = new PDFTextStripper();
}
stripper.setSortByPosition( sort );
stripper.setShouldSeparateByBeads( separateBeads );
stripper.setSortByPosition(sort);
stripper.setShouldSeparateByBeads(separateBeads);
stripper.setStartPage(startPage);
stripper.setEndPage(endPage);

startTime = startProcessing("Starting text extraction");
if (debug)
{
System.err.println("Writing to "+outputFile);
// Extract text for main document:
stripper.writeText(document, output);
}
endPage = Math.min(endPage, document.getNumberOfPages());

// Extract text for main document:
for (int p = startPage; p <= endPage; ++p)
else
{
try
if (rotationMagic)
{
stripper.setStartPage(p);
stripper.setEndPage(p);
stripper.writeText(document, output);
stripper = new FilteredTextStripper();
}
catch (IOException ex)
else
{
if (!alwaysNext)
{
throw ex;
}
LOG.error("Failed to process page " + p, ex);
stripper = new PDFTextStripper();
}
stripper.setSortByPosition(sort);
stripper.setShouldSeparateByBeads(separateBeads);

// Extract text for main document:
extractPages(startPage, Math.min(endPage, document.getNumberOfPages()),
stripper, document, output, rotationMagic, alwaysNext);
}

// ... also for any embedded PDFs:
Expand Down Expand Up @@ -288,19 +301,15 @@ else if( args[i].equals( CONSOLE ) )
try (InputStream fis = file.createInputStream();
PDDocument subDoc = PDDocument.load(fis))
{
for (int p = 1; p <= subDoc.getNumberOfPages(); ++p)
if (toHTML)
{
// will not really work because of HTML header + footer
stripper.writeText( subDoc, output );
}
else
{
try
{
stripper.setStartPage(p);
stripper.setEndPage(p);
stripper.writeText(subDoc, output);
}
catch (IOException ex)
{
//TODO alternatively, log and continue
throw ex;
}
extractPages(1, subDoc.getNumberOfPages(),
stripper, subDoc, output, rotationMagic, alwaysNext);
}
}
}
Expand All @@ -318,6 +327,59 @@ else if( args[i].equals( CONSOLE ) )
}
}

/**
 * Extracts the text of the pages {@code startPage} to {@code endPage} (1-based, inclusive),
 * one page at a time, and writes it to {@code output}.
 *
 * @param startPage first page to extract (1-based).
 * @param endPage last page to extract (1-based, inclusive).
 * @param stripper the stripper used to produce the text; its start and end page are
 * overwritten for every page processed.
 * @param document the document to extract from.
 * @param output the writer receiving the extracted text.
 * @param rotationMagic if true, each page is scanned for the text angles it contains and is
 * extracted once per angle with a compensating rotation applied.
 * @param alwaysNext if true, an IOException on a page is logged and extraction continues
 * with the next page; if false the exception is rethrown.
 * @throws IOException if a page fails and {@code alwaysNext} is false.
 */
private void extractPages(int startPage, int endPage,
        PDFTextStripper stripper, PDDocument document, Writer output,
        boolean rotationMagic, boolean alwaysNext) throws IOException
{
    for (int p = startPage; p <= endPage; ++p)
    {
        stripper.setStartPage(p);
        stripper.setEndPage(p);
        try
        {
            if (rotationMagic)
            {
                extractPageByAngles(p, stripper, document, output);
            }
            else
            {
                stripper.writeText(document, output);
            }
        }
        catch (IOException ex)
        {
            if (!alwaysNext)
            {
                throw ex;
            }
            LOG.error("Failed to process page " + p, ex);
        }
    }
}

/**
 * Extracts a single page once for every text angle found on it. The page is temporarily
 * modified (rotation set to 0, a rotation transform prepended to its content); these
 * modifications are undone even if extraction fails, so that a failure does not leave the
 * page in an altered state when processing continues.
 *
 * @param pageNumber the 1-based number of the page to extract.
 * @param stripper the stripper used to produce the text, already set to this page.
 * @param document the document to extract from.
 * @param output the writer receiving the extracted text.
 * @throws IOException if angle collection or text extraction fails.
 */
private void extractPageByAngles(int pageNumber, PDFTextStripper stripper,
        PDDocument document, Writer output) throws IOException
{
    PDPage page = document.getPage(pageNumber - 1);
    int rotation = page.getRotation();
    page.setRotation(0);
    try
    {
        // first pass: only collect the angles, the text itself is discarded
        AngleCollector angleCollector = new AngleCollector();
        angleCollector.setStartPage(pageNumber);
        angleCollector.setEndPage(pageNumber);
        angleCollector.writeText(document, new NullWriter());

        // rotation magic
        for (int angle : angleCollector.getAngles())
        {
            // becomes true once the content stream has been created, i.e. once there is
            // a prepended entry that must be removed again
            boolean prepended = false;
            try
            {
                // prepend a transformation
                // (we could skip these parts for angle 0, but it doesn't matter much)
                try (PDPageContentStream cs = new PDPageContentStream(document, page,
                        PDPageContentStream.AppendMode.PREPEND, false))
                {
                    prepended = true;
                    cs.transform(Matrix.getRotateInstance(-Math.toRadians(angle), 0, 0));
                }

                stripper.writeText(document, output);
            }
            finally
            {
                if (prepended)
                {
                    // remove prepended transformation
                    ((COSArray) page.getCOSObject().getItem(COSName.CONTENTS)).remove(0);
                }
            }
        }
    }
    finally
    {
        // restore the original page rotation, also when an exception is propagating
        page.setRotation(rotation);
    }
}

private long startProcessing(String message)
{
if (debug)
Expand Down Expand Up @@ -345,19 +407,94 @@ private static void usage()
String message = "Usage: java -jar pdfbox-app-x.y.z.jar ExtractText [options] <inputfile> [output-text-file]\n"
+ "\nOptions:\n"
+ " -password <password> : Password to decrypt document\n"
+ " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n"
+ " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE,\n"
+ " UTF-16LE, etc.\n"
+ " -console : Send text to console instead of file\n"
+ " -html : Output in HTML format instead of raw text\n"
+ " -sort : Sort the text before writing\n"
+ " -ignoreBeads : Disables the separation by beads\n"
+ " -debug : Enables debug output about the time consumption of every stage\n"
+ " -alwaysNext : Process next page (if applicable) despite IOException\n"
+ " -debug : Enables debug output about the time consumption\n"
+ " of every stage\n"
+ " -alwaysNext : Process next page (if applicable) despite\n"
+ " IOException (ignored when -html)\n"
+ " -rotationMagic : Analyze each page for rotated/skewed text,\n"
+ " rotate to 0° and extract separately\n"
+ " (slower, and ignored when -html)\n"
+ " -startPage <number> : The first page to start extraction (1 based)\n"
+ " -endPage <number> : The last page to extract (1 based and inclusive)\n"
+ " -endPage <number> : The last page to extract (1 based, inclusive)\n"
+ " <inputfile> : The PDF document to use\n"
+ " [output-text-file] : The file to write the text to";

System.err.println(message);
System.exit( 1 );
}
}

/**
 * Text stripper that records the angle of every text position it is handed, without passing
 * the position on to the superclass. Angles are kept in degrees, rounded to the nearest
 * integer and made non-negative. A new instance must be constructed for each page.
 */
class AngleCollector extends PDFTextStripper
{
    // sorted, duplicate-free set of the angles seen so far
    private final Set<Integer> angles = new TreeSet<>();

    AngleCollector() throws IOException
    {
    }

    /**
     * @return the angles collected so far, in ascending order.
     */
    Set<Integer> getAngles()
    {
        return angles;
    }

    @Override
    protected void processTextPosition(TextPosition text)
    {
        Matrix textMatrix = text.getTextMatrix();
        double radians = Math.atan2(textMatrix.getShearY(), textMatrix.getScaleY());
        int degrees = (int) Math.round(Math.toDegrees(radians));
        // atan2 yields -180..180 degrees; shift negative values into the positive range
        angles.add(degrees < 0 ? degrees + 360 : degrees);
    }
}

/**
 * Text stripper that drops every glyph whose text matrix angle, rounded to whole degrees,
 * is not 0. Only unrotated glyphs are handed to the regular processing.
 */
class FilteredTextStripper extends PDFTextStripper
{
    FilteredTextStripper() throws IOException
    {
    }

    @Override
    protected void processTextPosition(TextPosition text)
    {
        Matrix textMatrix = text.getTextMatrix();
        double radians = Math.atan2(textMatrix.getShearY(), textMatrix.getScaleY());
        long degrees = Math.round(Math.toDegrees(radians));
        if (degrees != 0)
        {
            // rotated glyph: skip it
            return;
        }
        super.processTextPosition(text);
    }
}

/**
 * A {@link Writer} that discards everything written to it. Flushing and closing do nothing.
 */
class NullWriter extends Writer
{
    @Override
    public void close() throws IOException
    {
        // nothing to release
    }

    @Override
    public void flush() throws IOException
    {
        // nothing is buffered
    }

    @Override
    public void write(char[] cbuf, int off, int len) throws IOException
    {
        // characters are intentionally dropped
    }
}

0 comments on commit 9119af7

Please sign in to comment.