extract Text with Font information?

Help
2009-01-20
2013-05-28
  • I want to search for words, sentences and so on that are set in italic or bold style.
    The font size is interesting, too.
    Is it possible to extract this information from any pdf document with the jpod library?

     
    • Elfi Heck
      Elfi Heck
      2009-01-20

      Have you seen the ExtractText example? This shows basically how to extract text with jPod. If you need font information you have to look more closely at the PDFont objects passed to CSTextExtractor.textSetFont().
      Be aware however that the font size specified for the "Tf" operator (that is also passed to textSetFont()) is not always what you will see on a rendered page. The page's contents might well specify a scaling before the text occurs. To catch the "transform" and more messages from the interpreter you can always create your own "device" (subclass of CSDeviceAdapter) and override the appropriate methods.

       
  • I am very new to jpod, but needed a similar thing and created the following modification of the original ExtractText example to produce a simple xml file holding some font/formatting information:

        protected void extractText(PDPageTree pageTree, StringBuilder sb) {
            sb.append("<!-- very basic pdf text extraction enriched with some formatting " +
                    "info (br, std, bold, ital) by IPod Java PDF lib -->\n");
            sb.append("<pdf2fmttxt>\n");
            for (Iterator it = pageTree.getKids().iterator(); it.hasNext();) {
                PDPageNode node = (PDPageNode) it.next();
                if (node.isPage()) {              // do real extraction
                    try {
                        SmartCSTextExtractor extractor = new SmartCSTextExtractor();  // important for formatting extraction
                        PDPage page = (PDPage) node;
                        AffineTransform pageTx = new AffineTransform();
                        PDFGeometryTools.adjustTransform(pageTx, page);
                        extractor.setDeviceTransform(pageTx);
                        CSDeviceBasedInterpreter interpreter = new CSDeviceBasedInterpreter(
                                null, extractor);
                        interpreter.process(page.getContentStream(), page.getResources());
                        String actContent = extractor.getContent();
                        sb.append(actContent);
                    } catch (CSException e) {
                        e.printStackTrace();
                    }
                } else {
                    extractText((PDPageTree) node, sb);   // go down in hirarchy recursively
                }
            }
            sb.append("\n</pdf2fmttxt>\n");
        }
    

    The important piece of code is a variation of the original class de.intarsys.pdf.content.text.CSTextExtractor:

    package at.atrom.jpod.examples;
    import de.intarsys.pdf.content.ICSInterpreter;
    import de.intarsys.pdf.content.text.CSCharacterParser;
    import de.intarsys.pdf.cos.COSName;
    import de.intarsys.pdf.font.PDFont;
    import de.intarsys.pdf.font.PDGlyphs;
    import java.awt.geom.AffineTransform;
    import java.awt.geom.Rectangle2D;
    /**
     * Variation of the de.intarsys.pdf.content.text.CSTextExtractor to extract
     * some formatting information too. A simple XML snipplet is created which
     * is using the tags <br>, <std>, <bold>, <ital>.
     * @author mxrenkin
     */
    public class SmartCSTextExtractor extends CSCharacterParser {
        private StringBuilder content;
        private double maxDX = 5;
        private double maxDY = 5;
        private PDFont currPDFont = null;
        private String currStartTag = null;
        private String currEndTag = null;
        public SmartCSTextExtractor() {
            super();
        }
        private void append(char c) {
            if (c > 0) {
                content.append(c);
            } else {
                content.append(' ');
            }
        }
        private void append(String s) {
            content.append(s);
        }
        public String getContent() {
            // close XML Tag here:
            if (currEndTag != null) {
                append(currEndTag);
            }
            // do some cosmetics:
            return content.toString().replaceAll("<br/>(\n+)</(std|bold|ital)>", "</$2><br/>$1"); 
        }
        private static boolean sameFont(PDFont f0, PDFont f1) {
            boolean same = true;
            if (f0 == null || f1 == null) {
                same = false;
            } else if (!f0.getFontNameNormalized().equals(f1.getFontNameNormalized())) {
                same = false;
            }
            return same;
        }
        @Override
        protected void onCharacterFound(PDGlyphs glyphs, Rectangle2D rect) {
            //super.onCharacterFound(glyphs, rect);
            char c = (char) glyphs.getUnicode();
            PDFont glyphsFont = glyphs.getFont();
            double dX = lastStopX - lastStartX;
            double dY = lastStopY - lastStartY;
            if (Math.abs(dX) < maxDX) {
                if (Math.abs(dY) > maxDY && content.length() > 0) {
                    append("<br/>" + System.getProperty("line.separator"));
                }
            } else {
                if (content.length() > 0) {
                    if (Math.abs(dY) < maxDY) {
                        append(" ");
                    } else {
                        append("<br/>" + System.getProperty("line.separator"));
                    }
                }
            }
            if ('\n' == c) {
                append("<br/>");
            } else if (!sameFont(glyphsFont, currPDFont)) {
                if (currEndTag != null) {
                    append(currEndTag);
                }
                if (glyphsFont.getFontNameNormalized().contains("-Bold")) {
                    currStartTag = "<bold>";
                    currEndTag = "</bold>";
                } else if (glyphsFont.getFontNameNormalized().contains("-Italic")) {
                    currStartTag = "<ital>";
                    currEndTag = "</ital>";
                } else {
                    currStartTag = "<std>";
                    currEndTag = "</std>";
                }
                //append("<font name=\"" + glyphsFont.getFontNameNormalized() + "\">");
                append(currStartTag);
                //currEndTag = "</font>";
                currPDFont = glyphsFont;
                System.out.println("Now Font: " + glyphsFont.getFontNameNormalized());
            }
            switch (c) {  // ensure that the 5 critical xml entities are represented correctly
                case '<':
                    append("&lt;");
                    break;
                case '>':
                    append("&gt;");
                    break;
                case '&':
                    append("&amp;");
                    break;
                case '"':
                    append("&quot;");
                    break;
                case '\'':
                    append("&apos;");
                    break;
                default:
                    append(c);
            }
        }
        @Override
        public void open(ICSInterpreter pInterpreter) {
            super.open(pInterpreter);
            content = new StringBuilder();
        }
        @Override
        public void textSetFont(COSName name, PDFont font, float size) {
            super.textSetFont(name, font, size);
            AffineTransform tx;
            tx = (AffineTransform) getDeviceTransform().clone();
            tx.concatenate(textState.globalTransform);
            maxDX = textState.fontSize * 0.2 * tx.getScaleX();
            maxDY = textState.fontSize * 0.6 * tx.getScaleY();
        }
        @Override
        public void textSetTransform(float a, float b, float c, float d, float e,
                float f) {
            super.textSetTransform(a, b, c, d, e, f);
            AffineTransform tx;
            tx = (AffineTransform) getDeviceTransform().clone();
            tx.concatenate(textState.globalTransform);
            maxDX = textState.fontSize * 0.2 * tx.getScaleX();
            maxDY = textState.fontSize * 0.6 * tx.getScaleY();
        }
    }
    
     
  • mtraut
    mtraut
    2010-02-03

    welcome, mxrenkin!

    afaik you're the first 3rd party solution committer to our project - many thanks, you're promoted user of the month :-)