Skip to content
cjhaas blog

Basically a place that Chris can post solutions to problems so he can easily find them later

cjhaas blog

Basically a place that Chris can post solutions to problems so he can easily find them later

Getting color information from iTextSharp’s TextRenderInfo and ITextExtractionStrategy

Posted on July 31, 2011 By [email protected]

In order to get color information when using an ITextExtractionStrategy in iTextSharp (5.1.1.0), you need to make the following changes to the main iTextSharp source code. Once you make these changes, you can follow my Stack Overflow post here for getting font information as well.

iTextSharp.text.pdf.parser.GraphicsState.cs

//New Fields:
/** Color stored for ColorStyle.Stroke operators (assigned by SetColorBase.Invoke); the default constructor sets it to null. */
internal BaseColor colorStroke;
/** Color stored for ColorStyle.NonStroke operators (assigned by SetColorBase.Invoke); the default constructor sets it to null. */
internal BaseColor colorNonStroke;

//New accessor methods (plain methods, not C# properties):
/**
* Accessor for the stroking color held by this graphics state.
* @return the current value of the colorStroke field (may be null)
*/
public BaseColor GetColorStroke() {
    return this.colorStroke;
}
/**
* Accessor for the non-stroking color held by this graphics state.
* @return the current value of the colorNonStroke field (may be null)
*/
public BaseColor GetColorNonStroke() {
    return this.colorNonStroke;
}

//changed constructors:
/**
* Default constructor: starts from a fresh Matrix, zeroed text parameters,
* no font and no colors.
*/
public GraphicsState(){
    // transformation matrix
    this.ctm = new Matrix();

    // text state
    this.characterSpacing = 0;
    this.wordSpacing = 0;
    this.horizontalScaling = 1.0f;
    this.leading = 0;
    this.font = null;
    this.fontSize = 0;
    this.renderMode = 0;
    this.rise = 0;
    this.knockout = true;

    // colors stay null until a color operator assigns them
    this.colorStroke = null;
    this.colorNonStroke = null;
}

/**
* Copy constructor. Every field is copied by plain assignment from the source state,
* including the two color fields.
* @param source    the GraphicsState whose values are duplicated
*/
public GraphicsState(GraphicsState source){
    // Plain assignment is enough: the copied values are immutable
    // (font being the one possible exception), so sharing them is safe.
    this.ctm = source.ctm;

    this.characterSpacing = source.characterSpacing;
    this.wordSpacing = source.wordSpacing;
    this.horizontalScaling = source.horizontalScaling;
    this.leading = source.leading;
    this.font = source.font;
    this.fontSize = source.fontSize;
    this.renderMode = source.renderMode;
    this.rise = source.rise;
    this.knockout = source.knockout;

    this.colorStroke = source.colorStroke;
    this.colorNonStroke = source.colorNonStroke;
}

iTextSharp.text.pdf.parser.PdfContentStreamProcessor.cs

//append to end of method PopulateOperators()
    // Stroking color operators (upper-case names).
    RegisterContentOperator("G", new SetStrokingGray());
    RegisterContentOperator("RG", new SetStrokingRGB());
    RegisterContentOperator("K", new SetStrokingCMYK());
    RegisterContentOperator("CS", new SetStrokingGeneral());
    RegisterContentOperator("SC", new SetStrokingGeneral());
    RegisterContentOperator("SCN", new SetStrokingGeneral());

    // Non-stroking color operators (lower-case names).
    RegisterContentOperator("g", new SetNonStrokingGray());
    RegisterContentOperator("rg", new SetNonStrokingRGB());
    RegisterContentOperator("k", new SetNonStrokingCMYK());
    RegisterContentOperator("cs", new SetNonStrokingGeneral());
    RegisterContentOperator("sc", new SetNonStrokingGeneral());
    RegisterContentOperator("scn", new SetNonStrokingGeneral());

//add new classes:
/**
* Base class for the color-setting content operators. A subclass turns the operands
* into a BaseColor via GetColor; Invoke then stores that color on the graphics state
* at the top of the processor's stack, in the stroke or non-stroke slot.
*/
public abstract class SetColorBase : IContentOperator {
    public enum ColorStyle { Stroke = 1, NonStroke = 2 };
    public enum ColorSpace { RGB = 1, CMYK = 2, Gray = 3, Other = 4 };

    // Which GraphicsState field Invoke writes to.
    private ColorStyle targetStyle;
    // Recorded by the constructor; not read anywhere inside this class.
    private ColorSpace declaredSpace;

    public SetColorBase(ColorStyle colorStyle, ColorSpace colorSpace) {
        targetStyle = colorStyle;
        declaredSpace = colorSpace;
    }

    /**
    * Converts the operands of a color operator into a color.
    * @param oper      the operator literal being processed
    * @param operands  the operand list handed to Invoke
    * @return the color to store; the value is stored as-is, including null
    */
    public abstract BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands);

    /**
    * Resolves the color through GetColor and assigns it to the current graphics state.
    */
    public void Invoke(PdfContentStreamProcessor processor, PdfLiteral oper, List<PdfObject> operands) {
        BaseColor resolved = GetColor(oper, operands);
        GraphicsState current = processor.gsStack.Peek();
        switch (targetStyle) {
            case ColorStyle.Stroke:
                current.colorStroke = resolved;
                break;
            case ColorStyle.NonStroke:
                current.colorNonStroke = resolved;
                break;
        }
    }
}
/** Handler registered for "G": builds a GrayColor for the stroke slot from the first operand. */
private class SetStrokingGray : SetColorBase {
    public SetStrokingGray() : base(ColorStyle.Stroke, ColorSpace.Gray) { }

    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        // The gray level is the first operand.
        float grayLevel = ((PdfNumber)operands[0]).FloatValue;
        return new GrayColor(grayLevel);
    }
}
/** Handler registered for "g": builds a GrayColor for the non-stroke slot from the first operand. */
private class SetNonStrokingGray : SetColorBase {
    public SetNonStrokingGray() : base(ColorStyle.NonStroke, ColorSpace.Gray) { }

    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        // The gray level is the first operand.
        float grayLevel = ((PdfNumber)operands[0]).FloatValue;
        return new GrayColor(grayLevel);
    }
}
/** Handler registered for "RG": builds a BaseColor for the stroke slot from the first three operands. */
private class SetStrokingRGB : SetColorBase {
    public SetStrokingRGB() : base(ColorStyle.Stroke, ColorSpace.RGB) { }

    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        // Operands 0..2 are read as red, green and blue, in that order.
        PdfNumber red = (PdfNumber)operands[0];
        PdfNumber green = (PdfNumber)operands[1];
        PdfNumber blue = (PdfNumber)operands[2];
        float redValue = red.FloatValue;
        float greenValue = green.FloatValue;
        float blueValue = blue.FloatValue;
        return new BaseColor(redValue, greenValue, blueValue);
    }
}
/** Handler registered for "rg": builds a BaseColor for the non-stroke slot from the first three operands. */
private class SetNonStrokingRGB : SetColorBase {
    public SetNonStrokingRGB() : base(ColorStyle.NonStroke, ColorSpace.RGB) { }

    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        // Operands 0..2 are read as red, green and blue, in that order.
        PdfNumber red = (PdfNumber)operands[0];
        PdfNumber green = (PdfNumber)operands[1];
        PdfNumber blue = (PdfNumber)operands[2];
        float redValue = red.FloatValue;
        float greenValue = green.FloatValue;
        float blueValue = blue.FloatValue;
        return new BaseColor(redValue, greenValue, blueValue);
    }
}
/** Handler registered for "K": builds a CMYKColor for the stroke slot from the first four operands. */
private class SetStrokingCMYK : SetColorBase {
    public SetStrokingCMYK() : base(ColorStyle.Stroke, ColorSpace.CMYK) { }

    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        // Operands 0..3 are read as cyan, magenta, yellow and black, in that order.
        PdfNumber cyan = (PdfNumber)operands[0];
        PdfNumber magenta = (PdfNumber)operands[1];
        PdfNumber yellow = (PdfNumber)operands[2];
        PdfNumber black = (PdfNumber)operands[3];
        float cyanValue = cyan.FloatValue;
        float magentaValue = magenta.FloatValue;
        float yellowValue = yellow.FloatValue;
        float blackValue = black.FloatValue;
        return new CMYKColor(cyanValue, magentaValue, yellowValue, blackValue);
    }
}
/** Handler registered for "k": builds a CMYKColor for the non-stroke slot from the first four operands. */
private class SetNonStrokingCMYK : SetColorBase {
    public SetNonStrokingCMYK() : base(ColorStyle.NonStroke, ColorSpace.CMYK) { }

    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        // Operands 0..3 are read as cyan, magenta, yellow and black, in that order.
        PdfNumber cyan = (PdfNumber)operands[0];
        PdfNumber magenta = (PdfNumber)operands[1];
        PdfNumber yellow = (PdfNumber)operands[2];
        PdfNumber black = (PdfNumber)operands[3];
        float cyanValue = cyan.FloatValue;
        float magentaValue = magenta.FloatValue;
        float yellowValue = yellow.FloatValue;
        float blackValue = black.FloatValue;
        return new CMYKColor(cyanValue, magentaValue, yellowValue, blackValue);
    }
}
/**
* Catch-all non-stroking color handler (registered for "cs", "sc" and "scn").
* This is a best-effort interpretation only; the selected color space is not tracked.
*/
private class SetNonStrokingGeneral : SetColorBase {
    public SetNonStrokingGeneral() : base(ColorStyle.NonStroke, ColorSpace.Other) { }

    /**
    * Maps the operands to a color by operand count.
    * The list carries one trailing element beyond the color components
    * (presumably the operator literal itself; confirm against the processor),
    * so Count == 2 means one operand and Count == 4 means three.
    * @param oper      the operator literal being processed
    * @param operands  the operand list handed to Invoke
    * @return BaseColor(0) for a single name operand or a single number whose IntValue is 0;
    *         an RGB BaseColor for three numeric operands; null for anything else
    */
    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        if (operands.Count == 2) {
            PdfObject only = operands[0];
            if (only.IsName() || (only.IsNumber() && ((PdfNumber)only).IntValue == 0)) {
                return new BaseColor(0);
            }
            return null;
        }
        // Verify the components are numbers before casting: a non-numeric operand
        // (for example a pattern name following color components) previously raised
        // InvalidCastException here and aborted processing; it now yields null.
        if (operands.Count == 4
            && operands[0].IsNumber()
            && operands[1].IsNumber()
            && operands[2].IsNumber()) {
            PdfNumber r = (PdfNumber)operands[0];
            PdfNumber g = (PdfNumber)operands[1];
            PdfNumber b = (PdfNumber)operands[2];
            return new BaseColor(r.FloatValue, g.FloatValue, b.FloatValue);
        }
        return null;
    }
}
/**
* Catch-all stroking color handler (registered for "CS", "SC" and "SCN").
* This is a best-effort interpretation only; the selected color space is not tracked.
*/
private class SetStrokingGeneral : SetColorBase {
    public SetStrokingGeneral() : base(ColorStyle.Stroke, ColorSpace.Other) { }

    /**
    * Maps the operands to a color by operand count.
    * The list carries one trailing element beyond the color components
    * (presumably the operator literal itself; confirm against the processor),
    * so Count == 2 means one operand and Count == 4 means three.
    * @param oper      the operator literal being processed
    * @param operands  the operand list handed to Invoke
    * @return BaseColor(0) for a single name operand or a single number whose IntValue is 0;
    *         an RGB BaseColor for three numeric operands; null for anything else
    */
    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        if (operands.Count == 2) {
            PdfObject only = operands[0];
            if (only.IsName() || (only.IsNumber() && ((PdfNumber)only).IntValue == 0)) {
                return new BaseColor(0);
            }
            return null;
        }
        // Verify the components are numbers before casting: a non-numeric operand
        // (for example a pattern name following color components) previously raised
        // InvalidCastException here and aborted processing; it now yields null.
        if (operands.Count == 4
            && operands[0].IsNumber()
            && operands[1].IsNumber()
            && operands[2].IsNumber()) {
            PdfNumber r = (PdfNumber)operands[0];
            PdfNumber g = (PdfNumber)operands[1];
            PdfNumber b = (PdfNumber)operands[2];
            return new BaseColor(r.FloatValue, g.FloatValue, b.FloatValue);
        }
        return null;
    }
}

iTextSharp.text.pdf.parser.TextRenderInfo.cs

//new methods
/**
* Exposes the stroking color of the graphics state held in gs.
* @return whatever gs.GetColorStroke() returns
*/
public BaseColor GetColorStroke() {
    BaseColor strokeColor = gs.GetColorStroke();
    return strokeColor;
}
/**
* Exposes the non-stroking color of the graphics state held in gs.
* @return whatever gs.GetColorNonStroke() returns
*/
public BaseColor GetColorNonStroke() {
    BaseColor nonStrokeColor = gs.GetColorNonStroke();
    return nonStrokeColor;
}

This code is very experimental but so far works pretty well. Depending on who generates the PDF different things can happen. Word’s built-in PDF generator seems to take the easier route and just kicks out simple RGB values. Adobe’s PDF plug-in appears to do the same but in a more complicated way, creating “named” color spaces (I think) but I’m not completely sure how to use them yet.

Uncategorized

Post navigation

Previous post
Next post

Comments (9)

  1. Mark says:
    August 3, 2011 at 1:51 am

    Hi, Could You please advice me or at least show the direction. I wonder if using ItextExtractionStrategy i will be able to parse pdf document with table and additionally if i will be able to retrieve cells text and it’s background color. Is it a good idea or it will be only possible in tagged pdf? Where should i search the informations about cell colors/background.
    Thank You,

    Best regards,
    Mark

    Reply
  2. chrishaas says:
    August 3, 2011 at 8:34 am

    Sorry Mark, but PDFs don’t really have anything called a “table”, just something that looks like one. A table in a PDF is actually just a bunch of lines or boxes with their contents filled in with a color, then regular text is drawn on top of that. With that definition you can understand that text can actually be placed on top of anything, from a box with a solid color (a table cell) or an image with a million colors in it. When you’re parsing text there would be no way for the text extractor to reliably determine a color that’s behind text because there could be millions.

    Reply
  3. Mark says:
    August 3, 2011 at 9:01 am

    Oh, i see but how in other algorithm not during TextExtracting retrieve this box with a solid color and it’s color and position?What methods should i try to recover it.? Then i could merge these informations

    Thanks one more time,

    Mark

    Reply
    1. chrishaas says:
      August 3, 2011 at 9:03 am

      Mark, what other algorithms are you talking about. Your question will probably get a better answer if you post it on Stackoverflow (http://stackoverflow.com/)

      Reply
  4. ulas says:
    March 9, 2012 at 7:29 pm

    Hi, I am developing project in Java with itext, so I wonder how can i change iTextSharp.text.pdf.parser.GraphicsState file, where can i find?

    Reply
    1. Chris Haas says:
      March 10, 2012 at 8:17 am

      Are you using the 5.x series? If so, it’s built right in:
      http://api.itextpdf.com/itext/com/itextpdf/text/pdf/parser/GraphicsState.html

      iTextSharp is a port of iText and all features get added to the Java version first and then trickle down to the .Net version. So if you see any .Net code you can pretty much assume that there’s a Java version, although the reverse isn’t always true.

      Reply
  5. Shujaat says:
    September 9, 2012 at 10:11 am

    Hi Chris,

    Your code helped me take a few steps in the correct direction, but now I’m stuck again. After it detects an SCN operator and goes into SetStrokingGeneral class, you’re apparently not handling the case where the operands may be color space names or numbers. For example, I get 0 or 1 as the first operand, which I assume is the index of the color space it is using, but you’re simply skipping this case in your code.

    I tried somewhat further and found that I can extract color space objects from PdfReader, but I end up get a PdfArray with two objects, where the first is a literal named “ICCBased” and the second is a PRStream. Do you have any idea how to handle this case?

    Reply
    1. Chris Haas says:
      September 10, 2012 at 8:45 am

      Hi Shujaat. As you saw, SCN and scn themselves are catchalls for everything else that’s not RGB, CMYK or Grey. Before hitting one of those two you should actually first find a CS operator whose first and only operand is the actual color space to use. There’s a bunch of options for this including DeviceRGB, DeviceCMYK, Pattern, Lab, DeviceN, etc. You can find these in table 74 of the 2008 PDF spec section 8.6.8 (page 171). My code is actually not completely correct and I shouldn’t be pushing CS and cs to the SetStrokingGeneral method but instead should do some further processing. Unfortunately none of the sample PDFs that I had at the time had this set so I couldn’t test for it. Hopefully this helps you out!

      Reply
  6. nami says:
    August 3, 2020 at 7:33 am

    How to get the color information of a shape annotation in the pdf?

    Reply

Leave a Reply Cancel reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.

Recent Posts

  • Google open redirect
  • How to use AI to write code
  • Doctrine/Symfony MariaDB DSN connection string
  • Creating a portable copy of pdftotext from source
  • Gravity Forms shortcode getting extra line breaks when used with ACF

Recent Comments

  • jose luis on #2 – VB.Net iTextSharp Tutorial – Add an image to a document
  • Eliezer Castanon on iTextSharp slightly smarter text extraction strategy
  • javad on How to recompress images in a PDF using iTextSharp
  • MANOUS3784 on Flock is awesome
  • Sang on Flock is awesome

Archives

  • June 2026
  • October 2025
  • November 2023
  • September 2023
  • July 2023
  • June 2023
  • May 2023
  • April 2023
  • December 2022
  • September 2022
  • April 2022
  • October 2021
  • September 2021
  • April 2021
  • January 2021
  • October 2020
  • August 2020
  • June 2020
  • May 2020
  • December 2019
  • November 2019
  • October 2019
  • July 2019
  • May 2019
  • December 2018
  • October 2018
  • July 2018
  • November 2017
  • October 2017
  • August 2017
  • July 2017
  • June 2017
  • May 2017
  • April 2017
  • March 2017
  • February 2017
  • January 2017
  • September 2015
  • December 2014
  • November 2014
  • October 2014
  • September 2014
  • August 2014
  • July 2014
  • November 2013
  • May 2013
  • April 2013
  • March 2013
  • January 2013
  • November 2012
  • October 2012
  • July 2012
  • March 2012
  • January 2012
  • October 2011
  • September 2011
  • July 2011
  • February 2011
  • December 2010
  • November 2010
  • October 2010
  • September 2010
  • August 2010
  • June 2010
  • April 2010
  • January 2010
  • December 2009
  • November 2009
  • October 2009
  • July 2009
  • June 2009
  • May 2009
  • April 2009

Categories

  • Accessibility
  • Advanced Custom Fields
  • Authorize.Net
  • BWP Minify
  • Composer
  • Crappy Google Search Results of the Day
  • CSS
  • Doctrine
  • Drupal
  • Drush
  • Elasticsearch
  • Fun links of the day
  • Google Analytics
  • Gravity Forms
  • HHVM
  • HTML
  • iTextSharp
  • JavaScript
  • Linux
  • mysql
  • nginx
  • Optimization
  • PDF
  • PdfPTable
  • PHP
  • Plugins
  • Ramblings
  • Random things I learned
  • Redis
  • Security
  • simplesamlphp
  • SQL Server
  • SSH
  • SSL/TLS/HTTPS
  • Stack Overflow
  • SVG
  • Symfony
  • Synology
  • Uncategorized
  • Unicode
  • Varnish
  • Vendi Best Practice
  • VIP
  • Weird Google Search Results
  • Windows
  • WordPress
  • WP-CLI

Meta

  • Log in
  • Entries feed
  • Comments feed
  • WordPress.org
©2026 cjhaas blog | WordPress Theme by SuperbThemes