cjhaas.com

Getting color information from iTextSharp’s TextRenderInfo and ITextExtractionStrategy

Posted in Uncategorized by Chris Haas on July 31st, 2011

To get color information when using an ITextExtractionStrategy in iTextSharp (5.1.1.0), you need to make the following changes to the main iTextSharp source code. Once you have made these changes, you can follow my SO post here to get font information as well.

iTextSharp.text.pdf.parser.GraphicsState.cs

//New Fields:
// Stroking color most recently stored on this graphics state; the default
// constructor initializes it to null. Assigned directly by SetColorBase.Invoke.
internal BaseColor colorStroke;
// Non-stroking color most recently stored on this graphics state; the default
// constructor initializes it to null. Assigned directly by SetColorBase.Invoke.
internal BaseColor colorNonStroke;

//New accessor methods:
/**
* Gets the stroking color stored on this graphics state.
* @return the stroking color, or null if none has been stored
*/
public BaseColor GetColorStroke() {
    return this.colorStroke;
}
/**
* Gets the non-stroking color stored on this graphics state.
* @return the non-stroking color, or null if none has been stored
*/
public BaseColor GetColorNonStroke() {
    return this.colorNonStroke;
}

//changed constructors:
/**
* Constructs a graphics state with default values; both colors start out
* as null (no color recorded yet).
*/
public GraphicsState(){
    // Colors: nothing recorded until a color operator is processed.
    this.colorStroke = null;
    this.colorNonStroke = null;

    // Transformation matrix.
    this.ctm = new Matrix();

    // Font selection.
    this.font = null;
    this.fontSize = 0;

    // Text spacing and layout.
    this.characterSpacing = 0;
    this.wordSpacing = 0;
    this.horizontalScaling = 1.0f;
    this.leading = 0;
    this.rise = 0;

    // Rendering flags.
    this.renderMode = 0;
    this.knockout = true;
}

/**
* Copy constructor: takes over every field of the given state, including
* the stroking and non-stroking colors.
* @param source    the GraphicsState object to copy from
*/
public GraphicsState(GraphicsState source){
    // The values below are copied by reference/value without cloning; they are
    // immutable (font being the possible exception), so sharing them is safe.

    // Colors.
    this.colorStroke = source.colorStroke;
    this.colorNonStroke = source.colorNonStroke;

    // Transformation matrix.
    this.ctm = source.ctm;

    // Font selection.
    this.font = source.font;
    this.fontSize = source.fontSize;

    // Text spacing and layout.
    this.characterSpacing = source.characterSpacing;
    this.wordSpacing = source.wordSpacing;
    this.horizontalScaling = source.horizontalScaling;
    this.leading = source.leading;
    this.rise = source.rise;

    // Rendering flags.
    this.renderMode = source.renderMode;
    this.knockout = source.knockout;
}

iTextSharp.text.pdf.parser.PdfContentStreamProcessor.cs

//append to end of method PopulateOperators()
    // Each color operator gets its own handler instance. In every pair below the
    // upper-case operator is wired to the stroking handler and the lower-case
    // operator to the non-stroking handler.

    // Gray operators.
    RegisterContentOperator("G", new SetStrokingGray());
    RegisterContentOperator("g", new SetNonStrokingGray());
    // RGB operators.
    RegisterContentOperator("RG", new SetStrokingRGB());
    RegisterContentOperator("rg", new SetNonStrokingRGB());
    // CMYK operators.
    RegisterContentOperator("K", new SetStrokingCMYK());
    RegisterContentOperator("k", new SetNonStrokingCMYK());
    // CS/cs, SC/sc and SCN/scn all share the "general" handlers, which inspect
    // the operand list to decide what color (if any) to report.
    RegisterContentOperator("CS", new SetStrokingGeneral());
    RegisterContentOperator("cs", new SetNonStrokingGeneral());
    RegisterContentOperator("SC", new SetStrokingGeneral());
    RegisterContentOperator("sc", new SetNonStrokingGeneral());
    RegisterContentOperator("SCN", new SetStrokingGeneral());
    RegisterContentOperator("scn", new SetNonStrokingGeneral());

//add new classes:
/**
* Base class for the color content operators. Subclasses turn the operand
* list into a color via GetColor; Invoke then stores that color on the
* graphics state at the top of the processor's stack, in either the
* stroking or the non-stroking slot depending on the configured style.
*/
public abstract class SetColorBase : IContentOperator {
    public enum ColorStyle { Stroke = 1, NonStroke = 2 };
    public enum ColorSpace { RGB = 1, CMYK = 2, Gray = 3, Other = 4 };

    // Which slot of the graphics state Invoke writes to.
    private ColorStyle style;
    // Color space tag supplied by the subclass; stored but not read in this class.
    private ColorSpace space;

    /**
    * @param colorStyle    whether this operator sets the stroking or non-stroking color
    * @param colorSpace    the color space tag associated with this operator
    */
    public SetColorBase(ColorStyle colorStyle, ColorSpace colorSpace) {
        style = colorStyle;
        space = colorSpace;
    }

    /**
    * Builds the color described by the operands.
    * @param oper        the operator literal being processed
    * @param operands    the operand list passed to Invoke
    * @return the resulting color; implementations may return null
    */
    public abstract BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands);

    /**
    * Computes the color for this operator and records it on the current
    * graphics state. A null result from GetColor is stored as-is.
    */
    public void Invoke(PdfContentStreamProcessor processor, PdfLiteral oper, List<PdfObject> operands) {
        BaseColor parsed = GetColor(oper, operands);
        GraphicsState current = processor.gsStack.Peek();
        switch (style) {
            case ColorStyle.Stroke:
                current.colorStroke = parsed;
                break;
            case ColorStyle.NonStroke:
                current.colorNonStroke = parsed;
                break;
        }
    }
}
/**
* Stroking gray handler: builds a GrayColor from the first operand.
*/
private class SetStrokingGray : SetColorBase {
    public SetStrokingGray() : base(ColorStyle.Stroke, ColorSpace.Gray) { }

    /**
    * @return a GrayColor built from the float value of operands[0]
    */
    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        // The first operand is expected to be numeric; the cast throws otherwise.
        float grayLevel = ((PdfNumber)operands[0]).FloatValue;
        return new GrayColor(grayLevel);
    }
}
/**
* Non-stroking gray handler: builds a GrayColor from the first operand.
*/
private class SetNonStrokingGray : SetColorBase {
    public SetNonStrokingGray() : base(ColorStyle.NonStroke, ColorSpace.Gray) { }

    /**
    * @return a GrayColor built from the float value of operands[0]
    */
    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        // The first operand is expected to be numeric; the cast throws otherwise.
        float grayLevel = ((PdfNumber)operands[0]).FloatValue;
        return new GrayColor(grayLevel);
    }
}
/**
* Stroking RGB handler: builds a BaseColor from the first three operands.
*/
private class SetStrokingRGB : SetColorBase {
    public SetStrokingRGB() : base(ColorStyle.Stroke, ColorSpace.RGB) { }

    /**
    * @return a BaseColor built from the float values of operands[0..2]
    */
    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        // Operands are read in red, green, blue order; each cast throws on a non-number.
        PdfNumber red = (PdfNumber)operands[0];
        PdfNumber green = (PdfNumber)operands[1];
        PdfNumber blue = (PdfNumber)operands[2];

        return new BaseColor(red.FloatValue, green.FloatValue, blue.FloatValue);
    }
}
/**
* Non-stroking RGB handler: builds a BaseColor from the first three operands.
*/
private class SetNonStrokingRGB : SetColorBase {
    public SetNonStrokingRGB() : base(ColorStyle.NonStroke, ColorSpace.RGB) { }

    /**
    * @return a BaseColor built from the float values of operands[0..2]
    */
    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        // Operands are read in red, green, blue order; each cast throws on a non-number.
        PdfNumber red = (PdfNumber)operands[0];
        PdfNumber green = (PdfNumber)operands[1];
        PdfNumber blue = (PdfNumber)operands[2];

        return new BaseColor(red.FloatValue, green.FloatValue, blue.FloatValue);
    }
}
/**
* Stroking CMYK handler: builds a CMYKColor from the first four operands.
*/
private class SetStrokingCMYK : SetColorBase {
    public SetStrokingCMYK() : base(ColorStyle.Stroke, ColorSpace.CMYK) { }

    /**
    * @return a CMYKColor built from the float values of operands[0..3]
    */
    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        // Operands are read in cyan, magenta, yellow, black order;
        // each cast throws on a non-number.
        PdfNumber cyan = (PdfNumber)operands[0];
        PdfNumber magenta = (PdfNumber)operands[1];
        PdfNumber yellow = (PdfNumber)operands[2];
        PdfNumber black = (PdfNumber)operands[3];

        return new CMYKColor(cyan.FloatValue, magenta.FloatValue, yellow.FloatValue, black.FloatValue);
    }
}
/**
* Non-stroking CMYK handler: builds a CMYKColor from the first four operands.
*/
private class SetNonStrokingCMYK : SetColorBase {
    public SetNonStrokingCMYK() : base(ColorStyle.NonStroke, ColorSpace.CMYK) { }

    /**
    * @return a CMYKColor built from the float values of operands[0..3]
    */
    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        // Operands are read in cyan, magenta, yellow, black order;
        // each cast throws on a non-number.
        PdfNumber cyan = (PdfNumber)operands[0];
        PdfNumber magenta = (PdfNumber)operands[1];
        PdfNumber yellow = (PdfNumber)operands[2];
        PdfNumber black = (PdfNumber)operands[3];

        return new CMYKColor(cyan.FloatValue, magenta.FloatValue, yellow.FloatValue, black.FloatValue);
    }
}
/**
* Non-stroking handler for the general color operators (registered for
* cs, sc and scn). The color space in effect is not tracked, so the color
* is guessed from the shape of the operand list.
*
* NOTE(review): the Count checks presumably include the operator literal as
* the final entry of operands (Count == 2 means one real operand, Count == 4
* means three) - confirm against PdfContentStreamProcessor.
*/
private class SetNonStrokingGeneral : SetColorBase {
    public SetNonStrokingGeneral() : base(ColorStyle.NonStroke, ColorSpace.Other) { }

    /**
    * @param oper        the operator literal being processed
    * @param operands    the operand list for the operator
    * @return BaseColor(0) for a single zero-valued number or a single name,
    *         an RGB BaseColor for three numeric components, otherwise null
    *         (the color could not be determined)
    */
    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        if (operands.Count == 2) {
            PdfObject single = operands[0];
            // Compare the real value against zero. IntValue truncates, so a
            // fractional component such as 0.5 used to be reported as 0 here.
            if (single.IsNumber() && ((PdfNumber)single).FloatValue == 0f) {
                return new BaseColor(0);
            }
            if (single.IsName()) {
                return new BaseColor(0);
            }
            return null;
        }

        // Only treat the operands as RGB when all three really are numbers.
        // scn may carry a trailing pattern name after its components, and an
        // unchecked cast of that name would throw InvalidCastException.
        if (operands.Count == 4
            && operands[0].IsNumber()
            && operands[1].IsNumber()
            && operands[2].IsNumber()) {
            PdfNumber r = (PdfNumber)operands[0];
            PdfNumber g = (PdfNumber)operands[1];
            PdfNumber b = (PdfNumber)operands[2];
            return new BaseColor(r.FloatValue, g.FloatValue, b.FloatValue);
        }

        // Any other operand shape: no color can be derived.
        return null;
    }
}
/**
* Stroking handler for the general color operators (registered for
* CS, SC and SCN). The color space in effect is not tracked, so the color
* is guessed from the shape of the operand list.
*
* NOTE(review): the Count checks presumably include the operator literal as
* the final entry of operands (Count == 2 means one real operand, Count == 4
* means three) - confirm against PdfContentStreamProcessor.
*/
private class SetStrokingGeneral : SetColorBase {
    public SetStrokingGeneral() : base(ColorStyle.Stroke, ColorSpace.Other) { }

    /**
    * @param oper        the operator literal being processed
    * @param operands    the operand list for the operator
    * @return BaseColor(0) for a single zero-valued number or a single name,
    *         an RGB BaseColor for three numeric components, otherwise null
    *         (the color could not be determined)
    */
    public override BaseColor GetColor(PdfLiteral oper, List<PdfObject> operands) {
        if (operands.Count == 2) {
            PdfObject single = operands[0];
            // Compare the real value against zero. IntValue truncates, so a
            // fractional component such as 0.5 used to be reported as 0 here.
            if (single.IsNumber() && ((PdfNumber)single).FloatValue == 0f) {
                return new BaseColor(0);
            }
            if (single.IsName()) {
                return new BaseColor(0);
            }
            return null;
        }

        // Only treat the operands as RGB when all three really are numbers.
        // SCN may carry a trailing pattern name after its components, and an
        // unchecked cast of that name would throw InvalidCastException.
        if (operands.Count == 4
            && operands[0].IsNumber()
            && operands[1].IsNumber()
            && operands[2].IsNumber()) {
            PdfNumber r = (PdfNumber)operands[0];
            PdfNumber g = (PdfNumber)operands[1];
            PdfNumber b = (PdfNumber)operands[2];
            return new BaseColor(r.FloatValue, g.FloatValue, b.FloatValue);
        }

        // Any other operand shape: no color can be derived.
        return null;
    }
}

iTextSharp.text.pdf.parser.TextRenderInfo.cs

//new methods
/**
* Gets the stroking color from the graphics state held by this render info.
* @return the value of the graphics state's GetColorStroke()
*/
public BaseColor GetColorStroke() {
    BaseColor strokeColor = this.gs.GetColorStroke();
    return strokeColor;
}
/**
* Gets the non-stroking color from the graphics state held by this render info.
* @return the value of the graphics state's GetColorNonStroke()
*/
public BaseColor GetColorNonStroke() {
    BaseColor nonStrokeColor = this.gs.GetColorNonStroke();
    return nonStrokeColor;
}

This code is very experimental, but so far it works pretty well. Depending on which tool generates the PDF, different things can happen. Word’s built-in PDF generator seems to take the easier route and just emits simple RGB values. Adobe’s PDF plug-in appears to do the same thing in a more complicated way, creating “named” color spaces (I think), but I’m not completely sure how to use them yet.

8 Responses to 'Getting color information from iTextSharp’s TextRenderInfo and ITextExtractionStrategy'

Subscribe to comments with RSS or TrackBack to 'Getting color information from iTextSharp’s TextRenderInfo and ITextExtractionStrategy'.


  1. Fatal error: Call to undefined function avatar_by_id() in /home/cjhascom/public_html/wp-content/themes/journalist/comments.php on line 37