#42 Wrong Text Location

open
nobody
5
2013-02-23
2013-02-23
Mohsen Afshin
No

Text extraction on the attached PDF works, but the character and sentence coordinates are wrong.
Also, on some PDFs the space character is not detected correctly.

I've applied a temporary fix in the Extract method of ContentScanner.cs:
(It works on some PDFs, but on others it adds excess spaces :-( )
(I estimate the width of a reference space character from the width of the first character of each text string, scaled by the space-to-character width ratio of the system default font.)

        /// <summary>
        /// Recursively walks a content-stream level, collecting every shown text string
        /// into <c>textStrings</c>. Before a string is collected, gaps between adjacent
        /// characters that look like an undetected space get a synthetic space character.
        /// </summary>
        /// <param name="level">Scanner positioned on the level to walk; <c>null</c> is ignored.</param>
        private void Extract(ContentScanner level)
        {
            if (level == null)
                return;

            while (level.MoveNext())
            {
                ContentObject content = level.Current;

                if (content is ShowText)
                {
                    var currentWrapper = (TextStringWrapper)level.CurrentWrapper;

                    InsertMissingSpaces(currentWrapper);
                    textStrings.Add(currentWrapper);
                }
                else if (content is ContainerObject)
                {
                    Extract(level.ChildLevel);
                }
            }
        }

        /// <summary>
        /// Inserts a synthetic space between two adjacent characters of
        /// <paramref name="wrapper"/> whenever the horizontal gap between them is
        /// approximately one space wide.
        /// </summary>
        /// <remarks>
        /// The expected space width is estimated by scaling the box width of the first
        /// character by the space-to-glyph width ratio of the system default font.
        /// This is a heuristic: the system font is only a stand-in for the font actually
        /// used by the document.
        /// </remarks>
        /// <param name="wrapper">Text string whose character list is patched in place.</param>
        private static void InsertMissingSpaces(TextStringWrapper wrapper)
        {
            // Maximum deviation between a gap and the estimated space width for the
            // gap to be treated as a space.
            const float GapTolerance = 1.0f;
            // Geometry of the synthetic space box, relative to the right edge/top of
            // the preceding character.
            const float SpaceOffsetX = 0.2f;
            const float SpaceBoxWidth = 0.5f;
            const float SpaceBoxHeight = 0.2f;

            var chars = wrapper.TextChars;
            if (chars.Count == 0)
                return; // Nothing to measure against.

            float charFontSize = chars[0].Box.Width;
            if (!(charFontSize > 0.0f))
                return; // Degenerate (or NaN) reference box: no reliable estimate.

            float charSystemSize;
            float spaceSystemSize;
            // Bitmap, Graphics and StringFormat wrap native GDI+ handles; release them
            // deterministically instead of waiting for finalization.
            using (var bitmap = new Bitmap(1, 1))
            using (var graphics = Graphics.FromImage(bitmap))
            using (var format = new StringFormat(StringFormat.GenericTypographic))
            {
                // The default MeasureString format pads the result and ignores trailing
                // spaces, which distorts the space/glyph ratio. The typographic format
                // removes the padding; MeasureTrailingSpaces makes " " measurable.
                format.FormatFlags |= StringFormatFlags.MeasureTrailingSpaces;
                graphics.TextRenderingHint = System.Drawing.Text.TextRenderingHint.AntiAlias;

                Font referenceFont = SystemFonts.DefaultFont;
                charSystemSize = graphics.MeasureString(
                    chars[0].Value.ToString(), referenceFont, PointF.Empty, format).Width;
                spaceSystemSize = graphics.MeasureString(
                    " ", referenceFont, PointF.Empty, format).Width;
            }

            if (!(charSystemSize > 0.0f))
                return; // Reference glyph has no measurable width: avoid dividing by zero.

            float spaceFontSize = (charFontSize * spaceSystemSize) / charSystemSize;

            // Count is re-read on every pass because the list grows as spaces are inserted.
            for (int i = 0; i < chars.Count - 1; i++)
            {
                // Never add a space next to an existing one (this also skips the
                // synthetic space inserted on the previous pass).
                if (chars[i].Value == ' ' || chars[i + 1].Value == ' ')
                    continue;

                RectangleF box = chars[i].Box;
                float rightEdge = box.Left + box.Width;
                float nextLeft = chars[i + 1].Box.Left;

                if (Math.Abs(Math.Abs(nextLeft - rightEdge) - spaceFontSize) < GapTolerance)
                {
                    chars.Insert(i + 1,
                        new TextChar(
                            ' ',
                            new RectangleF(rightEdge + SpaceOffsetX, box.Top, SpaceBoxWidth, SpaceBoxHeight),
                            null,
                            true));
                }
            }
        }
1 Attachments

Discussion