/MarkupToPDF/Common/LocationTextExtractionStrategyWithPosition.cs - MARKUS - 일정관리

markus / MarkupToPDF / Common / LocationTextExtractionStrategyWithPosition.cs @ 05009a0e

       using iTextSharp.text.pdf.parser;
       using System;
       using System.Collections.Generic;
       using System.Linq;
       using System.Text;
       using System.Threading.Tasks;
       using static iTextSharp.text.pdf.parser.LocationTextExtractionStrategy;
       namespace MarkupToPDF.Common
+      {
           /// <summary>
           /// Text extraction strategy that keeps every rendered chunk together with its location,
           /// so the text can be returned line by line with the start coordinates of each line.
           /// </summary>
           public class LocationTextExtractionStrategyWithPosition : LocationTextExtractionStrategy
           {
               private readonly List<TextChunk> collectedChunks = new List<TextChunk>();
               private readonly ITextChunkLocationStrategy locationStrategy;

               /// <summary>
               /// Creates a renderer whose chunk locations are produced by a new
               /// <c>TextChunkLocationStrategyDefaultImp</c>.
               /// </summary>
               public LocationTextExtractionStrategyWithPosition() : this(new TextChunkLocationStrategyDefaultImp())
               {
               }

               /// <summary>
               /// Creates a renderer that uses a custom strategy to build the location of each
               /// chunk from the incoming <c>TextRenderInfo</c>.
               /// </summary>
               /// <param name="strat">the custom strategy</param>
               public LocationTextExtractionStrategyWithPosition(ITextChunkLocationStrategy strat)
               {
                   locationStrategy = strat;
               }

               /// <summary>True when <paramref name="str"/> is non-empty and begins with a blank.</summary>
               private bool StartsWithSpace(string str)
               {
                   return str.Length != 0 && str[0] == ' ';
               }

               /// <summary>True when <paramref name="str"/> is non-empty and ends with a blank.</summary>
               private bool EndsWithSpace(string str)
               {
                   return str.Length != 0 && str[str.Length - 1] == ' ';
               }

               /// <summary>
               /// Applies an optional filter to the collected chunks.
               /// </summary>
               /// <param name="textChunks">all chunks gathered while rendering</param>
               /// <param name="filter">the filter to apply; when null the input list itself is returned</param>
               /// <returns>the input list (no filter) or a new list holding the accepted chunks</returns>
               private List<TextChunk> FilterChunks(List<TextChunk> textChunks, ITextChunkFilter filter)
               {
                   return filter == null
                       ? textChunks
                       : textChunks.Where(candidate => filter.Accept(candidate)).ToList();
               }

               /// <summary>
               /// Builds the entry that opens a new line: the chunk text plus the chunk start
               /// point, with both coordinates passed through <c>Utilities.PointsToMillimeters</c>.
               /// </summary>
               private static TextLocation OpenLine(TextChunk chunk)
               {
                   return new TextLocation
                   {
                       Text = chunk.Text,
                       X = iTextSharp.text.Utilities.PointsToMillimeters(chunk.Location.StartLocation[0]),
                       Y = iTextSharp.text.Utilities.PointsToMillimeters(chunk.Location.StartLocation[1])
                   };
               }

               /// <summary>
               /// Records one rendered piece of text together with its location.
               /// </summary>
               /// <param name="renderInfo">render information supplied by the parser</param>
               public override void RenderText(TextRenderInfo renderInfo)
               {
                   LineSegment baseline = renderInfo.GetBaseline();
                   if (renderInfo.GetRise() != 0)
                   {
                       // Take the rise back out of the baseline so that super/subscript text is
                       // placed on the baseline of the text it is relative to.
                       baseline = baseline.TransformBy(new Matrix(0, -renderInfo.GetRise()));
                   }

                   collectedChunks.Add(new TextChunk(renderInfo.GetText(), locationStrategy.CreateLocation(renderInfo, baseline)));
               }

               /// <summary>
               /// Sorts the collected chunks and merges chunks that share a line into one entry.
               /// </summary>
               /// <returns>one <see cref="TextLocation"/> per detected line, positioned at the line's first chunk</returns>
               public IList<TextLocation> GetLocations()
               {
                   // With a null filter the backing list itself comes back, so the sort below
                   // reorders the collected chunks in place.
                   List<TextChunk> ordered = FilterChunks(collectedChunks, null);
                   ordered.Sort();

                   var lines = new List<TextLocation>();
                   TextLocation currentLine = null;
                   TextChunk previous = null;
                   foreach (TextChunk chunk in ordered)
                   {
                       bool continuesLine = previous != null && chunk.SameLine(previous);
                       if (continuesLine)
                       {
                           // A blank is inserted only at a word boundary, and only when neither
                           // the previous text ends with one nor the current text starts with one.
                           bool needsSeparator = IsChunkAtWordBoundary(chunk, previous)
                               && !StartsWithSpace(chunk.Text)
                               && !EndsWithSpace(previous.Text);
                           string addition = needsSeparator ? " " : "";
                           addition += chunk.Text;
                           currentLine.Text += addition;
                       }
                       else
                       {
                           // First chunk overall, or first chunk of a different line.
                           currentLine = OpenLine(chunk);
                           lines.Add(currentLine);
                       }

                       previous = chunk;
                   }

                   return lines;
               }
           }
           /// <summary>
           /// A run of extracted text together with the coordinates assigned to it.
           /// </summary>
           public class TextLocation
           {
               /// <summary>The extracted text.</summary>
               public string Text { get; set; }

               /// <summary>Horizontal coordinate associated with the text.</summary>
               public float X { get; set; }

               /// <summary>Vertical coordinate associated with the text.</summary>
               public float Y { get; set; }
           }
           /// <summary>
           /// Location-based text extraction strategy that, besides the resulting text, keeps the
           /// rendered chunks and a per-line <see cref="TextInfo"/> with corner points.
           /// </summary>
           public class LocationTextExtractionStrategyEx : LocationTextExtractionStrategy
           {
               private List<TextChunk> m_locationResult = new List<TextChunk>();
               private List<TextInfo> m_TextLocationInfo = new List<TextInfo>();

               /// <summary>
               /// The chunks collected by <see cref="RenderText"/>; sorted in place by <see cref="GetResultantText"/>.
               /// </summary>
               public List<TextChunk> LocationResult
               {
                   get { return m_locationResult; }
               }

               /// <summary>
               /// One entry per detected line; (re)built by every call to <see cref="GetResultantText"/>.
               /// </summary>
               public List<TextInfo> TextLocationInfo
               {
                   get { return m_TextLocationInfo; }
               }

               /// <summary>
               /// Creates a new LocationTextExtractionStrategyEx
               /// </summary>
               public LocationTextExtractionStrategyEx()
               {
               }

               /// <summary>
               /// True when <paramref name="str"/> is non-empty and begins with a blank.
               /// Null or empty text counts as "no blank" instead of raising an index error.
               /// </summary>
               private static bool StartsWithSpace(string str)
               {
                   return !string.IsNullOrEmpty(str) && str[0] == ' ';
               }

               /// <summary>
               /// True when <paramref name="str"/> is non-empty and ends with a blank.
               /// Null or empty text counts as "no blank" instead of raising an index error.
               /// </summary>
               private static bool EndsWithSpace(string str)
               {
                   return !string.IsNullOrEmpty(str) && str[str.Length - 1] == ' ';
               }

               /// <summary>
               /// Returns the result so far and rebuilds <see cref="TextLocationInfo"/>.
               /// </summary>
               /// <returns>a String with the resulting text, lines separated by '\n'</returns>
               public override String GetResultantText()
               {
                   m_locationResult.Sort();

                   // Start from an empty list: without this every further call would append a
                   // second copy of all line entries.
                   m_TextLocationInfo.Clear();

                   StringBuilder sb = new StringBuilder();
                   TextChunk lastChunk = null;
                   TextInfo lastTextInfo = null;
                   foreach (TextChunk chunk in m_locationResult)
                   {
                       if (lastChunk != null && chunk.sameLine(lastChunk))
                       {
                           float dist = chunk.distanceFromEndOf(lastChunk);

                           // A blank is inserted when this chunk starts more than one space width
                           // before the end of the previous one, or when there is a gap of more
                           // than half a space width and neither side already supplies a blank.
                           bool startsBeforePreviousEnd = dist < -chunk.CharSpaceWidth;
                           bool gapWithoutBlank = dist > chunk.CharSpaceWidth / 2.0f
                               && !StartsWithSpace(chunk.Text)
                               && !EndsWithSpace(lastChunk.Text);
                           if (startsBeforePreviousEnd || gapWithoutBlank)
                           {
                               sb.Append(' ');
                               lastTextInfo.addSpace();
                           }

                           sb.Append(chunk.Text);
                           lastTextInfo.appendText(chunk);
                       }
                       else
                       {
                           // Every line but the first is preceded by a line break.
                           if (lastChunk != null)
                           {
                               sb.Append('\n');
                           }

                           sb.Append(chunk.Text);
                           lastTextInfo = new TextInfo(chunk);
                           m_TextLocationInfo.Add(lastTextInfo);
                       }

                       lastChunk = chunk;
                   }

                   return sb.ToString();
               }

               /// <summary>
               /// Records the rendered text with its baseline end points, single-space width and
               /// ascent/descent lines.
               /// </summary>
               /// <param name="renderInfo">render information supplied by the parser</param>
               public override void RenderText(TextRenderInfo renderInfo)
               {
                   LineSegment segment = renderInfo.GetBaseline();
                   TextChunk location = new TextChunk(renderInfo.GetText(), segment.GetStartPoint(), segment.GetEndPoint(), renderInfo.GetSingleSpaceWidth(), renderInfo.GetAscentLine(), renderInfo.GetDescentLine());
                   m_locationResult.Add(location);
               }

               /// <summary>
               /// A chunk of text, its orientation, and its location relative to the orientation vector.
               /// </summary>
               public class TextChunk : IComparable, ICloneable
               {
                   string m_text;
                   Vector m_startLocation;
                   Vector m_endLocation;
                   Vector m_orientationVector;
                   int m_orientationMagnitude;
                   int m_distPerpendicular;
                   float m_distParallelStart;
                   float m_distParallelEnd;
                   float m_charSpaceWidth;

                   /// <summary>Ascent line handed to the constructor.</summary>
                   public LineSegment AscentLine;

                   /// <summary>Descent line handed to the constructor.</summary>
                   public LineSegment DecentLine;

                   /// <summary>
                   /// Creates a new chunk from this chunk's text, end points, space width and
                   /// ascent/descent lines; the derived values are recomputed by the constructor.
                   /// </summary>
                   public object Clone()
                   {
                       TextChunk copy = new TextChunk(m_text, m_startLocation, m_endLocation, m_charSpaceWidth, AscentLine, DecentLine);
                       return copy;
                   }

                   /// <summary>The text of the chunk.</summary>
                   public string Text
                   {
                       get { return m_text; }
                       set { m_text = value; }
                   }

                   /// <summary>The width of a single space as supplied at construction.</summary>
                   public float CharSpaceWidth
                   {
                       get { return m_charSpaceWidth; }
                       set { m_charSpaceWidth = value; }
                   }

                   /// <summary>Start point of the chunk. Setting it does not recompute the derived sort keys.</summary>
                   public Vector StartLocation
                   {
                       get { return m_startLocation; }
                       set { m_startLocation = value; }
                   }

                   /// <summary>End point of the chunk. Setting it does not recompute the derived sort keys.</summary>
                   public Vector EndLocation
                   {
                       get { return m_endLocation; }
                       set { m_endLocation = value; }
                   }

                   /// <summary>
                   /// Represents a chunk of text, its orientation, and location relative to the orientation vector
                   /// </summary>
                   /// <param name="txt">the text of the chunk</param>
                   /// <param name="startLoc">start point of the chunk</param>
                   /// <param name="endLoc">end point of the chunk</param>
                   /// <param name="charSpaceWidth">width of a single space</param>
                   /// <param name="ascentLine">ascent line of the chunk</param>
                   /// <param name="decentLine">descent line of the chunk</param>
                   public TextChunk(string txt, Vector startLoc, Vector endLoc, float charSpaceWidth, LineSegment ascentLine, LineSegment decentLine)
                   {
                       m_text = txt;
                       m_startLocation = startLoc;
                       m_endLocation = endLoc;
                       m_charSpaceWidth = charSpaceWidth;
                       AscentLine = ascentLine;
                       DecentLine = decentLine;

                       Vector direction = m_endLocation.Subtract(m_startLocation);
                       if (direction.Length == 0)
                       {
                           // A zero-length chunk has no direction; normalizing it would divide by
                           // zero and poison every derived value with NaN. Treat it as horizontal.
                           direction = new Vector(1, 0, 0);
                       }
                       m_orientationVector = direction.Normalize();
                       m_orientationMagnitude = (int)(Math.Atan2(m_orientationVector[Vector.I2], m_orientationVector[Vector.I1]) * 1000);

                       // see http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
                       // the two vectors we are crossing are in the same plane, so the result will be purely
                       // in the z-axis (out of plane) direction, so we just take the I3 component of the result
                       Vector origin = new Vector(0, 0, 1);
                       m_distPerpendicular = (int)(m_startLocation.Subtract(origin)).Cross(m_orientationVector)[Vector.I3];
                       m_distParallelStart = m_orientationVector.Dot(m_startLocation);
                       m_distParallelEnd = m_orientationVector.Dot(m_endLocation);
                   }

                   /// <summary>
                   /// true if this location is on the same line as the other text chunk
                   /// </summary>
                   /// <param name="textChunkToCompare">the location to compare to</param>
                   /// <returns>true if orientation and perpendicular distance of both chunks are equal</returns>
                   public bool sameLine(TextChunk textChunkToCompare)
                   {
                       if (m_orientationMagnitude != textChunkToCompare.m_orientationMagnitude) return false;
                       if (m_distPerpendicular != textChunkToCompare.m_distPerpendicular) return false;
                       return true;
                   }

                   /// <summary>
                   /// Computes the distance between the end of 'other' and the beginning of this chunk
                   /// in the direction of this chunk's orientation vector.  Note that it's a bad idea
                   /// to call this for chunks that aren't on the same line and orientation, but we don't
                   /// explicitly check for that condition for performance reasons.
                   /// </summary>
                   /// <param name="other">the chunk whose end is the reference point</param>
                   /// <returns>this chunk's parallel start distance minus the other chunk's parallel end distance</returns>
                   public float distanceFromEndOf(TextChunk other)
                   {
                       float distance = m_distParallelStart - other.m_distParallelEnd;
                       return distance;
                   }

                   /// <summary>
                   /// Compares based on orientation, perpendicular distance, then parallel distance
                   /// </summary>
                   /// <param name="obj">the chunk to compare with</param>
                   /// <returns>negative, zero or positive when this chunk sorts before, equal to or after <paramref name="obj"/></returns>
                   /// <exception cref="ArgumentException"><paramref name="obj"/> is null or not a TextChunk</exception>
                   public int CompareTo(object obj)
                   {
                       TextChunk rhs = obj as TextChunk;
                       if (rhs == null) throw new ArgumentException("Object is not a TextChunk");
                       if (ReferenceEquals(this, rhs)) return 0;

                       // CompareTo on the primitive keys avoids overflow from subtracting ints and
                       // keeps the ordering antisymmetric: chunks with identical keys compare as
                       // equal instead of each claiming to come after the other.
                       int rslt = m_orientationMagnitude.CompareTo(rhs.m_orientationMagnitude);
                       if (rslt != 0) return rslt;
                       rslt = m_distPerpendicular.CompareTo(rhs.m_distPerpendicular);
                       if (rslt != 0) return rslt;
                       return m_distParallelStart.CompareTo(rhs.m_distParallelStart);
                   }
               }

               /// <summary>
               /// Text of one line together with two corner points taken from its chunks.
               /// </summary>
               public class TextInfo
               {
                   /// <summary>Start point of the first chunk's ascent line.</summary>
                   public Vector TopLeft;

                   /// <summary>End point of the most recently added chunk's descent line.</summary>
                   public Vector BottomRight;

                   private string m_Text;

                   /// <summary>The text accumulated so far.</summary>
                   public string Text
                   {
                       get { return m_Text; }
                   }

                   /// <summary>
                   /// Create a TextInfo.
                   /// </summary>
                   /// <param name="initialTextChunk">the chunk that opens the line</param>
                   public TextInfo(TextChunk initialTextChunk)
                   {
                       TopLeft = initialTextChunk.AscentLine.GetStartPoint();
                       BottomRight = initialTextChunk.DecentLine.GetEndPoint();
                       m_Text = initialTextChunk.Text;
                   }

                   /// <summary>
                   /// Add more text to this TextInfo and move the bottom-right corner to the end of the added chunk.
                   /// </summary>
                   /// <param name="additionalTextChunk">the chunk to append</param>
                   public void appendText(TextChunk additionalTextChunk)
                   {
                       BottomRight = additionalTextChunk.DecentLine.GetEndPoint();
                       m_Text += additionalTextChunk.Text;
                   }

                   /// <summary>
                   /// Add a space to the TextInfo.  This will leave the endpoint out of sync with the text.
                   /// The assumption is that you will add more text after the space which will correct the endpoint.
                   /// </summary>
                   public void addSpace()
                   {
                       m_Text += ' ';
                   }
               }
           }
           /// <summary>
           /// One extracted piece of text, the page it was found on, and its bounding rectangle.
           /// </summary>
           public class HoneyheadTEXT
           {
               /// <summary>Number of the page the text was read from.</summary>
               public int PageNo { get; set; }

               /// <summary>The extracted text.</summary>
               public string Text { get; set; }

               /// <summary>Bounding rectangle of the text.</summary>
               public pdftron.PDF.Rect Bounds { get; set; }
           }
           /// <summary>
           /// Reads the words of a PDF file, with their bounding boxes, through the PDFTron SDK.
           /// </summary>
           public class PDFSearchText
           {
               /// <summary>
               /// Initializes PDFNet with the license key.
               /// </summary>
               public PDFSearchText()
               {
                   // NOTE(review): the license key is embedded in source; consider loading it
                   // from protected configuration instead of keeping it in the repository.
                   pdftron.PDFNet.Initialize("daelim.co.kr(Doftech Corp):CPU:2::W:AMC(20120315):EF6E886F25A414FFB5F8C1F2999CF2DA33DC6C5164315BAF7011B87AF0FA");
               }

               /// <summary>
               /// Collects every word of one page, or of all pages, of a PDF file.
               /// </summary>
               /// <param name="inputPdf">path handed to the <c>PDFDoc</c> constructor</param>
               /// <param name="pageNumber">page to read when <paramref name="IsSearchAllPage"/> is false</param>
               /// <param name="IsSearchAllPage">true to read pages 1..page count and ignore <paramref name="pageNumber"/></param>
               /// <returns>
               /// one entry per word with its text, bounding box and page number; empty when the
               /// requested page does not exist
               /// </returns>
               public List<HoneyheadTEXT> GetPageText(string inputPdf, int pageNumber, bool IsSearchAllPage)
               {
                   List<HoneyheadTEXT> txtSet = new List<HoneyheadTEXT>();
                   using (pdftron.PDF.PDFDoc doc = new pdftron.PDF.PDFDoc(inputPdf))
                   {
                       doc.InitSecurityHandler();
                       if (IsSearchAllPage)
                       {
                           // The document is not modified here, so the count is read once.
                           int pageCount = doc.GetPageCount();
                           for (int i = 1; i <= pageCount; i++)
                           {
                               AppendPageWords(doc.GetPage(i), i, txtSet);
                           }
                       }
                       else
                       {
                           AppendPageWords(doc.GetPage(pageNumber), pageNumber, txtSet);
                       }
                   }
                   return txtSet;
               }

               /// <summary>
               /// Appends one entry per word found on <paramref name="page"/> to <paramref name="target"/>.
               /// </summary>
               /// <param name="page">the page to read; a null page contributes nothing</param>
               /// <param name="pageNo">page number stored in every produced entry</param>
               /// <param name="target">list that receives the entries</param>
               private static void AppendPageWords(pdftron.PDF.Page page, int pageNo, List<HoneyheadTEXT> target)
               {
                   if (page == null)
                   {
                       return;
                   }

                   using (pdftron.PDF.TextExtractor txt = new pdftron.PDF.TextExtractor())
                   {
                       txt.Begin(page);  // Read the page.
                       for (pdftron.PDF.TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                       {
                           for (pdftron.PDF.TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                           {
                               target.Add(new HoneyheadTEXT { Text = word.GetString(), Bounds = word.GetBBox(), PageNo = pageNo });
                           }
                       }
                   }
               }
           }
+      }

프로젝트

일반

사용자정보

MARKUS

markus / MarkupToPDF / Common / LocationTextExtractionStrategyWithPosition.cs @ 05009a0e