markus / MarkupToPDF / Common / LocationTextExtractionStrategyWithPosition.cs @ a4e5d148
History | View | Annotate | Download (19.6 KB)
1 | 787a4489 | KangIngu | using iTextSharp.text.pdf.parser; |
---|---|---|---|
2 | using System; |
||
3 | using System.Collections.Generic; |
||
4 | using System.Linq; |
||
5 | using System.Text; |
||
6 | using System.Threading.Tasks; |
||
7 | using static iTextSharp.text.pdf.parser.LocationTextExtractionStrategy; |
||
8 | |||
9 | namespace MarkupToPDF.Common |
||
10 | { |
||
/// <summary>
/// Text extraction strategy that records every rendered text chunk and can
/// report the collected text line by line together with the position at
/// which each line starts.
/// </summary>
public class LocationTextExtractionStrategyWithPosition : LocationTextExtractionStrategy
{
    /// <summary>Every chunk seen by <see cref="RenderText"/>, in arrival order until sorted.</summary>
    private readonly List<TextChunk> locationalResult = new List<TextChunk>();

    /// <summary>Factory used to build the location object of each chunk.</summary>
    private readonly ITextChunkLocationStrategy tclStrat;

    /// <summary>
    /// Creates the strategy with a <c>TextChunkLocationStrategyDefaultImp</c>
    /// as the chunk location factory.
    /// </summary>
    public LocationTextExtractionStrategyWithPosition() : this(new TextChunkLocationStrategyDefaultImp())
    {
    }

    /// <summary>
    /// Creates the strategy with a custom factory for the location objects
    /// that are derived from each <c>TextRenderInfo</c>.
    /// </summary>
    /// <param name="strat">The custom location strategy.</param>
    public LocationTextExtractionStrategyWithPosition(ITextChunkLocationStrategy strat)
    {
        tclStrat = strat;
    }

    /// <summary>Returns true when <paramref name="str"/> is non-empty and begins with a blank.</summary>
    private bool StartsWithSpace(string str)
    {
        return str.Length != 0 && str[0] == ' ';
    }

    /// <summary>Returns true when <paramref name="str"/> is non-empty and ends with a blank.</summary>
    private bool EndsWithSpace(string str)
    {
        return str.Length != 0 && str[str.Length - 1] == ' ';
    }

    /// <summary>
    /// Applies <paramref name="filter"/> to <paramref name="textChunks"/>.
    /// </summary>
    /// <param name="textChunks">All chunks this strategy collected.</param>
    /// <param name="filter">The filter to apply; when null the input list itself is returned.</param>
    /// <returns>A new list of accepted chunks, or the input list when no filter is given.</returns>
    private List<TextChunk> filterTextChunks(List<TextChunk> textChunks, ITextChunkFilter filter)
    {
        return filter == null
            ? textChunks
            : textChunks.Where(candidate => filter.Accept(candidate)).ToList();
    }

    /// <summary>
    /// Records the rendered text as a chunk. A non-zero text rise is removed
    /// from the baseline first, so super/subscript text is placed on the
    /// baseline of the text it is relative to.
    /// </summary>
    /// <param name="renderInfo">The render information of the text being drawn.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        LineSegment baseline = renderInfo.GetBaseline();

        float rise = renderInfo.GetRise();
        if (rise != 0)
        {
            baseline = baseline.TransformBy(new Matrix(0, -rise));
        }

        locationalResult.Add(
            new TextChunk(renderInfo.GetText(), tclStrat.CreateLocation(renderInfo, baseline)));
    }

    /// <summary>
    /// Builds one <see cref="TextLocation"/> per detected line of text.
    /// Chunks on the same line are concatenated into the entry of the first
    /// chunk of that line, whose start position the entry keeps.
    /// </summary>
    /// <returns>The text lines in sorted chunk order.</returns>
    public IList<TextLocation> GetLocations()
    {
        // With a null filter the helper hands back locationalResult itself,
        // so the sort reorders the collected chunks in place.
        List<TextChunk> orderedChunks = filterTextChunks(locationalResult, null);
        orderedChunks.Sort();

        var lines = new List<TextLocation>();
        TextChunk previous = null;

        foreach (TextChunk current in orderedChunks)
        {
            bool continuesLine = previous != null && current.SameLine(previous);

            if (continuesLine)
            {
                // Insert a blank only at a word boundary, and only when neither
                // side of the seam already provides one.
                bool needsSpace = IsChunkAtWordBoundary(current, previous)
                    && !StartsWithSpace(current.Text)
                    && !EndsWithSpace(previous.Text);

                string addition = needsSpace ? " " + current.Text : current.Text;
                lines[lines.Count - 1].Text += addition;
            }
            else
            {
                lines.Add(ToTextLocation(current));
            }

            previous = current;
        }

        return lines;
    }

    /// <summary>
    /// Creates the entry for the first chunk of a line; the chunk's start
    /// point is converted from points to millimeters.
    /// </summary>
    private static TextLocation ToTextLocation(TextChunk chunk)
    {
        return new TextLocation
        {
            Text = chunk.Text,
            X = iTextSharp.text.Utilities.PointsToMillimeters(chunk.Location.StartLocation[0]),
            Y = iTextSharp.text.Utilities.PointsToMillimeters(chunk.Location.StartLocation[1])
        };
    }
}
||
145 | |||
/// <summary>
/// A piece of extracted text together with the coordinates at which it starts.
/// </summary>
public class TextLocation
{
    /// <summary>
    /// Horizontal start coordinate. Instances produced by
    /// LocationTextExtractionStrategyWithPosition.GetLocations hold this in millimeters.
    /// </summary>
    public float X { get; set; }

    /// <summary>
    /// Vertical start coordinate. Instances produced by
    /// LocationTextExtractionStrategyWithPosition.GetLocations hold this in millimeters.
    /// </summary>
    public float Y { get; set; }

    /// <summary>The extracted text.</summary>
    public string Text { get; set; }
}
||
153 | |||
/// <summary>
/// Text extraction strategy that, besides producing the page text, keeps the
/// raw chunks (<see cref="LocationResult"/>) and one <see cref="TextInfo"/>
/// per detected text line (<see cref="TextLocationInfo"/>).
/// </summary>
public class LocationTextExtractionStrategyEx : LocationTextExtractionStrategy
{
    private readonly List<TextChunk> m_locationResult = new List<TextChunk>();
    private readonly List<TextInfo> m_TextLocationInfo = new List<TextInfo>();

    /// <summary>The chunks collected by <see cref="RenderText"/>.</summary>
    public List<TextChunk> LocationResult
    {
        get { return m_locationResult; }
    }

    /// <summary>
    /// One entry per text line; rebuilt by every call to <see cref="GetResultantText"/>.
    /// </summary>
    public List<TextInfo> TextLocationInfo
    {
        get { return m_TextLocationInfo; }
    }

    /// <summary>
    /// Creates a new LocationTextExtractionStrategyEx.
    /// </summary>
    public LocationTextExtractionStrategyEx()
    {
    }

    /// <summary>
    /// Sorts the collected chunks, joins them into text (lines separated by
    /// '\n') and rebuilds <see cref="TextLocationInfo"/> with one entry per line.
    /// </summary>
    /// <returns>A string with the resulting text.</returns>
    public override String GetResultantText()
    {
        m_locationResult.Sort();

        // Start from a clean slate: without this, every additional call would
        // append a second copy of each line to TextLocationInfo.
        m_TextLocationInfo.Clear();

        StringBuilder sb = new StringBuilder();
        TextChunk lastChunk = null;
        TextInfo lastTextInfo = null;

        foreach (TextChunk chunk in m_locationResult)
        {
            if (lastChunk != null && chunk.sameLine(lastChunk))
            {
                float dist = chunk.distanceFromEndOf(lastChunk);

                // A blank is inserted when the chunk starts more than one space
                // width before the end of the previous chunk, or when there is a
                // gap of more than half a space width and neither side of the
                // seam already carries a blank. The helpers tolerate empty
                // chunk text, which direct indexing would not.
                bool overlapsBackwards = dist < -chunk.CharSpaceWidth;
                bool gapNeedsSpace = dist > chunk.CharSpaceWidth / 2.0f
                    && !StartsWithSpace(chunk.Text)
                    && !EndsWithSpace(lastChunk.Text);

                if (overlapsBackwards || gapNeedsSpace)
                {
                    sb.Append(' ');
                    lastTextInfo.addSpace();
                }

                sb.Append(chunk.Text);
                lastTextInfo.appendText(chunk);
            }
            else
            {
                // First chunk overall, or first chunk of a new line.
                if (lastChunk != null)
                {
                    sb.Append('\n');
                }
                sb.Append(chunk.Text);
                lastTextInfo = new TextInfo(chunk);
                m_TextLocationInfo.Add(lastTextInfo);
            }

            lastChunk = chunk;
        }

        return sb.ToString();
    }

    /// <summary>
    /// Records the rendered text as a chunk with its baseline end points,
    /// single-space width, ascent line and descent line.
    /// </summary>
    /// <param name="renderInfo">The render information of the text being drawn.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        LineSegment segment = renderInfo.GetBaseline();
        TextChunk location = new TextChunk(
            renderInfo.GetText(),
            segment.GetStartPoint(),
            segment.GetEndPoint(),
            renderInfo.GetSingleSpaceWidth(),
            renderInfo.GetAscentLine(),
            renderInfo.GetDescentLine());
        m_locationResult.Add(location);
    }

    /// <summary>True when <paramref name="text"/> is non-empty and begins with a blank.</summary>
    private static bool StartsWithSpace(string text)
    {
        return !string.IsNullOrEmpty(text) && text[0] == ' ';
    }

    /// <summary>True when <paramref name="text"/> is non-empty and ends with a blank.</summary>
    private static bool EndsWithSpace(string text)
    {
        return !string.IsNullOrEmpty(text) && text[text.Length - 1] == ' ';
    }

    /// <summary>
    /// A chunk of text, its orientation, and its location relative to the
    /// orientation vector.
    /// </summary>
    public class TextChunk : IComparable, ICloneable
    {
        string m_text;
        Vector m_startLocation;
        Vector m_endLocation;
        // Unit vector pointing from the start to the end of the baseline.
        Vector m_orientationVector;
        // Baseline angle (atan2, radians) multiplied by 1000 and truncated.
        int m_orientationMagnitude;
        // Truncated perpendicular offset; equal values mean "same line".
        int m_distPerpendicular;
        // Start and end of the chunk projected onto the orientation vector.
        float m_distParallelStart;
        float m_distParallelEnd;
        float m_charSpaceWidth;

        public LineSegment AscentLine;
        public LineSegment DecentLine;

        /// <summary>
        /// Creates a new chunk from the same text, end points, space width and
        /// ascent/descent lines.
        /// </summary>
        public object Clone()
        {
            return new TextChunk(m_text, m_startLocation, m_endLocation, m_charSpaceWidth, AscentLine, DecentLine);
        }

        public string Text
        {
            get { return m_text; }
            set { m_text = value; }
        }

        public float CharSpaceWidth
        {
            get { return m_charSpaceWidth; }
            set { m_charSpaceWidth = value; }
        }

        public Vector StartLocation
        {
            get { return m_startLocation; }
            set { m_startLocation = value; }
        }

        public Vector EndLocation
        {
            get { return m_endLocation; }
            set { m_endLocation = value; }
        }

        /// <summary>
        /// Represents a chunk of text, its orientation, and location relative to the orientation vector.
        /// </summary>
        /// <param name="txt">The text of the chunk.</param>
        /// <param name="startLoc">Start point of the baseline.</param>
        /// <param name="endLoc">End point of the baseline.</param>
        /// <param name="charSpaceWidth">Width of a single space character.</param>
        /// <param name="ascentLine">Ascent line of the text.</param>
        /// <param name="decentLine">Descent line of the text.</param>
        public TextChunk(string txt, Vector startLoc, Vector endLoc, float charSpaceWidth, LineSegment ascentLine, LineSegment decentLine)
        {
            m_text = txt;
            m_startLocation = startLoc;
            m_endLocation = endLoc;
            m_charSpaceWidth = charSpaceWidth;
            AscentLine = ascentLine;
            DecentLine = decentLine;

            // A zero-length baseline has no direction, and normalising it is
            // expected to give non-finite components that would poison the
            // sort keys below. Fall back to a horizontal unit vector.
            Vector direction = m_endLocation.Subtract(m_startLocation);
            if (direction.Length == 0)
            {
                direction = new Vector(1, 0, 0);
            }
            m_orientationVector = direction.Normalize();
            m_orientationMagnitude = (int)(Math.Atan2(m_orientationVector[Vector.I2], m_orientationVector[Vector.I1]) * 1000);

            // see http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
            // the two vectors we are crossing are in the same plane, so the result will be purely
            // in the z-axis (out of plane) direction, so we just take the I3 component of the result
            Vector origin = new Vector(0, 0, 1);
            float perpendicular = m_startLocation.Subtract(origin).Cross(m_orientationVector)[Vector.I3];
            m_distPerpendicular = (int)perpendicular;

            m_distParallelStart = m_orientationVector.Dot(m_startLocation);
            m_distParallelEnd = m_orientationVector.Dot(m_endLocation);
        }

        /// <summary>
        /// true if this location is on the same line as the other text chunk
        /// </summary>
        /// <param name="textChunkToCompare">the location to compare to</param>
        /// <returns>true if orientation and perpendicular distance both match</returns>
        public bool sameLine(TextChunk textChunkToCompare)
        {
            return m_orientationMagnitude == textChunkToCompare.m_orientationMagnitude
                && m_distPerpendicular == textChunkToCompare.m_distPerpendicular;
        }

        /// <summary>
        /// Computes the distance between the end of 'other' and the beginning of this chunk
        /// in the direction of this chunk's orientation vector. Note that it's a bad idea
        /// to call this for chunks that aren't on the same line and orientation, but we don't
        /// explicitly check for that condition for performance reasons.
        /// </summary>
        /// <param name="other">The chunk that precedes this one.</param>
        /// <returns>this chunk's parallel start minus the parallel end of 'other'</returns>
        public float distanceFromEndOf(TextChunk other)
        {
            return m_distParallelStart - other.m_distParallelEnd;
        }

        /// <summary>
        /// Compares based on orientation, perpendicular distance, then parallel distance.
        /// </summary>
        /// <param name="obj">The chunk to compare with.</param>
        /// <returns>Negative, zero or positive as this chunk sorts before, with or after <paramref name="obj"/>.</returns>
        /// <exception cref="ArgumentException"><paramref name="obj"/> is null or not a TextChunk.</exception>
        public int CompareTo(object obj)
        {
            TextChunk rhs = obj as TextChunk;
            if (rhs == null) throw new ArgumentException("Object is not a TextChunk");

            if (ReferenceEquals(this, rhs)) return 0;

            int rslt = m_orientationMagnitude.CompareTo(rhs.m_orientationMagnitude);
            if (rslt != 0) return rslt;

            rslt = m_distPerpendicular.CompareTo(rhs.m_distPerpendicular);
            if (rslt != 0) return rslt;

            // Two chunks with the same parallel start compare as equal. Returning
            // a non-zero value in both directions would give the sort an
            // inconsistent ordering.
            return m_distParallelStart.CompareTo(rhs.m_distParallelStart);
        }
    }

    /// <summary>
    /// The accumulated text of one line together with two corner points taken
    /// from the ascent and descent lines of its chunks.
    /// </summary>
    public class TextInfo
    {
        public Vector TopLeft;
        public Vector BottomRight;
        private string m_Text;

        public string Text
        {
            get { return m_Text; }
        }

        /// <summary>
        /// Create a TextInfo from the first chunk of a line: the start of its
        /// ascent line becomes TopLeft, the end of its descent line BottomRight.
        /// </summary>
        /// <param name="initialTextChunk">The first chunk of the line.</param>
        public TextInfo(TextChunk initialTextChunk)
        {
            TopLeft = initialTextChunk.AscentLine.GetStartPoint();
            BottomRight = initialTextChunk.DecentLine.GetEndPoint();
            m_Text = initialTextChunk.Text;
        }

        /// <summary>
        /// Add more text to this TextInfo and move BottomRight to the end of
        /// the added chunk's descent line.
        /// </summary>
        /// <param name="additionalTextChunk">The chunk to append.</param>
        public void appendText(TextChunk additionalTextChunk)
        {
            BottomRight = additionalTextChunk.DecentLine.GetEndPoint();
            m_Text += additionalTextChunk.Text;
        }

        /// <summary>
        /// Add a space to the TextInfo. This will leave the endpoint out of sync with the text.
        /// The assumption is that you will add more text after the space which will correct the endpoint.
        /// </summary>
        public void addSpace()
        {
            m_Text += ' ';
        }
    }
}
||
413 | |||
/// <summary>
/// One word found in a PDF: its text, its bounding box and the page it is on.
/// </summary>
public class HoneyheadTEXT
{
    /// <summary>Number of the page the word was read from.</summary>
    public int PageNo { get; set; }

    /// <summary>The word's text.</summary>
    public string Text { get; set; }

    /// <summary>Bounding box of the word as reported by the text extractor.</summary>
    public pdftron.PDF.Rect Bounds { get; set; }
}
||
420 | |||
/// <summary>
/// Reads the words of a PDF, with their bounding boxes, through the PDFTron
/// text extractor.
/// </summary>
public class PDFSearchText
{
    /// <summary>
    /// Initializes the PDFNet library with the license key.
    /// </summary>
    public PDFSearchText()
    {
        // NOTE(review): the license key is embedded in source; consider loading it
        // from configuration instead. Left unchanged here to preserve behavior.
        pdftron.PDFNet.Initialize("daelim.co.kr(Doftech Corp):CPU:2::W:AMC(20120315):EF6E886F25A414FFB5F8C1F2999CF2DA33DC6C5164315BAF7011B87AF0FA");
    }

    /// <summary>
    /// Collects every word of one page, or of all pages, of a PDF file.
    /// </summary>
    /// <param name="inputPdf">Path of the PDF document to open.</param>
    /// <param name="pageNumber">Page to read when <paramref name="IsSearchAllPage"/> is false.</param>
    /// <param name="IsSearchAllPage">True to read every page (page numbers start at 1); false to read only <paramref name="pageNumber"/>.</param>
    /// <returns>
    /// The words in extraction order, each tagged with its page number. The list is
    /// empty when the requested page does not exist.
    /// </returns>
    public List<HoneyheadTEXT> GetPageText(string inputPdf, int pageNumber, bool IsSearchAllPage)
    {
        List<HoneyheadTEXT> txtSet = new List<HoneyheadTEXT>();

        using (pdftron.PDF.PDFDoc doc = new pdftron.PDF.PDFDoc(inputPdf))
        {
            doc.InitSecurityHandler();

            if (IsSearchAllPage)
            {
                // The page count does not change while reading, so query it once.
                int pageCount = doc.GetPageCount();
                for (int i = 1; i <= pageCount; i++)
                {
                    AppendPageWords(doc.GetPage(i), i, txtSet);
                }
            }
            else
            {
                AppendPageWords(doc.GetPage(pageNumber), pageNumber, txtSet);
            }
        }

        return txtSet;
    }

    /// <summary>
    /// Appends every word of <paramref name="page"/> to <paramref name="target"/>;
    /// a null page contributes nothing.
    /// </summary>
    private static void AppendPageWords(pdftron.PDF.Page page, int pageNo, List<HoneyheadTEXT> target)
    {
        if (page == null)
        {
            return;
        }

        using (pdftron.PDF.TextExtractor txt = new pdftron.PDF.TextExtractor())
        {
            txt.Begin(page); // Read the page.

            for (pdftron.PDF.TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
            {
                for (pdftron.PDF.TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                {
                    target.Add(new HoneyheadTEXT
                    {
                        Text = word.GetString(),
                        Bounds = word.GetBBox(),
                        PageNo = pageNo
                    });
                }
            }
        }
    }
}
||
517 | } |
||
518 | } |