markus / MarkupToPDF / Common / LocationTextExtractionStrategyWithPosition.cs @ 05009a0e
이력 | 보기 | 이력해설 | 다운로드 (19.6 KB)
1 |
using iTextSharp.text.pdf.parser; |
---|---|
2 |
using System; |
3 |
using System.Collections.Generic; |
4 |
using System.Linq; |
5 |
using System.Text; |
6 |
using System.Threading.Tasks; |
7 |
using static iTextSharp.text.pdf.parser.LocationTextExtractionStrategy; |
8 |
|
9 |
namespace MarkupToPDF.Common |
10 |
{ |
11 |
/// <summary>
/// Text extraction strategy that keeps its own list of rendered text chunks and can
/// report them as <see cref="TextLocation"/> entries: one entry per detected line,
/// carrying the merged text of that line and the start position of its first chunk
/// converted from points to millimeters.
/// </summary>
public class LocationTextExtractionStrategyWithPosition : LocationTextExtractionStrategy
{
    /// <summary>Every chunk received through <see cref="RenderText"/>, in arrival order until sorted.</summary>
    private readonly List<TextChunk> locationalResult = new List<TextChunk>();

    /// <summary>Factory used to build the location of each rendered chunk.</summary>
    private readonly ITextChunkLocationStrategy tclStrat;

    /// <summary>
    /// Creates the strategy with a <see cref="TextChunkLocationStrategyDefaultImp"/> location factory.
    /// </summary>
    public LocationTextExtractionStrategyWithPosition() : this(new TextChunkLocationStrategyDefaultImp())
    {
    }

    /// <summary>
    /// Creates the strategy with a custom factory for the chunk locations that are
    /// derived from each <see cref="TextRenderInfo"/>.
    /// </summary>
    /// <param name="strat">The custom location strategy.</param>
    public LocationTextExtractionStrategyWithPosition(ITextChunkLocationStrategy strat)
    {
        tclStrat = strat;
    }

    /// <summary>Tells whether <paramref name="str"/> is non-empty and begins with a blank.</summary>
    private bool StartsWithSpace(string str)
    {
        return str.Length != 0 && str[0] == ' ';
    }

    /// <summary>Tells whether <paramref name="str"/> is non-empty and ends with a blank.</summary>
    private bool EndsWithSpace(string str)
    {
        return str.Length != 0 && str[str.Length - 1] == ' ';
    }

    /// <summary>
    /// Applies <paramref name="filter"/> to <paramref name="textChunks"/>.
    /// </summary>
    /// <param name="textChunks">All chunks collected by this strategy.</param>
    /// <param name="filter">The filter to apply; when null the input list itself is returned.</param>
    /// <returns>
    /// The very same list instance when no filter is given, otherwise a new list holding
    /// the accepted chunks in their original order.
    /// </returns>
    private List<TextChunk> filterTextChunks(List<TextChunk> textChunks, ITextChunkFilter filter)
    {
        return filter == null
            ? textChunks
            : textChunks.FindAll(candidate => filter.Accept(candidate));
    }

    /// <summary>
    /// Records the rendered text together with a location built from its baseline.
    /// </summary>
    /// <param name="renderInfo">The render operation reported by the parser.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        LineSegment baseline = renderInfo.GetBaseline();
        float rise = renderInfo.GetRise();

        if (rise != 0)
        {
            // Undo the rise so that super/subscript text lands on the baseline of the
            // text it is positioned relative to.
            baseline = baseline.TransformBy(new Matrix(0, -rise));
        }

        locationalResult.Add(new TextChunk(renderInfo.GetText(), tclStrat.CreateLocation(renderInfo, baseline)));
    }

    /// <summary>
    /// Sorts the collected chunks and merges the ones lying on the same line.
    /// </summary>
    /// <returns>
    /// One <see cref="TextLocation"/> per line; X and Y are the start location of the
    /// line's first chunk, converted to millimeters.
    /// </returns>
    public IList<TextLocation> GetLocations()
    {
        // No filter is applied, so this is the internal list itself and it is sorted in place.
        List<TextChunk> orderedChunks = filterTextChunks(locationalResult, null);
        orderedChunks.Sort();

        var lines = new List<TextLocation>();
        TextChunk previous = null;

        foreach (TextChunk current in orderedChunks)
        {
            bool continuesLine = previous != null && current.SameLine(previous);

            if (continuesLine)
            {
                // A blank is inserted only at a word boundary, and only when neither the
                // end of the previous chunk nor the start of this one already is a blank.
                bool needsSeparator = IsChunkAtWordBoundary(current, previous)
                    && !StartsWithSpace(current.Text)
                    && !EndsWithSpace(previous.Text);

                TextLocation openLine = lines[lines.Count - 1];
                openLine.Text += needsSeparator ? " " + current.Text : current.Text;
            }
            else
            {
                // Very first chunk, or the first chunk of a new line.
                lines.Add(ToTextLocation(current));
            }

            previous = current;
        }

        return lines;
    }

    /// <summary>Builds the entry that opens a new line, positioned at the chunk's start location.</summary>
    private static TextLocation ToTextLocation(TextChunk chunk)
    {
        return new TextLocation
        {
            Text = chunk.Text,
            X = iTextSharp.text.Utilities.PointsToMillimeters(chunk.Location.StartLocation[0]),
            Y = iTextSharp.text.Utilities.PointsToMillimeters(chunk.Location.StartLocation[1])
        };
    }
}
145 |
|
146 |
/// <summary>
/// A piece of extracted text together with the coordinates it was found at.
/// </summary>
public class TextLocation
{
    /// <summary>Horizontal coordinate of the text.</summary>
    public float X { get; set; }

    /// <summary>Vertical coordinate of the text.</summary>
    public float Y { get; set; }

    /// <summary>The extracted text.</summary>
    public string Text { get; set; }
}
153 |
|
154 |
/// <summary>
/// Text extraction strategy that stores every rendered text chunk together with its
/// ascent and descent lines. When the resulting text is requested, the chunks are
/// grouped into lines and one <see cref="TextInfo"/> (text plus two corner points)
/// is recorded per line.
/// </summary>
public class LocationTextExtractionStrategyEx : LocationTextExtractionStrategy
{
    private readonly List<TextChunk> m_locationResult = new List<TextChunk>();
    private readonly List<TextInfo> m_TextLocationInfo = new List<TextInfo>();

    /// <summary>
    /// All chunks received through <see cref="RenderText"/>. The list is sorted in place
    /// by <see cref="GetResultantText"/>.
    /// </summary>
    public List<TextChunk> LocationResult
    {
        get { return m_locationResult; }
    }

    /// <summary>
    /// Per-line text information. It is (re)built by <see cref="GetResultantText"/>,
    /// so it is empty until that method has been called.
    /// </summary>
    public List<TextInfo> TextLocationInfo
    {
        get { return m_TextLocationInfo; }
    }

    /// <summary>
    /// Creates a new LocationTextExtractionStrategyEx.
    /// </summary>
    public LocationTextExtractionStrategyEx()
    {
    }

    /// <summary>
    /// Returns the text collected so far and rebuilds <see cref="TextLocationInfo"/>.
    /// Chunks on the same line are concatenated (with a blank inserted where a gap is
    /// detected); a line feed separates different lines.
    /// </summary>
    /// <returns>A string with the resulting text.</returns>
    public override String GetResultantText()
    {
        m_locationResult.Sort();

        // Start from scratch: without this, every call would append the same lines
        // again and TextLocationInfo would grow with duplicates.
        m_TextLocationInfo.Clear();

        StringBuilder sb = new StringBuilder();
        TextChunk lastChunk = null;
        TextInfo lastTextInfo = null;

        foreach (TextChunk chunk in m_locationResult)
        {
            if (lastChunk != null && chunk.sameLine(lastChunk))
            {
                float dist = chunk.distanceFromEndOf(lastChunk);

                // The chunk starts more than one space width before the end of the previous one.
                bool startsBeforePreviousEnd = dist < -chunk.CharSpaceWidth;

                // There is a gap of more than half a space width, and neither side of the
                // gap already provides a blank. The helpers tolerate empty chunk text,
                // which direct indexing (Text[0]) would not.
                bool gapWithoutBlank = dist > chunk.CharSpaceWidth / 2.0f
                    && !StartsWithSpace(chunk.Text)
                    && !EndsWithSpace(lastChunk.Text);

                if (startsBeforePreviousEnd || gapWithoutBlank)
                {
                    sb.Append(' ');
                    lastTextInfo.addSpace();
                }

                sb.Append(chunk.Text);
                lastTextInfo.appendText(chunk);
            }
            else
            {
                // First chunk overall, or first chunk of a new line.
                if (lastChunk != null)
                {
                    sb.Append('\n');
                }

                sb.Append(chunk.Text);
                lastTextInfo = new TextInfo(chunk);
                m_TextLocationInfo.Add(lastTextInfo);
            }

            lastChunk = chunk;
        }

        return sb.ToString();
    }

    /// <summary>
    /// Stores the rendered text with its baseline end points, single-space width and
    /// ascent/descent lines for later processing by <see cref="GetResultantText"/>.
    /// </summary>
    /// <param name="renderInfo">The render operation reported by the parser.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        LineSegment segment = renderInfo.GetBaseline();
        TextChunk location = new TextChunk(
            renderInfo.GetText(),
            segment.GetStartPoint(),
            segment.GetEndPoint(),
            renderInfo.GetSingleSpaceWidth(),
            renderInfo.GetAscentLine(),
            renderInfo.GetDescentLine());
        m_locationResult.Add(location);
    }

    /// <summary>True when <paramref name="str"/> is non-empty and begins with a blank.</summary>
    private static bool StartsWithSpace(string str)
    {
        return !string.IsNullOrEmpty(str) && str[0] == ' ';
    }

    /// <summary>True when <paramref name="str"/> is non-empty and ends with a blank.</summary>
    private static bool EndsWithSpace(string str)
    {
        return !string.IsNullOrEmpty(str) && str[str.Length - 1] == ' ';
    }

    /// <summary>
    /// A chunk of text, its orientation, and its location relative to the orientation vector.
    /// </summary>
    public class TextChunk : IComparable, ICloneable
    {
        string m_text;
        Vector m_startLocation;
        Vector m_endLocation;
        Vector m_orientationVector;
        int m_orientationMagnitude;
        int m_distPerpendicular;
        float m_distParallelStart;
        float m_distParallelEnd;
        float m_charSpaceWidth;

        /// <summary>Ascent line handed to the constructor.</summary>
        public LineSegment AscentLine;

        /// <summary>Descent line handed to the constructor.</summary>
        public LineSegment DecentLine;

        /// <summary>
        /// Creates a new chunk from this chunk's text, end points, space width and
        /// ascent/descent lines.
        /// </summary>
        public object Clone()
        {
            TextChunk copy = new TextChunk(m_text, m_startLocation, m_endLocation, m_charSpaceWidth, AscentLine, DecentLine);
            return copy;
        }

        /// <summary>The text of the chunk.</summary>
        public string Text
        {
            get { return m_text; }
            set { m_text = value; }
        }

        /// <summary>Width of a single space character as passed to the constructor.</summary>
        public float CharSpaceWidth
        {
            get { return m_charSpaceWidth; }
            set { m_charSpaceWidth = value; }
        }

        /// <summary>
        /// Start point of the chunk. Setting it does not recompute the values derived
        /// from it in the constructor.
        /// </summary>
        public Vector StartLocation
        {
            get { return m_startLocation; }
            set { m_startLocation = value; }
        }

        /// <summary>
        /// End point of the chunk. Setting it does not recompute the values derived
        /// from it in the constructor.
        /// </summary>
        public Vector EndLocation
        {
            get { return m_endLocation; }
            set { m_endLocation = value; }
        }

        /// <summary>
        /// Creates a chunk and derives its orientation and its distances relative to
        /// the orientation vector from the two end points.
        /// </summary>
        /// <param name="txt">The text of the chunk.</param>
        /// <param name="startLoc">Start point of the chunk.</param>
        /// <param name="endLoc">End point of the chunk.</param>
        /// <param name="charSpaceWidth">Width of a single space character.</param>
        /// <param name="ascentLine">Ascent line of the chunk.</param>
        /// <param name="decentLine">Descent line of the chunk.</param>
        public TextChunk(string txt, Vector startLoc, Vector endLoc, float charSpaceWidth, LineSegment ascentLine, LineSegment decentLine)
        {
            m_text = txt;
            m_startLocation = startLoc;
            m_endLocation = endLoc;
            m_charSpaceWidth = charSpaceWidth;
            AscentLine = ascentLine;
            DecentLine = decentLine;

            // A chunk whose start and end coincide has no direction of its own.
            // Normalizing the zero vector would produce NaN components and a meaningless
            // orientation, so such a chunk is treated as pointing along the x axis.
            Vector direction = m_endLocation.Subtract(m_startLocation);
            bool hasNoDirection = direction[Vector.I1] == 0f
                && direction[Vector.I2] == 0f
                && direction[Vector.I3] == 0f;
            m_orientationVector = hasNoDirection ? new Vector(1, 0, 0) : direction.Normalize();

            m_orientationMagnitude = (int)(Math.Atan2(m_orientationVector[Vector.I2], m_orientationVector[Vector.I1]) * 1000);

            // see http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
            // the two vectors we are crossing are in the same plane, so the result will be purely
            // in the z-axis (out of plane) direction, so we just take the I3 component of the result
            Vector origin = new Vector(0, 0, 1);
            float perpendicular = m_startLocation.Subtract(origin).Cross(m_orientationVector)[Vector.I3];
            m_distPerpendicular = (int)perpendicular;

            m_distParallelStart = m_orientationVector.Dot(m_startLocation);
            m_distParallelEnd = m_orientationVector.Dot(m_endLocation);
        }

        /// <summary>
        /// Tells whether this chunk is on the same line as another chunk, i.e. both the
        /// orientation and the perpendicular distance (as integers) are equal.
        /// </summary>
        /// <param name="textChunkToCompare">The chunk to compare to.</param>
        /// <returns>true if this chunk is on the same line as the other.</returns>
        public bool sameLine(TextChunk textChunkToCompare)
        {
            return m_orientationMagnitude == textChunkToCompare.m_orientationMagnitude
                && m_distPerpendicular == textChunkToCompare.m_distPerpendicular;
        }

        /// <summary>
        /// Computes the distance between the end of 'other' and the beginning of this chunk
        /// in the direction of this chunk's orientation vector. Note that it's a bad idea
        /// to call this for chunks that aren't on the same line and orientation, but we don't
        /// explicitly check for that condition for performance reasons.
        /// </summary>
        /// <param name="other">The chunk that precedes this one.</param>
        /// <returns>
        /// The signed distance from the end of 'other' to the start of this chunk;
        /// negative when this chunk starts before 'other' ends.
        /// </returns>
        public float distanceFromEndOf(TextChunk other)
        {
            return m_distParallelStart - other.m_distParallelEnd;
        }

        /// <summary>
        /// Compares based on orientation, perpendicular distance, then parallel start distance.
        /// </summary>
        /// <param name="obj">The chunk to compare to.</param>
        /// <returns>
        /// A negative value, zero or a positive value when this chunk sorts before,
        /// equal to or after <paramref name="obj"/>.
        /// </returns>
        /// <exception cref="ArgumentException"><paramref name="obj"/> is null or not a TextChunk.</exception>
        public int CompareTo(object obj)
        {
            TextChunk rhs = obj as TextChunk;
            if (rhs == null)
            {
                throw new ArgumentException("Object is not a TextChunk");
            }

            if (ReferenceEquals(this, rhs))
            {
                return 0;
            }

            int rslt = m_orientationMagnitude.CompareTo(rhs.m_orientationMagnitude);
            if (rslt != 0)
            {
                return rslt;
            }

            rslt = m_distPerpendicular.CompareTo(rhs.m_distPerpendicular);
            if (rslt != 0)
            {
                return rslt;
            }

            // Chunks starting at exactly the same position compare as equal. Answering
            // "greater" in both directions would make the ordering inconsistent, which
            // List.Sort is not required to tolerate.
            return m_distParallelStart.CompareTo(rhs.m_distParallelStart);
        }
    }

    /// <summary>
    /// Text of one line together with two corner points taken from the ascent line of
    /// its first chunk and the descent line of its last chunk.
    /// </summary>
    public class TextInfo
    {
        /// <summary>Start point of the first chunk's ascent line.</summary>
        public Vector TopLeft;

        /// <summary>End point of the descent line of the chunk added last.</summary>
        public Vector BottomRight;

        private string m_Text;

        /// <summary>The text accumulated so far.</summary>
        public string Text
        {
            get { return m_Text; }
        }

        /// <summary>
        /// Create a TextInfo from the first chunk of a line.
        /// </summary>
        /// <param name="initialTextChunk">The chunk that opens the line.</param>
        public TextInfo(TextChunk initialTextChunk)
        {
            TopLeft = initialTextChunk.AscentLine.GetStartPoint();
            BottomRight = initialTextChunk.DecentLine.GetEndPoint();
            m_Text = initialTextChunk.Text;
        }

        /// <summary>
        /// Add more text to this TextInfo and move the bottom-right point to the end of
        /// the added chunk's descent line.
        /// </summary>
        /// <param name="additionalTextChunk">The chunk to append.</param>
        public void appendText(TextChunk additionalTextChunk)
        {
            BottomRight = additionalTextChunk.DecentLine.GetEndPoint();
            m_Text += additionalTextChunk.Text;
        }

        /// <summary>
        /// Add a space to the TextInfo. This will leave the endpoint out of sync with the text.
        /// The assumption is that you will add more text after the space which will correct the endpoint.
        /// </summary>
        public void addSpace()
        {
            m_Text += ' ';
        }
    }
}
413 |
|
414 |
/// <summary>
/// One piece of text found in a PDF: the page it is on, the text itself and its bounding box.
/// </summary>
public class HoneyheadTEXT
{
    /// <summary>Number of the page the text was found on.</summary>
    public int PageNo { get; set; }

    /// <summary>The text that was found.</summary>
    public string Text { get; set; }

    /// <summary>Bounding box of the text.</summary>
    public pdftron.PDF.Rect Bounds { get; set; }
}
420 |
|
421 |
/// <summary>
/// Extracts the words of a PDF document, together with their bounding boxes, using PDFNet.
/// </summary>
public class PDFSearchText
{
    /// <summary>
    /// Initializes PDFNet with the license key.
    /// </summary>
    public PDFSearchText()
    {
        // NOTE(review): the license key is embedded in source; consider loading it from configuration.
        pdftron.PDFNet.Initialize("daelim.co.kr(Doftech Corp):CPU:2::W:AMC(20120315):EF6E886F25A414FFB5F8C1F2999CF2DA33DC6C5164315BAF7011B87AF0FA");
    }

    /// <summary>
    /// Collects every word of one page, or of all pages, of a PDF file.
    /// </summary>
    /// <param name="inputPdf">Path of the PDF file to open.</param>
    /// <param name="pageNumber">
    /// Page to read when <paramref name="IsSearchAllPage"/> is false; ignored otherwise.
    /// </param>
    /// <param name="IsSearchAllPage">
    /// true to read every page starting from page 1, false to read only <paramref name="pageNumber"/>.
    /// </param>
    /// <returns>
    /// One entry per word with its text, bounding box and page number. The list is empty
    /// when the requested page does not exist; pages that cannot be obtained are skipped.
    /// </returns>
    public List<HoneyheadTEXT> GetPageText(string inputPdf, int pageNumber, bool IsSearchAllPage)
    {
        List<HoneyheadTEXT> txtSet = new List<HoneyheadTEXT>();

        using (pdftron.PDF.PDFDoc doc = new pdftron.PDF.PDFDoc(inputPdf))
        {
            doc.InitSecurityHandler();

            if (IsSearchAllPage)
            {
                int pageCount = doc.GetPageCount();
                for (int i = 1; i <= pageCount; i++)
                {
                    AppendPageWords(doc.GetPage(i), i, txtSet);
                }
            }
            else
            {
                AppendPageWords(doc.GetPage(pageNumber), pageNumber, txtSet);
            }
        }

        return txtSet;
    }

    /// <summary>
    /// Adds every word of <paramref name="page"/> to <paramref name="target"/>, tagged
    /// with <paramref name="pageNo"/>. Does nothing when the page is null.
    /// </summary>
    private static void AppendPageWords(pdftron.PDF.Page page, int pageNo, List<HoneyheadTEXT> target)
    {
        if (page == null)
        {
            return;
        }

        using (pdftron.PDF.TextExtractor txt = new pdftron.PDF.TextExtractor())
        {
            txt.Begin(page); // Read the page.

            for (pdftron.PDF.TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
            {
                for (pdftron.PDF.TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                {
                    target.Add(new HoneyheadTEXT
                    {
                        Text = word.GetString(),
                        Bounds = word.GetBBox(),
                        PageNo = pageNo
                    });
                }
            }
        }
    }
}
517 |
} |
518 |
} |