프로젝트

일반

사용자정보

통계
| 브랜치(Branch): | 개정판:

markus / MarkupToPDF / Common / LocationTextExtractionStrategyWithPosition.cs @ c8e9b3e4

이력 | 보기 | 이력해설 | 다운로드 (19.6 KB)

1 787a4489 KangIngu
using iTextSharp.text.pdf.parser;
2
using System;
3
using System.Collections.Generic;
4
using System.Linq;
5
using System.Text;
6
using System.Threading.Tasks;
7
using static iTextSharp.text.pdf.parser.LocationTextExtractionStrategy;
8
9
namespace MarkupToPDF.Common
10
{
11
    /// <summary>
    /// Text extraction strategy that keeps every rendered text chunk together with its
    /// location and can report each reconstructed line of text with the position at
    /// which that line starts.
    /// </summary>
    /// <remarks>
    /// <c>RenderText</c> stores chunks in this class's own list and does not call the base
    /// implementation, so results must be read through <c>GetLocations</c>.
    /// </remarks>
    public class LocationTextExtractionStrategyWithPosition : LocationTextExtractionStrategy
    {
        /// <summary>Chunks collected by <c>RenderText</c>; sorted in place by <c>GetLocations</c>.</summary>
        private readonly List<TextChunk> locationalResult = new List<TextChunk>();

        /// <summary>Strategy used to turn a render operation into a chunk location.</summary>
        private readonly ITextChunkLocationStrategy tclStrat;

        /// <summary>
        /// Creates a strategy that uses <c>TextChunkLocationStrategyDefaultImp</c> to locate chunks.
        /// </summary>
        public LocationTextExtractionStrategyWithPosition() : this(new TextChunkLocationStrategyDefaultImp())
        {
        }

        /// <summary>
        /// Creates a strategy with a custom way of creating chunk locations from the
        /// <c>TextRenderInfo</c> of each render operation.
        /// </summary>
        /// <param name="strat">The custom location strategy; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="strat"/> is null.</exception>
        public LocationTextExtractionStrategyWithPosition(ITextChunkLocationStrategy strat)
        {
            // Fail here instead of with a NullReferenceException on the first RenderText call.
            if (strat == null)
            {
                throw new ArgumentNullException(nameof(strat));
            }

            tclStrat = strat;
        }

        /// <summary>Returns true when <paramref name="str"/> is non-empty and begins with a space.</summary>
        private static bool StartsWithSpace(string str)
        {
            return !string.IsNullOrEmpty(str) && str[0] == ' ';
        }

        /// <summary>Returns true when <paramref name="str"/> is non-empty and ends with a space.</summary>
        private static bool EndsWithSpace(string str)
        {
            return !string.IsNullOrEmpty(str) && str[str.Length - 1] == ' ';
        }

        /// <summary>
        /// Applies <paramref name="filter"/> to <paramref name="textChunks"/>.
        /// </summary>
        /// <param name="textChunks">All chunks collected during processing.</param>
        /// <param name="filter">The filter to apply. If null, filtering is skipped.</param>
        /// <returns>
        /// <paramref name="textChunks"/> itself when <paramref name="filter"/> is null; otherwise
        /// a new list holding only the accepted chunks.
        /// </returns>
        private static List<TextChunk> FilterTextChunks(List<TextChunk> textChunks, ITextChunkFilter filter)
        {
            if (filter == null)
            {
                return textChunks;
            }

            var filtered = new List<TextChunk>();
            foreach (var textChunk in textChunks)
            {
                if (filter.Accept(textChunk))
                {
                    filtered.Add(textChunk);
                }
            }

            return filtered;
        }

        /// <summary>
        /// Records the rendered text as a chunk located on its baseline.
        /// </summary>
        /// <param name="renderInfo">Information about the text render operation.</param>
        public override void RenderText(TextRenderInfo renderInfo)
        {
            LineSegment segment = renderInfo.GetBaseline();
            if (renderInfo.GetRise() != 0)
            {
                // Remove the rise from the baseline: text from a super/subscript render
                // operation should be treated as part of the baseline of the text it is
                // relative to.
                Matrix riseOffsetTransform = new Matrix(0, -renderInfo.GetRise());
                segment = segment.TransformBy(riseOffsetTransform);
            }

            TextChunk tc = new TextChunk(renderInfo.GetText(), tclStrat.CreateLocation(renderInfo, segment));
            locationalResult.Add(tc);
        }

        /// <summary>
        /// Returns one entry per reconstructed line of text, using every collected chunk.
        /// </summary>
        /// <returns>The lines in sorted chunk order; see <c>GetLocations(ITextChunkFilter)</c>.</returns>
        public IList<TextLocation> GetLocations()
        {
            return GetLocations(null);
        }

        /// <summary>
        /// Returns one entry per reconstructed line of text, using only the chunks accepted
        /// by <paramref name="filter"/>.
        /// </summary>
        /// <param name="filter">Chunk filter; null means every collected chunk is used.</param>
        /// <returns>
        /// A new list in which each entry holds the concatenated text of the chunks on one
        /// line and the start location of the first chunk of that line, converted from
        /// points to millimeters.
        /// </returns>
        public IList<TextLocation> GetLocations(ITextChunkFilter filter)
        {
            // With a null filter this is the internal list itself, so it is sorted in place.
            List<TextChunk> chunks = FilterTextChunks(locationalResult, filter);
            chunks.Sort();

            var textLocations = new List<TextLocation>();
            TextLocation currentLine = null;
            TextChunk lastChunk = null;

            foreach (TextChunk chunk in chunks)
            {
                if (lastChunk != null && chunk.SameLine(lastChunk))
                {
                    // Only insert a blank when the previous text does not already end with
                    // a space and the current text does not already start with one.
                    if (IsChunkAtWordBoundary(chunk, lastChunk) && !StartsWithSpace(chunk.Text) && !EndsWithSpace(lastChunk.Text))
                    {
                        currentLine.Text += ' ';
                    }

                    currentLine.Text += chunk.Text;
                }
                else
                {
                    // First chunk overall, or first chunk of a new line.
                    currentLine = new TextLocation
                    {
                        Text = chunk.Text,
                        X = iTextSharp.text.Utilities.PointsToMillimeters(chunk.Location.StartLocation[0]),
                        Y = iTextSharp.text.Utilities.PointsToMillimeters(chunk.Location.StartLocation[1])
                    };
                    textLocations.Add(currentLine);
                }

                lastChunk = chunk;
            }

            return textLocations;
        }
    }
145
146
    /// <summary>
    /// A run of extracted text together with the coordinates at which it starts.
    /// </summary>
    /// <remarks>
    /// <c>LocationTextExtractionStrategyWithPosition.GetLocations</c> fills
    /// <see cref="X"/> and <see cref="Y"/> with values converted from points to millimeters.
    /// </remarks>
    public class TextLocation
    {
        /// <summary>Horizontal start coordinate of the text.</summary>
        public float X { get; set; }

        /// <summary>Vertical start coordinate of the text.</summary>
        public float Y { get; set; }

        /// <summary>The extracted text.</summary>
        public string Text { get; set; }
    }
153
154
    /// <summary>
    /// Text extraction strategy that, besides the resulting text, records for every
    /// reconstructed line a <see cref="TextInfo"/> built from the ascent and descent
    /// lines of its chunks.
    /// </summary>
    public class LocationTextExtractionStrategyEx : LocationTextExtractionStrategy
    {
        private readonly List<TextChunk> m_locationResult = new List<TextChunk>();
        private readonly List<TextInfo> m_TextLocationInfo = new List<TextInfo>();

        /// <summary>The chunks collected by <c>RenderText</c> (sorted in place by <c>GetResultantText</c>).</summary>
        public List<TextChunk> LocationResult
        {
            get { return m_locationResult; }
        }

        /// <summary>One <see cref="TextInfo"/> per line, rebuilt by every call to <c>GetResultantText</c>.</summary>
        public List<TextInfo> TextLocationInfo
        {
            get { return m_TextLocationInfo; }
        }

        /// <summary>
        /// Creates a new LocationTextExtractionStrategyEx.
        /// </summary>
        public LocationTextExtractionStrategyEx()
        {
        }

        /// <summary>Returns true when <paramref name="str"/> is non-empty and begins with a space.</summary>
        private static bool StartsWithSpace(string str)
        {
            return !string.IsNullOrEmpty(str) && str[0] == ' ';
        }

        /// <summary>Returns true when <paramref name="str"/> is non-empty and ends with a space.</summary>
        private static bool EndsWithSpace(string str)
        {
            return !string.IsNullOrEmpty(str) && str[str.Length - 1] == ' ';
        }

        /// <summary>
        /// Returns the text collected so far, one line of output per line of chunks.
        /// </summary>
        /// <remarks>
        /// Side effects: sorts <see cref="LocationResult"/> in place and rebuilds
        /// <see cref="TextLocationInfo"/> from scratch.
        /// </remarks>
        /// <returns>A string with the resulting text; lines are separated by '\n'.</returns>
        public override String GetResultantText()
        {
            m_locationResult.Sort();

            // Rebuild the per-line info on every call; otherwise a second call would
            // append a duplicate entry for every line.
            m_TextLocationInfo.Clear();

            StringBuilder sb = new StringBuilder();
            TextChunk lastChunk = null;
            TextInfo lastTextInfo = null;
            foreach (TextChunk chunk in m_locationResult)
            {
                if (lastChunk != null && chunk.sameLine(lastChunk))
                {
                    float dist = chunk.distanceFromEndOf(lastChunk);

                    // A space is inserted when this chunk starts more than one space width
                    // before the end of the previous chunk, or when there is a gap of more
                    // than half a space width and neither side already supplies a space.
                    // The helpers tolerate empty chunk text, which plain indexing does not.
                    bool startsFarBehind = dist < -chunk.CharSpaceWidth;
                    bool hasUnspacedGap = dist > chunk.CharSpaceWidth / 2.0f
                                          && !StartsWithSpace(chunk.Text)
                                          && !EndsWithSpace(lastChunk.Text);
                    if (startsFarBehind || hasUnspacedGap)
                    {
                        sb.Append(' ');
                        lastTextInfo.addSpace();
                    }

                    sb.Append(chunk.Text);
                    lastTextInfo.appendText(chunk);
                }
                else
                {
                    // First chunk overall, or first chunk of a new line.
                    if (lastChunk != null)
                    {
                        sb.Append('\n');
                    }

                    sb.Append(chunk.Text);
                    lastTextInfo = new TextInfo(chunk);
                    m_TextLocationInfo.Add(lastTextInfo);
                }

                lastChunk = chunk;
            }

            return sb.ToString();
        }

        /// <summary>
        /// Records the rendered text as a chunk spanning its baseline, keeping the
        /// single-space width and the ascent and descent lines.
        /// </summary>
        /// <param name="renderInfo">Information about the text render operation.</param>
        public override void RenderText(TextRenderInfo renderInfo)
        {
            LineSegment segment = renderInfo.GetBaseline();
            TextChunk location = new TextChunk(
                renderInfo.GetText(),
                segment.GetStartPoint(),
                segment.GetEndPoint(),
                renderInfo.GetSingleSpaceWidth(),
                renderInfo.GetAscentLine(),
                renderInfo.GetDescentLine());
            m_locationResult.Add(location);
        }

        /// <summary>
        /// A chunk of text, its orientation, and its location relative to the orientation vector.
        /// </summary>
        /// <remarks>
        /// The orientation and distance values are computed once in the constructor; the
        /// <see cref="StartLocation"/> and <see cref="EndLocation"/> setters do not recompute them.
        /// </remarks>
        public class TextChunk : IComparable, ICloneable
        {
            string m_text;
            Vector m_startLocation;
            Vector m_endLocation;
            Vector m_orientationVector;
            int m_orientationMagnitude;
            int m_distPerpendicular;
            float m_distParallelStart;
            float m_distParallelEnd;
            float m_charSpaceWidth;

            /// <summary>Ascent line passed to the constructor.</summary>
            public LineSegment AscentLine;

            /// <summary>Descent line passed to the constructor.</summary>
            public LineSegment DecentLine;

            /// <summary>
            /// Creates a new chunk from this chunk's text, locations, space width and
            /// ascent/descent lines. The referenced objects themselves are shared, not copied.
            /// </summary>
            public object Clone()
            {
                TextChunk copy = new TextChunk(m_text, m_startLocation, m_endLocation, m_charSpaceWidth, AscentLine, DecentLine);
                return copy;
            }

            /// <summary>The text of the chunk.</summary>
            public string Text
            {
                get { return m_text; }
                set { m_text = value; }
            }

            /// <summary>The width of a single space character, as given to the constructor.</summary>
            public float CharSpaceWidth
            {
                get { return m_charSpaceWidth; }
                set { m_charSpaceWidth = value; }
            }

            /// <summary>The start of the chunk's baseline.</summary>
            public Vector StartLocation
            {
                get { return m_startLocation; }
                set { m_startLocation = value; }
            }

            /// <summary>The end of the chunk's baseline.</summary>
            public Vector EndLocation
            {
                get { return m_endLocation; }
                set { m_endLocation = value; }
            }

            /// <summary>
            /// Creates a chunk and derives its orientation and its distances along and
            /// perpendicular to that orientation.
            /// </summary>
            /// <param name="txt">The text of the chunk.</param>
            /// <param name="startLoc">The start of the baseline.</param>
            /// <param name="endLoc">The end of the baseline.</param>
            /// <param name="charSpaceWidth">The width of a single space character.</param>
            /// <param name="ascentLine">The ascent line of the text.</param>
            /// <param name="decentLine">The descent line of the text.</param>
            public TextChunk(string txt, Vector startLoc, Vector endLoc, float charSpaceWidth, LineSegment ascentLine, LineSegment decentLine)
            {
                m_text = txt;
                m_startLocation = startLoc;
                m_endLocation = endLoc;
                m_charSpaceWidth = charSpaceWidth;
                AscentLine = ascentLine;
                DecentLine = decentLine;

                // A chunk whose start and end coincide (e.g. empty text) has no direction
                // of its own, so normalizing the difference cannot give a usable
                // orientation. Treat such a chunk as horizontal.
                Vector direction = m_endLocation.Subtract(m_startLocation);
                if (direction[Vector.I1] == 0 && direction[Vector.I2] == 0 && direction[Vector.I3] == 0)
                {
                    direction = new Vector(1, 0, 0);
                }

                m_orientationVector = direction.Normalize();
                m_orientationMagnitude = (int)(Math.Atan2(m_orientationVector[Vector.I2], m_orientationVector[Vector.I1]) * 1000);

                // see http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
                // The two vectors being crossed are in the same plane, so the result is purely
                // in the z-axis (out of plane) direction; only the I3 component is needed.
                Vector origin = new Vector(0, 0, 1);
                m_distPerpendicular = (int)(m_startLocation.Subtract(origin)).Cross(m_orientationVector)[Vector.I3];

                m_distParallelStart = m_orientationVector.Dot(m_startLocation);
                m_distParallelEnd = m_orientationVector.Dot(m_endLocation);
            }

            /// <summary>
            /// True if this chunk is on the same line as the other chunk, i.e. both have the
            /// same orientation magnitude and the same perpendicular distance.
            /// </summary>
            /// <param name="textChunkToCompare">The chunk to compare to.</param>
            /// <returns>true if this chunk is on the same line as the other.</returns>
            public bool sameLine(TextChunk textChunkToCompare)
            {
                return m_orientationMagnitude == textChunkToCompare.m_orientationMagnitude
                    && m_distPerpendicular == textChunkToCompare.m_distPerpendicular;
            }

            /// <summary>
            /// Computes the distance between the end of 'other' and the beginning of this chunk
            /// in the direction of this chunk's orientation vector. Calling this for chunks that
            /// are not on the same line and orientation is not meaningful, but that condition is
            /// not checked, for performance reasons.
            /// </summary>
            /// <param name="other">The chunk that precedes this one.</param>
            /// <returns>
            /// The distance from the end of 'other' to the beginning of this chunk; negative
            /// when this chunk starts before 'other' ends.
            /// </returns>
            public float distanceFromEndOf(TextChunk other)
            {
                return m_distParallelStart - other.m_distParallelEnd;
            }

            /// <summary>
            /// Compares based on orientation, then perpendicular distance, then parallel distance.
            /// </summary>
            /// <param name="obj">The chunk to compare with; must be a <see cref="TextChunk"/>.</param>
            /// <returns>A negative value, zero, or a positive value per the usual comparison contract.</returns>
            /// <exception cref="ArgumentException"><paramref name="obj"/> is null or not a TextChunk.</exception>
            public int CompareTo(object obj)
            {
                TextChunk rhs = obj as TextChunk;
                if (ReferenceEquals(rhs, null))
                {
                    throw new ArgumentException("Object is not a TextChunk");
                }

                if (ReferenceEquals(this, rhs))
                {
                    return 0;
                }

                // CompareTo instead of subtraction so the sign can never be flipped by overflow.
                int rslt = m_orientationMagnitude.CompareTo(rhs.m_orientationMagnitude);
                if (rslt != 0)
                {
                    return rslt;
                }

                rslt = m_distPerpendicular.CompareTo(rhs.m_distPerpendicular);
                if (rslt != 0)
                {
                    return rslt;
                }

                // Returns 0 for equal starts so that a.CompareTo(b) and b.CompareTo(a) agree;
                // a comparison that claims each of two chunks is greater than the other is
                // not a valid ordering for sorting.
                return m_distParallelStart.CompareTo(rhs.m_distParallelStart);
            }
        }

        /// <summary>
        /// The text of one line together with two corner points taken from the ascent and
        /// descent lines of its chunks.
        /// </summary>
        public class TextInfo
        {
            /// <summary>Start point of the ascent line of the first chunk.</summary>
            public Vector TopLeft;

            /// <summary>End point of the descent line of the most recently appended chunk.</summary>
            public Vector BottomRight;

            private string m_Text;

            /// <summary>The text accumulated so far.</summary>
            public string Text
            {
                get { return m_Text; }
            }

            /// <summary>
            /// Creates a TextInfo from the first chunk of a line.
            /// </summary>
            /// <param name="initialTextChunk">The first chunk of the line.</param>
            public TextInfo(TextChunk initialTextChunk)
            {
                TopLeft = initialTextChunk.AscentLine.GetStartPoint();
                BottomRight = initialTextChunk.DecentLine.GetEndPoint();
                m_Text = initialTextChunk.Text;
            }

            /// <summary>
            /// Adds more text to this TextInfo and moves <see cref="BottomRight"/> to the end
            /// of the added chunk's descent line.
            /// </summary>
            /// <param name="additionalTextChunk">The chunk to append.</param>
            public void appendText(TextChunk additionalTextChunk)
            {
                BottomRight = additionalTextChunk.DecentLine.GetEndPoint();
                m_Text += additionalTextChunk.Text;
            }

            /// <summary>
            /// Adds a space to the TextInfo. This leaves the end point out of sync with the text;
            /// the assumption is that more text is appended after the space, which corrects it.
            /// </summary>
            public void addSpace()
            {
                m_Text += ' ';
            }
        }
    }
413
414
    /// <summary>
    /// One piece of text found in a PDF, with the page it is on and its bounding box.
    /// <c>PDFSearchText.GetPageText</c> creates one instance per extracted word.
    /// </summary>
    public class HoneyheadTEXT
    {
        /// <summary>Number of the page the text was found on.</summary>
        public int PageNo { get; set; }

        /// <summary>The text itself.</summary>
        public string Text { get; set; }

        /// <summary>Bounding box of the text.</summary>
        public pdftron.PDF.Rect Bounds { get; set; }
    }
420
421
    /// <summary>
    /// Extracts the words of a PDF document, with their bounding boxes, using PDFNet.
    /// </summary>
    public class PDFSearchText
    {
        /// <summary>
        /// Initializes PDFNet with the license key.
        /// </summary>
        public PDFSearchText()
        {
            // NOTE(review): the license key is embedded in source; consider loading it from
            // configuration instead. Kept unchanged here so behavior is preserved.
            pdftron.PDFNet.Initialize("daelim.co.kr(Doftech Corp):CPU:2::W:AMC(20120315):EF6E886F25A414FFB5F8C1F2999CF2DA33DC6C5164315BAF7011B87AF0FA");
        }

        /// <summary>
        /// Returns every word of one page, or of all pages, of a PDF file.
        /// </summary>
        /// <param name="inputPdf">Path of the PDF file to open.</param>
        /// <param name="pageNumber">
        /// Page to read when <paramref name="IsSearchAllPage"/> is false; ignored otherwise.
        /// Pages are numbered from 1, as in the all-pages loop.
        /// </param>
        /// <param name="IsSearchAllPage">True to read every page of the document.</param>
        /// <returns>
        /// One entry per word, in line and word order. Empty when the requested page does
        /// not exist. Never null.
        /// </returns>
        public List<HoneyheadTEXT> GetPageText(string inputPdf, int pageNumber, bool IsSearchAllPage)
        {
            List<HoneyheadTEXT> txtSet = new List<HoneyheadTEXT>();

            using (pdftron.PDF.PDFDoc doc = new pdftron.PDF.PDFDoc(inputPdf))
            {
                doc.InitSecurityHandler();

                if (IsSearchAllPage)
                {
                    int pageCount = doc.GetPageCount();
                    for (int i = 1; i <= pageCount; i++)
                    {
                        AppendPageWords(doc.GetPage(i), i, txtSet);
                    }
                }
                else
                {
                    AppendPageWords(doc.GetPage(pageNumber), pageNumber, txtSet);
                }
            }

            return txtSet;
        }

        /// <summary>
        /// Appends every word of <paramref name="page"/> to <paramref name="txtSet"/>,
        /// tagged with <paramref name="pageNo"/>. Does nothing when the page is null.
        /// </summary>
        private static void AppendPageWords(pdftron.PDF.Page page, int pageNo, List<HoneyheadTEXT> txtSet)
        {
            if (page == null)
            {
                return;
            }

            using (pdftron.PDF.TextExtractor txt = new pdftron.PDF.TextExtractor())
            {
                txt.Begin(page);  // Read the page.

                for (pdftron.PDF.TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                {
                    for (pdftron.PDF.TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                    {
                        txtSet.Add(new HoneyheadTEXT
                        {
                            Text = word.GetString(),
                            Bounds = word.GetBBox(),
                            PageNo = pageNo
                        });
                    }
                }
            }
        }
    }
518
}
클립보드 이미지 추가 (최대 크기: 500 MB)