프로젝트

일반

사용자정보

통계
| 브랜치(Branch): | 개정판:

markus / TEST / Program.cs @ a4e5d148

이력 | 보기 | 이력해설 | 다운로드 (15.7 KB)

1
using iTextSharp.text.pdf;
2
using iTextSharp.text.pdf.parser;
3
using System;
4
using System.Collections.Generic;
5
using System.IO;
6
using System.IO.Compression;
7
using System.Linq;
8
using System.Text;
9

    
10
namespace TEST
11
{
12
    /// <summary>
    /// Simple holder that pairs an iTextSharp rectangle with a string.
    /// </summary>
    public class RectAndText
    {
        /// <summary>The rectangle passed to the constructor.</summary>
        public iTextSharp.text.Rectangle Rect;

        /// <summary>The text passed to the constructor.</summary>
        public string Text;

        /// <summary>
        /// Stores the given rectangle and text without copying or validating them.
        /// </summary>
        /// <param name="rect">Rectangle to store in <see cref="Rect"/>.</param>
        /// <param name="text">Text to store in <see cref="Text"/>.</param>
        public RectAndText(iTextSharp.text.Rectangle rect, string text)
        {
            Rect = rect;
            Text = text;
        }
    }
22
    //public class MyLocationTextExtractionStrategy : LocationTextExtractionStrategy
23
    //{
24
    //    //Hold each coordinate
25
    //    public List<RectAndText> myPoints = new List<RectAndText>();
26

    
27
    //    //Automatically called for each chunk of text in the PDF
28
    //    public override void RenderText(TextRenderInfo renderInfo)
29
    //    {
30
    //        base.RenderText(renderInfo);
31

    
32
    //        //Get the bounding box for the chunk of text
33
    //        var bottomLeft = renderInfo.GetDescentLine().GetStartPoint();
34
    //        var topRight = renderInfo.GetAscentLine().GetEndPoint();
35

    
36
    //        //Create a rectangle from it
37
    //        var rect = new iTextSharp.text.Rectangle(
38
    //                                                bottomLeft[Vector.I1],
39
    //                                                bottomLeft[Vector.I2],
40
    //                                                topRight[Vector.I1],
41
    //                                                topRight[Vector.I2]
42
    //                                                );
43

    
44
    //        //Add this to our main collection
45
    //        this.myPoints.Add(new RectAndText(rect, renderInfo.GetText()));
46
    //    }
47
    //}
48

    
49
    /// <summary>
    /// Text extraction strategy that finds the text on each line: rendered chunks are grouped into
    /// lines keyed by the (negated, truncated) Y coordinate of their baseline, so enumerating
    /// <see cref="results"/> yields the lines from the top of the page to the bottom.
    /// </summary>
    public class TopToBottomTextExtractionStrategy : ITextExtractionStrategy
    {
        // Squared distance from the previous baseline below which a chunk is
        // considered to continue the previous line.
        private const float SameLineThreshold = 1f;

        // Baseline start/end of the chunk handled by the previous RenderText call.
        private Vector lastStart;
        private Vector lastEnd;

        // Key (already negated) under which the previous chunk was stored.
        private int lastLineKey;

        /// <summary>
        /// One buffer per detected line. Keys are negated Y coordinates, so the sorted
        /// dictionary enumerates lines top to bottom.
        /// </summary>
        public SortedDictionary<int, StringBuilder> results = new SortedDictionary<int, StringBuilder>();

        /// <summary>Creates an empty strategy.</summary>
        public TopToBottomTextExtractionStrategy() { }

        /// <summary>Not used by this strategy.</summary>
        public virtual void BeginTextBlock() { }

        /// <summary>Not used by this strategy.</summary>
        public virtual void EndTextBlock() { }

        /// <summary>Images are ignored by this strategy.</summary>
        public virtual void RenderImage(ImageRenderInfo renderInfo) { }

        /// <summary>
        /// Joins all collected lines, in key order, into one string.
        /// </summary>
        /// <returns>Every line followed by a line terminator.</returns>
        public virtual String GetResultantText()
        {
            StringBuilder buf = new StringBuilder();
            foreach (StringBuilder line in results.Values)
            {
                buf.AppendLine(line.ToString());
            }
            return buf.ToString();
        }

        /// <summary>
        /// Appends one rendered chunk of text to the line it belongs to, inserting a
        /// separating space when the gap to the previous chunk looks like a word break.
        /// </summary>
        /// <param name="renderInfo">Render information for the chunk.</param>
        public virtual void RenderText(TextRenderInfo renderInfo)
        {
            bool firstRender = results.Count == 0;
            string text = renderInfo.GetText();

            LineSegment segment = renderInfo.GetBaseline();
            Vector start = segment.GetStartPoint();
            Vector end = segment.GetEndPoint();

            // Key on the Y value of the baseline start. PDF coordinates start with zero at
            // the bottom, so the key is negated to make the dictionary sort top to bottom.
            int currentLineKey = -(int)start[1];

            if (!firstRender)
            {
                // Squared perpendicular distance from this chunk's start point to the line
                // through the previous chunk's baseline. A zero-length previous baseline
                // yields NaN, which fails the comparison and starts from the chunk's own key.
                Vector previousBaseline = lastEnd.Subtract(lastStart);
                float dist = previousBaseline.Cross(lastStart.Subtract(start)).LengthSquared
                             / previousBaseline.LengthSquared;

                if (dist <= SameLineThreshold)
                {
                    // Reuse the key the previous chunk was actually stored under. Re-deriving
                    // it from the previous chunk's own Y would split a line whose baseline
                    // drifts across an integer boundary over several chunks.
                    currentLineKey = lastLineKey;
                }
            }

            // Fetch the buffer for this line, creating it on first use.
            StringBuilder line;
            if (!results.TryGetValue(currentLineKey, out line))
            {
                line = new StringBuilder();
                results.Add(currentLineKey, line);
            }

            // Insert a space between chunks only when neither side already supplies one.
            bool needsSpaceCheck = !firstRender            // first chunk never needs a leading space
                && line.Length != 0                        // not at the beginning of a line
                && line[line.Length - 1] != ' '            // buffer does not already end in a space
                && text.Length > 0                         // new text is not empty
                && text[0] != ' ';                         // new text does not start with a space
            if (needsSpaceCheck)
            {
                // Distance between the end of the previous chunk and the start of this one.
                float spacing = lastEnd.Subtract(start).Length;
                if (spacing > renderInfo.GetSingleSpaceWidth() / 2f)
                {
                    line.Append(' ');
                }
            }

            line.Append(text);

            lastStart = start;
            lastEnd = end;
            lastLineKey = currentLineKey;
        }
    }
137

    
138
    /// <summary>
    /// Location-based extraction strategy that additionally records a bounding rectangle for
    /// every occurrence of <see cref="TextToSearchFor"/> found inside a single rendered chunk.
    /// Occurrences that span more than one chunk are not detected.
    /// </summary>
    public class MyLocationTextExtractionStrategy : LocationTextExtractionStrategy
    {
        /// <summary>Rectangles (with the search text) for each occurrence found so far.</summary>
        public List<RectAndText> myPoints = new List<RectAndText>();

        /// <summary>The string that is being searched for.</summary>
        public String TextToSearchFor { get; set; }

        /// <summary>How chunk text is compared against <see cref="TextToSearchFor"/>.</summary>
        public System.Globalization.CompareOptions CompareOptions { get; set; }

        /// <summary>
        /// Creates a strategy that searches for the given text.
        /// </summary>
        /// <param name="textToSearchFor">Text to look for in each chunk.</param>
        /// <param name="compareOptions">Comparison options passed to the culture-aware search.</param>
        public MyLocationTextExtractionStrategy(String textToSearchFor, System.Globalization.CompareOptions compareOptions = System.Globalization.CompareOptions.None)
        {
            this.TextToSearchFor = textToSearchFor;
            this.CompareOptions = compareOptions;
        }

        /// <summary>
        /// Called for each chunk of text in the PDF. Forwards the chunk to the base strategy and
        /// then records a rectangle for each non-overlapping occurrence of the search text.
        /// </summary>
        /// <param name="renderInfo">Render information for the chunk.</param>
        public override void RenderText(TextRenderInfo renderInfo)
        {
            base.RenderText(renderInfo);

            // An empty search string matches at index 0 with zero characters, which used to
            // crash on First(); a null one throws inside IndexOf. Nothing to search for.
            if (string.IsNullOrEmpty(this.TextToSearchFor))
            {
                return;
            }

            string text = renderInfo.GetText();
            if (string.IsNullOrEmpty(text))
            {
                return;
            }

            var compareInfo = System.Globalization.CultureInfo.CurrentCulture.CompareInfo;
            var characterInfos = renderInfo.GetCharacterRenderInfos();
            int matchLength = this.TextToSearchFor.Length;

            int searchFrom = 0;
            while (searchFrom < text.Length)
            {
                int startPosition = compareInfo.IndexOf(text, this.TextToSearchFor, searchFrom, this.CompareOptions);
                if (startPosition < 0)
                {
                    break;
                }

                // Per-character render infos covering the match.
                var chars = characterInfos.Skip(startPosition).Take(matchLength).ToList();
                if (chars.Count == 0)
                {
                    // Fewer character infos than text characters; no geometry available.
                    break;
                }

                // Bounding box: descent of the first character to ascent of the last.
                var bottomLeft = chars[0].GetDescentLine().GetStartPoint();
                var topRight = chars[chars.Count - 1].GetAscentLine().GetEndPoint();

                var rect = new iTextSharp.text.Rectangle(
                    bottomLeft[Vector.I1],
                    bottomLeft[Vector.I2],
                    topRight[Vector.I1],
                    topRight[Vector.I2]);

                this.myPoints.Add(new RectAndText(rect, this.TextToSearchFor));

                // Continue after this match so later occurrences in the chunk are found too.
                searchFrom = startPosition + matchLength;
            }
        }
    }
198

    
199

    
200
    class Program
201
    {
202
        /// <summary>
        /// GZip-compresses a string and returns the compressed bytes as a string with one
        /// character per byte.
        /// </summary>
        /// <param name="value">
        /// Text to compress. Each character is narrowed to a single byte, so characters above
        /// U+00FF lose their high bits and do not survive a round trip.
        /// </param>
        /// <returns>The GZip stream, one char (0-255) per compressed byte.</returns>
        public static string Zip(string value)
        {
            // Narrow every char to one byte (see the remark on the parameter).
            byte[] raw = new byte[value.Length];
            for (int i = 0; i < value.Length; i++)
            {
                raw[i] = (byte)value[i];
            }

            using (MemoryStream buffer = new MemoryStream())
            {
                // The GZip stream must be closed (not just flushed) before reading the
                // buffer, otherwise the trailing bytes of the stream are missing.
                using (GZipStream gzip = new GZipStream(buffer, CompressionMode.Compress))
                {
                    gzip.Write(raw, 0, raw.Length);
                }

                // ToArray is still valid after the GZip stream has closed the buffer.
                byte[] packed = buffer.ToArray();

                // Widen each compressed byte back to a char.
                StringBuilder result = new StringBuilder(packed.Length);
                foreach (byte b in packed)
                {
                    result.Append((char)b);
                }
                return result.ToString();
            }
        }
233

    
234
        /// <summary>
        /// Reverses <c>Zip</c>: treats each character of <paramref name="value"/> as one byte of
        /// a GZip stream, decompresses it, and returns the result with one character per byte.
        /// </summary>
        /// <param name="value">GZip data encoded as one char (0-255) per byte.</param>
        /// <returns>The decompressed bytes, each widened to a char.</returns>
        public static string UnZip(string value)
        {
            // Narrow every char back to the compressed byte it represents.
            byte[] compressed = new byte[value.Length];
            for (int i = 0; i < value.Length; i++)
            {
                compressed[i] = (byte)value[i];
            }

            using (MemoryStream input = new MemoryStream(compressed))
            using (GZipStream gzip = new GZipStream(input, CompressionMode.Decompress))
            using (MemoryStream output = new MemoryStream())
            {
                // Read to the end of the stream. A single Read into a buffer sized like the
                // compressed input truncates anything that expands beyond that size.
                gzip.CopyTo(output);
                byte[] plain = output.ToArray();

                StringBuilder result = new StringBuilder(plain.Length);
                foreach (byte b in plain)
                {
                    result.Append((char)b);
                }
                return result.ToString();
            }
        }
269

    
270
        /// <summary>
        /// Writes a PDF containing a single underlined blue hyperlink to E:\test.pdf, prints
        /// a completion message, and waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            iTextSharp.text.Document doc = new iTextSharp.text.Document(iTextSharp.text.PageSize.A4, 50, 50, 50, 50);

            // FileMode.Create truncates an existing file. OpenOrCreate would leave the tail of
            // a previous, larger file after the new PDF content.
            using (FileStream output = new FileStream(@"E:\test.pdf", FileMode.Create))
            {
                // The writer registers itself with the document; its return value is not needed.
                PdfWriter.GetInstance(doc, output);
                doc.Open();

                iTextSharp.text.Font link = iTextSharp.text.FontFactory.GetFont("Arial", 12, iTextSharp.text.Font.UNDERLINE, new iTextSharp.text.BaseColor(0, 0, 255));
                iTextSharp.text.Anchor anchor = new iTextSharp.text.Anchor("www.mikesdotnetting.com", link);
                anchor.Reference = "http://www.mikesdotnetting.com";
                doc.Add(anchor);

                doc.Close();
            }

            Console.WriteLine("Finished...");

            Console.ReadKey();
        }
300

    
301

    
302

    
303
        /// <summary>
        /// Searches every page of a PDF for <paramref name="searchText"/> and returns the numbers
        /// of the pages on which it was found.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <param name="searchText">Text to look for; matched per rendered chunk by
        /// <c>MyLocationTextExtractionStrategy</c>.</param>
        /// <returns>
        /// 1-based page numbers with at least one match, in ascending order. Empty when the
        /// file does not exist or nothing matches.
        /// </returns>
        public static List<int> ReadPdfFile(string fileName, string searchText)
        {
            List<int> pages = new List<int>();
            if (!File.Exists(fileName))
            {
                return pages;
            }

            PdfReader pdfReader = new PdfReader(fileName);
            try
            {
                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    var strategy = new MyLocationTextExtractionStrategy(searchText, System.Globalization.CompareOptions.None);

                    // Extraction drives the strategy; the extracted text itself is not needed.
                    PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

                    // The strategy records one entry per match it saw on this page.
                    if (strategy.myPoints.Count > 0)
                    {
                        pages.Add(page);
                    }
                }
            }
            finally
            {
                // Release the file even when extraction throws.
                pdfReader.Close();
            }

            return pages;
        }
344
    }
345
}
클립보드 이미지 추가 (최대 크기: 500 MB)