markus / TEST / Program.cs @ de4c7a4a
이력 | 보기 | 이력해설 | 다운로드 (15.7 KB)
1 |
using iTextSharp.text.pdf; |
---|---|
2 |
using iTextSharp.text.pdf.parser; |
3 |
using System; |
4 |
using System.Collections.Generic; |
5 |
using System.IO; |
6 |
using System.IO.Compression; |
7 |
using System.Linq; |
8 |
using System.Text; |
9 |
|
10 |
namespace TEST |
11 |
{ |
12 |
/// <summary>
/// Simple pair of a rectangle and the text associated with it.
/// </summary>
public class RectAndText
{
    /// <summary>Rectangle passed to the constructor.</summary>
    public iTextSharp.text.Rectangle Rect;

    /// <summary>Text passed to the constructor.</summary>
    public string Text;

    /// <summary>
    /// Stores the given rectangle and text without copying or validating them.
    /// </summary>
    /// <param name="rect">Rectangle to store in <see cref="Rect"/>.</param>
    /// <param name="text">Text to store in <see cref="Text"/>.</param>
    public RectAndText(iTextSharp.text.Rectangle rect, string text)
    {
        Rect = rect;
        Text = text;
    }
}
22 |
//public class MyLocationTextExtractionStrategy : LocationTextExtractionStrategy |
23 |
//{ |
24 |
// //Hold each coordinate |
25 |
// public List<RectAndText> myPoints = new List<RectAndText>(); |
26 |
|
27 |
// //Automatically called for each chunk of text in the PDF |
28 |
// public override void RenderText(TextRenderInfo renderInfo) |
29 |
// { |
30 |
// base.RenderText(renderInfo); |
31 |
|
32 |
// //Get the bounding box for the chunk of text |
33 |
// var bottomLeft = renderInfo.GetDescentLine().GetStartPoint(); |
34 |
// var topRight = renderInfo.GetAscentLine().GetEndPoint(); |
35 |
|
36 |
// //Create a rectangle from it |
37 |
// var rect = new iTextSharp.text.Rectangle( |
38 |
// bottomLeft[Vector.I1], |
39 |
// bottomLeft[Vector.I2], |
40 |
// topRight[Vector.I1], |
41 |
// topRight[Vector.I2] |
42 |
// ); |
43 |
|
44 |
// //Add this to our main collection |
45 |
// this.myPoints.Add(new RectAndText(rect, renderInfo.GetText())); |
46 |
// } |
47 |
//} |
48 |
|
49 |
/// <summary>
/// Text extraction strategy that collects rendered text chunks into lines, keyed by the
/// integer Y coordinate of each chunk's baseline, and returns the lines from top to bottom.
/// </summary>
public class TopToBottomTextExtractionStrategy : ITextExtractionStrategy
{
    // Maximum squared distance between a chunk's start point and the line through the
    // previous chunk's baseline for the two chunks to count as being on the same line.
    private const float SameLineThreshold = 1f;

    // Baseline start/end of the most recently rendered chunk (unset before the first chunk).
    private Vector lastStart;
    private Vector lastEnd;

    /// <summary>
    /// One buffer per line. Keys are the NEGATED integer baseline Y values, so the sorted
    /// dictionary's ascending key order enumerates lines from the top of the page downwards.
    /// </summary>
    public SortedDictionary<int, StringBuilder> results = new SortedDictionary<int, StringBuilder>();

    /// <summary>Creates an empty strategy.</summary>
    public TopToBottomTextExtractionStrategy() { }

    /// <summary>Not used by this strategy.</summary>
    public virtual void BeginTextBlock() { }

    /// <summary>Not used by this strategy.</summary>
    public virtual void EndTextBlock() { }

    /// <summary>Images are ignored by this strategy.</summary>
    public virtual void RenderImage(ImageRenderInfo renderInfo) { }

    /// <summary>
    /// Joins all collected lines into one block of text.
    /// </summary>
    /// <returns>Every line in <see cref="results"/> in key order, each followed by a line break.</returns>
    public virtual String GetResultantText()
    {
        StringBuilder buf = new StringBuilder();
        foreach (KeyValuePair<int, StringBuilder> line in results)
        {
            buf.AppendLine(line.Value.ToString());
        }
        return buf.ToString();
    }

    /// <summary>
    /// Appends one rendered chunk of text to the line it belongs to, inserting a single
    /// space in front of it when the gap to the previous chunk looks like a word break.
    /// </summary>
    /// <param name="renderInfo">Chunk being rendered; its baseline, text and space width are read.</param>
    public virtual void RenderText(TextRenderInfo renderInfo)
    {
        bool firstRender = results.Count == 0;

        LineSegment segment = renderInfo.GetBaseline();
        Vector start = segment.GetStartPoint();
        Vector end = segment.GetEndPoint();
        string text = renderInfo.GetText();

        // Key the chunk by the Y value of its baseline start, unless it continues the
        // previous chunk's line, in which case the previous chunk's Y value is reused.
        int currentLineKey = (int)start[Vector.I2];
        if (!firstRender && StartsOnPreviousLine(start))
        {
            currentLineKey = (int)lastStart[Vector.I2];
        }

        // Hack: PDFs start with zero at the bottom, so keys would sort upside down.
        // Negating them makes the sorted order run top to bottom.
        currentLineKey = -currentLineKey;

        // Single lookup; create the line buffer on first use.
        StringBuilder line;
        if (!results.TryGetValue(currentLineKey, out line))
        {
            line = new StringBuilder();
            results.Add(currentLineKey, line);
        }

        // Only consider a separator when neither side already supplies one. The last
        // character is inspected directly instead of converting the whole buffer to a
        // string on every chunk.
        bool mayNeedSpace = !firstRender
            && line.Length != 0
            && line[line.Length - 1] != ' '
            && text.Length > 0
            && text[0] != ' ';

        if (mayNeedSpace)
        {
            // Distance between the end of the previous chunk and the start of this one.
            float spacing = lastEnd.Subtract(start).Length;
            if (spacing > renderInfo.GetSingleSpaceWidth() / 2f)
            {
                line.Append(' ');
            }
        }

        line.Append(text);

        lastStart = start;
        lastEnd = end;
    }

    /// <summary>
    /// Tests whether <paramref name="start"/> lies close enough to the line through the
    /// previous chunk's baseline to be treated as the same line.
    /// </summary>
    private bool StartsOnPreviousLine(Vector start)
    {
        Vector previousBaseline = lastEnd.Subtract(lastStart);

        // Squared point-to-line distance: |(x2 - x1) x (x1 - x0)|^2 / |x2 - x1|^2.
        // A zero-length previous baseline makes this NaN or infinity, which compares as
        // "not on the same line".
        float dist = previousBaseline.Cross(lastStart.Subtract(start)).LengthSquared
                     / previousBaseline.LengthSquared;

        return dist <= SameLineThreshold;
    }
}
137 |
|
138 |
/// <summary>
/// Location-based extraction strategy that, in addition to the base class behaviour,
/// records a bounding rectangle for every occurrence of <see cref="TextToSearchFor"/>
/// found inside a rendered text chunk.
/// </summary>
public class MyLocationTextExtractionStrategy : LocationTextExtractionStrategy
{
    /// <summary>Rectangle/text pair for each match found so far.</summary>
    public List<RectAndText> myPoints = new List<RectAndText>();

    /// <summary>The string that is searched for in each chunk.</summary>
    public String TextToSearchFor { get; set; }

    /// <summary>Options passed to the culture-sensitive search.</summary>
    public System.Globalization.CompareOptions CompareOptions { get; set; }

    /// <summary>
    /// Creates a strategy that looks for <paramref name="textToSearchFor"/>.
    /// </summary>
    /// <param name="textToSearchFor">Text to locate; null or empty disables the search.</param>
    /// <param name="compareOptions">Comparison options for the search.</param>
    public MyLocationTextExtractionStrategy(String textToSearchFor, System.Globalization.CompareOptions compareOptions = System.Globalization.CompareOptions.None)
    {
        this.TextToSearchFor = textToSearchFor;
        this.CompareOptions = compareOptions;
    }

    /// <summary>
    /// Forwards the chunk to the base strategy, then adds one entry to
    /// <see cref="myPoints"/> for each non-overlapping occurrence of the search text in
    /// the chunk. Matches that span more than one chunk are not detected.
    /// </summary>
    /// <param name="renderInfo">Chunk of text being rendered.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        base.RenderText(renderInfo);

        // Nothing to look for: an empty needle would "match" at index 0 with no characters.
        string needle = this.TextToSearchFor;
        if (string.IsNullOrEmpty(needle))
        {
            return;
        }

        string chunkText = renderInfo.GetText();
        if (string.IsNullOrEmpty(chunkText))
        {
            return;
        }

        System.Globalization.CompareInfo comparer = System.Globalization.CultureInfo.CurrentCulture.CompareInfo;

        // Per-character render infos, fetched lazily on the first match only.
        // NOTE(review): assumes one render info per character of GetText() - confirm for
        // fonts where a single glyph decodes to several characters.
        IList<TextRenderInfo> glyphs = null;

        int searchFrom = 0;
        while (searchFrom < chunkText.Length)
        {
            int matchStart = comparer.IndexOf(chunkText, needle, searchFrom, this.CompareOptions);
            if (matchStart < 0)
            {
                break;
            }

            if (glyphs == null)
            {
                glyphs = renderInfo.GetCharacterRenderInfos().ToList();
            }

            // Clamp to the available character infos; stop if the match starts past them.
            int matchEnd = Math.Min(matchStart + needle.Length, glyphs.Count) - 1;
            if (matchEnd < matchStart)
            {
                break;
            }

            TextRenderInfo firstChar = glyphs[matchStart];
            TextRenderInfo lastChar = glyphs[matchEnd];

            // Bounding box: start of the first character's descent line to the end of
            // the last character's ascent line.
            Vector bottomLeft = firstChar.GetDescentLine().GetStartPoint();
            Vector topRight = lastChar.GetAscentLine().GetEndPoint();

            iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(
                bottomLeft[Vector.I1],
                bottomLeft[Vector.I2],
                topRight[Vector.I1],
                topRight[Vector.I2]);

            this.myPoints.Add(new RectAndText(rect, needle));

            // Continue after this match (non-overlapping occurrences).
            searchFrom = matchStart + needle.Length;
        }
    }
}
198 |
|
199 |
|
200 |
class Program |
201 |
{ |
202 |
/// <summary>
/// GZip-compresses <paramref name="value"/> and returns the compressed bytes as a string
/// in which each character holds exactly one byte (code points 0-255).
/// </summary>
/// <param name="value">
/// Text to compress. Every character is narrowed to a single byte before compression, so
/// characters above U+00FF are not preserved (with default unchecked arithmetic only the
/// low byte is kept).
/// </param>
/// <returns>The GZip stream, one character per byte.</returns>
/// <exception cref="ArgumentNullException"><paramref name="value"/> is null.</exception>
public static string Zip(string value)
{
    if (value == null)
    {
        throw new ArgumentNullException("value");
    }

    // Transform string into byte[] (one byte per character).
    byte[] raw = new byte[value.Length];
    for (int i = 0; i < value.Length; i++)
    {
        raw[i] = (byte)value[i];
    }

    byte[] packed;
    using (MemoryStream ms = new MemoryStream())
    {
        using (GZipStream gz = new GZipStream(ms, CompressionMode.Compress))
        {
            gz.Write(raw, 0, raw.Length);
        }

        // The GZip stream must be closed (not merely flushed) before reading the result,
        // otherwise the trailing bytes are missing. ToArray still works on the closed
        // memory stream.
        packed = ms.ToArray();
    }

    // Transform byte[] zip data to string (one character per byte).
    StringBuilder sb = new StringBuilder(packed.Length);
    foreach (byte b in packed)
    {
        sb.Append((char)b);
    }
    return sb.ToString();
}
233 |
|
234 |
/// <summary>
/// Reverses <c>Zip</c>: interprets each character of <paramref name="value"/> as one byte
/// of a GZip stream, decompresses it, and returns the result with one character per
/// decompressed byte.
/// </summary>
/// <param name="value">GZip data encoded as one character per byte.</param>
/// <returns>The decompressed data, one character per byte.</returns>
/// <exception cref="ArgumentNullException"><paramref name="value"/> is null.</exception>
public static string UnZip(string value)
{
    if (value == null)
    {
        throw new ArgumentNullException("value");
    }

    // Transform string into byte[] (one byte per character).
    byte[] packed = new byte[value.Length];
    for (int i = 0; i < value.Length; i++)
    {
        packed[i] = (byte)value[i];
    }

    StringBuilder sb = new StringBuilder(packed.Length);

    using (MemoryStream ms = new MemoryStream(packed))
    using (GZipStream gz = new GZipStream(ms, CompressionMode.Decompress))
    {
        // Decompressed data is normally LARGER than the compressed input and a single
        // Read may return fewer bytes than are available, so read in a loop until the
        // stream reports end-of-data instead of reading once into a buffer sized to the
        // compressed length.
        byte[] buffer = new byte[4096];
        int read;
        while ((read = gz.Read(buffer, 0, buffer.Length)) > 0)
        {
            for (int i = 0; i < read; i++)
            {
                sb.Append((char)buffer[i]);
            }
        }
    }

    return sb.ToString();
}
269 |
|
270 |
/// <summary>
/// Writes a sample A4 PDF containing one blue, underlined hyperlink to
/// <c>E:\test.pdf</c>, then prints a completion message and waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    iTextSharp.text.Document doc = new iTextSharp.text.Document(iTextSharp.text.PageSize.A4, 50, 50, 50, 50);

    // FileMode.Create truncates an existing file; OpenOrCreate would leave stale bytes
    // after the new content when the previous file was longer. The using block releases
    // the file handle even if writing the document fails.
    using (FileStream output = new FileStream(@"E:\test.pdf", FileMode.Create))
    {
        // Binds the document to the output stream; the returned writer is not needed here.
        PdfWriter.GetInstance(doc, output);
        doc.Open();

        iTextSharp.text.Font link = iTextSharp.text.FontFactory.GetFont("Arial", 12, iTextSharp.text.Font.UNDERLINE, new iTextSharp.text.BaseColor(0, 0, 255));
        iTextSharp.text.Anchor anchor = new iTextSharp.text.Anchor("www.mikesdotnetting.com", link);
        anchor.Reference = "http://www.mikesdotnetting.com";
        doc.Add(anchor);

        doc.Close();
    }

    Console.WriteLine("Finished...");

    Console.ReadKey();
}
300 |
|
301 |
|
302 |
|
303 |
/// <summary>
/// Searches every page of a PDF file for <paramref name="searchText"/>.
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <param name="searchText">Text to look for, compared with <c>CompareOptions.None</c>.</param>
/// <returns>
/// The 1-based numbers of the pages on which the search strategy recorded at least one
/// match, in ascending order. Empty when the file does not exist, when
/// <paramref name="searchText"/> is null or empty, or when nothing matched.
/// </returns>
public static List<int> ReadPdfFile(string fileName, string searchText)
{
    List<int> pages = new List<int>();

    // Nothing to search for, or nothing to search in.
    if (string.IsNullOrEmpty(searchText) || !File.Exists(fileName))
    {
        return pages;
    }

    PdfReader pdfReader = new PdfReader(fileName);
    try
    {
        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            // A fresh strategy per page so that its match list only covers this page.
            var strategy = new MyLocationTextExtractionStrategy(searchText, System.Globalization.CompareOptions.None);

            // Extraction drives the strategy's RenderText callbacks; the extracted text
            // itself is not needed.
            PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

            if (strategy.myPoints.Count > 0)
            {
                pages.Add(page);
            }
        }
    }
    finally
    {
        // Release the reader (and the underlying file) even if extraction throws.
        pdfReader.Close();
    }

    return pages;
}
344 |
} |
345 |
} |