프로젝트

일반

사용자정보

통계
| 브랜치(Branch): | 개정판:

markus / MarkusAutoUpdate / src / NetSparkle / Libraries / MarkdownSharp.cs @ f2b4c204

이력 | 보기 | 이력해설 | 다운로드 (69.7 KB)

1 d8f5045e taeseongkim
/*
2
 * MarkdownSharp
3
 * -------------
4
 * a C# Markdown processor
5
 *
6
 * Markdown is a text-to-HTML conversion tool for web writers
7
 * Copyright (c) 2004 John Gruber
8
 * http://daringfireball.net/projects/markdown/
9
 *
10
 * Markdown.NET
11
 * Copyright (c) 2004-2009 Milan Negovan
12
 * http://www.aspnetresources.com
13
 * http://aspnetresources.com/blog/markdown_announced.aspx
14
 *
15
 * MarkdownSharp
16
 * Copyright (c) 2009-2011 Jeff Atwood
17
 * http://stackoverflow.com
18
 * http://www.codinghorror.com/blog/
19
 * http://code.google.com/p/markdownsharp/
20
 *
21
 * History: Milan ported the Markdown processor to C#. He granted license to me so I can open source it
22
 * and let the community contribute to and improve MarkdownSharp.
23
 *
24
 */
25
26
#region Copyright and license
27
28
/*
29
30
Copyright (c) 2009 - 2010 Jeff Atwood
31
32
http://www.opensource.org/licenses/mit-license.php
33
  
34
Permission is hereby granted, free of charge, to any person obtaining a copy
35
of this software and associated documentation files (the "Software"), to deal
36
in the Software without restriction, including without limitation the rights
37
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
38
copies of the Software, and to permit persons to whom the Software is
39
furnished to do so, subject to the following conditions:
40
41
The above copyright notice and this permission notice shall be included in
42
all copies or substantial portions of the Software.
43
44
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
48
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
49
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
50
THE SOFTWARE.
51
52
Copyright (c) 2003-2004 John Gruber
53
<http://daringfireball.net/>   
54
All rights reserved.
55
56
Redistribution and use in source and binary forms, with or without
57
modification, are permitted provided that the following conditions are
58
met:
59
60
* Redistributions of source code must retain the above copyright notice,
61
  this list of conditions and the following disclaimer.
62
63
* Redistributions in binary form must reproduce the above copyright
64
  notice, this list of conditions and the following disclaimer in the
65
  documentation and/or other materials provided with the distribution.
66
67
* Neither the name "Markdown" nor the names of its contributors may
68
  be used to endorse or promote products derived from this software
69
  without specific prior written permission.
70
71
This software is provided by the copyright holders and contributors "as
72
is" and any express or implied warranties, including, but not limited
73
to, the implied warranties of merchantability and fitness for a
74
particular purpose are disclaimed. In no event shall the copyright owner
75
or contributors be liable for any direct, indirect, incidental, special,
76
exemplary, or consequential damages (including, but not limited to,
77
procurement of substitute goods or services; loss of use, data, or
78
profits; or business interruption) however caused and on any theory of
79
liability, whether in contract, strict liability, or tort (including
80
negligence or otherwise) arising in any way out of the use of this
81
software, even if advised of the possibility of such damage.
82
*/
83
84
#endregion
85
86
using System;
87
using System.Collections.Generic;
88
using System.Configuration;
89
using System.Text;
90
using System.Text.RegularExpressions;
91
92
namespace MarkdownSharp
93
{
94
95
    /// <summary>
    /// Option bag that can be handed to the <see cref="Markdown"/> constructor taking a
    /// <see cref="MarkdownOptions"/>; that constructor copies each property into the
    /// matching processor setting.
    /// </summary>
    /// <remarks>
    /// Apart from <see cref="EmptyElementSuffix"/>, every property starts at its CLR default.
    /// In particular <see cref="LinkEmails"/> starts as <c>false</c> here, whereas a
    /// <see cref="Markdown"/> instance created without options starts with it set to <c>true</c>.
    /// </remarks>
    public class MarkdownOptions
    {
        /// <summary>
        /// Creates an option set whose <see cref="EmptyElementSuffix"/> is " />",
        /// the same value a <see cref="Markdown"/> instance starts with.
        /// </summary>
        public MarkdownOptions()
        {
            // Without this the suffix would be null and would replace the processor's
            // own " />" default as soon as the options are applied.
            EmptyElementSuffix = " />";
        }

        /// <summary>
        /// when true, (most) bare plain URLs are auto-hyperlinked  
        /// WARNING: this is a significant deviation from the markdown spec
        /// </summary>
        public bool AutoHyperlink { get; set; }

        /// <summary>
        /// when true, RETURN becomes a literal newline  
        /// WARNING: this is a significant deviation from the markdown spec
        /// </summary>
        public bool AutoNewlines { get; set; }

        /// <summary>
        /// use ">" for HTML output, or " />" for XHTML output; defaults to " />"
        /// </summary>
        public string EmptyElementSuffix { get; set; }

        /// <summary>
        /// when true, problematic URL characters like [, ], (, and so forth will be encoded
        /// WARNING: this is a significant deviation from the markdown spec
        /// </summary>
        public bool EncodeProblemUrlCharacters { get; set; }

        /// <summary>
        /// when false, email addresses will never be auto-linked  
        /// WARNING: this is a significant deviation from the markdown spec
        /// </summary>
        public bool LinkEmails { get; set; }

        /// <summary>
        /// when true, bold and italic require non-word characters on either side  
        /// WARNING: this is a significant deviation from the markdown spec
        /// </summary>
        public bool StrictBoldItalic { get; set; }
    }
130
131
132
    /// <summary>
133
    /// Markdown is a text-to-HTML conversion tool for web writers.
134
    /// Markdown allows you to write using an easy-to-read, easy-to-write plain text format,
135
    /// then convert it to structurally valid XHTML (or HTML).
136
    /// </summary>
137
    public class Markdown
138
    {
139
        private const string _version = "1.13";
140
141
        #region Constructors and Options
142
143
        /// <summary>
        /// Initializes a Markdown processor with the built-in default option values;
        /// simply forwards to the <c>bool</c> overload with <c>false</c>.
        /// </summary>
        public Markdown() : this(false)
        {
        }
150
151
        /// <summary>
        /// Create a new Markdown instance. The flag was meant to load options from the
        /// appSettings section of a configuration file, using these keys:
        ///
        ///     Markdown.StrictBoldItalic (true/false)
        ///     Markdown.EmptyElementSuffix (">" or " />" without the quotes)
        ///     Markdown.LinkEmails (true/false)
        ///     Markdown.AutoNewlines (true/false)
        ///     Markdown.AutoHyperlink (true/false)
        ///     Markdown.EncodeProblemUrlCharacters (true/false)
        ///
        /// </summary>
        /// <remarks>
        /// The appSettings reader is currently commented out, so the flag has no effect:
        /// the instance always starts with the default option values.
        /// </remarks>
        /// <param name="loadOptionsFromConfigFile">currently ignored, see remarks</param>
        public Markdown(bool loadOptionsFromConfigFile)
        {
            if (!loadOptionsFromConfigFile) return;

            // Configuration loading is disabled. The previous implementation is kept
            // below for reference; note that the key comparison is case-sensitive.
            //
            //            var settings = ConfigurationManager.AppSettings;
            //            foreach (string key in settings.Keys)
            //            {
            //                switch (key)
            //                {
            //                    case "Markdown.AutoHyperlink":
            //                        _autoHyperlink = Convert.ToBoolean(settings[key]);
            //                        break;
            //                    case "Markdown.AutoNewlines":
            //                        _autoNewlines = Convert.ToBoolean(settings[key]);
            //                        break;
            //                    case "Markdown.EmptyElementSuffix":
            //                        _emptyElementSuffix = settings[key];
            //                        break;
            //                    case "Markdown.EncodeProblemUrlCharacters":
            //                        _encodeProblemUrlCharacters = Convert.ToBoolean(settings[key]);
            //                        break;
            //                    case "Markdown.LinkEmails":
            //                        _linkEmails = Convert.ToBoolean(settings[key]);
            //                        break;
            //                    case "Markdown.StrictBoldItalic":
            //                        _strictBoldItalic = Convert.ToBoolean(settings[key]);
            //                        break;
            //                }
            //            }
        }
193
194
        /// <summary>
        /// Create a new Markdown instance and set the options from the MarkdownOptions object.
        /// </summary>
        /// <param name="options">
        /// values to copy into this instance; a null <c>EmptyElementSuffix</c> is ignored so
        /// that the built-in " />" suffix stays in effect
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="options"/> is null</exception>
        public Markdown(MarkdownOptions options)
        {
            if (options == null) throw new ArgumentNullException("options");

            _autoHyperlink = options.AutoHyperlink;
            _autoNewlines = options.AutoNewlines;

            // Nothing forces callers to set EmptyElementSuffix; when it is null keep the
            // value from the field initializer instead of overwriting it with null.
            if (options.EmptyElementSuffix != null)
            {
                _emptyElementSuffix = options.EmptyElementSuffix;
            }

            _encodeProblemUrlCharacters = options.EncodeProblemUrlCharacters;
            _linkEmails = options.LinkEmails;
            _strictBoldItalic = options.StrictBoldItalic;
        }
206
207
208
        // backing store for EmptyElementSuffix; starts out as the XHTML form
        private string _emptyElementSuffix = " />";

        /// <summary>
        /// ">" selects HTML output, " />" selects XHTML output
        /// </summary>
        public string EmptyElementSuffix
        {
            get
            {
                return _emptyElementSuffix;
            }
            set
            {
                _emptyElementSuffix = value;
            }
        }
217
218
        // backing store for LinkEmails; enabled unless changed
        private bool _linkEmails = true;

        /// <summary>
        /// set to false to never auto-link email addresses  
        /// WARNING: this is a significant deviation from the markdown spec
        /// </summary>
        public bool LinkEmails
        {
            get
            {
                return _linkEmails;
            }
            set
            {
                _linkEmails = value;
            }
        }
228
229
        // backing store for StrictBoldItalic; disabled unless changed
        private bool _strictBoldItalic = false;

        /// <summary>
        /// set to true to require non-word characters on either side of bold and italic  
        /// WARNING: this is a significant deviation from the markdown spec
        /// </summary>
        public bool StrictBoldItalic
        {
            get
            {
                return _strictBoldItalic;
            }
            set
            {
                _strictBoldItalic = value;
            }
        }
239
240
        // backing store for AutoNewLines; disabled unless changed
        private bool _autoNewlines = false;

        /// <summary>
        /// set to true to make RETURN a literal newline  
        /// WARNING: this is a significant deviation from the markdown spec
        /// </summary>
        public bool AutoNewLines
        {
            get
            {
                return _autoNewlines;
            }
            set
            {
                _autoNewlines = value;
            }
        }
250
251
        // backing store for AutoHyperlink; disabled unless changed
        private bool _autoHyperlink = false;

        /// <summary>
        /// set to true to auto-hyperlink (most) bare plain URLs  
        /// WARNING: this is a significant deviation from the markdown spec
        /// </summary>
        public bool AutoHyperlink
        {
            get
            {
                return _autoHyperlink;
            }
            set
            {
                _autoHyperlink = value;
            }
        }
261
262
        // backing store for EncodeProblemUrlCharacters; disabled unless changed
        private bool _encodeProblemUrlCharacters = false;

        /// <summary>
        /// set to true to encode problematic URL characters like [, ], (, and so forth
        /// WARNING: this is a significant deviation from the markdown spec
        /// </summary>
        public bool EncodeProblemUrlCharacters
        {
            get
            {
                return _encodeProblemUrlCharacters;
            }
            set
            {
                _encodeProblemUrlCharacters = value;
            }
        }
272
273
        #endregion
274
275
        /// <summary>
        /// kind of fragment a <see cref="Token"/> carries
        /// </summary>
        private enum TokenType
        {
            Text,
            Tag
        }

        /// <summary>
        /// pairs a fragment kind with the fragment's text
        /// </summary>
        private struct Token
        {
            public TokenType Type;
            public string Value;

            public Token(TokenType type, string value)
            {
                Type = type;
                Value = value;
            }
        }
287
288
        /// <summary>
        /// deepest nesting of [] and () the transform supports; implementation detail
        /// </summary>
        private const int _nestDepth = 6;

        /// <summary>
        /// the transform turns tabs into spaces;  
        /// this is the number of spaces one tab becomes  
        /// </summary>
        private const int _tabWidth = 4;

        // NOTE(review): presumably regex fragments for unordered / ordered list markers;
        // their use is not visible in this part of the file - confirm.
        private const string _markerUL = @"[*+-]";
        private const string _markerOL = @"\d+[.]";
301
302
        // Lookup tables shared by every instance; they are filled once by the static constructor.
        private static readonly Dictionary<string, string> _escapeTable;
        private static readonly Dictionary<string, string> _invertedEscapeTable;
        private static readonly Dictionary<string, string> _backslashEscapeTable;

        // Per-instance state, emptied by Setup() around each transform.
        private readonly Dictionary<string, string> _urls = new Dictionary<string, string>();
        private readonly Dictionary<string, string> _titles = new Dictionary<string, string>();
        private readonly Dictionary<string, string> _htmlBlocks = new Dictionary<string, string>();

        private int _listLevel;

        // temporarily replaces "://" where auto-linking shouldn't happen;
        // readonly so the shared marker cannot be reassigned after type initialization
        private static readonly string AutoLinkPreventionMarker = "\x1AP";
312
313
        /// <summary>
        /// Static constructor: prepares what is identical for every transform, namely the
        /// escape lookup tables and the regex that matches backslash-escaped characters.
        /// </summary>
        static Markdown()
        {
            // character -> hash placeholder
            _escapeTable = new Dictionary<string, string>();
            // hash placeholder -> character
            _invertedEscapeTable = new Dictionary<string, string>();
            // backslash + character -> hash placeholder
            _backslashEscapeTable = new Dictionary<string, string>();

            var alternatives = new List<string>();

            foreach (char special in @"\`*_{}[]()>#+-.!/")
            {
                string literal = special.ToString();
                string placeholder = GetHashKey(literal, isHtmlBlock: false);
                string escaped = @"\" + literal;

                _escapeTable.Add(literal, placeholder);
                _invertedEscapeTable.Add(placeholder, literal);
                _backslashEscapeTable.Add(escaped, placeholder);
                alternatives.Add(Regex.Escape(escaped));
            }

            // one alternation covering every backslash escape
            _backslashEscapes = new Regex(string.Join("|", alternatives.ToArray()), RegexOptions.Compiled);
        }
338
339
        /// <summary>
        /// version string of this MarkdownSharp build;  
        /// the latest code lives at http://code.google.com/p/markdownsharp/
        /// </summary>
        public string Version
        {
            get
            {
                return _version;
            }
        }
347
348
        /// <summary>
        /// Converts Markdown-formatted text into HTML;  
        /// see http://en.wikipedia.org/wiki/Markdown
        /// </summary>
        /// <remarks>
        /// The steps below must run in exactly this order. Link and image
        /// substitutions need to happen before EscapeSpecialChars(), so that
        /// any *'s or _'s in the a and img tags get encoded.
        /// </remarks>
        /// <param name="text">Markdown source; null or empty yields an empty string</param>
        /// <returns>the generated HTML followed by a single newline</returns>
        public string Transform(string text)
        {
            if (String.IsNullOrEmpty(text))
            {
                return "";
            }

            Setup();

            string normalized = Normalize(text);
            string withHashedHtml = HashHTMLBlocks(normalized);
            string withoutLinkDefs = StripLinkDefinitions(withHashedHtml);
            string blocks = RunBlockGamut(withoutLinkDefs);
            string html = Unescape(blocks);

            Cleanup();

            return html + "\n";
        }
375
376
377
        /// <summary>
        /// Applies the block-level transformations: headers, horizontal rules, lists,
        /// code blocks, block quotes and finally paragraphs.
        /// </summary>
        /// <param name="text">text to process</param>
        /// <param name="unhash">passed through to FormParagraphs</param>
        private string RunBlockGamut(string text, bool unhash = true)
        {
            string result = DoHeaders(text);
            result = DoHorizontalRules(result);
            result = DoLists(result);
            result = DoCodeBlocks(result);
            result = DoBlockQuotes(result);

            // HashHTMLBlocks() already ran once in Transform(), but that pass escaped
            // raw HTML from the original Markdown source. This pass escapes the markup
            // generated above, so that no <p> tags get wrapped around block-level tags.
            result = HashHTMLBlocks(result);

            return FormParagraphs(result, unhash);
        }
398
399
400
        /// <summary>
        /// Applies the span-level transformations, i.e. those that happen *inside*
        /// block-level tags like paragraphs, headers, and list items.
        /// </summary>
        private string RunSpanGamut(string text)
        {
            string span = EscapeBackslashes(
                EscapeSpecialCharsWithinTagAttributes(
                    DoCodeSpans(text)));

            // Images go before anchors, because ![foo][f] looks like an anchor.
            span = DoAnchors(DoImages(span));

            // Auto-links go after DoAnchors(), because < and > may be used as
            // delimiters in inline links like [this](<url>).
            span = DoAutoLinks(span);

            // put back every "://" that was hidden from auto-linking
            span = span.Replace(AutoLinkPreventionMarker, "://");

            span = EncodeAmpsAndAngles(span);
            span = DoItalicsAndBold(span);

            return DoHardBreaks(span);
        }
425
426
        // Shared, precompiled patterns used when forming paragraphs; readonly so the
        // cached instances cannot be swapped out after type initialization.

        // newlines at the very start or the very end of the text
        private static readonly Regex _newlinesLeadingTrailing = new Regex(@"^\n+|\n+\z", RegexOptions.Compiled);
        // a run of two or more newlines
        private static readonly Regex _newlinesMultiple = new Regex(@"\n{2,}", RegexOptions.Compiled);
        // the (possibly empty) run of spaces at the start of the text
        private static readonly Regex _leadingWhitespace = new Regex(@"^[ ]*", RegexOptions.Compiled);

        // placeholder in the shape GetHashKey produces for HTML blocks
        private static readonly Regex _htmlBlockHash = new Regex("\x1AH\\d+H", RegexOptions.Compiled);
431
432
        /// <summary>
        /// splits on two or more newlines, to form "paragraphs";    
        /// each paragraph is then unhashed (if it is a hash and unhashing isn't turned off) or wrapped in HTML p tag
        /// </summary>
        /// <param name="text">block-level text; leading and trailing newlines are dropped first</param>
        /// <param name="unhash">when false, paragraphs starting with an HTML block placeholder are left untouched</param>
        /// <returns>the processed paragraphs joined by a blank line</returns>
        private string FormParagraphs(string text, bool unhash = true)
        {
            // split on two or more newlines
            string[] grafs = _newlinesMultiple.Split(_newlinesLeadingTrailing.Replace(text, ""));

            for (int i = 0; i < grafs.Length; i++)
            {
                // Ordinal comparison: the marker begins with the control character \x1A, and a
                // culture-sensitive StartsWith may treat such characters as ignorable, which
                // would make ordinary paragraphs beginning with "H" look like placeholders.
                if (grafs[i].StartsWith("\x1AH", StringComparison.Ordinal))
                {
                    // unhashify HTML blocks
                    if (unhash)
                    {
                        int sanityCheck = 50; // just for safety, guard against an infinite loop
                        bool keepGoing = true; // as long as replacements were made, keep going
                        while (keepGoing && sanityCheck > 0)
                        {
                            keepGoing = false;
                            grafs[i] = _htmlBlockHash.Replace(grafs[i], match =>
                            {
                                string block;
                                if (!_htmlBlocks.TryGetValue(match.Value, out block))
                                {
                                    // Looks like a placeholder but was never registered:
                                    // leave it as is instead of failing with KeyNotFoundException.
                                    return match.Value;
                                }

                                keepGoing = true;
                                return block;
                            });
                            sanityCheck--;
                        }
                        /* if (keepGoing)
                        {
                            // Logging of an infinite loop goes here.
                            // If such a thing should happen, please open a new issue on http://code.google.com/p/markdownsharp/
                            // with the input that caused it.
                        }*/
                    }
                }
                else
                {
                    // do span level processing inside the block, then wrap result in <p> tags
                    grafs[i] = _leadingWhitespace.Replace(RunSpanGamut(grafs[i]), "<p>") + "</p>";
                }
            }

            return string.Join("\n\n", grafs);
        }
477
478
479
        /// <summary>
        /// Resets the per-transform state: the URL, title and HTML block tables and the list level.
        /// </summary>
        private void Setup()
        {
            // These tables must be emptied, otherwise entries from other articles
            // conflict when one page is generated from more than one article
            // (e.g. an index page that shows the N most recent articles).
            _listLevel = 0;
            _htmlBlocks.Clear();
            _titles.Clear();
            _urls.Clear();
        }
490
491
        /// <summary>
        /// Releases the per-transform state; identical to <see cref="Setup"/>.
        /// </summary>
        private void Cleanup()
        {
            Setup();
        }
495
496
        // built on first use by GetNestedBracketsPattern()
        private static string _nestedBracketsPattern;

        /// <summary>
        /// Returns the reusable pattern matching balanced [brackets], building it on first use.
        /// See Friedl's "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
        /// </summary>
        private static string GetNestedBracketsPattern()
        {
            if (_nestedBracketsPattern != null)
            {
                return _nestedBracketsPattern;
            }

            // in other words [this] and [this[also]] and [this[also[too]]]
            // up to _nestDepth
            string openingLevels = RepeatString(@"
                    (?>              # Atomic matching
                       [^\[\]]+      # Anything other than brackets
                     |
                       \[
                           ", _nestDepth);
            string closingLevels = RepeatString(
                    @" \]
                    )*"
                    , _nestDepth);

            _nestedBracketsPattern = openingLevels + closingLevels;
            return _nestedBracketsPattern;
        }
519
520
        // built on first use by GetNestedParensPattern()
        private static string _nestedParensPattern;

        /// <summary>
        /// Returns the reusable pattern matching balanced (parens), building it on first use.
        /// See Friedl's "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
        /// </summary>
        private static string GetNestedParensPattern()
        {
            if (_nestedParensPattern != null)
            {
                return _nestedParensPattern;
            }

            // in other words (this) and (this(also)) and (this(also(too)))
            // up to _nestDepth
            string openingLevels = RepeatString(@"
                    (?>              # Atomic matching
                       [^()\s]+      # Anything other than parens or whitespace
                     |
                       \(
                           ", _nestDepth);
            string closingLevels = RepeatString(
                    @" \)
                    )*"
                    , _nestDepth);

            _nestedParensPattern = openingLevels + closingLevels;
            return _nestedParensPattern;
        }
543
544
        // Matches one link definition line ( [id]: url "optional title" ), indented by at
        // most _tabWidth - 1 spaces. Readonly so the cached instance cannot be replaced.
        private static readonly Regex _linkDef = new Regex(string.Format(@"
                        ^[ ]{{0,{0}}}\[(.+)\]:  # id = $1
                          [ ]*
                          \n?                   # maybe *one* newline
                          [ ]*
                        <?(\S+?)>?              # url = $2
                          [ ]*
                          \n?                   # maybe one newline
                          [ ]*
                        (?:
                            (?<=\s)             # lookbehind for whitespace
                            [""(]
                            (.+?)               # title = $3
                            ["")]
                            [ ]*
                        )?                      # title is optional
                        (?:\n+|\Z)", _tabWidth - 1), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
561
562
        /// <summary>
        /// Removes link definitions from the text; LinkEvaluator records the URL and
        /// title of each removed definition.
        /// </summary>
        /// <remarks>
        /// ^[id]: url "optional title"
        /// </remarks>
        private string StripLinkDefinitions(string text)
        {
            MatchEvaluator recordAndRemove = LinkEvaluator;
            return _linkDef.Replace(text, recordAndRemove);
        }
572
573
        /// <summary>
        /// Stores one matched link definition: the URL (group 2) and, when present, the
        /// title (group 3), both keyed by the lower-cased id (group 1).
        /// </summary>
        /// <returns>always the empty string, so the definition disappears from the text</returns>
        private string LinkEvaluator(Match match)
        {
            GroupCollection groups = match.Groups;
            string linkID = groups[1].Value.ToLowerInvariant();

            _urls[linkID] = EncodeAmpsAndAngles(groups[2].Value);

            Group title = groups[3];
            if (title != null && title.Length > 0)
            {
                // double quotes are stored as &quot;
                _titles[linkID] = title.Value.Replace("\"", "&quot;");
            }

            return "";
        }
583
584
        // Deliberately built without RegexOptions.Compiled: compiling this monster regex
        // results in worse performance. Readonly so the cached instance cannot be replaced.
        private static readonly Regex _blocksHtml = new Regex(GetBlockPattern(), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
586
587
588
        /// <summary>
        /// Assembles the regex source that recognises block-level HTML;
        /// derived pretty much verbatim from PHP Markdown.
        /// </summary>
        /// <returns>pattern text written for Multiline and IgnorePatternWhitespace matching</returns>
        private static string GetBlockPattern()
        {
            // Hashify HTML blocks:
            // Only block-level HTML tags, such as headers, lists, and tables, are
            // wanted here. "Paragraphs" wrapped in non-block-level tags, such as
            // anchors, phrase emphasis, and spans, must still get <p>s around them.
            // The tag lists are hard-coded:
            //
            // *  List "a" holds tags that can be either inline or block-level.
            //    They count as block-level when the start tag is alone on its
            //    line; otherwise they are not matched here and are handled as
            //    inline later.
            // *  List "b" holds tags that are always block-level.
            //
            string inlineOrBlockTags = "ins|del";
            string alwaysBlockTags = "p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|script|noscript|form|fieldset|iframe|math";

            // Optional attributes of a tag.
            string attributes = @"
            (?>                    # optional tag attributes
              \s                  # starts with whitespace
              (?>
                [^>""/]+              # text outside quotes
              |
                /+(?!>)                # slash not followed by >
              |
                ""[^""]*""            # text inside double quotes (tolerate >)
              |
                '[^']*'                  # text inside single quotes (tolerate >)
              )*
            )?  
            ";

            // Content of a block tag: one opening fragment and one closing fragment,
            // each repeated _nestDepth times around the innermost content.
            string openingLevel = @"
                (?>
                  [^<]+              # content without tag
                |
                  <\2              # nested opening tag
                    " + attributes + @"       # attributes
                  (?>
                      />
                  |
                      >";                // end of opening tag
            string innermost = ".*?";    // last level nested tag content
            string closingLevel = @"
                      </\2\s*>          # closing nested tag
                  )
                  |        
                  <(?!/\2\s*>           # other tags with a different name
                  )
                )*";

            string nestedContent =
                RepeatString(openingLevel, _nestDepth) + innermost + RepeatString(closingLevel, _nestDepth);

            // same content pattern, but referring back to group 3 (list "a") instead of group 2
            string nestedContentGroupA = nestedContent.Replace(@"\2", @"\3");

            // First, look for nested blocks, e.g.:
            //   <div>
            //     <div>
            //     tags for inner block must be indented.
            //     </div>
            //   </div>
            //
            // The outermost tags must start at the left margin for this to match, and
            // the inner nested divs must be indented.
            // This has to happen before the next, more liberal match, because the next
            // match will start at the first `<div>` and stop at the first `</div>`.
            string template = @"
            (?>
                  (?>
                    (?<=\n)     # Starting at the beginning of a line
                    |           # or
                    \A\n?       # the beginning of the doc
                  )
                  (             # save in $1

                    # Match from `\n<tag>` to `</tag>\n`, handling nested tags
                    # in between.
                      
                        <($block_tags_b_re)   # start tag = $2
                        $attr>                # attributes followed by > and \n
                        $content              # content, support nesting
                        </\2>                 # the matching end tag
                        [ ]*                  # trailing spaces
                        (?=\n+|\Z)            # followed by a newline or end of document

                  | # Special version for tags of group a.

                        <($block_tags_a_re)   # start tag = $3
                        $attr>[ ]*\n          # attributes followed by >
                        $content2             # content, support nesting
                        </\3>                 # the matching end tag
                        [ ]*                  # trailing spaces
                        (?=\n+|\Z)            # followed by a newline or end of document
                      
                  | # Special case just for <hr />. It was easier to make a special
                    # case than to make the other regex more complicated.
                  
                        [ ]{0,$less_than_tab}
                        <hr
                        $attr                 # attributes
                        /?>                   # the matching end tag
                        [ ]*
                        (?=\n{2,}|\Z)         # followed by a blank line or end of document
                  
                  | # Special case for standalone HTML comments:
                  
                      (?<=\n\n|\A)            # preceded by a blank line or start of document
                      [ ]{0,$less_than_tab}
                      (?s:
                        <!--(?:|(?:[^>-]|-[^>])(?:[^-]|-[^-])*)-->
                      )
                      [ ]*
                      (?=\n{2,}|\Z)            # followed by a blank line or end of document
                  
                  | # PHP and ASP-style processor instructions (<? and <%)
                  
                      [ ]{0,$less_than_tab}
                      (?s:
                        <([?%])                # $4
                        .*?
                        \4>
                      )
                      [ ]*
                      (?=\n{2,}|\Z)            # followed by a blank line or end of document
                      
                  )
            )";

            // Fill in the placeholders; $content2 must be substituted before $content,
            // because "$content" is a prefix of "$content2".
            return template
                .Replace("$less_than_tab", (_tabWidth - 1).ToString())
                .Replace("$block_tags_b_re", alwaysBlockTags)
                .Replace("$block_tags_a_re", inlineOrBlockTags)
                .Replace("$attr", attributes)
                .Replace("$content2", nestedContentGroupA)
                .Replace("$content", nestedContent);
        }
729
730
        /// <summary>
        /// Swaps every block-level HTML block in the text for a hash placeholder;
        /// HtmlEvaluator stores the block under that placeholder.
        /// </summary>
        private string HashHTMLBlocks(string text)
        {
            MatchEvaluator hashBlock = HtmlEvaluator;
            return _blocksHtml.Replace(text, hashBlock);
        }
737
738
        /// <summary>
        /// Stores the first capture group of <paramref name="match"/> in <c>_htmlBlocks</c>
        /// under a key from <see cref="GetHashKey"/> and returns that key surrounded by
        /// blank lines.
        /// </summary>
        private string HtmlEvaluator(Match match)
        {
            string html = match.Groups[1].Value;
            string placeholder = GetHashKey(html, isHtmlBlock: true);

            _htmlBlocks[placeholder] = html;

            return "\n\n" + placeholder + "\n\n";
        }
746
747
        /// <summary>
        /// Builds the placeholder key for a stashed fragment: the control character
        /// <c>\x1A</c>, a delimiter letter, the absolute value of the string's hash code,
        /// and the delimiter letter again.
        /// </summary>
        /// <param name="s">Text whose hash code forms the numeric part of the key.</param>
        /// <param name="isHtmlBlock">Selects the delimiter: 'H' when true, 'E' otherwise.</param>
        /// <returns>The placeholder key.</returns>
        private static string GetHashKey(string s, bool isHtmlBlock)
        {
            var delim = isHtmlBlock ? 'H' : 'E';

            // Widen to long before taking the absolute value: Math.Abs(int) throws
            // OverflowException for int.MinValue, which GetHashCode() may legitimately return.
            // Every other hash value produces exactly the same digits as before.
            long magnitude = Math.Abs((long)s.GetHashCode());

            return "\x1A" + delim + magnitude.ToString() + delim;
        }
752
753
        /// <summary>
        /// Matches one HTML token: a comment, a <c>&lt;? ... ?&gt;</c> processing instruction,
        /// or a tag. The tag alternative is assembled with <c>RepeatString</c> and
        /// <c>_nestDepth</c> so angle brackets may nest inside a tag.
        /// Read-only: the pattern is built once and never reassigned in the code shown here.
        /// </summary>
        private static readonly Regex _htmlTokens = new Regex(@"
            (<!--(?:|(?:[^>-]|-[^>])(?:[^-]|-[^-])*)-->)|        # match <!-- foo -->
            (<\?.*?\?>)|                 # match <?foo?> " +
            RepeatString(@"
            (<[A-Za-z\/!$](?:[^<>]|", _nestDepth) + RepeatString(@")*>)", _nestDepth) +
                                       " # match <tag> and </tag>",
            RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
760
761
        /// <summary>
        /// Splits the input into a list of tokens. Each token is either a tag (as matched by
        /// <c>_htmlTokens</c>, which allows tags nested inside a tag such as
        /// &lt;a href="&lt;MTFoo&gt;"&gt;) or the run of text between two tags.
        /// </summary>
        /// <param name="text">The string to tokenize.</param>
        /// <returns>Tokens in document order; text tokens are never empty.</returns>
        private List<Token> TokenizeHTML(string text)
        {
            var result = new List<Token>();
            int cursor = 0;

            // The token regex is derived from the _tokenize() subroutine in Brad Choate's MTRegex plugin.
            // http://www.bradchoate.com/past/mtregex.php
            foreach (Match tag in _htmlTokens.Matches(text))
            {
                // Emit whatever plain text lies between the previous tag and this one.
                int gap = tag.Index - cursor;
                if (gap > 0)
                    result.Add(new Token(TokenType.Text, text.Substring(cursor, gap)));

                result.Add(new Token(TokenType.Tag, tag.Value));
                cursor = tag.Index + tag.Length;
            }

            // Trailing text after the last tag (or the whole input when there are no tags).
            if (cursor < text.Length)
                result.Add(new Token(TokenType.Text, text.Substring(cursor)));

            return result;
        }
792
793
794
        /// <summary>
        /// Reference-style link: <c>[link text] [id]</c>. $1 = whole match, $2 = link text, $3 = id.
        /// Read-only: built once at type initialization.
        /// </summary>
        private static readonly Regex _anchorRef = new Regex(string.Format(@"
            (                               # wrap whole match in $1
                \[
                    ({0})                   # link text = $2
                \]

                [ ]?                        # one optional space
                (?:\n[ ]*)?                 # one optional newline followed by spaces

                \[
                    (.*?)                   # id = $3
                \]
            )", GetNestedBracketsPattern()), RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);

        /// <summary>
        /// Inline link: <c>[link text](url "title")</c>. $2 = link text, $3 = href,
        /// $5 = quote character, $6 = title.
        /// </summary>
        private static readonly Regex _anchorInline = new Regex(string.Format(@"
                (                           # wrap whole match in $1
                    \[
                        ({0})               # link text = $2
                    \]
                    \(                      # literal paren
                        [ ]*
                        ({1})               # href = $3
                        [ ]*
                        (                   # $4
                        (['""])           # quote char = $5
                        (.*?)               # title = $6
                        \5                  # matching quote
                        [ ]*                # ignore any spaces between closing quote and )
                        )?                  # title is optional
                    \)
                )", GetNestedBracketsPattern(), GetNestedParensPattern()),
                  RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);

        /// <summary>
        /// Reference shortcut: <c>[link text]</c>. $2 = link text, which cannot contain brackets.
        /// </summary>
        private static readonly Regex _anchorRefShortcut = new Regex(@"
            (                               # wrap whole match in $1
              \[
                 ([^\[\]]+)                 # link text = $2; can't contain [ or ]
              \]
            )", RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
833
834
        /// <summary>
        /// Turn Markdown link shortcuts into HTML anchor tags
        /// </summary>
        /// <remarks>
        /// [link text](url "title")
        /// [link text][id]
        /// [id]
        /// </remarks>
        private string DoAnchors(string text)
        {
            // Pass 1 - reference-style links: [link text] [id]
            string afterReferences = _anchorRef.Replace(text, AnchorRefEvaluator);

            // Pass 2 - inline-style links: [link text](url "optional title")
            string afterInline = _anchorInline.Replace(afterReferences, AnchorInlineEvaluator);

            // Pass 3 - reference-style shortcuts: [link text]
            // This pass must run last so it does not swallow the bracketed text of
            // [link test][1] or [link test](/foo) before the earlier passes see it.
            return _anchorRefShortcut.Replace(afterInline, AnchorRefShortcutEvaluator);
        }
856
857
        /// <summary>
        /// Returns <paramref name="s"/> with every "://" replaced by
        /// <c>AutoLinkPreventionMarker</c>.
        /// </summary>
        private string SaveFromAutoLinking(string s)
        {
            const string schemeSeparator = "://";
            string shielded = s.Replace(schemeSeparator, AutoLinkPreventionMarker);
            return shielded;
        }
861
862
        /// <summary>
        /// Builds the anchor tag for a reference-style link match (<c>[link text][id]</c>).
        /// </summary>
        /// <param name="match">Match with $1 = whole match, $2 = link text, $3 = id.</param>
        /// <returns>
        /// The anchor markup when the id is present in <c>_urls</c>; otherwise the
        /// original matched text, unchanged.
        /// </returns>
        private string AnchorRefEvaluator(Match match)
        {
            string wholeMatch = match.Groups[1].Value;
            string linkText = SaveFromAutoLinking(match.Groups[2].Value);
            string linkID = match.Groups[3].Value.ToLowerInvariant();

            // for shortcut links like [this][].
            if (linkID == "")
                linkID = linkText.ToLowerInvariant();

            // Unknown reference id: leave the source text exactly as written.
            if (!_urls.ContainsKey(linkID))
                return wholeMatch;

            string url = _urls[linkID];
            url = EncodeProblemUrlChars(url);
            url = EscapeBoldItalic(url);

            string result = "<a href=\"" + url + "\"";

            if (_titles.ContainsKey(linkID))
            {
                // Encode the title once, then escape emphasis characters - the same order
                // the shortcut and inline evaluators use. The title used to pass through
                // AttributeEncode twice, which double-encodes unless that helper is idempotent.
                string title = AttributeEncode(_titles[linkID]);
                title = EscapeBoldItalic(title);
                result += " title=\"" + title + "\"";
            }

            result += ">" + linkText + "</a>";
            return result;
        }
896
897
        /// <summary>
        /// Builds the anchor tag for a reference shortcut match (<c>[link text]</c>), using the
        /// link text itself as the reference id.
        /// </summary>
        /// <param name="match">Match with $1 = whole match, $2 = link text.</param>
        /// <returns>
        /// The anchor markup when the id is present in <c>_urls</c>; otherwise the
        /// original matched text.
        /// </returns>
        private string AnchorRefShortcutEvaluator(Match match)
        {
            string linkText = SaveFromAutoLinking(match.Groups[2].Value);

            // The id is the lower-cased text with each newline (and the spaces around it)
            // collapsed to a single space.
            string linkID = Regex.Replace(linkText.ToLowerInvariant(), @"[ ]*\n[ ]*", " ");

            if (!_urls.ContainsKey(linkID))
                return match.Groups[1].Value;

            string href = EscapeBoldItalic(EncodeProblemUrlChars(_urls[linkID]));
            string anchor = "<a href=\"" + href + "\"";

            if (_titles.ContainsKey(linkID))
            {
                string tooltip = EscapeBoldItalic(AttributeEncode(_titles[linkID]));
                anchor += " title=\"" + tooltip + "\"";
            }

            return anchor + ">" + linkText + "</a>";
        }
927
928
929
        /// <summary>
        /// Builds the anchor tag for an inline link match (<c>[link text](url "title")</c>).
        /// </summary>
        /// <param name="match">Match with $2 = link text, $3 = href, $6 = optional title.</param>
        /// <returns>The anchor markup.</returns>
        private string AnchorInlineEvaluator(Match match)
        {
            string linkText = SaveFromAutoLinking(match.Groups[2].Value);
            string title = match.Groups[6].Value;

            string url = EscapeBoldItalic(EncodeProblemUrlChars(match.Groups[3].Value));

            // A URL written as <http://...> loses its surrounding angle brackets.
            bool angleWrapped = url.StartsWith("<") && url.EndsWith(">");
            if (angleWrapped)
                url = url.Substring(1, url.Length - 2);

            string anchor = "<a href=\"" + url + "\"";

            if (!String.IsNullOrEmpty(title))
            {
                string tooltip = EscapeBoldItalic(AttributeEncode(title));
                anchor += " title=\"" + tooltip + "\"";
            }

            return anchor + ">" + linkText + "</a>";
        }
953
954
        /// <summary>
        /// Reference-style image: <c>![alt text][id]</c>. $1 = whole match, $2 = alt text, $3 = id.
        /// Read-only: built once at type initialization.
        /// </summary>
        private static readonly Regex _imagesRef = new Regex(@"
                    (               # wrap whole match in $1
                    !\[
                        (.*?)       # alt text = $2
                    \]

                    [ ]?            # one optional space
                    (?:\n[ ]*)?     # one optional newline followed by spaces

                    \[
                        (.*?)       # id = $3
                    \]

                    )", RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);

        /// <summary>
        /// Inline image: <c>![alt text](url "optional title")</c>. $2 = alt text, $3 = href,
        /// $5 = quote character, $6 = title.
        /// </summary>
        private static readonly Regex _imagesInline = new Regex(String.Format(@"
              (                     # wrap whole match in $1
                !\[
                    (.*?)           # alt text = $2
                \]
                \s?                 # one optional whitespace character
                \(                  # literal paren
                    [ ]*
                    ({0})           # href = $3
                    [ ]*
                    (               # $4
                    (['""])       # quote char = $5
                    (.*?)           # title = $6
                    \5              # matching quote
                    [ ]*
                    )?              # title is optional
                \)
              )", GetNestedParensPattern()),
                  RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);
988
989
        /// <summary>
        /// Turn Markdown image shortcuts into HTML img tags.
        /// </summary>
        /// <remarks>
        /// ![alt text][id]
        /// ![alt text](url "optional title")
        /// </remarks>
        private string DoImages(string text)
        {
            // Reference-style labeled images come first: ![alt text][id]
            string afterReferences = _imagesRef.Replace(text, ImageReferenceEvaluator);

            // Then inline images: ![alt text](url "optional title")
            // (the evaluator takes care of encoding * and _)
            return _imagesInline.Replace(afterReferences, ImageInlineEvaluator);
        }
1007
1008
        // Guards against badly broken HTML when several syntax ambiguities collide inside
        // image alt text. The output may still not be what the author intended, but it is
        // no longer garbage: emphasis characters are escaped, then each bracket or
        // parenthesis is replaced by its entry in _escapeTable.
        private string EscapeImageAltText(string s)
        {
            string escaped = EscapeBoldItalic(s);
            return Regex.Replace(escaped, @"[\[\]()]", bracket => _escapeTable[bracket.Value]);
        }
1017
1018
        /// <summary>
        /// Builds the img tag for a reference-style image match (<c>![alt text][id]</c>).
        /// </summary>
        /// <param name="match">Match with $1 = whole match, $2 = alt text, $3 = id.</param>
        /// <returns>
        /// The img markup when the id is present in <c>_urls</c>; otherwise the original
        /// matched text.
        /// </returns>
        private string ImageReferenceEvaluator(Match match)
        {
            string altText = match.Groups[2].Value;
            string linkID = match.Groups[3].Value.ToLowerInvariant();

            // An empty id, as in ![this][], means the alt text doubles as the id.
            if (linkID == "")
                linkID = altText.ToLowerInvariant();

            // No such link id: leave the source intact.
            if (!_urls.ContainsKey(linkID))
                return match.Groups[1].Value;

            string title = _titles.ContainsKey(linkID) ? _titles[linkID] : null;
            return ImageTag(_urls[linkID], altText, title);
        }
1044
1045
        /// <summary>
        /// Builds the img tag for an inline image match (<c>![alt](url "title")</c>).
        /// </summary>
        /// <param name="match">Match with $2 = alt text, $3 = href, $6 = optional title.</param>
        private string ImageInlineEvaluator(Match match)
        {
            GroupCollection parts = match.Groups;
            string source = parts[3].Value;

            // Strip the angle brackets from a URL written as <http://...>.
            bool angleWrapped = source.StartsWith("<") && source.EndsWith(">");
            if (angleWrapped)
                source = source.Substring(1, source.Length - 2);

            return ImageTag(source, parts[2].Value, parts[6].Value);
        }
1056
1057
        /// <summary>
        /// Assembles an img element from its parts.
        /// </summary>
        /// <param name="url">Image source; problem characters and emphasis characters are escaped here.</param>
        /// <param name="altText">Alternate text; attribute-encoded and escaped here.</param>
        /// <param name="title">Optional title; the attribute is omitted when null or empty.</param>
        /// <returns>The img markup, closed with <c>_emptyElementSuffix</c>.</returns>
        private string ImageTag(string url, string altText, string title)
        {
            altText = EscapeImageAltText(AttributeEncode(altText));
            url = EscapeBoldItalic(EncodeProblemUrlChars(url));

            string tag = "<img src=\"" + url + "\" alt=\"" + altText + "\"";

            if (!String.IsNullOrEmpty(title))
            {
                string tooltip = AttributeEncode(EscapeBoldItalic(title));
                tag += " title=\"" + tooltip + "\"";
            }

            return tag + _emptyElementSuffix;
        }
1071
1072
        /// <summary>
        /// Setext header: a text line followed by a line of ='s or -'s.
        /// $1 = header text, $2 = the underline run.
        /// Read-only: built once at type initialization.
        /// </summary>
        private static readonly Regex _headerSetext = new Regex(@"
                ^(.+?)      # $1 = Header text
                [ ]*
                \n
                (=+|-+)     # $2 = string of ='s or -'s
                [ ]*
                \n+",
            RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);

        /// <summary>
        /// ATX header: one to six #'s, the header text, and optional closing #'s.
        /// $1 = the leading #'s, $2 = header text.
        /// </summary>
        private static readonly Regex _headerAtx = new Regex(@"
                ^(\#{1,6})  # $1 = string of #'s
                [ ]*
                (.+?)       # $2 = Header text
                [ ]*
                \#*         # optional closing #'s (not counted)
                \n+",
            RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
1089
1090
        /// <summary>
        /// Turn Markdown headers into HTML header tags
        /// </summary>
        /// <remarks>
        /// Header 1
        /// ========
        ///
        /// Header 2
        /// --------
        ///
        /// # Header 1
        /// ## Header 2
        /// ## Header 2 with closing hashes ##
        /// ...
        /// ###### Header 6
        /// </remarks>
        private string DoHeaders(string text)
        {
            // Underlined (setext) headers are converted before #-prefixed (ATX) ones.
            string afterSetext = _headerSetext.Replace(text, SetextHeaderEvaluator);
            return _headerAtx.Replace(afterSetext, AtxHeaderEvaluator);
        }
1112
1113
        /// <summary>
        /// Renders a setext header match: level 1 when the underline starts with '=',
        /// level 2 otherwise. The header text goes through <c>RunSpanGamut</c>.
        /// </summary>
        private string SetextHeaderEvaluator(Match match)
        {
            string underline = match.Groups[2].Value;
            int level = underline.StartsWith("=") ? 1 : 2;
            string body = RunSpanGamut(match.Groups[1].Value);

            return "<h" + level + ">" + body + "</h" + level + ">\n\n";
        }
1119
1120
        /// <summary>
        /// Renders an ATX header match: the level is the number of leading #'s ($1) and the
        /// header text ($2) goes through <c>RunSpanGamut</c>.
        /// </summary>
        private string AtxHeaderEvaluator(Match match)
        {
            int level = match.Groups[1].Value.Length;
            string body = RunSpanGamut(match.Groups[2].Value);

            return "<h" + level + ">" + body + "</h" + level + ">\n\n";
        }
1126
1127
1128
        /// <summary>
        /// Horizontal rule: a line holding three or more of the same marker (-, * or _),
        /// each optionally separated by up to two spaces, with up to three leading spaces.
        /// Read-only: built once at type initialization.
        /// </summary>
        private static readonly Regex _horizontalRules = new Regex(@"
            ^[ ]{0,3}         # Leading space
                ([-*_])       # $1: First marker
                (?>           # Repeated marker group
                    [ ]{0,2}  # Zero, one, or two spaces.
                    \1        # Marker character
                ){2,}         # Group repeated at least twice
                [ ]*          # Trailing spaces
                $             # End of line.
            ", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
1138
1139
        /// <summary>
        /// Turn Markdown horizontal rules into HTML hr tags
        /// </summary>
        /// <remarks>
        /// ***
        /// * * *
        /// ---
        /// - - -
        /// </remarks>
        private string DoHorizontalRules(string text)
        {
            string ruleMarkup = "<hr" + _emptyElementSuffix + "\n";
            return _horizontalRules.Replace(text, ruleMarkup);
        }
1152
1153
        /// <summary>
        /// Pattern text for one whole list, shared by <c>_listNested</c> and
        /// <c>_listTopLevel</c>. $1 = whole list, $3 = first list item marker.
        /// Read-only: built once at type initialization.
        /// </summary>
        private static readonly string _wholeList = string.Format(@"
            (                               # $1 = whole list
              (                             # $2
                [ ]{{0,{1}}}
                ({0})                       # $3 = first list item marker
                [ ]+
              )
              (?s:.+?)
              (                             # $4
                  \z
                |
                  \n{{2,}}
                  (?=\S)
                  (?!                       # Negative lookahead for another list item marker
                    [ ]*
                    {0}[ ]+
                  )
              )
            )", string.Format("(?:{0}|{1})", _markerUL, _markerOL), _tabWidth - 1);

        /// <summary>Whole-list pattern anchored at the start of any line; used inside a list.</summary>
        private static readonly Regex _listNested = new Regex(@"^" + _wholeList,
            RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);

        /// <summary>
        /// Whole-list pattern that must follow a blank line or sit at the start of the
        /// document; used outside any list.
        /// </summary>
        private static readonly Regex _listTopLevel = new Regex(@"(?:(?<=\n\n)|\A\n?)" + _wholeList,
            RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
1178
1179
        /// <summary>
        /// Turn Markdown lists into HTML ul and ol and li tags
        /// </summary>
        private string DoLists(string text)
        {
            // Nested lists are recognised with a different prefix than top-level lists;
            // the reasoning is spelled out in the long comment in ProcessListItems().
            Regex listPattern = _listLevel > 0 ? _listNested : _listTopLevel;

            return listPattern.Replace(text, ListEvaluator);
        }
1193
1194
        /// <summary>
        /// Renders one whole-list match: picks ul or ol from the first item marker ($3),
        /// converts the items with <see cref="ProcessListItems"/>, and wraps the result.
        /// </summary>
        private string ListEvaluator(Match match)
        {
            string wholeList = match.Groups[1].Value;
            bool unordered = Regex.IsMatch(match.Groups[3].Value, _markerUL);

            string tag = unordered ? "ul" : "ol";
            string items = ProcessListItems(wholeList, unordered ? _markerUL : _markerOL);

            return "<" + tag + ">\n" + items + "</" + tag + ">\n";
        }
1205
1206
        /// <summary>
        /// Process the contents of a single ordered or unordered list, splitting it
        /// into individual list items.
        /// </summary>
        /// <param name="list">Raw text of one whole list.</param>
        /// <param name="marker">Regex fragment that matches this list's item marker.</param>
        /// <returns>The list text with every item wrapped in <c>&lt;li&gt;</c> tags.</returns>
        private string ProcessListItems(string list, string marker)
        {
            // The listLevel global keeps track of when we're inside a list.
            // Each time we enter a list, we increment it; when we leave a list,
            // we decrement. If it's zero, we're not in a list anymore.

            // We do this because when we're not inside a list, we want to treat
            // something like this:

            //    I recommend upgrading to version
            //    8. Oops, now this line is treated
            //    as a sub-list.

            // As a single paragraph, despite the fact that the second line starts
            // with a digit-period-space sequence.

            // Whereas when we're inside a list (or sub-list), that line will be
            // treated as the start of a sub-list. What a kludge, huh? This is
            // an aspect of Markdown's syntax that's hard to parse perfectly
            // without resorting to mind-reading. Perhaps the solution is to
            // change the syntax rules such that sub-lists must start with a
            // starting cardinal number; e.g. "1." or "a.".

            _listLevel++;
            try
            {
                // Trim trailing blank lines:
                list = Regex.Replace(list, @"\n{2,}\z", "\n");

                string pattern = string.Format(
                  @"(^[ ]*)                    # leading whitespace = $1
                    ({0}) [ ]+                 # list marker = $2
                    ((?s:.+?)                  # list item text = $3
                    (\n+))
                    (?= (\z | \1 ({0}) [ ]+))", marker);

                bool lastItemHadADoubleNewline = false;

                // has to be a closure, so subsequent invocations can share the bool
                MatchEvaluator listItemEvaluator = (Match match) =>
                {
                    string item = match.Groups[3].Value;

                    // Ordinal comparison: this is a structural check on newline
                    // characters, not a linguistic one, so culture rules must not apply.
                    bool endsWithDoubleNewline = item.EndsWith("\n\n", StringComparison.Ordinal);
                    bool containsDoubleNewline = endsWithDoubleNewline || item.Contains("\n\n");

                    if (containsDoubleNewline || lastItemHadADoubleNewline)
                    {
                        // we could correct any bad indentation here..
                        item = RunBlockGamut(Outdent(item) + "\n", unhash: false);
                    }
                    else
                    {
                        // recursion for sub-lists
                        item = DoLists(Outdent(item));
                        item = item.TrimEnd('\n');
                        item = RunSpanGamut(item);
                    }

                    lastItemHadADoubleNewline = endsWithDoubleNewline;
                    return string.Format("<li>{0}</li>\n", item);
                };

                return Regex.Replace(list, pattern, listItemEvaluator,
                                     RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
            }
            finally
            {
                // Restore the nesting depth even when an item evaluator throws, so a failed
                // conversion cannot leave this instance believing it is still inside a list.
                _listLevel--;
            }
        }
1274
1275
        /// <summary>
        /// Indented code block: one or more lines that each start with <c>_tabWidth</c>
        /// spaces, preceded by a blank line or the start of the document. $1 = the block.
        /// Read-only: built once at type initialization.
        /// </summary>
        private static readonly Regex _codeBlock = new Regex(string.Format(@"
                    (?:\n\n|\A\n?)
                    (                        # $1 = the code block -- one or more lines, starting with a space
                    (?:
                        (?:[ ]{{{0}}})       # Lines must start with a tab-width of spaces
                        .*\n+
                    )+
                    )
                    ((?=^[ ]{{0,{0}}}[^ \t\n])|\Z) # Lookahead for non-space at line-start, or end of doc",
                    _tabWidth), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
1285
1286
        /// <summary>
        /// Turn Markdown 4-space indented code into HTML pre code blocks
        /// </summary>
        private string DoCodeBlocks(string text)
        {
            MatchEvaluator renderBlock = CodeBlockEvaluator;
            return _codeBlock.Replace(text, renderBlock);
        }
1294
1295
        /// <summary>
        /// Renders one code block match: outdents and encodes the block ($1), strips the
        /// text matched by <c>_newlinesLeadingTrailing</c>, and wraps it in pre/code tags.
        /// </summary>
        private string CodeBlockEvaluator(Match match)
        {
            string code = Outdent(match.Groups[1].Value);
            code = EncodeCode(code);
            code = _newlinesLeadingTrailing.Replace(code, "");

            return "\n\n<pre><code>" + code + "\n</code></pre>\n\n";
        }
1304
1305
        /// <summary>
        /// Code span: a run of backticks, the code, and a closing run of the same length.
        /// $1 = the opening backticks, $2 = the code.
        /// Read-only: built once at type initialization.
        /// </summary>
        private static readonly Regex _codeSpan = new Regex(@"
                    (?<!\\)   # Character before opening ` can't be a backslash
                    (`+)      # $1 = Opening run of `
                    (.+?)     # $2 = The code block
                    (?<!`)
                    \1
                    (?!`)", RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);
1312
1313
        /// <summary>
        /// Turn Markdown `code spans` into HTML code tags
        /// </summary>
        private string DoCodeSpans(string text)
        {
            // Delimiter rules:
            //
            //  - Several backticks may delimit a span so that the code itself can hold
            //    literal backticks. The input
            //
            //        Just type ``foo `bar` baz`` at the prompt.
            //
            //    becomes
            //
            //        <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
            //
            //    Any number of backticks can serve as the delimiter: for three
            //    consecutive backticks in the code, delimit with four, and so on.
            //
            //  - Spaces inside the delimiters allow literal backticks at the edges:
            //
            //        ... type `` `bar` `` ...
            //
            //    becomes
            //
            //        ... type <code>`bar`</code> ...

            MatchEvaluator renderSpan = CodeSpanEvaluator;
            return _codeSpan.Replace(text, renderSpan);
        }
1342
1343
        /// <summary>
        /// Renders one code span match: trims spaces at both ends of the code ($2), encodes
        /// it, shields it from auto-linking, and wraps it in code tags.
        /// </summary>
        private string CodeSpanEvaluator(Match match)
        {
            string code = match.Groups[2].Value;

            code = Regex.Replace(code, @"^[ ]*", ""); // leading spaces
            code = Regex.Replace(code, @"[ ]*$", ""); // trailing spaces

            // Auto-link protection is applied to code spans only; code blocks do not get it.
            code = SaveFromAutoLinking(EncodeCode(code));

            return "<code>" + code + "</code>";
        }
1353
1354
1355
        // Emphasis patterns. The plain variants put the delimiter in $1 and the content in $2.
        // The strict variants also capture the character (or line edge) on either side, so the
        // content is $3 and the surrounding context is $1 and $4.
        // All four are read-only: built once at type initialization.
        private static readonly Regex _bold = new Regex(@"(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1",
            RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);
        private static readonly Regex _strictBold = new Regex(@"([\W_]|^) (\*\*|__) (?=\S) ([^\r]*?\S[\*_]*) \2 ([\W_]|$)",
            RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);

        private static readonly Regex _italic = new Regex(@"(\*|_) (?=\S) (.+?) (?<=\S) \1",
            RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);
        private static readonly Regex _strictItalic = new Regex(@"([\W_]|^) (\*|_) (?=\S) ([^\r\*_]*?\S) \2 ([\W_]|$)",
            RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);
1364
1365
        /// <summary>
        /// Turn Markdown *italics* and **bold** into HTML strong and em tags
        /// </summary>
        private string DoItalicsAndBold(string text)
        {
            // In both modes <strong> is substituted before <em>.
            if (!_strictBoldItalic)
            {
                string looseBold = _bold.Replace(text, "<strong>$2</strong>");
                return _italic.Replace(looseBold, "<em>$2</em>");
            }

            string strictBold = _strictBold.Replace(text, "$1<strong>$3</strong>$4");
            return _strictItalic.Replace(strictBold, "$1<em>$3</em>$4");
        }
1384
1385
        /// <summary>
        /// Turn markdown line breaks (two space at end of line) into HTML break tags.
        /// When <c>_autoNewlines</c> is set, every newline becomes a break instead.
        /// </summary>
        private string DoHardBreaks(string text)
        {
            string breakMarkup = string.Format("<br{0}\n", _emptyElementSuffix);
            string trigger = _autoNewlines ? @"\n" : @" {2,}\n";

            return Regex.Replace(text, trigger, breakMarkup);
        }
1396
1397
        /// <summary>
        /// Blockquote: one or more groups that start with a '>' line, continue through the
        /// following non-blank lines, and end with any trailing blank lines. $1 = whole match.
        /// Read-only: built once at type initialization.
        /// </summary>
        private static readonly Regex _blockquote = new Regex(@"
            (                           # Wrap whole match in $1
                (
                ^[ ]*>[ ]?              # '>' at the start of a line
                    .+\n                # rest of the first line
                (.+\n)*                 # subsequent consecutive lines
                \n*                     # blanks
                )+
            )", RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline | RegexOptions.Compiled);
1406
1407
        /// <summary>
        /// Turn Markdown > quoted blocks into HTML blockquote blocks
        /// </summary>
        private string DoBlockQuotes(string text)
        {
            MatchEvaluator renderQuote = BlockQuoteEvaluator;
            return _blockquote.Replace(text, renderQuote);
        }
1414
1415
        /// <summary>
        /// Renders one blockquote match: strips one level of quoting, runs the block gamut
        /// on the content, indents it, wraps it in blockquote tags, and stashes the result
        /// in <c>_htmlBlocks</c>.
        /// </summary>
        /// <returns>The hash key of the stashed blockquote, surrounded by blank lines.</returns>
        private string BlockQuoteEvaluator(Match match)
        {
            const RegexOptions perLine = RegexOptions.Multiline;

            string quoted = match.Groups[1].Value;

            quoted = Regex.Replace(quoted, @"^[ ]*>[ ]?", "", perLine);   // drop one level of quoting
            quoted = Regex.Replace(quoted, @"^[ ]+$", "", perLine);       // empty out space-only lines
            quoted = RunBlockGamut(quoted);                               // recurse into the content

            // Indent every line by two spaces...
            quoted = Regex.Replace(quoted, @"^", "  ", perLine);

            // ...then undo that indentation inside <pre> content, where it would be visible.
            quoted = Regex.Replace(quoted, @"(\s*<pre>.+?</pre>)", BlockQuoteEvaluator2,
                                   RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);

            string html = "<blockquote>\n" + quoted + "\n</blockquote>";
            string placeholder = GetHashKey(html, isHtmlBlock: true);
            _htmlBlocks[placeholder] = html;

            return string.Concat("\n\n", placeholder, "\n\n");
        }
1434
1435
        /// <summary>
        /// Removes two leading spaces from every line of the captured pre block ($1).
        /// </summary>
        private string BlockQuoteEvaluator2(Match match)
        {
            string preBlock = match.Groups[1].Value;
            return Regex.Replace(preBlock, @"^  ", "", RegexOptions.Multiline);
        }
1439
1440
        /// <summary>
        /// Bare http, https or ftp URL. $1 = an optional preceding <c>&lt;</c> or <c>="</c>,
        /// $2 = protocol, $3 = the rest of the link starting at "://".
        /// Read-only: built once at type initialization.
        /// </summary>
        private static readonly Regex _autolinkBare = new Regex(@"(<|="")?\b(https?|ftp)(://[-A-Z0-9+&@#/%?=~_|\[\]\(\)!:,\.;]*[-A-Z0-9+&@#/%=~_|\[\])])(?=$|\W)",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);
1442
1443
        /// <summary>
        /// Wraps a bare URL match in angle brackets, moving unbalanced closing parentheses
        /// at the end of the URL outside the brackets.
        /// </summary>
        /// <param name="match">Match with $1 = optional prefix, $2 = protocol, $3 = rest of the link.</param>
        /// <returns>
        /// The match unchanged when $1 matched; otherwise the bracketed URL followed by any
        /// closing parentheses that were split off.
        /// </returns>
        private static string handleTrailingParens(Match match)
        {
            // Group 1 acts as a hand-rolled negative lookbehind: a URL preceded by < or =" is
            // left alone. A real lookbehind is not used because of links inside links, such as
            // <a href="http://web.archive.org/web/20121130000728/http://www.google.com/">.
            // A real lookbehind would reject the outer link and then match the inner
            // http://www.google.com on its own. Matching the outer link here (and returning
            // it untouched) consumes the inner link, so it is never matched again.
            if (match.Groups[1].Success)
                return match.Value;

            string scheme = match.Groups[2].Value;
            string rest = match.Groups[3].Value;

            if (rest.EndsWith(")"))
            {
                // Track parenthesis depth; an opening paren seen at depth zero or below
                // restarts the count at one.
                int depth = 0;
                foreach (char c in rest)
                {
                    if (c == '(')
                        depth = depth <= 0 ? 1 : depth + 1;
                    else if (c == ')')
                        depth--;
                }

                if (depth < 0)
                {
                    // Split off up to -depth closing parens from the end of the link.
                    string spill = "";
                    rest = Regex.Replace(rest, @"\){1," + (-depth) + "}$", m => { spill = m.Value; return ""; });
                    return "<" + scheme + rest + ">" + spill;
                }
            }

            return "<" + scheme + rest + ">";
        }
1479
1480
        /// <summary>
        /// Turns angle-delimited URLs (and, when enabled, bare URLs and e-mail
        /// addresses) into HTML anchor tags.
        /// </summary>
        /// <remarks>
        /// &lt;http://www.example.com&gt;
        /// </remarks>
        /// <param name="text">Text to process.</param>
        /// <returns>The text with the recognised links replaced by anchor markup.</returns>
        private string DoAutoLinks(string text)
        {
            string result = text;

            // Fix up arbitrary URLs by adding Markdown < > so they get linked as well.
            // At this point all other URLs in the text are already hyperlinked as
            // <a href=""></a>, *except* for the <http://www.foo.com> case.
            if (_autoHyperlink)
                result = _autolinkBare.Replace(result, handleTrailingParens);

            // Hyperlinks: <http://foo.com>
            result = Regex.Replace(result, "<((https?|ftp):[^'\">\\s]+)>", HyperlinkEvaluator);

            if (!_linkEmails)
                return result;

            // Email addresses: <address@domain.foo>
            const RegexOptions emailOptions = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace;
            string emailPattern =
                    @"<
                      (?:mailto:)?
                      (
                        [-.\w]+
                        \@
                        [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
                      )
                      >";

            return Regex.Replace(result, emailPattern, EmailEvaluator, emailOptions);
        }
1517
1518
        /// <summary>
        /// Builds an anchor tag whose href and visible text are both the URL captured
        /// in group 1 of the match.
        /// </summary>
        private string HyperlinkEvaluator(Match match)
        {
            string url = match.Groups[1].Value;
            return "<a href=\"" + url + "\">" + url + "</a>";
        }
1523
1524
        /// <summary>
        /// Turns a matched e-mail address (capture group 1) into an obfuscated mailto link.
        /// </summary>
        /// <remarks>
        /// Input: an email address, e.g. "foo@example.com".
        /// Output: the address as a mailto link, with the characters encoded by
        /// <see cref="EncodeEmailAddress"/> as decimal or hex entities, in the hope of
        /// foiling most address-harvesting spam bots. E.g.:
        ///
        ///   &lt;a href="&amp;#x6D;&amp;#97;&amp;#105;&amp;#108;&amp;#x74;&amp;#111;:&amp;#102;..."&gt;&amp;#102;&amp;#111;&amp;#111;...&lt;/a&gt;
        ///
        /// Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
        /// mailing list: http://tinyurl.com/yu7ue
        /// </remarks>
        private string EmailEvaluator(Match match)
        {
            string address = Unescape(match.Groups[1].Value);

            // The ':' of "mailto:" is left unencoded so it can be located again below.
            string encoded = EncodeEmailAddress("mailto:" + address);

            string anchor = "<a href=\"" + encoded + "\">" + encoded + "</a>";

            // Strip the mailto: prefix from the visible part of the link.
            return Regex.Replace(anchor, "\">.+?:", "\">");
        }
1553
1554
1555
        private static Regex _outDent = new Regex(@"^[ ]{1," + _tabWidth + @"}", RegexOptions.Multiline | RegexOptions.Compiled);

        /// <summary>
        /// Removes one level of indentation: between one and <c>_tabWidth</c> leading
        /// spaces are stripped from the start of every line.
        /// </summary>
        /// <param name="block">Text to outdent.</param>
        /// <returns>The outdented text.</returns>
        private string Outdent(string block)
        {
            string outdented = _outDent.Replace(block, string.Empty);
            return outdented;
        }
1564
1565
1566
        #region Encoding and Normalization
1567
1568
1569
        /// <summary>
        /// Randomly encodes the characters of an e-mail address:
        /// roughly 10% are left raw, 45% become hex entities and 45% decimal entities.
        /// '@' is always encoded and ':' never is.
        /// </summary>
        /// <param name="addr">The address (including any "mailto:" prefix) to encode.</param>
        /// <returns>The entity-encoded address.</returns>
        private string EncodeEmailAddress(string addr)
        {
            var encoded = new StringBuilder(addr.Length * 5);
            var rng = new Random();

            for (int i = 0; i < addr.Length; i++)
            {
                char ch = addr[i];
                int roll = rng.Next(1, 100);

                // ':' always stays raw; '@' never does; anything else stays raw on a high roll.
                bool keepRaw = ch != '@' && (ch == ':' || roll > 90);
                if (keepRaw)
                {
                    encoded.Append(ch);                          // m
                    continue;
                }

                if (roll < 45)
                    encoded.AppendFormat("&#x{0:x};", (int)ch);  // &#x6D
                else
                    encoded.AppendFormat("&#{0};", (int)ch);     // &#109
            }

            return encoded.ToString();
        }
1591
1592
        private static Regex _codeEncoder = new Regex(@"&|<|>|\\|\*|_|\{|\}|\[|\]", RegexOptions.Compiled);

        /// <summary>
        /// Encodes/escapes certain Markdown characters inside code blocks and spans,
        /// where they are literals.
        /// </summary>
        /// <param name="code">The raw code text.</param>
        /// <returns>
        /// The text with every match of <c>_codeEncoder</c> replaced by the result of
        /// <see cref="EncodeCodeEvaluator"/>.
        /// </returns>
        private string EncodeCode(string code)
        {
            MatchEvaluator encodeOne = EncodeCodeEvaluator;
            return _codeEncoder.Replace(code, encodeOne);
        }
1601
        /// <summary>
        /// Maps a single special character found in code to its encoded form.
        /// </summary>
        /// <param name="match">A match whose value is the character to encode.</param>
        /// <returns>
        /// An HTML entity for <c>&amp;</c>, <c>&lt;</c> and <c>&gt;</c>; for anything
        /// else, the entry stored for it in <c>_escapeTable</c>.
        /// </returns>
        private string EncodeCodeEvaluator(Match match)
        {
            string found = match.Value;

            // Encode all ampersands; HTML entities are not entities within a Markdown code span.
            if (found == "&")
                return "&amp;";

            // Do the angle bracket song and dance.
            if (found == "<")
                return "&lt;";
            if (found == ">")
                return "&gt;";

            // Everything else is a character that is magic in Markdown.
            return _escapeTable[found];
        }
1619
1620
1621
        private static Regex _amps = new Regex(@"&(?!((#[0-9]+)|(#[xX][a-fA-F0-9]+)|([a-zA-Z][a-zA-Z0-9]*));)", RegexOptions.ExplicitCapture | RegexOptions.Compiled);
        private static Regex _angles = new Regex(@"<(?![A-Za-z/?\$!])", RegexOptions.ExplicitCapture | RegexOptions.Compiled);

        /// <summary>
        /// Encodes ampersands that are not part of an HTML entity, and left angle
        /// brackets that are not followed by a letter, '/', '?', '$' or '!'.
        /// </summary>
        /// <param name="s">Text to encode.</param>
        /// <returns>The text with those characters replaced by <c>&amp;amp;</c> and <c>&amp;lt;</c>.</returns>
        private string EncodeAmpsAndAngles(string s)
        {
            string ampsEncoded = _amps.Replace(s, "&amp;");
            return _angles.Replace(ampsEncoded, "&lt;");
        }
1633
1634
        private static Regex _backslashEscapes;

        /// <summary>
        /// Encodes any backslash-escaped characters such as \`, \*, \[ etc.
        /// </summary>
        /// <param name="s">Text to process.</param>
        /// <returns>
        /// The text with every match of <c>_backslashEscapes</c> replaced by the result
        /// of <see cref="EscapeBackslashesEvaluator"/>.
        /// </returns>
        private string EscapeBackslashes(string s)
        {
            MatchEvaluator escapeOne = EscapeBackslashesEvaluator;
            return _backslashEscapes.Replace(s, escapeOne);
        }
1643
        /// <summary>
        /// Looks up the replacement for one matched backslash escape in
        /// <c>_backslashEscapeTable</c>.
        /// </summary>
        private string EscapeBackslashesEvaluator(Match match)
        {
            string escapeSequence = match.Value;
            return _backslashEscapeTable[escapeSequence];
        }
1647
1648
        private static Regex _unescapes = new Regex("\x1A" + "E\\d+E", RegexOptions.Compiled);

        /// <summary>
        /// Swaps back in all the special characters that were hidden behind
        /// escape placeholders.
        /// </summary>
        /// <param name="s">Text that may contain placeholders.</param>
        /// <returns>
        /// The text with every match of <c>_unescapes</c> replaced by the result of
        /// <see cref="UnescapeEvaluator"/>.
        /// </returns>
        private string Unescape(string s)
        {
            MatchEvaluator restoreOne = UnescapeEvaluator;
            return _unescapes.Replace(s, restoreOne);
        }
1657
        /// <summary>
        /// Looks up the original text for one matched placeholder in
        /// <c>_invertedEscapeTable</c>.
        /// </summary>
        private string UnescapeEvaluator(Match match)
        {
            string placeholder = match.Value;
            return _invertedEscapeTable[placeholder];
        }
1661
1662
1663
        /// <summary>
        /// Escapes the bold [ * ] and italic [ _ ] marker characters by replacing them
        /// with their <c>_escapeTable</c> entries.
        /// </summary>
        /// <param name="s">Text to escape.</param>
        /// <returns>The text with every '*' and '_' replaced.</returns>
        private string EscapeBoldItalic(string s)
        {
            return s.Replace("*", _escapeTable["*"])
                    .Replace("_", _escapeTable["_"]);
        }
1672
1673
        /// <summary>
        /// Encodes '&gt;', '&lt;' and '"' as HTML entities so the text can be placed
        /// inside an attribute value. No other characters are changed.
        /// </summary>
        /// <param name="s">Text to encode.</param>
        /// <returns>The encoded text.</returns>
        private static string AttributeEncode(string s)
        {
            var encoded = new StringBuilder(s.Length);

            foreach (char ch in s)
            {
                switch (ch)
                {
                    case '>':
                        encoded.Append("&gt;");
                        break;
                    case '<':
                        encoded.Append("&lt;");
                        break;
                    case '"':
                        encoded.Append("&quot;");
                        break;
                    default:
                        encoded.Append(ch);
                        break;
                }
            }

            return encoded.ToString();
        }
1677
1678
        private static char[] _problemUrlChars = @"""'*()[]$:".ToCharArray();

        /// <summary>
        /// Hex-encodes some unusual "problem" characters in URLs to avoid URL
        /// detection problems.
        /// </summary>
        /// <remarks>
        /// A ':' is left alone when the next character is '/' or a digit; a ':' in
        /// the final position is encoded. Returns the URL untouched when
        /// <c>_encodeProblemUrlCharacters</c> is false.
        /// </remarks>
        /// <param name="url">The URL to encode.</param>
        /// <returns>The URL with problem characters replaced by %xx sequences.</returns>
        private string EncodeProblemUrlChars(string url)
        {
            if (!_encodeProblemUrlCharacters)
                return url;

            var encoded = new StringBuilder(url.Length);
            int last = url.Length - 1;

            for (int pos = 0; pos <= last; pos++)
            {
                char ch = url[pos];
                bool isProblem = Array.IndexOf(_problemUrlChars, ch) >= 0;

                if (isProblem && ch == ':' && pos < last)
                {
                    // Keep the colon when it is followed by '/' or a digit.
                    char next = url[pos + 1];
                    bool keepColon = next == '/' || (next >= '0' && next <= '9');
                    isProblem = !keepColon;
                }

                if (isProblem)
                    encoded.Append('%').Append(((byte)ch).ToString("x"));
                else
                    encoded.Append(ch);
            }

            return encoded.ToString();
        }
1706
1707
1708
        /// <summary>
        /// Within tags -- meaning between &lt; and &gt; -- encodes [\ ` * _] so they
        /// don't conflict with their use in Markdown for code, italics and strong.
        /// Each such character is replaced with its corresponding <c>_escapeTable</c>
        /// value; this is likely overkill, but it should prevent accidental
        /// collisions with the escape values.
        /// </summary>
        /// <param name="text">Text to tokenize and escape.</param>
        /// <returns>The text rebuilt from its tokens, with tag tokens escaped.</returns>
        private string EscapeSpecialCharsWithinTagAttributes(string text)
        {
            var parts = TokenizeHTML(text);

            // Rebuild the text token by token; only tag tokens are modified.
            var rebuilt = new StringBuilder(text.Length);

            foreach (var part in parts)
            {
                string value = part.Value;

                if (part.Type != TokenType.Tag)
                {
                    rebuilt.Append(value);
                    continue;
                }

                value = value.Replace(@"\", _escapeTable[@"\"]);

                // Escape slashes in comments to prevent autolinking there --
                // http://meta.stackoverflow.com/questions/95987/html-comment-containing-url-breaks-if-followed-by-another-html-comment
                if (_autoHyperlink && value.StartsWith("<!"))
                    value = value.Replace("/", _escapeTable["/"]);

                // A <code> or </code> with at least one character on each side inside
                // the tag is swapped for the backtick escape value.
                value = Regex.Replace(value, "(?<=.)</?code>(?=.)", _escapeTable[@"`"]);

                rebuilt.Append(EscapeBoldItalic(value));
            }

            return rebuilt.ToString();
        }
1742
1743
        /// <summary>
        /// Normalizes raw input text:
        /// expands each tab to spaces up to the next multiple of <c>_tabWidth</c>;
        /// standardizes line endings from DOS (CR LF) or Mac (CR) to UNIX (LF);
        /// drops any \x1A characters;
        /// turns lines that contain only spaces into empty lines;
        /// and makes sure the text ends with extra newlines.
        /// </summary>
        /// <param name="text">The raw input text.</param>
        /// <returns>The normalized text.</returns>
        private string Normalize(string text)
        {
            var result = new StringBuilder(text.Length);
            var current = new StringBuilder();   // the line being assembled
            bool hasContent = false;             // true once the line holds a non-space character

            for (int pos = 0; pos < text.Length; pos++)
            {
                char ch = text[pos];

                // A line ends at LF, or at a CR that is neither the last character
                // nor immediately followed by LF.
                bool endsLine = ch == '\n'
                    || (ch == '\r' && pos < text.Length - 1 && text[pos + 1] != '\n');

                if (endsLine)
                {
                    if (hasContent)
                        result.Append(current);
                    result.Append('\n');
                    current.Length = 0;
                    hasContent = false;
                }
                else if (ch == '\r' || ch == '\x1A')
                {
                    // Dropped: the CR of a CR LF pair, a CR in final position, or a \x1A.
                }
                else if (ch == '\t')
                {
                    // Pad with spaces up to the next tab stop.
                    current.Append(' ', _tabWidth - current.Length % _tabWidth);
                }
                else
                {
                    if (ch != ' ')
                        hasContent = true;
                    current.Append(ch);
                }
            }

            // Flush the final line, terminate it, then add two more newlines.
            if (hasContent)
                result.Append(current);

            return result.Append('\n').Append("\n\n").ToString();
        }
1792
1793
        #endregion
1794
1795
        /// <summary>
        /// Returns <paramref name="text"/> concatenated <paramref name="count"/> times;
        /// this emulates what's available in PHP.
        /// </summary>
        /// <param name="text">The string to repeat.</param>
        /// <param name="count">How many copies to concatenate.</param>
        /// <returns>The repeated string; empty when <paramref name="count"/> is zero.</returns>
        private static string RepeatString(string text, int count)
        {
            var repeated = new StringBuilder(text.Length * count);

            for (int remaining = count; remaining > 0; remaining--)
            {
                repeated.Append(text);
            }

            return repeated.ToString();
        }
1805
1806
    }
1807
}
클립보드 이미지 추가 (최대 크기: 500 MB)