/**
	Markdown parser implementation

	Copyright: © 2012-2014 RejectedSoftware e.K.
	License: Subject to the terms of the MIT license, as written in the included LICENSE.txt file.
	Authors: Sönke Ludwig
*/
module dmarkdown.markdown;

import dmarkdown.html;
import dmarkdown.string;

import std.algorithm : joiner, map, canFind, countUntil, min;
import std.array;
import std.ascii : isAlpha, isWhite;
import std.format;
import std.range;
import std.string;

/*
	TODO:
		detect inline HTML tags
*/

// TODO (dmarkdown) detailed API docs and examples for everything

// Smoke test: prints a small document and its rendered HTML.
unittest
{
	auto text =
		"=======\n" ~
		"Heading\n" ~
		"=======\n" ~
		"\n" ~
		"**bold** *italic*\n" ~
		"\n" ~
		"List:\n" ~
		"\n" ~
		" * a\n" ~
		" * b\n" ~
		" * c\n";

	import std.stdio;
	writeln("===========");
	writeln(text);
	writeln("===========");
	writeln(filterMarkdown(text));
}

// Two consecutive plain lines must end up in a single paragraph.
unittest
{
	auto source =
		"Merged prototype. The prototype is not locked, allowing to add more components.\n" ~
		"To be used it must be locked by calling EntityPrototype.lockAndTrimMemory().";
	auto expected =
		"<p>Merged prototype. The prototype is not locked, allowing to add more components.\n" ~
		"To be used it must be locked by calling EntityPrototype.lockAndTrimMemory().\n" ~
		"</p>\n";
	string result = filterMarkdown(source);
	assert(result == expected);
}

// Underscore emphasis can be switched off with MarkdownFlags.disableUnderscoreEmphasis.
unittest
{
	auto source = `*stars* under_score_s`;
	auto expectedUnderscores = "<p><em>stars</em> under<em>score</em>s\n</p>\n";
	auto expectedNoUnderscores = "<p><em>stars</em> under_score_s\n</p>\n";

	string resultUnderscores = filterMarkdown(source);
	string resultNoUnderscores = filterMarkdown(source, MarkdownFlags.disableUnderscoreEmphasis);

	assert(resultUnderscores == expectedUnderscores,
		"'%s' != '%s'".format(resultUnderscores, expectedUnderscores));
	assert(resultNoUnderscores == expectedNoUnderscores,
		"'%s' != '%s'".format(resultNoUnderscores, expectedNoUnderscores));
}

/** Returns a Markdown filtered HTML string.

	Params:
		str = Markdown source text
		flags = parser flags; a fresh MarkdownSettings instance carrying these
			flags is used for the conversion
		settings = parser settings; default settings are used when null
*/
string filterMarkdown()(string str, MarkdownFlags flags)
{
	scope settings = new MarkdownSettings;
	settings.flags = flags;
	return filterMarkdown(str, settings);
}
/// ditto
string filterMarkdown()(string str, scope MarkdownSettings settings = null)
@trusted { // Appender not @safe as of 2.065
	auto dst = appender!string();
	filterMarkdown(dst, str, settings);
	return dst.data;
}


/** Markdown filters the given string and writes the corresponding HTML to an output range.

	Params:
		dst = output range receiving the generated HTML
		src = Markdown source text
		flags = parser flags; a fresh MarkdownSettings instance carrying these
			flags is used for the conversion
		settings = parser settings; default settings are used when null
*/
void filterMarkdown(R)(ref R dst, string src, MarkdownFlags flags)
{
	scope settings = new MarkdownSettings;
	settings.flags = flags;
	filterMarkdown(dst, src, settings);
}
/// ditto
void filterMarkdown(R)(ref R dst, string src, scope MarkdownSettings settings = null)
{
	if (!settings) settings = new MarkdownSettings;

	auto all_lines = splitLines(src);
	// scanForReferences also removes the reference definition lines from all_lines
	auto links = scanForReferences(all_lines);
	auto lines = parseLines(all_lines, settings);
	Block root_block;
	parseBlocks(root_block, lines, null, settings);
	writeBlock(dst, root_block, links, settings);
}

/**
	Returns the hierarchy of sections

	Only top-level header blocks are considered. A header becomes a sub section
	of the most recent header with a strictly smaller heading level.
*/
Section[] getMarkdownOutline(string markdown_source, scope MarkdownSettings settings = null)
{
	import std.conv : to;

	if (!settings) settings = new MarkdownSettings;
	auto all_lines = splitLines(markdown_source);
	auto lines = parseLines(all_lines, settings);
	Block root_block;
	parseBlocks(root_block, lines, null, settings);
	Section root;

	foreach (ref sb; root_block.blocks) {
		if (sb.type == BlockType.Header) {
			// descend along the last sub sections until a section of the
			// same or a deeper level is found
			auto s = &root;
			while (true) {
				if (s.subSections.length == 0) break;
				if (s.subSections[$-1].headingLevel >= sb.headerLevel) break;
				s = &s.subSections[$-1];
			}
			s.subSections ~= Section(sb.headerLevel, sb.text[0], sb.text[0].asSlug.to!string);
		}
	}

	return root.subSections;
}

///
unittest {
	import std.conv : to;
	assert(getMarkdownOutline("## first\n## second\n### third\n# fourth\n### fifth") ==
		[
			Section(2, " first", "first"),
			Section(2, " second", "second", [
				Section(3, " third", "third")
			]),
			Section(1, " fourth", "fourth", [
				Section(3, " fifth", "fifth")
			])
		]
	);
}

/// Settings controlling parsing and HTML generation.
final class MarkdownSettings {
	/// Controls the capabilities of the parser.
	MarkdownFlags flags = MarkdownFlags.vanillaMarkdown;

	/// Heading tags will start at this level.
	size_t headingBaseLevel = 1;

	/// Called for every link/image URL to perform arbitrary transformations.
	string delegate(string url_or_path, bool is_image) urlFilter;

	/** An optional delegate to post-process code blocks and inline code.
	 *
	 * Useful to e.g. add code highlighting.
	 */
	string delegate(string) @safe nothrow processCode = null;
}
// Unittest for code post-processing
unittest
{
	auto text =
		"`inline code`\n" ~
		"block:\n" ~
		"\n" ~
		// four leading spaces are required for an indented code block
		"    code block\n";
	auto expected =
		"<p><code class=\"prettyprint\">AAAAAAAAAAA</code>\n" ~
		"block:\n" ~
		"</p>\n" ~
		"<pre class=\"prettyprint\"><code>" ~
		"AAAAAAAAAA" ~
		"</code></pre>";

	import std.algorithm: filter;
	string processCode(string input) @safe nothrow
	{
		import std.conv;
		import std.exception: assumeWontThrow;
		// ignore newlines generated by code block processing
		input = input.filter!(c => c != '\n').array.to!string.assumeWontThrow;
		return 'A'.repeat(input.length).array.to!string.assumeWontThrow;
	}
	auto settings = new MarkdownSettings;
	settings.processCode = &processCode;
	auto result = filterMarkdown(text, settings);

	assert(result == expected, "Unexpected code processing result:\n" ~
		result ~ "\nExpected:\n" ~ expected);
}

/// Bit flags selecting parser features.
enum MarkdownFlags {
	none = 0,
	/// Line breaks inside of a text block are emitted as `<br/>`
	keepLineBreaks = 1<<0,
	/// Lines starting with three backticks delimit code blocks
	backtickCodeBlocks = 1<<1,
	/// HTML blocks are not recognized and '<' / '>' are escaped
	noInlineHtml = 1<<2,
	//noLinks = 1<<3,
	//allowUnsafeHtml = 1<<4,
	/// If used, subheadings are underlined by stars ('*') instead of dashes ('-')
	alternateSubheaders = 1 << 5,
	/// If used, '_' may not be used for emphasis ('*' may still be used)
	disableUnderscoreEmphasis = 1 << 6,
	vanillaMarkdown = none,
	forumDefault = keepLineBreaks|backtickCodeBlocks|noInlineHtml
}

/// A node of the outline returned by getMarkdownOutline.
struct Section {
	size_t headingLevel;
	string caption;
	string anchor;
	Section[] subSections;
}

private {
	// tag names that are recognized as HTML block tags
	immutable s_blockTags = ["div", "ol", "p", "pre", "section", "table", "ul"];
}

private enum IndentType {
	White,
	Quote
}

private enum LineType {
	Undefined,
	Blank,
	Plain,
	Hline,
	AtxHeader,
	SetextHeader,
	UList,
	OList,
	HtmlBlock,
	CodeBlockDelimiter
}

// A classified source line together with its indentation levels.
private struct Line {
	LineType type;
	IndentType[] indent;
	string text;
	string unindented;

	// Returns the line text with the first n indentation levels removed.
	string unindent(size_t n)
	pure @safe {
		assert(n <= indent.length);
		string ln = text;
		foreach( i; 0 .. n ){
			final switch(indent[i]){
				case IndentType.White:
					// a white space level is either four spaces or a single tab
					if( ln[0] == ' ' ) ln = ln[4 .. $];
					else ln = ln[1 .. $];
					break;
				case IndentType.Quote:
					ln = ln.stripLeft()[1 .. $];
					break;
			}
		}
		return ln;
	}
}

// Determines the indentation and the type of each line. Consumes `lines`.
private Line[] parseLines(ref string[] lines, scope MarkdownSettings settings)
pure @safe {
	Line[] ret;
	// The flag has to be tested with '&'. Multiplying made every non-zero
	// flag combination select '*' as the sub header character.
	const subHeaderChar = (settings.flags & MarkdownFlags.alternateSubheaders) ? '*' : '-';
	while( !lines.empty ){
		auto ln = lines.front;
		lines.popFront();

		Line lninfo;
		lninfo.text = ln;

		while( ln.length > 0 ){
			if( ln[0] == '\t' ){
				lninfo.indent ~= IndentType.White;
				ln.popFront();
			} else if( ln.startsWith("    ") ){
				// four spaces form one indentation level, matching popFrontN(4)
				lninfo.indent ~= IndentType.White;
				ln.popFrontN(4);
			} else {
				ln = ln.stripLeft();
				if( ln.startsWith(">") ){
					lninfo.indent ~= IndentType.Quote;
					ln.popFront();
				} else break;
			}
		}
		lninfo.unindented = ln;

		if( (settings.flags & MarkdownFlags.backtickCodeBlocks) && isCodeBlockDelimiter(ln) ) lninfo.type = LineType.CodeBlockDelimiter;
		else if( isAtxHeaderLine(ln) ) lninfo.type = LineType.AtxHeader;
		else if( isSetextHeaderLine(ln, subHeaderChar) ) lninfo.type = LineType.SetextHeader;
		else if( isHlineLine(ln) ) lninfo.type = LineType.Hline;
		else if( isOListLine(ln) ) lninfo.type = LineType.OList;
		else if( isUListLine(ln) ) lninfo.type = LineType.UList;
		else if( isLineBlank(ln) ) lninfo.type = LineType.Blank;
		else if( !(settings.flags & MarkdownFlags.noInlineHtml) && isHtmlBlockLine(ln) ) lninfo.type = LineType.HtmlBlock;
		else lninfo.type = LineType.Plain;

		ret ~= lninfo;
	}
	return ret;
}

private enum BlockType {
	Plain,
	Text,
	Paragraph,
	Header,
	OList,
	UList,
	ListItem,
	Code,
	Quote
}

// A node of the parsed document tree.
private struct Block {
	BlockType type;
	string[] text;
	Block[] blocks;
	size_t headerLevel;

	// A human-readable toString for debugging.
	string toString()
	{
		return toStringNested;
	}

	// toString implementation; capable of indenting nested blocks.
	string toStringNested(uint depth = 0)
	{
		import std.conv: to;
		string indent = " ".repeat(depth * 2).joiner.array.to!string;
		return indent ~ "%s\n".format(type) ~
			indent ~ "%s\n".format(text) ~
			blocks.map!((ref b) => b.toStringNested(depth + 1)).joiner.array.to!string ~
			indent ~ "%s\n".format(headerLevel);
	}
}

// Builds the block tree for all lines that share the indentation prefix
// `base_indent`. Consumed lines are removed from `lines`; the function returns
// at the first line that does not start with `base_indent`.
private void parseBlocks(ref Block root, ref Line[] lines, IndentType[] base_indent, scope MarkdownSettings settings)
pure @safe {
	if( base_indent.length == 0 ) root.type = BlockType.Text;
	else if( base_indent[$-1] == IndentType.Quote ) root.type = BlockType.Quote;

	while( !lines.empty ){
		auto ln = lines.front;

		if( ln.type == LineType.Blank ){
			lines.popFront();
			continue;
		}

		if( ln.indent != base_indent ){
			if( ln.indent.length < base_indent.length || ln.indent[0 .. base_indent.length] != base_indent )
				return;

			auto cindent = base_indent ~ IndentType.White;
			if( ln.indent == cindent ){
				// exactly one additional white space level: indented code block
				Block cblock;
				cblock.type = BlockType.Code;
				while( !lines.empty && lines.front.indent.length >= cindent.length
						&& lines.front.indent[0 .. cindent.length] == cindent)
				{
					cblock.text ~= lines.front.unindent(cindent.length);
					lines.popFront();
				}
				root.blocks ~= cblock;
			} else {
				Block subblock;
				parseBlocks(subblock, lines, ln.indent[0 .. base_indent.length+1], settings);
				root.blocks ~= subblock;
			}
		} else {
			Block b;
			final switch(ln.type){
				case LineType.Undefined: assert(false);
				case LineType.Blank: assert(false);
				case LineType.Plain:
					if( lines.length >= 2 && lines[1].type == LineType.SetextHeader ){
						auto setln = lines[1].unindented;
						b.type = BlockType.Header;
						b.text = [ln.unindented];
						b.headerLevel = setln.strip()[0] == '=' ? 1 : 2;
						lines.popFrontN(2);
					} else {
						b.type = BlockType.Paragraph;
						b.text = skipText(lines, base_indent);
					}
					break;
				case LineType.Hline:
					b.type = BlockType.Plain;
					b.text = ["<hr>"];
					lines.popFront();
					break;
				case LineType.AtxHeader:
					b.type = BlockType.Header;
					string hl = ln.unindented;
					b.headerLevel = 0;
					while( hl.length > 0 && hl[0] == '#' ){
						b.headerLevel++;
						hl = hl[1 .. $];
					}
					// remove closing '#' characters and trailing spaces
					while( hl.length > 0 && (hl[$-1] == '#' || hl[$-1] == ' ') )
						hl = hl[0 .. $-1];
					b.text = [hl];
					lines.popFront();
					break;
				case LineType.SetextHeader:
					// an underline without a preceding text line is dropped
					lines.popFront();
					break;
				case LineType.UList:
				case LineType.OList:
					b.type = ln.type == LineType.UList ? BlockType.UList : BlockType.OList;
					auto itemindent = base_indent ~ IndentType.White;
					bool firstItem = true, paraMode = false;
					while(!lines.empty && lines.front.type == ln.type && lines.front.indent == base_indent ){
						Block itm;
						itm.text = skipText(lines, itemindent);
						itm.text[0] = removeListPrefix(itm.text[0], ln.type);

						// emit <p></p> if there are blank lines between the items
						if( firstItem && !lines.empty && lines.front.type == LineType.Blank )
							paraMode = true;
						firstItem = false;
						if( paraMode ){
							Block para;
							para.type = BlockType.Paragraph;
							para.text = itm.text;
							itm.blocks ~= para;
							itm.text = null;
						}

						parseBlocks(itm, lines, itemindent, settings);
						itm.type = BlockType.ListItem;
						b.blocks ~= itm;
					}
					break;
				case LineType.HtmlBlock:
					int nestlevel = 0;
					auto starttag = parseHtmlBlockLine(ln.unindented);
					if( !starttag.isHtmlBlock || !starttag.open )
						break;

					b.type = BlockType.Plain;
					while(!lines.empty){
						if( lines.front.indent.length < base_indent.length ) break;
						if( lines.front.indent[0 .. base_indent.length] != base_indent ) break;

						auto str = lines.front.unindent(base_indent.length);
						auto taginfo = parseHtmlBlockLine(str);
						b.text ~= str;
						lines.popFront();
						// track nesting of the start tag to find its closing tag
						if( taginfo.isHtmlBlock && taginfo.tagName == starttag.tagName )
							nestlevel += taginfo.open ? 1 : -1;
						if( nestlevel <= 0 ) break;
					}
					break;
				case LineType.CodeBlockDelimiter:
					lines.popFront(); // TODO: get language from line
					b.type = BlockType.Code;
					while(!lines.empty){
						if( lines.front.indent.length < base_indent.length ) break;
						if( lines.front.indent[0 .. base_indent.length] != base_indent ) break;
						if( lines.front.type == LineType.CodeBlockDelimiter ){
							lines.popFront();
							break;
						}
						b.text ~= lines.front.unindent(base_indent.length);
						lines.popFront();
					}
					break;
			}
			root.blocks ~= b;
		}
	}
}

// Consumes the first line and all directly following plain lines with a
// matching indentation. Returns their text with up to `indent.length`
// indentation levels removed. `lines` must not be empty.
private string[] skipText(ref Line[] lines, IndentType[] indent)
pure @safe {
	static bool matchesIndent(IndentType[] indent, IndentType[] base_indent)
	{
		// Any *plain* line with a higher indent should still be a part of
		// a paragraph read by skipText(). Returning false here resulted in
		// text such as:
		// ---
		// First line
		//         Second line
		// ---
		// being interpreted as a paragraph followed by a code block, even though
		// other Markdown processors would interpret it as a single paragraph.

		// if( indent.length > base_indent.length ) return false;
		if( indent.length > base_indent.length ) return true;
		if( indent != base_indent[0 .. indent.length] ) return false;
		sizediff_t qidx = -1;
		foreach_reverse (i, tp; base_indent) if (tp == IndentType.Quote) { qidx = i; break; }
		if( qidx >= 0 ){
			qidx = base_indent.length-1 - qidx;
			if( indent.length <= qidx ) return false;
		}
		return true;
	}

	string[] ret;

	while(true){
		ret ~= lines.front.unindent(min(indent.length, lines.front.indent.length));
		lines.popFront();

		if( lines.empty || !matchesIndent(lines.front.indent, indent) || lines.front.type != LineType.Plain )
			return ret;
	}
}

/// private
// Writes the HTML for `block` and, recursively, for its child blocks.
private void writeBlock(R)(ref R dst, ref const Block block, LinkRef[string] links, scope MarkdownSettings settings)
{
	final switch(block.type){
		case BlockType.Plain:
			// plain blocks (e.g. HTML blocks and <hr>) are written verbatim
			foreach( ln; block.text ){
				dst.put(ln);
				dst.put("\n");
			}
			foreach(b; block.blocks)
				writeBlock(dst, b, links, settings);
			break;
		case BlockType.Text:
			writeMarkdownEscaped(dst, block, links, settings);
			foreach(b; block.blocks)
				writeBlock(dst, b, links, settings);
			break;
		case BlockType.Paragraph:
			assert(block.blocks.length == 0);
			dst.put("<p>");
			writeMarkdownEscaped(dst, block, links, settings);
			dst.put("</p>\n");
			break;
		case BlockType.Header:
			assert(block.blocks.length == 0);
			assert(block.text.length == 1);
			auto hlvl = block.headerLevel + (settings ? settings.headingBaseLevel-1 : 0);
			dst.formattedWrite("<h%s id=\"%s\">", hlvl, block.text[0].asSlug);
			writeMarkdownEscaped(dst, block.text[0], links, settings);
			dst.formattedWrite("</h%s>\n", hlvl);
			break;
		case BlockType.OList:
			dst.put("<ol>\n");
			foreach(b; block.blocks)
				writeBlock(dst, b, links, settings);
			dst.put("</ol>\n");
			break;
		case BlockType.UList:
			dst.put("<ul>\n");
			foreach(b; block.blocks)
				writeBlock(dst, b, links, settings);
			dst.put("</ul>\n");
			break;
		case BlockType.ListItem:
			dst.put("<li>");
			writeMarkdownEscaped(dst, block, links, settings);
			foreach(b; block.blocks)
				writeBlock(dst, b, links, settings);
			dst.put("</li>\n");
			break;
		case BlockType.Code:
			assert(block.blocks.length == 0);
			dst.put("<pre class=\"prettyprint\"><code>");
			if(settings.processCode is null)
			{
				foreach(ln; block.text){
					filterHTMLEscape(dst, ln);
					dst.put("\n");
				}
			}
			else
			{
				// collect the escaped code first so that the user delegate
				// gets to see the whole block at once
				auto temp = appender!string();
				foreach(ln; block.text){
					filterHTMLEscape(temp, ln);
					temp.put("\n");
				}
				dst.put(settings.processCode(temp.data));
			}
			dst.put("</code></pre>");
			break;
		case BlockType.Quote:
			dst.put("<blockquote>");
			writeMarkdownEscaped(dst, block, links, settings);
			foreach(b; block.blocks)
				writeBlock(dst, b, links, settings);
			dst.put("</blockquote>\n");
			break;
	}
}

// Writes the text lines of `block` with inline Markdown converted to HTML.
// The lines are joined with "<br>" if keepLineBreaks is set, with "\n" otherwise.
private void writeMarkdownEscaped(R)(ref R dst, ref const Block block, in LinkRef[string] links, scope MarkdownSettings settings)
{
	auto lines = cast(string[])block.text;
	auto text = settings.flags & MarkdownFlags.keepLineBreaks ? lines.join("<br>") : lines.join("\n");
	writeMarkdownEscaped(dst, text, links, settings);
	if (lines.length) dst.put("\n");
}

/// private
// Converts the inline Markdown in `ln` (escapes, emphasis, inline code, links,
// images and auto links) to HTML and writes the result to `dst`.
private void writeMarkdownEscaped(R)(ref R dst, string ln, in LinkRef[string] linkrefs, scope MarkdownSettings settings)
{
	string filterLink(string lnk, bool is_image) {
		return settings.urlFilter ? settings.urlFilter(lnk, is_image) : lnk;
	}

	// two trailing spaces request a hard line break
	bool br = ln.endsWith("  ");
	while( ln.length > 0 ){
		switch( ln[0] ){
			default:
				dst.put(ln[0]);
				ln = ln[1 .. $];
				break;
			case '\\':
				if( ln.length >= 2 ){
					switch(ln[1]){
						default:
							dst.put(ln[0 .. 2]);
							ln = ln[2 .. $];
							break;
						// '\\' is part of the escapable characters of the
						// Markdown syntax; '\'' is kept for compatibility
						case '\\', '\'', '`', '*', '_', '{', '}', '[', ']',
							'(', ')', '#', '+', '-', '.', '!':
							dst.put(ln[1]);
							ln = ln[2 .. $];
							break;
					}
				} else {
					dst.put(ln[0]);
					ln = ln[1 .. $];
				}
				break;
			case '_':
				if(settings.flags & MarkdownFlags.disableUnderscoreEmphasis)
				{
					dst.put(ln[0]);
					ln = ln[1 .. $];
					break;
				}
				goto case;
			case '*':
				string text;
				if( auto em = parseEmphasis(ln, text) ){
					dst.put(em == 1 ? "<em>" : em == 2 ? "<strong>" : "<strong><em>");
					filterHTMLEscape(dst, text, HTMLEscapeFlags.escapeMinimal);
					dst.put(em == 1 ? "</em>" : em == 2 ? "</strong>": "</em></strong>");
				} else {
					dst.put(ln[0]);
					ln = ln[1 .. $];
				}
				break;
			case '`':
				string code;
				if( parseInlineCode(ln, code) ){
					dst.put("<code class=\"prettyprint\">");
					if(settings.processCode is null)
					{
						filterHTMLEscape(dst, code, HTMLEscapeFlags.escapeMinimal);
					}
					else
					{
						auto temp = appender!string();
						filterHTMLEscape(temp, code, HTMLEscapeFlags.escapeMinimal);
						dst.put(settings.processCode(temp.data));
					}
					dst.put("</code>");
				} else {
					dst.put(ln[0]);
					ln = ln[1 .. $];
				}
				break;
			case '[':
				Link link;
				if( parseLink(ln, link, linkrefs) ){
					dst.put("<a href=\"");
					filterHTMLAttribEscape(dst, filterLink(link.url, false));
					dst.put("\"");
					if( link.title.length ){
						dst.put(" title=\"");
						filterHTMLAttribEscape(dst, link.title);
						dst.put("\"");
					}
					dst.put(">");
					writeMarkdownEscaped(dst, link.text, linkrefs, settings);
					dst.put("</a>");
				} else {
					dst.put(ln[0]);
					ln = ln[1 .. $];
				}
				break;
			case '!':
				Link link;
				if( parseLink(ln, link, linkrefs) ){
					dst.put("<img src=\"");
					filterHTMLAttribEscape(dst, filterLink(link.url, true));
					dst.put("\" alt=\"");
					filterHTMLAttribEscape(dst, link.text);
					dst.put("\"");
					if( link.title.length ){
						dst.put(" title=\"");
						filterHTMLAttribEscape(dst, link.title);
						dst.put("\"");
					}
					dst.put(">");
				} else {
					// Not an image: emit just the '!'. The following character
					// must go through the regular processing, otherwise e.g.
					// "!<" would bypass the noInlineHtml escaping.
					dst.put(ln[0]);
					ln = ln[1 .. $];
				}
				break;
			case '>':
				if( settings.flags & MarkdownFlags.noInlineHtml ) dst.put("&gt;");
				else dst.put(ln[0]);
				ln = ln[1 .. $];
				break;
			case '<':
				string url;
				if( parseAutoLink(ln, url) ){
					bool is_email = url.startsWith("mailto:");
					dst.put("<a href=\"");
					if( is_email ) filterHTMLAllEscape(dst, url);
					else filterHTMLAttribEscape(dst, filterLink(url, false));
					dst.put("\">");
					if( is_email ) filterHTMLAllEscape(dst, url[7 .. $]);
					else filterHTMLEscape(dst, url, HTMLEscapeFlags.escapeMinimal);
					dst.put("</a>");
				} else {
					if (ln.startsWith("<br>")) {
						// always support line breaks, since we embed them here ourselves!
						dst.put("<br/>");
						ln = ln[4 .. $];
					} else if(ln.startsWith("<br/>")) {
						dst.put("<br/>");
						ln = ln[5 .. $];
					} else {
						if( settings.flags & MarkdownFlags.noInlineHtml ) dst.put("&lt;");
						else dst.put(ln[0]);
						ln = ln[1 .. $];
					}
				}
				break;
		}
	}
	if( br ) dst.put("<br/>");
}

// True if the line consists only of spaces and tabs (or is empty).
private bool isLineBlank(string ln)
pure @safe {
	return allOf(ln, " \t");
}

// True for a setext underline: a run of '=' or of `subHeaderChar`, optionally
// followed by white space.
private bool isSetextHeaderLine(string ln, char subHeaderChar)
pure @safe {
	ln = stripLeft(ln);
	if( ln.length < 1 ) return false;
	if( ln[0] == '=' ){
		while(!ln.empty && ln.front == '=') ln.popFront();
		return allOf(ln, " \t");
	}
	if( ln[0] == subHeaderChar ){
		while(!ln.empty && ln.front == subHeaderChar) ln.popFront();
		return allOf(ln, " \t");
	}
	return false;
}

// True if the line starts with one to six '#' characters followed by a space.
private bool isAtxHeaderLine(string ln)
pure @safe {
	ln = stripLeft(ln);
	size_t i = 0;
	while( i < ln.length && ln[i] == '#' ) i++;
	if( i < 1 || i > 6 || i >= ln.length ) return false;
	return ln[i] == ' ';
}

// True if the line contains at least three '-', '*' or '_' characters and
// apart from that only spaces.
private bool isHlineLine(string ln)
pure @safe {
	if( allOf(ln, " -") && count(ln, '-') >= 3 ) return true;
	if( allOf(ln, " *") && count(ln, '*') >= 3 ) return true;
	if( allOf(ln, " _") && count(ln, '_') >= 3 ) return true;
	return false;
}

// True if the first non-white character of the line is '>'.
private bool isQuoteLine(string ln)
pure @safe {
	return ln.stripLeft().startsWith(">");
}

// Counts the leading '>' characters, ignoring white space around them.
private size_t getQuoteLevel(string ln)
pure @safe {
	size_t level = 0;
	ln = stripLeft(ln);
	while( ln.length > 0 && ln[0] == '>' ){
		level++;
		ln = stripLeft(ln[1 .. $]);
	}
	return level;
}

// True if the line starts with '*', '+' or '-' followed by a space or tab.
private bool isUListLine(string ln)
pure @safe {
	ln = stripLeft(ln);
	if (ln.length < 2) return false;
	if (!canFind("*+-", ln[0])) return false;
	if (ln[1] != ' ' && ln[1] != '\t') return false;
	return true;
}

// True if the line starts with digits followed by '.' and a space or tab.
private bool isOListLine(string ln)
pure @safe {
	ln = stripLeft(ln);
	if( ln.length < 1 ) return false;
	if( ln[0] < '0' || ln[0] > '9' ) return false;
	ln = ln[1 .. $];
	while( ln.length > 0 && ln[0] >= '0' && ln[0] <= '9' )
		ln = ln[1 .. $];
	if( ln.length < 2 ) return false;
	if( ln[0] != '.' ) return false;
	if( ln[1] != ' ' && ln[1] != '\t' )
		return false;
	return true;
}

// Removes the list marker ("1." or a bullet character) and the white space
// following it. `tp` must be LineType.OList or LineType.UList.
private string removeListPrefix(string str, LineType tp)
pure @safe {
	switch(tp){
		default: assert(false);
		case LineType.OList: // skip bullets and output using normal escaping
			auto idx = str.indexOfCT('.');
			assert(idx > 0);
			return str[idx+1 .. $].stripLeft();
		case LineType.UList:
			return stripLeft(str.stripLeft()[1 .. $]);
	}
}


// Checks whether the line consists of a single opening or closing tag of one
// of the tags in s_blockTags. `tagName` may be set even if `isHtmlBlock` is false.
private auto parseHtmlBlockLine(string ln)
pure @safe {
	struct HtmlBlockInfo {
		bool isHtmlBlock;
		string tagName;
		bool open;
	}

	HtmlBlockInfo ret;
	ret.isHtmlBlock = false;
	ret.open = true;

	ln = strip(ln);
	if( ln.length < 3 ) return ret;
	if( ln[0] != '<' ) return ret;
	if( ln[1] == '/' ){
		ret.open = false;
		ln = ln[1 .. $];
	}
	if( !isAlpha(ln[1]) ) return ret;
	ln = ln[1 .. $];
	size_t idx = 0;
	while( idx < ln.length && ln[idx] != ' ' && ln[idx] != '>' )
		idx++;
	ret.tagName = ln[0 .. idx];
	ln = ln[idx .. $];

	// the tag has to be the only thing on the line
	auto eidx = ln.indexOf('>');
	if( eidx < 0 ) return ret;
	if( eidx != ln.length-1 ) return ret;

	if (!s_blockTags.canFind(ret.tagName)) return ret;

	ret.isHtmlBlock = true;
	return ret;
}

// True if the line is an opening HTML block tag.
private bool isHtmlBlockLine(string ln)
pure @safe {
	auto bi = parseHtmlBlockLine(ln);
	return bi.isHtmlBlock && bi.open;
}

// True if the line is a closing HTML block tag.
private bool isHtmlBlockCloseLine(string ln)
pure @safe {
	auto bi = parseHtmlBlockLine(ln);
	return bi.isHtmlBlock && !bi.open;
}

// True if the line starts with three backticks.
private bool isCodeBlockDelimiter(string ln)
pure @safe {
	return ln.startsWith("```");
}

// Returns the tag name determined by parseHtmlBlockLine.
private string getHtmlTagName(string ln)
pure @safe {
	return parseHtmlBlockLine(ln).tagName;
}

// True if the line starts with a tab or with four spaces.
private bool isLineIndented(string ln)
pure @safe {
	return ln.startsWith("\t") || ln.startsWith("    ");
}

// Removes one indentation level (a tab or four spaces). The line must be indented.
private string unindentLine(string ln)
pure @safe {
	if( ln.startsWith("\t") ) return ln[1 .. $];
	if( ln.startsWith("    ") ) return ln[4 .. $];
	assert(false);
}

// Parses an emphasized span at the start of `str`. On success the content is
// stored in `text`, `str` is advanced behind the closing marker and the length
// of the marker (1 to 3) is returned. Returns 0 and leaves `str` untouched
// otherwise.
private int parseEmphasis(ref string str, ref string text)
pure @safe {
	string pstr = str;
	if( pstr.length < 3 ) return 0;

	string ctag;
	if( pstr.startsWith("***") ) ctag = "***";
	else if( pstr.startsWith("**") ) ctag = "**";
	else if( pstr.startsWith("*") ) ctag = "*";
	else if( pstr.startsWith("___") ) ctag = "___";
	else if( pstr.startsWith("__") ) ctag = "__";
	else if( pstr.startsWith("_") ) ctag = "_";
	else return 0;

	pstr = pstr[ctag.length .. $];

	auto cidx = () @trusted { return pstr.indexOf(ctag); }();
	if( cidx < 1 ) return 0;

	text = pstr[0 .. cidx];

	str = pstr[cidx+ctag.length .. $];
	return cast(int)ctag.length;
}

// Parses an inline code span delimited by one or two backticks at the start of
// `str`. On success the content is stored in `code` and `str` is advanced
// behind the closing delimiter.
private bool parseInlineCode(ref string str, ref string code)
pure @safe {
	string pstr = str;
	if( pstr.length < 3 ) return false;
	string ctag;
	if( pstr.startsWith("``") ) ctag = "``";
	else if( pstr.startsWith("`") ) ctag = "`";
	else return false;
	pstr = pstr[ctag.length .. $];

	auto cidx = () @trusted { return pstr.indexOf(ctag); }();
	if( cidx < 1 ) return false;

	code = pstr[0 .. cidx];
	str = pstr[cidx+ctag.length .. $];
	return true;
}

// Parses an inline link `[text](url "title")` or a reference link
// `[text][refid]` at the start of `str`; a leading '!' (image) is skipped.
// On success `dst` is filled and `str` is advanced behind the link. Fails if
// a referenced id is not contained in `linkrefs`.
private bool parseLink(ref string str, ref Link dst, in LinkRef[string] linkrefs)
pure @safe {
	string pstr = str;
	if( pstr.length < 3 ) return false;
	// ignore img-link prefix
	if( pstr[0] == '!' ) pstr = pstr[1 .. $];

	// parse the text part [text]
	if( pstr[0] != '[' ) return false;
	auto cidx = pstr.matchBracket();
	if( cidx < 1 ) return false;
	string refid;
	dst.text = pstr[1 .. cidx];
	pstr = pstr[cidx+1 .. $];

	// parse either (link '['"title"']') or '[' ']'[refid]
	if( pstr.length < 2 ) return false;
	if( pstr[0] == '('){
		cidx = pstr.matchBracket();
		if( cidx < 1 ) return false;
		auto inner = pstr[1 .. cidx];
		immutable qidx = inner.indexOfCT('"');
		// a quote only starts the title if it is preceded by white space
		if( qidx > 1 && inner[qidx - 1].isWhite()){
			dst.url = inner[0 .. qidx].stripRight();
			immutable len = inner[qidx .. $].lastIndexOf('"');
			if( len == 0 ) return false;
			assert(len > 0);
			dst.title = inner[qidx + 1 .. qidx + len];
		} else {
			dst.url = inner.stripRight();
			dst.title = null;
		}
		if (dst.url.startsWith("<") && dst.url.endsWith(">"))
			dst.url = dst.url[1 .. $-1];
		pstr = pstr[cidx+1 .. $];
	} else {
		if( pstr[0] == ' ' ) pstr = pstr[1 .. $];
		if( pstr[0] != '[' ) return false;
		pstr = pstr[1 .. $];
		cidx = pstr.indexOfCT(']');
		if( cidx < 0 ) return false;
		// an empty reference id means that the link text is the id
		if( cidx == 0 ) refid = dst.text;
		else refid = pstr[0 .. cidx];
		pstr = pstr[cidx+1 .. $];
	}


	if( refid.length > 0 ){
		auto pr = toLower(refid) in linkrefs;
		if( !pr ){
			// debug if (!__ctfe) logDebug("[LINK REF NOT FOUND: '%s'", refid);
			return false;
		}
		dst.url = pr.url;
		dst.title = pr.title;
	}

	str = pstr;
	return true;
}

@safe unittest
{
	static void testLink(string s, Link exp, in LinkRef[string] refs)
	{
		Link link;
		assert(parseLink(s, link, refs), s);
		assert(link == exp);
	}
	LinkRef[string] refs;
	refs["ref"] = LinkRef("ref", "target", "title");

	// NOTE(review): several of the following cases read identically here;
	// presumably they once differed in white space only - confirm against
	// the upstream source.
	testLink(`[link](target)`, Link("link", "target"), null);
	testLink(`[link](target "title")`, Link("link", "target", "title"), null);
	testLink(`[link](target "title")`, Link("link", "target", "title"), null);
	testLink(`[link](target "title" )`, Link("link", "target", "title"), null);

	testLink(`[link](target)`, Link("link", "target"), null);
	testLink(`[link](target "title")`, Link("link", "target", "title"), null);

	testLink(`[link][ref]`, Link("link", "target", "title"), refs);
	testLink(`[ref][]`, Link("ref", "target", "title"), refs);

	testLink(`[link[with brackets]](target)`, Link("link[with brackets]", "target"), null);
	testLink(`[link[with brackets]][ref]`, Link("link[with brackets]", "target", "title"), refs);

	testLink(`[link](/target with spaces )`, Link("link", "/target with spaces"), null);
	testLink(`[link](/target with spaces "title")`, Link("link", "/target with spaces", "title"), null);

	testLink(`[link](white-space "around title" )`, Link("link", "white-space", "around title"), null);
	testLink(`[link](tabs "around title" )`, Link("link", "tabs", "around title"), null);

	testLink(`[link](target "")`, Link("link", "target", ""), null);
	testLink(`[link](target-no-title"foo" )`, Link("link", "target-no-title\"foo\"", ""), null);

	testLink(`[link](<target>)`, Link("link", "target"), null);

	auto failing = [
		`text`, `[link](target`, `[link]target)`, `[link]`,
		`[link(target)`, `link](target)`, `[link] (target)`,
		`[link][noref]`, `[noref][]`
	];
	Link link;
	foreach (s; failing)
		assert(!parseLink(s, link, refs), s);
}

// Parses an automatic link `<url>` or `<address@host>` at the start of `str`.
// The content must not contain white space and has to contain ':' or '@'.
// On success `url` is set (bare e-mail addresses get a "mailto:" prefix) and
// `str` is advanced behind the closing '>'.
private bool parseAutoLink(ref string str, ref string url)
pure @safe {
	string pstr = str;
	if( pstr.length < 3 ) return false;
	if( pstr[0] != '<' ) return false;
	pstr = pstr[1 .. $];
	auto cidx = pstr.indexOf('>');
	if( cidx < 0 ) return false;
	url = pstr[0 .. cidx];
	if( anyOf(url, " \t") ) return false;
	if( !anyOf(url, ":@") ) return false;
	str = pstr[cidx+1 .. $];
	// Only bare addresses get the mailto: scheme. URLs that already carry a
	// scheme (e.g. "http://user@host" or "mailto:a@b") are left as they are.
	if( url.indexOf('@') > 0 && url.indexOf(':') < 0 ) url = "mailto:"~url;
	return true;
}

// Collects all link reference definitions and removes the lines containing
// them from `lines`. The result is indexed by the lower case reference id.
private LinkRef[string] scanForReferences(ref string[] lines)
pure @safe {
	LinkRef[string] ret;
	bool[size_t] reflines;

	// search for reference definitions:
	//   [refid] link "opt text"
	//   [refid] <link> "opt text"
	//   "opt text", 'opt text', (opt text)
	//   line must not be indented
	foreach( lnidx, ln; lines ){
		if( isLineIndented(ln) ) continue;
		ln = strip(ln);
		if( !ln.startsWith("[") ) continue;
		ln = ln[1 .. $];

		auto idx = () @trusted { return ln.indexOf("]:"); }();
		if( idx < 0 ) continue;
		string refid = ln[0 .. idx];
		ln = stripLeft(ln[idx+2 .. $]);

		string url;
		if( ln.startsWith("<") ){
			idx = ln.indexOfCT('>');
			if( idx < 0 ) continue;
			url = ln[1 .. idx];
			ln = ln[idx+1 .. $];
		} else {
			// the URL ends at the first space or tab, or at the end of the line
			idx = ln.indexOfCT(' ');
			if( idx > 0 ){
				url = ln[0 .. idx];
				ln = ln[idx+1 .. $];
			} else {
				idx = ln.indexOfCT('\t');
				if( idx < 0 ){
					url = ln;
					ln = ln[$ .. $];
				} else {
					url = ln[0 .. idx];
					ln = ln[idx+1 .. $];
				}
			}
		}
		ln = stripLeft(ln);

		string title;
		if( ln.length >= 3 ){
			if( ln[0] == '(' && ln[$-1] == ')' || ln[0] == '\"' && ln[$-1] == '\"' || ln[0] == '\'' && ln[$-1] == '\'' )
				title = ln[1 .. $-1];
		}

		ret[toLower(refid)] = LinkRef(refid, url, title);
		reflines[lnidx] = true;

		// debug if (!__ctfe) logTrace("[detected ref on line %d]", lnidx+1);
	}

	// remove all lines containing references
	auto nonreflines = appender!(string[])();
	nonreflines.reserve(lines.length);
	foreach( i, ln; lines )
		if( i !in reflines )
			nonreflines.put(ln);
	lines = nonreflines.data();

	return ret;
}


/**
	Generates an identifier suitable to use as within a URL.

	The resulting string will contain only ASCII lower case alphabetic or
	numeric characters, as well as dashes (-). Every sequence of
	non-alphanumeric characters will be replaced by a single dash. No dashes
	will be at either the front or the back of the result string.
*/
auto asSlug(R)(R text)
	if (isInputRange!R && is(typeof(R.init.front) == dchar))
{
	static struct SlugRange {
		private {
			R _input;
			// set while a dash still has to be emitted before the next character
			bool _dash;
		}

		this(R input)
		{
			_input = input;
			skipNonAlphaNum();
		}

		@property bool empty() const { return _dash ? false : _input.empty; }
		@property char front() const {
			if (_dash) return '-';

			char r = cast(char)_input.front;
			if (r >= 'A' && r <= 'Z') return cast(char)(r + ('a' - 'A'));
			return r;
		}

		void popFront()
		{
			if (_dash) {
				_dash = false;
				return;
			}

			_input.popFront();
			auto na = skipNonAlphaNum();
			// no dash is emitted at the very end of the input
			if (na && !_input.empty)
				_dash = true;
		}

		// Skips to the next ASCII alphanumeric character. Returns true if
		// at least one character has been skipped.
		private bool skipNonAlphaNum()
		{
			bool have_skipped = false;
			while (!_input.empty) {
				switch (_input.front) {
					default:
						_input.popFront();
						have_skipped = true;
						break;
					case 'a': .. case 'z':
					case 'A': .. case 'Z':
					case '0': .. case '9':
						return have_skipped;
				}
			}
			return have_skipped;
		}
	}
	return SlugRange(text);
}

unittest {
	import std.algorithm : equal;
	assert("".asSlug.equal(""));
	assert(".,-".asSlug.equal(""));
	assert("abc".asSlug.equal("abc"));
	assert("aBc123".asSlug.equal("abc123"));
	assert("....aBc...123...".asSlug.equal("abc-123"));
}

// A link reference definition found by scanForReferences.
private struct LinkRef {
	string id;
	string url;
	string title;
}

// An inline link or image as parsed by parseLink.
private struct Link {
	string text;
	string url;
	string title;
}

@safe unittest { // alt and title attributes
	assert(filterMarkdown("![alt](http://example.org/image)")
		== "<p><img src=\"http://example.org/image\" alt=\"alt\">\n</p>\n");
	assert(filterMarkdown("![alt](http://example.org/image \"Title\")")
		== "<p><img src=\"http://example.org/image\" alt=\"alt\" title=\"Title\">\n</p>\n");
}

@safe unittest { // complex links
	assert(filterMarkdown("their [install\ninstructions](<http://www.brew.sh>) and")
		== "<p>their <a href=\"http://www.brew.sh\">install\ninstructions</a> and\n</p>\n");
	assert(filterMarkdown("[![Build 
Status](https://travis-ci.org/rejectedsoftware/vibe.d.png)](https://travis-ci.org/rejectedsoftware/vibe.d)") 1269 == "<p><a href=\"https://travis-ci.org/rejectedsoftware/vibe.d\"><img src=\"https://travis-ci.org/rejectedsoftware/vibe.d.png\" alt=\"Build Status\"></a>\n</p>\n"); 1270 } 1271 1272 @safe unittest { // check CTFE-ability 1273 enum res = filterMarkdown("### some markdown\n[foo][]\n[foo]: /bar"); 1274 assert(res == "<h3 id=\"some-markdown\"> some markdown</h3>\n<p><a href=\"/bar\">foo</a>\n</p>\n", res); 1275 } 1276 1277 @safe unittest { // correct line breaks in restrictive mode 1278 auto res = filterMarkdown("hello\nworld", MarkdownFlags.forumDefault); 1279 assert(res == "<p>hello<br/>world\n</p>\n", res); 1280 } 1281 1282 /*@safe unittest { // code blocks and blockquotes 1283 assert(filterMarkdown("\tthis\n\tis\n\tcode") == 1284 "<pre><code>this\nis\ncode</code></pre>\n"); 1285 assert(filterMarkdown(" this\n is\n code") == 1286 "<pre><code>this\nis\ncode</code></pre>\n"); 1287 assert(filterMarkdown(" this\n is\n\tcode") == 1288 "<pre><code>this\nis</code></pre>\n<pre><code>code</code></pre>\n"); 1289 assert(filterMarkdown("\tthis\n\n\tcode") == 1290 "<pre><code>this\n\ncode</code></pre>\n"); 1291 assert(filterMarkdown("\t> this") == 1292 "<pre><code>> this</code></pre>\n"); 1293 assert(filterMarkdown("> this") == 1294 "<blockquote><pre><code>this</code></pre></blockquote>\n"); 1295 assert(filterMarkdown("> this\n is code") == 1296 "<blockquote><pre><code>this\nis code</code></pre></blockquote>\n"); 1297 }*/