From 4de8c983f16ea0957d3c1454a281b53279e2d05a Mon Sep 17 00:00:00 2001 From: "Kyle J. McKay" Date: Mon, 21 Oct 2019 07:19:08 -0700 Subject: [PATCH] Markdown.pl: avoid list nesting confusion With this source: * A * B C D * E * F G * H I The parser was getting confused about where each unordered list actually ended when nesting the processed inner list inside the outer list. Address this by: 1) Giving "```" style code blocks their own hash just in case to make sure they can never collide with html blocks. 2) Temporarily (the indentation gets removed before final output) indent nested list tags by the current list nesting level to ensure there's no confusion about where each list/sublist starts and ends. 3) Make sure the list closing tag is followed by two newlines rather than one to avoid potentially not applying markup to the immediately following line. Since a few patterns had minor adjustments for these changes, those patterns also had a few unnecessarily capturing groups changed to non-capturing groups in the hope that some minuscule performance gain can be squeezed out with the change. Signed-off-by: Kyle J. McKay --- Markdown.pl | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/Markdown.pl b/Markdown.pl index b8e8414..eb9bfce 100755 --- a/Markdown.pl +++ b/Markdown.pl @@ -75,6 +75,7 @@ my %g_titles; my %g_anchors; my %g_anchors_id; my %g_block_ids; +my %g_code_block_ids; my %g_html_blocks; my %g_code_blocks; my %opt; @@ -88,9 +89,10 @@ my %opt; # else, it's prefixed with a control character and suffixed with another # both of which are not allowed by the XML standard or Unicode. sub block_id { - $_[1] ? 
- "\2".refaddr(\$g_perm_block_ids{$_[0]})."\3" : - "\5".refaddr(\$g_block_ids{$_[0]})."\6"; + $_[1] or return "\5".refaddr(\$g_block_ids{$_[0]})."\6"; + $_[1] == 1 and return "\2".refaddr(\$g_perm_block_ids{$_[0]})."\3"; + $_[1] == 2 and return "\25".refaddr(\$g_code_block_ids{$_[0]})."\26"; + die "programmer error: bad block_id type $_[1]"; } # Regex to match balanced [brackets]. See Friedl's @@ -439,6 +441,7 @@ sub Markdown { %g_titles = (); %g_anchors = (); %g_block_ids = (); + %g_code_block_ids = (); %g_html_blocks = (); %g_code_blocks = (); $g_list_level = 0; @@ -470,8 +473,11 @@ sub Markdown { $text = _RunBlockGamut($text, 1); + # Remove indentation markers + $text =~ s/\027+//gs; + # Unhashify code blocks - $text =~ s/(\005\d+\006)/$g_code_blocks{$1}/g; + $text =~ s/(\025\d+\026)/$g_code_blocks{$1}/g; $text = _UnescapeSpecialChars($text); @@ -578,6 +584,7 @@ BEGIN { sub _HashHTMLBlocks { my $text = shift; my $less_than_indent = $opt{indent_width} - 1; + my $idt = "\027" x $g_list_level; # Hashify HTML blocks: # We only want to do this for block-level HTML tags, such as headers, @@ -600,10 +607,11 @@ sub _HashHTMLBlocks { $text =~ s{ ( # save in $1 ^ # start of line (with /m) - <($block_tags_a) # start tag = $2 + ((?:\Q$idt\E)?) # optional lead in = $2 + <($block_tags_a) # start tag = $3 \b # word break - (.*\n)*? # any number of lines, minimally matching - # the matching end tag + (?:.*\n)*? # any number of lines, minimally matching + \2 # the matching end tag [ ]* # trailing spaces (?=\n+|\Z) # followed by a newline or end of document ) @@ -620,9 +628,10 @@ sub _HashHTMLBlocks { $text =~ s{ ( # save in $1 ^ # start of line (with /m) + (?:\Q$idt\E)? # optional lead in <($block_tags_b) # start tag = $2 \b # word break - (.*\n)*? # any number of lines, minimally matching + (?:.*\n)*? 
# any number of lines, minimally matching .* # the matching end tag [ ]* # trailing spaces (?=\n+|\Z) # followed by a newline or end of document @@ -642,9 +651,9 @@ sub _HashHTMLBlocks { ) ( # save in $1 [ ]{0,$less_than_indent} - <(hr) # start tag = $2 + <(?:hr) # start tag \b # word break - ([^<>])*? # + (?:[^<>])*? # /?> # the matching end tag [ ]* (?=\n{2,}|\Z) # followed by a blank line or end of document @@ -1422,7 +1431,8 @@ sub _DoLists { $list_att .= " start=\"$first_marker_num\"" unless $first_marker_num == 1; } } - $result = "<$list_type$list_att$list_class>\n$list_incr" . $result . "\n"; + my $idt = "\027" x $g_list_level; + $result = "$idt<$list_type$list_att$list_class>\n$list_incr" . $result . "$idt\n\n"; $result; }; @@ -1513,6 +1523,7 @@ sub _ProcessListItems { # starting cardinal number; e.g. "1." or "a.". $g_list_level++; + my $idt = "\027" x $g_list_level; my $marker_kind = $list_type eq "ul" ? $marker_ul : $marker_ol; my $first_marker; my $first_marker_type; @@ -1639,12 +1650,12 @@ sub _ProcessListItems { } # Append to $result - $result .= "$incr" . $checkbox . $item . "\n"; + $result .= "$incr$idt" . $checkbox . $item . "$idt\n"; } if ($fancy) { # remove "incrlevel=$g_list_level " parts $result =~ s{} - {}g; + {$idt}g; } else { # remove the $g_list_level incr spans entirely $result =~ s{\n}{}g; @@ -1687,7 +1698,7 @@ sub _DoCodeBlocks { my $result = "
"
 		. $codeblock . "\n
"; - my $key = block_id($result); + my $key = block_id($result, 2); $g_code_blocks{$key} = $result; "\n\n" . $key . "\n\n"; }egmx;