From 964670e66b6a3ec3e82fd422fb3608856deac976 Mon Sep 17 00:00:00 2001 From: "Kyle J. McKay" Date: Sun, 31 Jan 2021 08:42:45 -0700 Subject: [PATCH] Markdown.pl: run _EncodeAmpsAndAngles on top-level raw html blocks At the top level of the document, the _HashHTMLBlocks function gets called to sequester raw top-level html blocks from being processed. As a result, anything in these top-level blocks escapes general Markdown processing except that if XML validation has been enabled (the default), the final result of processing does always pass through a validation stage. On the one hand that's good as it allows raw HTML in Markdown docs, but on the other hand, some basic fix ups are not happening and that's bad. Rather than try and push all of the top-level raw HTML block content through either _RunBlockGamut or _RunSpanGamut (thereby somewhat defeating the point of allowing raw HTML top-level blocks in the first place), use a compromise between the two extremes and push all the text of raw HTML block content through just the _EncodeAmpsAndAngles function. This causes things like non-html-escaped ampersands (&) inside "href" and "src" attributes to magically be transformed into "&amp;" and at the same time any url adjustment options (i.e. -r, -i, -b, -a) to be applied. The result produces better and less surprising outcomes than before. Signed-off-by: Kyle J. McKay --- Markdown.pl | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Markdown.pl b/Markdown.pl index 6870d04..a2dbd78 100755 --- a/Markdown.pl +++ b/Markdown.pl @@ -1011,7 +1011,7 @@ sub Markdown { $text =~ s/^ +$//mg; # Turn block-level HTML blocks into hash entries - $text = _HashHTMLBlocks($text); + $text = _HashHTMLBlocks($text, 1); # Strip link definitions, store in hashes. 
$text = _StripLinkDefinitions($text); @@ -1150,6 +1150,7 @@ sub _StripLinkDefinitions { return $text; } +my %ok_tag_name; # initialized later my ($block_tags_a, $block_tags_b); BEGIN { $block_tags_a = qr/\020|p|div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/io; @@ -1157,9 +1158,12 @@ BEGIN { } sub _HashHTMLBlocks { - my $text = shift; + my ($text, $toplevel) = @_; my $less_than_indent = $opt{indent_width} - 1; my $idt = "\027" x $g_list_level; + my $blkprc = $toplevel ? + sub { return $ok_tag_name{$_[1]} ? _EncodeAmpsAndAngles($_[0]) : $_[0] } : + sub { return $_[0] }; # Hashify HTML blocks: # We only want to do this for block-level HTML tags, such as headers, @@ -1191,8 +1195,9 @@ sub _HashHTMLBlocks { (?=\n+|\Z) # followed by a newline or end of document ) }{ - my $key = block_id($1); - $g_html_blocks{$key} = $1; + my $blk = &$blkprc($1, $3); + my $key = block_id($blk); + $g_html_blocks{$key} = $blk; "\n\n" . $key . "\n\n"; }eigmx; @@ -1212,8 +1217,9 @@ sub _HashHTMLBlocks { (?=\n+|\Z) # followed by a newline or end of document ) }{ - my $key = block_id($1); - $g_html_blocks{$key} = $1; + my $blk = &$blkprc($1, $2); + my $key = block_id($blk); + $g_html_blocks{$key} = $blk; "\n\n" . $key . "\n\n"; }eigmx; @@ -2874,8 +2880,8 @@ sub _FormParagraphs { } +# %ok_tag_name declared previously my $g_possible_tag_name; -my %ok_tag_name; BEGIN { # note: length("blockquote") == 10 $g_possible_tag_name = qr/(?i:[a-z]{1,10}|h[1-6]|\020)/o;