From 845104c13a9673835e98b3701a50ffd74118fab2 Mon Sep 17 00:00:00 2001 From: "Kyle J. McKay" Date: Sat, 13 Feb 2021 08:13:07 -0700 Subject: [PATCH] Markdown.pl: improve handling of auto-closed p tags Given an input document like this:

    <div><p><pre>hi</pre></p></div>

It will validate just fine in `--raw-xml` mode. However, in normal "html/xhtml" mode, the "pre" opening tag automatically closes the currently open "p" tag, leading to this:

    <div><p></p><pre>hi</pre></p></div>

Without further intervention, the closing "p" tag that was already there (just before the closing "div" tag) now has no matching open "p" tag to close anymore -- the corresponding open tag is now the open "div" tag. Obviously the document fails to validate at this point. The naive fix simply makes the closing tag that corresponds to the opening tag that caused the "p" to be auto-closed automatically re-open a "p" at that point, producing this:

    <div><p></p><pre>hi</pre><p></p></div>

While such a solution does work, it frequently ends up introducing extra unwanted "p" sections. Instead of reopening the "p" immediately upon seeing the closing tag that matches the opening tag that auto-closed the "p", simply set a "reopen p" flag. When the "reopen p" flag is set and suitable conditions are met, then go ahead and "reopen" a new "p" tag. The exact conditions are a bit of a heuristic at the moment, but amount to clearing the "reopen p" flag when the next start tag is seen and inserting a new "p" at that time only if that start tag is a text-level element opening tag. Alternatively, if the "reopen p" flag is currently set and some non-whitespace text shows up before another open tag is seen, re-open a new "p" at that point (and clear the "reopen p" flag). Finally, if the flag is currently set and a closing "p" tag appears, just discard it and clear the "reopen p" flag. Essentially, this case has the effect of just moving the closing "p" tag. With these changes, the troublesome document now produces this:

    <div><p></p><pre>hi</pre></div>
An improvement on what came before. Some might argue that the empty "p" section ought to simply be omitted entirely. Perhaps. But there was an explicit open "p" tag in the text -- auto closing it is one thing -- removing an explicit open tag entirely is something else. Additionally, since the validator validates in a "streamy" way, that's much more difficult to accomplish since at the time the initial opening "p" has been seen there's not yet any information available about the fact it's about to be auto-closed while still not containing any text and it therefore gets emitted to the output. Signed-off-by: Kyle J. McKay --- Markdown.pl | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/Markdown.pl b/Markdown.pl index 0ef5fdc..45bef2a 100755 --- a/Markdown.pl +++ b/Markdown.pl @@ -3053,6 +3053,7 @@ my %tagmt; # empty element tags my %tagocl; # non-empty elements with optional closing tag my %tagacl; # which %tagocl an opening %tagocl will close my %tagblk; # block elements +my %taginl; # inline markup tags which trigger an auto

reopen my %taga1p; # open tags which require at least one attribute my %lcattval; # names of attribute values to lowercase my %impatt; # names of "implied" attributes @@ -3109,6 +3110,8 @@ BEGIN { 'tr' => { map({$_ => 1} qw(colgroup dd dt li p td tfoot th thead tr)) }, ); %tagblk = map({$_ => 1} qw(address blockquote center div dl h1 h2 h3 h4 h5 h6 hr ol p pre table ul)); + %taginl = map({$_ => 1} qw(a abbr acronym b basefont bdo big br cite code dfn em font i + img kbd map q s samp small span strike strong sub sup tt u var)); %impatt = map({$_ => 1} qw(checked compact ismap nohref noshade nowrap)); %lcattval = map({$_ => 1} qw( align border cellpadding cellspacing checked clear color colspan @@ -3136,6 +3139,7 @@ sub _SanitizeTags { pos($text) = 0; my ($autoclose, $autoclopen); my $lastmt = ""; + my $reopenp = 0; $autoclose = $htmlauto ? sub { my $s = $_[0] || ""; while (@stack && @@ -3151,7 +3155,9 @@ sub _SanitizeTags { if ($tagblk{$s}) {$c = {p=>1}} elsif ($tagocl{$s}) {$c = $tagacl{$s}} else {return} + my $clp = 0; while (@stack && $c->{$stack[$#stack]->[0]}) { + $clp = 0; if ($stack[$#stack]->[2] && $stack[$#stack]->[1]+3 eq $_[1]) { $ans .= ""; @@ -3161,9 +3167,11 @@ sub _SanitizeTags { if ($stack[$#stack]->[2]) { $stack[$#stack]->[0] = "\20"; } else { - pop(@stack); + $clp = $s ne "p" && $stack[$#stack]->[0] eq "p"; + pop(@stack); } } + $clp; } : sub {} if $validate; while (pos($text) < $end) { if ($text =~ /\G(\s+)/gc) { @@ -3173,8 +3181,14 @@ sub _SanitizeTags { if ($text =~ /\G([^<]+)/gc) { if ($validate && @stack && $stack[$#stack]->[0] eq "\20") { push(@stack,["p",pos($text)-length($1)]); + $reopenp = 0; $ans .= "

"; } + $reopenp && do { + push(@stack,["p",pos($text)-length($1)]); + $reopenp = 0; + $ans .= "

"; + }; $ans .= _EncodeAmps($1); $lastmt = ""; next; @@ -3206,10 +3220,19 @@ sub _SanitizeTags { $lastmt = $styp == -3 ? $tt : ""; $tt = "p" if $autocloseflag; if ($validate && $styp) { - &$autoclopen($tt, $tstart) if $styp != 2; + my $clp = &$autoclopen($tt, $tstart) if $styp != 2; if ($styp == 1) { - push(@stack,[$tt,$tstart,$autocloseflag]); + $reopenp && $taginl{$tt} and do { + push(@stack,["p",$tstart]); + $ans .= "

"; + }; + push(@stack,[$tt,$tstart,$autocloseflag,$clp]); + $reopenp = 0; } elsif ($styp == 2) { + $reopenp && ($tt eq "p" || $tt eq "\20") and do { + $reopenp = 0; + next; + }; &$autoclose($tt, $autocloseflag); my $mtstkchk = sub { !@stack and _xmlfail("closing tag $tt without matching open at " . @@ -3220,6 +3243,7 @@ sub _SanitizeTags { pop(@stack); $stag = ""; } elsif ($stack[$#stack]->[0] eq $tt) { + $stack[$#stack]->[3] and $reopenp = 1; pop(@stack); } else { pop(@stack) while @stack && $stack[$#stack]->[0] eq "\20";