Browse Source

Markdown.pl: avoid auto-closing <p> problems

When forming paragraphs, a $string is wrapped to become <p>$string</p>.

If the opening "<p>" ends up being auto-closed by markup within
$string, then either another "<p>" must be auto-opened or the closing
"</p>" of the wrapper must be silently dropped to avoid a validation
failure.

Figuring out exactly where to auto-open the "<p>" turns out to be
somewhat more difficult than just dropping the wrapper's "</p>".

For now, simply drop the wrapper's closing "</p>" if the wrapper's
opening "<p>" has already been auto-closed by the time the validator
encounters the wrapper's closing "</p>".

At the same time, make sure that all tags with an optional closing tag
that occur after the wrapper's opening "<p>" get closed immediately
upon encountering the wrapper's closing "</p>" (whether or not it
ultimately gets dropped).

With these changes, this input:

    line<p>one

    line<p>three

or this input:

    line<p>one</p>

    line<p>three</p>

produces this output:

    <p>line</p><p>one</p>

    <p>line</p><p>three</p>

While this input:

    line<p>one</p>x1

    line<p>three</p>x3

produces this output:

    <p>line</p><p>one</p>x1

    <p>line</p><p>three</p>x3

In this last example, the "x1" and "x3" text is left hanging outside
of a "p" section.  The client "user agent" (aka browser) will end
up rendering these hanging "x1" and "x3" pieces of text in their
own "p" sections.

With these changes, simple markup that was previously rejected for no
apparent reason by the default `--validate-xml-internal` parser, yet
accepted by the `--validate-xml` option, is now accepted by the
`--validate-xml-internal` parser as well.

Signed-off-by: Kyle J. McKay <mackyle@gmail.com>
master
Kyle J. McKay 4 years ago
parent
commit
e028ec2909
  1. 85
      Markdown.pl

85
Markdown.pl

@ -58,12 +58,15 @@ BEGIN {
#
# Global default settings:
#
my ($g_style_prefix, $g_empty_element_suffix, $g_indent_width, $g_tab_width);
my ($g_style_prefix, $g_empty_element_suffix, $g_indent_width, $g_tab_width,
$g_start_p, $g_close_p);
BEGIN {
$g_style_prefix = "_markdown-"; # Prefix for markdown css class styles
$g_empty_element_suffix = " />"; # Change to ">" for HTML output
$g_indent_width = 4; # Number of spaces considered new level
$g_tab_width = 4; # Legacy even though it's wrong
$g_start_p = "<p>"; # _FormParagraphs open paragraph tag
$g_close_p = "</p>"; # _FormParagraphs close paragraph tag
}
@ -703,6 +706,15 @@ sub _SanitizeOpts {
$o->{sanitize} = 1 if $o->{stripcomments} && !$o->{sanitize};
$o->{sanitize} = 1 if $o->{xmlcheck} == 2 && !$o->{sanitize};
# this is gross, but having the globals avoids unnecessary slowdown
if ($o->{sanitize} && $o->{xmlcheck} == 2) {
$g_start_p = "<\20>";
$g_close_p = "</\20>";
} else {
$g_start_p = "<p>";
$g_close_p = "</p>";
}
defined($o->{empty_element_suffix}) &&
($o->{empty_element_suffix} eq " />" || $o->{empty_element_suffix} eq ">")
or $o->{empty_element_suffix} = " />";
@ -936,8 +948,8 @@ sub _StripLinkDefinitions {
my ($block_tags_a, $block_tags_b);
BEGIN {
$block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/io;
$block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/io;
$block_tags_a = qr/\020|p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/io;
$block_tags_b = qr/\020|p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/io;
}
sub _HashHTMLBlocks {
@ -970,7 +982,7 @@ sub _HashHTMLBlocks {
<($block_tags_a) # start tag = $3
\b # word break
(?:.*\n)*? # any number of lines, minimally matching
\2</\3> # the matching end tag
\2</\3\s*> # the matching end tag
[ ]* # trailing spaces
(?=\n+|\Z) # followed by a newline or end of document
)
@ -991,7 +1003,7 @@ sub _HashHTMLBlocks {
<($block_tags_b) # start tag = $2
\b # word break
(?:.*\n)*? # any number of lines, minimally matching
.*</\2> # the matching end tag
.*</\2\s*> # the matching end tag
[ ]* # trailing spaces
(?=\n+|\Z) # followed by a newline or end of document
)
@ -2562,8 +2574,8 @@ sub _FormParagraphs {
foreach (@grafs) {
unless (defined($g_html_blocks{$_}) || defined($g_code_blocks{$_})) {
$_ = _RunSpanGamut($_);
s/^([ ]*)/<p>/;
$_ .= "</p>";
s/^([ ]*)/$g_start_p/;
$_ .= $g_close_p;
}
}
@ -2595,8 +2607,8 @@ my $g_possible_tag_name;
my %ok_tag_name;
BEGIN {
# note: length("blockquote") == 10
$g_possible_tag_name = qr/(?i:[a-z]{1,10}|h[1-6])/o;
%ok_tag_name = map({$_ => 1} qw(
$g_possible_tag_name = qr/(?i:[a-z]{1,10}|h[1-6]|\020)/o;
%ok_tag_name = map({$_ => 1} "\20", qw(
a abbr acronym address area
b basefont bdo big blockquote br
caption center cite code col colgroup
@ -2753,8 +2765,9 @@ sub _SanitizeTags {
my $lastmt = "";
$autoclose = sub {
my $s = $_[0] || "";
while (@stack && $stack[$#stack]->[0] ne $s &&
$tagocl{$stack[$#stack]->[0]}) {
while (@stack &&
($stack[$#stack]->[0] ne $s || $_[1] && !$stack[$#stack]->[2]) &&
$tagocl{$stack[$#stack]->[0]}) {
$ans .= "</" . $stack[$#stack]->[0] . ">";
pop(@stack);
}
@ -2767,7 +2780,11 @@ sub _SanitizeTags {
else {return}
while (@stack && $c->{$stack[$#stack]->[0]}) {
$ans .= "</" . $stack[$#stack]->[0] . ">";
pop(@stack);
if ($stack[$#stack]->[2]) {
$stack[$#stack]->[0] = "\20";
} else {
pop(@stack);
}
}
} if $validate;
while (pos($text) < $end) {
@ -2795,23 +2812,32 @@ sub _SanitizeTags {
$tag =~ m{^</($g_possible_tag_name)\s*>}) &&
$ok_tag_name{$tt=lc($1)})
{
my ($stag, $styp) = _Sanitize($tag);
my ($stag, $styp, $autocloseflag) = _Sanitize($tag);
if ($styp == 2 && $lastmt eq $tt) {
$lastmt = "";
next;
}
$lastmt = $styp == 3 ? $tt : "";
$lastmt = $styp == -3 ? $tt : "";
$tt = "p" if $autocloseflag;
if ($validate && $styp) {
&$autoclopen($tt) if $styp == 1 || $styp == 3;
&$autoclopen($tt) if $styp != 2;
if ($styp == 1) {
push(@stack,[$tt,$tstart]);
push(@stack,[$tt,$tstart,$autocloseflag]);
} elsif ($styp == 2) {
&$autoclose($tt) unless $tt eq "p";
!@stack and _xmlfail("closing tag $tt without matching open at " .
_linecol($tstart, $text));
if ($stack[$#stack]->[0] eq $tt) {
&$autoclose($tt, $autocloseflag);
my $mtstkchk = sub {
!@stack and _xmlfail("closing tag $tt without matching open at " .
_linecol($tstart, $text));
};
&$mtstkchk;
if ($autocloseflag && $stack[$#stack]->[0] eq "\20") {
pop(@stack);
$stag = "";
} elsif ($stack[$#stack]->[0] eq $tt) {
pop(@stack);
} else {
pop(@stack) while @stack && $stack[$#stack]->[0] eq "\20";
&$mtstkchk;
my @i = @{$stack[$#stack]};
_xmlfail("opening tag $i[0] at " . _linecol($i[1], $text) .
" mismatch with closing tag $tt at " . _linecol($tstart, $text));
@ -2838,10 +2864,11 @@ sub _SanitizeTags {
my $j;
for ($j = 0; $j <= $#stack; ++$j) {
my @i = @{$stack[$j]};
next if $i[0] eq "\20";
unshift(@errs, "opening tag $i[0] without matching close at " .
_linecol($i[1], $text));
}
_xmlfail(@errs);
_xmlfail(@errs) unless !@errs;
}
return $ans."\n";
}
@ -2866,11 +2893,14 @@ sub _Sanitize {
my $tag = shift;
my $seenatt = {};
if ($tag =~ m{^</}) {
$tag =~ s/\s+>$/>/;
return (lc($tag),2);
my $autocloseflag = undef;
$autocloseflag = 1, $tag="</p>" if $tag eq "</\20>";
return (lc($tag),2,$autocloseflag);
}
if ($tag =~ /^<([^\s<\/>]+)\s+/gs) {
my $tt = lc($1);
my $autocloseflag = undef;
$autocloseflag = 1, $tt="p" if $tt eq "\20";
my $out = "<" . $tt . " ";
my $ok = $tagatt{$tt};
ref($ok) eq "HASH" or $ok = {};
@ -2912,13 +2942,13 @@ sub _Sanitize {
$out =~ s/\s+$//;
my $typ = 1;
if ($tagmt{$tt}) {
$typ = 3;
$typ = ($tag =~ m,/>$,) ? 3 : -3;
$out .= $opt{empty_element_suffix};
} else {
$out .= ">";
$out .= "</$tt>" and $typ = 3 if $tag =~ m,/>$,;
}
return ($out,$typ);
return ($out,$typ,$autocloseflag);
} elsif ($tag =~ /^<([^\s<\/>]+)/s) {
my $tt = lc($1);
return ("&lt;" . substr($tag,1), 0) if $taga1p{$tt};
@ -2927,7 +2957,8 @@ sub _Sanitize {
} elsif ($tag =~ m,/>$,) {
return ("<" . $tt . "></" . $tt . ">", 3);
} else {
return ("<" . $tt . ">", 1);
return ("<" . $tt . ">", 1) unless $tt eq "\20";
return ("<p>", 1, 1);
}
}
return (lc($tag),0);
@ -3019,7 +3050,7 @@ sub _EncodeAmpsAndAngles {
$text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
# Encode naked <'s
$text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
$text =~ s{<(?![\020a-z/?\$!])}{&lt;}gi;
$text =~ s{<(?=[^>]*$)}{&lt;}g;
# Encode <'s that cannot possibly be a start or end tag

Loading…
Cancel
Save