From e0abe51cbaf71073d6c669b448b80525abed0318 Mon Sep 17 00:00:00 2001 From: "Kyle J. McKay" Date: Wed, 13 Nov 2019 14:15:55 -0700 Subject: [PATCH] Markdown.pl: sanitize and validate more With `--sanitize`, minimized empty tags that should not be for XHTML (e.g. "<p />") have been automatically split into separate start and end tags (e.g. "<p></p>"). Do the same in reverse for separate start and end tags that should not be for XHTML (e.g. "<hr></hr>") and turn them into a single minimized tag (e.g. "<hr />
"). Additionally, when `--validate-xml-internal` is active, automatically insert omitted optional end tags in (hopefully) the right places. For example, "" will automatically become "" thus making it valid XHTML. When there are multiple errors reported (can only happen when there are multiple opening tags missing their ending tags), report the errors in reverse order (i.e. the first one reported will be the largest line number) because that will often identify the source of the trouble as the first error line due to the nature of tag nesting. Make a few related wordsmithing changes at the same time. Signed-off-by: Kyle J. McKay --- Markdown.pl | 208 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 135 insertions(+), 73 deletions(-) diff --git a/Markdown.pl b/Markdown.pl index f86763f..8999faa 100755 --- a/Markdown.pl +++ b/Markdown.pl @@ -2245,6 +2245,76 @@ sub _DoTag { } +my %univatt; # universally allowed attribute names +my %tagatt; # per-element allowed attribute names +my %tagmt; # empty element tags +my %tagocl; # non-empty elements with optional closing tag +my %tagacl; # which %tagocl an opening %tagocl will close +my %tagblk; # block elements +my %lcattval; # names of attribute values to lowercase +my %impatt; # names of "implied" attributes +BEGIN { + %univatt = map({$_ => 1} qw(class dir id lang style title xml:lang)); + %tagatt = ( + 'a' => { map({$_ => 1} qw(href name)) }, + 'area' => { map({$_ => 1} qw(alt coords href nohref shape)) }, + 'basefont' => { map({$_ => 1} qw(color face size)) }, + 'br' => { map({$_ => 1} qw(clear)) }, + 'caption' => { map({$_ => 1} qw(align)) }, + 'col' => { map({$_ => 1} qw(align span width valign)) }, + 'colgroup' => { map({$_ => 1} qw(align span width valign)) }, + 'dir' => { map({$_ => 1} qw(compact)) }, + 'div' => { map({$_ => 1} qw(align)) }, + 'dl' => { map({$_ => 1} qw(compact)) }, + 'font' => { map({$_ => 1} qw(color face size)) }, + 'h1' => { map({$_ => 1} qw(align)) }, + 'h2' => { map({$_ => 
1} qw(align)) }, + 'h3' => { map({$_ => 1} qw(align)) }, + 'h4' => { map({$_ => 1} qw(align)) }, + 'h5' => { map({$_ => 1} qw(align)) }, + 'h6' => { map({$_ => 1} qw(align)) }, + 'hr' => { map({$_ => 1} qw(align noshade size width)) }, + # NO server-side image maps, therefore NOT ismap ! + 'img' => { map({$_ => 1} qw(align alt border height hspace src usemap vspace width)) }, + 'li' => { map({$_ => 1} qw(compact type value)) }, + 'map' => { map({$_ => 1} qw(name)) }, + 'menu' => { map({$_ => 1} qw(compact)) }, + 'ol' => { map({$_ => 1} qw(compact start type)) }, + 'p' => { map({$_ => 1} qw(align)) }, + 'pre' => { map({$_ => 1} qw(width)) }, + 'table' => { map({$_ => 1} qw(align border cellpadding cellspacing summary width)) }, + 'tbody' => { map({$_ => 1} qw(align valign)) }, + 'tfoot' => { map({$_ => 1} qw(align valign)) }, + 'thead' => { map({$_ => 1} qw(align valign)) }, + 'td' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) }, + 'th' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) }, + 'tr' => { map({$_ => 1} qw(align valign)) }, + 'ul' => { map({$_ => 1} qw(compact type)) } + ); + %tagmt = map({$_ => 1} qw(area basefont br col hr img)); + %tagocl = map({$_ => 1} qw(colgroup dd dt li p tbody td tfoot th thead tr)); + %tagacl = ( + 'colgroup' => \%tagocl, + 'dd' => \%tagocl, + 'dt' => \%tagocl, + 'li' => \%tagocl, + 'tbody' => \%tagocl, + 'td' => { map({$_ => 1} qw(colgroup dd dt li p td tfoot th thead)) }, + 'tfoot' => \%tagocl, + 'th' => { map({$_ => 1} qw(colgroup dd dt li p td tfoot th thead)) }, + 'thead' => \%tagocl, + 'tr' => { map({$_ => 1} qw(colgroup dd dt li p td tfoot th thead tr)) }, + ); + %tagblk = map({$_ => 1} qw(address blockquote div dl h1 h2 h3 h4 h5 h6 hr ol p pre table)); + %impatt = map({$_ => 1} qw(checked compact ismap nohref noshade nowrap)); + %lcattval = map({$_ => 1} qw( + align border cellpadding cellspacing checked clear color colspan + compact coords height hspace ismap nohref 
noshade nowrap rowspan size + span shape valign vspace width + )); +} + + # _SanitizeTags # # Inspect all '<'...'>' tags in the input and HTML encode those things @@ -2254,13 +2324,37 @@ sub _DoTag { # <= sanitized text sub _SanitizeTags { my ($text, $validate) = @_; + $text =~ s/\s+$//; + $text ne "" or return ""; my @stack = (); my $ans = ""; my $end = length($text); pos($text) = 0; + my ($autoclose, $autoclopen); + my $lastmt = ""; + $autoclose = sub { + my $s = $_[0] || ""; + while (@stack && $stack[$#stack]->[0] ne $s && + $tagocl{$stack[$#stack]->[0]}) { + $ans .= "[0] . ">"; + pop(@stack); + } + } if $validate; + $autoclopen = sub { + my $s = $_[0] || ""; + my $c; + if ($tagblk{$s}) {$c = {p=>1}} + elsif ($tagocl{$s}) {$c = $tagacl{$s}} + else {return} + while (@stack && $c->{$stack[$#stack]->[0]}) { + $ans .= "[0] . ">"; + pop(@stack); + } + } if $validate; while (pos($text) < $end) { if ($text =~ /\G([^<]+)/gc) { $ans .= $1; + $lastmt = "" if $1 =~ /\S/; next; } my $tstart = pos($text); @@ -2270,17 +2364,23 @@ sub _SanitizeTags { $ans .= $tag; next; } + my $tt; if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^}) && - $ok_tag_name{lc($1)}) + $ok_tag_name{$tt=lc($1)}) { my ($stag, $styp) = _Sanitize($tag); - $ans .= $stag; - if ($validate && ($styp == 1 || $styp == 2) && $stag =~ m{^]+)}) { - my $tt = $1; + if ($styp == 2 && $lastmt eq $tt) { + $lastmt = ""; + next; + } + $lastmt = $styp == 3 ? $tt : ""; + if ($validate && $styp) { + &$autoclopen($tt) if $styp == 1 || $styp == 3; if ($styp == 1) { push(@stack,[$tt,$tstart]); - } else { + } elsif ($styp == 2) { + &$autoclose($tt) unless $tt eq "p"; !@stack and _xmlfail("closing tag $tt without matching open at " . 
_linecol($tstart, $text)); if ($stack[$#stack]->[0] eq $tt) { @@ -2292,28 +2392,32 @@ sub _SanitizeTags { } } } + $ans .= $stag; next; } else { $tag =~ s/^ 1} qw(class dir id lang style title xml:lang)); - %tagatt = ( - 'a' => { map({$_ => 1} qw(href name)) }, - 'area' => { map({$_ => 1} qw(alt coords href nohref shape)) }, - 'basefont' => { map({$_ => 1} qw(color face size)) }, - 'br' => { map({$_ => 1} qw(clear)) }, - 'caption' => { map({$_ => 1} qw(align)) }, - 'col' => { map({$_ => 1} qw(align span width valign)) }, - 'colgroup' => { map({$_ => 1} qw(align span width valign)) }, - 'dir' => { map({$_ => 1} qw(compact)) }, - 'div' => { map({$_ => 1} qw(align)) }, - 'dl' => { map({$_ => 1} qw(compact)) }, - 'font' => { map({$_ => 1} qw(color face size)) }, - 'h1' => { map({$_ => 1} qw(align)) }, - 'h2' => { map({$_ => 1} qw(align)) }, - 'h3' => { map({$_ => 1} qw(align)) }, - 'h4' => { map({$_ => 1} qw(align)) }, - 'h5' => { map({$_ => 1} qw(align)) }, - 'h6' => { map({$_ => 1} qw(align)) }, - 'hr' => { map({$_ => 1} qw(align noshade size width)) }, - # NO server-side image maps, therefore NOT ismap ! 
- 'img' => { map({$_ => 1} qw(align alt border height hspace src usemap vspace width)) }, - 'li' => { map({$_ => 1} qw(compact type value)) }, - 'map' => { map({$_ => 1} qw(name)) }, - 'menu' => { map({$_ => 1} qw(compact)) }, - 'ol' => { map({$_ => 1} qw(compact start type)) }, - 'p' => { map({$_ => 1} qw(align)) }, - 'pre' => { map({$_ => 1} qw(width)) }, - 'table' => { map({$_ => 1} qw(align border cellpadding cellspacing summary width)) }, - 'tbody' => { map({$_ => 1} qw(align valign)) }, - 'tfoot' => { map({$_ => 1} qw(align valign)) }, - 'thead' => { map({$_ => 1} qw(align valign)) }, - 'td' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) }, - 'th' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) }, - 'tr' => { map({$_ => 1} qw(align valign)) }, - 'ul' => { map({$_ => 1} qw(compact type)) } - ); - %tagmt = map({$_ => 1} qw(area basefont br col hr img)); - %impatt = map({$_ => 1} qw(checked compact ismap nohref noshade nowrap)); - %lcattval = map({$_ => 1} qw( - align border cellpadding cellspacing checked clear color colspan - compact coords height hspace ismap nohref noshade nowrap rowspan size - span shape valign vspace width - )); -} - - sub _Sanitize { my $tag = shift; my $seenatt = {}; @@ -3057,15 +3108,20 @@ to be recognized and passed through even without using this option. =item B<--sanitize> -Remove troublesome tag attributes from embedded tags. Only a very strictly +Removes troublesome tag attributes from embedded tags. Only a very strictly limited set of tag attributes will be permitted, other attributes will be silently discarded. The set of allowed attributes varies by tag. -This is enabled by default. -Also split empty minimized elements that are not one of the HTML allowed -empty elements (C<area> C<basefont> C<br> C<col> C<hr> C<img>) into separate -begin and end tags. For example, C<< <p/> >> or C<< <p /> >> will be split -into C<< <p></p> >>. +Splits empty minimized elements that are not one of the HTML allowed empty +elements (C<area> C<basefont> C<br> C<col> C<hr> C<img>) into separate begin +and end tags. For example, C<< <p/> >> or C<< <p /> >> will be split into +C<< <p></p> >>. + +Combines adjacent (whitespace separated only) opening and closing tags for +the same HTML empty element into a single minimized tag. For example, +C<< <br></br> >> will become C<< <br /> >>. + +This is enabled by default. =item B<--no-sanitize> @@ -3086,8 +3142,8 @@ module be present (one is only required if this option is given). Any errors are reported to STDERR and the exit status will be non-zero on XML validation failure. Note that all line and column -numbers in the output refer to the entire output that would have -been produced. Re-run with B<--no-validate-xml> to see what's +numbers in the error output refer to the entire output that would +have been produced. Re-run with B<--no-validate-xml> to see what's actually present at those line and column positions. If the B<--stub> option has also been given, then the entire output is @@ -3107,13 +3163,19 @@ Perform XML validation on the output before it's output and die if it fails validation. This uses a simple internal consistency checker that finds unmatched and mismatched open/close tags. +Non-empty elements that in HTML have optional closing tags (C<colgroup> +C<dd> C<dt> C<li> C<p> C<tbody> C<td> C<tfoot> C<th> C<thead> C<tr>) +will automatically have any omitted end tags inserted during the +`--validate-xml-internal` process. + Any errors are reported to STDERR and the exit status will be non-zero on XML validation failure. Note that all line and column -numbers in the output refer to the entire output that would have -been produced without any B<--stub> or B<--stylesheet> options. -Re-run with B<--no-validate-xml> and I<without> any B<--stub> or -B<--stylesheet> options to see what's actually present at those -line and column positions. +numbers in the error output refer to the entire output that would +have been produced before sanitization without any B<--stub> or +B<--stylesheet> options. Re-run with B<--no-sanitize> and +B<--no-validate-xml> and I<without> any B<--stub> or B<--stylesheet> +options to see what's actually present at those line and column +positions. This option validates the output I<prior to> adding any requested B<--stub> or B<--stylesheet>. As the built-in stub and stylesheet