diff --git a/Markdown.pl b/Markdown.pl index f86763f..8999faa 100755 --- a/Markdown.pl +++ b/Markdown.pl @@ -2245,6 +2245,76 @@ sub _DoTag { } +my %univatt; # universally allowed attribute names +my %tagatt; # per-element allowed attribute names +my %tagmt; # empty element tags +my %tagocl; # non-empty elements with optional closing tag +my %tagacl; # which %tagocl an opening %tagocl will close +my %tagblk; # block elements +my %lcattval; # names of attribute values to lowercase +my %impatt; # names of "implied" attributes +BEGIN { + %univatt = map({$_ => 1} qw(class dir id lang style title xml:lang)); + %tagatt = ( + 'a' => { map({$_ => 1} qw(href name)) }, + 'area' => { map({$_ => 1} qw(alt coords href nohref shape)) }, + 'basefont' => { map({$_ => 1} qw(color face size)) }, + 'br' => { map({$_ => 1} qw(clear)) }, + 'caption' => { map({$_ => 1} qw(align)) }, + 'col' => { map({$_ => 1} qw(align span width valign)) }, + 'colgroup' => { map({$_ => 1} qw(align span width valign)) }, + 'dir' => { map({$_ => 1} qw(compact)) }, + 'div' => { map({$_ => 1} qw(align)) }, + 'dl' => { map({$_ => 1} qw(compact)) }, + 'font' => { map({$_ => 1} qw(color face size)) }, + 'h1' => { map({$_ => 1} qw(align)) }, + 'h2' => { map({$_ => 1} qw(align)) }, + 'h3' => { map({$_ => 1} qw(align)) }, + 'h4' => { map({$_ => 1} qw(align)) }, + 'h5' => { map({$_ => 1} qw(align)) }, + 'h6' => { map({$_ => 1} qw(align)) }, + 'hr' => { map({$_ => 1} qw(align noshade size width)) }, + # NO server-side image maps, therefore NOT ismap ! 
+ 'img' => { map({$_ => 1} qw(align alt border height hspace src usemap vspace width)) }, + 'li' => { map({$_ => 1} qw(compact type value)) }, + 'map' => { map({$_ => 1} qw(name)) }, + 'menu' => { map({$_ => 1} qw(compact)) }, + 'ol' => { map({$_ => 1} qw(compact start type)) }, + 'p' => { map({$_ => 1} qw(align)) }, + 'pre' => { map({$_ => 1} qw(width)) }, + 'table' => { map({$_ => 1} qw(align border cellpadding cellspacing summary width)) }, + 'tbody' => { map({$_ => 1} qw(align valign)) }, + 'tfoot' => { map({$_ => 1} qw(align valign)) }, + 'thead' => { map({$_ => 1} qw(align valign)) }, + 'td' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) }, + 'th' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) }, + 'tr' => { map({$_ => 1} qw(align valign)) }, + 'ul' => { map({$_ => 1} qw(compact type)) } + ); + %tagmt = map({$_ => 1} qw(area basefont br col hr img)); + %tagocl = map({$_ => 1} qw(colgroup dd dt li p tbody td tfoot th thead tr)); + %tagacl = ( + 'colgroup' => \%tagocl, + 'dd' => \%tagocl, + 'dt' => \%tagocl, + 'li' => \%tagocl, + 'tbody' => \%tagocl, + 'td' => { map({$_ => 1} qw(colgroup dd dt li p td tfoot th thead)) }, + 'tfoot' => \%tagocl, + 'th' => { map({$_ => 1} qw(colgroup dd dt li p td tfoot th thead)) }, + 'thead' => \%tagocl, + 'tr' => { map({$_ => 1} qw(colgroup dd dt li p td tfoot th thead tr)) }, + ); + %tagblk = map({$_ => 1} qw(address blockquote div dl h1 h2 h3 h4 h5 h6 hr ol p pre table)); + %impatt = map({$_ => 1} qw(checked compact ismap nohref noshade nowrap)); + %lcattval = map({$_ => 1} qw( + align border cellpadding cellspacing checked clear color colspan + compact coords height hspace ismap nohref noshade nowrap rowspan size + span shape valign vspace width + )); +} + + # _SanitizeTags # # Inspect all '<'...'>' tags in the input and HTML encode those things @@ -2254,13 +2324,37 @@ sub _DoTag { # <= sanitized text sub _SanitizeTags { my ($text, $validate) = @_; + $text =~ s/\s+$//; + 
$text ne "" or return ""; my @stack = (); my $ans = ""; my $end = length($text); pos($text) = 0; + my ($autoclose, $autoclopen); + my $lastmt = ""; + $autoclose = sub { + my $s = $_[0] || ""; + while (@stack && $stack[$#stack]->[0] ne $s && + $tagocl{$stack[$#stack]->[0]}) { + $ans .= "[0] . ">"; + pop(@stack); + } + } if $validate; + $autoclopen = sub { + my $s = $_[0] || ""; + my $c; + if ($tagblk{$s}) {$c = {p=>1}} + elsif ($tagocl{$s}) {$c = $tagacl{$s}} + else {return} + while (@stack && $c->{$stack[$#stack]->[0]}) { + $ans .= "[0] . ">"; + pop(@stack); + } + } if $validate; while (pos($text) < $end) { if ($text =~ /\G([^<]+)/gc) { $ans .= $1; + $lastmt = "" if $1 =~ /\S/; next; } my $tstart = pos($text); @@ -2270,17 +2364,23 @@ sub _SanitizeTags { $ans .= $tag; next; } + my $tt; if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^}) && - $ok_tag_name{lc($1)}) + $ok_tag_name{$tt=lc($1)}) { my ($stag, $styp) = _Sanitize($tag); - $ans .= $stag; - if ($validate && ($styp == 1 || $styp == 2) && $stag =~ m{^]+)}) { - my $tt = $1; + if ($styp == 2 && $lastmt eq $tt) { + $lastmt = ""; + next; + } + $lastmt = $styp == 3 ? $tt : ""; + if ($validate && $styp) { + &$autoclopen($tt) if $styp == 1 || $styp == 3; if ($styp == 1) { push(@stack,[$tt,$tstart]); - } else { + } elsif ($styp == 2) { + &$autoclose($tt) unless $tt eq "p"; !@stack and _xmlfail("closing tag $tt without matching open at " . 
_linecol($tstart, $text)); if ($stack[$#stack]->[0] eq $tt) { @@ -2292,28 +2392,32 @@ sub _SanitizeTags { } } } + $ans .= $stag; next; } else { $tag =~ s/^ 1} qw(class dir id lang style title xml:lang)); - %tagatt = ( - 'a' => { map({$_ => 1} qw(href name)) }, - 'area' => { map({$_ => 1} qw(alt coords href nohref shape)) }, - 'basefont' => { map({$_ => 1} qw(color face size)) }, - 'br' => { map({$_ => 1} qw(clear)) }, - 'caption' => { map({$_ => 1} qw(align)) }, - 'col' => { map({$_ => 1} qw(align span width valign)) }, - 'colgroup' => { map({$_ => 1} qw(align span width valign)) }, - 'dir' => { map({$_ => 1} qw(compact)) }, - 'div' => { map({$_ => 1} qw(align)) }, - 'dl' => { map({$_ => 1} qw(compact)) }, - 'font' => { map({$_ => 1} qw(color face size)) }, - 'h1' => { map({$_ => 1} qw(align)) }, - 'h2' => { map({$_ => 1} qw(align)) }, - 'h3' => { map({$_ => 1} qw(align)) }, - 'h4' => { map({$_ => 1} qw(align)) }, - 'h5' => { map({$_ => 1} qw(align)) }, - 'h6' => { map({$_ => 1} qw(align)) }, - 'hr' => { map({$_ => 1} qw(align noshade size width)) }, - # NO server-side image maps, therefore NOT ismap ! 
- 'img' => { map({$_ => 1} qw(align alt border height hspace src usemap vspace width)) }, - 'li' => { map({$_ => 1} qw(compact type value)) }, - 'map' => { map({$_ => 1} qw(name)) }, - 'menu' => { map({$_ => 1} qw(compact)) }, - 'ol' => { map({$_ => 1} qw(compact start type)) }, - 'p' => { map({$_ => 1} qw(align)) }, - 'pre' => { map({$_ => 1} qw(width)) }, - 'table' => { map({$_ => 1} qw(align border cellpadding cellspacing summary width)) }, - 'tbody' => { map({$_ => 1} qw(align valign)) }, - 'tfoot' => { map({$_ => 1} qw(align valign)) }, - 'thead' => { map({$_ => 1} qw(align valign)) }, - 'td' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) }, - 'th' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) }, - 'tr' => { map({$_ => 1} qw(align valign)) }, - 'ul' => { map({$_ => 1} qw(compact type)) } - ); - %tagmt = map({$_ => 1} qw(area basefont br col hr img)); - %impatt = map({$_ => 1} qw(checked compact ismap nohref noshade nowrap)); - %lcattval = map({$_ => 1} qw( - align border cellpadding cellspacing checked clear color colspan - compact coords height hspace ismap nohref noshade nowrap rowspan size - span shape valign vspace width - )); -} - - sub _Sanitize { my $tag = shift; my $seenatt = {}; @@ -3057,15 +3108,20 @@ to be recognized and passed through even without using this option. =item B<--sanitize> -Remove troublesome tag attributes from embedded tags. Only a very strictly +Removes troublesome tag attributes from embedded tags. Only a very strictly limited set of tag attributes will be permitted, other attributes will be silently discarded. The set of allowed attributes varies by tag. -This is enabled by default. -Also split empty minimized elements that are not one of the HTML allowed -empty elements (C<area> C<basefont> C<br> C<col> C<hr> C<img>) into separate -begin and end tags. For example, C<< <p/> >> or C<< <p /> >> will be split -into C<< <p></p> >>. +Splits empty minimized elements that are not one of the HTML allowed empty +elements (C<area> C<basefont> C<br> C<col> C<hr> C<img>) into separate begin +and end tags. For example, C<< <p/> >> or C<< <p /> >> will be split into +C<< <p></p> >>. + +Combines adjacent (whitespace separated only) opening and closing tags for +the same HTML empty element into a single minimized tag. For example, +C<< <br></br> >> will become C<< <br /> >>. + +This is enabled by default. =item B<--no-sanitize> @@ -3086,8 +3142,8 @@ module be present (one is only required if this option is given). Any errors are reported to STDERR and the exit status will be non-zero on XML validation failure. Note that all line and column -numbers in the output refer to the entire output that would have -been produced. Re-run with B<--no-validate-xml> to see what's +numbers in the error output refer to the entire output that would +have been produced. Re-run with B<--no-validate-xml> to see what's actually present at those line and column positions. If the B<--stub> option has also been given, then the entire output is @@ -3107,13 +3163,19 @@ Perform XML validation on the output before it's output and die if it fails validation. This uses a simple internal consistency checker that finds unmatched and mismatched open/close tags. +Non-empty elements that in HTML have optional closing tags (C<colgroup> +C<dd> C<dt> C<li> C<p> C<tbody> C<td> C<tfoot> C<th> C<thead> C<tr>) +will automatically have any omitted end tags inserted during the +`--validate-xml-internal` process. + Any errors are reported to STDERR and the exit status will be non-zero on XML validation failure. Note that all line and column -numbers in the output refer to the entire output that would have -been produced without any B<--stub> or B<--stylesheet> options. -Re-run with B<--no-validate-xml> and I<without> any B<--stub> or -B<--stylesheet> options to see what's actually present at those -line and column positions. +numbers in the error output refer to the entire output that would +have been produced before sanitization without any B<--stub> or +B<--stylesheet> options. Re-run with B<--no-sanitize> and +B<--no-validate-xml> and I<without> any B<--stub> or B<--stylesheet> +options to see what's actually present at those line and column +positions. This option validates the output I<prior to> adding any requested B<--stub> or B<--stylesheet>. As the built-in stub and stylesheet