Browse Source

Markdown.pl: --validate-xml-internal by default

Enhance the sanitation process slightly in order to perform simple
tag missing/mismatch validation.  Almost all the work needed was
already being performed with the exception of keeping a tag stack.

Keep an active tag stack when `--validate-xml-internal` is active
and use it to find mismatched and/or missing open/closing tags.

Enable this by default unless `--no-sanitize` has been given (the
sanitize machinery does the validation and is required) or
`--validate-xml` or `--no-validate-xml` has been given explicitly.

Unlike the more comprehensive `--validate-xml`, this option operates
very quickly and does not require any additional XML modules to be
present.  It's also compatible with `--html4tags`.

Signed-off-by: Kyle J. McKay <mackyle@gmail.com>
master
Kyle J. McKay 5 years ago
parent
commit
db542706ff
  1. 133
      Markdown.pl

133
Markdown.pl

@ -288,6 +288,7 @@ sub _main {
'sanitize', 'sanitize',
'no-sanitize', 'no-sanitize',
'validate-xml', 'validate-xml',
'validate-xml-internal',
'no-validate-xml', 'no-validate-xml',
'htmlroot|r=s', 'htmlroot|r=s',
'imageroot|i=s', 'imageroot|i=s',
@ -335,15 +336,21 @@ sub _main {
if ($cli_opts{'sanitize'}) { # --sanitize always wins if ($cli_opts{'sanitize'}) { # --sanitize always wins
$options{sanitize} = 1; $options{sanitize} = 1;
} }
$options{xmlcheck} = $options{sanitize} ? 2 : 0;
if ($cli_opts{'no-validate-xml'}) { # Do not validate XML if ($cli_opts{'no-validate-xml'}) { # Do not validate XML
$options{xmlcheck} = 0; $options{xmlcheck} = 0;
} }
if ($cli_opts{'validate-xml'}) { # Validate XML output if ($cli_opts{'validate-xml'}) { # Validate XML output
$options{xmlcheck} = 1; $options{xmlcheck} = 1;
} }
if ($cli_opts{'validate-xml-internal'}) { # Validate XML output internally
$options{xmlcheck} = 2;
}
die "--html4tags and --validate-xml are incompatible\n" die "--html4tags and --validate-xml are incompatible\n"
if $cli_opts{'html4tags'} && $options{xmlcheck}; if $cli_opts{'html4tags'} && $options{xmlcheck} == 1;
if ($options{xmlcheck}) { die "--no-sanitize and --validate-xml-internal are incompatible\n"
if !$options{'sanitize'} && $options{xmlcheck} == 2;
if ($options{xmlcheck} == 1) {
eval { require XML::Simple; 1 } and $hasxml = 1 or $hasxml_err = $@; eval { require XML::Simple; 1 } and $hasxml = 1 or $hasxml_err = $@;
eval { require XML::Parser; 1 } and $hasxmlp = 1 or $hasxmlp_err = $@ unless $hasxml; eval { require XML::Parser; 1 } and $hasxmlp = 1 or $hasxmlp_err = $@ unless $hasxml;
die "$hasxml_err$hasxmlp_err" unless $hasxml || $hasxmlp; die "$hasxml_err$hasxmlp_err" unless $hasxml || $hasxmlp;
@ -448,7 +455,7 @@ HTML4
} }
$hdr = &$hdrf() unless $didhdr || $raw; $hdr = &$hdrf() unless $didhdr || $raw;
$ftr = "</div>\n</body>\n</html>\n" if $stub && !$raw; $ftr = "</div>\n</body>\n</html>\n" if $stub && !$raw;
if ($options{xmlcheck}) { if ($options{xmlcheck} == 1) {
my ($good, $errs); my ($good, $errs);
if ($stub && !$raw) { if ($stub && !$raw) {
($good, $errs) = _xmlcheck($hdr.$result.$ftr); ($good, $errs) = _xmlcheck($hdr.$result.$ftr);
@ -518,9 +525,10 @@ sub ProcessRaw {
while (my ($k,$v) = each %args) { while (my ($k,$v) = each %args) {
$opt{$k} = $v; $opt{$k} = $v;
} }
$opt{xmlcheck} = 0 unless looks_like_number($opt{xmlcheck});
# Sanitize all '<'...'>' tags if requested # Sanitize all '<'...'>' tags if requested
$text = _SanitizeTags($text) if $opt{sanitize}; $text = _SanitizeTags($text, $opt{xmlcheck} == 2) if $opt{sanitize};
utf8::encode($text); utf8::encode($text);
return $text; return $text;
@ -556,6 +564,7 @@ sub Markdown {
while (my ($k,$v) = each %args) { while (my ($k,$v) = each %args) {
$opt{$k} = $v; $opt{$k} = $v;
} }
$opt{xmlcheck} = 0 unless looks_like_number($opt{xmlcheck});
# Clear the globals. If we don't clear these, you get conflicts # Clear the globals. If we don't clear these, you get conflicts
# from other articles when generating a page which contains more than # from other articles when generating a page which contains more than
@ -604,7 +613,7 @@ sub Markdown {
$text .= "\n" unless $text eq ""; $text .= "\n" unless $text eq "";
# Sanitize all '<'...'>' tags if requested # Sanitize all '<'...'>' tags if requested
$text = _SanitizeTags($text) if $opt{sanitize}; $text = _SanitizeTags($text, $opt{xmlcheck} == 2) if $opt{sanitize};
utf8::encode($text); utf8::encode($text);
if (defined($opt{h1}) && $opt{h1} ne "" && ref($_[0]) eq "HASH") { if (defined($opt{h1}) && $opt{h1} ne "" && ref($_[0]) eq "HASH") {
@ -2244,7 +2253,8 @@ sub _DoTag {
# $1 => text to process # $1 => text to process
# <= sanitized text # <= sanitized text
sub _SanitizeTags { sub _SanitizeTags {
my $text = shift; my ($text, $validate) = @_;
my @stack = ();
my $ans = ""; my $ans = "";
my $end = length($text); my $end = length($text);
pos($text) = 0; pos($text) = 0;
@ -2253,6 +2263,7 @@ sub _SanitizeTags {
$ans .= $1; $ans .= $1;
next; next;
} }
my $tstart = pos($text);
if ($text =~ /\G(<[^>]*>)/gc) { if ($text =~ /\G(<[^>]*>)/gc) {
my $tag = $1; my $tag = $1;
if ($tag =~ /^<!--/) { # pass "comments" through if ($tag =~ /^<!--/) { # pass "comments" through
@ -2263,7 +2274,24 @@ sub _SanitizeTags {
$tag =~ m{^</($g_possible_tag_name)\s*>}) && $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
$ok_tag_name{lc($1)}) $ok_tag_name{lc($1)})
{ {
$ans .= _Sanitize($tag); my ($stag, $styp) = _Sanitize($tag);
$ans .= $stag;
if ($validate && ($styp == 1 || $styp == 2) && $stag =~ m{^</?([^/\s>]+)}) {
my $tt = $1;
if ($styp == 1) {
push(@stack,[$tt,$tstart]);
} else {
!@stack and _xmlfail("closing tag $tt without matching open at " .
_linecol($tstart, $text));
if ($stack[$#stack]->[0] eq $tt) {
pop(@stack);
} else {
my @i = @{$stack[$#stack]};
_xmlfail("opening tag $i[0] at " . _linecol($i[1], $text) .
" mismatch with closing tag $tt at " . _linecol($tstart, $text));
}
}
}
next; next;
} else { } else {
$tag =~ s/^</&lt;/; $tag =~ s/^</&lt;/;
@ -2275,10 +2303,35 @@ sub _SanitizeTags {
pos($text) += 1; pos($text) += 1;
$ans .= "&lt;"; $ans .= "&lt;";
} }
if ($validate && @stack) {
my @errs;
my $j;
for ($j = 0; $j <= $#stack; ++$j) {
my @i = @{$stack[$j]};
push(@errs, "opening tag $i[0] without matching close at " .
_linecol($i[1], $text));
}
_xmlfail(@errs);
}
return $ans; return $ans;
} }
sub _linecol {
my ($pos, $txt) = @_;
pos($txt) = 0;
my ($l, $p);
$l = 1;
++$l while ($p = pos($txt)), $txt =~ /\G[^\n]*\n/gc && pos($txt) <= $pos;
return "line $l col " . (1 + ($pos - $p));
}
sub _xmlfail {
die join("", map("$_\n", @_));
}
my %univatt; my %univatt;
my %tagatt; my %tagatt;
my %tagmt; my %tagmt;
@ -2335,7 +2388,7 @@ sub _Sanitize {
my $seenatt = {}; my $seenatt = {};
if ($tag =~ m{^</}) { if ($tag =~ m{^</}) {
$tag =~ s/\s+>$/>/; $tag =~ s/\s+>$/>/;
return lc($tag); return (lc($tag),2);
} }
if ($tag =~ /^<([^\s<\/>]+)\s+/gs) { if ($tag =~ /^<([^\s<\/>]+)\s+/gs) {
my $tt = lc($1); my $tt = lc($1);
@ -2378,24 +2431,26 @@ sub _Sanitize {
} }
my $sfx = substr($tag, pos($tag)); my $sfx = substr($tag, pos($tag));
$out =~ s/\s+$//; $out =~ s/\s+$//;
my $typ = 1;
if ($tagmt{$tt}) { if ($tagmt{$tt}) {
$typ = 3;
$out .= $opt{empty_element_suffix}; $out .= $opt{empty_element_suffix};
} else { } else {
$out .= ">"; $out .= ">";
$out .= "</$tt>" if $tag =~ m,/>$,; $out .= "</$tt>" and $typ = 3 if $tag =~ m,/>$,;
} }
return $out; return ($out,$typ);
} elsif ($tag =~ /^<([^\s<\/>]+)/gs) { } elsif ($tag =~ /^<([^\s<\/>]+)/gs) {
my $tt = lc($1); my $tt = lc($1);
if ($tagmt{$tt}) { if ($tagmt{$tt}) {
return "<" . $tt . $opt{empty_element_suffix}; return ("<" . $tt . $opt{empty_element_suffix}, 3);
} elsif ($tag =~ m,/>$,) { } elsif ($tag =~ m,/>$,) {
return "<" . $tt . "></" . $tt . ">"; return ("<" . $tt . "></" . $tt . ">", 3);
} else { } else {
return "<" . $tt . ">"; return ("<" . $tt . ">", 1);
} }
} }
return lc($tag); return (lc($tag),0);
} }
@ -2922,6 +2977,7 @@ B<Markdown.pl> [B<--help>] [B<--html4tags>] [B<--htmlroot>=I<prefix>]
--sanitize sanitize tag attributes --sanitize sanitize tag attributes
--no-sanitize do not sanitize tag attributes --no-sanitize do not sanitize tag attributes
--validate-xml check if output is valid XML --validate-xml check if output is valid XML
--validate-xml-internal fast basic check if output is valid XML
--no-validate-xml do not check output for valid XML --no-validate-xml do not check output for valid XML
--tabwidth=num expand tabs to num instead of 8 --tabwidth=num expand tabs to num instead of 8
-r prefix | --htmlroot=prefix append relative non-img URLs -r prefix | --htmlroot=prefix append relative non-img URLs
@ -3020,8 +3076,12 @@ Use of this option is I<NOT RECOMMENDED>.
Perform XML validation on the output before it's output and die if Perform XML validation on the output before it's output and die if
it fails validation. This requires the C<XML::Simple> or C<XML::Parser> it fails validation. This requires the C<XML::Simple> or C<XML::Parser>
module be present (one is only required if this option is given). module be present (one is only required if this option is given).
Any errors are reported to STDERR and the exit status will be Any errors are reported to STDERR and the exit status will be
non-zero on XML validation failure. non-zero on XML validation failure. Note that all line and column
numbers in the output refer to the entire output that would have
been produced. Re-run with B<--no-validate-xml> to see what's
actually present at those line and column positions.
If the B<--stub> option has also been given, then the entire output is If the B<--stub> option has also been given, then the entire output is
validated as-is. Without the B<--stub> option, the output will be wrapped validated as-is. Without the B<--stub> option, the output will be wrapped
@ -3034,6 +3094,40 @@ This option is I<NOT compatible> with the B<--html4tags> option and will
produce an immediate error if both are given. produce an immediate error if both are given.
=item B<--validate-xml-internal>
Perform XML validation on the output before it's output and die if
it fails validation. This uses a simple internal consistency checker
that finds unmatched and mismatched open/close tags.
Any errors are reported to STDERR and the exit status will be
non-zero on XML validation failure. Note that all line and column
numbers in the output refer to the entire output that would have
been produced without any B<--stub> or B<--stylesheet> options.
Re-run with B<--no-validate-xml> and I<without> any B<--stub> or
B<--stylesheet> options to see what's actually present at those
line and column positions.
This option validates the output I<prior to> adding any requested
B<--stub> or B<--stylesheet>. As the built-in stub and stylesheet
have already been validated that speeds things up. The output is
I<NOT> wrapped (in a C<< <div>...</div> >>) for validation as that's
not required for the internal checker.
This option is I<IS enabled by default> unless B<--no-sanitize> is
active.
This option is I<IS compatible> with the B<--html4tags> option.
This option requires the B<--sanitize> option and will produce an
immediate error if both B<--no-sanitize> and B<--validate-xml-internal>
are given.
Note that B<--validate-xml-internal> is I<MUCH faster> than
B<--validate-xml> and I<does NOT> require any extra XML modules to
be present.
=item B<--no-validate-xml> =item B<--no-validate-xml>
Do not perform XML validation on the output. Markdown.pl itself will Do not perform XML validation on the output. Markdown.pl itself will
@ -3046,7 +3140,9 @@ Markdown.pl will I<NOT check> for these issues itself. But with
the B<--validate-xml> option will use C<XML::Simple> or C<XML::Parser> the B<--validate-xml> option will use C<XML::Simple> or C<XML::Parser>
to do so. to do so.
Note that B<--no-validate-xml> is the default option. Note that B<--validate-xml-internal> is the default option unless
B<--no-sanitize> is used in which case B<--no-validate-xml> is the
default option.
=item B<--tabwidth>=I<num> =item B<--tabwidth>=I<num>
@ -3166,8 +3262,9 @@ Display the short-form version number.
=item B<--raw> =item B<--raw>
Input contains only raw HTML/XHTML. All options other than Input contains only raw HTML/XHTML. All options other than
B<--html4tags>, B<--deprecated>, B<--sanitize> (on by default) and B<--html4tags>, B<--deprecated>, B<--sanitize> (on by default),
B<--validate-xml> (and their B<--no-...> variants) are ignored. B<--validate-xml> and B<--validate-xml-internal> (and their B<--no-...>
variants) are ignored.
With this option, arbitrary HTML/XHTML input can be passed through With this option, arbitrary HTML/XHTML input can be passed through
the sanitizer and/or validator. If sanitation is requested (the the sanitizer and/or validator. If sanitation is requested (the

Loading…
Cancel
Save