From 7e34ed6a63ff49b9c5ec53298657d8778626cc01 Mon Sep 17 00:00:00 2001
From: "Kyle J. McKay"
Date: Sun, 13 Jun 2021 13:13:03 -0700
Subject: [PATCH] Markdown.pl: adjust strip-comments options and defaults
Make --strip-comments be an alias for --strip-comments-lax.
Make --strip-comments-lax-only be the default whenever --sanitize
is active (which is the default).
Whenever possible, running Markdown.pl without any options should
provide the best output possible by default.
Turning what appear to be (at first glance) XML comments into plain
text in the output clearly violates the principle of least surprise
and can make for some very ugly pages.
Similarly using a `--strip-comments` option and discovering those
same plain text XML comments in the output also invites bewilderment.
Instead, make `--strip-comments` be a short form of `--strip-comments-lax`
and make `--strip-comments-lax-only` be the normal default.
By doing this there are no ugly page surprises by default.
Those pesky double hyphen sequences (`--`) that have furtively
slipped into what were supposed to be strictly valid XML comments
thereby making them strictly invalid XML comments are now rendered
impotent by default.
The output remains strictly valid XML uncontaminated by the surprise
appearance of strictly invalid XML comments suddenly rendered as
plain text due to accidental inclusion of a double hyphen sequence.
Signed-off-by: Kyle J. McKay
---
Markdown.pl | 125 +++++++++++++++++++++++++++++++---------------------
1 file changed, 74 insertions(+), 51 deletions(-)
diff --git a/Markdown.pl b/Markdown.pl
index 66098f2..ec5cf36 100755
--- a/Markdown.pl
+++ b/Markdown.pl
@@ -609,13 +609,13 @@ sub _main {
'validate-xml' => sub {$cli_opts{'validate-xml'} = 1},
'validate-xml-internal' => sub {$cli_opts{'validate-xml'} = 2},
'no-validate-xml' => sub {$cli_opts{'validate-xml'} = 0},
- 'stripcomments|strip-comments' => sub
- {!$cli_opts{'stripcomments'} and $cli_opts{'stripcomments'} = 1},
- 'stripcommentslax|stripcomments-lax|strip-comments-lax' =>
+ 'stripcommentsstrict|stripcomments-strict|strip-comments-strict' =>
+ sub {$cli_opts{'stripcomments'} = 1},
+ 'stripcomments|stripcommentslax|stripcomments-lax|strip-comments|strip-comments-lax' =>
sub {$cli_opts{'stripcomments'} = 2},
'stripcommentslaxonly|stripcomments-laxonly|stripcomments-lax-only|strip-comments-lax-only' =>
sub {$cli_opts{'stripcomments'} = 3},
- 'no-stripcomments|no-strip-comments' => sub {$cli_opts{'stripcomments'} = 0},
+ 'nostripcomments|no-stripcomments|no-strip-comments' => sub {$cli_opts{'stripcomments'} = 0},
'keepabs|keep-abs|k' => \$cli_opts{'keepabs'},
'absroot|a=s' => \$cli_opts{'absroot'},
'base|b=s' => \$cli_opts{'base'},
@@ -995,16 +995,20 @@ sub ProcessRaw {
# which gets turned into "
" which then
# no longer validates).
# stripcomments => any-false-value (no action), any-true-value (strip).
-# => 1 (strip), 2 (strip-lax), 3 (strip-lax-only)
-# a non-integer true value will be forced to 1.
-# an integer value < 0 will be forced to 1.
-# an integer value > 3 will be forced to 3.
+# => 1 (strip-strict), 2 (strip-lax), 3 (strip-lax-only)
+# a non-numeric true value will be forced to 2.
+# a numeric value < 0 will be forced to 2.
+# a numeric value > 0 and < 1 will be forced to 2.
+# a numeric value > 3 will be forced to 3.
+# a non-integer value will be forced to an integral value.
# 1, 2, and 3 correspond to the command line options
-# --strip-comments, --strip-comments-lax and
+# --strip-comments-strict, --strip-comments-lax and
# --strip-comments-lax-only respectively.
# since the strip comments mechanism is a function of the
# sanitizer, if stripcomments is set to any-true-value then
# tag attributes will also always be sanitized.
+# if stripcomments is not set or is set to the empty string,
+# then it will be set to 3 if sanitize is true and 0 otherwise.
# effective for both ProcessRaw and Markdown.
# empty_element_suffix => " />" or ">"
# will be forced to " />" if not valid or defined.
@@ -1172,13 +1176,15 @@ sub _SanitizeOpts {
$o->{keep_named_character_entities} = 0 unless
defined($o->{keep_named_character_entities}) && $o->{keep_named_character_entities} eq "1";
$o->{xmlcheck} = looks_like_number($o->{xmlcheck}) && $o->{xmlcheck} == 0 ? 0 : 2;
- !looks_like_number($o->{stripcomments}) and $o->{stripcomments} = $o->{stripcomments} ? 1 : 0;
- $o->{stripcomments} && $o->{stripcomments} < 2 and $o->{stripcomments} = 1;
- $o->{stripcomments} = int($o->{stripcomments});
- $o->{stripcomments} > 3 and $o->{stripcomments} = 3;
- $o->{sanitize} = 1 if $o->{stripcomments} && !$o->{sanitize};
$o->{sanitize} = 1 if $o->{xmlcheck} && !$o->{sanitize};
$o->{sanitize} = 1 if ref($o->{urlfunc}) eq 'CODE' && !$o->{sanitize};
+ !looks_like_number($o->{stripcomments}) and
+ $o->{stripcomments} = $o->{stripcomments} ? 2 :
+ ($o->{sanitize} && (!defined($o->{stripcomments}) || $o->{stripcomments} eq "") ? 3 : 0);
+ $o->{stripcomments} && $o->{stripcomments} < 1 and $o->{stripcomments} = 2;
+ $o->{stripcomments} = int($o->{stripcomments});
+ $o->{stripcomments} > 3 and $o->{stripcomments} = 3;
+ $o->{stripcomments} && !$o->{sanitize} and $o->{sanitize} = 1;
# this is gross, but having the globals avoids unnecessary slowdown
if ($o->{sanitize} && $o->{xmlcheck}) {
@@ -4503,10 +4509,11 @@ B [B<--help>] [B<--html4tags>] [B<--htmlroot>=I]
--validate-xml check if output is valid XML
--validate-xml-internal fast basic check if output is valid XML
--no-validate-xml do not check output for valid XML
- --strip-comments remove XML comments from output
+ --strip-comments remove XML-like comments from output
--strip-comments-lax remove XML-like comments from output
+ --strip-comments-strict remove only strictly valid XML comments
--strip-comments-lax-only remove only invalid XML-like comments
- --no-strip-comments do not remove XML comments (default)
+ --no-strip-comments do not remove any XML/XML-like comments
--tabwidth=num expand tabs to num instead of 8
--auto-number automatically number h1-h6 headers
-k | --keep-abs keep abspath URLs despite -r/-i
@@ -4715,15 +4722,31 @@ B<--no-sanitize> is used in which case B<--no-validate-xml> is the
default option.
-=item B<--strip-comments>
+=item B<--strip-comments>/B<--strip-comments-lax>
+
+(N.B. B<--strip-comments> is just a short form of B<--strip-comments-lax>)
+
+Strip XML and XML-like comments from the output. Any XML or XML-like
+comments encountered will be omitted from the output if either of these
+options is given.
-Strip XML comments from the output. Any XML comments encountered will
-be omitted from the output if this option is given.
+Unlike the B<--strip-comments-strict> option, these options I<will>
+strip any XML-like comments that contain internal double hyphen
+(i.e. C<-->) sequences.
This option requires the B<--sanitize> option to be used (which is
the default).
-However, note that the XML standard section 2.5 specifically prohibits
+If either of these options is given, it will supersede any previous
+B<--strip-comments-strict>, B<--strip-comments-lax-only> or
+B<--no-strip-comments> options.
+
+
+=item B<--strip-comments-strict>
+
+Strip only strictly XML standard compliant comments from the output.
+
+Note that the XML standard section 2.5 specifically prohibits
a C<--> sequence within an XML comment (i.e. C<--> cannot occur after
the comment start tag C<< <!-- >> and before the comment end tag C<< --> >>).
@@ -4737,56 +4760,56 @@ option), any invalid tags have their leading C<< < >> escaped (to
C<< &lt; >>) thus making them ordinary text and this I<includes>
invalid XML comments.
-What this means is that the B<--strip-comments> option I remove
-invalid XML comments (such as S >>>)!
+What this means is that the B<--strip-comments-strict> option I<will not>
+remove invalid XML comments (such as S<C<< <!-- -- --> >>>)!
But see the B<--strip-comments-lax> option for a solution.
-
-=item B<--strip-comments-lax>
-
-Strip XML-like comments from the output. Any XML-like comments encountered
-will be omitted from the output if this option is given. Supersedes the
-B<--strip-comments> option if both are given.
-
-While the syntax of XML comments cannot be relaxed (that would require
-altering the XML standard), if they are being stripped out anyway, then the
-standard isn't quite so relevant since they will not be present in the output.
-
-The B<--strip-comments-lax> option acts just like the B<--strip-comments>
-option EXCEPT that the content between the starting comment tag S>>
-and then ending comment tag S >>> is I restricted since it will be
-stripped out of the final result which will therefore remain XML compliant.
-
-The only restriction, of course, is that the content between the XML comment
-start tag and the XML comment end tag cannot contain the XML comment end tag
-itself.
-
-With the B<--strip-comments-lax> option, strictly invalid XML comments
-(such as S >>>) I be stripped as well as all
-strictly valid XML comments.
+If this option is given, it will supersede any previous
+B<--strip-comments>, B<--strip-comments-lax>, B<--strip-comments-lax-only>
+or B<--no-strip-comments> options.
=item B<--strip-comments-lax-only>
-This is a compromise option. It works just like B<--strip-comments-lax>, but
-I on strictly invalid XML-like comments. Supersedes the
-B<--strip-comments> option if both are given.
+This is the default option if no other strip comments options are given
+AND the B<--sanitize> option is active (the default).
+
+This is a compromise option. It works just like the B<--strip-comments-lax>
+option, but I<only> on strictly invalid XML-like comments.
In other words, if a strictly valid XML comment is present, it will be retained
in the output. If a strictly invalid XML comment is present which would have
been stripped by B<--strip-comments-lax> but would have had its leading C<< < >>
-escaped automatically by the default B<--no-strip-comments> mode (because it's
-not a strictly valid XML comment), then it will be stripped by this mode.
+escaped automatically by the B<--no-strip-comments> or B<--strip-comments-strict>
+modes (because it's not a strictly valid XML comment), then it I<will> be stripped
+by this mode.
This option prevents ugly invalid XML comments from slipping through into the
output as escaped plain text while still passing through valid XML comments
without stripping them.
+If this option is given, it will supersede any previous
+B<--strip-comments>, B<--strip-comments-lax>, B<--strip-comments-strict>
+or B<--no-strip-comments> options.
+
=item B<--no-strip-comments>
-Do not strip XML comments from the output. This is the default.
+Do not strip XML or XML-like comments from the output.
+
+This is the default option I<only> when no other strip comments options have
+been given I<and> the B<--no-sanitize> option is in effect (which is I<not> the
+default).
+
+When B<--no-strip-comments> is active, strictly invalid XML comments such
+as those that contain an internal double hyphen (C<-->) sequence will end
+up having their leading C<< < >> escaped automatically and end up as plain
+text in the output!
+
+If this option is given, it will supersede any previous
+B<--strip-comments>, B<--strip-comments-lax>, B<--strip-comments-strict>
+or B<--strip-comments-lax-only> options.
=item B<--tabwidth>=I<num>