From 7e34ed6a63ff49b9c5ec53298657d8778626cc01 Mon Sep 17 00:00:00 2001 From: "Kyle J. McKay" Date: Sun, 13 Jun 2021 13:13:03 -0700 Subject: [PATCH] Markdown.pl: adjust strip-comments options and defaults Make --strip-comments be an alias for --strip-comments-lax. Make --strip-comments-lax-only be the default whenever --sanitize is active (which is the default). Whenever possible, running Markdown.pl without any options should provide the best output possible by default. Turning what appear to be (at first glance) XML comments into plain text in the output clearly violates the principle of least surprise and can make for some very ugly pages. Similarly using a `--strip-comments` option and discovering those same plain text XML comments in the output also invites bewilderment. Instead, make `--strip-comments` be a short form of `--strip-comments-lax` and make `--strip-comments-lax-only` be the normal default. By doing this there are no ugly page surprises by default. Those pesky double hyphen sequences (`--`) that have furtively slipped into what were supposed to be strictly valid XML comments thereby making them strictly invalid XML comments are now rendered impotent by default. The output remains strictly valid XML uncontaminated by the surprise appearance of strictly invalid XML comments suddenly rendered as plain text due to accidental inclusion of a double hyphen sequence. Signed-off-by: Kyle J. 
McKay --- Markdown.pl | 125 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 51 deletions(-) diff --git a/Markdown.pl b/Markdown.pl index 66098f2..ec5cf36 100755 --- a/Markdown.pl +++ b/Markdown.pl @@ -609,13 +609,13 @@ sub _main { 'validate-xml' => sub {$cli_opts{'validate-xml'} = 1}, 'validate-xml-internal' => sub {$cli_opts{'validate-xml'} = 2}, 'no-validate-xml' => sub {$cli_opts{'validate-xml'} = 0}, - 'stripcomments|strip-comments' => sub - {!$cli_opts{'stripcomments'} and $cli_opts{'stripcomments'} = 1}, - 'stripcommentslax|stripcomments-lax|strip-comments-lax' => + 'stripcommentsstrict|stripcomments-strict|strip-comments-strict' => + sub {$cli_opts{'stripcomments'} = 1}, + 'stripcomments|stripcommentslax|stripcomments-lax|strip-comments|strip-comments-lax' => sub {$cli_opts{'stripcomments'} = 2}, 'stripcommentslaxonly|stripcomments-laxonly|stripcomments-lax-only|strip-comments-lax-only' => sub {$cli_opts{'stripcomments'} = 3}, - 'no-stripcomments|no-strip-comments' => sub {$cli_opts{'stripcomments'} = 0}, + 'nostripcomments|no-stripcomments|no-strip-comments' => sub {$cli_opts{'stripcomments'} = 0}, 'keepabs|keep-abs|k' => \$cli_opts{'keepabs'}, 'absroot|a=s' => \$cli_opts{'absroot'}, 'base|b=s' => \$cli_opts{'base'}, @@ -995,16 +995,20 @@ sub ProcessRaw { # which gets turned into "

" which then # no longer validates). # stripcomments => any-false-value (no action), any-true-value (strip). -# => 1 (strip), 2 (strip-lax), 3 (strip-lax-only) -# a non-integer true value will be forced to 1. -# an integer value < 0 will be forced to 1. -# an integer value > 3 will be forced to 3. +# => 1 (strip-strict), 2 (strip-lax), 3 (strip-lax-only) +# a non-numeric true value will be forced to 2. +# a numeric value < 0 will be forced to 2. +# a numeric value > 0 and < 1 will be forced to 2. +# a numeric value > 3 will be forced to 3. +# a non-integer value will be forced to an integral value. # 1, 2, and 3 correspond to the command line options -# --strip-comments, --strip-comments-lax and +# --strip-comments-strict, --strip-comments-lax and # --strip-comments-lax-only respectively. # since the strip comments mechanism is a function of the # sanitizer, if stripcomments is set to any-true-value then # tag attributes will also always be sanitized. +# if stripcomments is not set or is set to the empty string, +# then it will be set to 3 if sanitize is true and 0 otherwise. # effective for both ProcessRaw and Markdown. # empty_element_suffix => " />" or ">" # will be forced to " />" if not valid or defined. @@ -1172,13 +1176,15 @@ sub _SanitizeOpts { $o->{keep_named_character_entities} = 0 unless defined($o->{keep_named_character_entities}) && $o->{keep_named_character_entities} eq "1"; $o->{xmlcheck} = looks_like_number($o->{xmlcheck}) && $o->{xmlcheck} == 0 ? 0 : 2; - !looks_like_number($o->{stripcomments}) and $o->{stripcomments} = $o->{stripcomments} ? 
1 : 0; - $o->{stripcomments} && $o->{stripcomments} < 2 and $o->{stripcomments} = 1; - $o->{stripcomments} = int($o->{stripcomments}); - $o->{stripcomments} > 3 and $o->{stripcomments} = 3; - $o->{sanitize} = 1 if $o->{stripcomments} && !$o->{sanitize}; $o->{sanitize} = 1 if $o->{xmlcheck} && !$o->{sanitize}; $o->{sanitize} = 1 if ref($o->{urlfunc}) eq 'CODE' && !$o->{sanitize}; + !looks_like_number($o->{stripcomments}) and + $o->{stripcomments} = $o->{stripcomments} ? 2 : + ($o->{sanitize} && (!defined($o->{stripcomments}) || $o->{stripcomments} eq "") ? 3 : 0); + $o->{stripcomments} && $o->{stripcomments} < 1 and $o->{stripcomments} = 2; + $o->{stripcomments} = int($o->{stripcomments}); + $o->{stripcomments} > 3 and $o->{stripcomments} = 3; + $o->{stripcomments} && !$o->{sanitize} and $o->{sanitize} = 1; # this is gross, but having the globals avoids unnecessary slowdown if ($o->{sanitize} && $o->{xmlcheck}) { @@ -4503,10 +4509,11 @@ B [B<--help>] [B<--html4tags>] [B<--htmlroot>=I] --validate-xml check if output is valid XML --validate-xml-internal fast basic check if output is valid XML --no-validate-xml do not check output for valid XML - --strip-comments remove XML comments from output + --strip-comments remove XML-like comments from output --strip-comments-lax remove XML-like comments from output + --strip-comments-strict remove only strictly valid XML comments --strip-comments-lax-only remove only invalid XML-like comments - --no-strip-comments do not remove XML comments (default) + --no-strip-comments do not remove any XML/XML-like comments --tabwidth=num expand tabs to num instead of 8 --auto-number automatically number h1-h6 headers -k | --keep-abs keep abspath URLs despite -r/-i @@ -4715,15 +4722,31 @@ B<--no-sanitize> is used in which case B<--no-validate-xml> is the default option. -=item B<--strip-comments> +=item B<--strip-comments>/B<--strip-comments-lax> + +(N.B. 
B<--strip-comments> is just a short form of B<--strip-comments-lax>) + +Strip XML and XML-like comments from the output. Any XML or XML-like +comments encountered will be omitted from the output if either of these +options is given. -Strip XML comments from the output. Any XML comments encountered will -be omitted from the output if this option is given. +Unlike the B<--strip-comments-strict> option, these options I +strip any XML-like comments that contain internal double hyphen +(i.e. C<-->) sequences. This option requires the B<--sanitize> option to be used (which is the default). -However, note that the XML standard section 2.5 specifically prohibits +If either of these options is given, it will supersede any previous +B<--strip-comments-strict>, B<--strip-comments-lax-only> or +B<--no-strip-comments> options. + + +=item B<--strip-comments-strict> + +Strip only strictly XML standard compliant comments from the output. + +Note that the XML standard section 2.5 specifically prohibits a C<--> sequence within an XML comment (i.e. C<--> cannot occur after the comment start tag C<< >>). @@ -4737,56 +4760,56 @@ option), any invalid tags have their leading C<< < >> escaped (to C<< &#lt; >>) thus making them ordinary text and this I invalid XML comments. -What this means is that the B<--strip-comments> option I remove -invalid XML comments (such as S >>>)! +What this means is that the B<--strip-comments-strict> option I +remove invalid XML comments (such as S >>>)! But see the B<--strip-comments-lax> option for a solution. - -=item B<--strip-comments-lax> - -Strip XML-like comments from the output. Any XML-like comments encountered -will be omitted from the output if this option is given. Supersedes the -B<--strip-comments> option if both are given. 
- -While the syntax of XML comments cannot be relaxed (that would require -altering the XML standard), if they are being stripped out anyway, then the -standard isn't quite so relevant since they will not be present in the output. - -The B<--strip-comments-lax> option acts just like the B<--strip-comments> -option EXCEPT that the content between the starting comment tag S>> -and then ending comment tag S >>> is I restricted since it will be -stripped out of the final result which will therefore remain XML compliant. - -The only restriction, of course, is that the content between the XML comment -start tag and the XML comment end tag cannot contain the XML comment end tag -itself. - -With the B<--strip-comments-lax> option, strictly invalid XML comments -(such as S >>>) I be stripped as well as all -strictly valid XML comments. +If this option is given, it will supersede any previous +B<--strip-comments>, B<--strip-comments-lax>, B<--strip-comments-lax-only> +or B<--no-strip-comments> options. =item B<--strip-comments-lax-only> -This is a compromise option. It works just like B<--strip-comments-lax>, but -I on strictly invalid XML-like comments. Supersedes the -B<--strip-comments> option if both are given. +This is the default option if no other strip comments options are given +AND the B<--sanitize> option is active (the default). + +This is a compromise option. It works just like the B<--strip-comments-lax> +option, but I on strictly invalid XML-like comments. In other words, if a strictly valid XML comment is present, it will be retained in the output. If a strictly invalid XML comment is present which would have been stripped by B<--strip-comments-lax> but would have had its leading C<< < >> -escaped automatically by the default B<--no-strip-comments> mode (because it's -not a strictly valid XML comment), then it will be stripped by this mode. 
+escaped automatically by the B<--no-strip-comments> or B<--strip-comments-strict> +modes (because it's not a strictly valid XML comment), then it I be stripped +by this mode. This option prevents ugly invalid XML comments from slipping through into the output as escaped plain text while still passing through valid XML comments without stripping them. +If this option is given, it will supersede any previous +B<--strip-comments>, B<--strip-comments-lax>, B<--strip-comments-strict> +or B<--no-strip-comments> options. + =item B<--no-strip-comments> -Do not strip XML comments from the output. This is the default. +Do not strip XML or XML-like comments from the output. + +This is the default option I when no other strip comments options have +been given I the B<--no-sanitize> option is in effect (which is I the +default). + +When B<--no-strip-comments> is active, strictly invalid XML comments such +as those that contain an internal double hyphen (C<-->) sequence will end +up having their leading C<< < >> escaped automatically and end up as plain +text in the output! + +If this option is given, it will supersede any previous +B<--strip-comments>, B<--strip-comments-lax>, B<--strip-comments-strict> +or B<--strip-comments-lax-only> options. =item B<--tabwidth>=I