Markdown.pl: add new --us-ascii option for 7-bit output

With the new --us-ascii (aka --ascii) option enabled, any characters with a code point value larger than 127 are output using their equivalent numerical character entity. This makes the output strictly US-ASCII (which is a subset of UTF-8) and should allow it to survive almost any transport mechanism at the expense of an increase in size that depends on how many non-US-ASCII characters are present.

Signed-off-by: Kyle J. McKay <mackyle@gmail.com>
5 years ago · 62c5916945
1 changed files with 58 additions and 6 deletions
--- a/Markdown.pl
+++ b/Markdown.pl
@@ -632,6 +632,8 @@ sub _main {
 	'no-stylesheet|no-style-sheet' => sub {$cli_opts{'stylesheet'} = 0},
 	'keep-named-character-entities' => \$cli_opts{'keepcharents'},
 	'no-keep-named-character-entities' => sub {$cli_opts{'keepcharents'} = 0},
+	'us-ascii|ascii' => \$cli_opts{'us_ascii'},
+	'no-us-ascii|no-ascii' => sub {$cli_opts{'us_ascii'} = 0},
 	'stub' => \$cli_opts{'stub'},
 	'yaml:s' => \$cli_opts{'yaml'},
    );
@@ -650,6 +652,7 @@ sub _main {
    }
    my $xmlcheck;
    $options{'keep_named_character_entities'} = $cli_opts{'keepcharents'} ? "1" : 0;
+    $options{'us_ascii'} = $cli_opts{'us_ascii'} ? "1" : 0;
    $options{divwrap} = defined($cli_opts{'divname'});
    $options{divname} = defined($cli_opts{'divname'}) ? $cli_opts{'divname'} : "";
    $options{sanitize} = 1; # sanitize by default
@@ -946,6 +949,10 @@ sub ProcessRaw {
    $opt{keep_named_character_entities} or
 	$text = ConvertNamedCharacterEntities($text);

+    # Convert to US-ASCII only if requested
+    $opt{us_ascii} and
+	$text = ConvertToASCII($text);
+
    utf8::encode($text);
    if ($opt{divwrap}) {
 	my $id = $opt{divname};
@@ -1018,6 +1025,8 @@ sub ProcessRaw {
 #               then known named character entities will be converted to
 #               their equivalent numerical entity.  Use of this option is
 #               strongly discouraged to avoid strict XML validation failures.
+#   us_ascii    => if true, non-US-ASCII characters will be converted to
+#               numerical character entities making the output US-ASCII only.
 #   divwrap     => if true, wrap output contents in <div>...</div>
 #   divname     => if defined and non-empty will be id of divwrap div tag
 #   urlfunc     => if set to a CODE ref, the function will be called with
@@ -1421,6 +1430,20 @@ sub Markdown {
    # Sanitize all '<'...'>' tags if requested
    $text = _SanitizeTags($text, $opt{xmlcheck}, 1) if $opt{sanitize};

+    # Eliminate known named character entities
+    $opt{keep_named_character_entities} or do {
+	$yamltable = ConvertNamedCharacterEntities($yamltable);
+	$text = ConvertNamedCharacterEntities($text);
+    };
+
+    # Convert to US-ASCII only if requested
+    $opt{us_ascii} and do {
+	utf8::decode($yamltable);
+	$yamltable = ConvertToASCII($yamltable);
+	utf8::encode($yamltable);
+	$text = ConvertToASCII($text);
+    };
+
    utf8::encode($text);
    if (ref($_[0]) eq "HASH") {
 	${$_[0]}{anchors} = {%g_anchors_id} if exists(${$_[0]}{anchors});
@@ -1431,12 +1454,6 @@ sub Markdown {
 	${$_[0]}{yaml} = $yaml if ref($yaml) eq "HASH";
    }

-    # Eliminate known named character entities
-    $opt{keep_named_character_entities} or do {
-	$yamltable = ConvertNamedCharacterEntities($yamltable);
-	$text = ConvertNamedCharacterEntities($text);
-    };
-
    if ($opt{divwrap}) {
 	my $id = $opt{divname};
 	defined($id) or $id = "";
@@ -4036,6 +4053,26 @@ sub ConvertNamedCharacterEntities {
 }


+my $_usasciisub; # CODE ref assigned in the BEGIN block below; converts a single character
+BEGIN { $_usasciisub = sub {
+    my $c = $_[0]; # the character to convert (ConvertToASCII passes one regex capture)
+    my $o = ord($c); # ord() value of $c, used to pick the output form
+    return ($o <= 999) ? (($o < 128) ? $c : "&#$o;") : sprintf("&#x%x;", $o); # < 128: unchanged; 128..999: decimal entity; >= 1000: hex entity
+} }
+
+
+# $_[0] => the input text to process
+# returns text with non-US-ASCII characters replaced
+# with their equivalent numerical character entities,
+# but only if the input text has already been utf8::decode'd
+sub ConvertToASCII {
+    my $text = shift; # local copy; the substitution below edits this copy
+    defined($text) or return undef; # undefined input yields undef
+    $text =~ s/([^\x00-\x7F])/&$_usasciisub($1)/goes; # /e: each char outside \x00-\x7F is replaced by $_usasciisub's result
+    return $text;
+}
+
+
 sub _EncodeAmps {
    my $text = shift;

@@ -4539,6 +4576,7 @@ B<Markdown.pl> [B<--help>] [B<--html4tags>] [B<--htmlroot>=I<prefix>]
   --stylesheet                         output the fancy style sheet
   --no-stylesheet                      do not output fancy style sheet
   --keep-named-character-entities      do not convert named character entities
+   --us-ascii                           convert non-ASCII to character entities
   --stub                               wrap output in stub document
                                        implies --stylesheet
   --                                   end options and treat next
@@ -5220,6 +5258,20 @@ left alone since they are universally supported.
 Use of this option is I<NOT RECOMMENDED>.


+=item B<--us-ascii>/B<--ascii>
+
+(N.B. B<--ascii> is just a short form of B<--us-ascii>)
+
+Convert any non-US-ASCII characters to their equivalent numerical character
+entity.  Any characters with a code point value greater than or equal to
+128 will be converted.  Note that the output is still technically UTF-8 since
+the US-ASCII code points coincide with the same code points of UTF-8.
+
+Using this option will make the output strictly 7-bit and therefore it should
+survive just about any transport mechanism at the expense of an increase in
+size that depends on how many non-US-ASCII characters are present.
+
+
 =item B<--stub>

 Wrap the output in a full document stub (i.e. has C<html>, C<head> and C<body>