diff --git a/Markdown.pl b/Markdown.pl
index 23b0053..d76307f 100755
--- a/Markdown.pl
+++ b/Markdown.pl
@@ -632,6 +632,8 @@ sub _main {
'no-stylesheet|no-style-sheet' => sub {$cli_opts{'stylesheet'} = 0},
'keep-named-character-entities' => \$cli_opts{'keepcharents'},
'no-keep-named-character-entities' => sub {$cli_opts{'keepcharents'} = 0},
+ 'us-ascii|ascii' => \$cli_opts{'us_ascii'},
+ 'no-us-ascii|no-ascii' => sub {$cli_opts{'us_ascii'} = 0},
'stub' => \$cli_opts{'stub'},
'yaml:s' => \$cli_opts{'yaml'},
);
@@ -650,6 +652,7 @@ sub _main {
}
my $xmlcheck;
$options{'keep_named_character_entities'} = $cli_opts{'keepcharents'} ? "1" : 0;
+ $options{'us_ascii'} = $cli_opts{'us_ascii'} ? "1" : 0;
$options{divwrap} = defined($cli_opts{'divname'});
$options{divname} = defined($cli_opts{'divname'}) ? $cli_opts{'divname'} : "";
$options{sanitize} = 1; # sanitize by default
@@ -946,6 +949,10 @@ sub ProcessRaw {
$opt{keep_named_character_entities} or
$text = ConvertNamedCharacterEntities($text);
+ # Convert to US-ASCII only if requested
+ $opt{us_ascii} and
+ $text = ConvertToASCII($text);
+
utf8::encode($text);
if ($opt{divwrap}) {
my $id = $opt{divname};
@@ -1018,6 +1025,8 @@ sub ProcessRaw {
# then known named character entities will be converted to
# their equivalent numerical entity. Use of this option is
# strongly discouraged to avoid strict XML validation failures.
+# us_ascii => if true, non-US-ASCII characters will be converted to
+# numerical character entities making the output US-ASCII only.
# divwrap => if true, wrap output contents in
...
# divname => if defined and non-empty will be id of divwrap div tag
# urlfunc => if set to a CODE ref, the function will be called with
@@ -1421,6 +1430,20 @@ sub Markdown {
# Sanitize all '<'...'>' tags if requested
$text = _SanitizeTags($text, $opt{xmlcheck}, 1) if $opt{sanitize};
+ # Eliminate known named character entities
+ $opt{keep_named_character_entities} or do {
+ $yamltable = ConvertNamedCharacterEntities($yamltable);
+ $text = ConvertNamedCharacterEntities($text);
+ };
+
+ # Convert to US-ASCII only if requested
+ $opt{us_ascii} and do {
+ utf8::decode($yamltable);
+ $yamltable = ConvertToASCII($yamltable);
+ utf8::encode($yamltable);
+ $text = ConvertToASCII($text);
+ };
+
utf8::encode($text);
if (ref($_[0]) eq "HASH") {
${$_[0]}{anchors} = {%g_anchors_id} if exists(${$_[0]}{anchors});
@@ -1431,12 +1454,6 @@ sub Markdown {
${$_[0]}{yaml} = $yaml if ref($yaml) eq "HASH";
}
- # Eliminate known named character entities
- $opt{keep_named_character_entities} or do {
- $yamltable = ConvertNamedCharacterEntities($yamltable);
- $text = ConvertNamedCharacterEntities($text);
- };
-
if ($opt{divwrap}) {
my $id = $opt{divname};
defined($id) or $id = "";
@@ -4036,6 +4053,26 @@ sub ConvertNamedCharacterEntities {
}
+my $_usasciisub;
+BEGIN { $_usasciisub = sub {
+ my $c = $_[0];
+ my $o = ord($c);
+ return ($o <= 999) ? (($o < 128) ? $c : "$o;") : sprintf("%x;", $o);
+} }
+
+
+# $_[0] => the input text to process
+# returns text with non-US-ASCII characters replaced
+# with their equivalent numerical character entities,
+# but only if the input text has already been utf8::decode'd
+sub ConvertToASCII {
+ my $text = shift;
+ defined($text) or return undef;
+ $text =~ s/([^\x00-\x7F])/&$_usasciisub($1)/goes;
+ return $text;
+}
+
+
sub _EncodeAmps {
my $text = shift;
@@ -4539,6 +4576,7 @@ B [B<--help>] [B<--html4tags>] [B<--htmlroot>=I]
--stylesheet output the fancy style sheet
--no-stylesheet do not output fancy style sheet
--keep-named-character-entities do not convert named character entities
+ --us-ascii convert non-ASCII to character entities
--stub wrap output in stub document
implies --stylesheet
-- end options and treat next
@@ -5220,6 +5258,20 @@ left alone since they are universally supported.
Use of this option is I.
+=item B<--us-ascii>/B<--ascii>
+
+(N.B. B<--ascii> is just a short form of B<--us-ascii>)
+
+Convert any non-US-ASCII characters to their equivalent numerical character
+entity. Any characters with a code point value greater than or equal to
+128 will be converted. Note that the output is still technically UTF-8 since
+the US-ASCII code points coincide with the same code points of UTF-8.
+
+Using this option will make the output strictly 7-bit and therefore it should
+survive just about any transport mechanism at the expense of an increase in
+size that depends on how many non-US-ASCII characters are present.
+
+
=item B<--stub>
Wrap the output in a full document stub (i.e. has C, C and C