diff --git a/Markdown.pl b/Markdown.pl index 23b0053..d76307f 100755 --- a/Markdown.pl +++ b/Markdown.pl @@ -632,6 +632,8 @@ sub _main { 'no-stylesheet|no-style-sheet' => sub {$cli_opts{'stylesheet'} = 0}, 'keep-named-character-entities' => \$cli_opts{'keepcharents'}, 'no-keep-named-character-entities' => sub {$cli_opts{'keepcharents'} = 0}, + 'us-ascii|ascii' => \$cli_opts{'us_ascii'}, + 'no-us-ascii|no-ascii' => sub {$cli_opts{'us_ascii'} = 0}, 'stub' => \$cli_opts{'stub'}, 'yaml:s' => \$cli_opts{'yaml'}, ); @@ -650,6 +652,7 @@ sub _main { } my $xmlcheck; $options{'keep_named_character_entities'} = $cli_opts{'keepcharents'} ? "1" : 0; + $options{'us_ascii'} = $cli_opts{'us_ascii'} ? "1" : 0; $options{divwrap} = defined($cli_opts{'divname'}); $options{divname} = defined($cli_opts{'divname'}) ? $cli_opts{'divname'} : ""; $options{sanitize} = 1; # sanitize by default @@ -946,6 +949,10 @@ sub ProcessRaw { $opt{keep_named_character_entities} or $text = ConvertNamedCharacterEntities($text); + # Convert to US-ASCII only if requested + $opt{us_ascii} and + $text = ConvertToASCII($text); + utf8::encode($text); if ($opt{divwrap}) { my $id = $opt{divname}; @@ -1018,6 +1025,8 @@ sub ProcessRaw { # then known named character entities will be converted to # their equivalent numerical entity. Use of this option is # strongly discouraged to avoid strict XML validation failures. +# us_ascii => if true, non-US-ASCII characters will be converted to +# numerical character entities making the output US-ASCII only. # divwrap => if true, wrap output contents in
<div>...</div>
# divname => if defined and non-empty will be id of divwrap div tag # urlfunc => if set to a CODE ref, the function will be called with @@ -1421,6 +1430,20 @@ sub Markdown { # Sanitize all '<'...'>' tags if requested $text = _SanitizeTags($text, $opt{xmlcheck}, 1) if $opt{sanitize}; + # Eliminate known named character entities + $opt{keep_named_character_entities} or do { + $yamltable = ConvertNamedCharacterEntities($yamltable); + $text = ConvertNamedCharacterEntities($text); + }; + + # Convert to US-ASCII only if requested + $opt{us_ascii} and do { + utf8::decode($yamltable); + $yamltable = ConvertToASCII($yamltable); + utf8::encode($yamltable); + $text = ConvertToASCII($text); + }; + utf8::encode($text); if (ref($_[0]) eq "HASH") { ${$_[0]}{anchors} = {%g_anchors_id} if exists(${$_[0]}{anchors}); @@ -1431,12 +1454,6 @@ sub Markdown { ${$_[0]}{yaml} = $yaml if ref($yaml) eq "HASH"; } - # Eliminate known named character entities - $opt{keep_named_character_entities} or do { - $yamltable = ConvertNamedCharacterEntities($yamltable); - $text = ConvertNamedCharacterEntities($text); - }; - if ($opt{divwrap}) { my $id = $opt{divname}; defined($id) or $id = ""; @@ -4036,6 +4053,26 @@ sub ConvertNamedCharacterEntities { } +my $_usasciisub; +BEGIN { $_usasciisub = sub { + my $c = $_[0]; + my $o = ord($c); + return ($o <= 999) ? (($o < 128) ? 
$c : "&#$o;") : sprintf("&#x%x;", $o); +} } + + +# $_[0] => the input text to process +# returns text with non-US-ASCII characters replaced +# with their equivalent numerical character entities, +# but only if the input text has already been utf8::decode'd +sub ConvertToASCII { + my $text = shift; + defined($text) or return undef; + $text =~ s/([^\x00-\x7F])/&$_usasciisub($1)/goes; + return $text; +} + + sub _EncodeAmps { my $text = shift; @@ -4539,6 +4576,7 @@ B [B<--help>] [B<--html4tags>] [B<--htmlroot>=I] --stylesheet output the fancy style sheet --no-stylesheet do not output fancy style sheet --keep-named-character-entities do not convert named character entities + --us-ascii convert non-ASCII to character entities --stub wrap output in stub document implies --stylesheet -- end options and treat next @@ -5220,6 +5258,20 @@ left alone since they are universally supported. Use of this option is I. +=item B<--us-ascii>/B<--ascii> + +(N.B. B<--ascii> is just a short form of B<--us-ascii>) + +Convert any non-US-ASCII characters to their equivalent numerical character +entity. Any characters with a code point value greater than or equal to +128 will be converted. Note that the output is still technically UTF-8 since +the US-ASCII code points coincide with the same code points of UTF-8. + +Using this option will make the output strictly 7-bit and therefore it should +survive just about any transport mechanism at the expense of an increase in +size that depends on how many non-US-ASCII characters are present. + + =item B<--stub> Wrap the output in a full document stub (i.e. has C, C and C