diff --git a/Markdown.pl b/Markdown.pl index e336c62..af15b20 100755 --- a/Markdown.pl +++ b/Markdown.pl @@ -37,7 +37,7 @@ my ($hasxmlp, $hasxmlp_err); BEGIN { ($hasxmlp, $hasxmlp_err) = (0, "") } BEGIN { @ISA = qw(Exporter); @EXPORT_OK = qw(Markdown ProcessRaw GenerateStyleSheet SetWikiOpts SplitURL - escapeXML unescapeXML ResolveFragment); + escapeXML unescapeXML ResolveFragment ConvertNamedCharacterEntities); $INC{__PACKAGE__.'.pm'} = $INC{basename(__FILE__)} unless exists $INC{__PACKAGE__.'.pm'}; } @@ -155,6 +155,260 @@ BEGIN { $g_list_level = 0; } +# Entity conversion table +my %named_character_entity; +BEGIN { %named_character_entity = ( + 'Aacute' => '193', + 'aacute' => '225', + 'Acirc' => '194', + 'acirc' => '226', + 'acute' => '180', + 'AElig' => '198', + 'aelig' => '230', + 'Agrave' => '192', + 'agrave' => '224', + 'alefsym' => 'x2135', + 'Alpha' => '913', + 'alpha' => '945', + 'and' => 'x2227', + 'ang' => 'x2220', + 'apos' => '39', + 'Aring' => '197', + 'aring' => '229', + 'asymp' => 'x2248', + 'Atilde' => '195', + 'atilde' => '227', + 'Auml' => '196', + 'auml' => '228', + 'bdquo' => 'x201e', + 'Beta' => '914', + 'beta' => '946', + 'brvbar' => '166', + 'bull' => 'x2022', + 'cap' => 'x2229', + 'Ccedil' => '199', + 'ccedil' => '231', + 'cedil' => '184', + 'cent' => '162', + 'Chi' => '935', + 'chi' => '967', + 'circ' => '710', + 'clubs' => 'x2663', + 'cong' => 'x2245', + 'copy' => '169', + 'crarr' => 'x21b5', + 'cup' => 'x222a', + 'curren' => '164', + 'Dagger' => 'x2021', + 'dagger' => 'x2020', + 'dArr' => 'x21d3', + 'darr' => 'x2193', + 'deg' => '176', + 'Delta' => '916', + 'delta' => '948', + 'diams' => 'x2666', + 'divide' => '247', + 'Eacute' => '201', + 'eacute' => '233', + 'Ecirc' => '202', + 'ecirc' => '234', + 'Egrave' => '200', + 'egrave' => '232', + 'empty' => 'x2205', + 'emsp' => 'x2003', + 'ensp' => 'x2002', + 'Epsilon' => '917', + 'epsilon' => '949', + 'equiv' => 'x2261', + 'Eta' => '919', + 'eta' => '951', + 'ETH' => '208', + 'eth' => '240', + 
'Euml' => '203', + 'euml' => '235', + 'euro' => 'x20ac', + 'exist' => 'x2203', + 'fnof' => '402', + 'forall' => 'x2200', + 'frac12' => '189', + 'frac14' => '188', + 'frac34' => '190', + 'frasl' => 'x2044', + 'Gamma' => '915', + 'gamma' => '947', + 'ge' => 'x2265', + 'hArr' => 'x21d4', + 'harr' => 'x2194', + 'hearts' => 'x2665', + 'hellip' => 'x2026', + 'Iacute' => '205', + 'iacute' => '237', + 'Icirc' => '206', + 'icirc' => '238', + 'iexcl' => '161', + 'Igrave' => '204', + 'igrave' => '236', + 'image' => 'x2111', + 'infin' => 'x221e', + 'int' => 'x222b', + 'Iota' => '921', + 'iota' => '953', + 'iquest' => '191', + 'isin' => 'x2208', + 'Iuml' => '207', + 'iuml' => '239', + 'Kappa' => '922', + 'kappa' => '954', + 'Lambda' => '923', + 'lambda' => '955', + 'lang' => 'x2329', + 'laquo' => '171', + 'lArr' => 'x21d0', + 'larr' => 'x2190', + 'lceil' => 'x2308', + 'ldquo' => 'x201c', + 'le' => 'x2264', + 'lfloor' => 'x230a', + 'lowast' => 'x2217', + 'loz' => 'x25ca', + 'lrm' => 'x200e', + 'lsaquo' => 'x2039', + 'lsquo' => 'x2018', + 'macr' => '175', + 'mdash' => 'x2014', + 'micro' => '181', + 'middot' => '183', + 'minus' => 'x2212', + 'Mu' => '924', + 'mu' => '956', + 'nabla' => 'x2207', + 'nbsp' => '160', + 'ndash' => 'x2013', + 'ne' => 'x2260', + 'ni' => 'x220b', + 'not' => '172', + 'notin' => 'x2209', + 'nsub' => 'x2284', + 'Ntilde' => '209', + 'ntilde' => '241', + 'Nu' => '925', + 'nu' => '957', + 'Oacute' => '211', + 'oacute' => '243', + 'Ocirc' => '212', + 'ocirc' => '244', + 'OElig' => '338', + 'oelig' => '339', + 'Ograve' => '210', + 'ograve' => '242', + 'oline' => 'x203e', + 'Omega' => '937', + 'omega' => '969', + 'Omicron' => '927', + 'omicron' => '959', + 'oplus' => 'x2295', + 'or' => 'x2228', + 'ordf' => '170', + 'ordm' => '186', + 'Oslash' => '216', + 'oslash' => '248', + 'Otilde' => '213', + 'otilde' => '245', + 'otimes' => 'x2297', + 'Ouml' => '214', + 'ouml' => '246', + 'para' => '182', + 'part' => 'x2202', + 'permil' => 'x2030', + 'perp' => 'x22a5', + 'Phi' 
=> '934', + 'phi' => '966', + 'Pi' => '928', + 'pi' => '960', + 'piv' => '982', + 'plusmn' => '177', + 'pound' => '163', + 'Prime' => 'x2033', + 'prime' => 'x2032', + 'prod' => 'x220f', + 'prop' => 'x221d', + 'Psi' => '936', + 'psi' => '968', + 'radic' => 'x221a', + 'rang' => 'x232a', + 'raquo' => '187', + 'rArr' => 'x21d2', + 'rarr' => 'x2192', + 'rceil' => 'x2309', + 'rdquo' => 'x201d', + 'real' => 'x211c', + 'reg' => '174', + 'rfloor' => 'x230b', + 'Rho' => '929', + 'rho' => '961', + 'rlm' => 'x200f', + 'rsaquo' => 'x203a', + 'rsquo' => 'x2019', + 'sbquo' => 'x201a', + 'Scaron' => '352', + 'scaron' => '353', + 'sdot' => 'x22c5', + 'sect' => '167', + 'shy' => '173', + 'Sigma' => '931', + 'sigma' => '963', + 'sigmaf' => '962', + 'sim' => 'x223c', + 'spades' => 'x2660', + 'sub' => 'x2282', + 'sube' => 'x2286', + 'sum' => 'x2211', + 'sup' => 'x2283', + 'sup1' => '185', + 'sup2' => '178', + 'sup3' => '179', + 'supe' => 'x2287', + 'szlig' => '223', + 'Tau' => '932', + 'tau' => '964', + 'there4' => 'x2234', + 'Theta' => '920', + 'theta' => '952', + 'thetasym' => '977', + 'thinsp' => 'x2009', + 'THORN' => '222', + 'thorn' => '254', + 'tilde' => '732', + 'times' => '215', + 'trade' => 'x2122', + 'Uacute' => '218', + 'uacute' => '250', + 'uArr' => 'x21d1', + 'uarr' => 'x2191', + 'Ucirc' => '219', + 'ucirc' => '251', + 'Ugrave' => '217', + 'ugrave' => '249', + 'uml' => '168', + 'upsih' => '978', + 'Upsilon' => '933', + 'upsilon' => '965', + 'Uuml' => '220', + 'uuml' => '252', + 'weierp' => 'x2118', + 'Xi' => '926', + 'xi' => '958', + 'Yacute' => '221', + 'yacute' => '253', + 'yen' => '165', + 'Yuml' => '376', + 'yuml' => '255', + 'Zeta' => '918', + 'zeta' => '950', + 'zwj' => 'x200d', + 'zwnj' => 'x200c' +) } + #### Blosxom plug-in interface ########################################## my $_haveBX; @@ -368,6 +622,8 @@ sub _main { 'raw-html' => sub { $cli_opts{'raw'} = 2 }, 'stylesheet|style-sheet' => \$cli_opts{'stylesheet'}, 'no-stylesheet|no-style-sheet' => sub 
{$cli_opts{'stylesheet'} = 0}, + 'keep-named-character-entities' => \$cli_opts{'keepcharents'}, + 'no-keep-named-character-entities' => sub {$cli_opts{'keepcharents'} = 0}, 'stub' => \$cli_opts{'stub'}, 'yaml:s' => \$cli_opts{'yaml'}, ); @@ -385,6 +641,7 @@ sub _main { _SetAllowedTag("menu"); } my $xmlcheck; + $options{'keep_named_character_entities'} = $cli_opts{'keepcharents'} ? "1" : 0; $options{divwrap} = defined($cli_opts{'divname'}); $options{divname} = defined($cli_opts{'divname'}) ? $cli_opts{'divname'} : ""; $options{sanitize} = 1; # sanitize by default @@ -677,6 +934,10 @@ sub ProcessRaw { # Sanitize all '<'...'>' tags if requested $text = _SanitizeTags($text, $opt{xmlcheck}, $opt{htmlauto}) if $opt{sanitize}; + # Eliminate known named character entities + $opt{keep_named_character_entities} or + $text = ConvertNamedCharacterEntities($text); + utf8::encode($text); if ($opt{divwrap}) { my $id = $opt{divname}; @@ -733,6 +994,11 @@ sub ProcessRaw { # empty_element_suffix => " />" or ">" # will be forced to " />" if not valid or defined. # effective for both ProcessRaw and Markdown. +# keep_named_character_entities => "1" (keep them), any-other-value (convert). +# unless this option is present and has exactly the value "1" +# then known named character entities will be converted to +# their equivalent numerical entity. Use of this option is +# strongly discouraged to avoid strict XML validation failures. # divwrap => if true, wrap output contents in
<div>...</div>
# divname => if defined and non-empty will be id of divwrap div tag # urlfunc => if set to a CODE ref, the function will be called with @@ -887,6 +1153,8 @@ sub _SanitizeOpts { my $o = shift; # hashref ref($o) eq "HASH" or return; + $o->{keep_named_character_entities} = 0 unless + defined($o->{keep_named_character_entities}) && $o->{keep_named_character_entities} eq "1"; $o->{xmlcheck} = looks_like_number($o->{xmlcheck}) && $o->{xmlcheck} == 0 ? 0 : 2; $o->{sanitize} = 1 if $o->{stripcomments} && !$o->{sanitize}; $o->{sanitize} = 1 if $o->{xmlcheck} && !$o->{sanitize}; @@ -1135,6 +1403,12 @@ sub Markdown { "\n$hrows\n\n$drows\n\n"; } } + # Eliminate known named character entities + $opt{keep_named_character_entities} or do { + $yamltable = ConvertNamedCharacterEntities($yamltable); + $text = ConvertNamedCharacterEntities($text); + }; + if ($opt{divwrap}) { my $id = $opt{divname}; defined($id) or $id = ""; @@ -3591,6 +3865,25 @@ sub SplitURL { } +my $_replacesub; +BEGIN { $_replacesub = sub { + my $x = $named_character_entity{$_[1]}; + $x ? '&#'.$x.';' : $_[0]; +} } + + +# $_[0] => the input text to process +# returns text with all known named character entities replaced +# with their equivalent numerical entity (names are 2-8 letters, e.g. "ge", plus up to 2 digits) +sub ConvertNamedCharacterEntities { + use bytes; + my $text = shift; + defined($text) or return undef; + $text =~ s/(&([A-Za-z]{2,8}[1-4]{0,2});)/&$_replacesub($1,$2)/goes; + return $text; +} + + sub _EncodeAmps { my $text = shift; @@ -4090,6 +4383,7 @@ B [B<--help>] [B<--html4tags>] [B<--htmlroot>=I] --div[=id] wrap body in div with given id --stylesheet output the fancy style sheet --no-stylesheet do not output fancy style sheet + --keep-named-character-entities do not convert named character entities --stub wrap output in stub document implies --stylesheet -- end options and treat next @@ -4611,8 +4905,9 @@ Display the short-form version number. Input contains only raw XHTML. 
All options other than B<--html4tags>, B<--deprecated>, B<--sanitize> (on by default), B<--strip-comments>, -B<--div>, B<--validate-xml> and B<--validate-xml-internal> (and -their B<--no-...> variants) are ignored. +B<--div>, B<--keep-named-character-entities>, B<--validate-xml> and +B<--validate-xml-internal> (and their B<--no-...> variants) are +ignored. With this option, arbitrary XHTML input can be passed through the sanitizer and/or validator. If sanitation is requested (the @@ -4693,6 +4988,24 @@ Overrides a previous B<--stylesheet> and disables implicit inclusion of the style sheet by the B<--stub> option. +=item B<--keep-named-character-entities> + +Do not convert named character entities to their equivalent numerical character +entity. Normally any occurrence of a named character entity such as +C<&hellip;> would be converted to its equivalent character entity such as +C<&#x2026;>. If this option is given, that conversion is suppressed. + +The only always-valid named entities as far as XML is concerned are the five +entities C<&amp;>, C<&lt;>, C<&gt;>, C<&quot;> and C<&apos;>. Even that last +one (C<&apos;>) may not be universally supported in XHTML user agents (and it +is converted to C<&#39;> for that reason unless this option is given). + +Regardless of this option, C<&amp;>, C<&lt;>, C<&gt;> and C<&quot;> are always +left alone since they are universally supported. + +Use of this option is I<strongly discouraged>. + + =item B<--stub> Wrap the output in a full document stub (i.e. has C, C and C