From 34b44054db967b4e3b4df4730824cc36b798e183 Mon Sep 17 00:00:00 2001 From: "Kyle J. McKay" Date: Sun, 31 Jan 2021 10:38:24 -0700 Subject: [PATCH] Markdown.pl: sanitize common "oops" entities Take a hint from w3m and quietly fix up the six common entities &lt; &gt; &amp; &quot; &apos; &nbsp; when they are missing their trailing ';', provided whatever trailing character is there is not alphanumeric, an equals sign or a semicolon. Without this change, the leading ampersand in these cases would have ended up being escaped to &amp;, which is almost certainly incorrect. Signed-off-by: Kyle J. McKay --- Markdown.pl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Markdown.pl b/Markdown.pl index 324282e..eec3bec 100755 --- a/Markdown.pl +++ b/Markdown.pl @@ -3315,9 +3315,16 @@ sub _ProcessURLTag { } +my $oops_entities; +BEGIN { $oops_entities = qr/(?:lt|gt|amp|quot|apos|nbsp)/io; } + + sub _HTMLEncode { my $text = shift; + # Treat these accidents as though they had the needed ';' + $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go; + # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin: # http://bumppo.net/projects/amputator/ $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g; @@ -3335,6 +3342,9 @@ sub _HTMLEncode { sub _EncodeAmps { my $text = shift; + # Treat these accidents as though they had the needed ';' + $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go; + # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin: # http://bumppo.net/projects/amputator/ $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g; @@ -3348,6 +3358,9 @@ sub _EncodeAmpsAndAngles { my $text = shift; + # Treat these accidents as though they had the needed ';' + $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go; + # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin: # http://bumppo.net/projects/amputator/ $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g;