From 149f4d630856688ac8bf83877a9b657d835038db Mon Sep 17 00:00:00 2001 From: "Kyle J. McKay" Date: Thu, 24 Oct 2019 02:24:16 -0700 Subject: [PATCH] Markdown.pl: eliminate egregious URL snafus The standard explicitly prohibits nesting of "a" tags. Prevent nesting from occurring when only "markdown" input is present. If explicit "<a>...</a>" tags are present in the input they will be (mostly) left alone even if they've been incorrectly nested. Avoid producing mojibake links in the case that the URL itself appears to contain another URL. (The wayback machine links often look like this.) In essence, once a link (either an "a" or "img") tag has been generated/processed, avoid processing it again in order to make sure no accidental mojibake occurs. One consequence of this change is that "Automatic Links" that are NOT surrounded by '<' and '>' will now only be recognized if they occur at the beginning of the input or after a whitespace (a newline qualifies) character. This also helps to eliminate unintended double linkification. Furthermore, URLs containing peculiar characters in them (e.g. single quote and/or double quote) should be far less troublesome now as well. Signed-off-by: Kyle J. 
McKay --- Markdown.pl | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/Markdown.pl b/Markdown.pl index af40566..8894d89 100755 --- a/Markdown.pl +++ b/Markdown.pl @@ -115,7 +115,7 @@ BEGIN { my %g_escape_table; BEGIN { $g_escape_table{""} = "\2\3"; - foreach my $char (split //, "\\\`*_~{}[]()>#+-.!|:") { + foreach my $char (split //, "\\\`*_~{}[]()>#+-.!|:<") { $g_escape_table{$char} = block_id($char,1); } } @@ -568,10 +568,10 @@ sub _StripLinkDefinitions { my $title = _strip($5); $url =~ s/\\\n\s*//gs; if ($id ne "") { - $g_urls{$id} = _EncodeAmpsAndAngles($url); + # These values always get passed through _MakeATag or _MakeIMGTag later + $g_urls{$id} = $url; if (defined($title) && $title ne "") { $g_titles{$id} = $title; - $g_titles{$id} =~ s/\042/"/g; } } } @@ -818,10 +818,12 @@ sub _MakeATag { defined($text) or $text=""; defined($title) or $title=""; - my $result = "" . $text . ""; + return $result . $g_escape_table{'>'} . + $text . $g_escape_table{'<'}."/a".$g_escape_table{'>'}; } @@ -977,7 +979,7 @@ sub _MakeIMGTag { defined($title) or $title=""; return "" unless $url ne ""; - my $result = ""; + $result .= $g_escape_table{'>'}; return $result; } @@ -2025,11 +2028,13 @@ sub _ProcessURLTag { while ($tag =~ /\G([^\s\042\047>]+=)([\042\047])((?:(?!\2)(?!>).)*)(\2\s*)/gc) { my ($p, $q, $v, $s) = ($1, $2, $3, $4); if (lc($p) eq $att && $v ne "") { - $v = _HTMLEncode(_PrefixURL($v)); + $v = _EncodeAttText(_PrefixURL($v)); } $out .= $p . $q . $v . 
$s; } $out .= substr($tag, pos($tag)); + substr($out,0,1) = $g_escape_table{'<'}; + substr($out,-1,1) = $g_escape_table{'>'}; return $out; } @@ -2102,7 +2107,7 @@ sub _EncodeBackslashEscapes { sub _DoAutoLinks { local $_ = shift; - s{<((https?|ftps?):[^'\042>\s]+)>}{<$1>}gi; + s{<((https?|ftps?):[^'\042>\s]+)>(?!\s*)}{_MakeATag($1, "<".$1.">")}gise; # Email addresses: s{ @@ -2119,10 +2124,10 @@ sub _DoAutoLinks { }egix; # (kjm) I don't do "x" patterns - s{(?])(?$1}sog; + s{(?:^|(?<=\s))((?:https?|ftps?)://(?:[-a-zA-Z0-9./?\&\%=_~!*;:\@+\$,\x23](?:(?RFC$1$2]}sog; + {"["._MakeATag("https://tools.ietf.org/html/rfc$2", "RFC$1$2", "RFC $2")."]"}soge; return $_; } @@ -2177,7 +2182,7 @@ sub _EncodeEmailAddress { # strip the mailto: from the visible part (my $bareaddr = $addr) =~ s/^.+?://; - $addr = qq{$prefix$bareaddr$suffix}; + $addr = _MakeATag("$addr", $prefix.$bareaddr.$suffix); return $addr; }