From ebfb2dafdab663e2154393c73ea992fc24f30c32 Mon Sep 17 00:00:00 2001 From: "Kyle J. McKay" Date: Thu, 21 Nov 2019 10:41:37 -0700 Subject: [PATCH] Markdown.pl: introduce $g_nested_parens and use it The $g_nested_brackets recursive regular expression is already being used to match nested and balanced '['...']' sequences. Introduce a $g_nested_parens recursive regular expression that matches nested and balanced '('...')' sequences; use it to match the parenthesized portion of `[...](...)` and `![...](...)` links. This eliminates a number of previous issues with links that contained embedded parentheses and non-reference image links nested within non-reference non-image links. Signed-off-by: Kyle J. McKay --- Markdown.pl | 91 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 61 insertions(+), 30 deletions(-) diff --git a/Markdown.pl b/Markdown.pl index aa6c066..d1c2fd1 100755 --- a/Markdown.pl +++ b/Markdown.pl @@ -118,6 +118,19 @@ BEGIN { }ox } +# Regex to match balanced (parentheses) +my $g_nested_parens; +BEGIN { + $g_nested_parens = qr{ + (?> # Atomic matching + [^\(\)]+ # Anything other than parentheses + | + \( + (??{ $g_nested_parens }) # Recursive set of nested parentheses + \) + )* + }ox +} # Table of hash values for escaped characters: my %g_escape_table; @@ -1088,24 +1101,16 @@ sub _DoAnchors { ($g_nested_brackets) # link text = $2 \] \( # literal paren - [ ]* - ? # href = $3 - [ ]* - ( # $4 - (['\042]) # quote char = $5 - (.*?) # Title = $6 - \5 # matching quote - )? # title is optional + ($g_nested_parens) # href and optional title = $3 \) ) }{ #my $result; my $whole_match = $1; my $link_text = $2; - my $url = $3; - my $title = $6; + my ($url, $title) = _SplitUrlTitlePart($3); - if ($url =~ /^#\S/) { + if (defined($url) && $url =~ /^#\S/) { # try very hard to find a match my $idbase = _strip(lc(substr($url, 1))); my $idbase0 = $idbase; @@ -1130,8 +1135,13 @@ sub _DoAnchors { } } } - $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/; - _MakeATag(_PrefixURL($url), $link_text, $title); + if (defined($url)) { + $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/; + _MakeATag(_PrefixURL($url), $link_text, $title); + } else { + # The href/title part didn't match the pattern + $whole_match; + } }xsge; # @@ -1165,6 +1175,40 @@ sub _DoAnchors { } +sub _PeelWrapped { + defined($_[0]) or return undef; + if (substr($_[0],0,1) eq "(") { + return substr($_[0], 1, length($_[0]) - (substr($_[0], -1, 1) eq ")" ? 2 : 1)); + } + return $_[0]; +} + + +sub _SplitUrlTitlePart { + return ("", undef) if $_[0] =~ m{^\s*$}; # explicitly allowed + my $u = $_[0]; + $u =~ s/^\s*(['\042])/# $1/; + if ($u =~ m{ + ^ # match beginning + \s*? + ? # URL = $1 + (?: # optional grouping + \s+ # must be distinct from URL + (['\042]?) # quote char = $2 + (.*?) # Title = $3 + \2? # matching quote + )? # title is optional + \s* + \z # match end + }osx) { + return (undef, undef) if $_[1] && ($1 eq "" || $1 eq "#"); + return (_PeelWrapped($1), $2 ? $3 : _PeelWrapped($3)); + } else { + return (undef, undef); + } +} + + # Return a suitably encoded tag string # On input NONE of $url, $alt or $title should be xmlencoded # but $url should already be url-encoded if needed, but NOT g_escape_table'd @@ -1249,27 +1293,14 @@ sub _DoImages { ($g_nested_brackets) # alt text = $2 \] \( # literal paren - [ ]* - ? # src url = $3 - [ ]* - ( # $4 - (['\042]) # quote char = $5 - (.*?) # title = $6 - \5 # matching quote - [ ]* - )? # title is optional + ($g_nested_parens) # src and optional title = $3 \) ) }{ - #my $whole_match = $1; + my $whole_match = $1; my $alt_text = $2; - my $url = $3; - my $title = ''; - if (defined($6)) { - $title = $6; - } - - _MakeIMGTag(_PrefixURL($url), $alt_text, $title); + my ($url, $title) = _SplitUrlTitlePart($3, 1); + defined($url) ? _MakeIMGTag(_PrefixURL($url), $alt_text, $title) : $whole_match; }xsge; #