#!/usr/bin/env perl # # Markdown -- A text-to-HTML conversion tool for web writers # # Copyright (C) 2004 John Gruber # Copyright (C) 2015,2016,2017,2018,2019 Kyle J. McKay # All rights reserved. # License is Modified BSD (aka 3-clause BSD) License\n"; # See LICENSE file (or ) # package Markdown; require 5.008; use strict; use warnings; use Encode; use vars qw($COPYRIGHT $VERSION @ISA @EXPORT_OK); BEGIN {*COPYRIGHT = \"Copyright (C) 2004 John Gruber Copyright (C) 2015,2016,2017,2018,2019 Kyle J. McKay All rights reserved. "; *VERSION = \"1.1.8" } require Exporter; use Digest::MD5 qw(md5 md5_hex); use File::Basename qw(basename); use Scalar::Util qw(refaddr looks_like_number); my ($hasxml, $hasxml_err); BEGIN { ($hasxml, $hasxml_err) = (0, "") } my ($hasxmlp, $hasxmlp_err); BEGIN { ($hasxmlp, $hasxmlp_err) = (0, "") } @ISA = qw(Exporter); @EXPORT_OK = qw(Markdown); $INC{__PACKAGE__.'.pm'} = $INC{basename(__FILE__)} unless exists $INC{__PACKAGE__.'.pm'}; close(DATA) if fileno(DATA); exit(&_main(@ARGV)||0) unless caller; sub fauxdie($) { my $msg = join(" ", @_); $msg =~ s/\s+$//os; printf STDERR "%s: fatal: %s\n", basename($0), $msg; exit 1; } my $encoder; BEGIN { $encoder = Encode::find_encoding('Windows-1252') || Encode::find_encoding('ISO-8859-1') or die "failed to load ISO-8859-1 encoder\n"; } # # Global default settings: # my ($g_style_prefix, $g_empty_element_suffix, $g_indent_width, $g_tab_width); BEGIN { $g_style_prefix = "_markdown-"; # Prefix for markdown css class styles $g_empty_element_suffix = " />"; # Change to ">" for HTML output $g_indent_width = 4; # Number of spaces considered new level $g_tab_width = 4; # Legacy even though it's wrong } # # Globals: # # Style sheet template my $g_style_sheet; # Permanent block id table my %g_perm_block_ids; # Global hashes, used by various utility routines my %g_urls; my %g_titles; my %g_anchors; my %g_anchors_id; my %g_block_ids; my %g_code_block_ids; my %g_html_blocks; my %g_code_blocks; my %opt; # Return a "block id" to use to identify the block that does not contain # any characters that could be misinterpreted by the rest of the code # Originally this used md5_hex but that's unnecessarily slow # Instead just use the refaddr of the scalar ref of the entry for that # key in either the global or, if the optional second argument is true, # permanent table. To avoid the result being confused with anything # else, it's prefixed with a control character and suffixed with another # both of which are not allowed by the XML standard or Unicode. sub block_id { $_[1] or return "\5".refaddr(\$g_block_ids{$_[0]})."\6"; $_[1] == 1 and return "\2".refaddr(\$g_perm_block_ids{$_[0]})."\3"; $_[1] == 2 and return "\25".refaddr(\$g_code_block_ids{$_[0]})."\26"; die "programmer error: bad block_id type $_[1]"; } # Regex to match balanced [brackets]. See Friedl's # "Mastering Regular Expressions", 2nd Ed., pp. 328-331. my $g_nested_brackets; BEGIN { $g_nested_brackets = qr{ (?> # Atomic matching [^\[\]]+ # Anything other than brackets | \[ (??{ $g_nested_brackets }) # Recursive set of nested brackets \] )* }ox } # Regex to match balanced (parentheses) my $g_nested_parens; BEGIN { $g_nested_parens = qr{ (?> # Atomic matching [^\(\)]+ # Anything other than parentheses | \( (??{ $g_nested_parens }) # Recursive set of nested parentheses \) )* }ox } # Table of hash values for escaped characters: my %g_escape_table; BEGIN { $g_escape_table{""} = "\2\3"; foreach my $char (split //, "\\\`*_~{}[]()>#+-.!|:<") { $g_escape_table{$char} = block_id($char,1); } } # Used to track when we're inside an ordered or unordered list # (see _ProcessListItems() for details): my $g_list_level; BEGIN { $g_list_level = 0; } #### Blosxom plug-in interface ########################################## my $_haveBX; BEGIN { no warnings 'once'; $_haveBX = defined($blosxom::version); } # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine # which posts Markdown should process, using a "meta-markup: markdown" # header. If it's set to 0 (the default), Markdown will process all # entries. my $g_blosxom_use_meta; BEGIN { $g_blosxom_use_meta = 0; } sub start { 1; } sub story { my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_; if ((! $g_blosxom_use_meta) or (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i)) ) { $$body_ref = Markdown($$body_ref); } 1; } #### Movable Type plug-in interface ##################################### my $_haveMT = eval {require MT; 1;}; # Test to see if we're running in MT my $_haveMT3 = $_haveMT && eval {require MT::Plugin; 1;}; # and MT >= MT 3.0. if ($_haveMT) { require MT; import MT; require MT::Template::Context; import MT::Template::Context; if ($_haveMT3) { require MT::Plugin; import MT::Plugin; my $plugin = new MT::Plugin({ name => "Markdown", description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)", doc_link => 'http://daringfireball.net/projects/markdown/' }); MT->add_plugin( $plugin ); } MT::Template::Context->add_container_tag(MarkdownOptions => sub { my $ctx = shift; my $args = shift; my $builder = $ctx->stash('builder'); my $tokens = $ctx->stash('tokens'); if (defined ($args->{'output'}) ) { $ctx->stash('markdown_output', lc $args->{'output'}); } defined (my $str = $builder->build($ctx, $tokens) ) or return $ctx->error($builder->errstr); $str; # return value }); MT->add_text_filter('markdown' => { label => 'Markdown', docs => 'http://daringfireball.net/projects/markdown/', on_format => sub { my $text = shift; my $ctx = shift; my $raw = 0; if (defined $ctx) { my $output = $ctx->stash('markdown_output'); if (defined $output && $output =~ m/^html/i) { $g_empty_element_suffix = ">"; $ctx->stash('markdown_output', ''); } elsif (defined $output && $output eq 'raw') { $raw = 1; $ctx->stash('markdown_output', ''); } else { $raw = 0; $g_empty_element_suffix = " />"; } } $text = $raw ? $text : Markdown($text); $text; }, }); # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter: my $smartypants; { no warnings "once"; $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'}; } if ($smartypants) { MT->add_text_filter('markdown_with_smartypants' => { label => 'Markdown With SmartyPants', docs => 'http://daringfireball.net/projects/markdown/', on_format => sub { my $text = shift; my $ctx = shift; if (defined $ctx) { my $output = $ctx->stash('markdown_output'); if (defined $output && $output eq 'html') { $g_empty_element_suffix = ">"; } else { $g_empty_element_suffix = " />"; } } $text = Markdown($text); $text = $smartypants->($text, '1'); }, }); } } sub _strip { my $str = shift; defined($str) or return undef; $str =~ s/^\s+//; $str =~ s/\s+$//; $str =~ s/\s+/ /g; $str; } #### BBEdit/command-line text filter interface ########################## sub _main { local *ARGV = \@_; #### Check for command-line switches: ################# my %options = (); my %cli_opts; my $raw = 0; use Getopt::Long; Getopt::Long::Configure(qw(bundling require_order pass_through)); GetOptions(\%cli_opts, 'help','h', 'version|V', 'shortversion|short-version|s', 'html4tags', 'deprecated', 'sanitize', 'no-sanitize', 'validate-xml', 'validate-xml-internal', 'no-validate-xml', 'htmlroot|r=s', 'imageroot|i=s', 'wiki|w:s', 'tabwidth|tab-width=s', 'raw', 'stylesheet|style-sheet', 'no-stylesheet|no-style-sheet', 'stub', ); if ($cli_opts{'help'}) { require Pod::Usage; Pod::Usage::pod2usage(-verbose => 2, -exitval => 0); } if ($cli_opts{'h'}) { require Pod::Usage; Pod::Usage::pod2usage(-verbose => 0, -exitval => 0); } if ($cli_opts{'version'}) { # Version info print "\nThis is Markdown, version $VERSION.\n", $COPYRIGHT; print "License is Modified BSD (aka 3-clause BSD) License\n"; print "\n"; exit 0; } if ($cli_opts{'shortversion'}) { # Just the version number string. print $VERSION; exit 0; } my $stub = 0; if ($cli_opts{'stub'}) { $stub = 1; } if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML $options{empty_element_suffix} = ">"; $stub = -$stub; } if ($cli_opts{'deprecated'}) { # Allow and tags to pass through _SetAllowedTag("dir"); _SetAllowedTag("menu"); } $options{sanitize} = 1; # sanitize by default if ($cli_opts{'no-sanitize'}) { # Do not sanitize $options{sanitize} = 0; } if ($cli_opts{'sanitize'}) { # --sanitize always wins $options{sanitize} = 1; } $options{xmlcheck} = $options{sanitize} ? 2 : 0; if ($cli_opts{'no-validate-xml'}) { # Do not validate XML $options{xmlcheck} = 0; } if ($cli_opts{'validate-xml'}) { # Validate XML output $options{xmlcheck} = 1; } if ($cli_opts{'validate-xml-internal'}) { # Validate XML output internally $options{xmlcheck} = 2; } die "--html4tags and --validate-xml are incompatible\n" if $cli_opts{'html4tags'} && $options{xmlcheck} == 1; die "--no-sanitize and --validate-xml-internal are incompatible\n" if !$options{'sanitize'} && $options{xmlcheck} == 2; if ($options{xmlcheck} == 1) { eval { require XML::Simple; 1 } and $hasxml = 1 or $hasxml_err = $@; eval { require XML::Parser; 1 } and $hasxmlp = 1 or $hasxmlp_err = $@ unless $hasxml; die "$hasxml_err$hasxmlp_err" unless $hasxml || $hasxmlp; } if ($cli_opts{'tabwidth'}) { my $tw = $cli_opts{'tabwidth'}; die "invalid tab width (must be integer)\n" unless looks_like_number $tw; die "invalid tab width (must be >= 2 and <= 32)\n" unless $tw >= 2 && $tw <= 32; $options{tab_width} = int(0+$tw); } if ($cli_opts{'htmlroot'}) { # Use URL prefix $options{url_prefix} = $cli_opts{'htmlroot'}; } if ($cli_opts{'imageroot'}) { # Use image URL prefix $options{img_prefix} = $cli_opts{'imageroot'}; } if (exists $cli_opts{'wiki'}) { # Enable wiki links my $wpat = $cli_opts{'wiki'}; defined($wpat) or $wpat = ""; my $wopt = "s"; if ($wpat =~ /^(.*?)%\{([0-9A-Za-z]*)\}(.*)$/) { $options{wikipat} = $1 . "%{}" . $3; $wopt = $2; } else { $options{wikipat} = $wpat . "%{}.html"; } $options{wikiopt} = { map({$_ => 1} split(//,lc($wopt))) }; } if ($cli_opts{'raw'}) { $raw = 1; } if ($cli_opts{'stylesheet'}) { # Display the style sheet $options{show_styles} = 1; } if ($cli_opts{'no-stylesheet'}) { # Do not display the style sheet $options{show_styles} = 0; } $options{show_styles} = 1 if $stub && !defined($options{show_styles}); $options{tab_width} = 8 unless defined($options{tab_width}); my $hdrf = sub { my $out = ""; if ($stub > 0) { $out .= <<'HTML5'; HTML5 } elsif ($stub < 0) { $out .= <<'HTML4'; HTML4 } if ($stub && ($options{title} || $options{h1})) { my $title = $options{title}; defined($title) && $title ne "" or $title = $options{h1}; if (defined($title) && $title ne "") { $title =~ s/&/&/g; $title =~ s/$title\n"; } } if ($options{show_styles}) { my $stylesheet = $g_style_sheet; $stylesheet =~ s/%\(base\)/$g_style_prefix/g; $out .= $stylesheet; } if ($stub) { $out .= "\n\n" . "
\n"; } $out; }; #### Process incoming text: ########################### my ($didhdr, $hdr, $result, $ftr) = (0, "", "", ""); @ARGV or push(@ARGV, "-"); foreach (@ARGV) { my ($fh, $contents, $oneresult); $_ eq "-" or open $fh, '<', $_ or fauxdie "could not open \"$_\": $!\n"; { local $/; # Slurp the whole file $_ eq "-" and $contents = ; $_ ne "-" and $contents = <$fh>; } defined($contents) or fauxdie "could not read \"$_\": $!\n"; $_ eq "-" or close($fh); $oneresult = $raw ? ProcessRaw($contents, \%options) : Markdown($contents, \%options); $oneresult =~ s/\s+$//os; if ($oneresult ne "") { if (!$didhdr && !$raw) { $hdr = &$hdrf(); $didhdr = 1; } $result .= $oneresult . "\n"; } } $hdr = &$hdrf() unless $didhdr || $raw; $ftr = "
\n\n\n" if $stub && !$raw; if ($options{xmlcheck} == 1) { my ($good, $errs); if ($stub && !$raw) { ($good, $errs) = _xmlcheck($hdr.$result.$ftr); } else { ($good, $errs) = _xmlcheck("
".$result."
"); } $good or die $errs; } print $hdr, $result, $ftr; exit 0; } sub _xmlcheck { my $text = shift; my ($good, $errs); ($hasxml ? eval { XML::Simple::XMLin($text, KeepRoot => 1) && 1 } : eval { my $p = XML::Parser->new(Style => 'Tree', ErrorContext => 1); $p->parse($text) && 1; }) and $good = 1 or $errs = _trimerr($@); ($good, $errs); } sub _trimerr { my $err = shift; 1 while $err =~ s{\s+at\s+\.?/[^,\s\n]+\sline\s+[0-9]+\.?(\n|$)}{$1}is; $err =~ s/\s+$//os; $err . "\n"; } sub _PrepareInput { my $input = shift; defined $input or $input = ""; { use bytes; $input =~ s/[\x00-\x08\x0B\x0E-\x1F\x7F]+//gso; } my $output; if (Encode::is_utf8($input) || utf8::decode($input)) { $output = $input; } else { $output = $encoder->decode($input, Encode::FB_DEFAULT); } # Standardize line endings: $output =~ s{\r\n}{\n}g; # DOS to Unix $output =~ s{\r}{\n}g; # Mac to Unix return $output; } sub ProcessRaw { my $text = _PrepareInput(shift); %opt = ( empty_element_suffix => $g_empty_element_suffix, ); my %args = (); if (ref($_[0]) eq "HASH") { %args = %{$_[0]}; } else { %args = @_; } while (my ($k,$v) = each %args) { $opt{$k} = $v; } $opt{xmlcheck} = 0 unless looks_like_number($opt{xmlcheck}); # Sanitize all '<'...'>' tags if requested $text = _SanitizeTags($text, $opt{xmlcheck} == 2) if $opt{sanitize}; utf8::encode($text); return $text; } sub Markdown { # # Primary function. The order in which other subs are called here is # essential. Link and image substitutions need to happen before # _EscapeSpecialChars(), so that any *'s or _'s in the # and tags get encoded. # my $text = _PrepareInput(shift); # Any remaining arguments after the first are options; either a single # hashref or a list of name, value paurs. %opt = ( # set initial defaults style_prefix => $g_style_prefix, empty_element_suffix => $g_empty_element_suffix, tab_width => $g_tab_width, indent_width => $g_indent_width, url_prefix => "", # Prefixed to non-absolute URLs img_prefix => "", # Prefixed to non-absolute image URLs ); my %args = (); if (ref($_[0]) eq "HASH") { %args = %{$_[0]}; } else { %args = @_; } while (my ($k,$v) = each %args) { $opt{$k} = $v; } $opt{xmlcheck} = 0 unless looks_like_number($opt{xmlcheck}); # Clear the globals. If we don't clear these, you get conflicts # from other articles when generating a page which contains more than # one article (e.g. an index page that shows the N most recent # articles): %g_urls = (); %g_titles = (); %g_anchors = (); %g_block_ids = (); %g_code_block_ids = (); %g_html_blocks = (); %g_code_blocks = (); $g_list_level = 0; # Make sure $text ends with a couple of newlines: $text .= "\n\n"; # Handle backticks-delimited code blocks $text = _HashBTCodeBlocks($text); # Convert all tabs to spaces. $text = _DeTab($text); # Strip any lines consisting only of spaces. # This makes subsequent regexen easier to write, because we can # match consecutive blank lines with /\n+/ instead of something # contorted like / *\n+/ . $text =~ s/^ +$//mg; # Turn block-level HTML blocks into hash entries $text = _HashHTMLBlocks($text); # Strip link definitions, store in hashes. $text = _StripLinkDefinitions($text); $text = _RunBlockGamut($text, 1); # Remove indentation markers $text =~ s/\027+//gs; # Unhashify code blocks $text =~ s/(\025\d+\026)/$g_code_blocks{$1}/g; $text = _UnescapeSpecialChars($text); $text .= "\n" unless $text eq ""; # Sanitize all '<'...'>' tags if requested $text = _SanitizeTags($text, $opt{xmlcheck} == 2) if $opt{sanitize}; utf8::encode($text); if (defined($opt{h1}) && $opt{h1} ne "" && ref($_[0]) eq "HASH") { utf8::encode($opt{h1}); ${$_[0]}{h1} = $opt{h1} } return $text; } sub _HashBTCodeBlocks { # # Process Markdown backticks (```) delimited code blocks # my $text = shift; my $less_than_indent = $opt{indent_width} - 1; $text =~ s{ (?:(?<=\n)|\A) ([ ]{0,$less_than_indent})``(`+)[ \t]*(?:([\w.+-]+[#]?)[ \t]*)?\n ( # $4 = the code block -- one or more lines, starting with ``` (?: .*\n+ )+? ) # and ending with ``` or end of document (?:(?:[ ]{0,$less_than_indent}``\2[ \t]*(?:\n|\Z))|\Z) }{ # $2 contains syntax highlighting to use if defined my $leadsp = length($1); my $codeblock = $4; $codeblock =~ s/[ \t]+$//mg; # trim trailing spaces on lines $codeblock = _DeTab($codeblock, 8, $leadsp); # physical tab stops are always 8 $codeblock =~ s/\A\n+//; # trim leading newlines $codeblock =~ s/\s+\z//; # trim trailing whitespace $codeblock = _EncodeCode($codeblock); # or run highlighter here $codeblock = "
"
		. $codeblock . "\n
"; my $key = block_id($codeblock); $g_html_blocks{$key} = $codeblock; "\n\n" . $key . "\n\n"; }egmx; return $text; } sub _StripLinkDefinitions { # # Strips link definitions from text, stores the URLs and titles in # hash references. # my $text = shift; my $less_than_indent = $opt{indent_width} - 1; # Link defs are in the form: ^[id]: url "optional title" while ($text =~ s{ ^[ ]{0,$less_than_indent}\[(.+)\]: # id = $1 [ ]* \n? # maybe *one* newline [ ]* ? # url = $2 [ ]* \n? # maybe one newline [ ]* (?: (?<=\s) # lookbehind for whitespace (?:(['"])|(\()) # title quote char (.+?) # title = $5 (?(4)\)|\3) # match same quote [ ]* )? # title is optional (?:\n+|\Z) } {}mx) { my $id = _strip(lc $1); # Link IDs are case-insensitive my $url = $2; my $title = _strip($5); $url =~ s/\\\n\s*//gs; if ($id ne "") { # These values always get passed through _MakeATag or _MakeIMGTag later $g_urls{$id} = $url; if (defined($title) && $title ne "") { $g_titles{$id} = $title; } } } return $text; } my ($block_tags_a, $block_tags_b); BEGIN { $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/o; $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/o; } sub _HashHTMLBlocks { my $text = shift; my $less_than_indent = $opt{indent_width} - 1; my $idt = "\027" x $g_list_level; # Hashify HTML blocks: # We only want to do this for block-level HTML tags, such as headers, # lists, and tables. That's because we still want to wrap

s around # "paragraphs" that are wrapped in non-block-level tags, such as anchors, # phrase emphasis, and spans. The list of tags we're looking for is # hard-coded: # First, look for nested blocks, e.g.: #

#
# tags for inner block must be indented. #
#
# # The outermost tags must start at the left margin for this to match, and # the inner nested divs must be indented. # We need to do this before the next, more liberal match, because the next # match will start at the first `
` and stop at the first `
`. $text =~ s{ ( # save in $1 ^ # start of line (with /m) ((?:\Q$idt\E)?) # optional lead in = $2 <($block_tags_a) # start tag = $3 \b # word break (?:.*\n)*? # any number of lines, minimally matching \2 # the matching end tag [ ]* # trailing spaces (?=\n+|\Z) # followed by a newline or end of document ) }{ my $key = block_id($1); $g_html_blocks{$key} = $1; "\n\n" . $key . "\n\n"; }egmx; # # Now match more liberally, simply from `\n` to `\n` # $text =~ s{ ( # save in $1 ^ # start of line (with /m) (?:\Q$idt\E)? # optional lead in <($block_tags_b) # start tag = $2 \b # word break (?:.*\n)*? # any number of lines, minimally matching .* # the matching end tag [ ]* # trailing spaces (?=\n+|\Z) # followed by a newline or end of document ) }{ my $key = block_id($1); $g_html_blocks{$key} = $1; "\n\n" . $key . "\n\n"; }egmx; # Special case just for
. It was easier to make a special case than # to make the other regex more complicated. $text =~ s{ (?: (?<=\n\n) # Starting after a blank line | # or \A\n? # the beginning of the doc ) ( # save in $1 [ ]{0,$less_than_indent} <(?:hr) # start tag \b # word break (?:[^<>])*? # /?> # the matching end tag [ ]* (?=\n{2,}|\Z) # followed by a blank line or end of document ) }{ my $key = block_id($1); $g_html_blocks{$key} = $1; "\n\n" . $key . "\n\n"; }egx; # Special case for standalone HTML comments: $text =~ s{ (?: (?<=\n\n) # Starting after a blank line | # or \A\n? # the beginning of the doc ) ( # save in $1 [ ]{0,$less_than_indent} (?s: ) [ ]* (?=\n{1,}|\Z) # followed by end of line or end of document ) }{ my $key = block_id($1); $g_html_blocks{$key} = $1; "\n\n" . $key . "\n\n"; }egx; return $text; } sub _RunBlockGamut { # # These are all the transformations that form block-level # tags like paragraphs, headers, and list items. # my ($text, $anchors) = @_; $text = _DoHeaders($text, $anchors); # Do Horizontal Rules: $text =~ s{^ {0,3}\*(?: {0,2}\*){2,}[ ]*$}{\n tags around block-level tags. $text = _HashHTMLBlocks($text); $text = _FormParagraphs($text); return $text; } sub _DoListBlocks { return _DoBlockQuotes(_DoCodeBlocks($_[0])) if $_[0] ne ""; } sub _RunSpanGamut { # # These are all the transformations that occur *within* block-level # tags like paragraphs, headers, and list items. # my $text = shift; $text = _DoCodeSpans($text); $text = _EscapeSpecialChars($text); # Process anchor and image tags. Images must come first, # because ![foo][f] looks like an anchor. $text = _DoImages($text); $text = _DoAnchors($text); # Make links out of things like `` # Must come after _DoAnchors(), because you can use < and > # delimiters in inline links like [this](). $text = _DoAutoLinks($text); $text = _EncodeAmpsAndAngles($text); $text = _DoItalicsAndBoldAndStrike($text); # Do hard breaks: $text =~ s/ {2,}\n/ or tags. # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!; foreach my $cur_token (@$tokens) { if ($cur_token->[0] eq "tag") { # Within tags, encode *, _ and ~ so they don't conflict # with their use in Markdown for italics and strong. # We're replacing each such character with its # corresponding block id value; this is likely # overkill, but it should prevent us from colliding # with the escape values by accident. $cur_token->[1] =~ s!([*_~])!$g_escape_table{$1}!g; $text .= $cur_token->[1]; } else { my $t = $cur_token->[1]; $t = _EncodeBackslashEscapes($t); $text .= $t; } } return $text; } sub _ProcessWikiLink { my ($link_text, $link_loc) = @_; if (defined($link_loc) && ($link_loc =~ m{^#\S*$} || $link_loc =~ m{^(?:http|ftp)s?://\S+$}i)) { # Return the new link return _MakeATag($link_loc, $link_text); } if (!defined($link_loc) && ($link_loc = _strip($link_text)) =~ m{^(?:http|ftp)s?://\S+$}i) { # Return the new link return _MakeATag($link_loc, $link_text); } return undef if $link_loc eq "" || $link_text eq ""; if ($link_loc =~ /^[A-Za-z][A-Za-z0-9+.-]*:/os) { # Unrecognized scheme return undef; } if ($opt{wikipat}) { my $o = $opt{wikiopt}; my $qsfrag = ""; my $base = $link_loc; if ($link_loc =~ /^(.*?)([?#].*)$/os) { ($base, $qsfrag) = ($1, $2); } $base = _wxform($base); my $result = $opt{wikipat}; $result =~ s/%\{\}/$base/; if ($qsfrag =~ /^([^#]*)(#.+)$/os) { my ($q,$f) = ($1,$2); #$f = _wxform($f) if $f =~ / /; $qsfrag = $q . $f; } $result .= $qsfrag; { use bytes; $result =~ s/%(?![0-9A-Fa-f]{2})/%25/sog; if ($o->{r}) { $result =~ s/([\x00-\x1F <>"{}|\\^`x7F])/sprintf("%%%02X",ord($1))/soge; } else { $result =~ s/([\x00-\x1F <>"{}|\\^`\x7F-\xFF])/sprintf("%%%02X",ord($1))/soge; } $result =~ s/(%(?![0-9A-F]{2})[0-9A-Fa-f]{2})/uc($1)/soge; } # Return the new link return _MakeATag($result, $link_text); } # leave it alone return undef; } sub _wxform { my $w = shift; my $o = $opt{wikiopt}; $w =~ s{[.][^./]*$}{} if $o->{s}; $w =~ tr{/}{ } if $o->{f}; $w =~ s{/+}{/}gos if !$o->{f} && !$o->{v}; if ($o->{d}) { $w =~ tr{ }{-}; $w =~ s/-+/-/gos unless $o->{v}; } else { $w =~ tr{ }{_}; $w =~ s/_+/_/gos unless $o->{v}; } $w = uc($w) if $o->{u}; $w = lc($w) if $o->{l}; return $w; } # Return a suitably encoded tag string # On input NONE of $url, $text or $title should be xmlencoded # but $url should already be url-encoded if needed, but NOT g_escape_table'd sub _MakeATag { my ($url, $text, $title) = @_; defined($url) or $url=""; defined($text) or $text=""; defined($title) or $title=""; my $result = $g_escape_table{'<'}."a href=\"" . _EncodeAttText($url) . "\""; $title = _strip($title); $text =~ s{<(/?a)}{<$1}sogi; $text = _DoItalicsAndBoldAndStrike($text); # We've got to encode any of these remaining to avoid # conflicting with other italics, bold and strike through. $text =~ s!([*_~])!$g_escape_table{$1}!g; $result .= " title=\"" . _EncodeAttText($title) . "\"" if $title ne ""; return $result . $g_escape_table{'>'} . $text . $g_escape_table{'<'}."/a".$g_escape_table{'>'}; } sub _DoAnchors { # # Turn Markdown link shortcuts into XHTML
tags. # my $text = shift; # # First, handle wiki-style links: [[wiki style link]] # $text =~ s{ ( # wrap whole match in $1 \[\[ ($g_nested_brackets) # link text and id = $2 \]\] ) }{ my $result; my $whole_match = $1; my $link_text = $2; my $link_loc = undef; if ($link_text =~ /^(.*)\|(.*)$/s) { $link_text = $1; $link_loc = _strip($2); } $result = _ProcessWikiLink($link_text, $link_loc); defined($result) or $result = $whole_match; $result; }xsge; # # Next, handle reference-style links: [link text] [id] # $text =~ s{ ( # wrap whole match in $1 \[ ($g_nested_brackets) # link text = $2 \] [ ]? # one optional space (?:\n[ ]*)? # one optional newline followed by spaces \[ ($g_nested_brackets) # id = $3 \] ) }{ my $result; my $whole_match = $1; my $link_text = $2; my $link_id = $3; $link_id ne "" or $link_id = $link_text; # for shortcut links like [this][]. $link_id = _strip(lc $link_id); if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) { my $url = $g_urls{$link_id}; $url = defined($url) ? _PrefixURL($url) : $g_anchors{$link_id}; $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/; $result = _MakeATag($url, $link_text, $g_titles{$link_id}); } else { $result = $whole_match; } $result; }xsge; # # Subsequently, inline-style links: [link text](url "optional title") # $text =~ s{ ( # wrap whole match in $1 \[ ($g_nested_brackets) # link text = $2 \] \( # literal paren ($g_nested_parens) # href and optional title = $3 \) ) }{ #my $result; my $whole_match = $1; my $link_text = $2; my ($url, $title) = _SplitUrlTitlePart($3); if (defined($url) && $url =~ /^#\S/) { # try very hard to find a match my $idbase = _strip(lc(substr($url, 1))); my $idbase0 = $idbase; my $id = _MakeAnchorId($idbase); if (defined($g_anchors_id{$id})) { $url = $g_anchors_id{$id}; } else { $idbase =~ s/-/_/gs; $id = _MakeAnchorId($idbase); if (defined($g_anchors_id{$id})) { $url = $g_anchors_id{$id}; } else { $id = _MakeAnchorId($idbase0, 1); if (defined($g_anchors_id{$id})) { $url = $g_anchors_id{$id}; } else { $id = _MakeAnchorId($idbase, 1); if (defined($g_anchors_id{$id})) { $url = $g_anchors_id{$id}; } } } } } if (defined($url)) { $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/; _MakeATag(_PrefixURL($url), $link_text, $title); } else { # The href/title part didn't match the pattern $whole_match; } }xsge; # # Finally, handle reference-style implicit shortcut links: [link text] # $text =~ s{ ( # wrap whole match in $1 \[ ($g_nested_brackets) # link text = $2 \] ) }{ my $result; my $whole_match = $1; my $link_text = $2; my $link_id = _strip(lc $2); if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) { my $url = $g_urls{$link_id}; $url = defined($url) ? _PrefixURL($url) : $g_anchors{$link_id}; $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/; $result = _MakeATag($url, $link_text, $g_titles{$link_id}); } else { $result = $whole_match; } $result; }xsge; return $text; } sub _PeelWrapped { defined($_[0]) or return undef; if (substr($_[0],0,1) eq "(") { return substr($_[0], 1, length($_[0]) - (substr($_[0], -1, 1) eq ")" ? 2 : 1)); } return $_[0]; } sub _SplitUrlTitlePart { return ("", undef) if $_[0] =~ m{^\s*$}; # explicitly allowed my $u = $_[0]; $u =~ s/^\s*(['\042])/# $1/; if ($u =~ m{ ^ # match beginning \s*? ? # URL = $1 (?: # optional grouping \s+ # must be distinct from URL (['\042]?) # quote char = $2 (.*?) # Title = $3 \2? # matching quote )? # title is optional \s* \z # match end }osx) { return (undef, undef) if $_[1] && ($1 eq "" || $1 eq "#"); return (_PeelWrapped($1), $2 ? $3 : _PeelWrapped($3)); } else { return (undef, undef); } } # Return a suitably encoded tag string # On input NONE of $url, $alt or $title should be xmlencoded # but $url should already be url-encoded if needed, but NOT g_escape_table'd sub _MakeIMGTag { my ($url, $alt, $title) = @_; defined($url) or $url=""; defined($alt) or $alt=""; defined($title) or $title=""; return "" unless $url ne ""; my $result = $g_escape_table{'<'}."img src=\"" . _EncodeAttText($url) . "\""; my ($w, $h) = (0, 0); ($alt, $title) = (_strip($alt), _strip($title)); if ($title =~ /^(.*)\(([1-9][0-9]*)[xX]([1-9][0-9]*)\)$/os) { ($title, $w, $h) = (_strip($1), $2, $3); } elsif ($title =~ /^(.*)\(\?[xX]([1-9][0-9]*)\)$/os) { ($title, $h) = (_strip($1), $2); } elsif ($title =~ /^(.*)\(([1-9][0-9]*)[xX]\?\)$/os) { ($title, $w) = (_strip($1), $2); } $result .= " alt=\"" . _EncodeAttText($alt) . "\"" if $alt ne ""; $result .= " width=\"$w\"" if $w != 0; $result .= " height=\"$h\"" if $h != 0; $result .= " title=\"" . _EncodeAttText($title) . "\"" if $title ne ""; $result .= " /" unless $opt{empty_element_suffix} eq ">"; $result .= $g_escape_table{'>'}; return $result; } sub _DoImages { # # Turn Markdown image shortcuts into tags. # my $text = shift; # # First, handle reference-style labeled images: ![alt text][id] # $text =~ s{ ( # wrap whole match in $1 !\[ ($g_nested_brackets) # alt text = $2 \] [ ]? # one optional space (?:\n[ ]*)? # one optional newline followed by spaces \[ ($g_nested_brackets) # id = $3 \] ) }{ my $result; my $whole_match = $1; my $alt_text = $2; my $link_id = $3; $link_id ne "" or $link_id = $alt_text; # for shortcut links like ![this][]. $link_id = _strip(lc $link_id); if (defined $g_urls{$link_id}) { $result = _MakeIMGTag( _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id}); } else { # If there's no such link ID, leave intact: $result = $whole_match; } $result; }xsge; # # Next, handle inline images: ![alt text](url "optional title") # Don't forget: encode * and _ $text =~ s{ ( # wrap whole match in $1 !\[ ($g_nested_brackets) # alt text = $2 \] \( # literal paren ($g_nested_parens) # src and optional title = $3 \) ) }{ my $whole_match = $1; my $alt_text = $2; my ($url, $title) = _SplitUrlTitlePart($3, 1); defined($url) ? _MakeIMGTag(_PrefixURL($url), $alt_text, $title) : $whole_match; }xsge; # # Finally, handle reference-style implicitly labeled links: ![alt text] # $text =~ s{ ( # wrap whole match in $1 !\[ ($g_nested_brackets) # alt text = $2 \] ) }{ my $result; my $whole_match = $1; my $alt_text = $2; my $link_id = lc(_strip($alt_text)); if (defined $g_urls{$link_id}) { $result = _MakeIMGTag( _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id}); } else { # If there's no such link ID, leave intact: $result = $whole_match; } $result; }xsge; return $text; } sub _EncodeAttText { my $text = shift; defined($text) or return undef; $text = _HTMLEncode(_strip($text)); # We've got to encode these to avoid conflicting # with italics, bold and strike through. $text =~ s!([*_~:])!$g_escape_table{$1}!g; return $text; } sub _MakeAnchorId { use bytes; my ($link, $strip) = @_; $link = lc($link); if ($strip) { $link =~ s/\s+/_/gs; $link =~ tr/-a-z0-9_//cd; } else { $link =~ tr/-a-z0-9_/_/cs; } return '' unless $link ne ''; $link = "_".$link."_"; $link =~ s/__+/_/gs; $link = "_".md5_hex($link)."_" if length($link) > 66; return $link; } sub _GetNewAnchorId { my $link = _strip(lc(shift)); return '' if $link eq "" || defined($g_anchors{$link}); my $id = _MakeAnchorId($link); return '' unless $id; $g_anchors{$link} = '#'.$id; $g_anchors_id{$id} = $g_anchors{$link}; if ($id =~ /-/) { my $id2 = $id; $id2 =~ s/-/_/gs; $id2 =~ s/__+/_/gs; defined($g_anchors_id{$id2}) or $g_anchors_id{$id2} = $g_anchors{$link}; } my $idd = _MakeAnchorId($link, 1); if ($idd) { defined($g_anchors_id{$idd}) or $g_anchors_id{$idd} = $g_anchors{$link}; if ($idd =~ /-/) { my $idd2 = $idd; $idd2 =~ s/-/_/gs; $idd2 =~ s/__+/_/gs; defined($g_anchors_id{$idd2}) or $g_anchors_id{$idd2} = $g_anchors{$link}; } } $id; } sub _DoHeaders { my ($text, $anchors) = @_; my $h1; my $geth1 = $anchors && !defined($opt{h1}) ? sub { return unless !defined($h1); my $h = shift; $h =~ s/^\s+//; $h =~ s/\s+$//; $h =~ s/\s+/ /g; $h1 = $h if $h ne ""; } : sub {}; # atx-style headers: # # Header 1 # ## Header 2 # ## Header 2 with closing hashes ## # ... # ###### Header 6 # $text =~ s{ ^(\#{1,6}) # $1 = string of #'s [ ]* ((?:(?:(?" . _RunSpanGamut($h) . "\n\n"; }egmx; # Setext-style headers: # Header 1 # ======== # # Header 2 # -------- # # Header 3 # ~~~~~~~~ # $text =~ s{ ^(?:=+[ ]*\n)?[ ]*(.+?)[ ]*\n=+[ ]*\n+ }{ my $h = $1; my $id = _GetNewAnchorId($h); &$geth1($h); $id = " id=\"$id\"" if $id ne ""; "" . _RunSpanGamut($h) . "\n\n"; }egmx; $text =~ s{ ^(?:-+[ ]*\n)?[ ]*(.+?)[ ]*\n-+[ ]*\n+ }{ my $h = $1; my $id = _GetNewAnchorId($h); $id = " id=\"$id\"" if $id ne ""; "" . _RunSpanGamut($h) . "\n\n"; }egmx; $text =~ s{ ^(?:~+[ ]*\n)?[ ]*(.+?)[ ]*\n~+[ ]*\n+ }{ my $h = $1; my $id = _GetNewAnchorId($h); $id = " id=\"$id\"" if $id ne ""; "" . _RunSpanGamut($h) . "\n\n"; }egmx; $opt{h1} = $h1 if defined($h1) && $h1 ne ""; return $text; } my ($marker_ul, $marker_ol, $marker_any, $roman_numeral, $greek_lower); BEGIN { # Re-usable patterns to match list item bullets and number markers: $roman_numeral = qr/(?: [IiVvXx]|[Ii]{2,3}|[Ii][VvXx]|[VvXx][Ii]{1,3}|[Xx][Vv][Ii]{0,3}| [Xx][Ii][VvXx]|[Xx]{2}[Ii]{0,3}|[Xx]{2}[Ii]?[Vv]|[Xx]{2}[Vv][Ii]{1,2})/ox; $greek_lower = qr/(?:[\x{03b1}-\x{03c9}])/o; $marker_ul = qr/[*+-]/o; $marker_ol = qr/(?:\d+|[A-Za-z]|$roman_numeral|$greek_lower)[.\)]/o; $marker_any = qr/(?:$marker_ul|$marker_ol)/o; } sub _GetListMarkerType { my ($list_type, $list_marker, $last_marker) = @_; return "" unless $list_type && $list_marker && lc($list_type) eq "ol"; my $last_marker_type = ''; $last_marker_type = _GetListMarkerType($list_type, $last_marker) if defined($last_marker) && # these are roman unless $last_marker type case matches and is 'a' or 'A' $list_marker =~ /^[IiVvXx][.\)]?$/; return "I" if $list_marker =~ /^[IVX]/ && $last_marker_type ne 'A'; return "i" if $list_marker =~ /^[ivx]/ && $last_marker_type ne 'a'; return "A" if $list_marker =~ /^[A-Z]/; return "a" if $list_marker =~ /^[a-z]/ || $list_marker =~ /^$greek_lower/o; return "1"; } sub _GetListItemTypeClass { my ($list_type, $list_marker, $last_marker) = @_; my $list_marker_type = _GetListMarkerType($list_type, $list_marker, $last_marker); my $ans = &{sub{ return "" unless length($list_marker) >= 2 && $list_marker_type =~ /^[IiAa1]$/; return "lower-greek" if $list_marker_type eq "a" && $list_marker =~ /^$greek_lower/o; return "" unless $list_marker =~ /\)$/; return "upper-roman" if $list_marker_type eq "I"; return "lower-roman" if $list_marker_type eq "i"; return "upper-alpha" if $list_marker_type eq "A"; return "lower-alpha" if $list_marker_type eq "a"; return "decimal"; }}; return ($list_marker_type, $ans); } my %_roman_number_table; BEGIN { %_roman_number_table = ( i => 1, ii => 2, iii => 3, iv => 4, v => 5, vi => 6, vii => 7, viii => 8, ix => 9, x => 10, xi => 11, xii => 12, xiii => 13, xiv => 14, xv => 15, xvi => 16, xvii => 17, xviii => 18, xix => 19, xx => 20, xxi => 21, xxii => 22, xxiii => 23, xxiv => 24, xxv => 25, xxvi => 26, xxvii => 27 ); } # Necessary because ς and σ are the same value grrr my %_greek_number_table; BEGIN { %_greek_number_table = ( "\x{03b1}" => 1, # α "\x{03b2}" => 2, # β "\x{03b3}" => 3, # γ "\x{03b4}" => 4, # δ "\x{03b5}" => 5, # ε "\x{03b6}" => 6, # ζ "\x{03b7}" => 7, # η "\x{03b8}" => 8, # θ "\x{03b9}" => 9, # ι "\x{03ba}" => 10, # κ "\x{03bb}" => 11, # λ #"\x{00b5}"=> 12, # µ is "micro" not "mu" "\x{03bc}" => 12, # μ "\x{03bd}" => 13, # ν "\x{03be}" => 14, # ξ "\x{03bf}" => 15, # ο "\x{03c0}" => 16, # π "\x{03c1}" => 17, # ρ "\x{03c2}" => 18, # ς "\x{03c3}" => 18, # σ "\x{03c4}" => 19, # τ "\x{03c5}" => 20, # υ "\x{03c6}" => 21, # φ "\x{03c7}" => 22, # χ "\x{03c8}" => 23, # ψ "\x{03c9}" => 24 # ω ); } sub _GetMarkerIntegerNum { my ($list_marker_type, $marker_val) = @_; my $ans = &{sub{ return 0 + $marker_val if $list_marker_type eq "1"; $list_marker_type = lc($list_marker_type); return $_greek_number_table{$marker_val} if $list_marker_type eq "a" && defined($_greek_number_table{$marker_val}); $marker_val = lc($marker_val); return ord($marker_val) - ord("a") + 1 if $list_marker_type eq "a"; return 1 unless $list_marker_type eq "i"; defined($_roman_number_table{$marker_val}) and return $_roman_number_table{$marker_val}; return 1; }}; return $ans if $ans == 0 && $list_marker_type eq "1"; return $ans >= 1 ? $ans : 1; } sub _IncrList { my ($from, $to, $extra) = @_; $extra = defined($extra) ? " $extra" : ""; my $result = ""; while ($from + 10 <= $to) { $result .= "\n"; $from += 10; } while ($from + 5 <= $to) { $result .= "\n"; $from += 5; } while ($from + 2 <= $to) { $result .= "\n"; $from += 2; } while ($from < $to) { $result .= "\n"; ++$from; } return $result; } sub _DoListsAndBlocks { # # Form HTML ordered (numbered) and unordered (bulleted) lists. # my $text = shift; my $indent = $opt{indent_width}; my $less_than_indent = $indent - 1; my $less_than_double_indent = 2 * $indent - 1; # Re-usable pattern to match any entire ul or ol list: my $whole_list = qr{ ( # $1 (or $_[0]) = whole list ( # $2 (or $_[1]) (?:(?<=\n)|\A) [ ]{0,$less_than_indent} (${marker_any}) # $3 (or $_[2]) = first list item marker [ ]+ ) (?s:.+?) ( # $4 (or $_[3]) \z | \n{2,} (?=\S) (?! # Negative lookahead for another list item marker ${marker_any}[ ] ) ) ) }mx; my $list_item_sub = sub { my $list = $_[0]; my $list_type = ($_[2] =~ m/$marker_ul/) ? "ul" : "ol"; my $list_att = ""; my $list_class = ""; my $list_incr = ""; # Turn double returns into triple returns, so that we can make a # paragraph for the last item in a list, if necessary: $list =~ s/\n\n/\n\n\n/g; my ($result, $first_marker, $fancy) = _ProcessListItems($list_type, $list); defined($first_marker) or return $list; my $list_marker_type = _GetListMarkerType($list_type, $first_marker); if ($list_marker_type) { $first_marker =~ s/[.\)]$//; my $first_marker_num = _GetMarkerIntegerNum($list_marker_type, $first_marker); $list_att = $list_marker_type eq "1" ? "" : " type=\"$list_marker_type\""; if ($fancy) { $list_class = " class=\"$opt{style_prefix}ol\""; my $start = $first_marker_num; $start = 10 if $start > 10; $start = 5 if $start > 5 && $start < 10; $start = 1 if $start > 1 && $start < 5; $list_att .= " start=\"$start\"" unless $start == 1; $list_incr = _IncrList($start, $first_marker_num); } else { $list_class = " class=\"$opt{style_prefix}lc-greek\"" if $list_marker_type eq "a" && $first_marker =~ /^$greek_lower/o; $list_att .= " start=\"$first_marker_num\"" unless $first_marker_num == 1; } } my $idt = "\027" x $g_list_level; $result = "$idt<$list_type$list_att$list_class>\n$list_incr" . $result . "$idt\n\n"; $result; }; # We use a different prefix before nested lists than top-level lists. # See extended comment in _ProcessListItems(). # # Note: (jg) There's a bit of duplication here. My original implementation # created a scalar regex pattern as the conditional result of the test on # $g_list_level, and then only ran the $text =~ s{...}{...}egmx # substitution once, using the scalar as the pattern. This worked, # everywhere except when running under MT on my hosting account at Pair # Networks. There, this caused all rebuilds to be killed by the reaper (or # perhaps they crashed, but that seems incredibly unlikely given that the # same script on the same server ran fine *except* under MT. I've spent # more time trying to figure out why this is happening than I'd like to # admit. My only guess, backed up by the fact that this workaround works, # is that Perl optimizes the substition when it can figure out that the # pattern will never change, and when this optimization isn't on, we run # afoul of the reaper. Thus, the slightly redundant code to that uses two # static s/// patterns rather than one conditional pattern. # # Note: (kjm) With the addition of the two-of-the-same-kind-in-a-row- # starts-a-list-at-the-top-level rule the two patterns really are somewhat # different now, but the duplication has pretty much been eliminated via # use of a separate sub which has the side-effect of making the below # two cases much easier to grok all at once. if ($g_list_level) { my $parse = $text; $text = ""; pos($parse) = 0; while ($parse =~ /\G(?s:.)*?^$whole_list/gmc) { my @captures = ($1, $2, $3, $4); if ($-[1] > $-[0]) { $text .= _DoListBlocks(substr($parse, $-[0], $-[1] - $-[0])); } $text .= &$list_item_sub(@captures); } $text .= _DoListBlocks(substr($parse, pos($parse))) if pos($parse) < length($parse); } else { my $parse = $text; $text = ""; pos($parse) = 0; while ($parse =~ m{\G(?s:.)*? (?: (?<=\n\n) | \A\n? | (?<=:\n) | (?:(?<=\n) # a list starts with one unordered marker line (?=[ ]{0,$less_than_indent}$marker_ul[ ])) | (?:(?<=\n) # or two ordered marker lines in a row (?=[ ]{0,$less_than_indent}$marker_ol[ ].*\n\n? [ ]{0,$less_than_indent}$marker_ol[ ])) | (?:(?<=\n) # or any marker and a sublist marker (?=[ ]{0,$less_than_indent}$marker_any[ ].*\n\n? [ ]{$indent,$less_than_double_indent}$marker_any[ ])) ) $whole_list }gmcx) { my @captures = ($1, $2, $3, $4); if ($-[1] > $-[0]) { $text .= _DoListBlocks(substr($parse, $-[0], $-[1] - $-[0])); } $text .= &$list_item_sub(@captures); } $text .= _DoListBlocks(substr($parse, pos($parse))) if pos($parse) < length($parse); } return $text; } sub _ProcessListItems { # # Process the contents of a single ordered or unordered list, splitting it # into individual list items. # my $list_type = shift; my $list_str = shift; # The $g_list_level global keeps track of when we're inside a list. # Each time we enter a list, we increment it; when we leave a list, # we decrement. If it's zero, we're not in a list anymore. # # We do this because when we're not inside a list, we want to treat # something like this: # # I recommend upgrading to version # 8. Oops, now this line is treated # as a sub-list. # # As a single paragraph, despite the fact that the second line starts # with a digit-period-space sequence. # # Whereas when we're inside a list (or sub-list), that line will be # treated as the start of a sub-list. What a kludge, huh? This is # an aspect of Markdown's syntax that's hard to parse perfectly # without resorting to mind-reading. Perhaps the solution is to # change the syntax rules such that sub-lists must start with a # starting cardinal number; e.g. "1." or "a.". $g_list_level++; my $idt = "\027" x $g_list_level; my $marker_kind = $list_type eq "ul" ? $marker_ul : $marker_ol; my $first_marker; my $first_marker_type; my $first_marker_num; my $last_marker; my $fancy; my $skipped; my $typechanged; my $next_num = 1; # trim trailing blank lines: $list_str =~ s/\n{2,}\z/\n/; my $result = ""; my $oldpos = 0; pos($list_str) = 0; while ($list_str =~ m{\G # start where we left off (\n+)? # leading line = $1 (^[ ]*) # leading whitespace = $2 ($marker_any) [ ] ([ ]*) # list marker = $3 leading item space = $4 }cgmx) { my $leading_line = $1; my $leading_space = $2; my $list_marker = $3; my $list_marker_len = length($list_marker); my $leading_item_space = $4; if ($-[0] > $oldpos) { $result .= substr($list_str, $oldpos, $-[0] - $oldpos); # Sort-of $` $oldpos = $-[0]; # point at start of this entire match } if (!defined($first_marker)) { $first_marker = $list_marker; $first_marker_type = _GetListMarkerType($list_type, $first_marker); if ($first_marker_type) { (my $marker_val = $first_marker) =~ s/[.\)]$//; $first_marker_num = _GetMarkerIntegerNum($first_marker_type, $marker_val); $next_num = $first_marker_num; $skipped = 1 if $next_num != 1; } } elsif ($list_marker !~ /$marker_kind/) { # Wrong marker kind, "fix up" the marker to a correct "lazy" marker # But keep the old length in $list_marker_len $list_marker = $last_marker; } # Now grab the rest of this item's data upto but excluding the next # list marker at the SAME indent level, but sublists must be INCLUDED my $item = ""; while ($list_str =~ m{\G ((?:.+?)(?:\n{1,2})) # list item text = $1 (?= \n* (?: \z | # end of string OR (^[ ]*) # leading whitespace = $2 ($marker_any) # next list marker = $3 ([ ]+) )) # one or more spaces after marker = $4 }cgmxs) { # If $3 has a left edge that is at the left edge of the previous # marker OR $3 has a right edge that is at the right edge of the # previous marker then we stop; otherwise we go on $item .= substr($list_str, $-[0], $+[0] - $-[0]); # $& last if !defined($4) || length($2) == length($leading_space) || length($2) + length($3) == length($leading_space) + $list_marker_len; # move along, you're not the marker droid we're looking for... $item .= substr($list_str, $+[0], $+[4] - $+[0]); pos($list_str) = $+[4]; # ...move along over the marker droid } # Remember where we parked $oldpos = pos($list_str); # Process the $list_marker $item my $liatt = ''; my $checkbox = ''; my $incr = ''; if ($list_type eq "ul" && !$leading_item_space && $item =~ /^\[([ xX])\] +(.*)$/s) { my $checkmark = lc $1; $item = $2; my ($checkbox_class, $checkbox_val); if ($checkmark eq "x") { ($checkbox_class, $checkbox_val) = ("checkbox-on", "x"); } else { ($checkbox_class, $checkbox_val) = ("checkbox-off", " "); } $liatt = " class=\"$opt{style_prefix}$checkbox_class\""; $checkbox = "[$checkbox_val"; } else { my $list_marker_type; ($list_marker_type, $liatt) = _GetListItemTypeClass($list_type, $list_marker, $last_marker); if ($list_type eq "ol" && defined($first_marker)) { my $styled = $fancy = 1 if $liatt && $list_marker =~ /\)$/; my ($sfx, $dash) = ("", ""); ($sfx, $dash) = ("li", "-") if $styled; if ($liatt =~ /lower/) { $sfx .= "${dash}lc"; } elsif ($liatt =~ /upper/) { $sfx .= "${dash}uc"; } $sfx .= "-greek" if $liatt =~ /greek/; $liatt = " class=\"$opt{style_prefix}$sfx\"" if $sfx; $typechanged = 1 if $list_marker_type ne $first_marker_type; (my $marker_val = $list_marker) =~ s/[.\)]$//; my $marker_num = _GetMarkerIntegerNum($list_marker_type, $marker_val); $marker_num = $next_num if $marker_num < $next_num; $skipped = 1 if $next_num < $marker_num; $incr = _IncrList($next_num, $marker_num, "incrlevel=$g_list_level"); $liatt = " value=\"$marker_num\"$liatt" if $fancy || $skipped; $liatt = " type=\"$list_marker_type\"$liatt" if $styled || $typechanged; $next_num = $marker_num + 1; } } $last_marker = $list_marker; if ($leading_line or ($item =~ m/\n{2,}/)) { $item = _RunBlockGamut(_Outdent($item)); $item =~ s{()\s*\z}{$1} and $item .= "\n$idt "; } else { # Recursion for sub-lists: $item = _DoListsAndBlocks(_Outdent($item)); chomp $item; $item = _RunSpanGamut($item); } # Append to $result $result .= "$incr$idt" . $checkbox . $item . "$idt\n"; } if ($fancy) { # remove "incrlevel=$g_list_level " parts $result =~ s{} {$idt}g; } else { # remove the $g_list_level incr spans entirely $result =~ s{\n}{}g; # remove the class="$opt{style_prefix}lc-greek" if first_marker is greek $result =~ s{(]*?) class="$opt{style_prefix}lc-greek">}{$1>}g if defined($first_marker_type) && $first_marker_type eq "a" && $first_marker =~ /^$greek_lower/o; } # Anything left over (similar to $') goes into result, but this should always be empty $result .= _RunBlockGamut(substr($list_str, pos($list_str))) if pos($list_str) < length($list_str); $g_list_level--; # After all that, if we only got an ordered list with a single item # and its first marker is a four-digit number >= 1492 and <= 2999 # or an UPPERCASE letter, then pretend we didn't see any list at all. if ($first_marker_type && $first_marker_num + 1 == $next_num) { if (($first_marker_type eq "1" && $first_marker_num >= 1492 && $first_marker_num <= 2999) || ($first_marker_type eq "A" && !$fancy)) { return (undef, undef, undef); } } return ($result, $first_marker, $fancy); } sub _DoCodeBlocks { # # Process Markdown `
` blocks.
#

    my $text = shift;

    $text =~ s{
	    (?:\n\n|\A\n?)
	    (		# $1 = the code block -- one or more lines, starting with indent_width spaces
	      (?:
		(?:[ ]{$opt{indent_width}})  # Lines must start with indent_width of spaces
		.*\n+
	      )+
	    )
	    ((?=^[ ]{0,$opt{indent_width}}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
	}{
	    my $codeblock = $1;

	    $codeblock =~ s/\n\n\n/\n\n/g; # undo "paragraph for last list item" change
	    $codeblock = _EncodeCode(_Outdent($codeblock));
	    $codeblock =~ s/\A\n+//; # trim leading newlines
	    $codeblock =~ s/\s+\z//; # trim trailing whitespace

	    my $result = "
"
		. $codeblock . "\n
"; my $key = block_id($result, 2); $g_code_blocks{$key} = $result; "\n\n" . $key . "\n\n"; }egmx; return $text; } sub _DoCodeSpans { # # * Backtick quotes are used for spans. # # * You can use multiple backticks as the delimiters if you want to # include literal backticks in the code span. So, this input: # # Just type ``foo `bar` baz`` at the prompt. # # Will translate to: # #

Just type foo `bar` baz at the prompt.

# # There's no arbitrary limit to the number of backticks you # can use as delimters. If you need three consecutive backticks # in your code, use four for delimiters, etc. # # * You can use spaces to get literal backticks at the edges: # # ... type `` `bar` `` ... # # Turns to: # # ... type `bar` ... # my $text = shift; $text =~ s@ (`+) # $1 = Opening run of ` (.+?) # $2 = The code block (?$c
"; @egsx; return $text; } sub _EncodeCode { # # Encode/escape certain characters inside Markdown code runs. # The point is that in code, these characters are literals, # and lose their special Markdown meanings. # local $_ = shift; # Encode all ampersands; HTML entities are not # entities within a Markdown code span. s/&/&/g; # Encode $'s, but only if we're running under Blosxom. # (Blosxom interpolates Perl variables in article bodies.) s/\$/$/g if $_haveBX; # Do the angle bracket song and dance: s! < !<!gx; s! > !>!gx; # Now, escape characters that are magic in Markdown: s!([*_~{}\[\]\\])!$g_escape_table{$1}!g; return $_; } sub _DoItalicsAndBoldAndStrike { my $text = shift; my $doital1 = sub { my $text = shift; $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* } {$1}gsx; # We've got to encode any of these remaining to # avoid conflicting with other italics and bold. $text =~ s!([*])!$g_escape_table{$1}!g; $text; }; my $doital2 = sub { my $text = shift; $text =~ s{ (?$1}gsx; # We've got to encode any of these remaining to # avoid conflicting with other italics and bold. $text =~ s!([_])!$g_escape_table{$1}!g; $text; }; # must go first: $text =~ s{ \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* } {"".&$doital1($1).""}gsex; $text =~ s{ (?".&$doital2($1).""}gsex; $text =~ s{ ~~ (?=\S) (.+?[*_]*) (?<=\S) ~~ } {$1}gsx; $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* } {$1}gsx; $text =~ s{ (?$1}gsx; return $text; } sub _DoBlockQuotes { my $text = shift; $text =~ s{ ( # Wrap whole match in $1 ( ^[ ]*>[ ]? # '>' at the start of a line .*\n # rest of the first line (.+\n)* # subsequent consecutive lines \n* # blanks )+ ) }{ my $bq = $1; $bq =~ s/^[ ]*>[ ]?//gm; # trim one level of quoting $bq =~ s/^[ ]+$//mg; # trim whitespace-only lines $bq = _RunBlockGamut($bq); # recurse $bq =~ s/^/\027/mg; "
\n$bq\n
\n\n"; }egmx; return $text; } my ($LEAD, $TRAIL, $LEADBAR, $LEADSP, $COLPL, $SEP); BEGIN { $LEAD = qr/(?>[ ]*(?:\|[ ]*)?)/o; $TRAIL = qr/[ ]*(?[ ]*\|[ ]*)/o; $LEADSP = qr/(?>[ ]*)/o; $COLPL = qr/(?:[^\n|\\]|\\(?:(?>[^\n])|(?=\n|$)))+/o; $SEP = qr/[ ]*:?-+:?[ ]*/o; } sub _DoTables { my $text = shift; $text =~ s{ ( # Wrap whole thing to avoid $& (?: (?<=\n\n) | \A\n? ) # Preceded by blank line or beginning of string ^( # Header line $LEADBAR \| [^\n]* | $LEADBAR $COLPL [^\n]* | $LEADSP $COLPL \| [^\n]* )\n ( # Separator line $LEADBAR $SEP (?: \| $SEP )* (?: \| [ ]*)? | $SEP (?: \| $SEP )+ (?: \| [ ]*)? | $SEP \| [ ]* )\n ((?: # Rows (0+) $LEADBAR \| [^\n]* \n | $LEADBAR $COLPL [^\n]* \n | $LEADSP $COLPL \| [^\n]* \n )*) ) } { my ($w, $h, $s, $rows) = ($1, $2, $3, $4); my @heads = _SplitTableRow($h); my @seps = _SplitTableRow($s); if (@heads == @seps) { my @align = map { if (/^:-+:$/) {" align=\"center\""} elsif (/^:/) {" align=\"left\""} elsif (/:$/) {" align=\"right\""} else {""} } @seps; my $nohdr = ""; $nohdr = " $opt{style_prefix}table-nohdr" if join("", @heads) eq ""; my $tab ="\n\n"; $tab .= " " . _MakeTableRow("th", \@align, @heads) . "\n" unless $nohdr; my $cnt = 0; my @classes = ("class=\"$opt{style_prefix}row-even\"", "class=\"$opt{style_prefix}row-odd\""); $tab .= " " . _MakeTableRow("td", \@align, @$_) . "\n" foreach (_SplitMergeRows($rows)); $tab .= "
\n\n"; } else { $w; } }egmx; return $text; } sub _SplitMergeRows { my @rows = (); my ($mergeprev, $mergenext) = (0,0); foreach (split(/\n/, $_[0])) { $mergeprev = $mergenext; $mergenext = 0; my @cols = _SplitTableRow($_); if (_endswithbareslash($cols[$#cols])) { my $last = $cols[$#cols]; substr($last, -1, 1) = ""; $last =~ s/[ ]+$//; $cols[$#cols] = $last; $mergenext = 1; } if ($mergeprev) { for (my $i = 0; $i <= $#cols; ++$i) { my $cell = $rows[$#rows]->[$i]; defined($cell) or $cell = ""; $rows[$#rows]->[$i] = _MergeCells($cell, $cols[$i]); } } else { push(@rows, [@cols]); } } return @rows; } sub _endswithbareslash { return 0 unless substr($_[0], -1, 1) eq "\\"; my @parts = split(/\\\\/, $_[0], -1); return substr($parts[$#parts], -1, 1) eq "\\"; } sub _MergeCells { my ($c1, $c2) = @_; return $c1 if $c2 eq ""; return $c2 if $c1 eq ""; return $c1 . " " . $c2; } sub _SplitTableRow { my $row = shift; $row =~ s/^$LEAD//; $row =~ s/$TRAIL$//; $row =~ s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first. $row =~ s!\\\|!$g_escape_table{'|'}!go; # Then do \| my @elems = map { s!$g_escape_table{'|'}!|!go; s!$g_escape_table{'\\'}!\\\\!go; s/^[ ]+//; s/[ ]+$//; $_; } split(/[ ]*\|[ ]*/, $row, -1); @elems or push(@elems, ""); return @elems; } sub _MakeTableRow { my $etype = shift; my $align = shift; my $row = ""; for (my $i = 0; $i < @$align; ++$i) { my $data = $_[$i]; defined($data) or $data = ""; $row .= "<" . $etype . $$align[$i] . ">" . _RunSpanGamut($data) . ""; } return $row; } sub _FormParagraphs { # # Params: # $text - string to process with html

tags # my $text = shift; # Strip leading and trailing lines: $text =~ s/\A\n+//; $text =~ s/\n+\z//; my @grafs = split(/\n{2,}/, $text); # # Wrap

tags. # foreach (@grafs) { unless (defined($g_html_blocks{$_}) || defined($g_code_blocks{$_})) { $_ = _RunSpanGamut($_); s/^([ ]*)/

/; $_ .= "

"; } } # # Unhashify HTML blocks # foreach (@grafs) { if (defined( $g_html_blocks{$_} )) { $_ = $g_html_blocks{$_}; } } return join "\n\n", @grafs; } my $g_possible_tag_name; my %ok_tag_name; BEGIN { # note: length("blockquote") == 10 $g_possible_tag_name = qr/(?i:[a-z]{1,10}|h[1-6])/o; %ok_tag_name = map({$_ => 1} qw( a abbr acronym address area b basefont bdo big blockquote br caption center cite code col colgroup dd del dfn div dl dt em font h1 h2 h3 h4 h5 h6 hr i img ins kbd li map ol p pre q s samp small span strike strong sub sup table tbody td tfoot th thead tr tt u ul var )); $ok_tag_name{$_} = 0 foreach (qw( dir menu )); } sub _SetAllowedTag { my ($tag, $forbid) = @_; $ok_tag_name{$tag} = $forbid ? 0 : 1 if defined($tag) && exists($ok_tag_name{$tag}); } # Encode leading '<' of any non-tags # However, "]|/>$)} || $tag =~ m{^}) && $ok_tag_name{lc($1)}) { return _ProcessURLTag("href", $tag) if $tag =~ /^ 1} qw(class dir id lang style title xml:lang)); %tagatt = ( 'a' => { map({$_ => 1} qw(href name)) }, 'area' => { map({$_ => 1} qw(alt coords href nohref shape)) }, 'basefont' => { map({$_ => 1} qw(color face size)) }, 'br' => { map({$_ => 1} qw(clear)) }, 'caption' => { map({$_ => 1} qw(align)) }, 'col' => { map({$_ => 1} qw(align span width valign)) }, 'colgroup' => { map({$_ => 1} qw(align span width valign)) }, 'dir' => { map({$_ => 1} qw(compact)) }, 'div' => { map({$_ => 1} qw(align)) }, 'dl' => { map({$_ => 1} qw(compact)) }, 'font' => { map({$_ => 1} qw(color face size)) }, 'h1' => { map({$_ => 1} qw(align)) }, 'h2' => { map({$_ => 1} qw(align)) }, 'h3' => { map({$_ => 1} qw(align)) }, 'h4' => { map({$_ => 1} qw(align)) }, 'h5' => { map({$_ => 1} qw(align)) }, 'h6' => { map({$_ => 1} qw(align)) }, 'hr' => { map({$_ => 1} qw(align noshade size width)) }, # NO server-side image maps, therefore NOT ismap ! 'img' => { map({$_ => 1} qw(align alt border height hspace src usemap vspace width)) }, 'li' => { map({$_ => 1} qw(compact type value)) }, 'map' => { map({$_ => 1} qw(name)) }, 'menu' => { map({$_ => 1} qw(compact)) }, 'ol' => { map({$_ => 1} qw(compact start type)) }, 'p' => { map({$_ => 1} qw(align)) }, 'pre' => { map({$_ => 1} qw(width)) }, 'table' => { map({$_ => 1} qw(align border cellpadding cellspacing summary width)) }, 'tbody' => { map({$_ => 1} qw(align valign)) }, 'tfoot' => { map({$_ => 1} qw(align valign)) }, 'thead' => { map({$_ => 1} qw(align valign)) }, 'td' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) }, 'th' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) }, 'tr' => { map({$_ => 1} qw(align valign)) }, 'ul' => { map({$_ => 1} qw(compact type)) } ); %tagmt = map({$_ => 1} qw(area basefont br col hr img)); %tagocl = map({$_ => 1} qw(colgroup dd dt li p tbody td tfoot th thead tr)); %tagacl = ( 'colgroup' => \%tagocl, 'dd' => \%tagocl, 'dt' => \%tagocl, 'li' => \%tagocl, 'tbody' => \%tagocl, 'td' => { map({$_ => 1} qw(colgroup dd dt li p td tfoot th thead)) }, 'tfoot' => \%tagocl, 'th' => { map({$_ => 1} qw(colgroup dd dt li p td tfoot th thead)) }, 'thead' => \%tagocl, 'tr' => { map({$_ => 1} qw(colgroup dd dt li p td tfoot th thead tr)) }, ); %tagblk = map({$_ => 1} qw(address blockquote div dl h1 h2 h3 h4 h5 h6 hr ol p pre table)); %impatt = map({$_ => 1} qw(checked compact ismap nohref noshade nowrap)); %lcattval = map({$_ => 1} qw( align border cellpadding cellspacing checked clear color colspan compact coords height hspace ismap nohref noshade nowrap rowspan size span shape valign vspace width )); %taga1p = map({$_ => 1} qw(a area img map)); } # _SanitizeTags # # Inspect all '<'...'>' tags in the input and HTML encode those things # that cannot possibly be tags and at the same time sanitize them. # # $1 => text to process # <= sanitized text sub _SanitizeTags { my ($text, $validate) = @_; $text =~ s/\s+$//; $text ne "" or return ""; my @stack = (); my $ans = ""; my $end = length($text); pos($text) = 0; my ($autoclose, $autoclopen); my $lastmt = ""; $autoclose = sub { my $s = $_[0] || ""; while (@stack && $stack[$#stack]->[0] ne $s && $tagocl{$stack[$#stack]->[0]}) { $ans .= "[0] . ">"; pop(@stack); } } if $validate; $autoclopen = sub { my $s = $_[0] || ""; my $c; if ($tagblk{$s}) {$c = {p=>1}} elsif ($tagocl{$s}) {$c = $tagacl{$s}} else {return} while (@stack && $c->{$stack[$#stack]->[0]}) { $ans .= "[0] . ">"; pop(@stack); } } if $validate; while (pos($text) < $end) { if ($text =~ /\G([^<]+)/gc) { $ans .= $1; $lastmt = "" if $1 =~ /\S/; next; } my $tstart = pos($text); if ($text =~ /\G(<[^>]*>)/gc) { my $tag = $1; if ($tag =~ /^