@ -2245,6 +2245,76 @@ sub _DoTag {
}
# Lookup tables used while sanitizing and validating embedded tags.
# Every table is a "set" hash (member name => 1) unless noted otherwise.
my %univatt;     # attribute names allowed on every element
my %tagatt;      # element name => set of extra attribute names allowed on it
my %tagmt;       # elements that are empty element tags
my %tagocl;      # non-empty elements whose closing tag is optional
my %tagacl;      # %tagocl element => set of %tagocl elements its opening tag closes
my %tagblk;      # block elements
my %lcattval;    # attribute names whose values are to be lowercased
my %impatt;      # names of "implied" attributes
BEGIN {
    # Turn a list of names into a brand new anonymous set.
    my $set_of = sub { return { map { $_ => 1 } @_ } };

    %univatt = %{ $set_of->(qw(class dir id lang style title xml:lang)) };

    # Each row is [ elements, attributes ]: every listed element accepts
    # exactly the listed attributes (in addition to %univatt).
    my @attr_rows = (
        [ [qw(a)],                           [qw(href name)] ],
        [ [qw(area)],                        [qw(alt coords href nohref shape)] ],
        [ [qw(basefont font)],               [qw(color face size)] ],
        [ [qw(br)],                          [qw(clear)] ],
        [ [qw(caption div h1 h2 h3 h4 h5 h6 p)], [qw(align)] ],
        [ [qw(col colgroup)],                [qw(align span width valign)] ],
        [ [qw(dir dl menu)],                 [qw(compact)] ],
        [ [qw(hr)],                          [qw(align noshade size width)] ],
        # NO server-side image maps, therefore NOT ismap!
        [ [qw(img)],                         [qw(align alt border height hspace src usemap vspace width)] ],
        [ [qw(li)],                          [qw(compact type value)] ],
        [ [qw(map)],                         [qw(name)] ],
        [ [qw(ol)],                          [qw(compact start type)] ],
        [ [qw(pre)],                         [qw(width)] ],
        [ [qw(table)],                       [qw(align border cellpadding cellspacing summary width)] ],
        [ [qw(tbody tfoot thead tr)],        [qw(align valign)] ],
        [ [qw(td th)],                       [qw(align colspan height nowrap rowspan valign width)] ],
        [ [qw(ul)],                          [qw(compact type)] ],
    );
    %tagatt = ();
    for my $row (@attr_rows) {
        my ($elements, $attributes) = @$row;
        # One private set per element; no two elements share a hash.
        $tagatt{$_} = $set_of->(@$attributes) for @$elements;
    }

    %tagmt  = %{ $set_of->(qw(area basefont br col hr img)) };
    %tagocl = %{ $set_of->(qw(colgroup dd dt li p tbody td tfoot th thead tr)) };

    # Most optional-close elements close any open %tagocl element and so
    # point straight at %tagocl itself; table cells and rows use narrower
    # sets of their own.
    my @cell_closes = qw(colgroup dd dt li p td tfoot th thead);
    %tagacl = (
        ( map { $_ => \%tagocl } qw(colgroup dd dt li tbody tfoot thead) ),
        'td' => $set_of->(@cell_closes),
        'th' => $set_of->(@cell_closes),
        'tr' => $set_of->( @cell_closes, 'tr' ),
    );

    %tagblk = %{ $set_of->(qw(
        address blockquote div dl h1 h2 h3 h4 h5 h6 hr ol p pre table
    )) };
    %impatt = %{ $set_of->(qw(checked compact ismap nohref noshade nowrap)) };
    %lcattval = %{ $set_of->(qw(
        align border cellpadding cellspacing checked clear color colspan
        compact coords height hspace ismap nohref noshade nowrap rowspan size
        span shape valign vspace width
    )) };
}
# _SanitizeTags
#
# Inspect all '<'...'>' tags in the input and HTML encode those things
@ -2254,13 +2324,37 @@ sub _DoTag {
# <= sanitized text
sub _SanitizeTags {
my ( $ text , $ validate ) = @ _ ;
$ text =~ s/\s+$// ;
$ text ne "" or return "" ;
my @ stack = ( ) ;
my $ ans = "" ;
my $ end = length ( $ text ) ;
pos ( $ text ) = 0 ;
my ( $ autoclose , $ autoclopen ) ;
my $ lastmt = "" ;
$ autoclose = sub {
my $ s = $ _ [ 0 ] || "" ;
while ( @ stack && $ stack [ $# stack ] - > [ 0 ] ne $ s &&
$ tagocl { $ stack [ $# stack ] - > [ 0 ] } ) {
$ ans . = "</" . $ stack [ $# stack ] - > [ 0 ] . ">" ;
pop ( @ stack ) ;
}
} if $ validate ;
$ autoclopen = sub {
my $ s = $ _ [ 0 ] || "" ;
my $ c ;
if ( $ tagblk { $ s } ) { $ c = { p = > 1 } }
elsif ( $ tagocl { $ s } ) { $ c = $ tagacl { $ s } }
else { return }
while ( @ stack && $ c - > { $ stack [ $# stack ] - > [ 0 ] } ) {
$ ans . = "</" . $ stack [ $# stack ] - > [ 0 ] . ">" ;
pop ( @ stack ) ;
}
} if $ validate ;
while ( pos ( $ text ) < $ end ) {
if ( $ text =~ /\G([^<]+)/gc ) {
$ ans . = $ 1 ;
$ lastmt = "" if $ 1 =~ /\S/ ;
next ;
}
my $ tstart = pos ( $ text ) ;
@ -2270,17 +2364,23 @@ sub _SanitizeTags {
$ ans . = $ tag ;
next ;
}
my $ tt ;
if ( ( $ tag =~ m {^<($g_possible_tag_name)(?:[\s>]|/>$)} ||
$ tag =~ m {^</($g_possible_tag_name)\s*>} ) &&
$ ok_tag_name { lc ( $ 1 ) } )
$ ok_tag_name { $ tt = lc ( $ 1 ) } )
{
my ( $ stag , $ styp ) = _Sanitize ( $ tag ) ;
$ ans . = $ stag ;
if ( $ validate && ( $ styp == 1 || $ styp == 2 ) && $ stag =~ m {^</?([^/\s>]+)} ) {
my $ tt = $ 1 ;
if ( $ styp == 2 && $ lastmt eq $ tt ) {
$ lastmt = "" ;
next ;
}
$ lastmt = $ styp == 3 ? $ tt : "" ;
if ( $ validate && $ styp ) {
& $ autoclopen ( $ tt ) if $ styp == 1 || $ styp == 3 ;
if ( $ styp == 1 ) {
push ( @ stack , [ $ tt , $ tstart ] ) ;
} else {
} elsif ( $ styp == 2 ) {
& $ autoclose ( $ tt ) unless $ tt eq "p" ;
! @ stack and _xmlfail ( "closing tag $tt without matching open at " .
_linecol ( $ tstart , $ text ) ) ;
if ( $ stack [ $# stack ] - > [ 0 ] eq $ tt ) {
@ -2292,28 +2392,32 @@ sub _SanitizeTags {
}
}
}
$ ans . = $ stag ;
next ;
} else {
$ tag =~ s/^</</ ;
$ ans . = $ tag ;
$ lastmt = "" ;
next ;
}
}
# can only get here if "\G" char is an unmatched "<"
pos ( $ text ) += 1 ;
$ ans . = "<" ;
$ lastmt = "" ;
}
& $ autoclose if $ validate ;
if ( $ validate && @ stack ) {
my @ errs ;
my $ j ;
for ( $ j = 0 ; $ j <= $# stack ; + + $ j ) {
my @ i = @ { $ stack [ $ j ] } ;
p ush( @ errs , "opening tag $i[0] without matching close at " .
un shift ( @ errs , "opening tag $i[0] without matching close at " .
_linecol ( $ i [ 1 ] , $ text ) ) ;
}
_xmlfail ( @ errs ) ;
}
return $ ans ;
return $ ans . "\n" ;
}
@ -2332,59 +2436,6 @@ sub _xmlfail {
}
# Attribute whitelists and related lookup tables for tag sanitization.
# Every table is a "set" hash (member name => 1) unless noted otherwise.
my %univatt;     # attribute names allowed on every element
my %tagatt;      # element name => set of extra attribute names allowed on it
my %tagmt;       # elements that are empty element tags
my %lcattval;    # attribute names whose values are to be lowercased
my %impatt;      # names of "implied" attributes
BEGIN {
    # Split a whitespace separated word string into a fresh anonymous set.
    my $words_to_set = sub { return { map { $_ => 1 } split(' ', $_[0]) } };

    %univatt = %{ $words_to_set->('class dir id lang style title xml:lang') };

    # Flat list of pairs: 'element(s)' => 'attribute(s)'.  Elements sharing
    # a row accept the same attributes but each gets its own private set.
    my @pairs = (
        'a'                            => 'href name',
        'area'                         => 'alt coords href nohref shape',
        'basefont font'                => 'color face size',
        'br'                           => 'clear',
        'caption div p'                => 'align',
        'h1 h2 h3 h4 h5 h6'            => 'align',
        'col colgroup'                 => 'align span width valign',
        'dir dl menu'                  => 'compact',
        'hr'                           => 'align noshade size width',
        # NO server-side image maps, therefore NOT ismap!
        'img'                          => 'align alt border height hspace src usemap vspace width',
        'li'                           => 'compact type value',
        'map'                          => 'name',
        'ol'                           => 'compact start type',
        'pre'                          => 'width',
        'table'                        => 'align border cellpadding cellspacing summary width',
        'tbody tfoot thead tr'         => 'align valign',
        'td th'                        => 'align colspan height nowrap rowspan valign width',
        'ul'                           => 'compact type',
    );
    %tagatt = ();
    while (my ($elements, $attributes) = splice(@pairs, 0, 2)) {
        for my $element (split(' ', $elements)) {
            $tagatt{$element} = $words_to_set->($attributes);
        }
    }

    %tagmt  = %{ $words_to_set->('area basefont br col hr img') };
    %impatt = %{ $words_to_set->('checked compact ismap nohref noshade nowrap') };
    %lcattval = %{ $words_to_set->(join(' ',
        'align border cellpadding cellspacing checked clear color colspan',
        'compact coords height hspace ismap nohref noshade nowrap rowspan size',
        'span shape valign vspace width',
    )) };
}
sub _Sanitize {
my $ tag = shift ;
my $ seenatt = { } ;
@ -3057,15 +3108,20 @@ to be recognized and passed through even without using this option.
= item B <--sanitize>
Remove troublesome tag attributes from embedded tags . Only a very strictly
Removes troublesome tag attributes from embedded tags . Only a very strictly
limited set of tag attributes will be permitted , other attributes will be
silently discarded . The set of allowed attributes varies by tag .
This is enabled by default .
Also split empty minimized elements that are not one of the HTML allowed
empty elements ( C <area> C <basefont> C <br> C <col> C <hr> C <img> ) into separate
begin and end tags . For example , C << <p/> >> or C << < p / > >> will be split
into C << <p> </p> >> .
Splits empty minimized elements that are not one of the HTML allowed empty
elements ( C <area> C <basefont> C <br> C <col> C <hr> C <img> ) into separate begin
and end tags . For example , C << <p/> >> or C << < p / > >> will be split into
C << <p> </p> >> .
Combines adjacent ( whitespace separated only ) opening and closing tags for
the same HTML empty element into a single minimized tag . For example ,
C << <br> </br> >> will become C << < br / > >> .
This is enabled by default .
= item B <--no-sanitize>
@ -3086,8 +3142,8 @@ module be present (one is only required if this option is given).
Any errors are reported to STDERR and the exit status will be
non - zero on XML validation failure . Note that all line and column
numbers in the output refer to the entire output that would have
been produced . Re - run with B <--no-validate-xml> to see what ' s
numbers in the error output refer to the entire output that would
have been produced . Re - run with B <--no-validate-xml> to see what ' s
actually present at those line and column positions .
If the B <--stub> option has also been given , then the entire output is
@ -3107,13 +3163,19 @@ Perform XML validation on the output before it's output and die if
it fails validation . This uses a simple internal consistency checker
that finds unmatched and mismatched open / close tags .
Non - empty elements that in HTML have optional closing tags ( C <colgroup>
C <dd> C <dt> C <li> C <p> C <tbody> C <td> C <tfoot> C <th> C <thead> C <tr> )
will automatically have any omitted end tags inserted during the
B <--validate-xml-internal> process .
Any errors are reported to STDERR and the exit status will be
non - zero on XML validation failure . Note that all line and column
numbers in the output refer to the entire output that would have
been produced without any B <--stub> or B <--stylesheet> options .
Re - run with B <--no-validate-xml> and I <without> any B <--stub> or
B <--stylesheet> options to see what ' s actually present at those
line and column positions .
numbers in the error output refer to the entire output that would
have been produced before sanitization without any B <--stub> or
B <--stylesheet> options . Re - run with B <--no-sanitize> and
B <--no-validate-xml> and I <without> any B <--stub> or B <--stylesheet>
options to see what ' s actually present at those line and column
positions .
This option validates the output I < prior to > adding any requested
B <--stub> or B <--stylesheet> . As the built - in stub and stylesheet