perl/cpan/Pod-Simple/t/encod04.t

174 lines
3.9 KiB
Perl

# The encoding detection heuristic will choose UTF8 or CP1252. The current
# implementation will usually treat CP1252 (aka "Win-Latin-1") as CP1252 but
# can be fooled into seeing it as UTF8.
BEGIN {
if($ENV{PERL_CORE}) {
chdir 't';
@INC = '../lib';
}
}
use strict;
use Test;
BEGIN {
if ($] lt 5.007_003) {
plan tests => 5, todo => [4, 5]; # Need utf8::decode() to pass #5
# and isn't available in this
# release
}
else {
plan tests => 5, todo => [4];
}
}
ok 1;
use Pod::Simple::DumpAsXML;
use Pod::Simple::XMLOutStream;
# Initial, isolated, non-ASCII byte triggers CP1252 guess and later
# multi-byte sequence is not considered by heuristic.
my $x97;
my $x91;
my $dash;
if ($] ge 5.007_003) {
$x97 = chr utf8::unicode_to_native(0x97);
$x91 = chr utf8::unicode_to_native(0x91);
$dash = '&#8212';
}
else { # Tests will fail for early EBCDICs
$x97 = chr 0x97;
$x91 = chr 0x91;
$dash = '--';
}
my @output_lines = split m/[\r\n]+/, Pod::Simple::XMLOutStream->_out( qq{
=head1 NAME
Em::Dash $x97 ${x91}CAF\xC9\x92
=cut
} );
my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
if( $guess ) {
if( $guess eq 'CP1252' ) {
if( grep m{Dash $dash}, @output_lines ) {
ok 1;
} else {
ok 0;
print STDERR "# failed to find expected control character in output\n"
}
} else {
ok 0;
print STDERR "# parser guessed wrong encoding expected 'CP1252' got '$guess'\n";
}
} else {
ok 0;
print STDERR "# parser failed to detect non-ASCII bytes in input\n";
}
# Initial smart-quote character triggers CP1252 guess as expected
@output_lines = split m/[\r\n]+/, Pod::Simple::XMLOutStream->_out( qq{
=head1 NAME
Smart::Quote - ${x91}FUT\xC9\x92
=cut
} );
if (ord("A") != 65) { # ASCII-platform dependent test skipped on this platform
ok (1);
}
else {
($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
if( $guess ) {
if( $guess eq 'CP1252' ) {
ok 1;
} else {
ok 0;
print STDERR "# parser guessed wrong encoding expected 'CP1252' got '$guess'\n";
}
} else {
ok 0;
print STDERR "# parser failed to detect non-ASCII bytes in input\n";
}
}
# Initial accented character followed by 'smart' apostrophe causes heuristic
# to choose UTF8 (a somewhat contrived example)
@output_lines = split m/[\r\n]+/, Pod::Simple::XMLOutStream->_out( qq{
=head1 NAME
=head2 JOS\xC9\x92S PLACE
=cut
} );
if (ord("A") != 65) { # ASCII-platform dependent test skipped on this platform
ok (1);
}
else {
($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
if( $guess ) {
if( $guess eq 'CP1252' ) {
ok 1;
} else {
ok 0;
print STDERR "# parser guessed wrong encoding expected 'CP1252' got '$guess'\n";
}
} else {
ok 0;
print STDERR "# parser failed to detect non-ASCII bytes in input\n";
}
}
# The previous example used a CP1252 byte sequence that also happened to be a
# valid UTF8 byte sequence. In this example we use an illegal UTF-8 sequence
# (it needs a third byte), so must be 1252
@output_lines = split m/[\r\n]+/, Pod::Simple::XMLOutStream->_out( qq{
=head1 NAME
Smart::Apostrophe::Fail - L\xE9\x92Strange
=cut
} );
if (ord("A") != 65) { # ASCII-platform dependent test skipped on this platform
ok (1);
}
else {
($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
if( $guess ) {
if( $guess eq 'CP1252' ) {
ok 1;
} else {
ok 0;
print STDERR "# parser guessed wrong encoding expected 'CP1252' got '$guess'\n";
}
} else {
ok 0;
print STDERR "# parser failed to detect non-ASCII bytes in input\n";
}
}
exit;