468 lines
15 KiB
Perl
Executable File
468 lines
15 KiB
Perl
Executable File
#!perl
|
|
use strict;
|
|
use warnings;
|
|
use autodie;
|
|
use feature qw(say);
|
|
require File::Find::Rule;
|
|
require File::Slurp;
|
|
require File::Spec;
|
|
require IO::Socket::SSL;
|
|
use List::Util qw(sum);
|
|
require LWP::UserAgent;
|
|
require Net::FTP;
|
|
require Parallel::Fork::BossWorkerAsync;
|
|
require Term::ProgressBar::Simple;
|
|
require URI::Find::Simple;
|
|
$| = 1;
|
|
|
|
my %ignore;
|
|
while ( my $line = <main::DATA> ) {
|
|
chomp $line;
|
|
next if $line =~ /^#/;
|
|
next unless $line;
|
|
$ignore{$line} = 1;
|
|
}
|
|
|
|
my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
|
|
$ua->timeout(58);
|
|
$ua->env_proxy;
|
|
|
|
my @filenames = @ARGV;
|
|
@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
|
|
unless @filenames;
|
|
|
|
my $total_bytes = sum map {-s} @filenames;
|
|
|
|
my $extract_progress = Term::ProgressBar::Simple->new(
|
|
{ count => $total_bytes,
|
|
name => 'Extracting URIs',
|
|
}
|
|
);
|
|
|
|
my %uris;
|
|
foreach my $filename (@filenames) {
|
|
next if $filename =~ /uris\.txt/;
|
|
next if $filename =~ /check_uris/;
|
|
next if $filename =~ /\.patch$/;
|
|
next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
|
|
next if $filename =~ /checkURL\.pl$/;
|
|
my $contents = File::Slurp::read_file($filename);
|
|
my @uris = URI::Find::Simple::list_uris($contents);
|
|
foreach my $uri (@uris) {
|
|
next unless $uri =~ /^(http|ftp)/;
|
|
next if $ignore{$uri};
|
|
|
|
# no need to hit rt.perl.org
|
|
next
|
|
if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$};
|
|
|
|
# no need to hit rt.cpan.org
|
|
next
|
|
if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
|
|
|
|
# no need to hit google groups (weird redirect LWP does not like)
|
|
next
|
|
if $uri =~ m{^http://groups\.google\.com/};
|
|
|
|
push @{ $uris{$uri} }, $filename;
|
|
}
|
|
$extract_progress += -s $filename;
|
|
}
|
|
|
|
my $bw = Parallel::Fork::BossWorkerAsync->new(
|
|
work_handler => \&work_alarmed,
|
|
global_timeout => 120,
|
|
worker_count => 20,
|
|
);
|
|
|
|
foreach my $uri ( keys %uris ) {
|
|
my @filenames = @{ $uris{$uri} };
|
|
$bw->add_work( { uri => $uri, filenames => \@filenames } );
|
|
}
|
|
|
|
undef $extract_progress;
|
|
|
|
my $fetch_progress = Term::ProgressBar::Simple->new(
|
|
{ count => scalar( keys %uris ),
|
|
name => 'Fetching URIs',
|
|
}
|
|
);
|
|
|
|
my %filenames;
|
|
while ( $bw->pending() ) {
|
|
my $response = $bw->get_result();
|
|
my $uri = $response->{uri};
|
|
my @filenames = @{ $response->{filenames} };
|
|
my $is_success = $response->{is_success};
|
|
my $message = $response->{message};
|
|
|
|
unless ($is_success) {
|
|
foreach my $filename (@filenames) {
|
|
push @{ $filenames{$filename} },
|
|
{ uri => $uri, message => $message };
|
|
}
|
|
}
|
|
$fetch_progress++;
|
|
}
|
|
$bw->shut_down();
|
|
|
|
my $fh = IO::File->new('> uris.txt');
|
|
foreach my $filename ( sort keys %filenames ) {
|
|
$fh->say("* $filename");
|
|
my @bits = @{ $filenames{$filename} };
|
|
foreach my $bit (@bits) {
|
|
my $uri = $bit->{uri};
|
|
my $message = $bit->{message};
|
|
$fh->say(" $uri");
|
|
$fh->say(" $message");
|
|
}
|
|
}
|
|
$fh->close;
|
|
|
|
say 'Finished, see uris.txt';
|
|
|
|
sub work_alarmed {
|
|
my $conf = shift;
|
|
eval {
|
|
local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
|
|
alarm 60;
|
|
$conf = work($conf);
|
|
alarm 0;
|
|
};
|
|
if ($@) {
|
|
$conf->{is_success} = 0;
|
|
$conf->{message} = 'Timed out';
|
|
|
|
}
|
|
return $conf;
|
|
}
|
|
|
|
sub work {
|
|
my $conf = shift;
|
|
my $uri = $conf->{uri};
|
|
my @filenames = @{ $conf->{filenames} };
|
|
|
|
if ( $uri =~ /^http/ ) {
|
|
my $uri_without_fragment = URI->new($uri);
|
|
my $fragment = $uri_without_fragment->fragment(undef);
|
|
my $response = $ua->head($uri_without_fragment);
|
|
|
|
$conf->{is_success} = $response->is_success;
|
|
$conf->{message} = $response->status_line;
|
|
return $conf;
|
|
} else {
|
|
|
|
my $uri_object = URI->new($uri);
|
|
my $host = $uri_object->host;
|
|
my $path = $uri_object->path;
|
|
my ( $volume, $directories, $filename )
|
|
= File::Spec->splitpath($path);
|
|
|
|
my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
|
|
unless ($ftp) {
|
|
$conf->{is_succcess} = 0;
|
|
$conf->{message} = "Can not connect to $host: $@";
|
|
return $conf;
|
|
}
|
|
|
|
my $can_login = $ftp->login( "anonymous", '-anonymous@' );
|
|
unless ($can_login) {
|
|
$conf->{is_success} = 0;
|
|
$conf->{message} = "Can not login ", $ftp->message;
|
|
return $conf;
|
|
}
|
|
|
|
my $can_binary = $ftp->binary();
|
|
unless ($can_binary) {
|
|
$conf->{is_success} = 0;
|
|
$conf->{message} = "Can not binary ", $ftp->message;
|
|
return $conf;
|
|
}
|
|
|
|
my $can_cwd = $ftp->cwd($directories);
|
|
unless ($can_cwd) {
|
|
$conf->{is_success} = 0;
|
|
$conf->{message} = "Can not cwd to $directories ", $ftp->message;
|
|
return $conf;
|
|
}
|
|
|
|
if ($filename) {
|
|
my $can_size = $ftp->size($filename);
|
|
unless ($can_size) {
|
|
$conf->{is_success} = 0;
|
|
$conf->{message}
|
|
= "Can not size $filename in $directories",
|
|
$ftp->message;
|
|
return $conf;
|
|
}
|
|
} else {
|
|
my ($can_dir) = $ftp->dir;
|
|
unless ($can_dir) {
|
|
my ($can_ls) = $ftp->ls;
|
|
unless ($can_ls) {
|
|
$conf->{is_success} = 0;
|
|
$conf->{message}
|
|
= "Can not dir or ls in $directories ",
|
|
$ftp->message;
|
|
return $conf;
|
|
}
|
|
}
|
|
}
|
|
|
|
$conf->{is_success} = 1;
|
|
return $conf;
|
|
}
|
|
}
|
|
|
|
__DATA__
|
|
# these are fine but give errors
|
|
ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
|
|
ftp://ftp.stratus.com/pub/vos/utility/utility.html
|
|
|
|
# these are missing, sigh
|
|
ftp://ftp.sco.com/SLS/ptf7051e.Z
|
|
http://perlmonks.thepen.com/42898.html
|
|
http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
|
|
http://public.activestate.com/cgi-bin/perlbrowse
|
|
http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
|
|
http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
|
|
http://my.smithmicro.com/mac/stuffit/
|
|
http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
|
|
http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
|
|
|
|
# these are URI extraction bugs
|
|
http://www.perl.org/E
|
|
http://en.wikipedia.org/wiki/SREC_(file_format
|
|
http://somewhere.else',-type=/
|
|
ftp:passive-mode
|
|
ftp:
|
|
http:[-
|
|
http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
|
|
http://www.xray.mpe.mpg.de/mailing-lists/perl5-
|
|
http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
|
|
http://perl.come/
|
|
http://www.perl.come/
|
|
|
|
# these are used as an example
|
|
http://example.com/
|
|
http://something.here/
|
|
http://users.perl5.git.perl.org/~yourlogin/
|
|
http://github.com/USERNAME/perl/tree/orange
|
|
http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
|
|
http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
|
|
http://somewhere.else$/
|
|
http://somewhere.else$/
|
|
http://somewhere.else/bin/foo&bar',-Type=
|
|
http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
|
|
http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
|
|
http://www.perl.org/test.cgi
|
|
http://cpan2.local/
|
|
http://search.cpan.org/perldoc?
|
|
http://cpan1.local/
|
|
http://cpan.dev.local/CPAN
|
|
http:///
|
|
ftp://
|
|
ftp://myurl/
|
|
ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
|
|
http://www14.software.ibm.com/webapp/download/downloadaz.jsp
|
|
http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
|
|
http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
|
|
http://localhost/tmp/index.txt
|
|
http://example.com/foo/bar.html
|
|
http://example.com/Text-Bastardize-1.06.tar.gz
|
|
ftp://example.com/sources/packages.txt
|
|
http://example.com/sources/packages.txt
|
|
http://example.com/sources
|
|
ftp://example.com/sources
|
|
http://some.where.com/dir/file.txt
|
|
http://some.where.com/dir/a.txt
|
|
http://foo.com/X.tgz
|
|
ftp://foo.com/X.tgz
|
|
http://foo/
|
|
http://www.foo.com:8000/
|
|
http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
|
|
http://decoded/mirror/path
|
|
http://a/b/c/d/e/f/g/h/i/j
|
|
http://foo/bar.gz
|
|
ftp://ftp.perl.org
|
|
http://purl.org/rss/1.0/modules/taxonomy/
|
|
ftp://ftp.sun.ac.za/CPAN/CPAN/
|
|
ftp://ftp.cpan.org/pub/mirror/index.txt
|
|
ftp://cpan.org/pub/mirror/index.txt
|
|
http://example.com/~eh/
|
|
http://plagger.org/.../rss
|
|
http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
|
|
http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
|
|
http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
|
|
http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
|
|
http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
|
|
http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
|
|
http://module-build.sourceforge.net/META-spec-new.html
|
|
http://module-build.sourceforge.net/META-spec-v1.4.html
|
|
http://www.cs.vu.nl/~tmgil/vi.html
|
|
http://perlcomposer.sourceforge.net/vperl.html
|
|
http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
|
|
http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
|
|
http://world.std.com/~aep/ptkdb/
|
|
http://www.castlelink.co.uk/object_system/
|
|
http://www.fh-wedel.de/elvis/
|
|
ftp://ftp.blarg.net/users/amol/zsh/
|
|
ftp://ftp.funet.fi/pub/languages/perl/CPAN
|
|
http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
|
|
http://users.perl5.git.perl.org/~USERNAME
|
|
http://foo/x//y/script.cgi/a//b
|
|
http://xxx/script.cgi/http://foo
|
|
http://foo/./x//z/script.cgi/a/../b//c
|
|
http://somewhere.else/in/movie/land
|
|
http://somewhere.else/finished.html
|
|
http://somewhere.else/bin/foo&bar$
|
|
http://somewhere.else/
|
|
http://proxy:8484/
|
|
http://proxy/
|
|
http://myrepo.example.com/
|
|
http://remote/source
|
|
https://example.com/
|
|
http://example.com:1024/
|
|
http:///path?foo=bar
|
|
http://[::]:1024/
|
|
http://([/
|
|
http://example.com:9000/index.html
|
|
http://proxy.example.com:8080/
|
|
http:///index.html
|
|
http://[www.json::pp.org]/
|
|
http://localhost/
|
|
http://foo.example.com/
|
|
http://abc.com/a.js
|
|
http://whatever/man/1/crontab
|
|
http://abc.com/c.js
|
|
http://whatever/Foo%3A%3ABar
|
|
http://abc.com/b.js
|
|
http://remote.server.com/jquery.css
|
|
http://some.other.com/page.html
|
|
https://text.com/1/2
|
|
https://text.com/1/2
|
|
http://link.included.here?o=1&p=2
|
|
http://link.included.here?o=1&p=2
|
|
http://link.included.here?o=1&p=2
|
|
http://link.included.here/
|
|
http://foo/x//y/script.cgi/a//b
|
|
http://xxx/script.cgi/http://foo
|
|
http://foo/./x//z/script.cgi/a/../b//c
|
|
http://somewhere.else/in/movie/land
|
|
http://somewhere.else/finished.html
|
|
http://webproxy:3128/
|
|
http://www/
|
|
|
|
# these are used to generate or match URLs
|
|
http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
|
|
http://www.cpantesters.org/show/%s.yaml
|
|
ftp://(.*?)/(.*)/(.*
|
|
ftp://(.*?)/(.*)/(.*
|
|
ftp://(.*?)/(.*)/(.*
|
|
ftp://ftp.foo.bar/
|
|
http://$host/
|
|
http://wwwe%3C46/
|
|
ftp:/
|
|
http://$addr/mark?commit=$
|
|
http://search.cpan.org/~
|
|
http:/
|
|
ftp:%5Cn$url
|
|
http://www.ietf.org/rfc/rfc$2.txt
|
|
http://search.cpan.org/~
|
|
ftp:%5Cn$url
|
|
|
|
# weird redirects that LWP doesn't like
|
|
http://www.theperlreview.com/community_calendar
|
|
http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
|
|
http://sunsolve.sun.com
|
|
|
|
# broken webserver that doesn't like HEAD requests
|
|
http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
|
|
http://www.w3.org/TR/html4/loose.dtd
|
|
|
|
# these have been reported upstream to CPAN authors
|
|
http://www.gnu.org/manual/tar/html_node/tar_139.html
|
|
http://www.w3.org/pub/WWW/TR/Wd-css-1.html
|
|
http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
|
|
http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
|
|
http://search.cpan.org/search?query=Module::Build::Convert
|
|
http://www.refcnt.org/papers/module-build-convert
|
|
http://csrc.nist.gov/cryptval/shs.html
|
|
http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
|
|
http://www.debian.or.jp/~kubota/unicode-symbols.html.en
|
|
http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
|
|
http://www.debian.or.jp/~kubota/unicode-symbols.html.en
|
|
http://rfc.net/rfc2781.html
|
|
http://www.icu-project.org/charset/
|
|
http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
|
|
http://www.rfc-editor.org/
|
|
http://www.rfc.net/
|
|
http://www.oreilly.com/people/authors/lunde/cjk_inf.html
|
|
http://www.oreilly.com/catalog/cjkvinfo/
|
|
http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
|
|
http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
|
|
http://www.egt.ie/standards/iso3166/iso3166-1-en.html
|
|
http://www.bsi-global.com/iso4217currency
|
|
http://www.plover.com/~mjd/perl/Memoize/
|
|
http://www.plover.com/~mjd/perl/MiniMemoize/
|
|
http://www.sysadminmag.com/tpj/issues/vol5_5/
|
|
ftp://ftp.tpc.int/tpc/server/UNIX/
|
|
http://www.nara.gov/genealogy/
|
|
http://home.utah-inter.net/kinsearch/Soundex.html
|
|
http://www.nara.gov/genealogy/soundex/soundex.html
|
|
http://rfc.net/rfc3461.html
|
|
ftp://ftp.cs.pdx.edu/pub/elvis/
|
|
http://www.fh-wedel.de/elvis/
|
|
http://lists.perl.org/list/perl-mvs.html
|
|
http://www.cpan.org/ports/os2/
|
|
http://github.com/dagolden/cpan-meta-spec
|
|
http://github.com/dagolden/cpan-meta-spec/issues
|
|
http://www.opensource.org/licenses/lgpl-license.phpt
|
|
http://reality.sgi.com/ariel
|
|
http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
|
|
http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
|
|
http://www.nsrl.nist.gov/testdata/
|
|
http://public.activestate.com/cgi-bin/perlbrowse/p/31194
|
|
http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
|
|
http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
|
|
http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
|
|
http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
|
|
http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
|
|
http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
|
|
http://github.com/schwern/extutils-makemaker
|
|
https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
|
|
http://www.json.org/JSON::PP_checker/
|
|
ftp://ftp.kiae.su/pub/unix/fido/
|
|
http://www.gallistel.net/nparker/weather/code/
|
|
http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
|
|
ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
|
|
http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
|
|
http://public.activestate.com/cgi-bin/perlbrowse/p/33567
|
|
http://public.activestate.com/cgi-bin/perlbrowse/p/33566
|
|
http://www.dsmit.com/cons/
|
|
http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
|
|
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
checkURL.pl - Check that all the URLs in the Perl source are valid
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
This program checks that all the URLs in the Perl source are valid. It
|
|
checks HTTP and FTP links in parallel and contains a list of known
|
|
bad example links in its source. It takes 4 minutes to run on my
|
|
machine. The results are written to 'uris.txt' and list the filename,
|
|
the URL and the error:
|
|
|
|
* ext/Locale-Maketext/lib/Locale/Maketext.pod
|
|
http://sunsite.dk/RFC/rfc/rfc2277.html
|
|
404 Not Found
|
|
...
|
|
|
|
It should be run every so often and links fixed and upstream authors
|
|
notified.
|
|
|
|
Note that the web is unstable and some websites are temporarily down.
|