Loading perl/crawlink.pl +32 −10 Original line number Original line Diff line number Diff line Loading @@ -9,10 +9,14 @@ # Written to use 'curl' for URL checking. # Written to use 'curl' for URL checking. # # # Author: Daniel Stenberg <daniel@haxx.se> # Author: Daniel Stenberg <daniel@haxx.se> # Version: 0.2 Dec 19, 2000 # Version: 0.3 Jan 3, 2001 # # # HISTORY # HISTORY # # # 0.3 - The -i now adds regexes that if a full URL link matches one of those, # it is not followed. This can then be used to prevent this script from # following '.*\.cgi', specific pages or whatever. # # 0.2 - Made it only HEAD non html files (i.e skip the GET). Makes it a lot # 0.2 - Made it only HEAD non html files (i.e skip the GET). Makes it a lot # faster to skip large non HTML files such as pdfs or big RFCs! ;-) # faster to skip large non HTML files such as pdfs or big RFCs! ;-) # Added a -c option that allows me to pass options to curl. # Added a -c option that allows me to pass options to curl. Loading @@ -32,6 +36,8 @@ my $help; my $external; my $external; my $curlopts; my $curlopts; my @ignorelist; argv: argv: if($ARGV[0] eq "-v" ) { if($ARGV[0] eq "-v" ) { $verbose++; $verbose++; Loading @@ -44,6 +50,12 @@ elsif($ARGV[0] eq "-c" ) { shift @ARGV; shift @ARGV; goto argv; goto argv; } } elsif($ARGV[0] eq "-i" ) { push @ignorelist, $ARGV[1]; shift @ARGV; shift @ARGV; goto argv; } elsif($ARGV[0] eq "-l" ) { elsif($ARGV[0] eq "-l" ) { $linenumber = 1; $linenumber = 1; shift @ARGV; shift @ARGV; Loading Loading @@ -72,7 +84,9 @@ $rooturls{$ARGV[0]}=1; if(($geturl eq "") || $help) { if(($geturl eq "") || $help) { print "Usage: $0 [-hilvx] <full URL>\n", print "Usage: $0 [-hilvx] <full URL>\n", " Use a traling slash for directory URLs!\n", " Use a traling slash for directory URLs!\n", " -c [data] Pass [data] as argument to every curl invoke\n", " -h This help text\n", " -h This help text\n", " -i [regex] Ignore root links that match this pattern\n", " -l Line number report for BAD links\n", " -l Line number report for BAD links\n", " -v Verbose mode\n", " -v Verbose mode\n", " -x Check non-local (external?) links only\n"; " -x Check non-local (external?) links only\n"; Loading Loading @@ -303,9 +317,6 @@ while(1) { if($geturl == -1) { if($geturl == -1) { last; last; } } if($verbose) { print "ROOT: $geturl\n"; } # # # Splits the URL in its different parts # Splits the URL in its different parts Loading @@ -332,6 +343,8 @@ while(1) { next; next; } } print " ==== $geturl ====\n"; if($verbose == 2) { if($verbose == 2) { printf("Error code $error, Content-Type: $ctype, got %d bytes\n", printf("Error code $error, Content-Type: $ctype, got %d bytes\n", length($in)); length($in)); Loading Loading @@ -405,9 +418,18 @@ while(1) { } } } } else { else { # the link works, add it! # the link works, add it if it isn't in the ingore list my $ignore=0; for(@ignorelist) { if($link =~ /$_/) { $ignore=1; } } if(!$ignore) { # not ignored, add $rooturls{$link}++; # check this if not checked already $rooturls{$link}++; # check this if not checked already } } } } } } } Loading Loading
perl/crawlink.pl +32 −10 Original line number Original line Diff line number Diff line Loading @@ -9,10 +9,14 @@ # Written to use 'curl' for URL checking. # Written to use 'curl' for URL checking. # # # Author: Daniel Stenberg <daniel@haxx.se> # Author: Daniel Stenberg <daniel@haxx.se> # Version: 0.2 Dec 19, 2000 # Version: 0.3 Jan 3, 2001 # # # HISTORY # HISTORY # # # 0.3 - The -i now adds regexes that if a full URL link matches one of those, # it is not followed. This can then be used to prevent this script from # following '.*\.cgi', specific pages or whatever. # # 0.2 - Made it only HEAD non html files (i.e skip the GET). Makes it a lot # 0.2 - Made it only HEAD non html files (i.e skip the GET). Makes it a lot # faster to skip large non HTML files such as pdfs or big RFCs! ;-) # faster to skip large non HTML files such as pdfs or big RFCs! ;-) # Added a -c option that allows me to pass options to curl. # Added a -c option that allows me to pass options to curl. Loading @@ -32,6 +36,8 @@ my $help; my $external; my $external; my $curlopts; my $curlopts; my @ignorelist; argv: argv: if($ARGV[0] eq "-v" ) { if($ARGV[0] eq "-v" ) { $verbose++; $verbose++; Loading @@ -44,6 +50,12 @@ elsif($ARGV[0] eq "-c" ) { shift @ARGV; shift @ARGV; goto argv; goto argv; } } elsif($ARGV[0] eq "-i" ) { push @ignorelist, $ARGV[1]; shift @ARGV; shift @ARGV; goto argv; } elsif($ARGV[0] eq "-l" ) { elsif($ARGV[0] eq "-l" ) { $linenumber = 1; $linenumber = 1; shift @ARGV; shift @ARGV; Loading Loading @@ -72,7 +84,9 @@ $rooturls{$ARGV[0]}=1; if(($geturl eq "") || $help) { if(($geturl eq "") || $help) { print "Usage: $0 [-hilvx] <full URL>\n", print "Usage: $0 [-hilvx] <full URL>\n", " Use a traling slash for directory URLs!\n", " Use a traling slash for directory URLs!\n", " -c [data] Pass [data] as argument to every curl invoke\n", " -h This help text\n", " -h This help text\n", " -i [regex] Ignore root links that match this pattern\n", " -l Line number report for BAD links\n", " -l Line number report for BAD links\n", " -v Verbose mode\n", " -v Verbose mode\n", " -x Check non-local (external?) links only\n"; " -x Check non-local (external?) links only\n"; Loading Loading @@ -303,9 +317,6 @@ while(1) { if($geturl == -1) { if($geturl == -1) { last; last; } } if($verbose) { print "ROOT: $geturl\n"; } # # # Splits the URL in its different parts # Splits the URL in its different parts Loading @@ -332,6 +343,8 @@ while(1) { next; next; } } print " ==== $geturl ====\n"; if($verbose == 2) { if($verbose == 2) { printf("Error code $error, Content-Type: $ctype, got %d bytes\n", printf("Error code $error, Content-Type: $ctype, got %d bytes\n", length($in)); length($in)); Loading Loading @@ -405,9 +418,18 @@ while(1) { } } } } else { else { # the link works, add it! # the link works, add it if it isn't in the ingore list my $ignore=0; for(@ignorelist) { if($link =~ /$_/) { $ignore=1; } } if(!$ignore) { # not ignored, add $rooturls{$link}++; # check this if not checked already $rooturls{$link}++; # check this if not checked already } } } } } } } Loading