Commit 0d12c567 authored by Daniel Stenberg's avatar Daniel Stenberg
Browse files

Added -i to allow ignore-patterns to get added

parent 880208c5
Loading
Loading
Loading
Loading
+32 −10
Original line number Original line Diff line number Diff line
@@ -9,10 +9,14 @@
# Written to use 'curl' for URL checking.
# Written to use 'curl' for URL checking.
#
#
# Author: Daniel Stenberg <daniel@haxx.se>
# Author: Daniel Stenberg <daniel@haxx.se>
# Version: 0.2 Dec 19, 2000
# Version: 0.3 Jan 3, 2001
#
#
# HISTORY
# HISTORY
#
#
# 0.3 - The -i option now adds regexes; a full URL link that matches any of
#       them is not followed. This can then be used to prevent this script
#       from following '.*\.cgi', specific pages or whatever.
#
# 0.2 - Made it only HEAD non html files (i.e skip the GET). Makes it a lot
# 0.2 - Made it only HEAD non html files (i.e skip the GET). Makes it a lot
#       faster to skip large non HTML files such as pdfs or big RFCs! ;-)
#       faster to skip large non HTML files such as pdfs or big RFCs! ;-)
#       Added a -c option that allows me to pass options to curl.
#       Added a -c option that allows me to pass options to curl.
@@ -32,6 +36,8 @@ my $help;
my $external;
my $external;
my $curlopts;
my $curlopts;


my @ignorelist;

 argv:
 argv:
if($ARGV[0] eq "-v" ) {
if($ARGV[0] eq "-v" ) {
    $verbose++;
    $verbose++;
@@ -44,6 +50,12 @@ elsif($ARGV[0] eq "-c" ) {
    shift @ARGV;
    shift @ARGV;
    goto argv;
    goto argv;
}
}
elsif($ARGV[0] eq "-i" ) {
    push @ignorelist, $ARGV[1];
    shift @ARGV;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-l" ) {
elsif($ARGV[0] eq "-l" ) {
    $linenumber = 1;
    $linenumber = 1;
    shift @ARGV;
    shift @ARGV;
@@ -72,7 +84,9 @@ $rooturls{$ARGV[0]}=1;
if(($geturl eq "") || $help) {
if(($geturl eq "") || $help) {
    print  "Usage: $0 [-hilvx] <full URL>\n",
    print  "Usage: $0 [-hilvx] <full URL>\n",
    " Use a traling slash for directory URLs!\n",
    " Use a traling slash for directory URLs!\n",
    " -c [data]  Pass [data] as argument to every curl invoke\n",
    " -h         This help text\n",
    " -h         This help text\n",
    " -i [regex] Ignore root links that match this pattern\n",
    " -l         Line number report for BAD links\n",
    " -l         Line number report for BAD links\n",
    " -v         Verbose mode\n",
    " -v         Verbose mode\n",
    " -x         Check non-local (external?) links only\n";
    " -x         Check non-local (external?) links only\n";
@@ -303,9 +317,6 @@ while(1) {
    if($geturl == -1) {
    if($geturl == -1) {
        last;
        last;
    }
    }
    if($verbose) {
        print "ROOT: $geturl\n";
    }


    #
    #
    # Splits the URL in its different parts
    # Splits the URL in its different parts
@@ -332,6 +343,8 @@ while(1) {
        next;
        next;
    }
    }


    print "    ==== $geturl ====\n";

    if($verbose == 2) {
    if($verbose == 2) {
        printf("Error code $error, Content-Type: $ctype, got %d bytes\n",
        printf("Error code $error, Content-Type: $ctype, got %d bytes\n",
               length($in));
               length($in));
@@ -405,9 +418,18 @@ while(1) {
            }
            }
        }
        }
        else {
        else {
            # the link works, add it!
            # the link works, add it if it isn't in the ignore list
            my $ignore=0;
            for(@ignorelist) {
                if($link =~ /$_/) {
                    $ignore=1;
                }
            }
            if(!$ignore) {
                # not ignored, add
                $rooturls{$link}++; # check this if not checked already
                $rooturls{$link}++; # check this if not checked already
            }
            }
        }
        
        
    }
    }
}
}