Commit 880208c5 authored by Daniel Stenberg's avatar Daniel Stenberg
Browse files

only add good links as root links

don't break the loop on root link errors
parent f4acbed2
Loading
Loading
Loading
Loading
+41 −22
Original line number Diff line number Diff line
@@ -9,10 +9,14 @@
# Written to use 'curl' for URL checking.
#
# Author: Daniel Stenberg <daniel@haxx.se>
# Version: 0.1 Dec 14, 2000
# Version: 0.2 Dec 19, 2000
#
# HISTORY
#
# 0.2 - Made it only HEAD non-HTML files (i.e. skip the GET). Makes it a lot
#       faster to skip large non-HTML files such as PDFs or big RFCs! ;-)
#       Added a -c option that allows me to pass options to curl.
#
# 0.1 - The given url works as the root. This script will only continue
#       and check other URLs if the leftmost part of the new URL is identical
#       to the root URL.
@@ -26,6 +30,7 @@ my $usestdin;
my $linenumber;
my $help;
my $external;
my $curlopts;

 argv:
if($ARGV[0] eq "-v" ) {
@@ -33,6 +38,12 @@ if($ARGV[0] eq "-v" ) {
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-c" ) {
    $curlopts=$ARGV[1];
    shift @ARGV;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-l" ) {
    $linenumber = 1;
    shift @ARGV;
@@ -68,10 +79,11 @@ if(($geturl eq "") || $help) {
    exit;
}

# This is necessary from where I tried this:
my $proxy="";
my $proxy;
if($curlopts ne "") {
    $proxy=" $curlopts";
    #$proxy =" -x 194.237.142.41:80";

}

# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
@@ -169,20 +181,22 @@ sub GetRootPage {
	exit;
    }

    if($type ne "text/html") {
        # there is no point in getting anything but HTML
        $in="";
    }
    else {
        open(WEBGET, "$htmlget $geturl|") ||
            die "Couldn't get web page for some reason";

        while(<WEBGET>) {
            my $line = $_;
            push @indoc, $line;
            $line=~ s/\n/ /g;
            $line=~ s/\r//g;
#    print $line."\n";
            $in=$in.$line;
        }

        close(WEBGET);

    }
    return ($in, $code, $type);
}

@@ -252,6 +266,9 @@ sub GetLinks {
		if($done{$url}) {
		    # if this url already is done, do next
		    $done{$url}++;
                    if($verbose) {
                        print " FOUND $url but that is already checked\n";
                    }
		    next;
		}

@@ -311,8 +328,8 @@ while(1) {
    }

    if($error >= 400) {
        print "$geturl return $error, exiting\n";
        exit;
        print "ROOT page $geturl returned $error\n";
        next;
    }

    if($verbose == 2) {
@@ -375,8 +392,6 @@ while(1) {
        
        print "$success $count <".$tagtype{$url}."> $link $url\n";

        $rooturls{$link}++; # check this if not checked already
        
        if("BAD" eq $success) {
            $badlinks++;
            if($linenumber) {
@@ -389,6 +404,10 @@ while(1) {
                }
            }
        }
        else {
            # the link works, add it!
            $rooturls{$link}++; # check this if not checked already
        }
        
    }
}