Commit a2790f75 authored by Daniel Stenberg

removed generated files

parent 90719eb3
#!/usr/local/bin/perl
#
# checklinks.pl
#
# This script extracts all links from an HTML page and checks their validity.
# Written to use 'curl' for URL checking.
#
# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
# Version: 0.7 Sept 30, 1998
#
# HISTORY
#
# 0.5 - Cuts off the #-part from links before checking.
#
# 0.6 - Now deals with error codes 3XX better and follows the Location:
# properly.
# - Added the -x flag that only checks http:// -links
#
# 0.7 - For a URL like http://www.viunga.se/main.html, the parser didn't
# realize there was no path, only a document. Now it does.
#
#
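# Example invocation (hypothetical URL, shown for illustration only):
#
#   ./checklinks.pl -v http://www.example.com/docs/
#
# fetches the page with curl, extracts its links and prints one GOOD/BAD
# line per unique link found.
#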
$in="";
argv:
if($ARGV[0] eq "-v" ) {
$verbose = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-i" ) {
$usestdin = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-l" ) {
$linenumber = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-h" ) {
$help = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-x" ) {
$external = 1;
shift @ARGV;
goto argv;
}
$geturl = $ARGV[0];
if(($geturl eq "") || $help) {
print "Usage: $0 [-hilvx] <full URL>\n",
" Use a traling slash for directory URLs!\n",
" -h This help text\n",
" -i Read the initial page from stdin\n",
" -l Line number report for BAD links\n",
" -v Verbose mode\n",
" -x Check non-local (external?) links only\n";
exit;
}
if($ARGV[1] eq "-") {
print "We use stdin!\n";
$usestdin = 1;
}
# This proxy setting was necessary from where I tried this:
#$proxy =" -x 194.237.142.41:80";
# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
$linkcheck = "curl -s -m 20 -I$proxy";
# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
$linkcheckfull = "curl -s -m 20 -i$proxy";
# htmlget, URL will be appended to the right of this command line
$htmlget = "curl -s$proxy";
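# (curl flags used above, for reference: -s silent mode, -m 20 gives up
# after 20 seconds, -I makes a HEAD request, -i includes the response
# headers in the output)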
# Parse the input URL and split it into the relevant parts:
sub SplitURL {
my $inurl = $_[0];
if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = $4;
}
elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = "";
if($getpath !~ /\//) {
$getpath ="";
$getdocument = $3;
}
}
elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = "";
$getdocument = "";
}
else {
print "Couldn't parse the specified URL, retry please!\n";
exit;
}
}
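# Illustration (hypothetical URLs) of how SplitURL decomposes its input:
#
#   http://host.dom/dir/page.html -> protocol "http", server "host.dom",
#                                    path "dir", document "page.html"
#   http://host.dom/page.html     -> path "",    document "page.html"
#   http://host.dom               -> path "",    document ""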
&SplitURL($geturl);
#print "protocol = $getprotocol\n";
#print "server = $getserver\n";
#print "path = $getpath\n";
#print "document = $getdocument\n";
#exit;
if(!$usestdin) {
open(HEADGET, "$linkcheck $geturl|") ||
die "Couldn't get web page for some reason";
headget:
while(<HEADGET>) {
# print $_;
if($_ =~ /HTTP\/.*3\d\d /) {
$pagemoved=1;
}
elsif($pagemoved &&
($_ =~ /^Location: (.*)/)) {
$geturl = $1;
&SplitURL($geturl);
$pagemoved++;
last headget;
}
}
close(HEADGET);
if($pagemoved == 1) {
print "Page is moved but we don't know where. Did you forget the ",
"traling slash?\n";
exit;
}
open(WEBGET, "$htmlget $geturl|") ||
die "Couldn't get web page for some reason";
while(<WEBGET>) {
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
# print $line."\n";
$in=$in.$line;
}
close(WEBGET);
}
else {
while(<STDIN>) {
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
$in=$in.$line;
}
}
#print length($in)."\n";
sub LinkWorks {
my $check = $_[0];
# URL encode:
# $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;
@doc = `$linkcheck \"$check\"`;
$head = 1;
# print "COMMAND: $linkcheck \"$check\"\n";
# print $doc[0]."\n";
boo:
if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
$error = $1;
if($error < 400 ) {
return "GOOD";
}
else {
if($head && ($error >= 500)) {
# This server doesn't like HEAD!
@doc = `$linkcheckfull \"$check\"`;
$head = 0;
goto boo;
}
return "BAD";
}
}
return "BAD";
}
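# Illustration: a reply starting with "HTTP/1.0 200 OK" makes LinkWorks
# return GOOD, "HTTP/1.0 404 Not Found" returns BAD, and a 5XX answer to
# the HEAD request is retried once with a full GET since some servers
# don't implement HEAD properly.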
sub GetLinks {
my $in = $_[0];
my @result;
getlinkloop:
while($in =~ /[^<]*(<[^>]+>)/g ) {
# we have a tag in $1
$tag = $1;
if($tag =~ /^<!--/) {
# this is a comment tag, ignore it
}
else {
if($tag =~ /(src|href|background|archive) *= *(\"[^\"]*\"|[^ )>]*)/i) {
$url=$2;
if($url =~ /^\"(.*)\"$/) {
# this was a quoted string; $1 now holds it without the quotes:
$url=$1;
}
$url =~ s/([^\#]*)\#.*/$1/g;
if($url eq "") {
# if the link was nothing but a #-link it may now have
# been emptied completely, so then we skip the rest
next getlinkloop;
}
if($done{$url}) {
# if this url already is done, do next
$done{$url}++;
next getlinkloop;
}
$done{$url} = 1; # this is "done"
push @result, $url;
if($tag =~ /< *([^ ]+)/) {
# print "TAG: $1\n";
$tagtype{$url}=$1;
}
}
}
}
return @result;
}
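# Illustration: fed the tag <a href="sub/page.html#top">, GetLinks strips
# the #-part, returns "sub/page.html" once (repeated occurrences only bump
# the $done counter) and records $tagtype{"sub/page.html"} = "a".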
@links = &GetLinks($in);
linkloop:
for(@links) {
$url = $_;
if($url =~ /^([^:]+):/) {
$prot = $1;
# if($prot !~ /(http|ftp|gopher)/i) {
if($prot !~ /http/i) {
# this is an unsupported protocol, we ignore this
next linkloop;
}
$link = $url;
}
else {
if($external) {
next linkloop;
}
# this is a link on the same server:
if($url =~ /^\//) {
# from root
$link = "$getprotocol://$getserver$url";
}
else {
# from the scanned page's dir
$nyurl=$url;
if(length($getpath) &&
($getpath !~ /\/$/) &&
($nyurl !~ /^\//)) {
# the path lacks a trailing slash, add one before the document part:
$nyurl = "/".$nyurl;
}
$link = "$getprotocol://$getserver/$getpath$nyurl";
}
}
#print "test $link\n";
#$success = "GOOD";
$success = &LinkWorks($link);
$count = $done{$url};
$allcount += $count;
print "$success $count <".$tagtype{$url}."> $link $url\n";
# If bad and -l, present the line numbers of the usage
if("BAD" eq $success) {
$badlinks++;
if($linenumber) {
$line =1;
for(@indoc) {
if($_ =~ /$url/) {
print " line $line\n";
}
$line++;
}
}
}
}
if($verbose) {
print "$allcount links were checked";
if($badlinks > 0) {
print ", $badlinks were found bad";
}
print "\n";
}
#!/usr/local/bin/perl
#
# formfind.pl
#
# This script gets an HTML page from the specified URL and presents form
# information you may need in order to machine-make a response to the form.
#
# Written to use 'curl' for URL fetching.
#
# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
# Version: 0.1 Nov 12, 1998
#
# HISTORY
#
# 0.1 - Created now!
#
# TODO
# respect file:// URLs for local file fetches!
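#
# Example invocation (hypothetical URL, shown for illustration only):
#
#   ./formfind.pl http://www.example.com/login.html
#
# The resulting report can then be turned into a submission by hand,
# e.g. curl -d "name=value&other=value" <action URL> for a POST form,
# or curl -F "name=value" <action URL> for a multipart/form-data form.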
$in="";
$usestdin = 0;
if($ARGV[0] eq "" ) {
$usestdin = 1;
}
else {
$geturl = $ARGV[0];
}
if(($geturl eq "") && !$usestdin) {
print "Usage: $0 <full source URL>\n",
" Use a traling slash for directory URLs!\n";
exit;
}
# If you need a proxy for web access, edit your .curlrc file to feature
# -x <proxy:port>
# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
$linkcheck = "curl -s -m 20 -I";
# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
$linkcheckfull = "curl -s -m 20 -i";
# htmlget, URL will be appended to the right of this command line
$htmlget = "curl -s";
# urlget, URL will be appended to the right of this command line
# this stores the file with the remote file name in the current dir
$urlget = "curl -O -s";
# Parse the input URL and split it into the relevant parts:
sub SplitURL {
my $inurl = $_[0];
if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = $4;
}
elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = "";
if($getpath !~ /\//) {
$getpath ="";
$getdocument = $3;
}
}
elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = "";
$getdocument = "";
}
else {
print "Couldn't parse the specified URL, retry please!\n";
exit;
}
}
if(!$usestdin) {
&SplitURL($geturl);
#print "protocol = $getprotocol\n";
#print "server = $getserver\n";
#print "path = $getpath\n";
#print "document = $getdocument\n";
#exit;
open(HEADGET, "$linkcheck $geturl|") ||
die "Couldn't get web page for some reason";
headget:
while(<HEADGET>) {
# print $_;
if($_ =~ /HTTP\/.*3\d\d /) {
$pagemoved=1;
}
elsif($pagemoved &&
($_ =~ /^Location: (.*)/)) {
$geturl = $1;
&SplitURL($geturl);
$pagemoved++;
last headget;
}
}
close(HEADGET);
if($pagemoved == 1) {
print "Page is moved but we don't know where. Did you forget the ",
"traling slash?\n";
exit;
}
open(WEBGET, "$htmlget $geturl|") ||
die "Couldn't get web page for some reason";
while(<WEBGET>) {
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
# print $line."\n";
$in=$in.$line;
}
close(WEBGET);
}
else {
while(<STDIN>) {
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
$in=$in.$line;
}
}
getlinkloop:
while($in =~ /[^<]*(<[^>]+>)/g ) {
# we have a tag in $1
$tag = $1;
if($tag =~ /^<!--/) {
# this is a comment tag, ignore it
}
else {
if(!$form &&
($tag =~ /^< *form/i )) {
$method= $tag;
if($method =~ /method *=/i) {
$method=~ s/.*method *= *(\"|)([^ \">]*).*/$2/gi;
}
else {
$method="get"; # default method
}
$action= $tag;
$action=~ s/.*action *= *(\"|)([^ \">]*).*/$2/gi;
$method=uc($method);
$enctype=$tag;
if ($enctype =~ /enctype *=/) {
$enctype=~ s/.*enctype *= *(\'|\"|)([^ \"\'>]*).*/$2/gi;
if($enctype eq "multipart/form-data") {
$enctype="multipart form upload [use -F]"
}
$enctype = "\n--- type: $enctype";
}
else {
$enctype="";
}
print "--- FORM report. Uses $method to URL \"$action\"$enctype\n";
# print "TAG: $tag\n";
# print "METHOD: $method\n";
# print "ACTION: $action\n";
$form=1;
}
elsif($form &&
($tag =~ /< *\/form/i )) {
# print "TAG: $tag\n";
print "--- end of FORM\n";
$form=0;
if( 0 ) {
print "*** Fill in all or any of these: (default assigns may be shown)\n";
for(@vars) {
$var = $_;
$def = $value{$var};
print "$var=$def\n";
}
print "*** Pick one of these:\n";
for(@alts) {
print "$_\n";
}
}
undef @vars;
undef @alts;
}
elsif($form &&
($tag =~ /^< *(input|select)/i)) {
$mtag = $1;
# print "TAG: $tag\n";
$name=$tag;
if($name =~ /name *=/i) {
$name=~ s/.*name *= *(\"|)([^ \">]*).*/$2/gi;
}
else {
# no name given
$name="";
}
# get value tag
$value= $tag;
if($value =~ /value *=/i) {
$value=~ s/.*value *= *(\"|)([^ \">]*).*/$2/gi;
}
else {
$value="";
}
if($mtag =~ /select/i) {
print "Select: $name\n";
push @vars, "$name";
$select = 1;
}
else {
$type=$tag;
if($type =~ /type *=/i) {
$type =~ s/.*type *= *(\"|)([^ \">]*).*/$2/gi;
}
else {
$type="text"; # default type
}
$type=uc($type);
if(lc($type) eq "reset") {
# reset types are for UI only, ignore.
}
elsif($name eq "") {
# let's read the value parameter
print "Button: \"$value\" ($type)\n";
push @alts, "$value";
}
else {
$info="";
if($value ne "") {
$info="=$value";
}
print "Input: $name$info ($type)\n";
push @vars, "$name";
# store default value:
$value{$name}=$value;
}
}
}
elsif($select &&
($tag =~ /^< *\/ *select/i)) {
$select = 0;
}
}
}
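# Worked example (hypothetical form, traced through the loop above): the
# HTML
#   <form method=post action="/login"><input name=user>
#   <input type=submit value=OK></form>
# makes this script print:
#   --- FORM report. Uses POST to URL "/login"
#   Input: user (TEXT)
#   Button: "OK" (SUBMIT)
#   --- end of FORM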
#!/usr/local/bin/perl
#
# getlinks.pl
#
# This script extracts all links from an HTML page, compares them to a pattern
# entered on the command line and then downloads matching links into the
# target dir (also specified on the command line).
#
# Written to use 'curl' for URL fetching, uses the source file names in the
# target directory.
#
# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
# Version: 0.1 Oct 7, 1998
#
# HISTORY
#
# 0.1 - Created now!
#
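# Example invocation (hypothetical URL and pattern, illustration only):
#
#   ./getlinks.pl http://www.example.com/pics/ /tmp/pics "\.gif"
#
# downloads every link matching \.gif from that page into /tmp/pics;
# with -d the matches are only listed, not downloaded.
#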
$in="";
argv:
if($ARGV[0] eq "-v" ) {
$verbose = 1;
shift @ARGV;
goto argv;
}
if($ARGV[0] eq "-d" ) {
$display = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-h" ) {
$help = 1;
shift @ARGV;
goto argv;
}
$geturl = $ARGV[0];
$getdir = $ARGV[1];
$getregex = $ARGV[2];
if(($geturl eq "") ||
(($getdir eq "") && !$display) ||
$help) {
print "Usage: $0 [-hv] <full source URL> <target dir> [regex]\n",
" Use a traling slash for directory URLs!\n",
" Use \"quotes\" around the regex!\n",
" -h This help text\n",
" -d Display matches only instead of downloading\n",
" -v Verbose mode\n";
exit;
}
# change to target directory:
chdir $getdir or
die "couldn't cd into $getdir";
# This proxy setting was necessary from where I tried this:
#$proxy =" -x 194.237.142.41:80";
# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
$linkcheck = "curl -s -m 20 -I$proxy";
# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
$linkcheckfull = "curl -s -m 20 -i$proxy";
# htmlget, URL will be appended to the right of this command line
$htmlget = "curl -s$proxy";
# urlget, URL will be appended to the right of this command line
# this stores the file with the remote file name in the current dir
$urlget = "curl -O -s$proxy";
# Parse the input URL and split it into the relevant parts:
sub SplitURL {
my $inurl = $_[0];
if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = $4;
}
elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = "";
if($getpath !~ /\//) {
$getpath ="";
$getdocument = $3;
}
}
elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = "";
$getdocument = "";
}
else {
print "Couldn't parse the specified URL, retry please!\n";
exit;
}
}
&SplitURL($geturl);
#print "protocol = $getprotocol\n";
#print "server = $getserver\n";
#print "path = $getpath\n";
#print "document = $getdocument\n";
#exit;
if(!$usestdin) {
open(HEADGET, "$linkcheck $geturl|") ||
die "Couldn't get web page for some reason";
headget:
while(<HEADGET>) {
# print $_;
if($_ =~ /HTTP\/.*3\d\d /) {
$pagemoved=1;
}
elsif($pagemoved &&
($_ =~ /^Location: (.*)/)) {
$geturl = $1;
&SplitURL($geturl);
$pagemoved++;
last headget;
}
}
close(HEADGET);
if($pagemoved == 1) {
print "Page is moved but we don't know where. Did you forget the ",
"traling slash?\n";
exit;
}
open(WEBGET, "$htmlget $geturl|") ||
die "Couldn't get web page for some reason";
while(<WEBGET>) {
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
# print $line."\n";
$in=$in.$line;
}
close(WEBGET);
}
else {
while(<STDIN>) {
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
$in=$in.$line;
}
}
sub GetLinks {
my $in = $_[0];
my @result;
getlinkloop:
while($in =~ /[^<]*(<[^>]+>)/g ) {
# we have a tag in $1
$tag = $1;
if($tag =~ /^<!--/) {
# this is a comment tag, ignore it
}
else {
if($tag =~ /(src|href|background|archive) *= *(\"[^\"]*\"|[^ )>]*)/i) {
$url=$2;
if($url =~ /^\"(.*)\"$/) {
# this was a quoted string; $1 now holds it without the quotes:
$url=$1;
}
$url =~ s/([^\#]*)\#.*/$1/g;
if($url eq "") {
# if the link was nothing but a #-link it may now have
# been emptied completely, so then we skip the rest
next getlinkloop;
}
if($done{$url}) {
# if this url already is done, do next
$done{$url}++;
next getlinkloop;
}
$done{$url} = 1; # this is "done"
push @result, $url;
if($tag =~ /< *([^ ]+)/) {
# print "TAG: $1\n";
$tagtype{$url}=$1;
}
}
}
}
return @result;
}
@links = &GetLinks($in);
linkloop:
for(@links) {
$url = $_;
if($url =~ /^([^:]+):/) {
$link = $url;
}
else {
# this is a link on the same server:
if($url =~ /^\//) {
# from root
$link = "$getprotocol://$getserver$url";
}
else {
# from the scanned page's dir
$nyurl=$url;
if(length($getpath) &&
($getpath !~ /\/$/) &&
($nyurl !~ /^\//)) {
# the path lacks a trailing slash, add one before the document part:
$nyurl = "/".$nyurl;
}
$link = "$getprotocol://$getserver/$getpath$nyurl";
}
}
if($link =~ /$getregex/) {
if($display) {
print "$link\n";
}
else {
if($verbose) {
print "Gets $link\n";
}
print `$urlget $link`;
}
}
}
#!/usr/local/bin/perl
#
# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
# Date: August 25 1998
# Version: 0.1
#
# This is just meant as an example of why we wrote curl in the first place.
# Quick n' easy scripting use.
#
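# Example invocation (hypothetical FTP directory URL, illustration only;
# the script name below is made up, use whatever this file is saved as):
#
#   ./recurseget.pl ftp://ftp.example.com/pub/ /tmp/mirror 2
#
# mirrors the listing into /tmp/mirror, recursing at most two levels deep.
#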
$dir = $ARGV[0];
$target = $ARGV[1];
$maxdepth = $ARGV[2];
if($dir eq "" || $target eq "") {
print "Usage: <URL> <dir> [max depth level] \n";
print " End the URL with a slash if a directory is specified, please\n";
exit;
}
if(($maxdepth ne "") && ($maxdepth == 0)) {
# reached maximum depth, die
print "Reached maximum recursive depth level ($maxdepth), exiting...\n";
exit;
}
# get dir
@all = `curl -s $dir`;
if($all[0] ne "") {
print "Got the main $dir dir\n";
}
line:
for(@all) {
chop; # cut off newline
@linep= split(" ", $_);
$name = $linep[$#linep];
$firstletter=substr($linep[0], 0, 1);
if($firstletter eq "d") {
# this is a subdir, recurse
# if not . or .. of course
if(($name eq ".") || ($name eq "..")) {
next line;
}
print "Recursing for dir $dir$name in target $target/$name\n";
$nextdepth=$maxdepth-1;
print `$0 $dir$name/ $target/$name $nextdepth`;
}
elsif($firstletter eq "-") {
# this is a file, get it
# oh, make sure the target dir exists first
if(! -d $target ) {
mkdir($target,0777);
}
print "Getting file $dir$name in target $target/$name\n";
print `curl -s $dir$name >$target/$name`;
}
}