TLMSP curl / Commits

Commit 6ad9bd80
authored 24 years ago by Daniel Stenberg

crawls through a whole site and verifies links

parent ec5ac82c
Changes: 1 changed file, perl/crawlink.pl (new file, mode 100755), 402 additions and 0 deletions.
#!/usr/bin/perl
#
# crawlink.pl
#
# This script crawls across all found links below the given "root" URL.
# It reports all good and bad links to stdout. This code was based on the
# checklink.pl script I wrote ages ago.
#
# Written to use 'curl' for URL checking.
#
# Author: Daniel Stenberg <daniel@haxx.se>
# Version: 0.1 Dec 14, 2000
#
# HISTORY
#
# 0.1 - The given url works as the root. This script will only continue
# and check other URLs if the leftmost part of the new URL is identical
# to the root URL.
#
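The HISTORY note above describes the crawl scope: a found URL is only followed when its leftmost part matches the root URL. That amounts to the prefix test the main loop later performs with /^$firsturl/. A minimal sketch of the idea, assuming a hypothetical root of http://example.com/docs/ (the \Q...\E quoting is added here to neutralise regex metacharacters in the URL; the script itself uses the URL directly):

use strict;

my $root = "http://example.com/docs/";                    # hypothetical root URL
for my $candidate ("http://example.com/docs/a.html",      # same prefix: crawled
                   "http://example.com/other/b.html",     # outside the root path: skipped
                   "http://elsewhere.org/") {             # different host: skipped
    print $candidate, ($candidate =~ /^\Q$root\E/ ? " -> crawl\n" : " -> skip\n");
}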
use strict;

my $in="";
my $verbose=0;
my $usestdin;
my $linenumber;
my $help;
my $external;
argv:
if($ARGV[0] eq "-v" ) {
    $verbose++;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-l" ) {
    $linenumber = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-h" ) {
    $help = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-x" ) {
    $external = 1;
    shift @ARGV;
    goto argv;
}
my $geturl = $ARGV[0];
my $firsturl = $geturl;

#
# Define a hash array to hold all root URLs to visit/we have visited
#
my %rooturls;
$rooturls{$ARGV[0]} = 1;
if(($geturl eq "") || $help) {
    print "Usage: $0 [-hilvx] <full URL>\n",
        " Use a trailing slash for directory URLs!\n",
        " -h  This help text\n",
        " -l  Line number report for BAD links\n",
        " -v  Verbose mode\n",
        " -x  Check non-local (external?) links only\n";
    exit;
}
# This is necessary from where I tried this:
my $proxy="";
#$proxy =" -x 194.237.142.41:80";

# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
my $linkcheck = "curl -s -m 20 -I$proxy";

# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
my $linkcheckfull = "curl -s -m 20 -i$proxy";

# htmlget, URL will be appended to the right of this command line
my $htmlget = "curl -s$proxy";
# Parse the input URL and split it into the relevant parts:
my $getprotocol;
my $getserver;
my $getpath;
my $getdocument;

my %done;
my %tagtype;
my $allcount=0;
my $badlinks=0;
sub SplitURL {
    my $inurl = $_[0];

    if($inurl =~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = $3;
        $getdocument = $4;
    }
    elsif($inurl =~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = $3;
        $getdocument = "";

        if($getpath !~ /\//) {
            $getpath = "";
            $getdocument = $3;
        }
    }
    elsif($inurl =~ /^([^:]+):\/\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = "";
        $getdocument = "";
    }
    else {
        print "Couldn't parse the specified URL, retry please!\n";
        exit;
    }
}
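SplitURL above breaks a URL into protocol, server, path and document through three regex branches, from the most to the least specific. A short sketch of what the first and last branches would capture for hypothetical inputs (the patterns are the ones used above):

use strict;

# First branch: URL with both a path and a document part.
if("http://example.com/docs/sub/index.html" =~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/) {
    print "protocol=$1 server=$2 path=$3 document=$4\n";
    # prints: protocol=http server=example.com path=docs/sub document=index.html
}

# Last branch: bare server URL, so path and document stay empty.
if("http://example.com" =~ /^([^:]+):\/\/(.*)/) {
    print "protocol=$1 server=$2\n";   # prints: protocol=http server=example.com
}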
my @indoc;
sub GetRootPage {
    my $geturl = $_[0];
    my $in="";
    my $code=200;
    my $type="text/plain";

    my $pagemoved=0;

    open(HEADGET, "$linkcheck $geturl|") ||
        die "Couldn't get web page for some reason";

    while(<HEADGET>) {
        #print STDERR $_;
        if($_ =~ /HTTP\/1\.[01] (\d\d\d) /) {
            $code=$1;
            if($code =~ /^3/) {
                $pagemoved=1;
            }
        }
        elsif($_ =~ /^Content-Type: ([\/a-zA-Z]+)/) {
            $type=$1;
        }
        elsif($pagemoved && ($_ =~ /^Location: (.*)/)) {
            $geturl = $1;

            &SplitURL($geturl);

            $pagemoved++;
            last;
        }
    }
    close(HEADGET);

    if($pagemoved == 1) {
        print "Page is moved but we don't know where. Did you forget the ",
            "trailing slash?\n";
        exit;
    }

    open(WEBGET, "$htmlget $geturl|") ||
        die "Couldn't get web page for some reason";

    while(<WEBGET>) {
        my $line = $_;
        push @indoc, $line;
        $line =~ s/\n/ /g;
        $line =~ s/\r//g;
        # print $line."\n";
        $in = $in.$line;
    }
    close(WEBGET);

    return ($in, $code, $type);
}
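GetRootPage asks curl for the headers first (the $linkcheck HEAD command), picks up the status code and Content-Type, and for a 3xx answer restarts from the Location header. A small sketch of the status-line match used above, applied to a hypothetical response line:

use strict;

my $statusline = "HTTP/1.1 301 Moved Permanently";   # hypothetical header line
if($statusline =~ /HTTP\/1\.[01] (\d\d\d) /) {
    my $code = $1;                                    # 301
    print "page moved\n" if($code =~ /^3/);           # any 3xx marks the page as moved
}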
sub LinkWorks {
    my $check = $_[0];

    # URL encode:
    # $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;

    my @doc = `$linkcheck \"$check\"`;

    my $head = 1;

    # print "COMMAND: $linkcheck \"$check\"\n";
    # print $doc[0]."\n";

  boo:
    if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
        my $error = $1;

        if($error < 400 ) {
            return "GOOD";
        }
        else {
            if($head && ($error >= 500)) {
                # This server doesn't like HEAD!
                @doc = `$linkcheckfull \"$check\"`;
                $head = 0;
                goto boo;
            }
            return "BAD";
        }
    }
    return "BAD";
}
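LinkWorks reads the first status line that curl returns for the link: a code below 400 counts as GOOD, anything else as BAD, and a 5xx answer to the HEAD request triggers one retry with the full-fetch command, since some servers reject HEAD. A sketch of that decision with the status code passed in directly; classify() is a hypothetical helper, not part of the commit:

use strict;

sub classify {
    my ($code, $was_head) = @_;
    return "GOOD" if $code < 400;                          # 2xx and 3xx pass
    return "RETRY-WITH-GET" if $was_head && $code >= 500;  # server may dislike HEAD
    return "BAD";
}

print classify(302, 1), "\n";   # GOOD: redirects count as working links
print classify(503, 1), "\n";   # RETRY-WITH-GET
print classify(404, 1), "\n";   # BAD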
sub GetLinks {
    my $in = $_[0];
    my @result;

    while($in =~ /[^<]*(<[^>]+>)/g ) {
        # we have a tag in $1
        my $tag = $1;

        if($tag =~ /^<!--/) {
            # this is a comment tag, ignore it
        }
        else {
            if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ \)>]*)/i) {
                my $url = $2;
                if($url =~ /^\"(.*)\"$/) {
                    # this was a "string", now $1 has it with the quotes removed:
                    $url = $1;
                }

                $url =~ s/([^\#]*)\#.*/$1/g;

                if($url eq "") {
                    # if the link was nothing but a #-link it may now have
                    # been emptied completely, so then we skip the rest
                    next;
                }

                if($done{$url}) {
                    # if this url already is done, do next
                    $done{$url}++;
                    next;
                }

                $done{$url} = 1; # this is "done"

                push @result, $url;
                if($tag =~ /< *([^ ]+)/) {
                    $tagtype{$url} = $1;
                }
            }
        }
    }
    return @result;
}
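GetLinks scans every tag in the page buffer, pulls the value of a src, href, background or archive attribute, strips surrounding quotes and any #fragment, and records each URL once in %done. A short sketch of the same pattern applied to a few hypothetical tags:

use strict;

for my $tag ('<a href="sub/page.html#top">', '<img src=logo.png>', '<!-- a comment -->') {
    next if $tag =~ /^<!--/;                                  # comment tags are ignored
    if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ \)>]*)/i) {
        my $url = $2;
        $url = $1 if $url =~ /^\"(.*)\"$/;                    # drop surrounding quotes
        $url =~ s/([^\#]*)\#.*/$1/g;                          # drop the #fragment
        print "$url\n";                                       # sub/page.html, then logo.png
    }
}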
while(1) {
    $geturl = -1;

    for(keys %rooturls) {
        if($rooturls{$_} == 1) {
            if($_ !~ /^$firsturl/) {
                $rooturls{$_} += 1000; # don't do this, outside our scope
                if($verbose) {
                    print "SKIP: $_\n";
                }
                next;
            }
            $geturl = $_;
            last;
        }
    }

    if($geturl == -1) {
        last;
    }

    if($verbose) {
        print "ROOT: $geturl\n";
    }

    #
    # Splits the URL in its different parts
    #
    &SplitURL($geturl);

    #
    # Returns the full HTML of the root page
    #
    my ($in, $error, $ctype) = &GetRootPage($geturl);

    $rooturls{$geturl}++; # increase to prove we have already got it

    if($ctype ne "text/html") {
        # this is not HTML, we skip this
        if($verbose == 2) {
            print "Non-HTML link, skipping\n";
            next;
        }
    }

    if($error >= 400) {
        print "$geturl returned $error, exiting\n";
        exit;
    }

    if($verbose == 2) {
        printf("Error code $error, Content-Type: $ctype, got %d bytes\n",
               length($in));
    }

    #print "protocol = $getprotocol\n";
    #print "server = $getserver\n";
    #print "path = $getpath\n";
    #print "document = $getdocument\n";
    #exit;

    #
    # Extracts all links from the given HTML buffer
    #
    my @links = &GetLinks($in);

    for(@links) {
        my $url = $_;
        my $link;

        if($url =~ /^([^:]+):/) {
            my $prot = $1;
            if($prot !~ /http/i) {
                # this is an unsupported protocol, we ignore this
                next;
            }
            $link = $url;
        }
        else {
            if($external) {
                next;
            }

            # this is a link on the same server:
            if($url =~ /^\//) {
                # from root
                $link = "$getprotocol://$getserver$url";
            }
            else {
                # from the scanned page's dir
                my $nyurl = $url;
                if(length($getpath) &&
                   ($getpath !~ /\/$/) &&
                   ($nyurl !~ /^\//)) {
                    # lacks ending slash, add one to the document part:
                    $nyurl = "/".$nyurl;
                }
                $link = "$getprotocol://$getserver/$getpath$nyurl";
            }
        }

        my $success = &LinkWorks($link);

        my $count = $done{$url};

        $allcount += $count;

        print "$success $count <".$tagtype{$url}."> $link $url\n";

        $rooturls{$link}++; # check this if not checked already

        if("BAD" eq $success) {
            $badlinks++;
            if($linenumber) {
                my $line = 1;
                for(@indoc) {
                    if($_ =~ /$url/) {
                        print " line $line\n";
                    }
                    $line++;
                }
            }
        }
    }
}
if($verbose) {
    print "$allcount links were checked";
    if($badlinks > 0) {
        print ", $badlinks were found bad";
    }
    print "\n";
}
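For reference, a minimal sketch of driving the script from another Perl program. The path ./crawlink.pl and the root URL http://example.com/docs/ are assumptions for the example, not part of the commit; each checked link is reported on stdout as GOOD or BAD together with how many times it was referenced, the tag type, the resolved link and the raw URL, as the print in the main loop shows:

use strict;

# Verbose run with line-number reporting for bad links; note the trailing
# slash the script expects for directory URLs.
system("./crawlink.pl", "-v", "-l", "http://example.com/docs/") == 0
    or die "crawlink.pl failed: $?";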