Virtual Remote Control:
Test Site
Home |
Table of Contents |
Documentation
#!/usr/local/bin/perl -w
# $Id: getPageListFromWgetLog,v 1.3 2003/11/04 18:27:05 wrk1 Exp $
use strict;
use vars qw($opt_s $opt_h);
use Getopt::Std;
use Pod::Text;
#------------------------------------------------------------------------
# Help messages
#------------------------------------------------------------------------
# With no arguments at all there is nothing to do: print the short
# synopsis and stop (exit status 0, as before).
if (@ARGV == 0) {
    usage();
    exit;
}

# Check for options.  getopts() removes the switches it recognises from
# @ARGV and sets $opt_h / $opt_s; it warns by itself about unknown ones.
getopts('hs');

# -h: show the POD embedded in this script.  pod2text() writes the
# formatted text to STDOUT itself, so no surrounding print is needed.
if ($opt_h) {
    pod2text($0);
    exit;
}

# A switch given without a log file (for example only "-s") passes the
# first check above but leaves @ARGV empty once getopts() has consumed
# the switch.  Catch that here instead of continuing with an undefined
# file name.
if (@ARGV == 0) {
    usage();
    exit 1;
}
# ------------------------------------------------------------------------
# Open file if it exists. Exit otherwise with error msg.
my $log_file = $ARGV[0];
die "No log file given.\nRun '$0 -h' for help.\n" unless defined $log_file;

# Three-argument open with a lexical handle: the file name is taken
# literally, so characters such as '<', '>' or '|' in it are never
# interpreted as an open mode.
my $in;
unless (-f $log_file and open($in, '<', $log_file)) {
    die "$log_file can't be opened for reading.\nRun '$0 -h' for help.\n";
}

# URI scheme prefix printed in front of every saved page.  It stays empty
# unless -s is given and a "... request sent," line has been seen.
my $scheme = "";

# Read the log one line at a time rather than loading it all into memory.
while (my $line = <$in>) {

    ## If -s option is set, parse file for string with the following
    ## format: "HTTP request sent,". Construct a URI scheme string from
    ## the name of the protocol.
    ## NOTE(review): any word in front of " request sent," is accepted, so
    ## a line such as "Proxy request sent," would give "proxy://" --
    ## confirm whether such lines can occur in the logs being processed.
    if ($opt_s && $line =~ /(\w+) request sent,/) {
        $scheme = lc($1) . "://";
    }

    # Parse file for the strings with the following format: "16:36:21
    # (1.86 MB/s) - `prism.library.cornell.edu/control/index.html' saved
    # [1948]"
    # Extract the name of the saved page.
    if ($line =~ /`(.*)' saved/) {
        my $saved_page = $1;
        print $scheme, $saved_page, "\n";
    }
}

close $in;
#------------------------------------------------------------------------
# end of main()
#------------------------------------------------------------------------
# usage() -- print the short command-line synopsis to STDOUT.
# Takes no arguments.  The program name shown is whatever $0 holds.
sub usage {
    my @synopsis = (
        "SYNTAX: $0 [-h|-s] logfile",
        'where logfile is a wget log file',
        'OPTIONS:',
        '-s -- prepends the protocol string (the "URI scheme", usually',
        '"http://" ) to the name of the saved pages.',
        '-h -- prints more documentation',
    );

    # Emit everything as one string, each entry terminated by a newline.
    print join('', map { "$_\n" } @synopsis);
}
__END__
$Log: getPageListFromWgetLog,v $
Revision 1.3 2003/11/04 18:27:05 wrk1
*** empty log message ***
Revision 1.2 2003/09/17 17:26:51 wrk1
Production Version
Revision 1.1 2003/09/17 16:43:50 wrk1
The beginning -- POD, usage(), pseudo-code
=head1 NAME
getPageListFromWgetLog
=head1 SYNTAX
getPageListFromWgetLog.pl [-h|-s] logfile
where logfile is a wget log file
=head1 OPTIONS
-s -- prepends the protocol string (the "URI scheme", usually
"http://" ) to the name of the saved pages.
-h -- prints this documentation
=head1 DESCRIPTION
This script parses the log file generated by wget during a crawl. It
looks for lines like the following:
"16:36:21 (1.86 MB/s) - `prism.library.cornell.edu/control/index.html' saved [1948]"
It extracts the URI fragment and prints the string to STDOUT.
=cut
|
© IRIS Research Department, Cornell University Library, 2003
Send questions/comments/suggestions to vrc-testsite@cornell.edu