Virtual Remote Control:
Test Site

Home | Table of Contents | Documentation

#!/usr/local/bin/perl -w
# $Id: getPageListFromWgetLog,v 1.3 2003/11/04 18:27:05 wrk1 Exp $
use strict;
use vars qw($opt_s $opt_h);
use Getopt::Std;
use Pod::Text;

#------------------------------------------------------------------------
# Main script.
#
# Reads a wget log file (the first non-option argument) line by line and
# prints the name of every saved page to STDOUT, one per line. With -s,
# each name is prefixed with the URI scheme taken from the most recent
# "<PROTOCOL> request sent," line seen so far in the log.
#
# Exit behaviour:
#   - no arguments at all : print the short usage text, exit 0
#   - -h                  : print the POD documentation, exit 0
#   - unknown option      : print the short usage text, exit 1
#   - missing/unreadable logfile : die with a message on STDERR
#------------------------------------------------------------------------

# No arguments at all: show the short help and stop.
if (@ARGV == 0) {
    usage();
    exit;
}

# Check for options. getopts() strips recognised switches from @ARGV and
# returns false (after warning) when it meets an unknown one.
if (!getopts('hs')) {
    usage();
    exit 1;
}

# pod2text() writes the formatted POD of this script to STDOUT itself,
# so its return value does not need to be printed.
if ($opt_h) {
    pod2text($0);
    exit;
}

# ------------------------------------------------------------------------
# Open file if it exists. Exit otherwise with error msg.
# The logfile must still be present after the options were removed
# (e.g. "script -s" alone leaves @ARGV empty).
my $logfile = $ARGV[0];
defined $logfile
    or die "No logfile given.\nRun '$0 -h' for help.\n";

-f $logfile
    or die "$logfile can't be opened for reading.\nRun '$0 -h' for help.\n";

# Three-argument open with a lexical handle: the filename is never
# interpreted as a mode or a command, and the handle is scoped.
open(my $in, '<', $logfile)
    or die "$logfile can't be opened for reading: $!\nRun '$0 -h' for help.\n";

# Scheme prefix for -s; stays empty until a "request sent," line is seen
# (and always stays empty when -s is not given).
my $scheme = "";

# Stream the log instead of slurping it: wget logs of large crawls can be big.
while (my $line = <$in>) {
    ## If -s option is set, parse file for string with the following
    ## format: "HTTP request sent,". Construct a URI scheme string from
    ## the name of the protocol.
    if ($opt_s && $line =~ /(\w+) request sent,/) {
        $scheme = lc($1) . "://";
    }

    # Parse file for the strings with the following format: "16:36:21
    # (1.86 MB/s) - `prism.library.cornell.edu/control/index.html' saved
    # [1948]"
    # Extract the name of the saved page. The greedy match is deliberate:
    # it keeps names that themselves contain an apostrophe intact.
    if ($line =~ /`(.*)' saved/) {
        print $scheme, $1, "\n";
    }
}
close $in;
#------------------------------------------------------------------------
# end of main()
#------------------------------------------------------------------------

# usage()
# Prints a short synopsis of the command-line syntax and the available
# options to STDOUT. Takes no arguments; $0 is interpolated so the text
# shows the name the script was invoked under.
sub usage {
    my $synopsis = <<"END_OF_USAGE";

 SYNTAX: $0 [-h|-s] logfile
    where logfile is a wget log file

  OPTIONS:
  -s -- prepends the protocol string (the "URI scheme", usually
   "http://" ) to the name of the saved pages.
  -h -- prints more documentation

END_OF_USAGE

    print $synopsis;
}
__END__

$Log: getPageListFromWgetLog,v $
Revision 1.3  2003/11/04 18:27:05  wrk1
*** empty log message ***

Revision 1.2  2003/09/17 17:26:51  wrk1
Production Version

Revision 1.1  2003/09/17 16:43:50  wrk1
The beginning -- POD, usage(), pseudo-code


=head1 NAME

getPageListFromWgetLog

=head1 SYNTAX

 getPageListFromWgetLog.pl [-h|-s] logfile
   where logfile is a wget log file

=head1 OPTIONS 

  -s -- prepends the protocol string (the "URI scheme", usually
   "http://" ) to the name of the saved pages.

  -h -- prints this documentation

=head1 DESCRIPTION

This script parses the log file generated by wget during a crawl. It
looks for lines like the following:

"16:36:21 (1.86 MB/s) - `prism.library.cornell.edu/control/index.html' saved [1948]"

It extracts the URI fragment and prints the string to STDOUT.
  
=cut

© IRIS Research Department, Cornell University Library, 2003
Send questions/comments/suggestions to vrc-testsite@cornell.edu