#!/usr/bin/perl
#    OpaL Mirror Tool
#    Copyright (C) 2000  Ola Lundqvist
#    $Id: mirrortool.pl,v 1.16 2002/04/15 08:14:30 ola Exp $
#    For full COPYRIGHT notice see the COPYING document.
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#    For more information take a look at the official homepage at:
#      http://www.opal.dhs.org/programs/omt/
#    or contact the author at:
#      opal@lysator.liu.se
#
#

use File::Path qw(mkpath); 

###############################################################################
################################ HELP TEXT ####################################
###############################################################################

$version = "0.1.10";

$versionstr = "mirrortool.pl (OpaL Mirror Tool) version $version.\n";

$help = "
 OpaL Mirror Tool
 Written by Ola Lundqvist <opal\@lysator.liu.se>
 Lincensed under GPL

 Usage: mirrortool.pl [ options ] url [ options url ] [ ... ]

 options:
   --images             : Include <img src=xxx>:s in the download. (default)
   --noimages           : Do not include <img src=xxx>:s in the download.
   --depth n            : Maximum recursion depth. (default 1)
   --store \"regexp\"     : Files matching regexp are actually stored locally.
		        : It is possible to | separate (with or).
   --rewrite \"from=>to\" : Urls are rewritten using this rules.
		        : It is possible to | separate (with or).
                        : Do not rewrite the dir, because that it will affect
                        : later lookup. Have to fix this sometime.
   --what \"regexp\"      : Files matching regexp are downloaded and traversed.
		        : It is possible to | separate (with or).
   --dir basedir        : Where to store local files.
   --nohostcheck        : Do not check if url points to other host.
   --notreecheck        : Do not check if url points to other dirtree.
   --force              : Overwrite all files.
   --debug              : Print debug-messages.
   --retry n            : Number of times an url will be retried (default 1)
   --auth user:pass     : use Basic Authentication
   --proxy url		: Use a proxy server (like http://u:p@localhost/).
   --help		: Print this text.

 requires LWP::UserAgent

 For more information contact the author or look at:
   http://www.opal.dhs.org/programs/omt/

";

use LWP::UserAgent;
$hdrs = new HTTP::Headers(Accept => 'text/plain',
			  Accept => 'text/html',
			  Accept => 'image/gif',
			  Accept => 'image/jpeg',
			  User-Agent => 'OpaLMirrorTool/1.0'); 
$ua = new LWP::UserAgent;

# START DEFAULTS
# DO NOT CHANGE THEM UNLESS YOU KNOW WHAT YOU ARE DOING!
# images, what, rewrite, store, depth, basedir, baseurl, dir, url 
# =============================================================================
undef $/;
$images = 1;
$basedir = "";
$what = ".*";
$store = ".*";
$rewrite = "";
$depth = 1;
$retry = 1;
$debug = 0;
$outtree = 0;
$outhost = 0;
$auth = 0;
$proxy = "";

if ($ENV{'HTTP_PROXY'} ne "") {
    $proxy = $ENV{'HTTP_PROXY'};
}

# =============================================================================
# END DEFAULTS

# START INIT
# Dates:	2000-03-12 (changed)
# =============================================================================
while($_ = shift) {
  if (/^--/) {
    if (/^--help$/) {
      print($help);
      exit;
    }
    elsif (/^--version$/) {
      print($versionstr);
      exit;
    }
    elsif (/^--noimages$/)    { $images  = 0; }
    elsif (/^--images$/)      { $images  = 1; }
    elsif (/^--depth$/)       { $depth   = shift; }
    elsif (/^--retry$/)       { $retry   = shift; }
    elsif (/^--store$/)       { $store   = shift; }
    elsif (/^--what$/)        { $what    = shift; }
    elsif (/^--debug$/)       { $debug   = 1; }
    elsif (/^--nohostcheck$/) { $outhost = 1; }
    elsif (/^--notreecheck$/) { $outtree = 1; }
    elsif (/^--force$/)       { $force   = 1; }
    elsif (/^--dir$/)         { $basedir = &fixdir(shift); }
    elsif (/^--rewrite$/)     { $rewrite = shift; }
    elsif (/^--auth$/)        { $auth    = shift; }
    elsif (/^--proxy$/)	      { $proxy   = shift; }
    else {
      print ("There is no such option $_, EXITING!\n");
      exit;
    }
  }
  else {
    $url = $_;
    if ($url !~ /\w+:/) {
      print ("$url is not a valid url!\n");
      next;
    }
    s/(\/)[^\/]+$/$1/;
    $baseurl = $_;
    &push_init;
  }
}

if ($proxy ne "") {
    $ua->proxy(http => $proxy);
}

# =============================================================================
# END INIT

# START THE LOOP
# =============================================================================
# The loop that downloads the mirror
while (scalar @init_url > 0) {
  &shift_init;
  print ("OUTLOOP\n") if ($debug);
  while (scalar @download_url > 0) {
    &shift_it;
    print ("INLOOP\n") if ($debug);
  
    if (-f "$basedir$filename" && ! $force) {
      print ("File $basedir$filename does already exist!\n") if ($debug);
    }
    else {
      
      print ("Open URL: $url, $filename\n") if ($debug);
      
      # START DOWNLOAD
      $urlo = new URI($url);
      $req = new HTTP::Request(GET, $urlo, $hrds);
      #$req->authorization_basic('motp-excellence', 'nada');
 
      if ($auth) {
	($u, $p) = split(":", $auth);
	$req->authorization_basic($u, $p);
      }

      $resp = $ua->request($req);
      if (! ($resp->is_success)) {
	if (--$depth >= 0) {
	  print ("Failed open url $url, retry $depth more times.\n");
	  &push_it;
	}  
	else {
	  print ("Failed open url $url, ".$resp->message."\n");
	}
      }
      else {
	$file = $resp->content;
	print ("Read Done\n") if ($debug);
	# END DOWNLOAD
	
	# check if to handle
	if ((
	     ($file =~ /<\s*html\s*>/i) ||
	     ($file =~ /<\s*head\s*>/i) ||
	     ($file =~ /<\s*a\s/i) ||
	     ($file =~ /<\s*img\s/i) ||
	     ($file =~ /<\s*body/i)
	     ) &&
	    ($depth_o > 0)) {
	    print ("Handle it!\n") if ($debug);
	    $file = &handle_file($file);
	}
	print ("Save file: $filename\n") if ($debug);
	&save_file($file, $filename);
      }
    }
    print ("INLOOP DONE!\n") if ($debug);
  }
  print ("OUTLOOP DONE!\n") if ($debug);
}
# =============================================================================
# END THE LOOP

# -----------------------------------------------------------------------------
# Args:		
# Returns:    	
# Desctiption:	
# Uses:		
# Dates:	2000-03-12 (written)
# -----------------------------------------------------------------------------


# START HANDLE FUNCTIONS
# =============================================================================

# -----------------------------------------------------------------------------
# Args:		file data
# Returns:    	file data (possibly rewritten)
# Desctiption:	Process the file
# Uses:		handle_tags
# Type:		file handle
# Changes:
#	2000-03-11 Ola Lundqvist <opal@lysator.liu.se>
#		Written.
#	2000-03-14 Ola Lundqvist <opal@lysator.liu.se>
#		implemented --noimages, forgot $images
#	2002-14-15 Sebastiaan <sebastia@ch.twi.tudelft.nl>
#		Added support for area tags.
# Status:	Should be done.
# -----------------------------------------------------------------------------
sub handle_file {
  my $lo = $_;
  my $file = "";
  # Get all tags
  #$file =~ s/\n/ /g;
  foreach (split /(<[^>]+>)/, (shift)) {
    # if image tag
    if (/<\s*img\s/i && $images) {
      $_ = &handle_tag("src", $_);
    }
    elsif (/<\s*a\s/i) {
      $_ = &handle_tag("href", $_);
    }
    elsif (/<\s*link\s/i) {
      $_ = &handle_tag("href", $_);
    }
    elsif (/<\s*area\s/i) {
	$_ = &handle_tag("href", $_);
    }
    $file = $file.$_;
  }
  &get_it;
  $_ = $lo;
  return $file;
}

# -----------------------------------------------------------------------------
# Args:		search for
#		tag string		
# Returns:	rewritten tag (not implemented yet)    	
# Description:	search for urls and add them to queue (in later versions may
#		rewrite it too)
# Uses:		global data in queues.
# Dates:	2000-03-11 (written)
# -----------------------------------------------------------------------------
sub handle_tag {
  my $lo = $_;
  my $searchf = shift;
  &get_it;
  my $tag = shift;
  my $turl = &tag_attribute($searchf, $tag);
  print ("handle_tag: $turl\n") if ($debug);
  if ($turl =~ /^\s*$/) {
    $_ = $lo;
    return $tag 
  }
  # if there is a url and if it should be downloaded then...
  if (($turl !~ /^$/) && (&find_in_list($what, $turl))) {
    my $type;
#    print ("handle_tag0: $turl\n") if ($debug);
    ($type, $url, $filename, $filerel) = &handle_url($turl);
    print ("handle_tag: $filename\n") if ($debug);
    $depth--;
    $depth = 0 if ($searchf =~ /^src$/i);
    
    print ("handle_tag/type: $type $outhost\n") if ($debug);
    if (($type =~ /external/ && $outhost) ||
	($type =~ /intree/) ||
	($type =~ /inhost/ && $outtree)) {
      print ("handle_tag/push: $type, $url, $filename, $filerel\n") if ($debug);
      &push_it;
      $tag =~ s/$turl/$filerel/g;
    }
  }
  $_ = $lo;
  return $tag;
}

# -----------------------------------------------------------------------------
# Args:		url
# Returns:	(loc)
# Description:	Converts a url to a file path.
# Uses:		$baseurl, $dir, relative_path, rewrite_url, external_file
# Dates:	2000-03-12 (written)
# -----------------------------------------------------------------------------
sub handle_url {
  my $lo = $_;
  $_ = shift;
  my $hosturl = &url_type_host($baseurl);
  # if starting with /
  if (/^\//) {
    $_ = $hosturl.$_;
  }
  # if it is not external (make it possibly external)
  if (! /\w+:/) {
    $_ = $baseurl.$dir.$_;
    print ("handle_url/dir/fn: $dir, $filename\n") if ($debug);
  }
  my $ur = $_;


  my $type;
  # if in base url
  if (/^$baseurl/) {
    s/^$baseurl//;
    $_ = &rewrite_url($_); 
    print ("DO REWRITE! $_\n") if ($debug);
    $type = "intree";
  }
  # if in host
  elsif (/^$hosturl/) {
    $type = "inhost";
    $_ = &external_file($_);
  }
  # external
  elsif (/^ftp:/) {
    $type = "ftp";
    $_ = &external_file($_);
  }
  elsif (/^http:/){
    $type = "external";
    $_ = &external_file($_);
  }
  else {
    $type = "unknown";
  }
  my @t = ($type, $ur, $_, &relative_path($_, $dir)); 
  $_ = $lo;
  return @t;
}

# -----------------------------------------------------------------------------
# Args:		file data
#		file name
# Returns:    	nothing
# Desctiption:	saves file data to a dir/file
# Uses:		$store
# Dates:	2000-03-11 (written)
#		2000-03-12 (changed)
#		2000-03-13 (better create dir and better store-check)
# Status:	Probably done
# -----------------------------------------------------------------------------
sub save_file {
  my $filedata = shift;
  $filename =~ s/#.*$//;
  my $filename = $basedir.(shift);
  my $dir      = &path_dir($filename);
  my $err = 0;
  if (&find_in_list($store, $filename)) {
    if ($dir !~ /^$/) {
      if (! mkpath ($dir, 0, 0755)) {
	if (! -d $dir) {
	  print ("Cannot create $dir\n");
	  $err = 1;
	}
      }
    }
    if (! $err) {
      if (open (LF, ">$filename")) {
        print ("Writing $filename\n") if ($debug);
        print LF $filedata;
        close LF;
      }
      else {
        print ("Cannot create $filename\n")
      }
    }
  }
}

# =============================================================================
# END HANDLE FUNCTIONS



# START REWRITE FUNCTIONS
# =============================================================================

# -----------------------------------------------------------------------------
# Args:		url
# Returns:    	path relative
# Description:	converts a external path to where to store
# Uses:		rewrite_url
# Dates:	2000-03-11 (written)
# -----------------------------------------------------------------------------
sub external_file {
  my $lo = $_;
  $_ = shift;
  if (! /\w+:\/\//i) {
    print ("This is not an external url, ERROR! exiting!\n");
    exit;
  }
  s/\w+:\/\///i;
  my $ret = &rewrite_url($_);
#  $ret =~ s/\//_/;
  $_ = $lo;
  return $ret;
}

# -----------------------------------------------------------------------------
# Args:		url
# Returns:    	url (rewritten)
# Description:	rewrites a url so that it can be stored localy
# Uses:		$rewrite
# Dates:	2000-03-12 (written)
# Note:		Do not rewrite the dir, then the script will fail.
# -----------------------------------------------------------------------------
sub rewrite_url {
  my $lo = $_;
  my $sp;
  $_ = shift;
  s/^$/index.html/;
  s/\/$/\/index.html/;
  # USER DEFINED REWRITE RULES
  foreach $sp (split /\|/, $rewrite) {
    my $f, $t;
    ($f, $t) = split /=>/, $sp;
    print ("rewrite_url1: $f, $t\n") if ($debug);
    s/$f/$t/;
  }
  # SUPERSTANDARD REWRITE RULES
  s/\?/G/;
  s/=/E/;
  s/&/A/;
  s/ /S/;
  s/\"/Q/;
  my $ret = $_;
  $_ = $lo;
  return $ret;
}

# =============================================================================
# END REWRITE FUNCTIONS


# START QUEUE FUNCTIONS
# =============================================================================

# -----------------------------------------------------------------------------
# Args:		nothing
# Returns:    	nothing
# Desctiption:	pushes a lot of data to different queues
# Uses:		a lot (see code)
# Type:		init
# Dates:	2000-03-12 (written, created from push_it)
# -----------------------------------------------------------------------------
sub push_init {
    push @init_images, $images;
    push @init_depth, $depth;
    push @init_what, $what;
    push @init_store, $store;
    push @init_basedir, $basedir;
    push @init_url, $url;
    push @init_baseurl, $baseurl;
    push @init_rewrite, $rewrite;
    push @init_outhost, $outhost;
    push @init_outtree, $outtree;
    push @init_retry, $retry;
}

# -----------------------------------------------------------------------------
# Args:		nothing
# Returns:    	nothing
# Desctiption:	shifts out item from the queue
# Uses:		queues, data... see the code.
# Type:		init
# Dates:	2000-03-12 (written, created from shift_it)
# -----------------------------------------------------------------------------
sub shift_init {
  $images     = shift @init_images;
  $what       = shift @init_what;
  $store      = shift @init_store;
  $basedir    = shift @init_basedir;
  $baseurl    = shift @init_baseurl;
  $rewrite    = shift @init_rewrite; 
  $outtree    = shift @init_outtree; 
  $outhost    = shift @init_outhost; 
  $depth_o    = shift @init_depth;
  $url_o      = shift @init_url;
  $retry_o    = shift @init_retry;
  $filename_o = &rewrite_url(&url_file($url_o));
  $dir_o      = "";
  &get_it;
  &push_it;
}

# -----------------------------------------------------------------------------
# Args:		nothing
# Returns:    	nothing
# Desctiption:	pushes a lot of data to different queues
# Uses:		a lot (see code)
# Dates:	2000-03-11 (written)
#		2000-03-12 (changed)
# -----------------------------------------------------------------------------
sub push_it {
    push @download_depth, $depth;
    push @download_retry, $retry;
    push @download_filename, $filename;
    push @download_url, $url;
}

# -----------------------------------------------------------------------------
# Args:		nothing
# Returns:    	nothing
# Desctiption:	shifts out item from the queue
# Uses:		queues, data... see the code.
# Dates:	2000-03-11 (written)
#		2000-03-12 (changed)
# -----------------------------------------------------------------------------
sub shift_it {
  $depth_o    = shift @download_depth;
  $retry_o    = shift @download_retry;
  $filename_o = shift @download_filename;
  $url_o      = shift @download_url;
  $dir_o      = &path_dir($filename_o);
  &get_it;
}

# -----------------------------------------------------------------------------
# Args:		nothing
# Returns:    	nothing
# Desctiption:	get the latest...
# Uses:		global data
# Dates:	2000-03-11 (written)
#		2000-03-12 (changed)
# -----------------------------------------------------------------------------
sub get_it {
  $depth    = $depth_o;
  $retry    = $retry_o;
  $filename = $filename_o;
  $dir      = $dir_o;
  $url      = $url_o;
}

# =============================================================================
# END QUEUE FUNCTIONS


# START GENERAL URL FUNCTIONS
# =============================================================================

# -----------------------------------------------------------------------------
# Args:		url
# Returns:	type and host (ex: http://www.opal.dhs.org)
# Description:	Converts a url to type and host
#		NOTE that the url must begin with http://
# Type:		general url
# Dates:	2000-03-12 (written)
# -----------------------------------------------------------------------------
sub url_type_host {
  my $lo = shift;
#  print ("url_type_host: $lo\n") if ($debug);
  if ($lo !~ /^\w+:\/\//) {
    print ("WARNING! No type, returns nothing!\n");
    return "";
  }
  $lo =~ s/^(\w+:\/\/[^\/]*).*$/$1/;
#  print ("url_type_host: $lo\n") if ($debug);
  return $lo;
}

# -----------------------------------------------------------------------------
# Args:		url
# Returns:	type, host and dir (ex: http://www.opal.dhs.org/foo/)
# Description:	Converts a url to type, host and dir
#		NOTE that the url must begin with http://
# Type:		general url
# Dates:	2000-03-12 (written)
# -----------------------------------------------------------------------------
sub url_type_host_dir {
  my $lo = shift;
  if ($lo !~ /^\w+:\/\//) {
    print ("WARNING! No type, returns nothing!\n");
    return "";
  }
  $lo =~ s/^(\w+:\/\/.*\/)[^\/]*$/$1/;
  return $lo;
}

# -----------------------------------------------------------------------------
# Args:		url		
# Returns:    	file-name
# Desctiption:	returns a file name for the url
# Uses:		rewrite rules (not implemented yet)
# Dates:	2000-03-12 (written)
# -----------------------------------------------------------------------------
sub url_file {
  my $lo = shift;
  # removes the dir and everything before the file...
  $lo =~ s/.*\///;
  if ($lo =~ /\/$/) {
    $lo = $lo."index.html";
  }
  return $lo;
}

# =============================================================================
# END GENERAL URL FUNCTIONS


# START GENERAL PATH FUNCTIONS
# =============================================================================

# -----------------------------------------------------------------------------
# Args:	        path
# Returns:    	file name
# Description:	extracts a file name from a path
# Uses:		nothing
# Type:		general path
# Dates:	2000-03-12 (written)
# -----------------------------------------------------------------------------
sub path_file {
  my $lo = shift;
  $lo =~ s/^.*\/([^\/]*)$/$1/;
  return "index.html" if ($lo =~ /^$/);
  return $lo;
}

# -----------------------------------------------------------------------------
# Args:	        path
# Returns:    	dir name
# Description:	extracts a dir name from a path
# Uses:		nothing
# Type:		general path
# Dates:	2000-03-12 (written)
# -----------------------------------------------------------------------------
sub path_dir {
  my $lo = shift;
  $lo =~ s/^(.*\/)[^\/]*$/$1/;
  return "" if ($lo !~ /\//);
  return &fixdir($lo);
}

# -----------------------------------------------------------------------------
# Args:		dir
# Returns:    	dir (fixed)
# Desctiption:	Add / to a dir if it does not exist. Does not do this with ""
# Uses:		nothing
# Dates:	2000-03-11 (written)
# -----------------------------------------------------------------------------
sub fixdir {
  my $lo = shift;
  return "" if ($lo =~ /^$/);
  if ($lo !~ /\/$/) {
    $lo = $lo."/";
  }
  return $lo;
}

# -----------------------------------------------------------------------------
# Args:		dir
#		cur dir
# Returns:	relative path
# Description:	Gives the relative path for dir to cur dir.
# Type:		general path
# Dates:	2000-03-12 (written)
# -----------------------------------------------------------------------------
sub relative_path {
  my $lo = $_;
  $_         = shift;
  my $curdir = shift;
  if (/^$curdir/) {
    s/^$curdir//;
  }
  else {
    $curdir =~ s/[^\/]+/\.\./g;
    $_ = $curdir.$_;
  }
  my $ret = $_;
  $_ = $lo;
  return $ret;
}

# =============================================================================
# END GENERAL PATH FUNCTIONS


# START GENERAL TAG FUNCTIONS
# =============================================================================

# -----------------------------------------------------------------------------
# Args:		search for
#		tag string	
# Returns:    	attribute value
# Desctiption:	extracts the value for an attribute in a tag
# Type:		general html tag
# Dates:	2000-03-11 (written)
# -----------------------------------------------------------------------------
sub tag_attribute {
  my $lo = $_;
  my $searchf = shift;
  $_ = shift;
#  print ("tag_attribute: $searchf, $_\n") if ($debug);
  if (! /$searchf/i) {
    $_ = $lo;
    return "";
  }

  # remove newlines
  s/\n/ /g;
  # remove double-spaces
  s/^\s+//;
  # get what we are searching for!
#  print ("tag_attribute1: $_\n") if ($debug);
  s/^.*$searchf\s*=\s*//i;
#  print ("tag_attribute2: $_\n") if ($debug);
  # If it is quoted...
  if (/\"/) {
    s/^\"//;
    s/\s.*//;
    s/^(\S*).*$/$1/;      
    s/\".*$//;
  }
  # if not quoted
  else {
    s/^(\S*).*$/$1/;      
  }
#  print ("tag_attribute3: $_\n") if ($debug);
  # NOW $a contains the attrib value
  # remove some crappy whitespaces
  s/\s+$//;
  s/^\s+//;
  my $ret = $_;
  $_ = $lo;
  return $ret;
}

# =============================================================================
# END GENERAL TAG FUNCTIONS


# START GENERAL LIST FUNCTIONS
# =============================================================================

# -----------------------------------------------------------------------------
# Args:		search for (| separated)
#		string
# Returns:	1 if found, 0 if not
# Type:		general
# Description:	Search for expressions
# Dates:	2000-03-12 (written)
# -----------------------------------------------------------------------------
sub find_in_list {
  my $w = shift;
  my $s = shift;
  my $lo;
#  print ("find_in_list: $w, $s\n") if ($debug);
  foreach $lo (split /\|/, $w) {
    if ($s !~ /^$/) {
      return 1 if ($s =~ /$lo/);
    }
  }
  return 0;
}

# =============================================================================
# END GENERAL LIST FUNCTIONS




