// WebCopy - copy a remote web subtree to the local disk
//
// Given one or more URLs as arguments, enumerates the files reachable at
// or below those URLs and copies them to the local disk, creating
// subdirectories as necessary.
//
// Options:
//
// -v
// Verbose. Shows names of files being copied.
//
// -f
// Force overwriting of existing files. Otherwise they are left alone.
//
// -d
// Maximum depth to copy. Depth refers to how many links to follow.
// A depth of 0 means just copy the file given on the connald line,
// don't follow any links at all. Without this flag there is no limit
// on the depth, the entire subtree is copied.
//
// -e
// Edit local URLs. If an HTML file contains a URL that is
// unnecessarily absolute - i.e. it's absolute but it refers to
// a location within the tree being copied - then convert it to a
// relative URL. Without this flag, all files are copied verbatim.
// With it, the copied tree is a self-contained functional snapshot
// of the remote.
//
// -a
// Authorization. Syntax is userid:password.
//
// Copyright (C)1996,1998 by Jef Poskanzer . All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.
//
// Visit the ACME Labs Java page for up-to-date versions of this and other
// fine Java utilities: http://www.acme.com/java/
import java.util.*;
import java.net.*;
import java.io.*;
public class WebCopy extends Acme.Application implements Acme.HtmlEditObserver
{
static final String progName = "WebCopy";
static final String index_html = "index.html";
// Old-style main() routine. Calls compatibility routine for newMain().
public static void main( String[] args )
{
(new WebCopy()).compat( args );
}
private boolean verbose = false;
private boolean forceOverwrite = false;
int maxDepth = -1;
private boolean editLocalUrls = false;
private String baseUrlStr;
public int newMain( String[] args )
{
int argc = args.length;
int argn;
String auth_cookie = null;
// Parse args.
for ( argn = 0; argn < argc && args[argn].charAt( 0 ) == '-'; ++argn )
{
if ( args[argn].equals( "-v" ) )
verbose = true;
else if ( args[argn].equals( "-f" ) )
forceOverwrite = true;
else if ( args[argn].equals( "-d" ) )
{
++argn;
maxDepth = Integer.parseInt( args[argn] );
}
else if ( args[argn].equals( "-e" ) )
editLocalUrls = true;
else if ( args[argn].equals( "-a" ) )
{
++argn;
auth_cookie = args[argn];
}
else
{
usage();
return -1;
}
}
if ( argc - argn < 1 )
{
usage();
return -1;
}
for ( ; argn < argc; ++argn )
copy( args[argn], auth_cookie );
return 0;
}
private void usage()
{
err.println( "usage: " + progName + " [-v] [-f] [-d maxDepth] [-e] [-a username:password] URL ..." );
}
void copy( String urlStr, String auth_cookie )
{
baseUrlStr = Acme.Utils.baseUrlStr( urlStr );
Acme.Spider spider;
try
{
spider = new WebCopySpider( urlStr, err, this );
}
catch ( MalformedURLException e )
{
err.println( e );
return;
}
if ( auth_cookie != null )
spider.setAuth( auth_cookie );
while ( spider.hasMoreElements() )
{
URLConnection uc = (URLConnection) spider.nextElement();
if ( uc == null ) // non-fatal error
continue;
URL thisUrl = uc.getURL();
String thisUrlStr = thisUrl.toExternalForm();
InputStream s = null;
try
{
// Open the input file. We have to do this up here instead of
// down where we open the output file because Spider requires
// us to always open and close the input file - that's what
// causes it to scan the file for links. So we always open
// it, and we call close() in a finally block.
s = uc.getInputStream();
// Figure out the local filename.
if ( ! thisUrlStr.startsWith( baseUrlStr ) )
{
err.println(
"Something's wrong - " + thisUrlStr +
" doesn't begin with the base URL, " + baseUrlStr );
continue;
}
String localName = thisUrlStr.substring( baseUrlStr.length() );
localName = localName.replace( '/', File.separatorChar );
if ( localName.length() == 0 ||
localName.charAt( localName.length() - 1 ) ==
File.separatorChar )
localName = localName + index_html;
else if ( Acme.Utils.urlStrIsDir( thisUrlStr ) )
localName = localName + File.separatorChar + index_html;
// Make sure the local directories exist.
if ( localName.lastIndexOf( File.separatorChar ) != -1 )
{
String localDirs = localName.substring(
0, localName.lastIndexOf( File.separatorChar ) );
File dirsFile = new File( localDirs );
if ( dirsFile.exists() )
{
// Something exists there. Is it a directory?
if ( ! dirsFile.isDirectory() )
{
// We have a file where we want a directory. Ugh.
// Rename the file to be index.html in the
// directory.
String tempName = progName + ".tmp";
File tempFile = new File( tempName );
if ( ! dirsFile.renameTo( tempFile ) )
{
err.println(
"Error renaming existing file for " +
localName );
continue;
}
if ( ! dirsFile.mkdirs() )
{
err.println(
"Error creating directories for " +
localName );
continue;
}
String newName =
localDirs + File.separatorChar + index_html;
File newFile = new File( newName );
if ( ! tempFile.renameTo( newFile ) )
{
err.println(
"Error renaming temporary file for " +
localName );
continue;
}
}
}
else
{
// The directory doesn't exist yet, so make it.
if ( ! dirsFile.mkdirs() )
{
err.println(
"Error creating directories for " + localName );
continue;
}
}
}
// Check the output file.
File localFile = new File( localName );
if ( localFile.exists() && ! forceOverwrite )
{
err.println( localName + " already exists - skipping" );
continue;
}
if ( verbose )
err.println(
"Copying " + thisUrlStr + " to " + localName );
// If we're editing the URLs, interpose an HtmlEditScanner.
if ( editLocalUrls && ( s instanceof Acme.HtmlScanner ) )
s = new Acme.HtmlEditScanner( (Acme.HtmlScanner) s, this );
// Open the output file.
OutputStream out = new FileOutputStream( localFile );
// Copy the file.
byte[] buf = new byte[4096];
int len;
while ( ( len = s.read( buf ) ) != -1 )
out.write( buf, 0, len );
out.close();
}
catch ( IOException e ) {}
finally
{
try
{
if ( s != null )
s.close();
}
catch ( IOException e ) {}
}
}
}
/// Callback from HtmlEditScanner.
public String editAHREF( String urlStr, URL contextUrl, Object junk )
{
return editAny( urlStr, contextUrl );
}
/// Callback from HtmlEditScanner.
public String editIMGSRC( String urlStr, URL contextUrl, Object junk )
{
return editAny( urlStr, contextUrl );
}
/// Callback from HtmlEditScanner.
public String editFRAMESRC( String urlStr, URL contextUrl, Object junk )
{
return editAny( urlStr, contextUrl );
}
/// Callback from HtmlEditScanner.
public String editBASEHREF( String urlStr, URL contextUrl, Object junk )
{
return editAny( urlStr, contextUrl );
}
/// Callback from HtmlEditScanner.
public String editAREAHREF( String urlStr, URL contextUrl, Object junk )
{
return editAny( urlStr, contextUrl );
}
/// Callback from HtmlEditScanner.
public String editLINKHREF( String urlStr, URL contextUrl, Object junk )
{
return editAny( urlStr, contextUrl );
}
/// Callback from HtmlEditScanner.
public String editBODYBACKGROUND( String urlStr, URL contextUrl, Object junk )
{
return editAny( urlStr, contextUrl );
}
// If the URL is absolute but doesn't need to be, then make it
// relative.
private String editAny( String urlStr, URL contextUrl )
{
if ( ! editLocalUrls )
return null;
// If the URL is not absolute, leave it alone.
if ( ! Acme.Utils.urlStrIsAbsolute( urlStr ) )
return null;
// It's absolute.
try
{
String fullUrlStr =
Acme.Utils.fixDirUrlStr(
Acme.Utils.absoluteUrlStr( urlStr, contextUrl ) );
if ( ! fullUrlStr.startsWith( baseUrlStr ) )
return null;
// It's unnecessarily absolute. Trim it.
String contextUrlStr =
Acme.Utils.fixDirUrlStr( contextUrl.toExternalForm() );
int sameSpan = Acme.Utils.sameSpan( contextUrlStr, fullUrlStr );
int sameSlashSpan =
fullUrlStr.lastIndexOf( '/', sameSpan - 1 ) + 1;
String samePart =
fullUrlStr.substring( 0, sameSlashSpan );
String differentPart =
fullUrlStr.substring( sameSlashSpan );
int contextSlashes =
Acme.Utils.charCount( contextUrlStr, '/' );
int sameSlashes =
Acme.Utils.charCount( samePart, '/' );
StringBuffer newUrlStr = new StringBuffer();
if ( sameSlashes < contextSlashes )
{
// Going up.
for ( int i = contextSlashes; i > sameSlashes; --i )
newUrlStr.append( "../" );
}
// And back down the other side.
newUrlStr.append( differentPart );
// If we're left with nothing at all, make it be index.html.
if ( newUrlStr.length() == 0 )
newUrlStr.append( index_html );
// That ought to do it.
return newUrlStr.toString();
}
catch ( MalformedURLException e )
{
// Bogus URL? Ignore.
return null;
}
}
}
class WebCopySpider extends Acme.Spider
{
private WebCopy parent;
public WebCopySpider( String urlStr, PrintStream err, WebCopy parent ) throws MalformedURLException
{
super( urlStr, err );
this.parent = parent;
}
protected boolean doThisUrl( String thisUrlStr, int depth, String baseUrlStr )
{
if ( thisUrlStr.startsWith( baseUrlStr ) &&
( parent.maxDepth == -1 || depth <= parent.maxDepth ) )
return true;
return false;
}
}