#! /usr/bin/perl
# $Id: gmap_setup.pl.in,v 1.50 2010/07/21 22:01:03 twu Exp $

use warnings;

$gmapdb = "/var/cache/gmap";
$bindir = "/usr/bin";
$package_version = "2010-07-21";

use IO::File;
use Getopt::Std;

# Usage: gmap_setup -d <db> [-D <destdir>] [-o <Makefile>] [-M <.md file>] [-C] [-W] [-E] [-q interval] [-Q interval] <fasta_files or command>
#

undef($opt_d);			# genome name
undef($opt_D);			# destination directory
undef($opt_o);			# name of output file (default is "Makefile.<genome>")

undef($opt_M);			# use seq_contig.md file (provide as argument)
undef($opt_O);			# order chromosomes
undef($opt_S);			# Treat each sequence as a separate chromosome; now the default
undef($opt_C);			# Tries to find a chromosome tag in each FASTA header (also turns ons chromosome sorting)

undef($opt_W);			# write directly to file, not RAM
undef($opt_E);			# interpret argument as a command

undef($opt_B);			# bindir (needed for "make check")

$opt_q = 3;			# GMAP interval
$opt_Q = 6;			# PMAP interval;

#undef($opt_9);			# debug

getopts("d:D:o:M:SCO:WEB:q:Q:");

if (defined($opt_S)) {
    print STDERR "The -S flag is no longer available.  It is now the default behavior, so you may\n";
    print STDERR "  simply call gmap_setup without it.  The other ways to call gmap_setup are with\n";
    print STDERR "  the -C flag (to try to parse the chromosomal coordinates from each FASTA header)\n";
    print STDERR "  or with the -M flag (to specify an NCBI .md file that contains chromosomal\n";
    print STDERR "  coordinates.\n";
    die;
}

if (!defined($genome_name = $opt_d)) {
  print_usage();
  die "Must specify genome database name with -d flag.";
} elsif ($opt_d =~ /(\S+)\/(\S+)/) {
  $genome_name = $2;
  if (defined($opt_D)) {
    $opt_D = $opt_D . "/" . $1;
  } else {
    $opt_D = $1;
  }
}

if (!defined($outputfile = $opt_o)) {
  $outputfile = "Makefile.$genome_name";
}
$gmap_interval = $opt_q;
$pmap_interval = $opt_Q;


$MAKEFILE = new IO::File(">$outputfile") or die "Cannot write to file $outputfile";

print $MAKEFILE "# Makefile for creating genome $genome_name for use by GMAP.\n";
print $MAKEFILE "# Created by gmap_setup, but may be edited as needed\n\n";
#print $MAKEFILE ":SILENT: \n"; -- Doesn't work
print $MAKEFILE "\n";

# Remove unexpected suffix behavior
print $MAKEFILE ".SUFFIXES:\n";
print $MAKEFILE ".SUFFIXES: .txt\n\n";

print $MAKEFILE "GENOME = $genome_name\n";

if (!defined($destination = $opt_D)) {
    $destination = $gmapdb . "/" . $genome_name;
}
$destination =~ s/\/\//\//g;	# Remove double slashes
print $MAKEFILE "DESTINATION = $destination\n";
print $MAKEFILE "\n";


if (defined($opt_O)) {
    if ($opt_O == 0) {
	print $MAKEFILE "CHR_ORDER_FLAG = -S\n";
    } elsif ($opt_O == 1) {
	print $MAKEFILE "CHR_ORDER_FLAG = \n";
    } else {
	die "Don't recognize value $opt_O for -O flag";
    }
} else {
    print $MAKEFILE "CHR_ORDER_FLAG = \n";
}


if (defined($opt_E)) {
  $incommand = $ARGV[0];
} else {
  #$incommand = "cat \$(FASTA)";
  print $MAKEFILE "FASTA = " . join(" \\\n         ",@ARGV) . "\n";
}

print $MAKEFILE "COORDS = coords.\$(GENOME)\n";

if (defined($opt_B)) {
  print $MAKEFILE "FA_COORDS = $opt_B/fa_coords\n";
  print $MAKEFILE "MD_COORDS = $opt_B/md_coords\n";
  print $MAKEFILE "GMAP_PROCESS = $opt_B/gmap_process\n";
  print $MAKEFILE "GMAPINDEX = $opt_B/gmapindex\n";
  print $MAKEFILE "PMAPINDEX = $opt_B/pmapindex\n";
} else {
  print $MAKEFILE "FA_COORDS = $bindir/fa_coords\n";
  print $MAKEFILE "MD_COORDS = $bindir/md_coords\n";
  print $MAKEFILE "GMAP_PROCESS = $bindir/gmap_process\n";
  print $MAKEFILE "GMAPINDEX = $bindir/gmapindex\n";
  print $MAKEFILE "PMAPINDEX = $bindir/pmapindex\n";
}
print $MAKEFILE "\n";

print $MAKEFILE "# If WRITEFILE is '-W', then gmapindex will write directly in the file, not in RAM\n";
if (defined($opt_W)) {
  print $MAKEFILE "WRITEFILE = -W\n"
} else {
  print $MAKEFILE "WRITEFILE = \n";
}
print $MAKEFILE "\n";

# Now hardcoded
#print $MAKEFILE "GMAPINDEX_INTERVAL = $opt_q # Change this only for special situations.  Intervals < 3\n";
#print $MAKEFILE "                       # may result in large index files that cannot be read on some machines\n";
#print $MAKEFILE "                       # If you set interval to be 3 (default) or 1, the file can also be used for GSNAP\n";

#print $MAKEFILE "PMAPINDEX_INTERVAL = $opt_Q # Change this only for special situations.  Intervals < 6\n";
#print $MAKEFILE "                       # may result in large index files that cannot be read on some machines\n";
#print $MAKEFILE "\n";

print $MAKEFILE "FILE_ENDINGS = chromosome chromosome.iit chrsubset contig contig.iit \\\n";
print $MAKEFILE "               genome genomecomp genomealt version \\\n";
print $MAKEFILE "               ";
printf $MAKEFILE "id%doffsets id%dpositions ref%doffsets ref%dpositions snp%doffsets snp%dpositions \\\n",
    $gmap_interval,$gmap_interval,$gmap_interval,$gmap_interval,$gmap_interval,$gmap_interval;
print $MAKEFILE "               ";
printf $MAKEFILE "pf*offsets pf*positions pr*offsets pr*positions\n",
    $pmap_interval,$pmap_interval,$pmap_interval,$pmap_interval;

print $MAKEFILE "\n";


########################################################################
#   General commands
########################################################################

print $MAKEFILE "install:\n";
print $MAKEFILE "\tmkdir -p \$(DESTINATION)\n";
if (!defined($opt_D) || $opt_D ne ".") {
    print $MAKEFILE "\tfor s in \$(FILE_ENDINGS); do \\\n";
    print $MAKEFILE "\tif test -r \$(GENOME).\$\$s; then \\\n";
    print $MAKEFILE "\tmv -f \$(GENOME).\$\$s \$(DESTINATION); \\\n";
    print $MAKEFILE "\tfi \\\n";
    print $MAKEFILE "\tdone\n";
}
print $MAKEFILE "\tchmod 644 \$(DESTINATION)/\$(GENOME).*\n";
print $MAKEFILE "\tmkdir -p \$(DESTINATION)/\$(GENOME).maps\n";
print $MAKEFILE "\tchmod 755 \$(DESTINATION)/\$(GENOME).maps\n";
print $MAKEFILE "\n";

print $MAKEFILE "gmapdb: checkcoords \$(GENOME).version \$(GENOME).genomecomp \$(GENOME).ref3offsets \$(GENOME).ref3positions gmapmsg\n";
print $MAKEFILE "gmapdb_lc: checkcoords \$(GENOME).version \$(GENOME).genome \$(GENOME).ref3offsets_lc \$(GENOME).ref3positions gmapmsg\n";
print $MAKEFILE "gmapdb_lc_masked: checkcoords \$(GENOME).version \$(GENOME).genome \$(GENOME).ref3offsets.masked \$(GENOME).ref3positions.masked gmapmsg\n";

print $MAKEFILE "pmapdb: pmapindices \$(GENOME).version pmapmsg\n";
print $MAKEFILE "\n";

print $MAKEFILE "coords: \$(COORDS)\n";
print $MAKEFILE "\n";

print $MAKEFILE "checkcoords: \n";
print $MAKEFILE "\tif test -s \$(COORDS); then \\\n";
print $MAKEFILE "\techo File \$(COORDS) found.  Proceeding...; \\\n";
print $MAKEFILE "\telse \\\n";
print $MAKEFILE "\techo File \$(COORDS) not found.  Please run \"make -f $outputfile coords\" first.; \\\n";
print $MAKEFILE "\texit 1; \\\n";
print $MAKEFILE "\tfi\n";
print $MAKEFILE "\n";

print $MAKEFILE "gmapmsg:\n";
print $MAKEFILE "\techo gmapdb for $genome_name complete.\n";
#print $MAKEFILE "\techo To make pmapdb, do: make -f Makefile.$genome_name pmapdb\n";
print $MAKEFILE "\techo To install, do: make -f Makefile.$genome_name install\n";

print $MAKEFILE "pmapmsg:\n";
#print $MAKEFILE "\techo pmapdb for $genome_name complete.\n";
print $MAKEFILE "\techo To install, do: make -f Makefile.$genome_name install\n";


print $MAKEFILE "cleanall: clean\n";
print $MAKEFILE "\trm -f \$(GENOME).version\n";
print $MAKEFILE "\trm -f Makefile.\$(GENOME)\n";
print $MAKEFILE "\techo To start over again, run gmap_setup\n";
print $MAKEFILE "\n";

print $MAKEFILE "clean:\n";
print $MAKEFILE "\tfor s in \$(FILE_ENDINGS); do \\\n";
print $MAKEFILE "\tif test -r \$(GENOME).\$\$s; then \\\n";
print $MAKEFILE "\trm -f \$(GENOME).\$\$s; \\\n";
print $MAKEFILE "\tfi; \\\n";
print $MAKEFILE "\tdone;\n";
print $MAKEFILE "\trm -f coords.\$(GENOME)\n";
print $MAKEFILE "\n";



########################################################################
#   Coords file
########################################################################

print $MAKEFILE "\$(COORDS):\n";
print $MAKEFILE "\tfor s in \$(FILE_ENDINGS); do \\\n";
print $MAKEFILE "\trm -f \$(GENOME).\$\$s; \\\n";
print $MAKEFILE "\tdone;\n";
if (defined($mdfile = $opt_M)) {
    print $MAKEFILE "\t\$(MD_COORDS) -o \$(COORDS) $mdfile\n";

} elsif (defined($opt_C)) {
# Try to parse chromosomal information
    if (defined($opt_E)) {
	print $MAKEFILE "\t$incommand | \$(FA_COORDS) -C -o \$(COORDS)\n";
    } else {
	print $MAKEFILE "\t\$(FA_COORDS) -C -o \$(COORDS) \$(FASTA)\n";
    }
} else {
    if (defined($opt_E)) {
	print $MAKEFILE "\t$incommand | \$(FA_COORDS) -o \$(COORDS)\n";
    } else {
	print $MAKEFILE "\t\$(FA_COORDS) -o \$(COORDS) \$(FASTA)\n";
    }
}


########################################################################
#   Contig files
########################################################################

if (defined($opt_E)) {
    print $MAKEFILE "\$(GENOME).contig.iit: \$(COORDS)\n";
} else {
    print $MAKEFILE "\$(GENOME).contig.iit: \$(COORDS) \$(FASTA)\n";
}
print $MAKEFILE "\techo Making contig files...\n";
if (defined($opt_E)) {
    print $MAKEFILE "\t$incommand | \$(GMAP_PROCESS) -c \$(COORDS) | \$(GMAPINDEX) -d \$(GENOME) -A \$(CHR_ORDER_FLAG)\n";
} else {
    print $MAKEFILE "\t\$(GMAP_PROCESS) -c \$(COORDS) \$(FASTA) | \$(GMAPINDEX) -d \$(GENOME) -A \$(CHR_ORDER_FLAG)\n";
}
print $MAKEFILE "\n";


########################################################################
#   Genome files
########################################################################

print $MAKEFILE "\$(GENOME).genomecomp: \$(GENOME).contig.iit\n";
print $MAKEFILE "\techo Making compressed genome file...\n";
if (defined($opt_E)) {
    print $MAKEFILE "\t$incommand | \$(GMAP_PROCESS) -c \$(COORDS) | \$(GMAPINDEX) -d \$(GENOME) \$(WRITEFILE) -G\n";
} else {
    print $MAKEFILE "\t\$(GMAP_PROCESS) -c \$(COORDS) \$(FASTA) | \$(GMAPINDEX) -d \$(GENOME) \$(WRITEFILE) -G\n";
}
print $MAKEFILE "\n";


print $MAKEFILE "\$(GENOME).genome: \$(GENOME).contig.iit\n";
print $MAKEFILE "\techo Making full ASCII genome file...\n";
if (defined($opt_E)) {
    print $MAKEFILE "\t$incommand | \$(GMAP_PROCESS) -c \$(COORDS) | \$(GMAPINDEX) -d \$(GENOME) \$(WRITEFILE) -l -G\n";
} else {
    print $MAKEFILE "\t\$(GMAP_PROCESS) -c \$(COORDS) \$(FASTA) | \$(GMAPINDEX) -d \$(GENOME) \$(WRITEFILE) -l -G\n";
}
print $MAKEFILE "\n";


########################################################################
#   GMAP index files, unmasked
########################################################################

$q_flag = "-q $gmap_interval";

print $MAKEFILE "\$(GENOME).ref3offsets: \$(GENOME).genomecomp\n";
print $MAKEFILE "\techo Making index offsets file...\n";
print $MAKEFILE "\tcat \$(GENOME).genomecomp | \$(GMAPINDEX) -d \$(GENOME) $q_flag -O; \n";
print $MAKEFILE "\n";

print $MAKEFILE "\$(GENOME).ref3offsets_lc: \$(GENOME).genome\n";
print $MAKEFILE "\techo Making index offsets file...\n";
print $MAKEFILE "\tcat \$(GENOME).genome | \$(GMAPINDEX) -d \$(GENOME) -l $q_flag -O; \n";
print $MAKEFILE "\n";

print $MAKEFILE "\$(GENOME).ref3positions: \$(GENOME).ref3offsets\n";
print $MAKEFILE "\techo Making index positions file...\n";
print $MAKEFILE "\tif test -s \$(GENOME).genome; then \\\n";
print $MAKEFILE "\tcat \$(GENOME).genome | \$(GMAPINDEX) -d \$(GENOME) -l $q_flag -P \$(WRITEFILE); \\\n";
print $MAKEFILE "\telse \\\n";
print $MAKEFILE "\tcat \$(GENOME).genomecomp | \$(GMAPINDEX) -d \$(GENOME) $q_flag -P \$(WRITEFILE); \\\n";
print $MAKEFILE "\tfi\n";
print $MAKEFILE "\n";


########################################################################
#   GMAP index files, masked
########################################################################

print $MAKEFILE "\$(GENOME).ref3offsets.masked: \$(GENOME).genome\n";
print $MAKEFILE "\techo Making masked index offsets file...\n";
print $MAKEFILE "\tcat \$(GENOME).genome | \$(GMAPINDEX) -d \$(GENOME) -l -m $q_flag -O\n";
print $MAKEFILE "\n";

print $MAKEFILE "\$(GENOME).ref3positions.masked: \$(GENOME).ref3offsets.masked\n";
print $MAKEFILE "\techo Making masked index positions file...\n";
print $MAKEFILE "\tcat \$(GENOME).genome | \$(GMAPINDEX) -d \$(GENOME) -l -m $q_flag -P\n";
print $MAKEFILE "\n";


########################################################################
#   PMAP index files, unmasked: pmapindex can do all four steps
########################################################################

# Don't concatenate
$q_flag = "-q $pmap_interval";

print $MAKEFILE "pmapindices:\n";
print $MAKEFILE "\t\$(PMAPINDEX) -d \$(GENOME) $q_flag\n";
print $MAKEFILE "\n";

########################################################################
#   Version file
########################################################################

print $MAKEFILE "\$(GENOME).version:\n";
print $MAKEFILE "\techo \$(GENOME) > \$\@\n";
print $MAKEFILE "#\tThis file can be edited manually to change the genome name printed by GMAP\n";
print $MAKEFILE "\n";

close($MAKEFILE);


########################################################################
#   Final instructions
########################################################################

$FP = new IO::File(">INSTALL.$genome_name") or die "Cannot write to INSTALL.$genome_name";
print_instructions($FP);
close($FP);

$FP = new IO::File(">&STDOUT");
print_instructions($FP);
close($FP);

exit;



sub print_instructions {
  my ($FP) = @_;

  print $FP <<INSTRUCTIONS1;
============================================================

A Makefile called Makefile.$genome_name has been created for your genome.
You may inspect and edit it if necessary.  To start the build of your genome,
do

INSTRUCTIONS1

print $FP "    make -f $outputfile coords\n";
print $FP "\n";
print $FP "    make -f $outputfile gmapdb, or\n";
print $FP "    make -f $outputfile gmapdb_lc, or\n";
print $FP "    make -f $outputfile gmapdb_lc_masked\n";
print $FP "\n";
print $FP "    (edit the .chrsubset file [optional])\n";
print $FP "\n";
print $FP "    make -f $outputfile install\n";

print $FP <<INSTRUCTIONS2;

Note that the "make gmapdb" and "make fullascii" commands can take a while
to index a large genome.

If you need to start over again, do

    make -f $outputfile clean  (which preserves Makefile.$genome_name and INSTALL.$genome_name)
 or
    make -f $outputfile cleanall  (which deletes the above files)

The optional "make fullascii" command is intended to represent genomes
where it is important to show lower-case or non-standard characters
(anything other than A, C, G, T, N, or X) in the alignment.  This step
is optional.  For human- or mouse-sized genomes, it requires that your
computer can address files greater than 2 gigabytes in size.  See the
README file in the original source tree to see if you really need to
do this.

The .chrsubset file defines various subsets of the available
chromosomes, which can then be specified by the user with the "-c"
flag to GMAP.  For more information about specifying chromosomal
subsets, see the README file that comes with the GMAP source code.

A copy of these commands has been written to a file called
INSTALL.$genome_name.

============================================================
INSTRUCTIONS2

  return;
}



sub print_usage {
  print <<TEXT1;

gmap_setup: Sets up a Makefile for creating a genome by use by GMAP.
Part of GMAP package, version $package_version.

Usage: gmap_setup -d <genomename> [-D <destdir>] [-o <Makefile>] [-C] [-M <.md file>]
       [-E] [-O <int>] [-W] [-q interval] [-Q interval] <fasta_files or command>

    -d    genome name
    -D    destination directory for installation (defaults to gmapdb directory specified at configure time)
    -o    name of output Makefile (default is "Makefile.<genome>")
    -M    use coordinates from an .md file (e.g., seq_contig.md file from NCBI)
    -C    try to parse chromosomal coordinates from each FASTA header
    -E    interpret argument as a command, instead of a list of FASTA files
    -O    order chromosomes in numeric/alphabetic order (0 = no, 1 = yes (default))

  Advanced flags, not typically used:
    -W    write some output directly to file, instead of using RAM (use only if RAM is limited)
    -q    GMAP indexing interval (default: 3 nt)
    -Q    PMAP indexing interval (default: 6 aa)

These flags are explained below:

If you want to treat each FASTA entry as a separate chromosome (either
because it is in fact an entire chromosome or because you have contigs
without any chromosomal information), you can simply call gmap_setup
like this:

    gmap_setup -d <genome> <fasta_file>...

The accession of each FASTA header (the word following each ">") will
be the name of each chromosome.  GMAP can handle an unlimited number
of "chromosomes", with arbitrarily long names.  In this way, GMAP
could be used as a general search program for near-identity matches
against a FASTA file.


* The -M and -C flags: If your sequences represent contigs that have
  mapping information to specific chromosomal regions, then you can
  have gmap_setup try to read each header to determine its chromosomal
  region (the -C flag) or read an .md file that contains information
  about chromosomal regions (the -M flag).  The .md files are often
  provided in NCBI releases, but since the formats change often,
  gmap_setup will prompt you to make sure it parses it correctly.


* The -E flag: If you need to pre-process the FASTA files before using
  these programs, perhaps because they are compressed or because you
  need to insert chromosomal information in the header lines, you can
  specify a command instead of multiple fasta_files, like these
  examples:

    gmap_setup -d <genome> -E 'gunzip -c genomefiles.gz'
    gmap_setup -d <genome> -E 'cat *.fa | ./add-chromosomal-info.pl'


* The -W flag: The gmap_setup process works best if you have a
  computer with enough RAM to hold the entire genome (e.g., 3
  gigabytes for a human- or mouse-sized genome).  Since the resulting
  genome files work across all machine architectures, you can find any
  machine with sufficient RAM to build the genome files and then
  transfer the files to another machine.  (GMAP itself runs fine on
  machines with limited RAM.)  If you cannot find any machine with
  sufficient RAM for gmap_setup, you can run the program with the -W
  flag to write the files directly, but this can be very slow.


* The -q and -Q flags: If you specify a smaller interval (for example,
  3 for the GMAP interval), you can create a higher-resolution
  database, which can be useful for mapping small oligomers (smaller
  than 18 nt).  However, the corresponding genome index files will be
  larger (twice as big if you specify -q 3).  These index files may
  exceed the 2 gigabyte file offset limit on some computers, and will
  therefore fail to work on those computers.


TEXT1
  return;
}

