fasta-make-index.pl

09/12/2014 15:34

Ce script écrit en perl génère un index pour un fichier de séquences au format fasta.

 

#!/usr/athena/bin/perl
#
#
#

$pgm = $0;            # name of program
$pgm =~ s#.*/##;                # remove part up to last slash
@args = @ARGV;            # arguments to program
$status = 0;            # exit status
$SIG{'INT'} = 'cleanup';    # interrupt handler

$usage = <<USAGE;        # usage message
  USAGE:
    $pgm <file> [-f] [-sp] [-l] [-aa] [-gc] [-gi]

    <file>        name of FASTA file

    [-f]        overwrite existing index file if one exists
    [-sp]        make a swissprot index by stripping all but
            last part of sequence names
    [-l]        index by sequence name and lowercased name
    [-aa]        include first amino acid index found in parens
    [-gc]        genechip format: first field in pipes
    [-gi]        if identifier starts with "gi|<number>" replace
            it with <number>

    Make an index for a FASTA file for use by fasta-fetch.

    Creates file <file>.index.
USAGE

$nargs = 1;
if ($#ARGV+1 < $nargs) {        # wrong number of arguments
  print $usage;
  exit(1);
}

# get input arguments
$file = shift;
while ($#ARGV >= 0) {
  if ($ARGV[0] eq "-sp") {
    $sprot = 1;
  } elsif ($ARGV[0] eq "-l") {
    $lower = 1;
  } elsif ($ARGV[0] eq "-aa") {
    $aa = 1;
  } elsif ($ARGV[0] eq "-gc") {
    $gc = 1;
  } elsif ($ARGV[0] eq "-gi") {
    $gi = 1;
  } elsif ($ARGV[0] eq "-f") {
    $force = 1;
  } else {
    print $usage;
    exit(1);
  }
  shift;
}

open FILE, "<$file" || die "Couldn't open $file";
if (!$force && -e "$file.index") {
  print STDERR "Can't create fetch index file for file $file.\n";
  print STDERR "File $file.index already exists.\n";
  print STDERR "Move or rename file $file.index and try again.\n";
  exit 1;
}

open INDEX, ">$file.index" || die "Couldn't open $file.index";
$byte = 0;
while (<FILE>) {
  if (/^>/) {
    #if (++$i % 1000 == 0) {print stderr "seq: $i\r";}
    @words = split;
    $id = $words[0] eq ">" ? $words[1] : $words[0];
    $id =~ s/>//;            # remove ">"
    if ($sprot) {
      $id =~ s/.*\|//;            # remove up to last "|"
    } elsif ($gc) {
      # first to last field between "|"
      @words = split(/\|/, $id);    # split on pipes
      $id = $words[0];
    } elsif ($gi) {
      if ($id =~ /^gi\|(\d+)/) { $id = $1; }
    }
    @ids = ($id);
    if ($aa) {                # add amino acid id
      foreach $id (@words[1..$#words]) {
        if ($id =~ /^\(([A-Z][0-9]+)\)$/) {
          push @ids, "$1";
          last;
        }
      }
    }
    foreach $id (@ids) {
      printf INDEX ">%s %d\n", $id, $byte;
      if ($lower) {
    $l = $id;
    $l =~ tr/A-Z/a-z/;
    printf INDEX ">%s %d\n", $l, $byte;
      }
    }
  }
  $byte += length;
}
    
# cleanup files
# note: "if ($status == 130) {cleanup(1);}" must follow $status = system(...)
#&cleanup($status);
 
sub cleanup {
  system "/bin/rm $pgm.$$.*.tmp";
  if ($_[0] eq "INT") { exit(1); } else { exit($_[0]); }
}