Page d'accueil > fasta-make-index.pl

fasta-make-index.pl

09/12/2014 15:34

Ce script écrit en perl génère un index pour un fichier de séquences au format fasta.

#!/usr/athena/bin/perl
#
#
#

$pgm = $0;           # name of program
$pgm =~ s#.*/##;                # remove part up to last slash
@args = @ARGV;           # arguments to program
$status = 0;           # exit status
$SIG{'INT'} = 'cleanup';   # interrupt handler

$usage = <<USAGE;       # usage message
USAGE:
   $pgm <file> [-f] [-sp] [-l] [-aa] [-gc] [-gi]

   <file>       name of FASTA file

   [-f]       overwrite existing index file if one exists
   [-sp]       make a swissprot index by stripping all but
           last part of sequence names
   [-l]       index by sequence name and lowercased name
   [-aa]       include first amino acid index found in parens
   [-gc]       genechip format: first field in pipes
   [-gi]       if identifier starts with "gi|<number>" replace
           it with <number>

   Make an index for a FASTA file for use by fasta-fetch.

   Creates file <file>.index.
USAGE

$nargs = 1;
if ($#ARGV+1 < $nargs) {       # wrong number of arguments
print $usage;
exit(1);
}

# get input arguments
$file = shift;
while ($#ARGV >= 0) {
if ($ARGV[0] eq "-sp") {
    $sprot = 1;
} elsif ($ARGV[0] eq "-l") {
    $lower = 1;
} elsif ($ARGV[0] eq "-aa") {
    $aa = 1;
} elsif ($ARGV[0] eq "-gc") {
    $gc = 1;
} elsif ($ARGV[0] eq "-gi") {
    $gi = 1;
} elsif ($ARGV[0] eq "-f") {
    $force = 1;
} else {
    print $usage;
    exit(1);
}
shift;
}

open FILE, "<$file" || die "Couldn't open $file";
if (!$force && -e "$file.index") {
print STDERR "Can't create fetch index file for file $file.\n";
print STDERR "File $file.index already exists.\n";
print STDERR "Move or rename file $file.index and try again.\n";
exit 1;
}

open INDEX, ">$file.index" || die "Couldn't open $file.index";
$byte = 0;
while (<FILE>) {
if (/^>/) {
    #if (++$i % 1000 == 0) {print stderr "seq: $i\r";}
    @words = split;
    $id = $words[0] eq ">" ? $words[1] : $words[0];
    $id =~ s/>//;           # remove ">"
    if ($sprot) {
      $id =~ s/.*\|//;           # remove up to last "|"
    } elsif ($gc) {
      # first to last field between "|"
      @words = split(/\|/, $id);   # split on pipes
      $id = $words[0];
    } elsif ($gi) {
      if ($id =~ /^gi\|(\d+)/) { $id = $1; }
    }
    @ids = ($id);
    if ($aa) {               # add amino acid id
      foreach $id (@words[1..$#words]) {
        if ($id =~ /^$([A-Z][0-9]+)$$/) {
          push @ids, "$1";
          last;
        }
      }
    }
    foreach $id (@ids) {
      printf INDEX ">%s %d\n", $id, $byte;
      if ($lower) {
   $l = $id;
   $l =~ tr/A-Z/a-z/;
   printf INDEX ">%s %d\n", $l, $byte;
      }
    }
}
$byte += length;
}

# cleanup files
# note: "if ($status == 130) {cleanup(1);}" must follow $status = system(...)
#&cleanup($status);

sub cleanup {
system "/bin/rm $pgm.$$.*.tmp";
if ($_[0] eq "INT") { exit(1); } else { exit($_[0]); }
}

fasta-make-index.pl

Rechercher

Contact