Page d'accueil > fastafetch.pl

fastafetch.pl

09/12/2014 15:19

Ce script écrit en perl est très utile enfin de récupérer rapidement à partir d'un gros fichier de séquence en format fasta, quelques séquences d'intéret.

Attention: Avant de lancer ce script, assurez vous d'avoir construit un index avec le script fasta-make-index.pl fournit sur le site.

#!/usr/athena/bin/perl

$pgm = $0;                      # name of program
$pgm =~ s#.*/##;                # remove part up to last slash
$status = 0;                    # exit status
#$SIG{'INT'} = 'cleanup';   # interrupt handler
$line_size = 50;       # size of lines to print
$off = 1;           # first position of sequence to print
$len = -1;           # maximum length of sequence to print

$usage = < USAGE:
        $pgm [-f | []+] [-c] [-s ]
       [-off ] [-len ]

               name of FASTA sequence file
       [-f ]   file containing list of sequence identifiers
       []       sequence identifier
       [-c]       check that first word in fasta id matches
       [-s ]   put each sequence in a file named "after" the
               sequence identifier with "." appended;
               pipes in file names are changed to underscores
       [-off ]    print starting at position ; default: $off
        [-len ]   print up to characters for each seq;
               default: print entire sequence

   Note: Assumes and index file has been made using fasta-make-index.
   Sequence identifiers must be same as made by fasta-make-index.

   Reads sequence identifiers from the command line, from
   a file and from standard input, in that order.

   Fetch sequences from a FASTA sequence file and
   write to standard output.

USAGE

if ($#ARGV+1 < 1) {             # wrong number of arguments
die $usage;
}

# get input arguments
$fasta = shift;           # file of FASTA sequences
while ($#ARGV >= 0) {
$_ = shift;
if ($_ eq "-f") {        # from file
    $filename = shift;
} elsif ($_ eq "-c") {   # check name match
    $check = 1;
} elsif ($_ eq "-s") {   # put in files
    $suffix = shift;
    $print_to_files = 1;
} elsif ($_ eq "-off") {   # offset
    $off = shift;
} elsif ($_ eq "-len") {   # maximum length
    $len = shift;
} else {           # file name
    push @target, $_;
}
}

# get targets from file
if ($filename) {
open(IDFILE, "<$filename") || die("Can't open file $filename\n");
while () {
    next if (/^#/ || /^\s*$/);    # skip comments an blank lines
    @words = split;
    push @target, @words;
}
close(IDFILE);
}
#while (<>) {
# @words = split;
# push @target, @words;
#}

# open the FASTA file
open(FASTA, "<$fasta") || die "Can't open file $fasta\n";

# open the index file
$index = "$fasta.index";
open(INDEX, "<$index") || die "Can't open $index\n";

# read in the index
while () {
@words = split;
$id = $words[0];           # sequence id
$addr = $words[1];           # start of sequence
$index{$id} = "$index{$id} $addr";
}

# use direct access to get each target
foreach $id (@target) {
$_ = $index{">$id"};           # address of sequence
if ($_ eq undef) {
    print STDERR "Can't find target $id\n";
    next;               # go to next target
}
# open output file if requested
if ($print_to_files) {
    $outfile = $id;
    $outfile =~ s/\|/_/g;
    $outfile .= ".$suffix" if ($suffix);
    open(OUTPUT, ">$outfile") || die("Can't open file $outfile : $!\n");
} else { # print to files
    open(OUTPUT, ">-");
}
foreach $addr (split) {       # one-to-many
    seek(FASTA, $addr, 0);       # move to start of target
    $_ = ;
    $idline = $_;
    @words = split;
    if ($check && $words[0] ne ">$id") {
      $tmp = $words[0];
      @words2 = split(/[>\|]/, $tmp);
      $tmp = $words2[$#words2];
      if ($tmp ne $id) {
   $lower = $tmp;
   $lower =~ tr/A-Z/a-z/;
   if ($lower ne $id) {
      print STDERR "Bad index? target= >$id found= $words[0]\n";
      next;               # go to next target
   }
      }
    }
    print OUTPUT "$idline";       # print header
    $seq = "";
    while () {           # read sequence lines
      if (/^>/) {last}           # start of next sequence
      # print sequence line
      chop;
      $seq .= $_;
    }
    # print sequence in lines of length $line_size
    # get portion of sequence to print if -off and/or -len given
    if ($off != 1 || $len != -1) {
      if ($len == -1) {
        $seq = substr($seq, $off-1);
      } else {
        $seq = substr($seq, $off-1, $len);
      }
    }
    for ($i=0; $i       print substr($seq, $i, $line_size), "\n";
    }
} # addr
close(OUTPUT);
} # id

fastafetch.pl

Rechercher

Contact