fastafetch.pl

09/12/2014 15:19

Ce script écrit en perl est très utile enfin de récupérer rapidement à partir d'un gros fichier de séquence en format fasta, quelques séquences d'intéret.

Attention: Avant de lancer ce script, assurez vous d'avoir construit un index avec le script fasta-make-index.pl fournit sur le site.

 

#!/usr/athena/bin/perl

$pgm = $0;                      # name of program
$pgm =~ s#.*/##;                # remove part up to last slash
$status = 0;                    # exit status
#$SIG{'INT'} = 'cleanup';    # interrupt handler
$line_size = 50;        # size of lines to print
$off = 1;            # first position of sequence to print
$len = -1;            # maximum length of sequence to print
 
$usage = <   USAGE:
        $pgm [-f | []+] [-c] [-s ]
        [-off ] [-len ]

                name of FASTA sequence file
        [-f ]    file containing list of sequence identifiers
        []        sequence identifier
        [-c]        check that first word in fasta id matches
        [-s ]    put each sequence in a file named "after" the
                sequence identifier with "." appended;
                pipes in file names are changed to underscores
        [-off ]     print starting at position ; default: $off
         [-len ]    print up to characters for each seq;
                default: print entire sequence

    Note: Assumes and index file has been made using fasta-make-index.
    Sequence identifiers must be same as made by fasta-make-index.

    Reads sequence identifiers from the command line, from
    a file and from standard input, in that order.

    Fetch sequences from a FASTA sequence file and
    write to standard output.

USAGE

if ($#ARGV+1 < 1) {             # wrong number of arguments
  die $usage;
}

# get input arguments
$fasta = shift;            # file of FASTA sequences
while ($#ARGV >= 0) {
  $_ = shift;
  if ($_ eq "-f") {         # from file
    $filename = shift;
  } elsif ($_ eq "-c") {    # check name match
    $check = 1;
  } elsif ($_ eq "-s") {    # put in files
    $suffix = shift;
    $print_to_files = 1;
  } elsif ($_ eq "-off") {    # offset
    $off = shift;
  } elsif ($_ eq "-len") {    # maximum length
    $len = shift;
  } else {            # file name
    push @target, $_;
  }
}

# get targets from file
if ($filename) {
  open(IDFILE,  "<$filename") || die("Can't open file $filename\n");
  while () {
    next if (/^#/ || /^\s*$/);     # skip comments an blank lines
    @words = split;
    push @target, @words;
  }
  close(IDFILE);
}
#while (<>) {
#  @words = split;
#  push @target, @words;
#}

# open the FASTA file
open(FASTA, "<$fasta") || die "Can't open file $fasta\n";

# open the index file
$index = "$fasta.index";
open(INDEX, "<$index") || die "Can't open $index\n";

# read in the index
while () {
  @words = split;
  $id = $words[0];            # sequence id
  $addr = $words[1];            # start of sequence
  $index{$id} = "$index{$id} $addr";
}

# use direct access to get each target
foreach $id (@target) {
  $_ = $index{">$id"};            # address of sequence
  if ($_ eq undef) {
    print STDERR "Can't find target $id\n";
    next;                # go to next target
  }
  # open output file if requested
  if ($print_to_files) {
    $outfile = $id;
    $outfile =~ s/\|/_/g;
    $outfile .= ".$suffix" if ($suffix);
    open(OUTPUT,  ">$outfile") || die("Can't open file $outfile : $!\n");
  } else { # print to files
    open(OUTPUT, ">-");
  }
  foreach $addr (split) {        # one-to-many
    seek(FASTA, $addr, 0);        # move to start of target
    $_ = ;
    $idline = $_;
    @words = split;
    if ($check && $words[0] ne ">$id") {
      $tmp = $words[0];
      @words2 = split(/[>\|]/, $tmp);
      $tmp = $words2[$#words2];
      if ($tmp ne $id) {
    $lower = $tmp;
    $lower =~ tr/A-Z/a-z/;
    if ($lower ne $id) {
      print STDERR "Bad index?  target= >$id  found= $words[0]\n";
      next;                # go to next target
    }
      }
    }
    print OUTPUT "$idline";        # print header
    $seq = "";
    while () {            # read sequence lines
      if (/^>/) {last}            # start of next sequence
      # print sequence line
      chop;
      $seq .= $_;
    }
    # print sequence in lines of length $line_size
    # get portion of sequence to print if -off and/or -len given
    if ($off != 1 || $len != -1) {
      if ($len == -1) {
        $seq = substr($seq, $off-1);
      } else {
        $seq = substr($seq, $off-1, $len);
      }
    }
    for ($i=0; $i       print substr($seq, $i, $line_size), "\n";
    }
  } # addr
  close(OUTPUT);
} # id