translate2aa.pl
Ce script écrit en perl traduit des séquences nucléotidiques (ADN ou ARN) en séquences protéiques en utilisant l'outil getorf d'EMBOSS. Il sélectionne de manière automatique l'ORF la plus grande à partir d'un seuil de taille fournit par l'utilisateur.
Attention, ce script nécessité d'installer le package EMBOSS
#!/usr/bin/perl -w
#
# translate2aa.pl
#
# Translate dna sequences in fasta format using EMBOSS getorf.
# The longest ORF is used. Input sequences shorter than a
# threshold (minLen) is skipped.
#
# getorf must be located in PATH.
#
use Bio::SeqIO;
use IPC::Open2;
use strict;
my $usage = "translate2aa.pl [ -m minlen ] dna_fasta_file";
my $minLen = 30; # Minimum number of nucleotides to be reported
sub dna2aa($) {
my $getorfProg = "/home/cyril/Programs/EMBOSS-6.6.0/emboss/getorf --filter -minsize $minLen";
my $dna = shift @_;
return if length($dna) < $minLen;
# Run getorf
my $pid = open2(*RFH, *WFH, $getorfProg);
print WFH $dna;
close WFH;
my $maxLen = 0;
my $aa;
my $best = 0;
while (<RFH>) {
chomp;
if (/^>.*\[(\d+)\D+(\d+)]/) {
my $len = abs($2-$1);
if ($len > $maxLen) {
$maxLen = $len;
$best = 1;
$aa = "";
}
else {
$best = 0;
}
}
elsif ($best) {
$aa .= $_;
}
}
close RFH;
waitpid($pid, 0);
return $aa;
}
# Get options and arguments
if (@ARGV == 3 && $ARGV[0] eq "-m") {
shift;
$minLen = shift;
die "Minimum length must be a positive integer!\n" if $minLen !~ /^\d+$/;
}
die "Usage: $usage\n" if @ARGV != 1;
my $fna = $ARGV[0];
die "$fna does not exist!\n" if ! -f $fna;
my $in = Bio::SeqIO->new(-file => $fna, -format => 'Fasta');
while (my $seq = $in->next_seq) {
my $aa = dna2aa($seq->seq);
if (!defined $aa) {
warn "Could not translate ", $seq->id, "\n";
next;
}
print ">", $seq->id, "\n";
print "$aa\n";
}