get_gc_content.pl

09/12/2014 14:54

Ce script en perl génère un fichier tabulé nommé "gc_out.txt" avec pour chaque séquence au format fasta, sa taille, son taux de GC et le nombre de A, T, C et G présent dans la séquence.

 

 #!/usr/bin/perl -w
####################################################################################################
### Get GC Content                                                                               ###
### Usage: get_gc_content.pl <fasta file>                                                        ###
### This program takes a fasta file as it's first (and only) parameter.                          ###
###                                                                                              ###
### It returns a tab delimited file (gc_out.txt): column 1 = header ID (everything between ">"   ###
### and the first space in the header), and column 2 = gc content for the fasta entry.           ###
###                                                                                              ###
### This script now also returns the total nucleotide count, along with the number of of         ###
### A's, G's, C's and T's for each fasta record.                                                 ###
####################################################################################################
#---------------------------------------------------------------------------------------------------------------------------
#Deal with passed parameters
#---------------------------------------------------------------------------------------------------------------------------
if ($#ARGV == -1) {
    usage();
    exit;
}
$fasta_file = $ARGV[0];
$out_file = "gc_out.txt";
unless ( open(IN, "$fasta_file") ) {    
    print "Got a bad fasta file: $fasta_file\n\n";
    exit;
}
unless ( open(OUT, ">$out_file") ) {
    print "Couldn't create $out_file\n";
    exit;
}
print "Parameters:\nfasta file = $fasta_file\noutput file = $out_file\n\n";
#---------------------------------------------------------------------------------------------------------------------------
#The main event
#---------------------------------------------------------------------------------------------------------------------------
print OUT "ID\t% GCContent\tTotal Count\tG Count\tC Count\tA Count\tT Count\n";
$seq = "";
while (<IN>) {
    chomp;
    if (/^>/) {
    #finish up previous line.
    if (length($seq) > 0) {
        &process_it;
    }
    #start new line.
    $id = $_;
    $id =~ s/^>(.+?)\s.+$/$1/g;
    print OUT "$id\t";
    }
    else {
    $seq = $seq . $_;
    }
}

#finish up last line.
&process_it;

close(IN);
close(OUT);

sub process_it {
    @letters = split(//, $seq);
    $gccount = 0;
    $totalcount = 0;
    $acount = 0;
    $tcount = 0;
    $gcount = 0;
    $ccount = 0;
    foreach $i (@letters) {
    if (lc($i) =~ /[a-z]/) {
        $totalcount++;
    }
    if (lc($i) eq "g" || lc($i) eq "c") {
        $gccount++;
    }
    if (lc($i) eq "a") {
        $acount++;
    }
    if (lc($i) eq "t") {
        $tcount++;
    }
    if (lc($i) eq "g") {
        $gcount++;
    }
    if (lc($i) eq "c") {
        $ccount++;
    }
    }
    if ($totalcount > 0) {
    $gccontent = (100 * $gccount) / $totalcount;
    }
    else {
    $gccontent = 0;
    }
    print OUT "$gccontent\t$totalcount\t$gcount\t$ccount\t$acount\t$tcount\n";
    $seq = "";
}