stat_descriptive.pl
Ce script écrit en perl fournit des statistiques descriptives globales pour un fichier de séquences au format fasta ou fastq.
Il calcule le nombre totale de séquences, la longueur minimale, maximale, moyenne et médiane.
#!/usr/bin/perl
=pod
=head1 NAME
stat_descriptive.pl
=head1 SYNOPSIS
stat_descriptive.pl --help
=head1 REQUIRES
Perl5
=head1 DESCRIPTION
Get minimal statistic from a fasta or fastq file
=cut
use Statistics::Descriptive;
use Bio::SeqIO;
use Getopt::Long;
use Carp qw (cluck confess croak);
use warnings;
use Pod::Usage;
=pod
=head1 OPTIONS
Example :
perl stat_descriptive.pl --input <file> --format fasta
=head2 Parameters
=over 4
=item B<--input or -i> :
Input file name (<string>)
=item B<--format or -f> :
Format (<string>)
Default = 'fasta'
=item B<--variant or -v>
Supports all variants of fastq, including Illumina v 1.0 and 1.3:
variant note
-----------------------------------------------------------
sanger original
solexa Solexa, Inc. (2004), aka Illumina 1.0
illumina Illumina 1.3
Only available if format is fastq
=item B<--output or -o>
Output file name. Default input.summary
=item B<--help or -h> :
Prints a brief help message and exits.
=back
=cut
my $format = 'fasta';
my $variant = "";
my $help;
my $man;
my $file = "";
my $output = "";
my $usage = q/
stat_descriptive.pl [--input <file>] [--format <format>]
Parameters
-input Input file name (<string>) [required]
-format Input Format (<string>), default fasta
-variant Variants of fastq
-output Ouput file name (<string>), default input.summary
-help
/;
GetOptions(
'input=s' => \$file,
'format=s' => \$format,
'variant=s'=> \$variant,
'output=s' => \$output,
'man' => \$man,
'help|h|?' => \$help
) or pod2usage(1);
if ($help) { pod2usage(1); }
if ($man) { pod2usage( -verbose => 2 ); }
if ($file eq "") {
print "Warn :: --input is empty\nPlease specify a sequence file to be analyzed\n";
print $usage;
exit 0;
}
$output = $file .".summary" if $output eq "";
if ($variant eq "solexa") {
$format = "fastq-solexa";
}
elsif ($variant eq "illumina"){
$format = "fastq-illumina";
}
open(OUT,">$output");
my $in = new Bio::SeqIO(
-file => $file,
-format => $format
);
my $stats = new Statistics::Descriptive::Full;
while(my $seqobj = $in->next_seq) {
$stats->add_data($seqobj->length);
}
$in->close;
printf OUT "Number of sequences = %d\nMin length = %d\nMax length = %d\nMean = %.1f\nMedian = %1.f\n",
$stats->count, $stats->min,$stats->max, $stats->mean, $stats->median;
close OUT;
print "Result in : $output\n";