seqfilediff.pl

11/12/2014 15:42

Ce script perl compare deux fichiers de séquences au format fasta

 

 

#!/usr/bin/perl

use warnings;
use strict;
use Bio::SeqIO;
use Getopt::Long;
use constant USAGE =><<END;

SYNOPSIS:

 seqfilediff.pl [OPTIONS] [file1] <file2>

DESCRIPTION:

Reports the differences between two sequence files, and prints either the
sequences unique to file1 or the sequences that the files have in common.

OPTIONS:

       --help
             Prints a help.
       --informat
             Format of the sequence files. Fasta is default.
       --outformat
             Format of the sequence files. Fasta is default.
       --common
             Prints the sequences that the files have in common.
       --onlyfile2
             Prints the sequences that are unique to file2 instead.
       --onlyid
             Prints the id of each sequence only.

EXAMPLES:

 # Print the ids that the two fasta files have in common:
 seqfilediff.pl --common --onlyid file1.fa file2.fa

 # Print the entries unique to each of two EMBL files:
 cat file2.embl | seqfilediff.pl --informat EMBL file1.embl

COPYRIGHT:

This program is free software. You may copy and redistribute it under
the same terms as Perl itself.

END

my $help = 0;
my $informat = 'Fasta';
my $outformat = '';
my $common = 0;
my $onlyfile2 = 0;
my $onlyid = 0;

GetOptions("help" => \$help,
           "informat|format=s" => \$informat,
           "outformat=s" => \$outformat,
           "common" => \$common,
           "onlyfile2" => \$onlyfile2,
           "onlyid" => \$onlyid) or die USAGE;

$common && $onlyfile2 and die "Incompatible options: --common and --onlyfile2\n";

my $file2 = pop @ARGV or die USAGE;;
my $file1 = pop @ARGV if @ARGV;

$outformat ||= $informat;

# Make file/steam handles:
my $fh1;
my $fh2;
if ($file2) {
  $fh2 = Bio::SeqIO->newFh(-file => "$file2",
                           -format => $informat);
}
if ($file1) {
  $fh1 = Bio::SeqIO->newFh(-file => "$file1",
                           -format => $informat);
} else {
  $fh1 = Bio::SeqIO->newFh(-fh => \*STDIN,
                           -format => $informat);
}

# Create an output file handle:
my $out;
$out = Bio::SeqIO->newFh(-fh => \*STDOUT , '-format' => $outformat );

# Hash the sequence entries:
my %file1;
while (<$fh1>) {
  $file1{$_->id()} = $_;
}
my %file2;
while (<$fh2>) {
  $file2{$_->id()} = $_;
}

# Do the proper printing:
if ($common) {
  for (keys %file1) {
    if ($onlyid) {
      print "$_\n" if defined $file2{$_};
    } else {
      print $out $file1{$_} if defined $file2{$_};
    }
  }
} elsif ($onlyfile2) {
  for (keys %file2) {
    if ($onlyid) {
      print "$_\n" unless defined $file1{$_};
    } else {
      print $out $file2{$_} unless defined $file1{$_};
    }
  }
} else {
  for (keys %file1) {
    if ($onlyid) {
      print "$_\n" unless defined $file2{$_};
    } else {
      print $out $file1{$_} unless defined $file2{$_};
    }
  }
}

=head1 SYNOPSIS:

 seqfilediff.pl [OPTIONS] [file1] <file2>

=head1 DESCRIPTION:

Reports the differences between two sequence files, and prints either the
sequences unique to file1 or the sequences that the files have in common.

=head1 OPTIONS:

=over 4

=item --help

Prints a help.

=item --format

Format of the sequence files. Fasta is default.

=item --common

Prints the sequences that the files have in common.

=item --onlyfile2

Prints the sequences that are unique to file2 instead.

=item --onlyid

Prints the id of each sequence only.

=back

=head1 EXAMPLES:

 # Print the ids that the two fasta files have in common:
 seqfilediff.pl --common --onlyid file1.fa file2.fa

 # Print the entries unique to each of two EMBL files:
 cat file2.embl | seqfilediff.pl --informat EMBL file1.embl

=head1 COPYRIGHT:

This program is free software. You may copy and redistribute it under
the same terms as Perl itself.


=cut