Venn_stats.pl
Ce script, écrit en Perl, produit des statistiques de Venn en comparant les lignes de plusieurs fichiers.
#!/usr/bin/perl
use strict; use warnings;
# Produce Venn stats for N files. As Perl can use any
# string for a hash key this is fairly easy using hashes.
# Files need not be sorted. Any duplicate lines within an input
# file will just be ignored.
use IO::File;
use IO::Pipe;
use List::Util qw(sum);
use Data::Dumper;
my %bighash;    # Maps each distinct line to the OR of the bits of the files containing it.
my @bitarray;   # Holds one count per file combination (indexed by bitmask).
my @linecounts; # Holds total lines per file.

my $usage = "Usage: venn_stats.perl [-p FILTER] <file1> <file2> ... [fileN]\n";
@ARGV or die $usage;

# Allow a pre-processing step on each file. Don't need to use
# getopts unless more args are added.
my $file_filter;
if($ARGV[0] eq '-p')
{
    shift @ARGV;
    $file_filter = shift @ARGV;
    # "-p" as the very last argument leaves no filter command to run.
    defined $file_filter or die "Option -p requires a FILTER argument\n$usage";
}

# Everything left in @ARGV is an input file; there must be at least one,
# otherwise the report below would be computed over nothing.
@ARGV or die $usage;
# Collect the data. Each input file owns one bit of $bitmask, and every
# distinct line accumulates (in %bighash) the bits of all files it occurs in.
my $bitmask = 1;
for my $path (@ARGV)
{
    my $fh;
    if($file_filter)
    {
        # The filter command is run through the shell with the file name
        # wrapped in single quotes, so any single quote inside the name has
        # to be escaped to keep it a single shell word.
        (my $quoted = $path) =~ s/'/'\\''/g;
        $fh = IO::Pipe->new()->reader("$file_filter '$quoted'")
            or die "Cannot run '$file_filter' on $path: $!\n";
    }
    else
    {
        $fh = IO::File->new($path)
            or die "Cannot open $path: $!\n";
    }

    push @linecounts, 0;

    # Read one line at a time instead of building a list of the whole file.
    while(defined(my $line = <$fh>))
    {
        chomp $line;
        $linecounts[-1]++;
        $bighash{$line} |= $bitmask;
    }

    # Closing the pipe end waits for the filter process and sets $?, which
    # lets a failed filter be reported instead of silently looking like an
    # empty file.
    $fh->close;
    if($file_filter && $?)
    {
        warn "Warning: '$file_filter' exited with status " . ($? >> 8) . " for $path\n";
    }

    # Move on to the next file's bit.
    $bitmask <<= 1;
}
# Invert the hash into per-combination tallies: every stored value is a
# bitmask between 1 and $bitmask - 1 and is used as an index into @bitarray.
for my $combo (values %bighash)
{
    $bitarray[$combo] += 1;
}
# Output generation starts here; the textual descriptions are the fiddly part.
# First report, for each input file, how many distinct lines it contains
# (as in: sort -u | wc -l). A combination counts towards a file when that
# file's bit is set in the combination's bitmask.
for my $file_index (0 .. $#ARGV)
{
    my $file_bit = 1 << $file_index;

    my $unique_lines = 0;
    for my $combo (1 .. $bitmask - 1)
    {
        next unless $combo & $file_bit;
        $unique_lines += $bitarray[$combo] || 0;
    }

    # Only mention the raw line count when duplicates made it differ.
    my $suffix = ($unique_lines == $linecounts[$file_index])
        ? ""
        : " (of $linecounts[$file_index] total)";

    printf "Total unique lines in file %s = %d%s\n",
        $file_index + 1, $unique_lines, $suffix;
}
# Then add everything to get the grand totals. sum() returns undef for an
# empty list (e.g. when every input file is empty and @bitarray has no
# entries), so seed it with 0 to keep both totals numeric.
# The grep drops combinations that never occurred (undef slots in @bitarray).
my $unique_total = sum 0, grep {$_} @bitarray;
my $total_total = sum 0, @linecounts;
printf "Total unique lines in all files = %d%s\n\n", $unique_total,
    ($unique_total == $total_total) ? "" : " (of $total_total total)";
# Report every proper subset of files (all bitmasks except "all files"),
# ordered by how many files are involved and then by bitmask value.
# The number of set bits is computed once per mask before sorting.
my @ordered_masks =
    map  { $_->[1] }
    sort { $a->[0] <=> $b->[0] or $a->[1] <=> $b->[1] }
    map  { my $bits = sprintf("%b", $_); [ ($bits =~ tr/1//), $_ ] }
    (1 .. $bitmask - 2);

for my $mask (@ordered_masks)
{
    my $hits = $bitarray[$mask] || 0;

    # Least significant bit first, so position N corresponds to file N.
    my @bits = reverse split //, sprintf("%b", $mask);
    my @members = grep { $bits[$_ - 1] } 1 .. scalar(@bits);

    if(@members == 1)
    {
        printf "Only in file %s = %d\n", $members[0], $hits;
    }
    else
    {
        printf "Found in files %s = %d\n", join(',', @members), $hits;
    }
}

# The mask with every file's bit set is reported last.
printf "Found in all files = %d\n", $bitarray[$bitmask-1] || 0;