Venn_stats.pl

09/12/2014 15:10

Ce script, écrit en Perl, produit des statistiques de Venn en comparant les lignes de plusieurs fichiers.

 

#!/usr/bin/perl

use strict; use warnings;

# Produce Venn stats for N files.  As Perl can use any
# string for a hash key this is fairly easy using hashes.

# Files need not be sorted.  Any duplicate lines within an input
# file will just be ignored.

use IO::File;
use IO::Pipe;
use List::Util qw(sum);
use Data::Dumper;

# %bighash maps each distinct (chomped) input line to a bitmask recording
# which files contain it.
my %bighash;
my @bitarray; #Holds one count per file combination.
my @linecounts; #Holds total lines per file.

my $usage = "Usage: venn_stats.perl [-p FILTER] <file1> <file2> ... [fileN]\n";
@ARGV or die $usage;

#Allow a pre-processing step on each file.  Don't need to use
#getopts unless I add more args.
# With "-p FILTER", FILTER is a shell command that is run once per input
# file, with the file name appended as its last argument; its output is
# read instead of the file itself.
my $file_filter;
if($ARGV[0] eq '-p')
{
    shift;
    $file_filter = shift;

    # "-p" without a FILTER, or without any file after it, would otherwise
    # fall through and print an all-zero report (plus warnings); treat it
    # as a usage error instead.
    (defined $file_filter and @ARGV) or die $usage;
}

#Collect the data
# Every input file is assigned one bit ($bitmask).  Each distinct line
# accumulates, in %bighash, the bits of all the files it was seen in, and
# @linecounts records the raw number of lines read per file.
my $bitmask = 1;
for my $file (@ARGV)
{
    my $fh;
    if($file_filter)
    {
        # The command string is run through the shell, so protect the file
        # name: it is wrapped in single quotes and every embedded single
        # quote is rewritten as '\'' so it cannot terminate the quoting.
        (my $quoted = $file) =~ s/'/'\\''/g;
        $fh = IO::Pipe->new()->reader("$file_filter '$quoted'")
            or die "Cannot run '$file_filter' on '$file': $!\n";
    }
    else
    {
        # Open strictly for reading so that names such as ">out" or
        # "cmd |" are treated as plain file names.  "-" still means
        # standard input, as it did with the one-argument open.
        $fh = ($file eq '-' ? \*STDIN : IO::File->new($file, 'r'))
            or die "Cannot open '$file': $!\n";
    }

    push @linecounts, 0;

    # Read one line at a time rather than slurping the whole file into a
    # list; a named lexical is used so that $_ is left alone.
    while(defined(my $line = <$fh>))
    {
        chomp $line;
        $linecounts[-1]++;
        $bighash{$line} |= $bitmask;
    }

    if($file_filter)
    {
        # Closing the pipe end waits for the filter process, which makes
        # its wait status available in $?.  A failing filter is reported
        # but not fatal, because some filters (grep, for instance) return
        # non-zero for perfectly valid empty output.
        $fh->close;
        warn "Warning: '$file_filter' on '$file' ended with wait status $?\n"
            if $? != 0;
    }

    $bitmask <<= 1;
}

# Invert the per-line masks into a histogram: $bitarray[MASK] ends up as the
# number of distinct lines whose file-membership mask is exactly MASK.
# Masks can take any value between 1 and $bitmask - 1.
foreach my $membership (values %bighash)
{
    $bitarray[$membership]++;
}

#Now I need to generate the output.
# The hard part is the textual descriptions.

# Report, for every input file, how many distinct lines it holds (the figure
# "sort -u | wc -l" would give).  This is the sum of the counts of every
# file combination whose mask includes that file's bit.
foreach my $file_index (0 .. $#ARGV)
{
    my $file_bit = 2 ** $file_index;

    my $unique_lines = 0;
    foreach my $mask (1 .. $bitmask - 1)
    {
        next unless $mask & $file_bit;
        $unique_lines += $bitarray[$mask] || 0;
    }

    # Only mention the raw line count when duplicates made it differ.
    my $line_total = $linecounts[$file_index];
    my $suffix = '';
    $suffix = " (of $line_total total)" if $unique_lines != $line_total;

    printf "Total unique lines in file %s = %d%s\n",
        $file_index + 1, $unique_lines, $suffix;
}

# Then add everything to get grand total.
# sum() returns undef for an empty list, which happens for @bitarray when
# every input file is empty.  Seeding both lists with 0 keeps the totals
# numeric, so the comparison and printf below do not raise "uninitialized
# value" warnings.
my $unique_total = sum(0, grep {$_} @bitarray);
my $total_total = sum(0, @linecounts);
printf "Total unique lines in all files = %d%s\n\n", $unique_total,
        ($unique_total == $total_total) ? "" : " (of $total_total total)";

# Print one line per proper file combination (every mask except "all
# files").  Masks are handled as binary strings, ordered first by how many
# files they involve and then by numeric value.
my @patterns = map { sprintf("%b", $_) } (1 .. $bitmask - 2);
my @ordered = sort {
    ($a =~ tr/1//) <=> ($b =~ tr/1//) || $a <=> $b
} @patterns;

foreach my $pattern (@ordered)
{
    my $count = $bitarray[oct("0b$pattern")] || 0;

    # Reverse the digits so that index 0 is the bit of file 1, then keep
    # the 1-based numbers of the files whose bit is set.
    my @bits = reverse split('', $pattern);
    my @members = grep { $bits[$_ - 1] } (1 .. scalar @bits);

    if(@members > 1)
    {
        printf "Found in files %s = %d\n", join(',', @members), $count;
    }
    else
    {
        printf "Only in file %s = %d\n", $members[0], $count;
    }
}

# The mask with every bit set is reported last, under its own label.
my $all_files_count = $bitarray[$bitmask-1] || 0;
printf "Found in all files = %d\n", $all_files_count;