#!/usr/bin/env perl
#
# Strictly a filter, this histograms incoming data points (read from STDIN)
# into requested bin sizes.  If the bin size is not given with the -int
# option, then the interval is assumed to be (max-min)/10.
#
# &: Jan 28, 2003
#
use strict;
use warnings;
use POSIX qw(floor ceil);
use Scalar::Util qw(looks_like_number);

my $IntervalWidth = undef;  # Width of each bin; defaults to (max-min)/10
my $Min           = undef;  # Lower edge of the first bin (required)
my $Max           = undef;  # Upper edge of the histogram range (required)
my $UseStar       = undef;  # Print horiz histogram with hash-marks
my $Scale         = 1;      # If $UseStar, each * represents a count of this much.

# Print the usage text on STDERR and terminate with exit status 1.
sub usage {
    print STDERR "Usage: $0 -min MINVALUE -max MAXVALUE [-int INTERVAL] [-stars] [-scale COUNT]\n";
    print STDERR "Strictly a filter, this histograms incoming numerical data points into\n"
               . "requested bin sizes. If bin sizes are not given with the -interval\n"
               . "option, then the interval is assumed to be (max-min)/10.\n";
    print STDERR "Use -stars to print a horizontal bar of stars for each bin\n";
    print STDERR "Use -scale to increase the weight of each star, if -stars is used\n";
    exit 1;
}

# Return true if $token is a finite number (rejects text, NaN and +/-Inf).
sub isFiniteNumber {
    my ($token) = @_;
    return 0 if !defined($token) || !looks_like_number($token);
    # Inf - Inf and NaN - NaN are both NaN, and NaN never compares equal to 0.
    return ($token - $token) == 0 ? 1 : 0;
}

# Return $value if it is a usable number for option $option; otherwise
# report the problem and exit through usage().
sub numericOptionValue {
    my ($option, $value) = @_;
    return $value if isFiniteNumber($value);
    print STDERR "Option $option requires a numeric value\n";
    usage();
}

MAIN: {
    # Test definedness rather than truth so an argument of "0" does not
    # silently end option parsing.
    while (defined(my $arg = shift(@ARGV))) {
        if ($arg =~ /^-h(elp)?$/) {
            usage();
        } elsif ($arg =~ /^-int/) {
            $IntervalWidth = numericOptionValue($arg, shift(@ARGV));
        } elsif ($arg =~ /^-min/) {
            $Min = numericOptionValue($arg, shift(@ARGV));
        } elsif ($arg =~ /^-max/) {
            $Max = numericOptionValue($arg, shift(@ARGV));
        } elsif ($arg =~ /^-star/) {
            $UseStar = 1;
        } elsif ($arg =~ /^-scale/) {
            $Scale = numericOptionValue($arg, shift(@ARGV));
        } elsif ($arg =~ /^-/) {
            print STDERR "Unknown option: $arg\n";
            usage();
        } else {
            last;
        }
    }

    usage() if (!defined($Max) || !defined($Min));

    # Documented default: ten equal bins across the requested range.
    $IntervalWidth = ($Max - $Min) / 10 if !defined($IntervalWidth);
    usage() if ($IntervalWidth <= 0);

    # We have ($Max-$Min)/$IntervalWidth bins.  The quotient may be
    # fractional, so the last bin index is ceil(quotient)-1; it is kept
    # at 0 or above so that a degenerate range still owns one bin.
    my $numBins = ($Max - $Min) / $IntervalWidth;
    my $lastBin = ceil($numBins) - 1;
    $lastBin = 0 if $lastBin < 0;

    my ($sum, $sumSq, $num, $ltMin, $gtMax) = (0, 0, 0, 0, 0);
    my ($actualMin, $actualMax) = (undef, undef);
    my $numSkipped = 0;
    my @bins = ();

    while (my $line = <STDIN>) {
        # split ' ' discards leading whitespace, so no empty fields appear.
        foreach my $val (split(' ', $line)) {
            # Headers and other non-numeric tokens would otherwise be
            # counted as 0 and distort both the stats and the bins.
            if (!isFiniteNumber($val)) {
                ++$numSkipped;
                next;
            }

            $sum   += $val;
            $sumSq += $val * $val;
            $num++;

            if ($val < $Min) {
                ++$ltMin;
            } elsif ($val > $Max) {
                ++$gtMax;
            } else {
                my $binNdx = floor(($val - $Min) / $IntervalWidth);
                # A value equal to $Max would index one past the last
                # printed bin; fold it into the last bin instead.
                $binNdx = $lastBin if $binNdx > $lastBin;
                ++$bins[$binNdx];
            }

            $actualMin = $val if (!defined $actualMin || $val < $actualMin);
            $actualMax = $val if (!defined $actualMax || $val > $actualMax);

            # Progress indicator for large inputs.
            print STDERR "$ltMin ] $num [ $gtMax\r" if ($num % 10000 == 0);
            print STDERR "$ltMin ] $num [ $gtMax\n" if ($num % 1000000 == 0);
        }
    }

    print STDERR "Skipped $numSkipped non-numeric token(s)\n" if $numSkipped;

    # Misc Stats
    my $mean = $num == 0 ? 0 : $sum / $num;
    my $var  = $num == 0 ? 0 : $sumSq / $num - $mean * $mean;
    # Rounding can push a zero variance slightly negative, and sqrt()
    # of a negative number is fatal in Perl.
    $var = 0 if $var < 0;
    my $sd = sqrt($var);

    $Scale = 1 if ($Scale <= 0);

    # With no input there is no observed min/max to report.
    my $shownMax = defined($actualMax) ? $actualMax : 'n/a';
    my $shownMin = defined($actualMin) ? $actualMin : 'n/a';

    print "# NumSamples = $num; Max = $shownMax; Min = $shownMin\n";
    print "# Mean = $mean; Variance = $var; SD = $sd\n";
    print "# Each * represents a count of $Scale\n" if ($UseStar);

    # Use a flag to indicate whether we are within a range of consecutive
    # empty bins to better format output.  Assume gaps to the left of
    # min bin to start.
    my $inGap       = 1;
    my $haveStarted = 0;

    for my $i (0 .. $lastBin) {
        my $count = defined($bins[$i]) ? $bins[$i] : 0;

        # Suppress everything before the first non-empty bin.
        next if (!$haveStarted && $count == 0);
        $haveStarted = 1;

        # Print a blank line in lieu of multiple consecutive empty bins
        if ($count == 0) {
            next if $inGap;
            print "\n";
            $inGap = 1;
            next;
        }
        $inGap = 0;

        my $lo  = $Min + $i * $IntervalWidth;
        my $hi  = $Min + ($i + 1) * $IntervalWidth;
        my $tag = sprintf("%.4f - %.4f", $lo, $hi);

        # $Scale is strictly positive here, so the division is safe.
        if ($UseStar) {
            printf "%20s [%6d]: %s\n", $tag, $count, getStringOfLength($count / $Scale);
        } else {
            printf "%20s: %d\n", $tag, $count;
        }
    }
}

#----------------------------------------------------------------------
# Return a string of '*' characters.  A fractional $n is rounded up and
# a non-positive $n yields the empty string.
sub getStringOfLength {
    my ($n) = @_;
    return '' if $n <= 0;
    return '*' x ceil($n);
}