#!/usr/bin/perl -w

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell

# StatNews - A script to generate some statistics out of a newsgroup.
# $Id: statnews.pl,v 1.2 1999/11/04 23:25:43 salve Exp $
#
# Copyright  Davide G. M. Salvetti <salve@debian.org>, 1998.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to: The Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
#
# On Debian GNU/Linux System you can find a copy of the GNU General Public
# License in /usr/doc/copyright/GPL.

use strict; use vars qw($CAP $DOT $FROM $SHRINK $SPOOL $TO $WIDTH);
use Getopt::Long;
#Getopt::Long::config(qw(bundling));
use Time::Local;
use MIME::Words qw(:all);
use locale;

# Please, don't change the following line unless you've carefully read
# the GNU General Public License and you're sure you both understand
# it and know what you're doing (in legal terms).
my $AUTHOR = 'Copyleft (C) Davide G. M. Salvetti <salve@debian.org>, 1998.';

my $REL = q$Revision: 1.2 $; chop($REL);

#---------------------------#
#-c-o-n-f-i-g-u-r-a-t-i-o-n-#
#---------------------------#

# The default directory where all newsgroups hierarchies reside.
$SPOOL = '/var/spool/news/';

# The default terminal width.
$WIDTH = 80;

# The minimum allowed terminal width.
$SHRINK = 70;

# Whether to capitalize sender and receiver names.
$CAP = 1;

# Whether to leave dot or translate to slash in newsgroup names.
$DOT = 0;


#-----------------------#
#-s-u-b-r-o-u-t-i-n-e-s-#
#-----------------------#

# help: Prints usage information.
sub help {
    print "StatNews generates some useful statistics out of a newsgroup.\n";
    print "$AUTHOR\n";
    print "Refer to the GNU General Public License for condition of use.\n";
    print "\nUsage: statnews [OPTIONS] NEWSGROUP\n";
    print "\nOptions:\n";
    print "  --capitalize(*)    whether to capitalize the name of both the sender\n";
    print "                     and the receiver of each message\n";
    print "                     (default is --capitalize: yes)\n";
    print "  --dotted(*)        whether to translate \".\" to \"/\"' in NEWSGROUP\n";
    print "                     (default is --nodotted: does translate)\n";
    print "  --from=DATE        set the date statistics start from\n";
    print "                     (DATE format is dd/mm/yyyy GMT)\n";
    print "  --help             display this help summary\n";
    print "  --spool=SPOOLDIR   search NEWSGROUP in SPOOLDIR\n";
    print "                     (default is $SPOOL)\n";
    print "  --to=DATE          set the date statistics end by\n";
    print "                     (DATE format is dd/mm/yyyy GMT)\n";
    print "  --width=WIDTH      set the terminal width to WIDTH columns\n";
    print "                     (default is $WIDTH, with a minimum of $SHRINK)\n";
    print "\nOptions may be conveniently abbreviated and prefixed by \"-\" instead\n";
    print "of \"--\"; the \"=\" may be omitted or substituted with one or more blanks.\n";
    print "Options listed with (*) may be negated by adding the prefix \"no\" in\n";
    print "front of them (e.g., --dotted => --nodotted).\n";
    print "\nThe content of the environment variable STATNEWS is prepended to the\n";
    print "argument list if set.  This can be used to override defaults.\n";
    print "\nPlease, report bugs to <salve\@debian.org>.\n";
}

# center: Center the argument string and return it.
sub center {
    my ($line) = @_;

    return ' ' x (($WIDTH - length($line))/2) . $line . "\n";
}

# dotline: Take the string and pad it right with dots.
sub dotline {
    my ($len, $line) = @_;

    my $fmt = sprintf("%%.%ds%%s", $len);
    return sprintf($fmt, $line, '.' x ($len -length($line)));
}

# underline: Underline the argument string and return it.
sub underline {
    my ($line) = @_;

    return sprintf("%s\n%s\n", $line, '=' x length($line));
}

# fmttime: Take a time struct and returns a string.
sub fmttime {
    my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = @_;

    my @days = ("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat");
    my @months = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
		  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec");
    # _I_ like it this way: if you want some other order, just do it.
    return sprintf("%s %d %s %d (GMT) %02d:%02d", $days[$wday], $mday, $months[$mon],
		   1900 + $year, $hour, $min);
}


#-------------------------#
#-m-a-i-n---p-r-o-g-r-a-m-#
#-------------------------#

# Initialize start and end date.
($FROM, $TO) = (0, time);
# Environment configuration.
if ($_ = $ENV{STATNEWS}) {@_ = split; unshift(@ARGV, @_);}
# Options.
my %opt; unless (GetOptions(\%opt, "capitalize!", "dotted!", "from=s", "help",
			    "spool=s", "to=s", "width=i")) {help(); exit(1);}
if ($opt{"help"}) {help(); exit(0);}
if (defined($opt{"capitalize"})) {$CAP = $opt{"capitalize"};}
if (defined($opt{"dotted"})) {$DOT = $opt{"dotted"};}
if ($_ = $opt{"spool"}) {$SPOOL = $_ . '/';}
if ($_ = $opt{"width"}) {
    if ($_ >= $SHRINK) {$WIDTH = $_;}
    else { warn "Width should be at least $SHRINK."; $WIDTH = $SHRINK;}
}
if ($_ = $opt{"from"}) {@_ = split m|/|; $FROM = timegm(0, 0, 0, $_[0], $_[1] - 1, $_[2] - 1900);}
if ($_ = $opt{"to"}) {@_ = split m|/|; $TO = timegm(0, 0, 0, $_[0], $_[1] - 1, $_[2] - 1900);}
die "$0: Start date has to be less than end date: " . 
    gmtime($FROM) . ", " . gmtime($TO) . ".\n" unless ($FROM < $TO);
unless ($_ = $ARGV[0]) {help(); exit(0);}
# Translate "news.group" to "news/group", unless --dotted is set.
unless ($DOT) {s|\.|/|g;}
my $newsgroup = $_;
# Initialize the time interval (this is intentionally reversed).
my($mintime, $maxtime) = ($TO, $FROM);
# Results variables, declared here for strict.
my(%auth, %dest, %qchars, %qlines, %quote, %ratio, %rchars, %rlines, %subj, %wchars, %wlines);
my($auth, $chars, $days, $i, $lines, $qchars, $qlines);

chdir($SPOOL) || die "$0: Can't chdir \`$SPOOL\': \"$!\"";
opendir(DIR, $newsgroup) || die "$0: Can't opendir \`$newsgroup\': \"$!\"";
my @articles = grep { /^[0-9]+$/ && -f "$newsgroup/$_" } readdir(DIR);
die "$0: Empty newsgroup \`$newsgroup\'." unless (@articles);
foreach my $IN (@articles) {
    # mtime is $stat[9].
    my $mtime = (stat("$newsgroup/$IN"))[9];
    # Skip articles outside the time window.
    next if ($mtime < $FROM || $mtime > $TO);
    # Get the maximum and minimum times.
    $mintime = ($mintime < $mtime)?$mintime:$mtime;
    $maxtime = ($maxtime > $mtime)?$maxtime:$mtime;
    my($from, $dest);
    open(IN, "$newsgroup/$IN") || die "$0: Can't open $IN: $!";
    while (<IN>) {
	# Until headers end.
	last if /^$/;
	# I think to study is a good thing.
	chomp; study;
	if (s/^From: //) {
	    # Get name from address: this is simple, but behaves mostly well.
	    s/\"//go; unless (s/\s+<.*>//g) {s/.*\((.*)\)/$1/g;}
	    # Capitalize: this is to collect "d'Andrea" together with "D'Andrea".
	    if ($CAP) {s/\b(\w)/\U$1/g;}
	    $from = decode_mimewords($_);
	    $auth{$from}++;
	}
	# X-Comment-To is a Fido syntax.
	if (s/^((X-)?Comment-)?To: //) {
	    # Capitalize.
	    if ($CAP) {s/\b(\w)/\U$1/g;}
	    $dest = decode_mimewords($_);
	    $dest{$dest}++;
	}
	if (s/^Subject: ([Rr]e:\s+)?//) {$subj{decode_mimewords($_)}++;}
    }
    # The body.
    my ($chars, $lines, $qchars, $qlines) = (0, 0, 0, 0);
    while (<IN>) {
	# Count new-line's as well.
	$chars += length;
	# Don't count blank lines.
	$lines++ if ($_);
	# It's not a quote if it doesn't match.
	if (/^\s*\w{0,5}> /) {
	    $qchars += length;
	    $qlines++;
	}
    }
    $wlines{$from} += $lines;
    $qchars{$from} += $qchars;
    $qlines{$from} += $qlines;
    $wchars{$from} += $chars;
    # There are messages without $dest, especially in UseNet.
    if ($dest) {
	$rchars{$dest} += $chars;
	$rlines{$dest} += $lines;
    }
    close(IN);
}
closedir(DIR);
# It makes sense to add one day to the time interval.
$days = int(($maxtime - $mintime)/(3600 * 24)) + 1;
# Grand totals.
($auth, $chars, $lines, $qchars, $qlines) = (0, 0, 0, 0, 0);
foreach my $key (keys %auth) {
    $auth += $auth{$key};
    $qchars += $qchars{$key};
    $qlines += $qlines{$key};
    # These could not be %rchars and %rlines,
    # since not all chars are received, but every char is written.
    $chars += $wchars{$key};
    $lines += $wlines{$key};
}

print center("StatNews Report"), center("($REL)"), "\n", center($AUTHOR), "\n\n\n";
printf("Newsgroup................: %s\n", $ARGV[0]);
printf("Time stamp...............: %s\n", fmttime(gmtime()));
printf("Start....................: %s\n", fmttime(gmtime(($mintime==$TO)?$FROM:$mintime)));
printf("End......................: %s\n", fmttime(gmtime(($maxtime==$FROM)?$TO:$maxtime)));
unless ($auth) {print "\nNo articles found!\n"; exit(0);}
printf("Days.....................: %d\n", $days);
printf("Messages.................: %d\n", $auth);
printf("Characters...............: %d\n", $chars);
printf("Average message length...: %.1f\n", $chars/$auth);
printf("Messages per day.........: %.1f\n", $auth/$days);
printf("Characters per day.......: %d\n", $chars/$days);
printf("Quoting ratio............: %.1f%% (lines) %.1f%% (chars)\n", 
       100*$qlines/$lines, 100*$qchars/$chars);

printf("\n\n%s\n", underline('Message threads (#1: tot, #2: mesgs/day, #3: share)'));
$i = 0;
foreach my $key (sort {$subj{$b} <=> $subj{$a} || $a cmp $b} keys %subj) {
    printf("%3d) %s: %3d %4.1f %4.1f%%\n", ++$i, dotline($WIDTH - 22, $key),
	   $subj{$key}, $subj{$key}/$days, 100*$subj{$key}/$auth);
}

# Usenet (as opposed to Fidonet) doesn't use To very often.
if (%dest) {
    printf("\n\n%s\n", underline('Messages received (#1: tot, #2: mesgs/day, #3: share)'));
    $i = 0;
    foreach my $key (sort {$dest{$b} <=> $dest{$a} || $a cmp $b} keys %dest) {
	printf("%3d) %s: %3d %4.1f %4.1f%%\n", ++$i, dotline($WIDTH - 22, $key),
	       $dest{$key}, $dest{$key}/$days, 100*$dest{$key}/$auth);
    }

    printf("\n\n%s\n", underline('Characters received (#1: tot, #2: chars/day, #3: share)'));
    $i = 0;
    foreach my $key (sort {$rchars{$b} <=> $rchars{$a} || $a cmp $b} keys %rchars) {
	printf("%3d) %s: %6d %7.1f %4.1f%%\n", ++$i, dotline($WIDTH - 28, $key),
	       $rchars{$key}, $rchars{$key}/$days, 100*$rchars{$key}/$chars);
    }
}

printf("\n\n%s\n", underline('Messages sent (#1: tot, #2: mesgs/day, #3: share)'));
$i = 0;
foreach my $key (sort {$auth{$b} <=> $auth{$a} || $a cmp $b} keys %auth) {
    printf("%3d) %s: %3d %4.1f %4.1f%%\n", ++$i, dotline($WIDTH - 22, $key),
	   $auth{$key}, $auth{$key}/$days, 100*$auth{$key}/$auth);
}

printf("\n\n%s\n", underline('Characters sent (#1: tot, #2: chars/day, #3: share)'));
$i = 0;
foreach my $key (sort {$wchars{$b} <=> $wchars{$a} || $a cmp $b} keys %wchars) {
    printf("%3d) %s: %6d %7.1f %4.1f%%\n", ++$i, dotline($WIDTH - 28, $key),
	   $wchars{$key}, $wchars{$key}/$days, 100*$wchars{$key}/$chars);
}

# Received/sent ratio.  Build the hash first to test and sort it over.
# Iteration on %dest, since chances are it's shorter than %auth.
foreach my $key (keys %dest) {
    if ($auth{$key}) {$ratio{$key} = $dest{$key}/$auth{$key}};
}
if (%ratio) {
    printf("\n\n%s\n",
	   underline('Received/Sent Ratio (#1: rmesgs/smesgs, #2: rchars/schars)'));
    $i = 0;
    foreach my $key (sort {$ratio{$b} <=> $ratio{$a} || $a cmp $b} keys %ratio) {
	printf("%3d) %s: %5.2f %5.2f\n", ++$i, dotline($WIDTH - 19, $key),
	       $ratio{$key}, $rlines{$key}/$wlines{$key}, $rchars{$key}/$wchars{$key});
    }
}

# Quoting ratio & Co.  Build the hash first to sort it over.
foreach my $key (keys %qlines) {$quote{$key} = 100*$qlines{$key}/$wlines{$key};}
printf("\n\n%s\n", underline('Quoting Ratio (#1: qlines/wlines, #2: qchars/wchars)'));
$i = 0;
foreach my $key (sort {$quote{$b} <=> $quote{$a} || $a cmp $b} keys %quote) {
    printf("%3d) %s: %4.1f%% %4.1f%%\n", ++$i, dotline($WIDTH - 19, $key),
	   $quote{$key}, 100*$qchars{$key}/$wchars{$key});
}

__END__

=head1 NAME

statnews - generate some useful statistics out of a newsgroup

=head1 SYNOPSIS

statnews [OPTIONS] I<NEWSGROUP>

=head1 DESCRIPTION

The B<statnews> command get some useful statistics out of a newsgroup.
It displays things like how many articles each author posted, how many
characters was written, how many lines were quoted, how many articles
belong to each thread, the number of messages/characters per day, the
average message length, and so on.

=head1 OPTIONS

=over 4

=item B<--capitalize>(*)

Whether to capitalize the name of both the sender and the receiver of
each message (default is C<--capitalize>: yes).  This option is useful
to collect C<"AUTHOR"> together with C<"author">, C<"Author">, and
C<"AuThor">.

=item B<--dotted>(*)

Whether to translate C<"."> to C<"/"> in I<NEWSGROUP> (default is
C<--nodotted>: does translate).  This option may be useful if your
system stores each newsgroup in a dedicate directory (e.g.,
F<news.useless.group>) instead that by hierarchy (e.g.,
F<news/useless/group>), or if your system has a news archive stored
this way.

=item B<--from=>I<DATE>

Set the date statistics start from (I<DATE> format is C<dd/mm/yyyy>, GMT).

=item B<--help>

Display the help summary.

=item B<--spool=>I<SPOOLDIR>

Search I<NEWSGROUP> in I<SPOOLDIR> (default is F</var/spool/news/>).

=item B<--to=>I<DATE>

Set the date statistics end by (I<DATE> format is C<dd/mm/yyyy>, GMT).

=item B<--width=>I<WIDTH>

Set the terminal width to I<WIDTH> columns (default is 80, with a
minimum of 70).

=back

Options may be conveniently abbreviated and prefixed by "-" instead of
"--"; the "=" may be omitted or substituted with one or more blanks.

Options listed with (*) may be negated by adding the prefix C<"no"> in
front of them (e.g., C<--dotted> => C<--nodotted>).

=head1 RETURN VALUE

The B<statnews> command returns 0 on success and a positive integer on
errors.

=head1 ENVIRONMENT

The environment variable I<STATNEWS> can hold a set of default options for
B<statnews>.

These options are interpreted first by the program and can be
overridden by explicit command line parameters.  For example:

=over 4

=item B<sh:>

C<STATNEWS="--nocapital --width=132"; export STATNEWS>

=item B<csh:>

C<setenv STATNEWS "--nocapital --width=132">

=back

=head1 FILES

The default spool directory is F</var/spool/news>.

=head1 SEE ALSO

L<rn(1)>, L<readnews(1)>.

=head1 BUGS

There are no know bugs.

=head1 UNRESTRICTIONS

This program is copylefted.  Refer to the GNU General Public License
for conditions of use.

=head1 AUTHOR

This program has been written and is actively maintained by S<Davide
G. M.> Salvetti <salve@debian.org>.

=head1 HISTORY

This program was originally aimed for use with FidoNet style echo
areas under Debian GNU/Linux.  It now can be used with Usenet
newsgroups as well.  More precisely, it can be used with every message
base that stores each message in a file in some directory.

=cut
