#!/usr/bin/perl

# configure
use constant STOP => 'stopwords.inc';

# include
use strict;
require 'subroutines.pl';

my $book = $ARGV[ 0 ];
if ( ! $book ) {

	print "Usage: $0 <file>\n";
	exit;
	
}

# parse
my @words = &list_words( $book );

# count words
my %word_count = ();
for ( my $i = 0; $i <= $#words; $i++ ) { $word_count{ $words[ $i ] }++ }

# construct bi-grams
my @bigrams = ();
for ( my $i = 0; $i < $#words; $i++ ) { $bigrams[ $i ] = $words[ $i ] . ' ' . $words[ $i + 1 ] }

# count bi-grams
my %bigram_count = ();
for ( my $i = 0; $i < $#words; $i++ ) { $bigram_count{ $bigrams[ $i ] }++ }

# calculate t-score
my %tscore = ();
for ( my $i = 0; $i < $#words; $i++ ) {

	$tscore{ $bigrams[ $i ] } = ( $bigram_count{ $bigrams[ $i ] } - 
	                              $word_count{ $words[ $i ] } * 
	                              $word_count{ $words[ $i + 1 ] } / 
	                              ( $#words + 1 ) ) / 
	                              sqrt( $bigram_count{ $bigrams[ $i ] }
	                            );

}

# display (sans stopwords, etc.)
my $stopwords = &slurp_words( STOP );
foreach my $bigram ( sort { $tscore{ $b } <=> $tscore{ $a } } keys %tscore ) {

	my ( $first_token, $second_token ) = split / /, $bigram;
	
	# remove stopwords, etc.
	next if ( $$stopwords{ $first_token } );
	next if ( $first_token =~ /[,.?!:;()\-]/ );
	next if ( $$stopwords{ $second_token } );
	next if ( $second_token =~ /[,.?!:;()\-]/ );
	
	# done
	print "$tscore{ $bigram }\t"           . 
	      "$word_count{ $first_token }\t"  . 
	      "$word_count{ $second_token }\t" . 
	      "$bigram_count{ $bigram }\t"     . 
	      "$bigram\t\n";

}