# freqs.pl v 0.2b
# By Catherine N. Ball, based on obvious code in
# Wall & Schwartz (1992:38) _Programming Perl_
sub make_frequency_index {
#package CNBfreqs;
@st_time = times;
# Initialization
$success = 0;
$Result = "";
($UD, $TextFile, $text_name, $sort) = @_;
#($TextFile, $text_name, $sort) = @_;
$/ = ''; # line terminator set to null (slurp paragraphs)
$* = 1; # enable multi-line patterns (allow newlines in string)
$OK = &LoadText; # load the text: results in $text
if(!$OK) { return (0, $Result); }
($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
$mon = $mon+1;
$year=$year+1900;
$date = "$mon/$mday/$year $hour\:$min";
$tokens = 0;
$ttr = 0;
$counter=0;
$cumperc=0;
undef %freqs;
# splitting on " " actually handles tabs, newlines, and any number of spaces.
# I need to remove punctuation. This is tricky. If I do 'non-word
# characters', that removes things like ash and accents, e.g. in
# foreach (split(/\W*\s+\W*/o)) {
# So, remove punctuation, but only before or after space.
foreach (@paragraphs) {
tr/a-z/A-Z/; # uppercase for sorting
s/[\!\,\?\:\;\.\/\"\'\`\(\)\{\}\[\]\|\#\-\_\^\\\*]+\s/ /g;
s/\s[\!\,\?\:\;\.\/\"\'\`\(\)\{\}\[\]\|\#\-\_\^\\\*]+/ /g;
foreach $w (split (" ")) {
$freqs{$w}++;
$tokens++;
}
}
undef @paragraphs;
$types = keys(%freqs);
$ttr = $types / $tokens;
$ttr = sprintf("%.4f",$ttr);
$Result .= "\tText:\t$text_name\n";
$Result .= "\tDate:\t$date\n";
$Result .= "\tTokens:\t$tokens\n";
$Result .= "\tTypes:\t$types\n";
$Result .= "\tRatio:\t$ttr\n";
$Result .= "\tSort:\t$sort\n\n";
#if ($sort ne 'alpha') { # Not enough info to do this with alpha.
if ($sort eq 'descending') {
$Result .= "\t-------------------------------------------------------------------------------------------\n";
$Result .= "\tRANK \tFREQ \tCOVERAGE\t\t\tWORD \n";
$Result .= "\t \t \tindividual\tcumulative \n";
$Result .= "\t-------------------------------------------------------------------------------------------\n";
}
if ($sort eq 'alpha') {
foreach (sort keys (%freqs)) {
$Result .= sprintf ("\t%-20s %d\n", $_, $freqs{$_});
}
}
### MAIN SORT IS THIS ONE
### Descending frequency
elsif ($sort eq 'descending')
{
while (($word, $freq) = each %freqs) {
$perc = ($freq/$tokens)*100;
$perc = sprintf("%.4f",$perc);
push(@count_pairs,"$freq \t$word \t$perc%");
}
#Now sort by $freq, ie biggest freq at top
sub bynumber_d {($b <=> $a) || ($a cmp $b);}
foreach (sort bynumber_d @count_pairs){
# The following has to be AFTER above.
$counter++;
split ("\t"); #Get data back out
$cumperc = $cumperc + $_[2]; #This is the indiv percent
if ($cumperc>100){$cumperc=100;}
$cumperc = sprintf("%.4f",$cumperc);
if ($cumperc>100){$cumperc=100;}
#if (($counter eq 10) || ($counter eq 20) || ($counter eq 30)) {
if (($counter>9)&&("10 20 30 40 50 60 70 80 90 100" =~ $counter)) {
$Result .= "\t$counter.\t$_[0] \t$_[2] \t$cumperc% \t$_[1] \n";
} else {
$Result .= "\t$counter.\t$_[0] \t$_[2] \t$cumperc% \t$_[1] \n";
}
}
}
# Ascending frequency
elsif ($sort eq 'ascending')
{
while (($word, $freq) = each %freqs) {
$perc = ($freq/$tokens)*100;
$perc = sprintf("%.4f",$perc);
push(@count_pairs,"$freq\t$perc%\t$word");
}
sub bynumber_a {($a <=> $b) || ($a cmp $b);}
foreach (sort bynumber_a @count_pairs){
$Result .= "\t$_\n";
}
}
# @end_time = times;
# $Result .= sprintf("\nProcessing time: %.4f CPU seconds.\n", ($end_time[0] - $st_time[0]) + ($end_time[1] - $st_time[1]));
return $Result;
#=======================================================
sub LoadText {
if ($UD) {@paragraphs = "$TextFile\n";} # add newline in case none.
else {
if(!open (TEXT,$TextFile)) {
$Result = "Unable to open text file $TextFile.\n";
return (0);
}
#@paragraphs = "$TextFile\n";
@paragraphs = ;
close TEXT;
}
} #end of LoadText
} # end of freqs
1;