# freqs.pl v 0.2b
# By Catherine N. Ball, based on obvious code in
# Wall & Schwartz (1992:38) _Programming Perl_
sub make_frequency_index {
#package CNBfreqs;
@st_time = times;
# Initialization
$success = 0;
$Result = "";
($UD, $TextFile, $text_name, $sort) = @_;
$/ = ''; # line terminator set to null (slurp paragraphs)
$* = 1; # enable multi-line patterns (allow newlines in string)
$OK = &LoadText; # load the text: results in $text
if(!$OK) { return (0, $Result); }
($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
$mon = $mon+1;
$year=$year+1900;
$date = "$mon/$mday/$year $hour\:$min";
$tokens = 0;
$ttr = 0;
undef %freqs;
# splitting on " " actually handles tabs, newlines, and any number of spaces.
# I need to remove punctuation. This is tricky. If I do 'non-word
# characters', that removes things like ash and accents, e.g. in
# foreach (split(/\W*\s+\W*/o)) {
# So, remove punctuation, but only before or after space.
foreach (@paragraphs) {
tr/a-z/A-Z/; # uppercase for sorting
s/[\!\,\?\:\;\.\/\"\'\`\(\)\{\}\[\]\|\#\-\_\^\\\*]+\s/ /g;
s/\s[\!\,\?\:\;\.\/\"\'\`\(\)\{\}\[\]\|\#\-\_\^\\\*]+/ /g;
foreach $w (split (" ")) {
$freqs{$w}++;
$tokens++;
}
}
undef @paragraphs;
$types = keys(%freqs);
$ttr = $types / $tokens;
$ttr = sprintf("%.2f",$ttr);
$Result .= "\tText name:\t$text_name\n";
$Result .= "\tDate/time:\t$date\n";
$Result .= "\tTokens:\t$tokens\n";
$Result .= "\tTypes:\t$types\n";
$Result .= "\tType-Token:\t$ttr\n";
$Result .= "\tSort order:\t$sort\n";
$Result .= "\n";
if ($sort eq 'alpha') {
foreach (sort keys (%freqs)) {
$Result .= sprintf ("%-20s %d\n", $_, $freqs{$_});
}
}
# Descending frequency
elsif ($sort eq 'descending')
{
while (($word, $freq) = each %freqs) {
push(@count_pairs,"$freq\t$word");
}
sub bynumber_d {($b <=> $a) || ($a cmp $b);}
foreach (sort bynumber_d @count_pairs){
$Result .= "$_\n";
}
}
# Ascending frequency
elsif ($sort eq 'ascending')
{
while (($word, $freq) = each %freqs) {
push(@count_pairs,"$freq\t$word");
}
sub bynumber_a {($a <=> $b) || ($a cmp $b);}
foreach (sort bynumber_a @count_pairs){
$Result .= "$_\n";
}
}
@end_time = times;
$Result .= sprintf("\nProcessing time: %.2f CPU seconds.\n", ($end_time[0] - $st_time[0]) + ($end_time[1] - $st_time[1]));
return $Result;
#=======================================================
sub LoadText {
if ($UD) {@paragraphs = "$TextFile\n";} # add newline in case none.
else {
if(!open (TEXT,$TextFile)) {
$Result = "Unable to open text file $TextFile.\n";
return (0);
}
@paragraphs = ;
close TEXT;
}
} #end of LoadText
} # end of freqs
1;