# freqs.pl v 0.2b # By Catherine N. Ball, based on obvious code in # Wall & Schwartz (1992:38) _Programming Perl_ sub make_frequency_index { #package CNBfreqs; @st_time = times; # Initialization $success = 0; $Result = ""; ($UD, $TextFile, $text_name, $sort) = @_; $/ = ''; # line terminator set to null (slurp paragraphs) $* = 1; # enable multi-line patterns (allow newlines in string) $OK = &LoadText; # load the text: results in $text if(!$OK) { return (0, $Result); } ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); $mon = $mon+1; $year=$year+1900; $date = "$mon/$mday/$year $hour\:$min"; $tokens = 0; $ttr = 0; undef %freqs; # splitting on " " actually handles tabs, newlines, and any number of spaces. # I need to remove punctuation. This is tricky. If I do 'non-word # characters', that removes things like ash and accents, e.g. in # foreach (split(/\W*\s+\W*/o)) { # So, remove punctuation, but only before or after space. foreach (@paragraphs) { tr/a-z/A-Z/; # uppercase for sorting s/[\!\,\?\:\;\.\/\"\'\`\(\)\{\}\[\]\|\#\-\_\^\\\*]+\s/ /g; s/\s[\!\,\?\:\;\.\/\"\'\`\(\)\{\}\[\]\|\#\-\_\^\\\*]+/ /g; foreach $w (split (" ")) { $freqs{$w}++; $tokens++; } } undef @paragraphs; $types = keys(%freqs); $ttr = $types / $tokens; $ttr = sprintf("%.2f",$ttr); $Result .= "\tText name:\t$text_name\n"; $Result .= "\tDate/time:\t$date\n"; $Result .= "\tTokens:\t$tokens\n"; $Result .= "\tTypes:\t$types\n"; $Result .= "\tType-Token:\t$ttr\n"; $Result .= "\tSort order:\t$sort\n"; $Result .= "\n"; if ($sort eq 'alpha') { foreach (sort keys (%freqs)) { $Result .= sprintf ("%-20s %d\n", $_, $freqs{$_}); } } # Descending frequency elsif ($sort eq 'descending') { while (($word, $freq) = each %freqs) { push(@count_pairs,"$freq\t$word"); } sub bynumber_d {($b <=> $a) || ($a cmp $b);} foreach (sort bynumber_d @count_pairs){ $Result .= "$_\n"; } } # Ascending frequency elsif ($sort eq 'ascending') { while (($word, $freq) = each %freqs) { push(@count_pairs,"$freq\t$word"); } sub bynumber_a {($a <=> $b) || ($a cmp $b);} foreach (sort bynumber_a @count_pairs){ $Result .= "$_\n"; } } @end_time = times; $Result .= sprintf("\nProcessing time: %.2f CPU seconds.\n", ($end_time[0] - $st_time[0]) + ($end_time[1] - $st_time[1])); return $Result; #======================================================= sub LoadText { if ($UD) {@paragraphs = "$TextFile\n";} # add newline in case none. else { if(!open (TEXT,$TextFile)) { $Result = "Unable to open text file $TextFile.\n"; return (0); } @paragraphs = ; close TEXT; } } #end of LoadText } # end of freqs 1;