# freqs.pl v 0.2b # By Catherine N. Ball, based on obvious code in # Wall & Schwartz (1992:38) _Programming Perl_ sub make_frequency_index { #package CNBfreqs; @st_time = times; # Initialization $success = 0; $Result = ""; ($UD, $TextFile, $text_name, $sort) = @_; #($TextFile, $text_name, $sort) = @_; $/ = ''; # line terminator set to null (slurp paragraphs) $* = 1; # enable multi-line patterns (allow newlines in string) $OK = &LoadText; # load the text: results in $text if(!$OK) { return (0, $Result); } ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); $mon = $mon+1; $year=$year+1900; $date = "$mon/$mday/$year $hour\:$min"; $tokens = 0; $ttr = 0; $counter=0; $cumperc=0; undef %freqs; # splitting on " " actually handles tabs, newlines, and any number of spaces. # I need to remove punctuation. This is tricky. If I do 'non-word # characters', that removes things like ash and accents, e.g. in # foreach (split(/\W*\s+\W*/o)) { # So, remove punctuation, but only before or after space. foreach (@paragraphs) { tr/a-z/A-Z/; # uppercase for sorting s/[\!\,\?\:\;\.\/\"\'\`\(\)\{\}\[\]\|\#\-\_\^\\\*]+\s/ /g; s/\s[\!\,\?\:\;\.\/\"\'\`\(\)\{\}\[\]\|\#\-\_\^\\\*]+/ /g; foreach $w (split (" ")) { $freqs{$w}++; $tokens++; } } undef @paragraphs; $types = keys(%freqs); $ttr = $types / $tokens; $ttr = sprintf("%.4f",$ttr); $Result .= "\tText:\t$text_name\n"; $Result .= "\tDate:\t$date\n"; $Result .= "\tTokens:\t$tokens\n"; $Result .= "\tTypes:\t$types\n"; $Result .= "\tRatio:\t$ttr\n"; $Result .= "\tSort:\t$sort\n\n"; #if ($sort ne 'alpha') { # Not enough info to do this with alpha. if ($sort eq 'descending') { $Result .= "\t-------------------------------------------------------------------------------------------\n"; $Result .= "\tRANK \tFREQ \tCOVERAGE\t\t\tWORD \n"; $Result .= "\t \t \tindividual\tcumulative \n"; $Result .= "\t-------------------------------------------------------------------------------------------\n"; } if ($sort eq 'alpha') { foreach (sort keys (%freqs)) { $Result .= sprintf ("\t%-20s %d\n", $_, $freqs{$_}); } } ### MAIN SORT IS THIS ONE ### Descending frequency elsif ($sort eq 'descending') { while (($word, $freq) = each %freqs) { $perc = ($freq/$tokens)*100; $perc = sprintf("%.4f",$perc); push(@count_pairs,"$freq \t$word \t$perc%"); } #Now sort by $freq, ie biggest freq at top sub bynumber_d {($b <=> $a) || ($a cmp $b);} foreach (sort bynumber_d @count_pairs){ # The following has to be AFTER above. $counter++; split ("\t"); #Get data back out $cumperc = $cumperc + $_[2]; #This is the indiv percent if ($cumperc>100){$cumperc=100;} $cumperc = sprintf("%.4f",$cumperc); if ($cumperc>100){$cumperc=100;} #if (($counter eq 10) || ($counter eq 20) || ($counter eq 30)) { if (($counter>9)&&("10 20 30 40 50 60 70 80 90 100" =~ $counter)) { $Result .= "\t$counter.\t$_[0] \t$_[2] \t$cumperc% \t$_[1] \n"; } else { $Result .= "\t$counter.\t$_[0] \t$_[2] \t$cumperc% \t$_[1] \n"; } } } # Ascending frequency elsif ($sort eq 'ascending') { while (($word, $freq) = each %freqs) { $perc = ($freq/$tokens)*100; $perc = sprintf("%.4f",$perc); push(@count_pairs,"$freq\t$perc%\t$word"); } sub bynumber_a {($a <=> $b) || ($a cmp $b);} foreach (sort bynumber_a @count_pairs){ $Result .= "\t$_\n"; } } # @end_time = times; # $Result .= sprintf("\nProcessing time: %.4f CPU seconds.\n", ($end_time[0] - $st_time[0]) + ($end_time[1] - $st_time[1])); return $Result; #======================================================= sub LoadText { if ($UD) {@paragraphs = "$TextFile\n";} # add newline in case none. else { if(!open (TEXT,$TextFile)) { $Result = "Unable to open text file $TextFile.\n"; return (0); } #@paragraphs = "$TextFile\n"; @paragraphs = ; close TEXT; } } #end of LoadText } # end of freqs 1;