use strict;
use warnings;

# _process_text_normalize($text)
#
# Shared implementation behind process_text() and xx_process_text().
# Rewrites a chunk of raw text so that it can be split into plain word
# tokens: selected hyphenated compounds are joined, contractions are
# expanded, most punctuation is padded or blanked, and every number is
# replaced by the literal token "number".
#
# Parameters:
#   $text - the raw text; undef is treated as the empty string.
# Returns:
#   the rewritten text as a new string (the argument is not modified).
#
# The order of the steps matters; see the notes on individual steps.
sub _process_text_normalize {
    my ($text) = @_;
    $text = '' unless defined $text;

    # Hyphenated forms that are joined into one word BEFORE the generic
    # "hyphen becomes space" step below would split them.  Matching is
    # case-insensitive and the replacement is always the lower-case joined
    # form (e.g. "North-East" -> "northeast").
    my @compounds = qw(
        north-east north-west mother-in-law father-in-law build-up
        non-conformist non-conformists non-conformity
        co-operate co-operated co-operates co-operation
        co-operative co-operatively
        co-ordinate co-ordinated co-ordinates co-ordinating
        co-ordination co-ordinator co-ordinators
        over-estimate over-estimates over-estimated over-estimation
        re-evaluate re-evaluated re-evaluates re-evaluating re-evaluation
        under-resourced non-traditional so-called
    );
    # First row above: "1k hyphens"; remaining rows: "AWL hyphens (none in
    # 2K)" -- labels carried over from the original comments.
    for my $compound (@compounds) {
        (my $joined = $compound) =~ tr/-//d;
        $text =~ s/\Q$compound\E/$joined/ig;
    }

    # Whitespace and bracket clean-up.
    # NOTE(review): tabs are deleted, not turned into spaces, so words
    # separated only by a tab end up glued together.  Kept as in the
    # original; confirm whether that is intended.
    $text =~ s/\t//g;
    $text =~ s/\n/ /g;          # line ends become spaces
    $text =~ s{[()/]}{ }g;      # "(", ")" and "/" (read/discuss) become spaces

    # Unify apostrophes so the contraction rules below see a plain "'".
    # The curly apostrophe is matched both as its UTF-8 byte sequence and
    # as the decoded character U+2019, so byte strings and character
    # strings are both handled.
    $text =~ tr/`/'/;
    $text =~ s/(?:\xE2\x80\x99|\x{2019})/'/g;

    # Contractions.  These must run before possessive "'s" is stripped.
    $text =~ s/at's/at is/ig;       # what's, that's
    $text =~ s/he's/he is/ig;       # he's -- and she's, since it contains "he's"
    $text =~ s/it's/it is/ig;
    $text =~ s/I'm/I am/ig;
    $text =~ s/'re/ are/ig;
    $text =~ s/'d / would /ig;
    $text =~ s/'ve/ have/ig;        # I've, you've, ...

    $text =~ s/!/ /g;               # otherwise "know!" is not recognised as "know"
    $text =~ s/'ll / will /ig;
    $text =~ s/,/, /g;              # in case there is no space after a comma

    # In case there is no space after a period.  A period with a digit on
    # BOTH sides is left alone so that decimals such as "3.14" survive
    # until the number step, where they collapse to a single "number".
    $text =~ s/(?<!\d)\.|\.(?!\d)/. /g;

    $text =~ s/'s//ig;              # possessives, AFTER he's / it's etc.
    $text =~ tr/\-_/ /;             # remaining hyphens and underscores become spaces

    # can't / won't need their own rules (run before the generic n't rule),
    # otherwise the n't rule would leave "ca" / "wo" behind.  They are
    # case-insensitive like every other contraction rule, so "Can't" works.
    $text =~ s/can't/can not /ig;
    $text =~ s/won't/will not /ig;
    $text =~ s/n't/ not/ig;         # isn't => is not
    $text =~ s/'/ '/g;              # any apostrophe left starts a new token

    # Numbers.  A whole run of digits, including decimal parts, becomes one
    # "number" token.  (The original pattern was the character class
    # [\d+], which matched one digit at a time and also a literal "+".)
    $text =~ s/\d+(?:\.\d+)*/number/g;
    # Ordinal suffixes: 7th, 2nd, 3rd, 1st.  The \b keeps "5stars" intact.
    $text =~ s/number(?:th|nd|rd|st)\b/number/g;
    # Collapse adjacent "number" tokens, e.g. from "number5" in the input.
    $text =~ s/(?:number){2,}/number/g;

    return $text;
}

# process_text($text)
#
# Normalises $text for word-by-word processing (see
# _process_text_normalize for the individual rewriting steps) and returns
# the result.
#
# Side effect: the result is also stored in the package variable $text3.
# The original implementation worked directly on that global; the
# assignment is kept in case other code reads it (not visible from here).
sub process_text {
    my ($raw) = @_;
    our $text3 = _process_text_normalize($raw);
    return $text3;
}

# xx_process_text($text)
#
# Legacy twin of process_text().  Its original body was a statement-for-
# statement copy that worked on the package variable $text2 and had no
# return statement, so its result was only available through that global.
# It now shares the same implementation, still sets $text2, and also
# returns the normalised text.
sub xx_process_text {
    my ($raw) = @_;
    our $text2 = _process_text_normalize($raw);
    return $text2;
}

1;