#!/usr/bin/perl -w # © Gertjan van Noord, 1997. # mailto:vannoord@let.rug.nl use strict; use vars qw($opt_d $opt_f $opt_h $opt_i $opt_l $opt_n $opt_s $opt_t $opt_v $opt_u $opt_a); use Getopt::Std; use Benchmark; my $non_word_characters='0-9\s'; my @languages; # languages (sorted by name) my %ngram_for; # map language x ngram => rang # OPTIONS getopts('a:d:f:hi:lnst:u:v'); # defaults: set $opt_X unless already defined (Perl Cookbook p. 6): $opt_a ||= 10; $opt_d ||= '/users1/vannoord/Perl/TextCat/LM'; $opt_f ||= 0; $opt_t ||= 400; $opt_u ||= 1.05; $| = 1; # auto-flush stdout sub help { print <) { chomp; classify($_); } } else { classify(input()); } sub read_model { my ($file) = @_; open(LM,"$file") or die "cannot open $file: $!\n"; my %ngram; my $rang = 1; while () { chomp; # only use lines starting with appropriate character. Others are # ignored. if (/^[^$non_word_characters]+/o) { $ngram{$&} = $rang++; } } return \%ngram; } sub read_models { # open directory to find which languages are supported opendir DIR, "$opt_d" or die "directory $opt_d: $!\n"; @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR)); closedir DIR; @languages or die "sorry, can't read any language models from $opt_d\n" . "language models must reside in files with .lm ending\n"; foreach my $language (@languages) { $ngram_for{$language} = read_model("$opt_d/$language.lm"); } } # CLASSIFICATION sub classify { my ($input)=@_; my %results=(); my $maxp = $opt_t; read_models() if !@languages; # create ngrams for input. Note that hash %unknown is not used; # it contains the actual counts which are only used under -n: creating # new language model (and even then they are not really required). my @unknown=create_lm($input); my $t1 = new Benchmark; foreach my $language (@languages) { # compares the language model with input ngrams list my $ngram = $ngram_for{$language} or die "no ngrams for $language"; my ($i,$p)=(0,0); while ($i < @unknown) { if ($ngram->{$unknown[$i]}) { $p=$p+abs($ngram->{$unknown[$i]}-$i); } else { $p=$p+$maxp; } ++$i; } #print STDERR "$language: $p\n" if $opt_v; $results{$language} = $p; } print STDERR "read language models done (" . timestr(timediff(new Benchmark, $t1)) . ".\n" if $opt_v; my @results = sort { $results{$a} <=> $results{$b} } keys %results; print join("\n",map { "$_\t $results{$_}"; } @results),"\n" if $opt_v; my $a = $results{$results[0]}; my @answers=(shift(@results)); while (@results && $results{$results[0]} < ($opt_u *$a)) { @answers=(@answers,shift(@results)); } if (@answers > $opt_a) { print "I don't know; " . "Perhaps this is a language I haven't seen before?\n"; } else { print join(" or ", @answers), "\n"; } } # first and only argument is reference to hash. # this hash is filled, and a sorted list (opt_n elements) # is returned. sub input { my $read=""; if ($opt_i) { while(<>) { if ($. == $opt_i) { return $read . $_; } $read = $read . $_; } return $read; } else { local $/; # so it doesn't affect $/ elsewhere undef $/; $read = <>; # swallow input. $read || die "determining the language of an empty file is hard...\n"; return $read; } } sub create_lm { my $t1 = new Benchmark; my $ngram; ($_,$ngram) = @_; #$ngram contains reference to the hash we build # then add the ngrams found in each word in the hash my $word; foreach $word (split("[$non_word_characters]+")) { $word = "_" . $word . "_"; my $len = length($word); my $flen=$len; my $i; for ($i=0;$i<$flen;$i++) { $$ngram{substr($word,$i,5)}++ if $len > 4; $$ngram{substr($word,$i,4)}++ if $len > 3; $$ngram{substr($word,$i,3)}++ if $len > 2; $$ngram{substr($word,$i,2)}++ if $len > 1; $$ngram{substr($word,$i,1)}++; $len--; } } ###print "@{[%$ngram]}"; my $t2 = new Benchmark; print STDERR "count_ngrams done (". timestr(timediff($t2, $t1)) .").\n" if $opt_v; # as suggested by Karel P. de Vos, k.vos@elsevier.nl, we speed up # sorting by removing singletons map { my $key=$_; if ($$ngram{$key} <= $opt_f) { delete $$ngram{$key}; }; } keys %$ngram; #however I have very bad results for short inputs, this way # sort the ngrams, and spit out the $opt_t frequent ones. # adding `or $a cmp $b' in the sort block makes sorting five # times slower..., although it would be somewhat nicer (unique result) my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram; splice(@sorted,$opt_t) if (@sorted > $opt_t); print STDERR "sorting done (" . timestr(timediff(new Benchmark, $t2)) . ").\n" if $opt_v; return @sorted; }