1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
|
#!/usr/bin/perl -w
# © Gertjan van Noord, 1997.
# mailto:vannoord@let.rug.nl
use strict;
# Option globals filled in by getopts() below.  `our` replaces the
# obsolete `use vars` pragma with the same package-global semantics.
our ($opt_d, $opt_f, $opt_h, $opt_i, $opt_l, $opt_n, $opt_s, $opt_t, $opt_v, $opt_u, $opt_a);
use Getopt::Std;
use Benchmark;
my $non_word_characters='0-9\s';  # character class: ngram/word separators
my @languages;   # languages (sorted by name)
my %ngram_for;   # map language x ngram => rang
# OPTIONS
getopts('a:d:f:hi:lnst:u:v');
# defaults: set $opt_X unless already defined (Perl Cookbook p. 6):
$opt_a ||= 10;
$opt_d ||= '/users1/vannoord/Perl/TextCat/LM';
$opt_f ||= 0;
$opt_t ||= 400;
$opt_u ||= 1.05;
$| = 1; # auto-flush stdout
# Print the usage/help text to stdout.  The heredoc interpolates $0 and
# the current option defaults ($opt_a, $opt_d, $opt_f, $opt_u), so this
# must only be called after the defaults have been assigned above.
sub help {
print <<HELP
Text Categorization. Typically used to determine the language of a
given document.
Usage
-----
* print help message:
$0 -h
* for guessing:
$0 [-a Int] [-d Dir] [-f Int] [-i N] [-l] [-t Int] [-u Int] [-v]
-a the program returns the best-scoring language together
with all languages which are $opt_u times worse (cf option -u).
If the number of languages to be printed is larger than the value
of this option (default: $opt_a) then no language is returned, but
instead a message that the input is of an unknown language is
printed. Default: $opt_a.
-d indicates in which directory the language models are
located (files ending in .lm). Currently only a single
directory is supported. Default: $opt_d.
-f Before sorting is performed the Ngrams which occur this number
of times or less are removed. This can be used to speed up
the program for longer inputs. For short inputs you should use
-f 0.
Default: $opt_f.
-i N only read first N lines
-l indicates that input is given as an argument on the command line,
e.g. text_cat -l "this is english text"
Cannot be used in combination with -n.
-s Determine language of each line of input. Not very efficient yet,
because language models are re-loaded after each line.
-t indicates the topmost number of ngrams that should be used.
If used in combination with -n this determines the size of the
output. If used with categorization this determines
the number of ngrams that are compared with each of the language
models (but each of those models is used completely).
-u determines how much worse result must be in order not to be
mentioned as an alternative. Typical value: 1.05 or 1.1.
Default: $opt_u.
-v verbose. Continuation messages are written to standard error.
* for creating new language model, based on text read from standard input:
$0 -n [-v]
-v verbose. Continuation messages are written to standard error.
HELP
}
# MAIN: choose the operation mode from the mutually exclusive flags.
if ($opt_h) { help(); exit 0; };
if ($opt_n) {
# -n: build a new language model from the input and print it as
# "ngram<TAB> count" lines, most frequent first
my %ngram=();
my @result = create_lm(input(),\%ngram);
print join("\n",map { "$_\t $ngram{$_}" ; } @result),"\n";
} elsif ($opt_l) {
# -l: classify the text supplied as the first command-line argument
classify($ARGV[0]);
} elsif ($opt_s) {
# -s: classify every input line separately (models are currently
# reloaded lazily only once; see classify())
while (<>) {
chomp;
classify($_);
}
} else {
# default: slurp all input and classify it as one document
classify(input());
}
# Read one language-model file.  Each line that starts with a non-space,
# non-digit character contributes one ngram; its rank is its position in
# the file (1 = most frequent).  Returns a hash ref: ngram => rank.
sub read_model {
  my ($file) = @_;
  # three-arg open with a lexical handle; the original used a global
  # bareword handle and two-arg open, which is unsafe for odd filenames
  open(my $lm, '<', $file) or die "cannot open $file: $!\n";
  my %ngram;
  my $rang = 1;
  while (<$lm>) {
    chomp;
    # only use lines starting with appropriate characters; others
    # (e.g. lines beginning with digits or whitespace) are ignored
    if (/^[^$non_word_characters]+/o) {
      $ngram{$&} = $rang++;   # $& = the leading ngram just matched
    }
  }
  close $lm;
  return \%ngram;
}
# Scan the model directory ($opt_d) for readable *.lm files, load each
# with read_model() into %ngram_for, and record the language names
# (filenames minus the .lm suffix), sorted, in @languages.
sub read_models {
  # open directory to find which languages are supported; lexical
  # dirhandle instead of the original global bareword DIR
  opendir(my $dir, $opt_d) or die "directory $opt_d: $!\n";
  # the s/// strips ".lm" from $_ in place, so the -r check re-appends it
  @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir($dir));
  closedir $dir;
  @languages or die "sorry, can't read any language models from $opt_d\n" .
    "language models must reside in files with .lm ending\n";
  foreach my $language (@languages) {
    $ngram_for{$language} = read_model("$opt_d/$language.lm");
  }
}
# CLASSIFICATION
# Guess the language(s) of $input: build its ngram list, score it
# against every language model (lower score = better match), and print
# the best language(s) to stdout.  Models are loaded lazily on first call.
sub classify {
  my ($input) = @_;
  my %results = ();
  my $maxp = $opt_t;   # penalty for an ngram absent from a model
  read_models() if !@languages;
  # create ngrams for input.  The optional hash argument of create_lm
  # is omitted here; the raw counts are only needed under -n.
  my @unknown = create_lm($input);
  my $t1 = Benchmark->new;   # direct method call, not indirect `new Benchmark`
  foreach my $language (@languages) {
    # compare the language model with the input ngram list: sum, over
    # all input ngrams, the rank distance (or $maxp when missing)
    my $ngram = $ngram_for{$language} or die "no ngrams for $language";
    my ($i, $p) = (0, 0);
    while ($i < @unknown) {
      if ($ngram->{$unknown[$i]}) {
        $p = $p + abs($ngram->{$unknown[$i]} - $i);
      } else {
        $p = $p + $maxp;
      }
      ++$i;
    }
    #print STDERR "$language: $p\n" if $opt_v;
    $results{$language} = $p;
  }
  # NOTE(review): this message times the comparison loop above, despite
  # its wording; the unbalanced "(" of the original is fixed (").\n",
  # consistent with the other verbose messages in this file)
  print STDERR "read language models done (" .
    timestr(timediff(Benchmark->new, $t1)) .
    ").\n" if $opt_v;
  my @results = sort { $results{$a} <=> $results{$b} } keys %results;
  print join("\n", map { "$_\t $results{$_}"; } @results), "\n" if $opt_v;
  # renamed from the original `my $a`, which masked sort's package
  # variable $a -- a classic Perl pitfall flagged by perlcritic
  my $best_score = $results{$results[0]};
  my @answers = (shift(@results));
  # also report every language scoring within a factor $opt_u of the best
  while (@results && $results{$results[0]} < ($opt_u * $best_score)) {
    push @answers, shift(@results);
  }
  if (@answers > $opt_a) {
    # too many plausible candidates (> $opt_a): refuse to guess
    print "I don't know; " .
      "Perhaps this is a language I haven't seen before?\n";
  } else {
    print join(" or ", @answers), "\n";
  }
}
# NOTE: the three lines below describe create_lm (further down), not
# input().  create_lm's second argument is a reference to a hash; that
# hash is filled with ngram counts, and a sorted list of at most $opt_t
# ngrams is returned.
# Read the text to be classified from <>.  With -i N, only the first N
# lines are consumed; otherwise the whole input is slurped at once.
# Dies with a friendly message if the input is empty.
sub input {
  my $read = "";
  if ($opt_i) {
    # read at most $opt_i lines ($. is the current input line number)
    while (<>) {
      if ($. == $opt_i) {
        return $read . $_;
      }
      $read = $read . $_;
    }
    return $read;
  } else {
    # slurp mode: `local $/` both saves the global and sets it to undef,
    # so the original's extra `undef $/;` was redundant
    local $/;
    $read = <>; # swallow input.
    # test with `length`, not plain truth: a file containing just "0"
    # is valid input but "0" is false in Perl
    defined $read && length $read
      or die "determining the language of an empty file is hard...\n";
    return $read;
  }
}
# Build an ngram count model from the text in the first argument.  The
# optional second argument is a hash ref that receives the raw counts
# (only really needed under -n); when omitted, autovivification through
# the undef lexical creates a fresh anonymous hash.  Returns the list of
# at most $opt_t ngrams, sorted by descending count.
sub create_lm {
  my $t1 = Benchmark->new;
  my $ngram;
  ($_, $ngram) = @_;   # $ngram contains reference to the hash we build
  # add the ngrams (length 1..5) of each word, with '_' marking the
  # word boundaries
  foreach my $word (split("[$non_word_characters]+")) {
    $word = "_" . $word . "_";
    my $len = length($word);
    my $flen = $len;
    # $len shrinks as $i approaches the end of the word, so no ngram
    # ever runs past the final character
    for (my $i = 0; $i < $flen; $i++) {
      $$ngram{substr($word,$i,5)}++ if $len > 4;
      $$ngram{substr($word,$i,4)}++ if $len > 3;
      $$ngram{substr($word,$i,3)}++ if $len > 2;
      $$ngram{substr($word,$i,2)}++ if $len > 1;
      $$ngram{substr($word,$i,1)}++;
      $len--;
    }
  }
  my $t2 = Benchmark->new;
  print STDERR "count_ngrams done (".
    timestr(timediff($t2, $t1)) .").\n" if $opt_v;
  # as suggested by Karel P. de Vos, k.vos@elsevier.nl, we speed up
  # sorting by removing ngrams occurring $opt_f times or less.  A plain
  # loop replaces the original map-in-void-context (perlcritic pitfall).
  # However, this gives very bad results for short inputs (default -f 0).
  foreach my $key (keys %$ngram) {
    delete $$ngram{$key} if $$ngram{$key} <= $opt_f;
  }
  # sort the ngrams and keep the $opt_t most frequent ones.  Adding
  # `or $a cmp $b' to the sort block would make the order unique but
  # slows sorting down about five times.
  my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
  splice(@sorted, $opt_t) if (@sorted > $opt_t);
  print STDERR "sorting done (" .
    timestr(timediff(Benchmark->new, $t2)) .
    ").\n" if $opt_v;
  return @sorted;
}
|