1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 import sys
28 import re
29 from os import path
30 import glob
31
32
# Number of top-ranked n-grams kept in a profile; also the penalty added
# when an n-gram is missing from the profile being compared against.
nb_ngrams = 400
# Splits text into words on runs of whitespace. Raw string: '\s' in a plain
# string literal is an invalid escape sequence on current Python versions.
white_space_re = re.compile(r'\s+')
35
36
38
40 if isinstance(arg, basestring):
41 self.addText(arg)
42 self.normalise()
43 elif isinstance(arg, dict):
44
45 self.ngrams = arg
46 else:
47 self.ngrams = dict()
48
49 - def addText(self, text):
50 if isinstance(text, str):
51 text = text.decode('utf-8')
52
53 ngrams = dict()
54
55 for word in white_space_re.split(text):
56 word = '_%s_' % word
57 size = len(word)
58 for i in xrange(size - 1):
59 for s in (1, 2, 3, 4):
60 end = i+s
61 if end >= size:
62 break
63 sub = word[i:end]
64
65 if not sub in ngrams:
66 ngrams[sub] = 0
67 ngrams[sub] += 1
68
69 self.ngrams = ngrams
70 return self
71
73 sorted = [(self.ngrams[k], k) for k in self.ngrams]
74 sorted.sort()
75 sorted.reverse()
76 sorted = sorted[:nb_ngrams]
77 return sorted
78
80 ngrams = {}
81 for count, (v, k) in enumerate(self.sorted_by_score()):
82 ngrams[k] = count
83
84 self.ngrams = ngrams
85 return self
86
88 self.ngrams[key] = value
89 return self
90
92 d = 0
93 ngrams = ngram.ngrams
94 for k in self.ngrams:
95 if k in ngrams:
96 d += abs(ngrams[k] - self.ngrams[k])
97 else:
98 d += nb_ngrams
99 return d
100
101
102
104
class NGram:
    """Language guesser backed by a folder of ranked n-gram model files."""

    # NOTE(review): the ``class`` and ``def`` lines are missing from the
    # reviewed text.  The class name and the ``folder``/``ext`` parameters
    # follow from the visible call sites and body; the ``'.lm'`` default is
    # assumed from the extension used when models are saved -- confirm.
    def __init__(self, folder, ext='.lm'):
        """Load every ``*<ext>`` file found in ``folder``.

        The language name is the file name without ``ext``.  Each line of a
        file contributes the text before its first tab as an n-gram, and
        the line index is stored as that n-gram's rank.  Files that are
        not valid UTF-8, or that yield no n-grams, are skipped.

        :param folder: directory holding the model files.
        :param ext: file extension of the model files (may be empty).
        :raises ValueError: if no usable language file was found.
        """
        self.ngrams = dict()
        pattern = path.join(folder, '*' + ext)
        ext_len = len(ext)

        for fname in glob.glob(path.normcase(pattern)):
            basename = path.split(fname)[-1]
            # Explicit end index: ``[:-0]`` would give '' for an empty ext.
            lang = basename[:len(basename) - ext_len]

            # Read bytes and decode explicitly so the behaviour is the
            # same on Python 2 and 3; the handle is always closed.
            with open(fname, 'rb') as model_file:
                raw = model_file.read()
            try:
                lines = raw.decode('utf-8').splitlines()
            except UnicodeDecodeError:
                continue

            ranks = {}
            for rank, line in enumerate(lines):
                ranks[line.split(u'\t')[0]] = rank

            if ranks:
                self.ngrams[lang] = _NGram(ranks)

        if not self.ngrams:
            raise ValueError("no language files found")

    def classify(self, text):
        """Return the name of the language whose profile is closest to text.

        :param text: the text to classify.
        :returns: the language name with the smallest distance (the first
            one met wins a tie), or ``''`` when even the best distance is
            above ``0.8 * nb_ngrams ** 2``.
        """
        candidate = _NGram(text)
        best_lang = ''
        best_distance = None

        for lang in self.ngrams:
            distance = self.ngrams[lang].compare(candidate)
            if best_distance is None or distance < best_distance:
                best_distance = distance
                best_lang = lang

        if best_distance is None or best_distance > 0.8 * (nb_ngrams ** 2):
            return ''
        return best_lang
149
150
152
def __init__(self, folder, ext='.txt'):
    """Build one n-gram profile per ``*<ext>`` file found in ``folder``.

    The language name is the file name without ``ext``.  Each file is
    read as UTF-8 and handed to ``addText`` in a single call, so that
    every line of the file contributes to the profile, which is then
    normalised and stored in ``self.ngrams`` under the language name.

    :param folder: directory holding the training texts.
    :param ext: file extension of the training texts (may be empty).
    """
    self.ngrams = dict()
    pattern = path.join(folder, '*' + ext)
    ext_len = len(ext)

    for fname in glob.glob(path.normcase(pattern)):
        basename = path.split(fname)[-1]
        # Explicit end index: ``[:-0]`` would give '' for an empty ext.
        lang = basename[:len(basename) - ext_len]

        # Decode once at the I/O boundary; ``with`` closes the handle
        # even if reading or decoding fails.
        with open(fname, 'rb') as training_file:
            text = training_file.read().decode('utf-8')

        profile = _NGram()
        profile.addText(text)
        profile.normalise()
        self.ngrams[lang] = profile
169
170 - def save(self, folder, ext='.lm'):
177
if __name__ == '__main__':
    import sys

    # Classify the first line read from standard input against the models
    # in the 'langmodels' data folder and print the guessed language.
    text = sys.stdin.readline()
    from translate.misc.file_discovery import get_abs_data_filename
    classifier = NGram(get_abs_data_filename('langmodels'))
    # Parenthesised single argument: valid on both Python 2 and Python 3.
    print(classifier.classify(text))
189