// Porter stemmer in JavaScript. Few comments, but it's easy to follow against the rules in the original
// paper, in
//
// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
// no. 3, pp 130-137,
//
// see also http://www.tartarus.org/~martin/PorterStemmer

// Release 1
// Derived from (http://tartarus.org/~martin/PorterStemmer/js.txt) - cjm (iizuu) Aug 24, 2009

/**
 * Porter-style suffix-stripping stemmer with a handful of hand-written
 * overrides applied after the regular steps.
 *
 * Usage: `stemmer(word)`.
 *
 * @param {string} w - The word to stem. The rules only match lowercase
 *   suffixes; words shorter than three characters are returned unchanged.
 * @returns {string} The stem of `w`.
 */
var stemmer = (function () {
  // Step 2 suffix -> replacement (applied when the remaining stem has m > 0).
  var step2list = {
    "ational": "ate",
    "tional": "tion",
    "enci": "ence",
    "anci": "ance",
    "izer": "ize",
    "bli": "ble",
    "alli": "al",
    "entli": "ent",
    "eli": "e",
    "ousli": "ous",
    "ization": "ize",
    "ation": "ate",
    "ator": "ate",
    "alism": "al",
    "iveness": "ive",
    "fulness": "ful",
    "ousness": "ous",
    "aliti": "al",
    "iviti": "ive",
    "biliti": "ble",
    "logi": "log"
  };

  // Step 3 suffix -> replacement (applied when the remaining stem has m > 0).
  var step3list = {
    "icate": "ic",
    "ative": "",
    "alize": "al",
    "iciti": "ic",
    "ical": "ic",
    "ful": "",
    "ness": ""
  };

  // Whole words whose result is fixed regardless of what the steps produce.
  var specialWords = {
    "skis": "ski",
    "skies": "sky",
    "dying": "die",
    "lying": "lie",
    "tying": "tie",
    "idly": "idl",
    "gently": "gentl",
    "ugly": "ugli",
    "early": "earli",
    "only": "onli",
    "singly": "singl"
  };

  // Whole words that are returned exactly as given.
  // Kept as a keyed table (not a space-joined string searched with indexOf)
  // so that only exact words match, never substrings such as "erring".
  var unchangedWords = {
    "sky": true,
    "news": true,
    "howe": true,
    "atlas": true,
    "cosmos": true,
    "bias": true,
    "andes": true,
    "inning": true,
    "outing": true,
    "canning": true,
    "herring": true,
    "earring": true,
    "proceed": true,
    "exceed": true,
    "succeed": true
  };

  var c = "[^aeiou]";          // consonant
  var v = "[aeiouy]";          // vowel
  var C = c + "[^aeiouy]*";    // consonant sequence
  var V = v + "[aeiou]*";      // vowel sequence

  // Measure / shape tests. None of these use the g or y flag, so sharing the
  // compiled objects across calls is safe (no lastIndex state).
  var mgr0 = new RegExp("^(" + C + ")?" + V + C);                      // [C]VC... is m>0
  var meq1 = new RegExp("^(" + C + ")?" + V + C + "(" + V + ")?$");    // [C]VC[V] is m=1
  var mgr1 = new RegExp("^(" + C + ")?" + V + C + V + C);              // [C]VCVC... is m>1
  var s_v = new RegExp("^(" + C + ")?" + v);                           // vowel in stem
  var cvcEnd = new RegExp("^" + C + v + "[^aeiouwxy]$");               // short C-v-c word
  var doubleConsonant = new RegExp("([^aeiouylsz])\\1$");
  var lastChar = /.$/;

  // Per-step suffix matchers.
  var re1aEs = /^(.+?)(ss|i)es$/;
  var re1aS = /^(.+?)([^s])s$/;
  var re1bEed = /^(.+?)eed$/;
  var re1bEdIng = /^(.+?)(ed|ing)$/;
  var re1bAddE = /(at|bl|iz)$/;
  var re1c = new RegExp("^(.+" + c + ")y$");
  var re2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
  var re3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
  var re4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
  var re4Ion = /^(.+?)(s|t)(ion)$/;
  var re5E = /^(.+?)e$/;
  var re5Ll = /ll$/;

  // Post-processing for words the steps reduce to "gener" / "commun".
  var reGenerate = /.*generate?s?d?(ing)?$/;
  var reGeneral = /.*general(ly)?$/;
  var reGeneric = /.*generic(ally)?$/;
  var reGenerous = /.*generous(ly)?$/;
  // Anchored at the end like the gener- rules; unanchored it also fired on
  // any longer word that merely contains "communit".
  var reCommunity = /communit(?:ies|y)$/;

  // Own-property test, so inherited keys ("constructor", "toString", ...)
  // are never mistaken for table entries.
  function has(table, key) {
    return Object.prototype.hasOwnProperty.call(table, key);
  }

  return function (w) {
    var origword = w;
    var firstch;
    var fp;
    var stem;

    if (w.length < 3) { return w; }

    // An initial "y" is upper-cased so the consonant/vowel classes above
    // do not treat it as a vowel; it is restored after step 5.
    firstch = w.charAt(0);
    if (firstch === "y") {
      w = "Y" + w.slice(1);
    }

    // Step 1a
    if (re1aEs.test(w)) {
      w = w.replace(re1aEs, "$1$2");
    } else if (re1aS.test(w)) {
      w = w.replace(re1aS, "$1$2");
    }

    // Step 1b
    fp = re1bEed.exec(w);
    if (fp) {
      if (mgr0.test(fp[1])) {
        w = w.replace(lastChar, "");
      }
    } else {
      fp = re1bEdIng.exec(w);
      if (fp) {
        stem = fp[1];
        if (s_v.test(stem)) {
          w = stem;
          if (re1bAddE.test(w)) {
            w = w + "e";
          } else if (doubleConsonant.test(w)) {
            w = w.replace(lastChar, "");
          } else if (cvcEnd.test(w)) {
            w = w + "e";
          }
        }
      }
    }

    // Step 1c
    fp = re1c.exec(w);
    if (fp) {
      w = fp[1] + "i";
    }

    // Step 2
    fp = re2.exec(w);
    if (fp && mgr0.test(fp[1])) {
      w = fp[1] + step2list[fp[2]];
    }

    // Step 3
    fp = re3.exec(w);
    if (fp && mgr0.test(fp[1])) {
      w = fp[1] + step3list[fp[2]];
    }

    // Step 4
    fp = re4.exec(w);
    if (fp) {
      if (mgr1.test(fp[1])) {
        w = fp[1];
      }
    } else {
      fp = re4Ion.exec(w);
      if (fp) {
        stem = fp[1] + fp[2];
        if (mgr1.test(stem)) {
          w = stem;
        }
      }
    }

    // Step 5
    fp = re5E.exec(w);
    if (fp) {
      stem = fp[1];
      if (mgr1.test(stem) || (meq1.test(stem) && !cvcEnd.test(stem))) {
        w = stem;
      }
    }

    if (re5Ll.test(w) && mgr1.test(w)) {
      w = w.replace(lastChar, "");
    }

    // and turn initial Y back to y
    if (firstch === "y") {
      w = "y" + w.slice(1);
    }

    // See http://snowball.tartarus.org/algorithms/english/stemmer.html
    // "Exceptional forms in general"
    if (has(specialWords, origword)) {
      w = specialWords[origword];
    }

    if (has(unchangedWords, origword)) {
      w = origword;
    }

    // Address words overstemmed as gener-
    if (reGenerate.test(origword)) {
      w = w + "at";
    }
    if (reGeneral.test(origword)) {
      w = w + "al";
    }
    if (reGeneric.test(origword)) {
      w = w + "ic";
    }
    if (reGenerous.test(origword)) {
      w = w + "ous";
    }

    // Address words overstemmed as commun-
    if (reCommunity.test(origword)) {
      w = w + "iti";
    }

    return w;
  };
})();