source: stylesheets/lfs-xsl/docbook-xsl-1.78.1/webhelp/docs/search/stemmers/en_stemmer.js@ b1a51ac1

7.5-systemd 7.6-systemd 7.7-systemd 7.8-systemd 7.9-systemd
Last change on this file since b1a51ac1 was b1a51ac1, checked in by Krejzi <krejzi@…>, 11 years ago

Import new branch

git-svn-id: http://svn.linuxfromscratch.org/LFS/branches/systemd/BOOK@10389 4aa44e1e-78dd-0310-a6d2-fbcd4c07a689

  • Property mode set to 100644
File size: 5.2 KB
Line 
// Porter stemmer in JavaScript. Few comments, but it's easy to follow against the rules in the original
// paper, in
//
// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
// no. 3, pp 130-137,
//
// see also http://www.tartarus.org/~martin/PorterStemmer

// Release 1
// Derived from (http://tartarus.org/~martin/PorterStemmer/js.txt) - cjm (iizuu) Aug 24, 2009
11
/**
 * English stemmer: the Porter (1980) suffix-stripping steps 1a-5, followed by
 * a handful of post-processing overrides (exceptional forms, words that must
 * stay unstemmed, and the "gener-" / "commun-" families).
 *
 * All tables and regular expressions are built once inside the closure; none
 * of the regular expressions carries the `g` flag, so sharing them between
 * calls is safe.
 *
 * @param {string} w - Word to stem. The rules only list lowercase suffixes.
 * @returns {string} The stem. Words shorter than three characters are
 *     returned unchanged.
 */
var stemmer = (function () {
    // Step 2: suffix -> replacement (applied when the stem has m > 0).
    var step2list = {
        "ational" : "ate",
        "tional" : "tion",
        "enci" : "ence",
        "anci" : "ance",
        "izer" : "ize",
        "bli" : "ble",
        "alli" : "al",
        "entli" : "ent",
        "eli" : "e",
        "ousli" : "ous",
        "ization" : "ize",
        "ation" : "ate",
        "ator" : "ate",
        "alism" : "al",
        "iveness" : "ive",
        "fulness" : "ful",
        "ousness" : "ous",
        "aliti" : "al",
        "iviti" : "ive",
        "biliti" : "ble",
        "logi" : "log"
    };

    // Step 3: suffix -> replacement (applied when the stem has m > 0).
    var step3list = {
        "icate" : "ic",
        "ative" : "",
        "alize" : "al",
        "iciti" : "ic",
        "ical" : "ic",
        "ful" : "",
        "ness" : ""
    };

    // See http://snowball.tartarus.org/algorithms/english/stemmer.html
    // "Exceptional forms in general": original word -> forced stem.
    var specialWords = {
        "skis" : "ski",
        "skies" : "sky",
        "dying" : "die",
        "lying" : "lie",
        "tying" : "tie",
        "idly" : "idl",
        "gently" : "gentl",
        "ugly" : "ugli",
        "early" : "earli",
        "only" : "onli",
        "singly" : "singl"
    };

    // Words that are returned exactly as given. Kept as a lookup table so
    // that only whole words match (a substring search over a joined string
    // would also hit fragments such as "owe" inside "howe").
    var unstemmedWords = {
        "sky" : true,
        "news" : true,
        "howe" : true,
        "atlas" : true,
        "cosmos" : true,
        "bias" : true,
        "andes" : true,
        "inning" : true,
        "outing" : true,
        "canning" : true,
        "herring" : true,
        "earring" : true,
        "proceed" : true,
        "exceed" : true,
        "succeed" : true
    };

    // Own-property test: a plain `table[word]` lookup would also find
    // inherited members such as "constructor" and return a non-string.
    var hasOwn = Object.prototype.hasOwnProperty;

    var c = "[^aeiou]";          // consonant
    var v = "[aeiouy]";          // vowel
    var C = c + "[^aeiouy]*";    // consonant sequence
    var V = v + "[aeiou]*";      // vowel sequence

    var mgr0 = new RegExp("^(" + C + ")?" + V + C);                     // [C]VC... is m>0
    var meq1 = new RegExp("^(" + C + ")?" + V + C + "(" + V + ")?$");   // [C]VC[V] is m=1
    var mgr1 = new RegExp("^(" + C + ")?" + V + C + V + C);             // [C]VCVC... is m>1
    var vowelInStem = new RegExp("^(" + C + ")?" + v);                  // vowel in stem
    var endsCvc = new RegExp("^" + C + v + "[^aeiouwxy]$");             // consonant seq + vowel + consonant (not w, x, y)
    var endsDoubleConsonant = /([^aeiouylsz])\1$/;                      // doubled letter other than a vowel, y, l, s, z

    var step1aEs = /^(.+?)(ss|i)es$/;
    var step1aS = /^(.+?)([^s])s$/;
    var step1bEed = /^(.+?)eed$/;
    var step1bEdIng = /^(.+?)(ed|ing)$/;
    var step1bRestoreE = /(at|bl|iz)$/;
    var step1cY = new RegExp("^(.+" + c + ")y$");
    var step2Suffix = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
    var step3Suffix = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
    var step4Suffix = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
    var step4Ion = /^(.+?)(s|t)(ion)$/;
    var step5E = /^(.+?)e$/;
    var step5Ll = /ll$/;

    // Post-processing rules, all tested against the ORIGINAL word.
    var generate = /.*generate?s?d?(ing)?$/;
    var general = /.*general(ly)?$/;
    var generic = /.*generic(ally)?$/;
    var generous = /.*generous(ly)?$/;
    // Anchored like the four rules above; without `$` the optional groups are
    // meaningless and every word merely containing "communit" would match.
    var community = /.*communit(ies)?y?$/;

    return function (w) {
        var origword = w;
        var firstch;
        var match;
        var stem;

        if (w.length < 3) { return w; }

        // An initial "y" is upper-cased so the vowel class [aeiouy] does not
        // match it while the steps run; it is lowered again after step 5.
        firstch = w.substr(0, 1);
        if (firstch === "y") {
            w = firstch.toUpperCase() + w.substr(1);
        }

        // Step 1a
        if (step1aEs.test(w)) {
            w = w.replace(step1aEs, "$1$2");
        } else if (step1aS.test(w)) {
            w = w.replace(step1aS, "$1$2");
        }

        // Step 1b
        match = step1bEed.exec(w);
        if (match) {
            // "-eed" matched: the "-ed/-ing" rule is deliberately not tried,
            // even when the m>0 condition fails.
            if (mgr0.test(match[1])) {
                w = w.slice(0, -1);
            }
        } else {
            match = step1bEdIng.exec(w);
            if (match && vowelInStem.test(match[1])) {
                w = match[1];
                if (step1bRestoreE.test(w)) {
                    w = w + "e";
                } else if (endsDoubleConsonant.test(w)) {
                    w = w.slice(0, -1);
                } else if (endsCvc.test(w)) {
                    w = w + "e";
                }
            }
        }

        // Step 1c
        match = step1cY.exec(w);
        if (match) {
            w = match[1] + "i";
        }

        // Step 2
        match = step2Suffix.exec(w);
        if (match && mgr0.test(match[1])) {
            w = match[1] + step2list[match[2]];
        }

        // Step 3
        match = step3Suffix.exec(w);
        if (match && mgr0.test(match[1])) {
            w = match[1] + step3list[match[2]];
        }

        // Step 4
        match = step4Suffix.exec(w);
        if (match) {
            // As in step 1b, a suffix match here suppresses the "-ion" rule
            // even if the m>1 condition fails.
            if (mgr1.test(match[1])) {
                w = match[1];
            }
        } else {
            match = step4Ion.exec(w);
            if (match) {
                stem = match[1] + match[2];
                if (mgr1.test(stem)) {
                    w = stem;
                }
            }
        }

        // Step 5
        match = step5E.exec(w);
        if (match) {
            stem = match[1];
            if (mgr1.test(stem) || (meq1.test(stem) && !endsCvc.test(stem))) {
                w = stem;
            }
        }

        if (step5Ll.test(w) && mgr1.test(w)) {
            w = w.slice(0, -1);
        }

        // and turn initial Y back to y
        if (firstch === "y") {
            w = firstch.toLowerCase() + w.substr(1);
        }

        // Exceptional forms override whatever the steps produced.
        if (hasOwn.call(specialWords, origword)) {
            w = specialWords[origword];
        }

        // Whole-word exceptions are returned untouched.
        if (hasOwn.call(unstemmedWords, origword)) {
            w = origword;
        }

        // Address words overstemmed as gener-
        if (generate.test(origword)) {
            w = w + "at";
        }
        if (general.test(origword)) {
            w = w + "al";
        }
        if (generic.test(origword)) {
            w = w + "ic";
        }
        if (generous.test(origword)) {
            w = w + "ous";
        }

        // Address words overstemmed as commun-
        if (community.test(origword)) {
            w = w + "iti";
        }

        return w;
    };
})();
Note: See TracBrowser for help on using the repository browser.