1 | /*
|
---|
2 | * Author: Kasun Gajasinghe
|
---|
3 | * E-Mail: kasunbg AT gmail DOT com
|
---|
4 | * Date: 09.08.2010
|
---|
5 | *
|
---|
6 | * usage: stemmer(word);
|
---|
7 | * ex: var stem = stemmer(foobar);
|
---|
8 | * Implementation of the stemming algorithm from http://snowball.tartarus.org/algorithms/french/stemmer.html
|
---|
9 | *
|
---|
10 | * LICENSE:
|
---|
11 | *
|
---|
12 | * Copyright (c) 2010, Kasun Gajasinghe. All rights reserved.
|
---|
13 | *
|
---|
14 | * Redistribution and use in source and binary forms, with or without modification,
|
---|
15 | * are permitted provided that the following conditions are met:
|
---|
16 | *
|
---|
17 | * 1. Redistributions of source code must retain the above copyright notice,
|
---|
18 | * this list of conditions and the following disclaimer.
|
---|
19 | *
|
---|
20 | * 2. Redistributions in binary form must reproduce the above copyright notice,
|
---|
21 | * this list of conditions and the following disclaimer in the documentation
|
---|
22 | * and/or other materials provided with the distribution.
|
---|
23 | *
|
---|
24 | *
|
---|
25 | * THIS SOFTWARE IS PROVIDED BY KASUN GAJASINGHE ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
---|
26 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
---|
27 | * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL KASUN GAJASINGHE BE LIABLE FOR ANY DIRECT,
|
---|
28 | * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
---|
29 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
---|
30 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
---|
31 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
---|
32 | * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
---|
33 | *
|
---|
34 | */
|
---|
35 |
|
---|
36 | var stemmer = function(word){
|
---|
37 | // Letters in French include the following accented forms,
|
---|
38 | // â à ç ë é ê è ï î ô û ù
|
---|
39 | // The following letters are vowels:
|
---|
40 | // a e i o u y â à ë é ê è ï î ô û ù
|
---|
41 |
|
---|
42 | word = word.toLowerCase();
|
---|
43 | var oriWord = word;
|
---|
44 | word = word.replace(/qu/g, 'qU'); //have to perform first, as after the operation, capital U is not treated as a vowel
|
---|
45 | word = word.replace(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/g, '$1U$2');
|
---|
46 | word = word.replace(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/g, '$1I$2');
|
---|
47 | word = word.replace(/([aeiouyâàëéêèïîôûù])y/g, '$1Y');
|
---|
48 | word = word.replace(/y([aeiouyâàëéêèïîôûù])/g, 'Y$1');
|
---|
49 |
|
---|
50 | var rv='';
|
---|
51 | var rvIndex = -1;
|
---|
52 | if(word.search(/^(par|col|tap)/) != -1 || word.search(/^[aeiouyâàëéêèïîôûù]{2}/) != -1){
|
---|
53 | rv = word.substring(3);
|
---|
54 | rvIndex = 3;
|
---|
55 | } else {
|
---|
56 | rvIndex = word.substring(1).search(/[aeiouyâàëéêèïîôûù]/);
|
---|
57 | if(rvIndex != -1){
|
---|
58 | rvIndex +=2; //+2 is to supplement the substring(1) used to find rvIndex
|
---|
59 | rv = word.substring(rvIndex);
|
---|
60 | } else {
|
---|
61 | rvIndex = word.length;
|
---|
62 | }
|
---|
63 | }
|
---|
64 |
|
---|
65 | // R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
|
---|
66 | // R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel
|
---|
67 | var r1Index = word.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
|
---|
68 | var r1 = '';
|
---|
69 | if (r1Index != -1) {
|
---|
70 | r1Index += 2;
|
---|
71 | r1 = word.substring(r1Index);
|
---|
72 | } else {
|
---|
73 | r1Index = word.length;
|
---|
74 | }
|
---|
75 |
|
---|
76 | var r2Index = -1;
|
---|
77 | var r2 = '';
|
---|
78 | if (r1Index != -1) {
|
---|
79 | r2Index = r1.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
|
---|
80 | if (r2Index != -1) {
|
---|
81 | r2Index += 2;
|
---|
82 | r2 = r1.substring(r2Index);
|
---|
83 | r2Index += r1Index;
|
---|
84 | } else {
|
---|
85 | r2 = '';
|
---|
86 | r2Index = word.length;
|
---|
87 | }
|
---|
88 | }
|
---|
89 | if (r1Index != -1 && r1Index < 3) {
|
---|
90 | r1Index = 3;
|
---|
91 | r1 = word.substring(r1Index);
|
---|
92 | }
|
---|
93 |
|
---|
94 | /*
|
---|
95 | Step 1: Standard suffix removal
|
---|
96 | */
|
---|
97 | var a1Index = word.search(/(ance|iqUe|isme|able|iste|eux|ances|iqUes|ismes|ables|istes)$/);
|
---|
98 | var a2Index = word.search(/(atrice|ateur|ation|atrices|ateurs|ations)$/);
|
---|
99 | var a3Index = word.search(/(logie|logies)$/);
|
---|
100 | var a4Index = word.search(/(usion|ution|usions|utions)$/);
|
---|
101 | var a5Index = word.search(/(ence|ences)$/);
|
---|
102 | var a6Index = word.search(/(ement|ements)$/);
|
---|
103 | var a7Index = word.search(/(ité|ités)$/);
|
---|
104 | var a8Index = word.search(/(if|ive|ifs|ives)$/);
|
---|
105 | var a9Index = word.search(/(eaux)$/);
|
---|
106 | var a10Index = word.search(/(aux)$/);
|
---|
107 | var a11Index = word.search(/(euse|euses)$/);
|
---|
108 | var a12Index = word.search(/[^aeiouyâàëéêèïîôûù](issement|issements)$/);
|
---|
109 | var a13Index = word.search(/(amment)$/);
|
---|
110 | var a14Index = word.search(/(emment)$/);
|
---|
111 | var a15Index = word.search(/[aeiouyâàëéêèïîôûù](ment|ments)$/);
|
---|
112 |
|
---|
113 | if(a1Index != -1 && a1Index >= r2Index){
|
---|
114 | word = word.substring(0,a1Index);
|
---|
115 | } else if(a2Index != -1 && a2Index >= r2Index){
|
---|
116 | word = word.substring(0,a2Index);
|
---|
117 | var a2Index2 = word.search(/(ic)$/);
|
---|
118 | if(a2Index2 != -1 && a2Index2 >= r2Index){
|
---|
119 | word = word.substring(0, a2Index2); //if preceded by ic, delete if in R2,
|
---|
120 | } else { //else replace by iqU
|
---|
121 | word = word.replace(/(ic)$/,'iqU');
|
---|
122 | }
|
---|
123 | } else if(a3Index != -1 && a3Index >= r2Index){
|
---|
124 | word = word.replace(/(logie|logies)$/,'log'); //replace with log if in R2
|
---|
125 | } else if(a4Index != -1 && a4Index >= r2Index){
|
---|
126 | word = word.replace(/(usion|ution|usions|utions)$/,'u'); //replace with u if in R2
|
---|
127 | } else if(a5Index != -1 && a5Index >= r2Index){
|
---|
128 | word = word.replace(/(ence|ences)$/,'ent'); //replace with ent if in R2
|
---|
129 | } else if(a6Index != -1 && a6Index >= rvIndex){
|
---|
130 | word = word.substring(0,a6Index);
|
---|
131 | if(word.search(/(iv)$/) >= r2Index){
|
---|
132 | word = word.replace(/(iv)$/, '');
|
---|
133 | if(word.search(/(at)$/) >= r2Index){
|
---|
134 | word = word.replace(/(at)$/, '');
|
---|
135 | }
|
---|
136 | } else if(word.search(/(eus)$/) != -1){
|
---|
137 | var a6Index2 = word.search(/(eus)$/);
|
---|
138 | if(a6Index2 >=r2Index){
|
---|
139 | word = word.substring(0, a6Index2);
|
---|
140 | } else if(a6Index2 >= r1Index){
|
---|
141 | word = word.substring(0,a6Index2)+"eux";
|
---|
142 | }
|
---|
143 | } else if(word.search(/(abl|iqU)$/) >= r2Index){
|
---|
144 | word = word.replace(/(abl|iqU)$/,''); //if preceded by abl or iqU, delete if in R2,
|
---|
145 | } else if(word.search(/(ièr|Ièr)$/) >= rvIndex){
|
---|
146 | word = word.replace(/(ièr|Ièr)$/,'i'); //if preceded by abl or iqU, delete if in R2,
|
---|
147 | }
|
---|
148 | } else if(a7Index != -1 && a7Index >= r2Index){
|
---|
149 | word = word.substring(0,a7Index); //delete if in R2
|
---|
150 | if(word.search(/(abil)$/) != -1){ //if preceded by abil, delete if in R2, else replace by abl, otherwise,
|
---|
151 | var a7Index2 = word.search(/(abil)$/);
|
---|
152 | if(a7Index2 >=r2Index){
|
---|
153 | word = word.substring(0, a7Index2);
|
---|
154 | } else {
|
---|
155 | word = word.substring(0,a7Index2)+"abl";
|
---|
156 | }
|
---|
157 | } else if(word.search(/(ic)$/) != -1){
|
---|
158 | var a7Index3 = word.search(/(ic)$/);
|
---|
159 | if(a7Index3 != -1 && a7Index3 >= r2Index){
|
---|
160 | word = word.substring(0, a7Index3); //if preceded by ic, delete if in R2,
|
---|
161 | } else { //else replace by iqU
|
---|
162 | word = word.replace(/(ic)$/,'iqU');
|
---|
163 | }
|
---|
164 | } else if(word.search(/(iv)$/) != r2Index){
|
---|
165 | word = word.replace(/(iv)$/,'');
|
---|
166 | }
|
---|
167 | } else if(a8Index != -1 && a8Index >= r2Index){
|
---|
168 | word = word.substring(0,a8Index);
|
---|
169 | if(word.search(/(at)$/) >= r2Index){
|
---|
170 | word = word.replace(/(at)$/, '');
|
---|
171 | if(word.search(/(ic)$/) >= r2Index){
|
---|
172 | word = word.replace(/(ic)$/, '');
|
---|
173 | } else { word = word.replace(/(ic)$/, 'iqU'); }
|
---|
174 | }
|
---|
175 | } else if(a9Index != -1){ word = word.replace(/(eaux)/,'eau')
|
---|
176 | } else if(a10Index >= r1Index){ word = word.replace(/(aux)/,'al')
|
---|
177 | } else if(a11Index != -1 ){
|
---|
178 | var a11Index2 = word.search(/(euse|euses)$/);
|
---|
179 | if(a11Index2 >=r2Index){
|
---|
180 | word = word.substring(0, a11Index2);
|
---|
181 | } else if(a11Index2 >= r1Index){
|
---|
182 | word = word.substring(0, a11Index2)+"eux";
|
---|
183 | }
|
---|
184 | } else if(a12Index!=-1 && a12Index>=r1Index){
|
---|
185 | word = word.substring(0,a12Index+1); //+1- amendment to non-vowel
|
---|
186 | } else if(a13Index!=-1 && a13Index>=rvIndex){
|
---|
187 | word = word.replace(/(amment)$/,'ant');
|
---|
188 | } else if(a14Index!=-1 && a14Index>=rvIndex){
|
---|
189 | word = word.replace(/(emment)$/,'ent');
|
---|
190 | } else if(a15Index!=-1 && a15Index>=rvIndex){
|
---|
191 | word = word.substring(0,a15Index+1);
|
---|
192 | }
|
---|
193 |
|
---|
194 | /* Step 2a: Verb suffixes beginning i*/
|
---|
195 | var wordStep1 = word;
|
---|
196 | var step2aDone = false;
|
---|
197 | if(oriWord == word.toLowerCase() || oriWord.search(/(amment|emment|ment|ments)$/) != -1){
|
---|
198 | step2aDone = true;
|
---|
199 | var b1Regex = /([^aeiouyâàëéêèïîôûù])(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)$/i;
|
---|
200 | if(word.search(b1Regex) >= rvIndex){
|
---|
201 | word = word.replace(b1Regex,'$1');
|
---|
202 | }
|
---|
203 | }
|
---|
204 |
|
---|
205 | /* Step 2b: Other verb suffixes*/
|
---|
206 | if (step2aDone && wordStep1 == word) {
|
---|
207 | if (word.search(/(ions)$/) >= r2Index) {
|
---|
208 | word = word.replace(/(ions)$/, '');
|
---|
209 | } else {
|
---|
210 | var b2Regex = /(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)$/i;
|
---|
211 | if (word.search(b2Regex) >= rvIndex) {
|
---|
212 | word = word.replace(b2Regex, '');
|
---|
213 | } else {
|
---|
214 | var b3Regex = /e(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i;
|
---|
215 | if (word.search(b3Regex) >= rvIndex) {
|
---|
216 | word = word.replace(b3Regex, '');
|
---|
217 | } else {
|
---|
218 | var b3Regex2 = /(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i;
|
---|
219 | if (word.search(b3Regex2) >= rvIndex) {
|
---|
220 | word = word.replace(b3Regex2, '');
|
---|
221 | }
|
---|
222 | }
|
---|
223 | }
|
---|
224 | }
|
---|
225 | }
|
---|
226 |
|
---|
227 | if(oriWord != word.toLowerCase()){
|
---|
228 | /* Step 3 */
|
---|
229 | var rep = '';
|
---|
230 | if(word.search(/Y$/) != -1) {
|
---|
231 | word = word.replace(/Y$/, 'i');
|
---|
232 | } else if(word.search(/ç$/) != -1){
|
---|
233 | word = word.replace(/ç$/, 'c');
|
---|
234 | }
|
---|
235 | } else {
|
---|
236 | /* Step 4 */
|
---|
237 | //If the word ends s, not preceded by a, i, o, u, è or s, delete it.
|
---|
238 | if (word.search(/([^aiouès])s$/) >= rvIndex) {
|
---|
239 | word = word.replace(/([^aiouès])s$/, '$1');
|
---|
240 | }
|
---|
241 | var e1Index = word.search(/ion$/);
|
---|
242 | if (e1Index >= r2Index && word.search(/[st]ion$/) >= rvIndex) {
|
---|
243 | word = word.substring(0, e1Index);
|
---|
244 | } else {
|
---|
245 | var e2Index = word.search(/(ier|ière|Ier|Ière)$/);
|
---|
246 | if (e2Index != -1 && e2Index >= rvIndex) {
|
---|
247 | word = word.substring(0, e2Index) + "i";
|
---|
248 | } else {
|
---|
249 | if (word.search(/e$/) >= rvIndex) {
|
---|
250 | word = word.replace(/e$/, ''); //delete last e
|
---|
251 | } else if (word.search(/guë$/) >= rvIndex) {
|
---|
252 | word = word.replace(/guë$/, 'gu');
|
---|
253 | }
|
---|
254 | }
|
---|
255 | }
|
---|
256 | }
|
---|
257 |
|
---|
258 | /* Step 5: Undouble */
|
---|
259 | //word = word.replace(/(en|on|et|el|eil)(n|t|l)$/,'$1');
|
---|
260 | word = word.replace(/(en|on)(n)$/,'$1');
|
---|
261 | word = word.replace(/(ett)$/,'et');
|
---|
262 | word = word.replace(/(el|eil)(l)$/,'$1');
|
---|
263 |
|
---|
264 | /* Step 6: Un-accent */
|
---|
265 | word = word.replace(/[éè]([^aeiouyâàëéêèïîôûù]+)$/,'e$1');
|
---|
266 | word = word.toLowerCase();
|
---|
267 | return word;
|
---|
268 | };
|
---|
269 |
|
---|
270 | var eqOut = new Array();
|
---|
271 | var noteqOut = new Array();
|
---|
272 | var eqCount = 0;
|
---|
273 | /*
|
---|
274 | To test the stemming, create two arrays named "voc" and "COut" which are for vocabualary and the stemmed output.
|
---|
275 | Then add the vocabulary strings and output strings. This method will generate the stemmed output for "voc" and will
|
---|
276 | compare the output with COut.
|
---|
277 | (I used porter's voc and out files and did a regex to convert them to js objects. regex: /");\nvoc.push("/g . This
|
---|
278 | will add strings to voc array such that output would look like: voc.push("foobar"); ) drop me an email for any help.
|
---|
279 | */
|
---|
280 | function testFr(){
|
---|
281 | var start = new Date().getTime(); //execution time
|
---|
282 | eqCount = 0;
|
---|
283 | eqOut = new Array();
|
---|
284 | noteqOut = new Array();
|
---|
285 | for(var k=0;k<voc.length;k++){
|
---|
286 | if(COut[k]==stemmer(voc[k])){
|
---|
287 | eqCount++;
|
---|
288 | eqOut.push("v: "+voc[k]+" c: "+COut[k]);
|
---|
289 | } else {
|
---|
290 | noteqOut.push(voc[k]+", c: "+COut[k]+" s:"+stemmer(voc[k]));
|
---|
291 | }
|
---|
292 | }
|
---|
293 | var end = new Date().getTime(); //execution time
|
---|
294 | var time = end-start;
|
---|
295 | alert("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms");
|
---|
296 | //console.log("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms");
|
---|
297 | }
|
---|
298 |
|
---|
299 |
|
---|