Context Navigation

source: stylesheets/lfs-xsl/docbook-xsl-1.78.1/webhelp/docs/search/stemmers/fr_stemmer.js@ d568769

Visit:

ml-11.0 multilib xry111/multilib

Last change on this file since d568769 was 15c7d39, checked in by Matthew Burgess <matthew@…>, 11 years ago

Update stylesheets to docbook-xsl-1.78.1.

git-svn-id: http://svn.linuxfromscratch.org/LFS/trunk/BOOK@10355 4aa44e1e-78dd-0310-a6d2-fbcd4c07a689

Property mode set to 100644

File size: 13.0 KB

Line
1	/*
2	* Author: Kasun Gajasinghe
3	* E-Mail: kasunbg AT gmail DOT com
4	* Date: 09.08.2010
5	*
6	* usage: stemmer(word);
7	* ex: var stem = stemmer(foobar);
8	* Implementation of the stemming algorithm from http://snowball.tartarus.org/algorithms/french/stemmer.html
9	*
10	* LICENSE:
11	*
12	* Copyright (c) 2010, Kasun Gajasinghe. All rights reserved.
13	*
14	* Redistribution and use in source and binary forms, with or without modification,
15	* are permitted provided that the following conditions are met:
16	*
17	* 1. Redistributions of source code must retain the above copyright notice,
18	* this list of conditions and the following disclaimer.
19	*
20	* 2. Redistributions in binary form must reproduce the above copyright notice,
21	* this list of conditions and the following disclaimer in the documentation
22	* and/or other materials provided with the distribution.
23	*
24	*
25	* THIS SOFTWARE IS PROVIDED BY KASUN GAJASINGHE ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
26	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
27	* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL KASUN GAJASINGHE BE LIABLE FOR ANY DIRECT,
28	* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
30	* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31	* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
32	* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33	*
34	*/
35
36	var stemmer = function(word){
37	// Letters in French include the following accented forms,
38	// â à ç ë é ê è ï î ô û ù
39	// The following letters are vowels:
40	// a e i o u y â à ë é ê è ï î ô û ù
41
42	word = word.toLowerCase();
43	var oriWord = word;
44	word = word.replace(/qu/g, 'qU'); //have to perform first, as after the operation, capital U is not treated as a vowel
45	word = word.replace(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/g, '$1U$2');
46	word = word.replace(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/g, '$1I$2');
47	word = word.replace(/([aeiouyâàëéêèïîôûù])y/g, '$1Y');
48	word = word.replace(/y([aeiouyâàëéêèïîôûù])/g, 'Y$1');
49
50	var rv='';
51	var rvIndex = -1;
52	if(word.search(/^(par\|col\|tap)/) != -1 \|\| word.search(/^[aeiouyâàëéêèïîôûù]{2}/) != -1){
53	rv = word.substring(3);
54	rvIndex = 3;
55	} else {
56	rvIndex = word.substring(1).search(/[aeiouyâàëéêèïîôûù]/);
57	if(rvIndex != -1){
58	rvIndex +=2; //+2 is to supplement the substring(1) used to find rvIndex
59	rv = word.substring(rvIndex);
60	} else {
61	rvIndex = word.length;
62	}
63	}
64
65	// R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
66	// R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel
67	var r1Index = word.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
68	var r1 = '';
69	if (r1Index != -1) {
70	r1Index += 2;
71	r1 = word.substring(r1Index);
72	} else {
73	r1Index = word.length;
74	}
75
76	var r2Index = -1;
77	var r2 = '';
78	if (r1Index != -1) {
79	r2Index = r1.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
80	if (r2Index != -1) {
81	r2Index += 2;
82	r2 = r1.substring(r2Index);
83	r2Index += r1Index;
84	} else {
85	r2 = '';
86	r2Index = word.length;
87	}
88	}
89	if (r1Index != -1 && r1Index < 3) {
90	r1Index = 3;
91	r1 = word.substring(r1Index);
92	}
93
94	/*
95	Step 1: Standard suffix removal
96	*/
97	var a1Index = word.search(/(ance\|iqUe\|isme\|able\|iste\|eux\|ances\|iqUes\|ismes\|ables\|istes)$/);
98	var a2Index = word.search(/(atrice\|ateur\|ation\|atrices\|ateurs\|ations)$/);
99	var a3Index = word.search(/(logie\|logies)$/);
100	var a4Index = word.search(/(usion\|ution\|usions\|utions)$/);
101	var a5Index = word.search(/(ence\|ences)$/);
102	var a6Index = word.search(/(ement\|ements)$/);
103	var a7Index = word.search(/(ité\|ités)$/);
104	var a8Index = word.search(/(if\|ive\|ifs\|ives)$/);
105	var a9Index = word.search(/(eaux)$/);
106	var a10Index = word.search(/(aux)$/);
107	var a11Index = word.search(/(euse\|euses)$/);
108	var a12Index = word.search(/[^aeiouyâàëéêèïîôûù](issement\|issements)$/);
109	var a13Index = word.search(/(amment)$/);
110	var a14Index = word.search(/(emment)$/);
111	var a15Index = word.search(/[aeiouyâàëéêèïîôûù](ment\|ments)$/);
112
113	if(a1Index != -1 && a1Index >= r2Index){
114	word = word.substring(0,a1Index);
115	} else if(a2Index != -1 && a2Index >= r2Index){
116	word = word.substring(0,a2Index);
117	var a2Index2 = word.search(/(ic)$/);
118	if(a2Index2 != -1 && a2Index2 >= r2Index){
119	word = word.substring(0, a2Index2); //if preceded by ic, delete if in R2,
120	} else { //else replace by iqU
121	word = word.replace(/(ic)$/,'iqU');
122	}
123	} else if(a3Index != -1 && a3Index >= r2Index){
124	word = word.replace(/(logie\|logies)$/,'log'); //replace with log if in R2
125	} else if(a4Index != -1 && a4Index >= r2Index){
126	word = word.replace(/(usion\|ution\|usions\|utions)$/,'u'); //replace with u if in R2
127	} else if(a5Index != -1 && a5Index >= r2Index){
128	word = word.replace(/(ence\|ences)$/,'ent'); //replace with ent if in R2
129	} else if(a6Index != -1 && a6Index >= rvIndex){
130	word = word.substring(0,a6Index);
131	if(word.search(/(iv)$/) >= r2Index){
132	word = word.replace(/(iv)$/, '');
133	if(word.search(/(at)$/) >= r2Index){
134	word = word.replace(/(at)$/, '');
135	}
136	} else if(word.search(/(eus)$/) != -1){
137	var a6Index2 = word.search(/(eus)$/);
138	if(a6Index2 >=r2Index){
139	word = word.substring(0, a6Index2);
140	} else if(a6Index2 >= r1Index){
141	word = word.substring(0,a6Index2)+"eux";
142	}
143	} else if(word.search(/(abl\|iqU)$/) >= r2Index){
144	word = word.replace(/(abl\|iqU)$/,''); //if preceded by abl or iqU, delete if in R2,
145	} else if(word.search(/(ièr\|Ièr)$/) >= rvIndex){
146	word = word.replace(/(ièr\|Ièr)$/,'i'); //if preceded by abl or iqU, delete if in R2,
147	}
148	} else if(a7Index != -1 && a7Index >= r2Index){
149	word = word.substring(0,a7Index); //delete if in R2
150	if(word.search(/(abil)$/) != -1){ //if preceded by abil, delete if in R2, else replace by abl, otherwise,
151	var a7Index2 = word.search(/(abil)$/);
152	if(a7Index2 >=r2Index){
153	word = word.substring(0, a7Index2);
154	} else {
155	word = word.substring(0,a7Index2)+"abl";
156	}
157	} else if(word.search(/(ic)$/) != -1){
158	var a7Index3 = word.search(/(ic)$/);
159	if(a7Index3 != -1 && a7Index3 >= r2Index){
160	word = word.substring(0, a7Index3); //if preceded by ic, delete if in R2,
161	} else { //else replace by iqU
162	word = word.replace(/(ic)$/,'iqU');
163	}
164	} else if(word.search(/(iv)$/) != r2Index){
165	word = word.replace(/(iv)$/,'');
166	}
167	} else if(a8Index != -1 && a8Index >= r2Index){
168	word = word.substring(0,a8Index);
169	if(word.search(/(at)$/) >= r2Index){
170	word = word.replace(/(at)$/, '');
171	if(word.search(/(ic)$/) >= r2Index){
172	word = word.replace(/(ic)$/, '');
173	} else { word = word.replace(/(ic)$/, 'iqU'); }
174	}
175	} else if(a9Index != -1){ word = word.replace(/(eaux)/,'eau')
176	} else if(a10Index >= r1Index){ word = word.replace(/(aux)/,'al')
177	} else if(a11Index != -1 ){
178	var a11Index2 = word.search(/(euse\|euses)$/);
179	if(a11Index2 >=r2Index){
180	word = word.substring(0, a11Index2);
181	} else if(a11Index2 >= r1Index){
182	word = word.substring(0, a11Index2)+"eux";
183	}
184	} else if(a12Index!=-1 && a12Index>=r1Index){
185	word = word.substring(0,a12Index+1); //+1- amendment to non-vowel
186	} else if(a13Index!=-1 && a13Index>=rvIndex){
187	word = word.replace(/(amment)$/,'ant');
188	} else if(a14Index!=-1 && a14Index>=rvIndex){
189	word = word.replace(/(emment)$/,'ent');
190	} else if(a15Index!=-1 && a15Index>=rvIndex){
191	word = word.substring(0,a15Index+1);
192	}
193
194	/* Step 2a: Verb suffixes beginning i*/
195	var wordStep1 = word;
196	var step2aDone = false;
197	if(oriWord == word.toLowerCase() \|\| oriWord.search(/(amment\|emment\|ment\|ments)$/) != -1){
198	step2aDone = true;
199	var b1Regex = /([^aeiouyâàëéêèïîôûù])(îmes\|ît\|îtes\|i\|ie\|ies\|ir\|ira\|irai\|iraIent\|irais\|irait\|iras\|irent\|irez\|iriez\|irions\|irons\|iront\|is\|issaIent\|issais\|issait\|issant\|issante\|issantes\|issants\|isse\|issent\|isses\|issez\|issiez\|issions\|issons\|it)$/i;
200	if(word.search(b1Regex) >= rvIndex){
201	word = word.replace(b1Regex,'$1');
202	}
203	}
204
205	/* Step 2b: Other verb suffixes*/
206	if (step2aDone && wordStep1 == word) {
207	if (word.search(/(ions)$/) >= r2Index) {
208	word = word.replace(/(ions)$/, '');
209	} else {
210	var b2Regex = /(é\|ée\|ées\|és\|èrent\|er\|era\|erai\|eraIent\|erais\|erait\|eras\|erez\|eriez\|erions\|erons\|eront\|ez\|iez)$/i;
211	if (word.search(b2Regex) >= rvIndex) {
212	word = word.replace(b2Regex, '');
213	} else {
214	var b3Regex = /e(âmes\|ât\|âtes\|a\|ai\|aIent\|ais\|ait\|ant\|ante\|antes\|ants\|as\|asse\|assent\|asses\|assiez\|assions)$/i;
215	if (word.search(b3Regex) >= rvIndex) {
216	word = word.replace(b3Regex, '');
217	} else {
218	var b3Regex2 = /(âmes\|ât\|âtes\|a\|ai\|aIent\|ais\|ait\|ant\|ante\|antes\|ants\|as\|asse\|assent\|asses\|assiez\|assions)$/i;
219	if (word.search(b3Regex2) >= rvIndex) {
220	word = word.replace(b3Regex2, '');
221	}
222	}
223	}
224	}
225	}
226
227	if(oriWord != word.toLowerCase()){
228	/* Step 3 */
229	var rep = '';
230	if(word.search(/Y$/) != -1) {
231	word = word.replace(/Y$/, 'i');
232	} else if(word.search(/ç$/) != -1){
233	word = word.replace(/ç$/, 'c');
234	}
235	} else {
236	/* Step 4 */
237	//If the word ends s, not preceded by a, i, o, u, è or s, delete it.
238	if (word.search(/([^aiouès])s$/) >= rvIndex) {
239	word = word.replace(/([^aiouès])s$/, '$1');
240	}
241	var e1Index = word.search(/ion$/);
242	if (e1Index >= r2Index && word.search(/[st]ion$/) >= rvIndex) {
243	word = word.substring(0, e1Index);
244	} else {
245	var e2Index = word.search(/(ier\|ière\|Ier\|Ière)$/);
246	if (e2Index != -1 && e2Index >= rvIndex) {
247	word = word.substring(0, e2Index) + "i";
248	} else {
249	if (word.search(/e$/) >= rvIndex) {
250	word = word.replace(/e$/, ''); //delete last e
251	} else if (word.search(/guë$/) >= rvIndex) {
252	word = word.replace(/guë$/, 'gu');
253	}
254	}
255	}
256	}
257
258	/* Step 5: Undouble */
259	//word = word.replace(/(en\|on\|et\|el\|eil)(n\|t\|l)$/,'$1');
260	word = word.replace(/(en\|on)(n)$/,'$1');
261	word = word.replace(/(ett)$/,'et');
262	word = word.replace(/(el\|eil)(l)$/,'$1');
263
264	/* Step 6: Un-accent */
265	word = word.replace(/[éè]([^aeiouyâàëéêèïîôûù]+)$/,'e$1');
266	word = word.toLowerCase();
267	return word;
268	};
269
270	var eqOut = new Array();
271	var noteqOut = new Array();
272	var eqCount = 0;
273	/*
274	To test the stemming, create two arrays named "voc" and "COut" which are for vocabualary and the stemmed output.
275	Then add the vocabulary strings and output strings. This method will generate the stemmed output for "voc" and will
276	compare the output with COut.
277	(I used porter's voc and out files and did a regex to convert them to js objects. regex: /");\nvoc.push("/g . This
278	will add strings to voc array such that output would look like: voc.push("foobar"); ) drop me an email for any help.
279	*/
280	function testFr(){
281	var start = new Date().getTime(); //execution time
282	eqCount = 0;
283	eqOut = new Array();
284	noteqOut = new Array();
285	for(var k=0;k<voc.length;k++){
286	if(COut[k]==stemmer(voc[k])){
287	eqCount++;
288	eqOut.push("v: "+voc[k]+" c: "+COut[k]);
289	} else {
290	noteqOut.push(voc[k]+", c: "+COut[k]+" s:"+stemmer(voc[k]));
291	}
292	}
293	var end = new Date().getTime(); //execution time
294	var time = end-start;
295	alert("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms");
296	//console.log("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms");
297	}
298
299

Note: See TracBrowser for help on using the repository browser.

Download in other formats: