1 
  2 ////////////////////////////////////////////////////////////////////////////////
  3 /**
  4  * @fileOverview Copyright (C) 2009 www.webis.de<br/>
 * Porter stemmer in JavaScript. It has few comments, but it is easy to follow against<br/>
 * the rules in the original paper:
 * <em>Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, no. 3, pp 130-137</em><br/>
 * See also http://www.tartarus.org/~martin/PorterStemmer
  9  * @version 0.1
 10  */
 11 
 12 step2list = {};
 13 step2list["ational"]="ate";
 14 step2list["tional"]="tion";
 15 step2list["enci"]="ence";
 16 step2list["anci"]="ance";
 17 step2list["izer"]="ize";
 18 step2list["bli"]="ble";
 19 step2list["alli"]="al";
 20 step2list["entli"]="ent";
 21 step2list["eli"]="e";
 22 step2list["ousli"]="ous";
 23 step2list["ization"]="ize";
 24 step2list["ation"]="ate";
 25 step2list["ator"]="ate";
 26 step2list["alism"]="al";
 27 step2list["iveness"]="ive";
 28 step2list["fulness"]="ful";
 29 step2list["ousness"]="ous";
 30 step2list["aliti"]="al";
 31 step2list["iviti"]="ive";
 32 step2list["biliti"]="ble";
 33 step2list["logi"]="log";
 34 
 35 step3list = {};
 36 step3list["icate"]="ic";
 37 step3list["ative"]="";
 38 step3list["alize"]="al";
 39 step3list["iciti"]="ic";
 40 step3list["ical"]="ic";
 41 step3list["ful"]="";
 42 step3list["ness"]="";
 43 
 44 c = "[^aeiou]";          // consonant
 45 v = "[aeiouy]";          // vowel
 46 C = c + "[^aeiouy]*";    // consonant sequence
 47 V = v + "[aeiou]*";      // vowel sequence
 48 
 49 mgr0 = "^(" + C + ")?" + V + C;               // [C]VC... is m>0
 50 meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$";  // [C]VC[V] is m=1
 51 mgr1 = "^(" + C + ")?" + V + C + V + C;       // [C]VCVC... is m>1
 52 s_v   = "^(" + C + ")?" + v;                   // vowel in stem
 53 
 54 function stemWord(w) {
 55 	var stem;
 56 	var suffix;
 57 	var firstch;
 58 	var origword = w;
 59 
 60 	if (w.length < 3) { return w; }
 61 
 62    	var re;
 63    	var re2;
 64    	var re3;
 65    	var re4;
 66 
 67 	firstch = w.substr(0,1);
 68 	if (firstch == "y") {
 69 		w = firstch.toUpperCase() + w.substr(1);
 70 	}
 71 
 72 	// Step 1a
 73    	re = /^(.+?)(ss|i)es$/;
 74    	re2 = /^(.+?)([^s])s$/;
 75 
 76    	if (re.test(w)) { w = w.replace(re,"$1$2"); }
 77    	else if (re2.test(w)) {	w = w.replace(re2,"$1$2"); }
 78 
 79 	// Step 1b
 80 	re = /^(.+?)eed$/;
 81 	re2 = /^(.+?)(ed|ing)$/;
 82 	if (re.test(w)) {
 83 		var fp = re.exec(w);
 84 		re = new RegExp(mgr0);
 85 		if (re.test(fp[1])) {
 86 			re = /.$/;
 87 			w = w.replace(re,"");
 88 		}
 89 	} else if (re2.test(w)) {
 90 		var fp = re2.exec(w);
 91 		stem = fp[1];
 92 		re2 = new RegExp(s_v);
 93 		if (re2.test(stem)) {
 94 			w = stem;
 95 			re2 = /(at|bl|iz)$/;
 96 			re3 = new RegExp("([^aeiouylsz])\\1$");
 97 			re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
 98 			if (re2.test(w)) {	w = w + "e"; }
 99 			else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); }
100 			else if (re4.test(w)) { w = w + "e"; }
101 		}
102 	}
103 
104 	// Step 1c
105 	re = /^(.+?)y$/;
106 	if (re.test(w)) {
107 		var fp = re.exec(w);
108 		stem = fp[1];
109 		re = new RegExp(s_v);
110 		if (re.test(stem)) { w = stem + "i"; }
111 	}
112 
113 	// Step 2
114 	re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
115 	if (re.test(w)) {
116 		var fp = re.exec(w);
117 		stem = fp[1];
118 		suffix = fp[2];
119 		re = new RegExp(mgr0);
120 		if (re.test(stem)) {
121 			w = stem + step2list[suffix];
122 		}
123 	}
124 
125 	// Step 3
126 	re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
127 	if (re.test(w)) {
128 		var fp = re.exec(w);
129 		stem = fp[1];
130 		suffix = fp[2];
131 		re = new RegExp(mgr0);
132 		if (re.test(stem)) {
133 			w = stem + step3list[suffix];
134 		}
135 	}
136 
137 	// Step 4
138 	re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
139 	re2 = /^(.+?)(s|t)(ion)$/;
140 	if (re.test(w)) {
141 		var fp = re.exec(w);
142 		stem = fp[1];
143 		re = new RegExp(mgr1);
144 		if (re.test(stem)) {
145 			w = stem;
146 		}
147 	} else if (re2.test(w)) {
148 		var fp = re2.exec(w);
149 		stem = fp[1] + fp[2];
150 		re2 = new RegExp(mgr1);
151 		if (re2.test(stem)) {
152 			w = stem;
153 		}
154 	}
155 
156 	// Step 5
157 	re = /^(.+?)e$/;
158 	if (re.test(w)) {
159 		var fp = re.exec(w);
160 		stem = fp[1];
161 		re = new RegExp(mgr1);
162 		re2 = new RegExp(meq1);
163 		re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
164 		if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {
165 			w = stem;
166 		}
167 	}
168 
169 	re = /ll$/;
170 	re2 = new RegExp(mgr1);
171 	if (re.test(w) && re2.test(w)) {
172 		re = /.$/;
173 		w = w.replace(re,"");
174 	}
175 
176 	// and turn initial Y back to y
177 
178 	if (firstch == "y") {
179 		w = firstch.toLowerCase() + w.substr(1);
180 	}
181 
182 	return w;
183 
184 }
185 
/**
 * Convenience wrapper around stemWord with an optional result cache.
 */
var PorterStemmer = {
	/** Cache of stems computed by fastStem, keyed by the input word. */
	_stemmBuffer : {},

	/**
	 * Stems a word without consulting or filling the cache.
	 * @param {string} word the word to stem
	 * @return {string} the value returned by stemWord
	 */
	stem : function(word){
		return stemWord(word);
	},

	/**
	 * Stems a word, reusing the cached result when the same word was stemmed
	 * before.
	 * @param {string} word the word to stem
	 * @return {string} the stem of the word
	 */
	fastStem : function(word){
		var buffer = PorterStemmer._stemmBuffer;
		// Only own properties are cache hits. A plain property read would also
		// see inherited members, e.g. "__proto__" yields Object.prototype,
		// which must never be returned as a stem.
		if (Object.prototype.hasOwnProperty.call(buffer, word)) {
			return buffer[word];
		}
		var result = stemWord(word);
		buffer[word] = result;
		return result;
	},

	/**
	 * Empties the cache. Refers to PorterStemmer explicitly (as fastStem does)
	 * so it also works when the function is called detached from the object.
	 */
	reset : function(){
		PorterStemmer._stemmBuffer = {};
	}
};
208 
/** Cache of stems computed by fastStemm, keyed by the input word. */
var stemmedWords={};

/**
 * Stems a word with stemWord, reusing the cached result when the same word
 * was stemmed before.
 * @ignore
 * @author david.wiesner
 * @param {string} word the word to stem
 * @return {string} the stem of the word
 */
function fastStemm(word){
	// Only own properties are cache hits. A plain property read would also see
	// inherited members, e.g. "__proto__" yields Object.prototype, which must
	// never be returned as a stem.
	if (Object.prototype.hasOwnProperty.call(stemmedWords, word)) {
		return stemmedWords[word];
	}
	var result = stemWord(word);
	stemmedWords[word] = result;
	return result;
}
227 
228 
229 // PorterStemmer.js
230 ////////////////////////////////////////////////////////////////////////////////
231