////////////////////////////////////////////////////////////////////////////////
/**
 * @fileOverview Copyright (C) 2009 www.webis.de
 * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
 * @version 0.1
 */

/**
 * Implementation of CosineSimilarity as relevance function (rho) to be used in
 * a retrieval model. Calling this function returns an object exposing
 * computeRelevance().
 * @see de.aitools.js.RetrievalModel
 * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
 * @class Implementation of CosineSimilarity
 * @constructor
 */
de.aitools.js.CosineSimilarity = (function () {
  return {
    /**
     * Computes the cosine similarity for the given two vectors and returns it
     * as number value between 1 (high similarity) and 0 (no similarity). The
     * cosine similarity is computed by normalizing the given vectors and
     * computing their scalar product. Note that the given vectors will be
     * modified (normalized in place) if they are not already normalized.
     * @param {de.aitools.js.Vector} vector1 first vector
     * @param {de.aitools.js.Vector} vector2 second vector
     * @returns {number} cosine similarity value
     */
    computeRelevance : function (vector1, vector2) {
      // To prevent changes in the original vectors (they are passed by
      // reference) a clone could be used, but this would slow down the
      // process, so the caller's vectors are normalized in place on purpose.
      if (!vector1.isNormalized()) {
        // vector1 = vector1.clone();
        vector1.normalize();
      }
      if (!vector2.isNormalized()) {
        // vector2 = vector2.clone();
        vector2.normalize();
      }
      return vector1.scalarProduct(vector2);
    }
  };
});

// RelevanceFunctions.js
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// RepresentationFunctions.js
// Copyright (C) 2009 www.webis.de

/**
 * Implementation of TermFrequency as representation function to be used in a
 * retrieval model. Every call creates its own private vocabulary,
 * backstemming map and settings.
 * @see de.aitools.js.RetrievalModel
 * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
 * @constructor
 * @class Implementation of TermFrequency
 */
de.aitools.js.TermFrequency = (function () {

  /* private */
  var voc = new de.aitools.js.Vocabulary(); // holds the vocabulary
  var backStemmingMap = {};   // holds shortest term for every stem
  var stemming = false;
  var removeStopwords = false;
  var dynamicStopwords = {};  // holds extra stopwords, e.g. search terms

  // methods

  /**
   * Own-property test that ignores members inherited from Object.prototype,
   * so that terms such as "constructor" are treated like any other key.
   * @param {Object} map The object used as a map.
   * @param {string} key The key to look up.
   * @returns {boolean} true if the map itself holds the key.
   */
  function hasOwn(map, key) {
    return Object.prototype.hasOwnProperty.call(map, key);
  }

  /* public */

  return {

    /**
     * Activate or deactivate stemming.
     * @param {boolean} bool Activate (true) or deactivate (false) stemming.
     */
    setStemming : function (bool) { stemming = bool; },

    /**
     * Activate or deactivate stopword removal.
     * @param {boolean} bool Activate (true) or deactivate (false) stopword
     *                       removal.
     */
    setRemoveStopwords : function (bool) { removeStopwords = bool; },

    /**
     * Set extra stopwords to extend standard stopword list (e.g. to filter
     * search terms when representing search engine snippets). The extra
     * stopwords are only applied while stopword removal is activated.
     * @param {Array} wordsArray List of words to extend stopword list.
     */
    setDynamicStopwords : function (wordsArray) {
      dynamicStopwords = Array.toAssocArray(wordsArray);
    },

    /**
     * Set a vocabulary to use in document
     * representation process. Without specifying a vocabulary by this
     * function, an empty vocabulary is used. The vocabulary will be modified,
     * i.e. new terms will be added.
     * @see de.aitools.js.Vocabulary
     * @param {de.aitools.js.Vocabulary} vocabulary The vocabulary to use.
     */
    setVocabulary : function (vocabulary) { voc = vocabulary; },

    /**
     * Returns the vocabulary that is created
     * during document representation.
     * @see de.aitools.js.Vocabulary
     * @returns {de.aitools.js.Vocabulary} the vocabulary
     */
    getVocabulary : function () { return voc; },

    /**
     * Returns the backstemming map (stem -> shortest term seen for that stem)
     * that is created during document representation.
     * @returns {Object} the backstemming map
     */
    getBackStemming : function () { return backStemmingMap; },

    /**
     * Creates and returns a vector space representation with the term
     * frequency model. The representation is a Vector whose component ids are
     * taken from the vocabulary; unknown terms are added to the vocabulary.
     * @see de.aitools.js.Vector
     * @param {de.aitools.js.Document} document The document to represent.
     * @returns {de.aitools.js.Vector} A vector space representation of the
     *                                 document.
     */
    representDocument : function (document) {
      var termFrequencyMap = {};
      var termList = de.aitools.js.decompose(document.getPlaintext());
      var i, term, stem, vec;

      for (i = 0; i < termList.length; ++i) {
        term = termList[i].toLowerCase();
        // Workaround: String.split() returns an empty string if the last
        // character is a delimiter, one-character "words" are mostly special
        // characters or numbers and should not be considered.
        if (term.length < 2) { continue; }

        if (removeStopwords) {
          // NOTE(review): both stopword maps are built outside this block, so
          // the original "!== undefined" lookup is kept for them - confirm
          // they are plain objects before switching to own-property checks.
          if (stopwords.english[term] !== undefined) { continue; }
          if (dynamicStopwords[term] !== undefined) { continue; }
        }

        if (stemming) {
          stem = PorterStemmer.stem(term);
          // use shortest term for backstemming (compare lengths, not a
          // length with the term string itself)
          if (!hasOwn(backStemmingMap, stem) ||
              backStemmingMap[stem].length > term.length) {
            backStemmingMap[stem] = term;
          }
          term = stem;
        }

        if (hasOwn(termFrequencyMap, term)) {
          termFrequencyMap[term] += 1;
        } else {
          termFrequencyMap[term] = 1;
        }
      }

      vec = new de.aitools.js.Vector();
      for (term in termFrequencyMap) {
        if (hasOwn(termFrequencyMap, term)) {
          if (!voc.contains(term)) {
            voc.add(term);
          }
          vec.setValue(voc.getId(term), termFrequencyMap[term]);
        }
      }
      return vec;
    }
  };
});

/**
 * Implementation of TermFrequency-InverseDocumentFrequency as representation
 * function to be used in a retrieval model<br/>
 * TODO: the weight of a term is its term frequency divided by its document
 * frequency (tf / df); confirm whether the classic tf * log(N / df) weighting
 * is intended instead.
 * @see de.aitools.js.RetrievalModel
 * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
 * @constructor
 * @class Implementation of TermFrequency-InverseDocumentFrequency
 */
de.aitools.js.TermFrequencyInverseDocumentFrequency = (function () {

  /* private */
  var voc = new de.aitools.js.Vocabulary();
  var backStemmingMap = {};
  var stemming = false;
  var removeStopwords = false;
  //var dynamicStopwords = {};

  /**
   * Own-property test that ignores members inherited from Object.prototype,
   * so that terms such as "constructor" are treated like any other key.
   * @param {Object} map The object used as a map.
   * @param {string} key The key to look up.
   * @returns {boolean} true if the map itself holds the key.
   */
  function hasOwn(map, key) {
    return Object.prototype.hasOwnProperty.call(map, key);
  }

  return {
    /**
     * Activate or deactivate stemming.
     * @param {boolean} bool Activate (true) or deactivate (false) stemming.
     */
    setStemming : function (bool) { stemming = bool; },

    /**
     * Activate or deactivate stopword removal.
     * @param {boolean} bool Activate (true) or deactivate (false) stopword
     *                       removal.
     */
    setRemoveStopwords : function (bool) { removeStopwords = bool; },

    // use to extend stopword list e.g. to filter search terms when
    // representing search engine snippets
    //setDynamicStopwords : function(wordsArray) {
    //  dynamicStopwords = Array.toAssocArray(wordsArray);
    //},

    /**
     * Set the vocabulary to use; new terms will be added to it.
     * @param {de.aitools.js.Vocabulary} vocabulary The vocabulary to use.
     */
    setVocabulary : function (vocabulary) { voc = vocabulary; },

    /**
     * Returns the vocabulary used (and extended) during representation.
     * @returns {de.aitools.js.Vocabulary} the vocabulary
     */
    getVocabulary : function () { return voc; },

    /**
     * Returns the backstemming map (stem -> shortest term seen for that stem).
     * @returns {Object} the backstemming map
     */
    getBackStemming : function () { return backStemmingMap; },

    /**
     * Creates and returns a vector space representation with the term
     * frequency inverse document model for each document in the given Array.
     * Each representation is a Vector whose components hold, per term, the
     * term's frequency in that document divided by the number of documents of
     * the given Array that contain the term.
     * @see de.aitools.js.Vector
     * @param {Array} documentArray The documents to represent.
     * @returns {Array} An array of vector space representations of the
     *                  documents, in the order of documentArray.
     */
    representDocument : function (documentArray) {

      var termFrequencys = [];
      var documentRepresentations = [];
      var documentFrequencyMap = {};
      var doc, i, term, stem;
      var termFrequencyMap, termsObserved, termList, vec, tfRep;

      for (doc = 0; doc < documentArray.length; ++doc) {

        termFrequencyMap = {};
        // Terms already counted for the document frequency of THIS document.
        // It has to live for the whole document (not be re-created per term),
        // otherwise every occurrence would be counted as a new document.
        termsObserved = {};
        termList =
            de.aitools.js.decompose(documentArray[doc].getPlaintext());

        for (i = 0; i < termList.length; ++i) {
          term = termList[i].toLowerCase();
          // Workaround: String.split() returns an empty string if the last
          // character is a delimiter, one-character "words" are mostly
          // special characters or numbers and should not be considered
          if (term.length < 2) { continue; }

          if (removeStopwords) {
            // NOTE(review): the stopword map is built outside this block, so
            // the original "!== undefined" lookup is kept - confirm it is a
            // plain object before switching to an own-property check.
            if (stopwords.english[term] !== undefined) { continue; }
            //if(dynamicStopwords[term] !== undefined) { continue; }
          }

          if (stemming) {
            stem = PorterStemmer.stem(term);
            // use shortest term for backstemming (compare lengths, not a
            // length with the term string itself)
            if (!hasOwn(backStemmingMap, stem) ||
                backStemmingMap[stem].length > term.length) {
              backStemmingMap[stem] = term;
            }
            term = stem;
          }

          if (hasOwn(termFrequencyMap, term)) {
            termFrequencyMap[term] += 1;
          } else {
            termFrequencyMap[term] = 1;
          }

          // count each term at most once per document
          if (!hasOwn(termsObserved, term)) {
            if (hasOwn(documentFrequencyMap, term)) {
              documentFrequencyMap[term] += 1;
            } else {
              documentFrequencyMap[term] = 1;
            }
            termsObserved[term] = "ok";
          }

          if (!voc.contains(term)) {
            voc.add(term);
          }
        }
        termFrequencys.push(termFrequencyMap);
      }

      for (i = 0; i < termFrequencys.length; ++i) {
        vec = new de.aitools.js.Vector();
        tfRep = termFrequencys[i];
        for (term in tfRep) {
          if (hasOwn(tfRep, term)) {
            vec.setValue(
              voc.getId(term),
              tfRep[term] / documentFrequencyMap[term]
            );
          }
        }
        documentRepresentations.push(vec);
      }
      return documentRepresentations;
    }
  };
});

// RepresentationFunctions.js
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// RetrievalModel.js
// Copyright (C) 2009 www.webis.de

/*
  RetrievalModel:
    Represents documents as computable document representations and can
    compute the relevance between document representations.
    Representation function, representDocument()
      in:  Document
      out: document representation
      Example: TermFrequency, processes the plaintext of a Document and creates
               a Vector using the Vocabulary
    Relevance/similarity function, rho()
      in:  document representation 1
      in:  document representation 2
      out: relevance/similarity measure
      Example: CosineSimilarity, computes from two Vectors the similarity as a
               value between 0 and 1
*/

/**
 * Represents documents as formal representations (e.g. vector space
 * representations) and calculates relevance between representations.
 * Both arguments are checked on construction via CHECK_TYPE.
 * @param {Object} representationFunction An object providing a
 *                 representDocument() method (e.g. tf, tf-idf) to represent
 *                 documents.
 * @param {Object} relevanceFunction An object providing a computeRelevance()
 *                 method (e.g. cosine similarity) to compute the relevance
 *                 between document representations.
 * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
 * @constructor
 * @class Represents documents as formal representations
 */
de.aitools.js.RetrievalModel =
    function (representationFunction, relevanceFunction) {

  CHECK_TYPE(representationFunction.representDocument,
             "function",
             "no representation function given");
  CHECK_TYPE(relevanceFunction.computeRelevance,
             "function",
             "no relevance function given");

  /**
   * Delegates to the representation function given on construction.
   * @param {Object} document The document (or whatever input the configured
   *                 representation function expects) to represent.
   * @returns The result of representationFunction.representDocument().
   */
  this.representDocument = function (document) {
    CHECK_NOT_UNDEFINED(document, "No document defined");
    return representationFunction.representDocument(document);
  };

  /**
   * Delegates to the relevance function given on construction.
   * @param docRepresentation1 first document representation
   * @param docRepresentation2 second document representation
   * @returns The result of relevanceFunction.computeRelevance().
   */
  this.computeRelevance = function (docRepresentation1, docRepresentation2) {
    CHECK_NOT_UNDEFINED(docRepresentation1,
                        "Document representation 1 not defined");
    CHECK_NOT_UNDEFINED(docRepresentation2,
                        "Document representation 2 not defined");
    return relevanceFunction.computeRelevance(docRepresentation1,
                                              docRepresentation2);
  };
};

// RetrievalModel.js
////////////////////////////////////////////////////////////////////////////////