1 
  2 ////////////////////////////////////////////////////////////////////////////////
  3 /**
  4  * @fileOverview Copyright (C) 2009 www.webis.de
  5  * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
  6  * @version 0.1
  7  */
  8 
  9 /**
 10  * Implementation of CosineSimilarity as relevance function (rho) to be used in
 11  * a retrieval model
 12  * @see de.aitools.js.RetrievalModel
 13  * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
 14  * @class Implementation of CosineSimilarity
 15  * @constructor
 16  */
 17 de.aitools.js.CosineSimilarity = (function () {
 18   return {
 19     /**
 20      * Computes the cosine similarity for the given two vectors and returns it
 21      * as number value between 1 (high similarity) and 0 (no similarity). The
 22      * cosinus similarity is computed by normalizing the given vectors and
 23      * compute their scalar product. Note that the given vectors will be
 24      * modified (normalized) if they are not already normalized.
 25      * @param {de.aitools.js.Vector} vector1 first vector
 26      * @param {de.aitools.js.Vector} vector2 second vector
 27      * @returns {number} cosinus similarity value
 28      */
 29     computeRelevance : function (vector1,vector2) {
 30       // to prevent changes in original vector (reference) a clone could be used
 31       // but this will slow down the process!
 32       if (!vector1.isNormalized()) {
 33         // vector1 = vector1.clone();
 34         vector1.normalize();
 35       }
 36       if (!vector2.isNormalized()) {
 37         // vector2 = vector2.clone();
 38         vector2.normalize();
 39       }
 40       return vector1.scalarProduct(vector2);
 41     }
 42   };
 43 });
 44 
 45 // RelevanceFunctions.js
 46 ////////////////////////////////////////////////////////////////////////////////
 47 
 48 ////////////////////////////////////////////////////////////////////////////////
 49 // RepresentationFunctions.js
 50 // Copyright (C) 2009 www.webis.de
 51 
 52 /**
 53  * Implementation of TermFrequency as representation function to be used in a
 54  * retrieval model 
 55  * @see de.aitools.js.RetrievalModel
 56  * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
 57  * @constructor
 58  * @class Implementation of TermFrequency
 59  */
 60 de.aitools.js.TermFrequency = (function () {
 61   
 62   /* private */
 63   var voc = new de.aitools.js.Vocabulary(); // holds the vocabulary
 64   var backStemmingMap = {}; // holds shortest term for every stem
 65   var stemming = false;
 66   var removeStopwords = false;
 67   var dynamicStopwords = {}; // holds extra stopwords, e.g. search terms
 68   
 69   // methods
 70 
 71   /* public */
 72   
 73   return {
 74     
 75     /**
 76      * Activate or deactivate stemming.
 77      * @param {boolean} bool Activate (true) or deactivate (false) stemming.
 78      */
 79     setStemming : function (bool) { stemming = bool; },
 80     
 81     /**
 82      * Activate or deactivate stopword removal.
 83      * @param {boolean} bool Activate (true) or deactivate (false) stopword
 84      *                       removal.
 85      */
 86     setRemoveStopwords : function (bool) { removeStopwords = bool; },
 87     
 88     /*
 89      * Set extra stopwords to extend standard stopword list (e.g. to filter
 90      * search terms when representing search engine snippets).
 91      * @param {Array} wordsArray List of words to extend stopword list.
 92      */
 93     setDynamicStopwords : function (wordsArray) {
 94       dynamicStopwords = Array.toAssocArray(wordsArray);
 95     },
 96     
 97     /**
 98      * Set a vocabulary to use in document
 99      * representation process. Without specifying a vocabulary by this function,
100      * a empty vocabulary is used. The vocabulary will be modified, i.e. new
101      * terms will be added.
102      * @see de.aitools.js.Vocabulary
103      * @param {de.aitools.js.Vocabulary} vocabulary The vocabulary to use.
104      */
105     setVocabulary : function (vocabulary) { voc = vocabulary; },
106     
107     /**
108      * Returns the vocabulary that is created
109      * while document representation.
110      * @see de.aitools.js.Vocabulary
111      * @returns {de.aitools.js.Vocabulary} the vocabulary
112      */
113     getVocabulary : function () { return voc; },
114     
115     /**
116      * Returns the backstemming map that is created while document
117      * representation.
118      * @returns {Object} the backstemming map
119      */
120     getBackStemming : function () { return backStemmingMap; },
121     
122     /**
123      * Creates and returns a vector space representation with the term frequency
124      * model. The representation is a Vector 
125      * @see de.aitools.js.Vector
126      * @param {de.aitools.js.Document} document The document to represent.
127      * @returns {de.aitools.js.Vector} A vector space representation of the
128      *                                 document.
129      */
130     representDocument : function (document) {
131       var termFrequencyMap = {};
132       var termList = de.aitools.js.decompose(document.getPlaintext());
133       for (var i = 0; i < termList.length; ++i) {
134         term = termList[i].toLowerCase();
135         // Workaround: String.split() returns an empty string if the last
136         // character is a delimiter, one-character "words" are mostly special
137         // characters or numbers and should not be considered.
138         if (term.length < 2) { continue; }
139         
140         if (removeStopwords) {
141           if (stopwords.english[term] !== undefined) { continue; }
142           if (dynamicStopwords[term] !== undefined) { continue; }
143         }
144         
145         if (stemming) {
146           var stem = PorterStemmer.stem(term);
147           // use shortest term for backstemming
148           if (backStemmingMap[stem] === undefined ||
149               backStemmingMap[stem].length > term) {
150             backStemmingMap[stem] = term;
151           }
152           term = stem;
153         }
154         if (termFrequencyMap[term] !== undefined) {
155           termFrequencyMap[term] += 1;
156         } else {
157           termFrequencyMap[term] = 1;
158         }
159       }
160       var vec = new de.aitools.js.Vector();
161       for (var term in termFrequencyMap) {
162         if (termFrequencyMap.hasOwnProperty(term)) {
163           if (!voc.contains(term)) {
164             voc.add(term);
165           }
166           vec.setValue(voc.getId(term), termFrequencyMap[term]);
167         }
168       }
169       return vec;
170     }
171   };
172 });
173 
/**
 * Implementation of TermFrequency-InverseDocumentFrequency as representation
 * function to be used in a retrieval model<br/>
 * The weight of a term in a document is its term frequency divided by its
 * document frequency (tf / df); the document frequency is counted over the
 * documents handed to a single representDocument() call.<br/>
 * TODO: the implementation was flagged as not working properly; the document
 * frequency counting and the backstemming comparison have been corrected, the
 * resulting weights still need to be verified.
 * @see de.aitools.js.RetrievalModel
 * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
 * @constructor
 * @class Implementation of TermFrequency-InverseDocumentFrequency
 */
de.aitools.js.TermFrequencyInverseDocumentFrequency = (function () {

  /* private */
  var voc = new de.aitools.js.Vocabulary(); // holds the vocabulary
  var backStemmingMap = {}; // holds shortest term for every stem
  var stemming = false;
  var removeStopwords = false;
  // NOTE: dynamic stopwords (see de.aitools.js.TermFrequency) are not
  // supported by this representation function.

  /**
   * Tells whether the given lookup object has a defined entry of its own for
   * the given key. A plain "map[key] !== undefined" test also matches members
   * inherited from Object.prototype, so that e.g. the word "constructor"
   * would be mistaken for an existing entry.
   * @param {Object} map The lookup object.
   * @param {string} key The key to look up.
   * @returns {boolean} true if map has a defined own property named key
   */
  function hasEntry(map, key) {
    return Object.prototype.hasOwnProperty.call(map, key) &&
           map[key] !== undefined;
  }

  /* public */

  return {

    /**
     * Activate or deactivate stemming.
     * @param {boolean} bool Activate (true) or deactivate (false) stemming.
     */
    setStemming : function (bool) { stemming = bool; },

    /**
     * Activate or deactivate stopword removal.
     * @param {boolean} bool Activate (true) or deactivate (false) stopword
     *                       removal.
     */
    setRemoveStopwords : function (bool) { removeStopwords = bool; },

    /**
     * Set a vocabulary to use in document representation process. Without
     * specifying a vocabulary by this function, an empty vocabulary is used.
     * The vocabulary will be modified, i.e. new terms will be added.
     * @see de.aitools.js.Vocabulary
     * @param {de.aitools.js.Vocabulary} vocabulary The vocabulary to use.
     */
    setVocabulary : function (vocabulary) { voc = vocabulary; },

    /**
     * Returns the vocabulary that is created while document representation.
     * @see de.aitools.js.Vocabulary
     * @returns {de.aitools.js.Vocabulary} the vocabulary
     */
    getVocabulary : function () { return voc; },

    /**
     * Returns the backstemming map that is created while document
     * representation. It maps every stem to the shortest term seen so far
     * that was reduced to this stem.
     * @returns {Object} the backstemming map
     */
    getBackStemming : function () { return backStemmingMap; },

    /**
     * Creates and returns a vector space representation with the term frequency
     * inverse document model for each document in the given Array. The
     * representation is a Vector whose value at the vocabulary id of a term is
     * the number of occurrences of the term in the document divided by the
     * number of given documents that contain the term.
     * @see de.aitools.js.Vector
     * @param {Array} documentArray The documents to represent.
     * @returns {Array} An array of vector space representations of the
     *                  documents, in the order of the given documents.
     */
    representDocument : function (documentArray) {

      var termFrequencies = [];
      var documentRepresentations = [];
      var documentFrequencyMap = {};
      var doc, i, term, stem, termList, termFrequencyMap, tfRep, vec;

      // first pass: count term frequencies per document and document
      // frequencies over all given documents
      for (doc = 0; doc < documentArray.length; ++doc) {

        termFrequencyMap = {};
        termList =
          de.aitools.js.decompose(documentArray[doc].getPlaintext());

        for (i = 0; i < termList.length; ++i) {
          term = termList[i].toLowerCase();
          // Workaround: String.split() returns an empty string if the last
          // character is a delimiter, one-character "words" are mostly special
          // characters or numbers and should not be considered
          if (term.length < 2) { continue; }

          if (removeStopwords && hasEntry(stopwords.english, term)) {
            continue;
          }

          if (stemming) {
            stem = PorterStemmer.stem(term);
            // use shortest term for backstemming; the lengths have to be
            // compared, comparing a length with the term itself never holds
            if (!hasEntry(backStemmingMap, stem) ||
                backStemmingMap[stem].length > term.length) {
              backStemmingMap[stem] = term;
            }
            term = stem;
          }

          if (hasEntry(termFrequencyMap, term)) {
            termFrequencyMap[term] += 1;
          } else {
            termFrequencyMap[term] = 1;
            // First occurrence of the term in this document: this is the
            // only place where the document frequency may grow, otherwise
            // it would count occurrences instead of documents.
            if (hasEntry(documentFrequencyMap, term)) {
              documentFrequencyMap[term] += 1;
            } else {
              documentFrequencyMap[term] = 1;
            }
          }

          if (!voc.contains(term)) {
            voc.add(term);
          }
        }
        termFrequencies.push(termFrequencyMap);
      }

      // second pass: the document frequencies are complete now, so the
      // weighted vectors can be built
      for (i = 0; i < termFrequencies.length; ++i) {
        vec = new de.aitools.js.Vector();
        tfRep = termFrequencies[i];
        for (term in tfRep) {
          if (Object.prototype.hasOwnProperty.call(tfRep, term)) {
            vec.setValue(
              voc.getId(term),
              tfRep[term] / documentFrequencyMap[term]
            );
          }
        }
        documentRepresentations.push(vec);
      }
      return documentRepresentations;
    }
  };
});
291 
292 // RepresentationFunctions.js
293 ////////////////////////////////////////////////////////////////////////////////
294 
295 ////////////////////////////////////////////////////////////////////////////////
296 // RetrievalModel.js
297 // Copyright (C) 2009 www.webis.de
298 
/*
 RetrievalModel:
  Represents documents as computable document representations and can compute
  the relevance between document representations.
  Representation function, representDocument()
    in: Document
    out: document representation
    Example: TermFrequency, processes the plaintext of a Document and creates
    a Vector using the Vocabulary
  Relevance/similarity function, rho()
    in: document representation 1
    in: document representation 2
    out: relevance/similarity measure
    Example: CosineSimilarity, computes the similarity of two Vectors as a
             value between 0 and 1
*/
315 
/**
 * Couples a representation function with a relevance function: documents are
 * turned into formal representations (e.g. vector space representations) and
 * the relevance between two such representations can be calculated.
 * @param {Object} representationFunction A representation function (e.g. tf,
 *                                        tf-idf); its representDocument
 *                                        member is used to represent
 *                                        documents.
 * @param {Object} relevanceFunction A relevance function (e.g. cosine
 *                                   similarity); its computeRelevance member
 *                                   is used to compute the relevance between
 *                                   document representations.
 * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
 * @constructor
 * @class Represents documents as formal representations
 */
de.aitools.js.RetrievalModel =
  function (representationFunction, relevanceFunction) {

  var model = this;
  var representer = representationFunction;
  var rho = relevanceFunction;

  // Both collaborators are checked once, at construction time.
  CHECK_TYPE(representer.representDocument,
             "function",
             "no representation function given");
  CHECK_TYPE(rho.computeRelevance,
             "function",
             "no relevance function given");

  /**
   * Delegates to the representation function after checking that a document
   * was passed.
   * @param document The document to represent.
   * @returns Whatever the representation function returns for the document.
   */
  model.representDocument = function (doc) {
    CHECK_NOT_UNDEFINED(doc, "No document defined");
    return representer.representDocument(doc);
  };

  /**
   * Delegates to the relevance function after checking that both document
   * representations were passed.
   * @param docRepresentation1 The first document representation.
   * @param docRepresentation2 The second document representation.
   * @returns Whatever the relevance function returns for the pair.
   */
  model.computeRelevance = function (first, second) {
    CHECK_NOT_UNDEFINED(first,
                        "Document representation 1 not defined");
    CHECK_NOT_UNDEFINED(second,
                        "Document representation 2 not defined");
    return rho.computeRelevance(first, second);
  };
};
352 
353 // RetrievalModel.js
354 ////////////////////////////////////////////////////////////////////////////////
355