```  1
2 ////////////////////////////////////////////////////////////////////////////////
3 /**
4  * @fileOverview Copyright (C) 2009 www.webis.de
5  * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
6  * @version 0.1
7  */
8
/**
 * Cosine similarity, usable as the relevance function (rho) of a retrieval
 * model.
 * @see de.aitools.js.RetrievalModel
 * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
 * @class Implementation of CosineSimilarity
 * @constructor
 */
de.aitools.js.CosineSimilarity = (function () {

  /**
   * Normalizes the given vector in place, unless the vector itself reports
   * that it is already normalized.
   * @param {de.aitools.js.Vector} vector the vector to normalize if needed
   */
  var ensureNormalized = function (vector) {
    if (vector.isNormalized()) { return; }
    vector.normalize();
  };

  return {
    /**
     * Computes the cosine similarity of two vectors: both vectors are
     * normalized (if they are not already) and their scalar product is
     * returned. The result is documented as a number between 1 (high
     * similarity) and 0 (no similarity).
     * Note that the vectors passed in are modified (normalized) when they
     * are not normalized yet.
     * @param {de.aitools.js.Vector} vector1 first vector
     * @param {de.aitools.js.Vector} vector2 second vector
     * @returns {number} cosine similarity value
     */
    computeRelevance : function (vector1, vector2) {
      // The caller's objects are normalized directly. Working on clones
      // would leave the originals untouched, but would slow the process down.
      ensureNormalized(vector1);
      ensureNormalized(vector2);
      return vector1.scalarProduct(vector2);
    }
  };
});
44
45 // RelevanceFunctions.js
46 ////////////////////////////////////////////////////////////////////////////////
47
48 ////////////////////////////////////////////////////////////////////////////////
49 // RepresentationFunctions.js
50 // Copyright (C) 2009 www.webis.de
51
/**
 * Implementation of TermFrequency as representation function to be used in a
 * retrieval model.
 * @see de.aitools.js.RetrievalModel
 * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
 * @constructor
 * @class Implementation of TermFrequency
 */
de.aitools.js.TermFrequency = (function () {

  /* private */
  var voc = new de.aitools.js.Vocabulary(); // holds the vocabulary
  var backStemmingMap = {}; // holds shortest term for every stem
  var stemming = false;
  var removeStopwords = false;
  var dynamicStopwords = {}; // holds extra stopwords, e.g. search terms

  /**
   * Own-property test that ignores keys inherited from Object.prototype, so
   * that terms such as "constructor" are treated like any other term.
   * @param {Object} map The object used as a map.
   * @param {string} key The key to look up.
   * @returns {boolean} true if the map itself holds the key.
   */
  var hasOwn = function (map, key) {
    return Object.prototype.hasOwnProperty.call(map, key);
  };

  /* public */

  return {

    /**
     * Activate or deactivate stemming.
     * @param {boolean} bool Activate (true) or deactivate (false) stemming.
     */
    setStemming : function (bool) { stemming = bool; },

    /**
     * Activate or deactivate stopword removal.
     * @param {boolean} bool Activate (true) or deactivate (false) stopword
     *                       removal.
     */
    setRemoveStopwords : function (bool) { removeStopwords = bool; },

    /**
     * Set extra stopwords to extend the standard stopword list (e.g. to filter
     * search terms when representing search engine snippets). They are only
     * applied while stopword removal is activated.
     * @param {Array} wordsArray List of words to extend stopword list.
     */
    setDynamicStopwords : function (wordsArray) {
      dynamicStopwords = Array.toAssocArray(wordsArray);
    },

    /**
     * Set a vocabulary to use in the document representation process. Without
     * specifying a vocabulary by this function, an empty vocabulary is used.
     * The vocabulary will be modified, i.e. new terms will be added.
     * @see de.aitools.js.Vocabulary
     * @param {de.aitools.js.Vocabulary} vocabulary The vocabulary to use.
     */
    setVocabulary : function (vocabulary) { voc = vocabulary; },

    /**
     * Returns the vocabulary that is used during document representation.
     * @see de.aitools.js.Vocabulary
     * @returns {de.aitools.js.Vocabulary} the vocabulary
     */
    getVocabulary : function () { return voc; },

    /**
     * Returns the backstemming map (stem -> shortest original term seen so
     * far) that is filled during document representation when stemming is
     * activated.
     * @returns {Object} the backstemming map
     */
    getBackStemming : function () { return backStemmingMap; },

    /**
     * Creates and returns a vector space representation with the term
     * frequency model: the plaintext of the document is decomposed into
     * terms, the terms are lower-cased, optionally filtered against the
     * stopword lists and optionally stemmed, and the number of occurrences
     * of every remaining term is stored in a Vector under the id the
     * vocabulary reports for that term.
     * @see de.aitools.js.Vector
     * @param {de.aitools.js.Document} document The document to represent.
     * @returns {de.aitools.js.Vector} A vector space representation of the
     *                                 document.
     */
    representDocument : function (document) {
      var termFrequencyMap = {};
      var termList = de.aitools.js.decompose(document.getPlaintext());
      var term;
      var stem;
      var i;

      for (i = 0; i < termList.length; ++i) {
        term = termList[i].toLowerCase();
        // Workaround: String.split() returns an empty string if the last
        // character is a delimiter, one-character "words" are mostly special
        // characters or numbers and should not be considered.
        if (term.length < 2) { continue; }

        if (removeStopwords) {
          if (stopwords.english[term] !== undefined) { continue; }
          if (dynamicStopwords[term] !== undefined) { continue; }
        }

        if (stemming) {
          stem = PorterStemmer.stem(term);
          // Use the shortest term for backstemming. The lengths have to be
          // compared with each other; comparing a length with the term
          // string itself never replaces a stored entry.
          if (!hasOwn(backStemmingMap, stem) ||
              backStemmingMap[stem].length > term.length) {
            backStemmingMap[stem] = term;
          }
          term = stem;
        }

        if (hasOwn(termFrequencyMap, term)) {
          termFrequencyMap[term] += 1;
        } else {
          termFrequencyMap[term] = 1;
        }
      }

      var vec = new de.aitools.js.Vector();
      for (term in termFrequencyMap) {
        if (hasOwn(termFrequencyMap, term)) {
          if (!voc.contains(term)) {
            // NOTE(review): this branch is empty in the source at hand.
            // Presumably the unknown term should be registered in the
            // vocabulary here - confirm against de.aitools.js.Vocabulary.
          }
          vec.setValue(voc.getId(term), termFrequencyMap[term]);
        }
      }
      return vec;
    }
  };
});
173
/**
 * Implementation of TermFrequency-InverseDocumentFrequency as representation
 * function to be used in a retrieval model<br/>
 * TODO: verify results against a reference implementation; the vocabulary
 * registration step inside representDocument is empty in this source.
 * @see de.aitools.js.RetrievalModel
 * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
 * @constructor
 * @class Implementation of TermFrequency-InverseDocumentFrequency
 */
de.aitools.js.TermFrequencyInverseDocumentFrequency = (function () {

  /* private */
  var voc = new de.aitools.js.Vocabulary(); // holds the vocabulary
  var backStemmingMap = {}; // holds shortest term for every stem
  var stemming = false;
  var removeStopwords = false;
  // Dynamic (extra) stopwords are not supported by this representation
  // function; the corresponding code was disabled in the original.

  /**
   * Own-property test that ignores keys inherited from Object.prototype, so
   * that terms such as "constructor" are treated like any other term.
   * @param {Object} map The object used as a map.
   * @param {string} key The key to look up.
   * @returns {boolean} true if the map itself holds the key.
   */
  var hasOwn = function (map, key) {
    return Object.prototype.hasOwnProperty.call(map, key);
  };

  return {

    /**
     * Activate or deactivate stemming.
     * @param {boolean} bool Activate (true) or deactivate (false) stemming.
     */
    setStemming : function(bool) { stemming = bool; },

    /**
     * Activate or deactivate stopword removal.
     * @param {boolean} bool Activate (true) or deactivate (false) stopword
     *                       removal.
     */
    setRemoveStopwords : function(bool) { removeStopwords = bool; },

    /**
     * Set the vocabulary to use in the document representation process.
     * @see de.aitools.js.Vocabulary
     * @param {de.aitools.js.Vocabulary} vocabulary The vocabulary to use.
     */
    setVocabulary : function(vocabulary) { voc = vocabulary; },

    /**
     * Returns the vocabulary that is used during document representation.
     * @see de.aitools.js.Vocabulary
     * @returns {de.aitools.js.Vocabulary} the vocabulary
     */
    getVocabulary : function() { return voc; },

    /**
     * Returns the backstemming map (stem -> shortest original term seen so
     * far) that is filled during document representation when stemming is
     * activated.
     * @returns {Object} the backstemming map
     */
    getBackStemming : function() { return backStemmingMap; },

    /**
     * Creates and returns a vector space representation for each document in
     * the given Array. The value stored for a term is its number of
     * occurrences in the document divided by the number of documents of the
     * given Array that contain the term.
     * NOTE(review): this is a plain tf/df ratio, not the logarithmic idf
     * weighting - confirm that this is the intended weighting.
     * @see de.aitools.js.Vector
     * @param {Array} documentArray The documents to represent.
     * @returns {Array} An array of vector space representations of the
     *                  documents, in the order of documentArray.
     */
    representDocument : function(documentArray) {

      var termFrequencys = [];          // one term -> count map per document
      var documentRepresentations = [];
      var documentFrequencyMap = {};    // term -> number of documents with it
      var termFrequencyMap;
      var termList;
      var term;
      var stem;
      var doc;
      var i;

      for (doc = 0; doc < documentArray.length; ++doc) {

        termFrequencyMap = {};
        termList =
          de.aitools.js.decompose(documentArray[doc].getPlaintext());
        for (i = 0; i < termList.length; ++i) {
          term = termList[i].toLowerCase();
          // Workaround: String.split() returns an empty string if the last
          // character is a delimiter, one-character "words" are mostly special
          // characters or numbers and should not be considered
          if (term.length < 2) { continue; }

          if (removeStopwords) {
            if (stopwords.english[term] !== undefined) { continue; }
          }

          if (stemming) {
            stem = PorterStemmer.stem(term);
            // Use the shortest term for backstemming. The lengths have to be
            // compared with each other; comparing a length with the term
            // string itself never replaces a stored entry.
            if (!hasOwn(backStemmingMap, stem) ||
                backStemmingMap[stem].length > term.length) {
              backStemmingMap[stem] = term;
            }
            term = stem;
          }

          if (hasOwn(termFrequencyMap, term)) {
            termFrequencyMap[term] += 1;
          } else {
            termFrequencyMap[term] = 1;
            // First occurrence of the term in this document: only now does
            // the document count for the term's document frequency. Counting
            // on every occurrence would turn the document frequency into the
            // total number of occurrences over all documents.
            if (hasOwn(documentFrequencyMap, term)) {
              documentFrequencyMap[term] += 1;
            } else {
              documentFrequencyMap[term] = 1;
            }
          }

          if (!voc.contains(term)) {
            // NOTE(review): this branch is empty in the source at hand.
            // Presumably the unknown term should be registered in the
            // vocabulary here - confirm against de.aitools.js.Vocabulary.
          }
        }
        termFrequencys.push(termFrequencyMap);
      }

      var vec;
      var tfRep;
      for (i = 0; i < termFrequencys.length; ++i) {
        vec = new de.aitools.js.Vector();
        tfRep = termFrequencys[i];
        for (term in tfRep) {
          if (hasOwn(tfRep, term)) {
            vec.setValue(
              voc.getId(term),
              tfRep[term] / documentFrequencyMap[term]
            );
          }
        }
        documentRepresentations.push(vec);
      }
      return documentRepresentations;
    }
  };
});
291
292 // RepresentationFunctions.js
293 ////////////////////////////////////////////////////////////////////////////////
294
295 ////////////////////////////////////////////////////////////////////////////////
296 // RetrievalModel.js
297 // Copyright (C) 2009 www.webis.de
298
/*
 RetrievalModel:
  Represents documents as computable document representations and can
  compute the relevance between document representations.
  Representation function, representDocument()
    in: Document
    out: document representation
    Example: TermFrequency, processes the plaintext of a Document and creates
    a Vector using the Vocabulary
  Relevance/similarity function, rho()
    in: document representation 1
    in: document representation 2
    out: relevance/similarity measure
    Example: CosineSimilarity, computes the similarity of two Vectors as a
             value between 0 and 1
*/
315
/**
 * Turns documents into formal representations (e.g. vector space
 * representations) and calculates the relevance between such
 * representations. Both tasks are delegated to the two objects handed to the
 * constructor.
 * @param {Object} representationFunction A representation function (e.g. tf,
 *                                        tf-idf); an object providing a
 *                                        representDocument method.
 * @param {Object} relevanceFunction A relevance function (e.g. cosine
 *                                   similarity); an object providing a
 *                                   computeRelevance method.
 * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
 * @constructor
 * @class Represents documents as formal representations
 */
de.aitools.js.RetrievalModel =
  function (representationFunction, relevanceFunction) {

  // Both collaborators must expose the method this model delegates to.
  // NOTE(review): CHECK_TYPE is defined elsewhere; presumably it raises an
  // error carrying the given message when the type does not match - confirm.
  CHECK_TYPE(
    representationFunction.representDocument,
    "function",
    "no representation function given"
  );
  CHECK_TYPE(
    relevanceFunction.computeRelevance,
    "function",
    "no relevance function given"
  );

  /**
   * Hands the document over to the configured representation function.
   * @param doc The document to represent.
   * @returns Whatever the representation function returns for the document.
   */
  this.representDocument = function (doc) {
    CHECK_NOT_UNDEFINED(doc, "No document defined");
    var representation = representationFunction.representDocument(doc);
    return representation;
  };

  /**
   * Hands two document representations over to the configured relevance
   * function.
   * @param first The first document representation.
   * @param second The second document representation.
   * @returns Whatever the relevance function returns for the two
   *          representations.
   */
  this.computeRelevance = function (first, second) {
    CHECK_NOT_UNDEFINED(first, "Document representation 1 not defined");
    CHECK_NOT_UNDEFINED(second, "Document representation 2 not defined");
    var relevance = relevanceFunction.computeRelevance(first, second);
    return relevance;
  };
};
352
353 // RetrievalModel.js
354 ////////////////////////////////////////////////////////////////////////////////
355 ```