1 
  2 ////////////////////////////////////////////////////////////////////////////////
  3 /**
  4  * @fileOverview Copyright (C) 2009 www.webis.de<br/>
  5  * <ul>
  6  * <li>Implementation of the clustering data structure</li>
  7  * <li>Implementation of generic cluster algorithm wrapper</li>
  8  * <li>Implemention of the MajorClust algorithmus using a singleton pattern</li>
  9  * <li>Implementation of cluster labeling using Weighted Centroid Covering</li>
 10  * </ul>
 11  * @author Alexander Kuemmel alexander.kuemmel@uni-weimar.de
 12  * @author Christian Fricke christian.fricke@uni-weimar.de
 13  * @version 0.1
 14  */
 15  
 16 /**
 17  * Implementation of the clustering data structure
 18  * @class Implementation of the clustering data structure
 19  * @constructor
 20  * @param {Object} nodeIdClusterMapping mapping: key[nodeId]=>value[clusterId]
 21  * @author Christian Fricke christian.fricke@uni-weimar.de
 22  */
 23 de.aitools.js.Clustering = function (nodeIdClusterMapping) {
 24   CHECK_TYPE(nodeIdClusterMapping, "object", "clustering parameter incorrect.");
 25   /**
 26    * @private
 27    * @type Number
 28    */
 29   var clusterCount = 0;
 30   /**
 31    * @private
 32    * @type mapping: key[nodeId]=>value[clusterId]
 33    */
 34   var clusterResults = {};
 35   /**
 36    * @private
 37    * @type mapping: key[clusterId]=>array[nodeIds]
 38    */
 39   var itemsOfCluster = {};
 40    
 41   /**
 42    * Method to get the total cluster count of clustering.
 43    * @public
 44    * @returns {Number} Total count of clusters within clustering
 45    */
 46   this.getClusterCount = function () {
 47     return clusterCount;
 48   };
 49   
 50   /**
 51    * Method to get the cluster id that a given node id belongs to.
 52    * @public
 53 	 * @param {Number} nodeId The node id to be used
 54    * @returns {Number} Cluster id the given node id belongs to
 55    */
 56   this.getClusterOfItem = function (nodeId) {
 57     return clusterResults[nodeId];
 58   };
 59   
 60   /**
 61    * Method to get the size of a cluster.
 62    * @public
 63 	 * @param {Number} clusterId The cluster id to be used
 64    * @returns {Number} Size of desired cluster
 65    */
 66   this.getSizeOfCluster = function (clusterId) {
 67     return itemsOfCluster[clusterId].length;
 68   };
 69   
 70   /**
 71    * Method to get the node ids of a cluster.
 72    * @public
 73 	 * @param {Number} clusterId The cluster id to be used
 74    * @returns {Array} Array of all nodeIds in the cluster
 75    */
 76   this.getItemsOfCluster = function (clusterId) {
 77     CHECK_NOT_UNDEFINED(itemsOfCluster[clusterId], "clusterId doesn't exist.");
 78     return itemsOfCluster[clusterId];
 79   };
 80   
 81   /**
 82    * Constructor function.
 83    */
 84   (function() {
 85     // each clusterId is unused at first
 86     var used = [];
 87     for (var j in nodeIdClusterMapping) {
 88       if (nodeIdClusterMapping.hasOwnProperty(j)) {
 89         used[nodeIdClusterMapping[j]] = false;
 90       }
 91     }
 92     
 93     // each existing clusterId is assigned a new one
 94     var idCounter = 0;
 95     var newId;
 96     var newIds = [];
 97     for (var k in nodeIdClusterMapping) {
 98       if (nodeIdClusterMapping.hasOwnProperty(k)) {
 99         if (!used[nodeIdClusterMapping[k]]) {
100           used[nodeIdClusterMapping[k]] = true;
101           newId = idCounter;
102           ++idCounter;
103         } else { newId = newIds[nodeIdClusterMapping[k]]; }
104       newIds[nodeIdClusterMapping[k]] = newId;
105       }
106     }
107     
108     // each nodeId is assigned the newly created clusterId,
109     // each nodeId is appended to the array of the appropriate clusterId
110     for (var l in nodeIdClusterMapping) {
111       if (nodeIdClusterMapping.hasOwnProperty(l)) {
112         clusterResults[l] = newIds[nodeIdClusterMapping[l]];
113         // create new clusterId if it doesn't exist already
114         if (typeof(itemsOfCluster[clusterResults[l]]) === "undefined") {
115           itemsOfCluster[clusterResults[l]] = [];
116         }
117         // store nodeId in array belonging to appropriate clusterId
118         itemsOfCluster[clusterResults[l]].push(l);
119       }
120     }
121     
122     clusterCount = idCounter;
123   })();
124 };
125 
126 ////////////////////////////////////////////////////////////////////////////////
127 
128 ////////////////////////////////////////////////////////////////////////////////
129 /**
130  * Implementation of ClusterAlgorithm - algorithms are implemented as
131  * a singleton patterns with an input: Graph and output: Clustering.<br/>
132  * Example:<br/>
133  *  MajorClust, creates a clustering through comparing weights
134  *  of single clusters / node connections.
135  * Usage:<br/>
136  *  de.aitools.js.MajorClust.setThreshold(value);<br/>
137  *  var MajorClust = new de.aitools.js.MajorClust();<br/>
138  *  var ca = new de.aitools.js.ClusterAlgorithm(MajorClust);<br/>
139  *  var cluster = ca.computeClustering(graph);
140  * @class Implementation of ClusterAlgorithm
141  * @constructor
142  * @param {Object} nodeIdClusterMapping mapping: key[nodeId]=>value[clusterId]
143  * @author Christian Fricke christian.fricke@uni-weimar.de
144  */
145 de.aitools.js.ClusterAlgorithm = function (clusteringFunction) {
146   CHECK_TYPE(clusteringFunction.createClustering, "function",
147              "no clustering function given.");
148   
149   /**
150    * Method to compute a clustering using a graph.
151    * @public
152 	 * @param {de.aitools.js.Graph} graph The graph to be clusterd
153    * @returns {de.aitools.js.Clustering} A clustering
154    */
155   this.computeClustering = function (graph) {
156     CHECK_NOT_UNDEFINED(graph, "No graph defined");
157     return clusteringFunction.createClustering(graph);
158   };
159 };
160 
161 ////////////////////////////////////////////////////////////////////////////////
162 
163 ////////////////////////////////////////////////////////////////////////////////
164 /**
165  * Implemention of the MajorClust algorithm using a singleton pattern.
166  * @class Implemention of MajorClust
167  * @constructor
168  * @author Christian Fricke christian.fricke@uni-weimar.de
169  */
170 de.aitools.js.MajorClust = (function () {
171   
172   /**
173    * @private
174    * @type Number
175    */
176   var threshold = 0;
177   
178   /**
179    * Constructor function.
180    */
181   return {
182     /**
183      * Method to set the threshold.
184      * @public
185      * @static
186      * @param {Number} value Threshold to be set
187      */
188     setThreshold : function (value) {
189       CHECK_TYPE(value, "number", "input is not of type number");
190       threshold = value;
191     },
192     
193     /**
194      * Method to create the clustering.
195      * @public
196      * @static
197      * @param {de.aitools.js.Graph} graph The graph to be clustered
198      * @returns {de.aitools.js.Clustering} A complete clustering.
199      */
200     createClustering : function (graph) {
201       // initial clustering: each vertex in its own cluster
202       // mapping: key[nodeId]=>value[clusterId]
203       var nodesToCluster = graph.getNodes();
204       var size = graph.getNodeCount();
205       var nodeIdClusterMapping = {};
206       for (var c = 0; c < size; c++) {
207         nodeIdClusterMapping[nodesToCluster[c]] = c;
208       }
209       
210       // loop until nothing is changed due to relocation
211       var terminate = false;
212       while (!terminate) {
213         terminate = true;
214         
215         // create random permutation of node ids.
216         // different permutations may produce different results
217         var nodeIdPermutation = graph.getNodes();
218         for (var i = 0; i < size; i++) {
219           var first  = Math.floor(Math.random() * (size));
220           var second = Math.floor(Math.random() * (size));
221           var temp = nodeIdPermutation[first];
222           nodeIdPermutation[first] = nodeIdPermutation[second];
223           nodeIdPermutation[second] = temp;
224         }
225         
226         // subloop for each individual node
227         for (var nodeIndex = 0; nodeIndex < size; nodeIndex++) {
228           var currentNodeId = nodeIdPermutation[nodeIndex];
229           
230           // reset sums
231           var sumOfEdgeWeights = {};
232           for (c = 0; c < size; c++) {
233             sumOfEdgeWeights[c] = 0;
234           }
235           
236           // sum all edges going towards same clusters
237           var adjacentNodes = graph.getAdjacentNodes(currentNodeId);
238           for (var j = 0; j < adjacentNodes.length; j++) {
239             var clusterId = nodeIdClusterMapping[adjacentNodes[j]];
240             var edgeWeight = graph.getEdgeWeight(currentNodeId, adjacentNodes[j]);
241             if (edgeWeight >= threshold) {
242               // performance: one could set the sumOfEdgeWeights here,
243               // instead of initialising every clusterId with 0.
244               sumOfEdgeWeights[clusterId] += edgeWeight;
245             }
246           }
247           
248           // determine the cluster that belongs to the biggest sum
249           var newClusterId = nodeIdClusterMapping[currentNodeId];
250           var maxWeight = 0;
251           for (var k in sumOfEdgeWeights) {
252             if (sumOfEdgeWeights.hasOwnProperty(k)) {
253               if (sumOfEdgeWeights[k] > maxWeight) {
254                 newClusterId = k;
255                 maxWeight = sumOfEdgeWeights[k];
256               }
257             }
258           }
259           
260           // set new clusterId
261           if (nodeIdClusterMapping[currentNodeId] != newClusterId) {
262             nodeIdClusterMapping[currentNodeId] = newClusterId;
263             terminate = false;
264           }
265         }
266       }
267       var clustering = new de.aitools.js.Clustering(nodeIdClusterMapping);
268       delete this.nodeIdClusterMapping;
269       delete this.nodesToCluster;
270       return clustering;
271     }
272   };
273 })
274 
275 ////////////////////////////////////////////////////////////////////////////////
276 
277 ////////////////////////////////////////////////////////////////////////////////
278 /**
279  * Implemention of a randomized clustering algorithm using a singleton pattern.
280  * Note: Only used for testing the quality against other algorithms.
281  * @class Implemention of a randomized clustering algorithm
282  * @constructor
283  * @author Christian Fricke christian.fricke@uni-weimar.de
284  */
285 de.aitools.js.RandomClust = (function () {
286   /**
287    * Constructor function.
288    */
289   return {
290     /**
291      * Method to create the clustering.
292      * @public
293      * @static
294      * @param {de.aitools.js.Graph} graph The graph to be clustered
295      * @returns {de.aitools.js.Clustering} A complete clustering
296      */
297     createClusteringClustering : function (graph) {
298       print("fubar");
299       // initial clustering: each vertex in its own cluster
300       // mapping: key[nodeId]=>value[clusterId]
301       var clusterSize = 40;
302       var nodesToCluster = graph.getNodes();
303       var size = graph.getNodeCount();
304       var nodeIdClusterMapping = {};
305       for (var c = 0; c < size; c++) {
306         nodeIdClusterMapping[nodesToCluster[c]] =
307         Math.floor(Math.random() * clusterSize);
308       }
309       
310       var clustering = new de.aitools.js.Clustering(nodeIdClusterMapping);
311       delete this.nodeIdClusterMapping;
312       delete this.nodesToCluster;
313       return clustering;
314     }
315   };
316 })
317 
318 ////////////////////////////////////////////////////////////////////////////////
319 
320 ////////////////////////////////////////////////////////////////////////////////
321 ///**
322 // * Mapping of ids of the document representation, currently not in use.
323 // * @static
324 // * @author Alexander Kuemmel alexander.kuemmel@uni-weimar.de
325 // * @author Christof Braeutigam christof.braeutigam@uni-weimar.de
326 // * @deprecated
327 // */
328 //de.aitools.js.RepresentationMap = {
329 //  
330 //  size_ : 0,
331 //  
332 //  put : function (id, documentRepresentation) {
333 //    CHECK_NOT_UNDEFINED(id, "id is undefined");
334 //    CHECK_NOT_UNDEFINED(
335 //      documentRepresentation,
336 //      "documentRepresentation is undefined"
337 //    );
338 //    CHECK_NOT_NULL(id, "id is undefined");
339 //    CHECK_NOT_NULL(documentRepresentation,
340 //      "documentRepresentation is undefined"
341 //    );
342 //    this[id] = documentRepresentation;
343 //    this.size_ += 1;
344 //  },
345 //  
346 //  get : function (id) {
347 //    CHECK_NOT_UNDEFINED(id, "id is undefined");
348 //    return this[id];
349 //  },
350 //  
351 //  size : function () {
352 //    return this.size_;
353 //  }
354 //};
355 //
356 ////////////////////////////////////////////////////////////////////////////////
357 
358 ////////////////////////////////////////////////////////////////////////////////
359 
360 if (typeof(de.aitools.js.ClusterLabeling) === "undefined") {
361   /**
362    * @static
363    * @namespace de.aitools.js.ClusterLabeling contains labeling algorithms
364    */
365   de.aitools.js.ClusterLabeling = {};
366 }
367 
368 /**
369  * Implementation of the clustering data structure
370  * @class Implementation of the clustering data structure
371  * @constructor
372  * @param {Array} documentRepresentations Mapping: key[nodeId]=>value[Vector]
373  * @param {de.aitools.js.Clustering} clustering
374  * @param {Number} labelCount Number of labels per cluster
375  * @param {Number} occRate Relevance of occurrence of labels
376  *                         (first most common to occRate'th most common)
377  * @param {de.aitools.js.Vocabulary} vocabularyInstance
378  * @author Alexander Kuemmel alexander.kuemmel@uni-weimar.de
379  * @author Christian Fricke christian.fricke@uni-weimar.de
380  */
381 de.aitools.js.ClusterLabeling.WeightedCentroidCovering = function (
382   documentRepresentations, clustering, labelCount, occRate, vocabularyInstance)
383 {
384 
385   /**
386    * @private
387    */
388   var vocabulary = vocabularyInstance;
389     
390   /**
391    * Datastructure to hold term occurrence rates dependent on occRate. 
392    * termId => [(i)th occ. cluster, (i+1)th occ. cluster, ... , ]; 
393    * i = (0 .. occRate-1)
394    * @private
395    * @type Object
396    */
397   var termOccurrences = {};
398   
399   /**
400    * key[termID] => value[[clusterID,rate], ...]
401    * @private
402    */
403   var clusterRates = {};
404   
405   /**
406    * Sorts an array of tuples [[v1,v2],[v1,v2], ...]
407    * @private
408    * @param {Array} tuplesArr Tupel array to be sorted
409    * @param {Boolean} descOrder True or false
410    * @param {Number} byFirstValue Order by first or second element
411    */
412   var tupleSort = function (tuplesArr, descOrder, byFirstValue) {
413     if (typeof(byFirstValue) === "undefined") { byFirstValue = false; }
414     if (!byFirstValue) {
415       tuplesArr.sort(function(a,b) {
416         return (descOrder) ? b[1] - a[1] : a[1] - b[1];
417       });
418     } else { tuplesArr.sort(); }
419   };
420   
421   /**
422    * Computes the term frequency of a term within the given cluster.
423    * @private
424    * @param {Array} clusteredDocuments A set (array) of documents (nodeIds)
425    * @param {String} termId A term whose term frequency is relevant
426    * @returns {Number} The occurence frequency of the given term
427    */
428   var computeClusterTF = function (clusteredDocuments, termId) {
429     var frequency = 0;
430     // iterate over all vectors (document representations)
431     for (var n = 0; n < clusteredDocuments.length; n++) {
432       // get vector from documentRepresentations with supplied nodeIds
433       var node = documentRepresentations[clusteredDocuments[n]];
434       frequency += node.getValue(termId);
435     }
436     return frequency;
437   };
438   
439   /**
440    * Help function to build a lookup datastructure for faster computation.
441    * @private
442    */
443   var preComputation = function () {
444     var clusterCount = clustering.getClusterCount();
445     var vocSize = vocabulary.size();
446     for (var termId = 0; termId < vocSize; termId++) {
447       // iterate over all clusters; clustering is array of clusters
448       for (var clid = 0; clid < clusterCount; clid++) {
449         // compute term frequency over the cluster with id: clid
450         var tf_c = computeClusterTF(clustering.getItemsOfCluster(clid), termId);
451         // save tf values for later usage
452         if (typeof(clusterRates[termId]) === "undefined") {
453           clusterRates[termId] = [];
454         }
455         clusterRates[termId].push([clid,tf_c]);
456       }
457       
458       // setup termOccurrences
459       // sort in descending order, by second values (tf)
460       tupleSort(clusterRates[termId], true);
461       
462       var prioArr = [];
463       var minPrio = 0;
464       if (occRate < clusterRates[termId].length) { minPrio = occRate; }
465       else { minPrio = clusterRates[termId].length; }
466       
467       // save the i'th most common frequencies of termId
468       // fill the array of priorized label occurrences
469       for (var i = 0; i < minPrio; i++) {
470         // only push clusterId, no use for tf
471         prioArr.push(clusterRates[termId][i][0]);
472       }
473       termOccurrences[termId] = prioArr;
474     }
475   };
476    
477   /**
478    * Computes the term frequency of a term within the given cluster.
479    * @public
480    * @returns {Object} An object according to the following representation:<br/>
481    * {clusterId1 : [label1,label2, ..., label'labelCount], clusterId2 : ....,}
482    */
483   this.getLabeling = function () {
484     var T = [];
485     var labeling = {};
486     var vocSize = vocabulary.size();
487     for (var termId = 0; termId < vocSize; termId++) {
488       for (var i = 0; i < occRate; i++) {
489         var clusterId = termOccurrences[termId][i];
490         var tf = 0;
491         for (var k = 0; k < clusterRates[termId].length; k++) {
492           if (clusterRates[termId][k][0] === clusterId) {
493             tf = clusterRates[termId][k][1];
494             break;
495           }
496         }
497         var triple = [termId, tf, clusterId];
498         T.push(triple);
499       }
500     }
501     // sort triple [termId, tf, clusterId] according to tf, descending
502     tupleSort(T, true);
503     // build the labels
504     for (var lc = 1; lc <= labelCount; lc++) {
505       var assigned = 0;
506       var j = 0;
507       while (assigned < clustering.getClusterCount() && j < T.length) {
508         var triple_j = T[j];
509         if (typeof(labeling[triple_j[2]]) === "undefined") {
510           labeling[triple_j[2]] = [];
511         }
512         // check labels for cluster id
513         if (labeling[triple_j[2]].length < lc) {
514           // add label with termId in triple_j[0] to cluster with id triple_j[2]
515           labeling[triple_j[2]].push(triple_j[0]);
516           T.splice(j, 1);
517           ++assigned;
518         }
519         ++j;
520       }
521     }
522     return labeling;
523   };
524   
525   /**
526    * Constructor function.
527    */
528   (function() {
529     CHECK_NOT_UNDEFINED(
530       documentRepresentations,
531       "Document representations not defined"
532     );
533     CHECK_NOT_UNDEFINED(clustering, "Clustering not defined");
534     CHECK_NOT_UNDEFINED(labelCount, "Label count not defined");
535     CHECK_NOT_UNDEFINED(occRate, "Label occurrence rate not defined");
536     preComputation();
537   })();
538 };
539 
540 ////////////////////////////////////////////////////////////////////////////////
541