LCOV - code coverage report
Current view: top level - cluster - kmeans.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 954b5873a738 Lines: 0 56 0.0 %
Date: 2019-06-30 05:20:33 Functions: 0 6 0.0 %
Branches: 0 82 0.0 %

           Branch data     Line data    Source code
       1                 :            : /** @file kmeans.cc
       2                 :            :  *  @brief KMeans clustering API
       3                 :            :  */
       4                 :            : /* Copyright (C) 2016 Richhiey Thomas
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or
       7                 :            :  * modify it under the terms of the GNU General Public License as
       8                 :            :  * published by the Free Software Foundation; either version 2 of the
       9                 :            :  * License, or (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful,
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
      19                 :            :  * USA
      20                 :            :  */
      21                 :            : 
      22                 :            : #include <config.h>
      23                 :            : 
      24                 :            : #include "xapian/cluster.h"
      25                 :            : #include "xapian/error.h"
      26                 :            : 
      27                 :            : #include "debuglog.h"
      28                 :            : 
      29                 :            : #include <limits>
      30                 :            : #include <vector>
      31                 :            : 
      32                 :            : // Threshold value for checking convergence in KMeans
      33                 :            : #define CONVERGENCE_THRESHOLD 0.0000000001
      34                 :            : 
      35                 :            : /** Maximum number of times KMeans algorithm will iterate
      36                 :            :  *  till it converges
      37                 :            :  */
      38                 :            : #define MAX_ITERS 1000
      39                 :            : 
      40                 :            : using namespace Xapian;
      41                 :            : using namespace std;
      42                 :            : 
      43                 :          0 : KMeans::KMeans(unsigned int k_, unsigned int max_iters_)
      44                 :          0 :     : k(k_)
      45                 :            : {
      46                 :            :     LOGCALL_CTOR(API, "KMeans", k_ | max_iters_);
      47         [ #  # ]:          0 :     max_iters = (max_iters_ == 0) ? MAX_ITERS : max_iters_;
      48         [ #  # ]:          0 :     if (k_ == 0)
      49 [ #  # ][ #  # ]:          0 :         throw InvalidArgumentError("Number of required clusters should be greater than zero");
                 [ #  # ]
      50                 :          0 : }
      51                 :            : 
      52                 :            : string
      53                 :          0 : KMeans::get_description() const
      54                 :            : {
      55         [ #  # ]:          0 :     return "KMeans()";
      56                 :            : }
      57                 :            : 
      58                 :            : void
      59                 :          0 : KMeans::set_stopper(const Stopper *stopper_)
      60                 :            : {
      61                 :            :     LOGCALL_VOID(API, "KMeans::set_stopper", stopper_);
      62                 :          0 :     stopper = stopper_;
      63                 :          0 : }
      64                 :            : 
      65                 :            : void
      66                 :          0 : KMeans::initialise_clusters(ClusterSet &cset, doccount num_of_points)
      67                 :            : {
      68                 :            :     LOGCALL_VOID(API, "KMeans::initialise_clusters", cset | num_of_points);
      69                 :            :     // Initial centroids are selected by picking points at roughly even
      70                 :            :     // intervals within the MSet. This is cheap and helps pick diverse
      71                 :            :     // elements since the MSet is usually sorted by some sort of key
      72         [ #  # ]:          0 :     for (unsigned int i = 0; i < k; ++i) {
      73                 :          0 :         unsigned int x = (i * num_of_points) / k;
      74 [ #  # ][ #  # ]:          0 :         cset.add_cluster(Cluster(Centroid(points[x])));
      75                 :            :     }
      76                 :          0 : }
      77                 :            : 
      78                 :            : void
      79                 :          0 : KMeans::initialise_points(const MSet &source)
      80                 :            : {
      81                 :            :     LOGCALL_VOID(API, "KMeans::initialise_points", source);
      82         [ #  # ]:          0 :     TermListGroup tlg(source, stopper.get());
      83 [ #  # ][ #  # ]:          0 :     for (MSetIterator it = source.begin(); it != source.end(); ++it)
                 [ #  # ]
      84 [ #  # ][ #  # ]:          0 :         points.push_back(Point(tlg, it.get_document()));
                 [ #  # ]
      85                 :          0 : }
      86                 :            : 
      87                 :            : ClusterSet
      88                 :          0 : KMeans::cluster(const MSet &mset)
      89                 :            : {
      90                 :            :     LOGCALL(API, ClusterSet, "KMeans::cluster", mset);
      91         [ #  # ]:          0 :     doccount size = mset.size();
      92         [ #  # ]:          0 :     if (k >= size)
      93                 :          0 :         k = size;
      94         [ #  # ]:          0 :     initialise_points(mset);
      95         [ #  # ]:          0 :     ClusterSet cset;
      96         [ #  # ]:          0 :     initialise_clusters(cset, size);
      97                 :          0 :     CosineDistance distance;
      98                 :          0 :     vector<Centroid> previous_centroids;
      99         [ #  # ]:          0 :     for (unsigned int i = 0; i < max_iters; ++i) {
     100                 :            :         // Assign each point to the cluster corresponding to its
     101                 :            :         // closest cluster centroid
     102         [ #  # ]:          0 :         cset.clear_clusters();
     103         [ #  # ]:          0 :         for (unsigned int j = 0; j < size; ++j) {
     104                 :          0 :             double closest_cluster_distance = numeric_limits<double>::max();
     105                 :          0 :             unsigned int closest_cluster = 0;
     106         [ #  # ]:          0 :             for (unsigned int c = 0; c < k; ++c) {
     107 [ #  # ][ #  # ]:          0 :                 const Centroid& centroid = cset[c].get_centroid();
     108         [ #  # ]:          0 :                 double dist = distance.similarity(points[j], centroid);
     109         [ #  # ]:          0 :                 if (closest_cluster_distance > dist) {
     110                 :          0 :                     closest_cluster_distance = dist;
     111                 :          0 :                     closest_cluster = c;
     112                 :            :                 }
     113                 :            :             }
     114         [ #  # ]:          0 :             cset.add_to_cluster(points[j], closest_cluster);
     115                 :            :         }
     116                 :            : 
     117                 :            :         // Remember the previous centroids
     118                 :          0 :         previous_centroids.clear();
     119         [ #  # ]:          0 :         for (unsigned int j = 0; j < k; ++j)
     120 [ #  # ][ #  # ]:          0 :             previous_centroids.push_back(cset[j].get_centroid());
                 [ #  # ]
     121                 :            : 
     122                 :            :         // Recalculate the centroids for current iteration
     123         [ #  # ]:          0 :         cset.recalculate_centroids();
     124                 :            : 
     125                 :            :         // Check whether centroids have converged
     126                 :          0 :         bool has_converged = true;
     127         [ #  # ]:          0 :         for (unsigned int j = 0; j < k; ++j) {
     128 [ #  # ][ #  # ]:          0 :             const Centroid& centroid = cset[j].get_centroid();
     129         [ #  # ]:          0 :             double dist = distance.similarity(previous_centroids[j], centroid);
     130                 :            :             // If distance between any two centroids has changed by
     131                 :            :             // more than the threshold, then KMeans hasn't converged
     132         [ #  # ]:          0 :             if (dist > CONVERGENCE_THRESHOLD) {
     133                 :          0 :                 has_converged = false;
     134                 :          0 :                 break;
     135                 :            :             }
     136                 :            :         }
     137                 :            :         // If converged, then break from the loop
     138         [ #  # ]:          0 :         if (has_converged)
     139                 :          0 :             break;
     140                 :            :     }
     141                 :          0 :     return cset;
     142                 :            : }

Generated by: LCOV version 1.11