LCOV - code coverage report
Current view: top level - cluster - lcd_clusterer.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 954b5873a738 Lines: 43 47 91.5 %
Date: 2019-06-30 05:20:33 Functions: 4 5 80.0 %
Branches: 38 68 55.9 %

           Branch data     Line data    Source code
       1                 :            : /** @file lcd_clusterer.cc
       2                 :            :  *  @brief LCD clustering API
       3                 :            :  */
       4                 :            : /* Copyright (C) 2018 Uppinder Chugh
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or
       7                 :            :  * modify it under the terms of the GNU General Public License as
       8                 :            :  * published by the Free Software Foundation; either version 2 of the
       9                 :            :  * License, or (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful,
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
      19                 :            :  * USA
      20                 :            :  */
      21                 :            : 
      22                 :            : #include <config.h>
      23                 :            : 
      24                 :            : #include "xapian/cluster.h"
      25                 :            : #include "xapian/error.h"
      26                 :            : 
      27                 :            : #include "debuglog.h"
      28                 :            : #include "omassert.h"
      29                 :            : 
      30                 :            : #include <algorithm>
      31                 :            : #include <set>
      32                 :            : #include <vector>
      33                 :            : 
      34                 :            : using namespace Xapian;
      35                 :            : using namespace std;
      36                 :            : 
      37                 :            : struct pcompare {
      38                 :        324 :     bool operator()(const pair<Point, double>& a,
      39                 :            :                     const pair<Point, double>& b) const {
      40                 :        324 :         return a.second > b.second;
      41                 :            :     }
      42                 :            : };
      43                 :            : 
      44                 :            : typedef set<pair<Point, double>, pcompare> PSet;
      45                 :            : 
      46                 :            : struct dcompare {
      47                 :        264 :     bool operator()(const pair<PSet::iterator, double>& a,
      48                 :            :                     const pair<PSet::iterator, double>& b) const {
      49                 :        264 :         return a.second < b.second;
      50                 :            :     }
      51                 :            : };
      52                 :            : 
      53                 :         12 : LCDClusterer::LCDClusterer(unsigned int k_)
      54                 :         12 :     : k(k_)
      55                 :            : {
      56                 :            :     LOGCALL_CTOR(API, "LCDClusterer", k_);
      57         [ -  + ]:         12 :     if (k_ == 0)
      58                 :            :         throw InvalidArgumentError("Number of required clusters should be "
      59 [ #  # ][ #  # ]:          0 :                                    "greater than zero");
                 [ #  # ]
      60                 :         12 : }
      61                 :            : 
      62                 :            : string
      63                 :          0 : LCDClusterer::get_description() const
      64                 :            : {
      65         [ #  # ]:          0 :     return "LCDClusterer()";
      66                 :            : }
      67                 :            : 
      68                 :            : ClusterSet
      69                 :         12 : LCDClusterer::cluster(const MSet &mset)
      70                 :            : {
      71                 :            :     LOGCALL(API, ClusterSet, "LCDClusterer::cluster", mset);
      72                 :            : 
      73         [ +  - ]:         12 :     doccount size = mset.size();
      74                 :         12 :     unsigned int k_ = k;
      75         [ -  + ]:         12 :     if (k_ >= size)
      76                 :          0 :         k_ = size;
      77                 :            : 
      78                 :            :     // Store each document and its rel score from given mset
      79         [ +  - ]:         12 :     set<pair<Point, double>, pcompare> points;
      80                 :            : 
      81                 :            :     // Initialise points
      82         [ +  - ]:         24 :     TermListGroup tlg(mset);
      83 [ +  - ][ +  - ]:         96 :     for (MSetIterator it = mset.begin(); it != mset.end(); ++it)
                 [ +  + ]
      84 [ +  - ][ +  - ]:         96 :         points.emplace(Point(tlg, it.get_document()), it.get_weight());
         [ +  - ][ +  - ]
      85                 :            : 
      86                 :            :     // Container for holding the clusters
      87         [ +  - ]:         12 :     ClusterSet cset;
      88                 :            : 
      89                 :            :     // Instantiate this for computing distance between points
      90                 :         12 :     CosineDistance distance;
      91                 :            : 
      92                 :            :     // First cluster center
      93                 :         12 :     PSet::iterator cluster_center = points.begin();
      94                 :            : 
      95                 :            :     // The original algorithm accepts a parameter k which is the number
      96                 :            :     // of documents in each cluster, which can be hard to tune and
      97                 :            :     // especially when using this in conjunction with the diversification
      98                 :            :     // module. So, for now LCD clusterer accepts k as the number of
      99                 :            :     // clusters and divides the documents into k clusters such that among k
     100                 :            :     // clusters, n clusters have x - 1 points and (k - n) have x points. This
     101                 :            :     // needs to be tested on a dataset to see how well this works.
     102                 :            :     // n * (x - 1) + (k_ - n) * x = size, where 0 <= n < k
     103                 :         12 :     unsigned n = k_ - size % k_;
     104                 :         12 :     unsigned x = (size / k_) + 1;
     105                 :            :     AssertEq(n * (x - 1) + (k_ - n) * x, size);
     106                 :            : 
     107                 :         12 :     unsigned cnum = 1;
     108                 :            :     while (true) {
     109                 :            :         // Container for new cluster
     110         [ +  - ]:         48 :         Cluster new_cluster;
     111                 :            : 
     112                 :            :         // Select (num_points - 1) nearest points to cluster_center from
     113                 :            :         // 'points' and form a new cluster
     114         [ +  + ]:         48 :         unsigned int num_points = cnum <= n ? x - 1 : x;
     115                 :            : 
     116                 :            :         // Store distances of each point from current cluster center
     117                 :            :         // Iterator of each point is stored for fast deletion from 'points'
     118         [ +  + ]:         96 :         vector<pair<PSet::iterator, double>> dist_vector;
     119         [ +  + ]:        276 :         for (auto it = points.begin(); it != points.end(); ++it) {
     120         [ +  + ]:        228 :             if (it == cluster_center)
     121                 :         48 :                 continue;
     122                 :            : 
     123         [ +  - ]:        180 :             double dist = distance.similarity(cluster_center->first, it->first);
     124 [ +  - ][ +  - ]:        180 :             dist_vector.push_back(make_pair(it, dist));
     125                 :            :         }
     126                 :            : 
     127                 :            :         // Sort dist_vector in ascending order of distance
     128         [ +  - ]:         48 :         sort(dist_vector.begin(), dist_vector.end(), dcompare());
     129                 :            : 
     130                 :            :         // Add first num_points-1 to cluster
     131         [ +  + ]:         84 :         for (unsigned int i = 0; i < num_points - 1; ++i) {
     132                 :         36 :             auto piterator = dist_vector[i].first;
     133                 :            :             // Add to cluster
     134         [ +  - ]:         36 :             new_cluster.add_point(piterator->first);
     135                 :            :             // Remove from 'points'
     136         [ +  - ]:         36 :             points.erase(piterator);
     137                 :            :         }
     138                 :            : 
     139                 :            :         // Add cluster_center to current cluster
     140         [ +  - ]:         48 :         new_cluster.add_point(cluster_center->first);
     141                 :            : 
     142                 :            :         // Add cluster to cset
     143         [ +  - ]:         48 :         cset.add_cluster(new_cluster);
     144                 :            : 
     145         [ +  + ]:         48 :         if (cnum == k_) break;
     146                 :            : 
     147                 :            :         // Remove current cluster_center from points
     148         [ +  - ]:         36 :         points.erase(cluster_center);
     149                 :            : 
     150                 :            :         // Select a new cluster center which is the point that is farthest away
     151                 :            :         // from the current cluster center
     152                 :         36 :         cluster_center = dist_vector.back().first;
     153                 :            : 
     154         [ +  + ]:         48 :         ++cnum;
     155                 :         36 :     }
     156                 :            : 
     157                 :         24 :     return cset;
     158                 :            : }

Generated by: LCOV version 1.11